diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H index 6a49d9ced..bdef9043f 100644 --- a/applications/solvers/dfLowMachFoam/EEqn.H +++ b/applications/solvers/dfLowMachFoam/EEqn.H @@ -1,24 +1,18 @@ { volScalarField& he = thermo.he(); - #ifdef GPUSolver_ - // start1 = std::clock(); - // t.join(); - // UEqn_GPU.updatePsi(&U[0][0]); - // K = 0.5*magSqr(U); - // end1 = std::clock(); - // time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - // time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - // time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC); - // time_monitor_UinE += double(end1 - start1) / double(CLOCKS_PER_SEC); + +#ifdef GPUSolver_ start1 = std::clock(); - // // t.join(); UEqn_GPU.updatePsi(&U[0][0]); K = 0.5*magSqr(U); end1 = std::clock(); time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC); - #endif +#endif + +#ifdef CPUSolver_ + start1 = std::clock(); fvScalarMatrix EEqn ( fvm::ddt(rho, he) + fvm::div(phi, he) @@ -33,4 +27,91 @@ EEqn.relax(); EEqn.solve(); + + end1 = std::clock(); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); +#endif + +#ifdef GPUSolver_ + // prepare data on CPU + start1 = std::clock(); + start2 = std::clock(); + const tmp alphaEff_tmp(turbulence->alphaEff()); + const volScalarField& alphaEff = alphaEff_tmp(); + int eeqn_offset = 0; + forAll(he.boundaryField(), patchi) + { + const fvsPatchScalarField& patchFlux = phi.boundaryField()[patchi]; + const fvsPatchScalarField& pw = mesh.surfaceInterpolation::weights().boundaryField()[patchi]; + int patchSize = pw.size(); + + const scalarField& patchK = K.boundaryField()[patchi]; + const vectorField& patchhDiffCorrFlux = hDiffCorrFlux.boundaryField()[patchi]; + const scalarField& patchAlphaEff = 
alphaEff.boundaryField()[patchi]; + memcpy(boundary_K + eeqn_offset, &patchK[0], patchSize*sizeof(double)); + memcpy(boundary_hDiffCorrFlux + eeqn_offset * 3, &patchhDiffCorrFlux[0][0], 3 * patchSize*sizeof(double)); + memcpy(boundary_alphaEff + eeqn_offset, &patchAlphaEff[0], patchSize*sizeof(double)); + + Field valueInternalCoeffs = patchFlux*he.boundaryField()[patchi].valueInternalCoeffs(pw); + Field valueBoundaryCoeffs = -patchFlux*he.boundaryField()[patchi].valueBoundaryCoeffs(pw); + Field gradientInternalCoeffs = he.boundaryField()[patchi].gradientInternalCoeffs(); + Field gradientBoundaryCoeffs = he.boundaryField()[patchi].gradientBoundaryCoeffs(); + memcpy(eeqn_valueInternalCoeffs + eeqn_offset, &valueInternalCoeffs[0], patchSize*sizeof(double)); + memcpy(eeqn_valueBoundaryCoeffs + eeqn_offset, &valueBoundaryCoeffs[0], patchSize*sizeof(double)); + memcpy(eeqn_gradientInternalCoeffs + eeqn_offset, &gradientInternalCoeffs[0], patchSize*sizeof(double)); + memcpy(eeqn_gradientBoundaryCoeffs + eeqn_offset, &gradientBoundaryCoeffs[0], patchSize*sizeof(double)); + + eeqn_offset += patchSize; + } + eeqn_offset = 0; + end1 = std::clock(); + time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly_CPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); + + // prepare data on GPU + start1 = std::clock(); + EEqn_GPU.prepare_data(&he.oldTime()[0], &K[0], &K.oldTime()[0], &alphaEff[0], + &dpdt[0], &diffAlphaD[0], &hDiffCorrFlux[0][0], + boundary_K, boundary_hDiffCorrFlux, boundary_alphaEff, + eeqn_valueInternalCoeffs, eeqn_valueBoundaryCoeffs, + eeqn_gradientInternalCoeffs, eeqn_gradientBoundaryCoeffs); + if (doSync) EEqn_GPU.sync(); + end1 = std::clock(); + time_monitor_EEqn_mtxAssembly_GPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); + + start1 = std::clock(); + EEqn_GPU.resetAb(); + EEqn_GPU.fvm_ddt(); + EEqn_GPU.fvm_div(); + EEqn_GPU.fvm_laplacian(); + EEqn_GPU.fvc_ddt(); + EEqn_GPU.fvc_div_phi_scalar(); 
+ EEqn_GPU.fvc_div_vector(); + EEqn_GPU.add_to_source(); + if (doSync) EEqn_GPU.sync(); + end1 = std::clock(); + time_monitor_EEqn_mtxAssembly_GPU_Run += double(end1 - start1) / double(CLOCKS_PER_SEC); + + EEqn_GPU.sync(); + end2 = std::clock(); + time_monitor_EEqn += double(end2 - start2) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly += double(end2 - start2) / double(CLOCKS_PER_SEC); + + // check value of mtxAssembly, no time monitor + EEqn_GPU.checkValue(false); + + start1 = std::clock(); + EEqn_GPU.solve(); + if (doSync) EEqn_GPU.sync(); + end1 = std::clock(); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); + + start1 = std::clock(); + EEqn_GPU.updatePsi(&he[0]); + end1 = std::clock(); + time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); +#endif } diff --git a/applications/solvers/dfLowMachFoam/createdfSolver.H b/applications/solvers/dfLowMachFoam/createdfSolver.H index b60307463..880ae674c 100644 --- a/applications/solvers/dfLowMachFoam/createdfSolver.H +++ b/applications/solvers/dfLowMachFoam/createdfSolver.H @@ -31,9 +31,11 @@ settingPath = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UE dfMatrixDataBase dfDataBase(num_surfaces, num_cells, num_boundary_faces, Y.size(), num_boundary_cells, &neighbour[0], &owner[0], &mesh.V()[0], &mesh.surfaceInterpolation::weights()[0], &mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.nonOrthDeltaCoeffs()[0], boundary_face_vector_init, boundary_face_init, boundary_deltaCoeffs_init, boundaryCellIndex); +#ifdef GPUSolver_ dfRhoEqn rhoEqn_GPU(dfDataBase); dfUEqn UEqn_GPU(dfDataBase, "dDDI", settingPath); dfYEqn YEqn_GPU(dfDataBase, "dDDI", settingPath, inertIndex); +dfEEqn EEqn_GPU(dfDataBase, "dDDI", settingPath); double 
*ueqn_internalCoeffs_init, *ueqn_boundaryCoeffs_init, *boundary_pressure_init, *boundary_velocity_init, *boundary_nuEff_init, *boundary_rho_init, *ueqn_laplac_internalCoeffs_init, *ueqn_laplac_boundaryCoeffs_init, *boundary_phi_init; @@ -45,4 +47,15 @@ cudaMallocHost(&boundary_velocity_init, 3*num_boundary_faces*sizeof(double)); cudaMallocHost(&boundary_pressure_init, num_boundary_faces*sizeof(double)); cudaMallocHost(&boundary_nuEff_init, num_boundary_faces*sizeof(double)); cudaMallocHost(&boundary_rho_init, num_boundary_faces*sizeof(double)); -cudaMallocHost(&boundary_phi_init, num_boundary_faces*sizeof(double)); \ No newline at end of file +cudaMallocHost(&boundary_phi_init, num_boundary_faces*sizeof(double)); + +double *boundary_alphaEff, *boundary_K, *boundary_hDiffCorrFlux; +double *eeqn_valueInternalCoeffs, *eeqn_valueBoundaryCoeffs, *eeqn_gradientInternalCoeffs, *eeqn_gradientBoundaryCoeffs; +cudaMallocHost(&boundary_K, num_boundary_faces*sizeof(double)); +cudaMallocHost(&boundary_hDiffCorrFlux, 3 * num_boundary_faces*sizeof(double)); +cudaMallocHost(&boundary_alphaEff, num_boundary_faces*sizeof(double)); +cudaMallocHost(&eeqn_valueInternalCoeffs, num_boundary_faces*sizeof(double)); +cudaMallocHost(&eeqn_valueBoundaryCoeffs, num_boundary_faces*sizeof(double)); +cudaMallocHost(&eeqn_gradientInternalCoeffs, num_boundary_faces*sizeof(double)); +cudaMallocHost(&eeqn_gradientBoundaryCoeffs, num_boundary_faces*sizeof(double)); +#endif diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C index d875b607d..80bf0bcd8 100644 --- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C +++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C @@ -62,11 +62,13 @@ Description #include "dfUEqn.H" #include "dfYEqn.H" #include "dfRhoEqn.H" +#include "dfEEqn.H" #include #include #include "upwind.H" #define GPUSolver_ +//#define CPUSolver_ // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // 
@@ -94,11 +96,19 @@ int main(int argc, char *argv[]) #include "createFields.H" #include "createRhoUfIfPresent.H" + bool doSync = true; double time_monitor_flow=0; double time_monitor_UEqn=0; double time_monitor_UEqn_mtxAssembly=0; double time_monitor_UEqn_Solve=0; double time_monitor_UEqn_sum=0; + double time_monitor_EEqn=0; + double time_monitor_EEqn_mtxAssembly=0; + double time_monitor_EEqn_mtxAssembly_CPU_Prepare=0; + double time_monitor_EEqn_mtxAssembly_GPU_Prepare=0; + double time_monitor_EEqn_mtxAssembly_GPU_Run=0; + double time_monitor_EEqn_Solve=0; + double time_monitor_EEqn_sum=0; double time_monitor_chem=0; double time_monitor_Y=0; double time_monitor_E=0; @@ -223,6 +233,7 @@ int main(int argc, char *argv[]) runTime.write(); time_monitor_UEqn_sum += time_monitor_UEqn; + time_monitor_EEqn_sum += time_monitor_EEqn; Info << "output time index " << runTime.timeIndex() << endl; @@ -238,6 +249,12 @@ int main(int argc, char *argv[]) Info<< "UEqn Time solve = " << time_monitor_UEqn_Solve << " s" << endl; // Info<< "UEqn sum Time = " << time_monitor_UEqn_sum << " s" << endl; // Info<< "UEqn sum Time - overhead = " << time_monitor_UEqn_sum - time_UEqn_initial << " s" << endl; + Info<< "EEqn Time = " << time_monitor_EEqn << " s" << endl; + Info<< "EEqn Time assemble Mtx = " << time_monitor_EEqn_mtxAssembly << " s" << endl; + Info<< "EEqn assemble(CPU prepare) = " << time_monitor_EEqn_mtxAssembly_CPU_Prepare << " s" << endl; + Info<< "EEqn assemble(GPU prepare) = " << time_monitor_EEqn_mtxAssembly_GPU_Prepare << " s" << endl; + Info<< "EEqn assemble(GPU run) = " << time_monitor_EEqn_mtxAssembly_GPU_Run << " s" << endl; + Info<< "EEqn Time solve = " << time_monitor_EEqn_Solve << " s" << endl; Info<< "sum Time = " << (time_monitor_chem + time_monitor_Y + time_monitor_flow + time_monitor_E + time_monitor_corrThermo + time_monitor_corrDiff) << " s" << endl; Info<< "CPU Time (get turb souce) = " << time_monitor_CPU << " s" << endl; Info<< "UEqn time in EEqn = " << 
time_monitor_UinE << " s" << endl; @@ -248,6 +265,12 @@ int main(int argc, char *argv[]) time_monitor_UEqn = 0; time_monitor_UEqn_mtxAssembly = 0; time_monitor_UEqn_Solve = 0; + time_monitor_EEqn = 0; + time_monitor_EEqn_mtxAssembly = 0; + time_monitor_EEqn_mtxAssembly_CPU_Prepare = 0; + time_monitor_EEqn_mtxAssembly_GPU_Prepare = 0; + time_monitor_EEqn_mtxAssembly_GPU_Run = 0; + time_monitor_EEqn_Solve = 0; time_monitor_chem = 0; time_monitor_Y = 0; time_monitor_E = 0; diff --git a/examples/dfLowMachFoam/threeD_reactingTGV/H2/cvodeIntegrator_gpu/system/fvSchemes b/examples/dfLowMachFoam/threeD_reactingTGV/H2/cvodeIntegrator_gpu/system/fvSchemes index c5c9136e5..e2c0058cd 100644 --- a/examples/dfLowMachFoam/threeD_reactingTGV/H2/cvodeIntegrator_gpu/system/fvSchemes +++ b/examples/dfLowMachFoam/threeD_reactingTGV/H2/cvodeIntegrator_gpu/system/fvSchemes @@ -30,15 +30,15 @@ divSchemes default none; div(phi,U) Gauss linear; - div(phi,Yi) Gauss limitedLinear01 1; - div(phi,h) Gauss limitedLinear 1; - div(phi,ha) Gauss limitedLinear 1; - div(phi,K) Gauss limitedLinear 1; - div(phid,p) Gauss limitedLinear 1; - div(phi,epsilon) Gauss limitedLinear 1; - div(phi,Yi_h) Gauss limitedLinear01 1; - div(phi,k) Gauss limitedLinear 1; - div(hDiffCorrFlux) Gauss cubic; + div(phi,Yi) Gauss linear; + div(phi,h) Gauss linear; + div(phi,ha) Gauss linear; + div(phi,K) Gauss linear; + div(phid,p) Gauss linear; + div(phi,epsilon) Gauss linear; + div(phi,Yi_h) Gauss upwind; + div(phi,k) Gauss linear; + div(hDiffCorrFlux) Gauss linear; div(((rho*nuEff)*dev2(T(grad(U))))) Gauss linear; } diff --git a/src_gpu/CMakeLists.txt b/src_gpu/CMakeLists.txt index c749eaa42..1040cee14 100644 --- a/src_gpu/CMakeLists.txt +++ b/src_gpu/CMakeLists.txt @@ -23,6 +23,7 @@ add_library(${PROJECT_NAME} dfUEqn.cu dfRhoEqn.cu dfYEqn.cu + dfEEqn.cu AmgXSolver.cu) target_link_libraries(${PROJECT_NAME} diff --git a/src_gpu/dfEEqn.H b/src_gpu/dfEEqn.H new file mode 100644 index 000000000..b606e5f28 --- /dev/null +++ 
b/src_gpu/dfEEqn.H @@ -0,0 +1,75 @@ +#pragma once + +#include "AmgXSolver.H" +#include +#include "dfMatrixDataBase.H" + +class dfEEqn +{ +private: + dfMatrixDataBase &dataBase_; + cudaStream_t stream; + AmgXSolver *ESolver = nullptr; + int num_iteration; + + // common variables + int num_cells, cell_bytes, num_faces, num_surfaces, cell_vec_bytes, csr_value_vec_bytes, num_boundary_cells; + int num_boundary_faces, boundary_face_bytes; + int *d_A_csr_row_index, *d_A_csr_diag_index, *d_A_csr_col_index; + + // Matrix variables + double *d_A_csr, *d_b = nullptr; + double *h_A_csr, *h_b = nullptr; + double *d_he_old = nullptr; + double *h_he_new = nullptr; + + // fields used by EEqn + double *d_alphaEff = nullptr; + double *d_K = nullptr; + double *d_K_old = nullptr; + double *d_dpdt = nullptr; + double *d_diffAlphaD = nullptr; + double *d_hDiffCorrFlux = nullptr; + double *d_boundary_K_init = nullptr; + double *d_boundary_K = nullptr; + double *d_boundary_hDiffCorrFlux_init = nullptr; + double *d_boundary_hDiffCorrFlux = nullptr; + double *d_boundary_alphaEff_init = nullptr; + double *d_boundary_alphaEff = nullptr; + double *d_value_internal_coeffs_init = nullptr; + double *d_value_boundary_coeffs_init = nullptr; + double *d_gradient_internal_coeffs_init = nullptr; + double *d_gradient_boundary_coeffs_init = nullptr; + double *d_value_internal_coeffs = nullptr; + double *d_value_boundary_coeffs = nullptr; + double *d_gradient_internal_coeffs = nullptr; + double *d_gradient_boundary_coeffs = nullptr; + +public: + dfEEqn(); + dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile); + ~dfEEqn(); + + void prepare_data(const double *he_old, const double *K, const double *K_old, const double *alphaEff, + const double *dpdt, const double *diffAlphaD, const double *hDiffCorrFlux, + const double *boundary_K, const double *boundary_hDiffCorrFlux, const double *boundary_alphaEff, + const double *valueInternalCoeffs, const double 
*valueBoundaryCoeffs, + const double *gradientInternalCoeffs, const double *gradientBoundaryCoeffs); + + void resetAb(); + + void fvm_ddt(); + void fvm_div(); + void fvm_laplacian(); + + void fvc_ddt(); + void fvc_div_phi_scalar(); + void fvc_div_vector(); + void add_to_source(); + + void solve(); + void checkValue(bool print); + void updatePsi(double *Psi); + + void sync(); +}; diff --git a/src_gpu/dfEEqn.cu b/src_gpu/dfEEqn.cu new file mode 100644 index 000000000..6907ed4cd --- /dev/null +++ b/src_gpu/dfEEqn.cu @@ -0,0 +1,743 @@ +#include "dfEEqn.H" + +// kernel functions + +__global__ void eeqn_fvm_ddt_kernel(int num_cells, const double rdelta_t, + const int *csr_row_index, const int *csr_diag_index, + const double *rho_old, const double *rho_new, const double *volume, const double *he_old, + const double sign, const double *A_csr_input, const double *b_input, + double *A_csr_output, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int diag_index = csr_diag_index[index]; + int csr_index = row_index + diag_index; + double ddt_diag = rdelta_t * rho_new[index] * volume[index]; + A_csr_output[csr_index] = A_csr_input[csr_index] + ddt_diag * sign; + + double ddt_part_term = rdelta_t * rho_old[index] * volume[index]; + b_output[index] = b_input[index] + ddt_part_term * he_old[index] * sign; +} + +__global__ void eeqn_fvm_div_internal(int num_cells, + const int *csr_row_index, const int *csr_diag_index, + const double *weight, const double *phi, + const double sign, const double *A_csr_input, const double *b_input, + double *A_csr_output, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int next_row_index = csr_row_index[index + 1]; + int diag_index = 
csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double div_diag = 0; + for (int i = row_index; i < next_row_index; i++) + { + int inner_index = i - row_index; + // lower + if (inner_index < diag_index) + { + int neighbor_index = neighbor_offset + inner_index; + double w = weight[neighbor_index]; + double f = phi[neighbor_index]; + A_csr_output[i] = A_csr_input[i] + (-w) * f * sign; + // lower neighbors contribute to sum of -1 + div_diag += (w - 1) * f; + } + // upper + if (inner_index > diag_index) + { + // upper, index - 1, consider of diag + int neighbor_index = neighbor_offset + inner_index - 1; + double w = weight[neighbor_index]; + double f = phi[neighbor_index]; + A_csr_output[i] = A_csr_input[i] + (1 - w) * f * sign; + // upper neighbors contribute to sum of 1 + div_diag += w * f; + } + } + A_csr_output[row_index + diag_index] = A_csr_input[row_index + diag_index] + div_diag * sign; // diag +} + +__global__ void eeqn_fvm_div_boundary(int num_boundary_cells, + const int *csr_row_index, const int *csr_diag_index, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *value_internal_coeffs, const double *value_boundary_coeffs, + const double sign, const double *A_csr_input, const double *b_input, + double *A_csr_output, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int cell_index = boundary_cell_id[cell_offset]; + int loop_size = boundary_cell_offset[index + 1] - cell_offset; + + int row_index = csr_row_index[cell_index]; + int diag_index = csr_diag_index[cell_index]; + int csr_index = row_index + diag_index; + + // construct internalCoeffs & boundaryCoeffs + double internal_coeffs = 0; + double boundary_coeffs = 0; + for (int i = 0; i < loop_size; i++) + { + internal_coeffs += value_internal_coeffs[cell_offset + i]; + boundary_coeffs += value_boundary_coeffs[cell_offset + i]; + } 
+ A_csr_output[csr_index] = A_csr_input[csr_index] + internal_coeffs * sign; + b_output[cell_index] = b_input[cell_index] + boundary_coeffs * sign; +} + +__global__ void eeqn_fvm_laplacian_uncorrected_internal(int num_cells, + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *alphaEff, const double *weight, + const double *magsf, const double *distance, + const double sign, const double *A_csr_input, double *A_csr_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int row_elements = csr_row_index[index + 1] - row_index; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double own_alphaEff = alphaEff[index]; + // fvm.upper() = deltaCoeffs.primitiveField()*gammaMagSf.primitiveField(); + // fvm.negSumDiag(); + double sum_diag = 0; + // lower + for (int i = 0; i < diag_index; i++) + { + int neighbor_index = neighbor_offset + i; + int neighbor_cell_id = csr_col_index[i + row_index]; + double w = weight[neighbor_index]; + double nei_alphaEff = alphaEff[neighbor_cell_id]; + double gamma = w * (nei_alphaEff - own_alphaEff) + own_alphaEff; + double gamma_magsf = gamma * magsf[neighbor_index]; + double coeff = gamma_magsf * distance[neighbor_index]; + A_csr_output[row_index + i] = A_csr_input[row_index + i] + coeff * sign; + sum_diag += (-coeff); + } + // upper + for (int i = diag_index + 1; i < row_elements; i++) + { + int neighbor_index = neighbor_offset + i - 1; + int neighbor_cell_id = csr_col_index[i + row_index]; + double w = weight[neighbor_index]; + double nei_alphaEff = alphaEff[neighbor_cell_id]; + double gamma = w * (nei_alphaEff - own_alphaEff) + own_alphaEff; + double gamma_magsf = gamma * magsf[neighbor_index]; + double coeff = gamma_magsf * distance[neighbor_index]; + A_csr_output[row_index + i] = A_csr_input[row_index + i] 
+ coeff * sign; + sum_diag += (-coeff); + } + A_csr_output[row_index + diag_index] = A_csr_input[row_index + diag_index] + sum_diag * sign; // diag +} + +__global__ void eeqn_fvm_laplacian_uncorrected_boundary(int num_boundary_cells, + const int *csr_row_index, const int *csr_diag_index, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_alphaEff, const double *boundary_magsf, + const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, + const double sign, const double *A_csr_input, const double *b_input, + double *A_csr_output, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + int row_index = csr_row_index[cell_index]; + int diag_index = csr_diag_index[cell_index]; + int csr_index = row_index + diag_index; + + // OpenFoam code + // if (pvf.coupled()) + // { + // fvm.internalCoeffs()[patchi] = + // pGamma*pvf.gradientInternalCoeffs(pDeltaCoeffs); + // fvm.boundaryCoeffs()[patchi] = + // -pGamma*pvf.gradientBoundaryCoeffs(pDeltaCoeffs); + // } + // else + // { + // fvm.internalCoeffs()[patchi] = pGamma*pvf.gradientInternalCoeffs(); + // fvm.boundaryCoeffs()[patchi] = - + // pGamma*pvf.gradientBoundaryCoeffs(); + // } + double internal_coeffs = 0; + double boundary_coeffs = 0; + for (int i = cell_offset; i < next_cell_offset; i++) + { + double gamma = boundary_alphaEff[i]; + double gamma_magsf = gamma * boundary_magsf[i]; + internal_coeffs += gamma_magsf * gradient_internal_coeffs[i]; + boundary_coeffs += gamma_magsf * gradient_boundary_coeffs[i]; + } + + A_csr_output[csr_index] = A_csr_input[csr_index] + internal_coeffs * sign; + b_output[cell_index] = b_input[cell_index] + boundary_coeffs * sign; +} + +__global__ void eeqn_fvc_ddt_kernel(int num_cells, const double rdelta_t, + 
const double *rho_old, const double *rho_new, + const double *K_old, const double *K, + const double *volume, + const double sign, const double *b_input, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + double fvc_ddt_term = rdelta_t * (rho_new[index] * K[index] - rho_old[index] * K_old[index]) * volume[index]; + b_output[index] = b_input[index] + fvc_ddt_term * sign; +} + +__global__ void eeqn_fvc_div_vector_internal(int num_cells, + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *sf, const double *vf, const double *tlambdas, + const double sign, const double *b_input, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int row_elements = csr_row_index[index + 1] - row_index; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double own_vf_x = vf[index * 3 + 0]; + double own_vf_y = vf[index * 3 + 1]; + double own_vf_z = vf[index * 3 + 2]; + double sum = 0; + // lower + for (int i = 0; i < diag_index; i++) + { + int neighbor_index = neighbor_offset + i; + int neighbor_cell_id = csr_col_index[row_index + i]; + double w = tlambdas[neighbor_index]; + double sf_x = sf[neighbor_index * 3 + 0]; + double sf_y = sf[neighbor_index * 3 + 1]; + double sf_z = sf[neighbor_index * 3 + 2]; + double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0]; + double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1]; + double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2]; + double face_x = (1 - w) * own_vf_x + w * neighbor_vf_x; + double face_y = (1 - w) * own_vf_y + w * neighbor_vf_y; + double face_z = (1 - w) * own_vf_z + w * neighbor_vf_z; + sum -= sf_x * face_x + sf_y * face_y + sf_z * face_z; + } + // upper + for (int i = diag_index + 1; i < row_elements; i++) + { + int neighbor_index = 
neighbor_offset + i - 1; + int neighbor_cell_id = csr_col_index[row_index + i]; + double w = tlambdas[neighbor_index]; + double sf_x = sf[neighbor_index * 3 + 0]; + double sf_y = sf[neighbor_index * 3 + 1]; + double sf_z = sf[neighbor_index * 3 + 2]; + double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0]; + double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1]; + double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2]; + double face_x = w * own_vf_x + (1 - w) * neighbor_vf_x; + double face_y = w * own_vf_y + (1 - w) * neighbor_vf_y; + double face_z = w * own_vf_z + (1 - w) * neighbor_vf_z; + sum += sf_x * face_x + sf_y * face_y + sf_z * face_z; + } + b_output[index] = b_input[index] + sum * sign; +} + +__global__ void eeqn_fvc_div_vector_boundary(int num_boundary_cells, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_sf, const double *boundary_vf, + const double sign, const double *b_input, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + // OpenFoam code + // Foam::surfaceInterpolationScheme::dotInterpolate + // if (vf.boundaryField()[pi].coupled()) + // { + // psf = + // pSf + // & ( + // pLambda*vf.boundaryField()[pi].patchInternalField() + // + (1.0 - pLambda)*vf.boundaryField()[pi].patchNeighbourField() + // ); + // } + // else + // { + // psf = pSf & vf.boundaryField()[pi]; + // } + // tmp> surfaceIntegrate + // forAll(mesh.boundary()[patchi], facei) + // { + // ivf[pFaceCells[facei]] += pssf[facei]; + // } + double sum = 0; + for (int i = cell_offset; i < next_cell_offset; i++) + { + double sf_x = boundary_sf[i * 3 + 0]; + double sf_y = boundary_sf[i * 3 + 1]; + double sf_z = boundary_sf[i * 3 + 2]; + double face_x = boundary_vf[i * 3 + 0]; + double face_y = boundary_vf[i * 3 + 1]; + double 
face_z = boundary_vf[i * 3 + 2]; + + // if not coupled + sum += (sf_x * face_x + sf_y * face_y + sf_z * face_z); + } + b_output[cell_index] = b_input[cell_index] + sum * sign; +} + +__global__ void eeqn_fvc_div_phi_scalar_internal(int num_cells, + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *weight, const double *phi, const double *K, + const double sign, const double *b_input, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int next_row_index = csr_row_index[index + 1]; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double own_cell_k = K[index]; + double interp = 0; + for (int i = row_index; i < next_row_index; i++) + { + int inner_index = i - row_index; + // lower + if (inner_index < diag_index) + { + int neighbor_index = neighbor_offset + inner_index; + double w = weight[neighbor_index]; + double p = phi[neighbor_index]; + int neighbor_cell_id = csr_col_index[row_index + inner_index]; + double neighbor_cell_k = K[neighbor_cell_id]; + double face_k = (1 - w) * own_cell_k + w * neighbor_cell_k; + interp -= p * face_k; + } + // upper + if (inner_index > diag_index) + { + int neighbor_index = neighbor_offset + inner_index - 1; + double w = weight[neighbor_index]; + double p = phi[neighbor_index]; + int neighbor_cell_id = csr_col_index[row_index + inner_index]; + double neighbor_cell_k = K[neighbor_cell_id]; + double face_k = w * own_cell_k + (1 - w) * neighbor_cell_k; + interp += p * face_k; + } + } + b_output[index] = b_input[index] + interp * sign; +} + +__global__ void eeqn_fvc_div_phi_scalar_boundary(int num_boundary_cells, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_phi, const double* boundary_K, + const double sign, const double *b_input, double *b_output) +{ 
+ int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + // boundary interplate + double interp = 0; + for (int i = cell_offset; i < next_cell_offset; i++) + { + interp += boundary_phi[i] * boundary_K[i]; + } + + b_output[cell_index] = b_input[cell_index] + interp * sign; +} + + +__global__ void eeqn_add_to_source_kernel(int num_cells, + const double sign_dpdt, const double *dpdt, + const double sign_diffAlphaD, const double *diffAlphaD, + const double *volume, + const double *b_input, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + b_output[index] = b_input[index] + sign_dpdt * dpdt[index] * volume[index] + sign_diffAlphaD * diffAlphaD[index] * volume[index]; +} + +__global__ void eeqn_boundaryPermutation(const int num_boundary_faces, const int *bouPermedIndex, + const double *boundary_K_init, const double *boundary_hDiffCorrFlux_init, + const double *boundary_alphaEff_init, + const double *value_internal_coeffs_init, const double *value_boundary_coeffs_init, + const double *gradient_internal_coeffs_init, const double *gradient_boundary_coeffs_init, + double *boundary_K, double *boundary_hDiffCorrFlux, + double *boundary_alphaEff, + double *value_internal_coeffs, double *value_boundary_coeffs, + double *gradient_internal_coeffs, double *gradient_boundary_coeffs) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_faces) + return; + + int p = bouPermedIndex[index]; + + boundary_K[index] = boundary_K_init[p]; + boundary_hDiffCorrFlux[index * 3 + 0] = boundary_hDiffCorrFlux_init[p * 3 + 0]; + boundary_hDiffCorrFlux[index * 3 + 1] = boundary_hDiffCorrFlux_init[p * 3 + 1]; + boundary_hDiffCorrFlux[index * 3 + 2] = boundary_hDiffCorrFlux_init[p * 3 + 2]; + 
+    boundary_alphaEff[index] = boundary_alphaEff_init[p];
+    value_internal_coeffs[index] = value_internal_coeffs_init[p];
+    value_boundary_coeffs[index] = value_boundary_coeffs_init[p];
+    gradient_internal_coeffs[index] = gradient_internal_coeffs_init[p];
+    gradient_boundary_coeffs[index] = gradient_boundary_coeffs_init[p];
+}
+
+// constructor
+dfEEqn::dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile)
+    : dataBase_(dataBase)
+{
+    ESolver = new AmgXSolver(modeStr, cfgFile);
+
+    num_cells = dataBase_.num_cells;
+    cell_bytes = dataBase_.cell_bytes;
+    num_faces = dataBase_.num_faces;
+    cell_vec_bytes = dataBase_.cell_vec_bytes;
+    csr_value_vec_bytes = dataBase_.csr_value_vec_bytes;
+    num_boundary_cells = dataBase_.num_boundary_cells;
+    num_surfaces = dataBase_.num_surfaces;
+    num_boundary_faces = dataBase_.num_boundary_faces;
+    boundary_face_bytes = dataBase_.boundary_face_bytes;
+
+    d_A_csr_row_index = dataBase_.d_A_csr_row_index;
+    d_A_csr_diag_index = dataBase_.d_A_csr_diag_index;
+    d_A_csr_col_index = dataBase_.d_A_csr_col_index;
+
+    h_A_csr = new double[(num_cells + num_faces) * 3];
+    h_b = new double[num_cells * 3];
+    cudaMallocHost(&h_he_new, cell_bytes);
+
+    checkCudaErrors(cudaMalloc((void **)&d_A_csr, csr_value_vec_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_b, cell_vec_bytes));
+
+    checkCudaErrors(cudaMalloc((void **)&d_he_old, cell_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_K, cell_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_K_old, cell_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_alphaEff, cell_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_dpdt, cell_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_diffAlphaD, cell_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_hDiffCorrFlux, cell_bytes * 3));
+
+    checkCudaErrors(cudaMalloc((void **)&d_boundary_K_init, boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_boundary_K, boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_boundary_hDiffCorrFlux_init, boundary_face_bytes * 3));
+    checkCudaErrors(cudaMalloc((void **)&d_boundary_hDiffCorrFlux, boundary_face_bytes * 3));
+    checkCudaErrors(cudaMalloc((void **)&d_boundary_alphaEff_init, boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_boundary_alphaEff, boundary_face_bytes));
+
+    checkCudaErrors(cudaMalloc((void **)&d_value_internal_coeffs_init, boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_value_boundary_coeffs_init, boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_gradient_internal_coeffs_init, boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_gradient_boundary_coeffs_init, boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_value_internal_coeffs, boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_value_boundary_coeffs, boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_gradient_internal_coeffs, boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_gradient_boundary_coeffs, boundary_face_bytes));
+
+    checkCudaErrors(cudaStreamCreate(&stream));
+}
+
+void dfEEqn::prepare_data(const double *he_old, const double *K, const double *K_old, const double *alphaEff,
+        const double *dpdt, const double *diffAlphaD, const double *hDiffCorrFlux,
+        const double *boundary_K, const double *boundary_hDiffCorrFlux, const double *boundary_alphaEff,
+        const double *valueInternalCoeffs, const double *valueBoundaryCoeffs,
+        const double *gradientInternalCoeffs, const double *gradientBoundaryCoeffs) {
+    // TODO not real async copy now, because some host array are not in pinned memory.
+
+    // copy the host input array in host memory to the device input array in device memory
+    checkCudaErrors(cudaMemcpyAsync(d_he_old, he_old, cell_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_K, K, cell_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_K_old, K_old, cell_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_alphaEff, alphaEff, cell_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_dpdt, dpdt, cell_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_diffAlphaD, diffAlphaD, cell_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_hDiffCorrFlux, hDiffCorrFlux, cell_bytes * 3, cudaMemcpyHostToDevice, stream));
+
+    // copy and permutate boundary variable
+    checkCudaErrors(cudaMemcpyAsync(d_boundary_K_init, boundary_K, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_boundary_hDiffCorrFlux_init, boundary_hDiffCorrFlux, boundary_face_bytes * 3, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_boundary_alphaEff_init, boundary_alphaEff, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_value_internal_coeffs_init, valueInternalCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_value_boundary_coeffs_init, valueBoundaryCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_gradient_internal_coeffs_init, gradientInternalCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_gradient_boundary_coeffs_init, gradientBoundaryCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block;
+    eeqn_boundaryPermutation<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_faces, dataBase_.d_bouPermedIndex,
+            d_boundary_K_init, d_boundary_hDiffCorrFlux_init, d_boundary_alphaEff_init,
+            d_value_internal_coeffs_init, d_value_boundary_coeffs_init,
+            d_gradient_internal_coeffs_init, d_gradient_boundary_coeffs_init,
+            d_boundary_K, d_boundary_hDiffCorrFlux, d_boundary_alphaEff,
+            d_value_internal_coeffs, d_value_boundary_coeffs,
+            d_gradient_internal_coeffs, d_gradient_boundary_coeffs);
+}
+
+void dfEEqn::resetAb() {
+    // initialize matrix value
+    checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, csr_value_vec_bytes, stream));
+    checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream));
+}
+
+void dfEEqn::fvm_ddt()
+{
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    eeqn_fvm_ddt_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, dataBase_.rdelta_t,
+            d_A_csr_row_index, d_A_csr_diag_index,
+            dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, d_he_old,
+            1., d_A_csr, d_b, d_A_csr, d_b);
+}
+
+void dfEEqn::fvm_div()
+{
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    eeqn_fvm_div_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
+            d_A_csr_row_index, d_A_csr_diag_index,
+            dataBase_.d_weight, dataBase_.d_phi,
+            1., d_A_csr, d_b, d_A_csr, d_b);
+    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
+    eeqn_fvm_div_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
+            d_A_csr_row_index, d_A_csr_diag_index,
+            dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+            d_value_internal_coeffs, d_value_boundary_coeffs,
+            1., d_A_csr, d_b, d_A_csr, d_b);
+}
+
+void dfEEqn::fvm_laplacian()
+{
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    eeqn_fvm_laplacian_uncorrected_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
+            d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, d_alphaEff, dataBase_.d_weight,
+            dataBase_.d_face, dataBase_.d_deltaCoeffs,
+            -1., d_A_csr, d_A_csr);
+
+    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
+    eeqn_fvm_laplacian_uncorrected_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
+            d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+            d_boundary_alphaEff, dataBase_.d_boundary_face, d_gradient_internal_coeffs, d_gradient_boundary_coeffs,
+            -1., d_A_csr, d_b, d_A_csr, d_b);
+}
+
+void dfEEqn::fvc_ddt()
+{
+    // " + fvc::ddt(rho,K)" is on the left side of "==", thus should minus from source.
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    eeqn_fvc_ddt_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, dataBase_.rdelta_t,
+            dataBase_.d_rho_old, dataBase_.d_rho_new, d_K_old, d_K, dataBase_.d_volume,
+            -1., d_b, d_b);
+}
+
+void dfEEqn::fvc_div_vector()
+{
+    // " + fvc::div(hDiffCorrFlux)" is on the right side of "==", thus should add to source.
+    size_t threads_per_block = 512;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    eeqn_fvc_div_vector_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
+            d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
+            dataBase_.d_face_vector, d_hDiffCorrFlux, dataBase_.d_weight,
+            1., d_b, d_b);
+    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
+    eeqn_fvc_div_vector_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
+            dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+            dataBase_.d_boundary_face_vector, d_boundary_hDiffCorrFlux,
+            1., d_b, d_b);
+}
+
+void dfEEqn::fvc_div_phi_scalar()
+{
+    // " + fvc::div(phi,K)" is on the left side of "==", thus should minus from source.
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    eeqn_fvc_div_phi_scalar_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
+            d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
+            dataBase_.d_weight, dataBase_.d_phi, d_K,
+            -1., d_b, d_b);
+    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
+    eeqn_fvc_div_phi_scalar_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
+            dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+            dataBase_.d_boundary_phi, d_boundary_K,
+            -1., d_b, d_b);
+}
+
+void dfEEqn::add_to_source()
+{
+    // " - dpdt" is on the left side of "==", thus should add to source.
+    // "+ diffAlphaD" is on the left side of "==", thus should minus from source.
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    // " + fvc::ddt(rho,K)" is on the left side of "==", thus should minus from source.
+    eeqn_add_to_source_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
+            1., d_dpdt, -1., d_diffAlphaD, dataBase_.d_volume, d_b, d_b);
+}
+
+void dfEEqn::checkValue(bool print)
+{
+    checkCudaErrors(cudaMemcpyAsync(h_A_csr, d_A_csr, csr_value_vec_bytes, cudaMemcpyDeviceToHost, stream));
+    checkCudaErrors(cudaMemcpyAsync(h_b, d_b, cell_vec_bytes, cudaMemcpyDeviceToHost, stream));
+
+    // Synchronize stream
+    checkCudaErrors(cudaStreamSynchronize(stream));
+    if (print)
+    {
+        for (int i = 0; i < (num_faces + num_cells); i++)
+            fprintf(stderr, "h_A_csr[%d]: %.16lf\n", i, h_A_csr[i]);
+        for (int i = 0; i < num_cells; i++)
+            fprintf(stderr, "h_b[%d]: %.16lf\n", i, h_b[i]);
+    }
+
+    const char *input_file = "of_output.txt";
+    FILE *fp = fopen(input_file, "rb+");
+    if (fp == NULL)
+    {
+        fprintf(stderr, "Failed to open input file: %s!\n", input_file);
+    }
+    int readfile = 0;
+    double *of_b = new double[num_cells];
+    double *of_A = new double[num_faces + num_cells];
+    readfile = fread(of_b, num_cells * sizeof(double), 1, fp);
+    readfile = fread(of_A, (num_faces + num_cells) * sizeof(double), 1, fp);
+
+    std::vector<double> h_A_of_init_vec(num_cells + num_faces);
+    std::copy(of_A, of_A + num_cells + num_faces, h_A_of_init_vec.begin());
+
+    std::vector<double> h_A_of_vec_1mtx(num_faces + num_cells, 0);
+    for (int i = 0; i < num_faces + num_cells; i++)
+    {
+        h_A_of_vec_1mtx[i] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i]];
+    }
+
+    // b
+    std::vector<double> h_b_of_vec(num_cells);
+    std::copy(of_b, of_b + num_cells, h_b_of_vec.begin());
+
+    if (print)
+    {
+        for (int i = 0; i < (num_faces + num_cells); i++)
+            fprintf(stderr, "h_A_of_vec_1mtx[%d]: %.16lf\n", i, h_A_of_vec_1mtx[i]);
+        for (int i = 0; i < num_cells; i++)
+            fprintf(stderr, "h_b_of_vec[%d]: %.16lf\n", i, h_b_of_vec[i]);
+    }
+
+    // check
+    fprintf(stderr, "check of h_A_csr\n");
+    checkVectorEqual(num_faces + num_cells, h_A_of_vec_1mtx.data(), h_A_csr, 1e-6);
+    fprintf(stderr, "check of h_b\n");
+    checkVectorEqual(num_cells, h_b_of_vec.data(), h_b, 1e-6);
+}
+
+void dfEEqn::solve()
+{
+    checkCudaErrors(cudaStreamSynchronize(stream));
+    // nvtxRangePush("solve");
+
+    int nNz = num_cells + num_faces; // matrix entries
+    if (num_iteration == 0) // first iteration
+    {
+        printf("Initializing AmgX Linear Solver\n");
+        ESolver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr);
+    }
+    else
+    {
+        ESolver->updateOperator(num_cells, nNz, d_A_csr);
+    }
+    ESolver->solve(num_cells, d_he_old, d_b);
+    num_iteration++;
+
+    checkCudaErrors(cudaMemcpyAsync(h_he_new, d_he_old, cell_bytes, cudaMemcpyDeviceToHost, stream));
+    // checkCudaErrors(cudaStreamSynchronize(stream));
+    // for (size_t i = 0; i < num_cells; i++)
+    //     fprintf(stderr, "h_he_after[%d]: %.16lf\n", i, h_he_new[i]);
+}
+
+void dfEEqn::sync()
+{
+    checkCudaErrors(cudaStreamSynchronize(stream));
+}
+
+void dfEEqn::updatePsi(double *Psi)
+{
+    for (size_t i = 0; i < num_cells; i++)
+    {
+        Psi[i] = h_he_new[i];
+    }
+}
+
+dfEEqn::~dfEEqn()
+{
+    delete[] h_A_csr;
+    delete[] h_b;
+
+    checkCudaErrors(cudaFreeHost(h_he_new));
+
+    checkCudaErrors(cudaFree(d_A_csr));
+    checkCudaErrors(cudaFree(d_b));
+
+    checkCudaErrors(cudaFree(d_he_old));
+    checkCudaErrors(cudaFree(d_K));
+    checkCudaErrors(cudaFree(d_K_old));
+    checkCudaErrors(cudaFree(d_alphaEff));
+    checkCudaErrors(cudaFree(d_dpdt));
+    checkCudaErrors(cudaFree(d_diffAlphaD));
+    checkCudaErrors(cudaFree(d_hDiffCorrFlux));
+
+    checkCudaErrors(cudaFree(d_boundary_K_init));
+    checkCudaErrors(cudaFree(d_boundary_K));
+    checkCudaErrors(cudaFree(d_boundary_hDiffCorrFlux_init));
+    checkCudaErrors(cudaFree(d_boundary_hDiffCorrFlux));
+    checkCudaErrors(cudaFree(d_boundary_alphaEff_init));
+    checkCudaErrors(cudaFree(d_boundary_alphaEff));
+
+    checkCudaErrors(cudaFree(d_value_internal_coeffs_init));
+    checkCudaErrors(cudaFree(d_value_boundary_coeffs_init));
+    checkCudaErrors(cudaFree(d_gradient_internal_coeffs_init));
+    checkCudaErrors(cudaFree(d_gradient_boundary_coeffs_init));
+    checkCudaErrors(cudaFree(d_value_internal_coeffs));
+    checkCudaErrors(cudaFree(d_value_boundary_coeffs));
+    checkCudaErrors(cudaFree(d_gradient_internal_coeffs));
+    checkCudaErrors(cudaFree(d_gradient_boundary_coeffs));
+}
diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu
index 4afebd8c1..350befb50 100644
--- a/src_gpu/dfUEqn.cu
+++ b/src_gpu/dfUEqn.cu
@@ -181,7 +181,7 @@ __global__ void fvc_grad_internal_face(int num_cells,
         double sfz = face_vector[neighbor_index * 3 + 2];
         int neighbor_cell_id = csr_col_index[row_index + inner_index];
         double neighbor_cell_p = pressure[neighbor_cell_id];
-        double face_p = (1 - w) * own_cell_p + w * neighbor_cell_p;
+        double face_p = w * own_cell_p + (1 - w) * neighbor_cell_p;
         grad_bx_upp += face_p * sfx;
         grad_by_upp += face_p * sfy;
         grad_bz_upp += face_p * sfz;
@@ -1206,4 +1206,4 @@ void dfUEqn::updatePsi(double *Psi)
 
 dfUEqn::~dfUEqn()
 {
-}
\ No newline at end of file
+}