From f827b9d413f09cec4fa14b8483099bba418b0f74 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Thu, 8 Jun 2023 18:12:33 +0800 Subject: [PATCH 1/6] add GPU version of hDiffCorrFlux and diffAlphaD in YEqn, merge branch GPU and fix conflict --- applications/solvers/dfLowMachFoam/EEqn.H | 5 +- applications/solvers/dfLowMachFoam/YEqn.H | 38 +- .../solvers/dfLowMachFoam/createdfSolver.H | 3 +- src_gpu/dfEEqn.H | 7 +- src_gpu/dfEEqn.cu | 40 +- src_gpu/dfMatrixDataBase.H | 8 + src_gpu/dfYEqn.H | 23 +- src_gpu/dfYEqn.cu | 385 ++++++++++++------ 8 files changed, 321 insertions(+), 188 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H index 8e8c0bee0..764b6be10 100644 --- a/applications/solvers/dfLowMachFoam/EEqn.H +++ b/applications/solvers/dfLowMachFoam/EEqn.H @@ -49,10 +49,8 @@ int patchSize = pw.size(); const scalarField& patchK = K.boundaryField()[patchi]; - const vectorField& patchhDiffCorrFlux = hDiffCorrFlux.boundaryField()[patchi]; const scalarField& patchAlphaEff = alphaEff.boundaryField()[patchi]; memcpy(boundary_K + eeqn_offset, &patchK[0], patchSize*sizeof(double)); - memcpy(boundary_hDiffCorrFlux + eeqn_offset * 3, &patchhDiffCorrFlux[0][0], 3 * patchSize*sizeof(double)); memcpy(boundary_alphaEff + eeqn_offset, &patchAlphaEff[0], patchSize*sizeof(double)); eeqn_offset += patchSize; @@ -63,8 +61,7 @@ // prepare data on GPU start1 = std::clock(); EEqn_GPU.prepare_data(&he.oldTime()[0], &K[0], &K.oldTime()[0], &alphaEff[0], - &dpdt[0], &diffAlphaD[0], &hDiffCorrFlux[0][0], - boundary_K, boundary_hDiffCorrFlux, boundary_alphaEff); + &dpdt[0], boundary_K, boundary_alphaEff); if (doSync) EEqn_GPU.sync(); end1 = std::clock(); time_monitor_EEqn_mtxAssembly_GPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H index c741a8f48..baa12633e 100644 --- a/applications/solvers/dfLowMachFoam/YEqn.H +++ b/applications/solvers/dfLowMachFoam/YEqn.H @@ -14,6 +14,7 @@ tmp> mvConvection ) ); +#ifdef CPUSolver_ start1 = std::clock(); forAll(Y, i) { @@ -23,6 +24,7 @@ const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf(); start1 = std::clock(); time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); +#endif #ifdef GPUSolver_ start1 = std::clock(); @@ -36,47 +38,56 @@ time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); #ifdef GPUSolver_ start1 = std::clock(); start2 = std::clock(); - std::vector Y_old(Y.size()), boundary_Y_init(Y.size()), boundary_rhoD_init(Y.size()); - std::vector rhoD_GPU(Y.size()); + std::vector Y_old(Y.size()), boundary_Y(Y.size()), boundary_hai(Y.size()), boundary_rhoD(Y.size()); + std::vector hai(Y.size()), rhoD(Y.size()); for (size_t i = 0; i < Y.size(); ++i) { volScalarField& Yi = Y[i]; - const volScalarField& rhoDi = chemistry->rhoD(i); Y_old[i] = &Yi.oldTime()[0]; - rhoD_GPU[i] = &chemistry->rhoD(i)[0]; - cudaMallocHost(&boundary_Y_init[i], num_boundary_faces*sizeof(double)); - cudaMallocHost(&boundary_rhoD_init[i], num_boundary_faces*sizeof(double)); + cudaMallocHost(&boundary_Y[i], num_boundary_faces*sizeof(double)); + const volScalarField& haii = chemistry->hai(i); + const volScalarField& rhoDi = chemistry->rhoD(i); + hai[i] = &haii[0]; + rhoD[i] = &rhoDi[0]; + cudaMallocHost(&boundary_hai[i], num_boundary_faces*sizeof(double)); + cudaMallocHost(&boundary_rhoD[i], num_boundary_faces*sizeof(double)); int offset = 0; forAll(Yi.boundaryField(), patchi) { const scalarField& patchYi = Yi.boundaryField()[patchi]; + const scalarField& patchHaii = haii.boundaryField()[patchi]; const scalarField& patchRhoDi = rhoDi.boundaryField()[patchi]; int patchSize = patchYi.size(); - memcpy(boundary_Y_init[i]+offset, &patchYi[0], patchSize*sizeof(double)); - memcpy(boundary_rhoD_init[i]+offset, &patchRhoDi[0], patchSize*sizeof(double)); + memcpy(boundary_Y[i] + offset, &patchYi[0], patchSize*sizeof(double)); + memcpy(boundary_hai[i] + offset, &patchHaii[0], patchSize*sizeof(double)); + memcpy(boundary_rhoD[i] + offset, &patchRhoDi[0], patchSize*sizeof(double)); offset += patchSize; } } volScalarField mut_sct = turbulence->mut().ref()/Sct; - std::vector boundary_mutsct; + double *boundary_mutsct = nullptr; + cudaMallocHost(&boundary_mutsct, num_boundary_faces*sizeof(double)); + int offset = 0; forAll(p.boundaryField(), patchi) { const scalarField& patchMut_sct = mut_sct.boundaryField()[patchi]; int patchSize = patchMut_sct.size(); - boundary_mutsct.insert(boundary_mutsct.end(), &patchMut_sct[0], &patchMut_sct[0] + patchSize); + memcpy(boundary_mutsct + offset, &patchMut_sct[0], patchSize*sizeof(double)); + offset += patchSize; } end2 = std::clock(); time_monitor_YEqn_mtxAssembly_CPU_Prepare += double(end2 - start2) / double(CLOCKS_PER_SEC); YEqn_GPU.upwindWeight(); - YEqn_GPU.correctVelocity(Y_old, boundary_Y_init, rhoD_GPU); + YEqn_GPU.correctVelocity(Y_old, boundary_Y, hai, boundary_hai, rhoD, boundary_rhoD); YEqn_GPU.fvm_ddt(); YEqn_GPU.fvm_div_phi(); YEqn_GPU.fvm_div_phiUc(); - YEqn_GPU.fvm_laplacian(&mut_sct[0], boundary_mutsct.data(), boundary_rhoD_init); + YEqn_GPU.fvm_laplacian(&mut_sct[0], boundary_mutsct, rhoD, boundary_rhoD); + YEqn_GPU.compute_diffAlphaD(&thermo.alpha()[0], hai); end1 = std::clock(); time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); @@ -115,9 +126,10 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC); forAll(Y, i) { volScalarField& Yi = Y[i]; +#ifdef CPUSolver_ hDiffCorrFlux += chemistry->hai(i)*(chemistry->rhoD(i)*fvc::grad(Yi) - Yi*sumYDiffError); diffAlphaD += fvc::laplacian(thermo.alpha()*chemistry->hai(i), Yi); - +#endif if (i != inertIndex) { #ifdef GPUSolver_ diff --git a/applications/solvers/dfLowMachFoam/createdfSolver.H b/applications/solvers/dfLowMachFoam/createdfSolver.H index 880ae674c..e56376b28 100644 --- a/applications/solvers/dfLowMachFoam/createdfSolver.H +++ b/applications/solvers/dfLowMachFoam/createdfSolver.H @@ -49,10 +49,9 @@ cudaMallocHost(&boundary_nuEff_init, num_boundary_faces*sizeof(double)); cudaMallocHost(&boundary_rho_init, num_boundary_faces*sizeof(double)); cudaMallocHost(&boundary_phi_init, num_boundary_faces*sizeof(double)); -double *boundary_alphaEff, *boundary_K, *boundary_hDiffCorrFlux; +double *boundary_alphaEff, *boundary_K; double *eeqn_valueInternalCoeffs, *eeqn_valueBoundaryCoeffs, *eeqn_gradientInternalCoeffs, *eeqn_gradientBoundaryCoeffs; cudaMallocHost(&boundary_K, num_boundary_faces*sizeof(double)); -cudaMallocHost(&boundary_hDiffCorrFlux, 3 * num_boundary_faces*sizeof(double)); cudaMallocHost(&boundary_alphaEff, num_boundary_faces*sizeof(double)); cudaMallocHost(&eeqn_valueInternalCoeffs, num_boundary_faces*sizeof(double)); cudaMallocHost(&eeqn_valueBoundaryCoeffs, num_boundary_faces*sizeof(double)); diff --git a/src_gpu/dfEEqn.H b/src_gpu/dfEEqn.H index 483c44bb6..732e32ca7 100644 --- a/src_gpu/dfEEqn.H +++ b/src_gpu/dfEEqn.H @@ -28,12 +28,8 @@ private: double *d_K = nullptr; double *d_K_old = nullptr; double *d_dpdt = nullptr; - double *d_diffAlphaD = nullptr; - double *d_hDiffCorrFlux = nullptr; double *d_boundary_K_init = nullptr; double *d_boundary_K = nullptr; - double *d_boundary_hDiffCorrFlux_init = nullptr; - double *d_boundary_hDiffCorrFlux = nullptr; double *d_boundary_alphaEff_init = nullptr; double *d_boundary_alphaEff = nullptr; double *d_value_internal_coeffs_init = nullptr; @@ -51,8 +47,7 @@ public: ~dfEEqn(); void prepare_data(const double *he_old, const double *K, const double *K_old, const double *alphaEff, - const double *hDiffCorrFlux, const double *dpdt, const double *diffAlphaD, - const double *boundary_K, const double *boundary_hDiffCorrFlux, const double *boundary_alphaEff); + const double *dpdt, const double *boundary_K, const double *boundary_alphaEff); void initializeTimeStep(); diff --git a/src_gpu/dfEEqn.cu b/src_gpu/dfEEqn.cu index 2ef941f3b..48d6a19dc 100644 --- a/src_gpu/dfEEqn.cu +++ b/src_gpu/dfEEqn.cu @@ -398,9 +398,9 @@ __global__ void eeqn_add_to_source_kernel(int num_cells, } __global__ void eeqn_boundaryPermutation(const int num_boundary_faces, const int *bouPermedIndex, - const double *boundary_K_init, const double *boundary_hDiffCorrFlux_init, + const double *boundary_K_init, const double *boundary_alphaEff_init, - double *boundary_K, double *boundary_hDiffCorrFlux, + double *boundary_K, double *boundary_alphaEff) { int index = blockDim.x * blockIdx.x + threadIdx.x; @@ -410,9 +410,6 @@ __global__ void eeqn_boundaryPermutation(const int num_boundary_faces, const int int p = bouPermedIndex[index]; boundary_K[index] = boundary_K_init[p]; - boundary_hDiffCorrFlux[index * 3 + 0] = boundary_hDiffCorrFlux_init[p * 3 + 0]; - boundary_hDiffCorrFlux[index * 3 + 1] = boundary_hDiffCorrFlux_init[p * 3 + 1]; - boundary_hDiffCorrFlux[index * 3 + 2] = boundary_hDiffCorrFlux_init[p * 3 + 2]; boundary_alphaEff[index] = boundary_alphaEff_init[p]; } @@ -468,13 +465,9 @@ dfEEqn::dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std checkCudaErrors(cudaMalloc((void **)&d_K_old, cell_bytes)); checkCudaErrors(cudaMalloc((void **)&d_alphaEff, cell_bytes)); checkCudaErrors(cudaMalloc((void **)&d_dpdt, cell_bytes)); - checkCudaErrors(cudaMalloc((void **)&d_diffAlphaD, cell_bytes)); - checkCudaErrors(cudaMalloc((void **)&d_hDiffCorrFlux, cell_bytes * 3)); checkCudaErrors(cudaMalloc((void **)&d_boundary_K_init, boundary_face_bytes)); checkCudaErrors(cudaMalloc((void **)&d_boundary_K, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void **)&d_boundary_hDiffCorrFlux_init, boundary_face_bytes * 3)); - checkCudaErrors(cudaMalloc((void **)&d_boundary_hDiffCorrFlux, boundary_face_bytes * 3)); checkCudaErrors(cudaMalloc((void **)&d_boundary_alphaEff_init, boundary_face_bytes)); checkCudaErrors(cudaMalloc((void **)&d_boundary_alphaEff, boundary_face_bytes)); @@ -491,8 +484,7 @@ dfEEqn::dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std } void dfEEqn::prepare_data(const double *he_old, const double *K, const double *K_old, const double *alphaEff, - const double *dpdt, const double *diffAlphaD, const double *hDiffCorrFlux, - const double *boundary_K, const double *boundary_hDiffCorrFlux, const double *boundary_alphaEff) + const double *dpdt, const double *boundary_K, const double *boundary_alphaEff) { // TODO not real async copy now, because some host array are not in pinned memory. @@ -502,19 +494,16 @@ void dfEEqn::prepare_data(const double *he_old, const double *K, const double *K checkCudaErrors(cudaMemcpyAsync(d_K_old, K_old, cell_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_alphaEff, alphaEff, cell_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_dpdt, dpdt, cell_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_diffAlphaD, diffAlphaD, cell_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_hDiffCorrFlux, hDiffCorrFlux, cell_bytes * 3, cudaMemcpyHostToDevice, stream)); // copy and permutate boundary variable checkCudaErrors(cudaMemcpyAsync(d_boundary_K_init, boundary_K, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_hDiffCorrFlux_init, boundary_hDiffCorrFlux, boundary_face_bytes * 3, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_boundary_alphaEff_init, boundary_alphaEff, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); size_t threads_per_block = 1024; size_t blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block; eeqn_boundaryPermutation<<>>(num_boundary_faces, dataBase_.d_bouPermedIndex, - d_boundary_K_init, d_boundary_hDiffCorrFlux_init, d_boundary_alphaEff_init, - d_boundary_K, d_boundary_hDiffCorrFlux, d_boundary_alphaEff); + d_boundary_K_init, d_boundary_alphaEff_init, + d_boundary_K, d_boundary_alphaEff); } void dfEEqn::initializeTimeStep() @@ -587,14 +576,14 @@ void dfEEqn::fvc_div_vector() size_t threads_per_block = 512; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; eeqn_fvc_div_vector_internal<<>>(num_cells, - d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - dataBase_.d_face_vector, d_hDiffCorrFlux, dataBase_.d_weight, - 1., d_b, d_b); + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + dataBase_.d_face_vector, dataBase_.d_hDiffCorrFlux, dataBase_.d_weight, + 1., d_b, d_b); blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; eeqn_fvc_div_vector_boundary<<>>(num_boundary_cells, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - dataBase_.d_boundary_face_vector, d_boundary_hDiffCorrFlux, - 1., d_b, d_b); + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_boundary_face_vector, dataBase_.d_boundary_hDiffCorrFlux, + 1., d_b, d_b); } void dfEEqn::fvc_div_phi_scalar() @@ -621,7 +610,8 @@ void dfEEqn::add_to_source() size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; // " + fvc::ddt(rho,K)" is on the left side of "==", thus should minus from source. eeqn_add_to_source_kernel<<>>(num_cells, - 1., d_dpdt, -1., d_diffAlphaD, dataBase_.d_volume, d_b, d_b); + //1., d_dpdt, -1., d_diffAlphaD, dataBase_.d_volume, d_b, d_b); + 1., d_dpdt, -1., dataBase_.d_diffAlphaD, dataBase_.d_volume, d_b, d_b); } void dfEEqn::checkValue(bool print) @@ -728,13 +718,9 @@ dfEEqn::~dfEEqn() checkCudaErrors(cudaFree(d_K_old)); checkCudaErrors(cudaFree(d_alphaEff)); checkCudaErrors(cudaFree(d_dpdt)); - checkCudaErrors(cudaFree(d_diffAlphaD)); - checkCudaErrors(cudaFree(d_hDiffCorrFlux)); checkCudaErrors(cudaFree(d_boundary_K_init)); checkCudaErrors(cudaFree(d_boundary_K)); - checkCudaErrors(cudaFree(d_boundary_hDiffCorrFlux_init)); - checkCudaErrors(cudaFree(d_boundary_hDiffCorrFlux)); checkCudaErrors(cudaFree(d_boundary_alphaEff_init)); checkCudaErrors(cudaFree(d_boundary_alphaEff)); diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 2fa79b13f..778d7c56c 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -101,6 +101,9 @@ struct dfMatrixDataBase double *d_deltaCoeffs_init = nullptr, *d_deltaCoeffs = nullptr; std::vector d_rhoD_vector; + double *d_hDiffCorrFlux = nullptr; + double *d_diffAlphaD = nullptr; + double rdelta_t = 1/1e-6; /** @@ -131,6 +134,8 @@ struct dfMatrixDataBase std::vector d_boundary_rhoD_vector; double *d_boundary_mut_sct = nullptr; + double *d_boundary_hDiffCorrFlux = nullptr; + std::vector boundPermutationList; std::vector ueqn_internalCoeffs, ueqn_boundaryCoeffs; std::vector boundary_face_vector; @@ -562,6 +567,9 @@ struct dfMatrixDataBase checkCudaErrors(cudaMalloc((void**)&d_grad_boundary_init, boundary_cell_bytes * 9)); total_bytes += (num_cells * 9 * sizeof(double) + boundary_face_bytes * 9 + boundary_cell_bytes * 9); // FIXME: rename + checkCudaErrors(cudaMalloc((void**)&d_hDiffCorrFlux, 3 * cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_diffAlphaD, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_hDiffCorrFlux, 3 * boundary_face_bytes)); fprintf(stderr, "Total bytes malloc on GPU: %.2fMB\n", total_bytes * 1.0 / 1024 / 1024); diff --git a/src_gpu/dfYEqn.H b/src_gpu/dfYEqn.H index 260baf4f2..6e2a408e4 100644 --- a/src_gpu/dfYEqn.H +++ b/src_gpu/dfYEqn.H @@ -16,7 +16,7 @@ private: double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test; // common variables - int num_cells, cell_bytes, num_faces, num_surfaces, num_boundary_cells, num_boundary_faces, num_species, inertIndex; + int num_cells, cell_bytes, num_faces, num_surfaces, num_boundary_cells, num_boundary_faces, num_species, boundary_face_bytes, inertIndex; int *d_A_csr_row_index, *d_A_csr_diag_index, *d_A_csr_col_index; // Matrix variables @@ -27,6 +27,18 @@ private: double *d_phiUc_boundary = nullptr; double *d_phiUc = nullptr; double *d_mut_Sct = nullptr; + double *d_boundary_mut_sct = nullptr; + + std::vector d_Y_old; + double *d_boundary_Y = nullptr; + double *d_hai, *d_boundary_hai = nullptr; + double *d_rhoD, *d_boundary_rhoD = nullptr; + double *d_sum_rhoD_grady, *d_sum_boundary_rhoD_grady = nullptr; + double *d_sum_hai_rhoD_grady, *d_sum_boundary_hai_rhoD_grady = nullptr; + double *d_sum_hai_y, *d_sum_boundary_hai_y = nullptr; + double *d_grady, *d_boundary_grady = nullptr; + + double *d_alpha = nullptr; public: dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile, const int inertIndex); @@ -35,7 +47,9 @@ public: void checkValue(bool print, char *filename); - void correctVelocity(std::vector Y_old, std::vector boundary_Y_init, std::vector rhoD_GPU); + void correctVelocity(std::vector Y_old, std::vector boundary_Y, + std::vector hai, std::vector boundary_hai, + std::vector rhoD, std::vector boundary_rhoD); void upwindWeight(); @@ -45,7 +59,10 @@ public: void fvm_div_phiUc(); - void fvm_laplacian(double *mut_Sct, double *boundary_mut_Sct, std::vectorboundary_rhoD); + void fvm_laplacian(const double *mut_Sct, const double *boundary_mut_Sct, + std::vectorrhoD, std::vectorboundary_rhoD); + + void compute_diffAlphaD(const double *alpha, std::vector hai); void solve(); diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu index 9ef75747f..a9ede60ec 100644 --- a/src_gpu/dfYEqn.cu +++ b/src_gpu/dfYEqn.cu @@ -12,10 +12,10 @@ __global__ void getUpwindWeight(int num_faces, double *phi, double *weight) weight[index] = 0.; } -__global__ void fvc_grad_internal_face_Y(int num_cells, - const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, const double *volume, - const double *face_vector, const double *weight, const double *species, const double *rhoD, - const double *sumYDiffError, double *sumYDiffError_output) +__global__ void fvc_grad_internal(int num_cells, + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *face_vector, const double *weight, const double *species, + const double *volume, double *grady) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_cells) @@ -28,12 +28,9 @@ __global__ void fvc_grad_internal_face_Y(int num_cells, int neighbor_offset = csr_row_index[index] - index; double own_cell_Y = species[index]; - double grad_bx_upper = 0; - double grad_by_upper = 0; - double grad_bz_upper = 0; - double grad_bx_lower = 0; - double grad_by_lower = 0; - double grad_bz_lower = 0; + double grad_bx = 0; + double grad_by = 0; + double grad_bz = 0; for (int i = row_index; i < next_row_index; i++) { int inner_index = i - row_index; @@ -48,9 +45,9 @@ __global__ void fvc_grad_internal_face_Y(int num_cells, int neighbor_cell_id = csr_col_index[row_index + inner_index]; double neighbor_cell_Y = species[neighbor_cell_id]; double face_Y = w * (neighbor_cell_Y - own_cell_Y) + own_cell_Y; - grad_bx_lower -= face_Y * sfx; - grad_by_lower -= face_Y * sfy; - grad_bz_lower -= face_Y * sfz; + grad_bx -= face_Y * sfx; + grad_by -= face_Y * sfy; + grad_bz -= face_Y * sfz; } // upper if (inner_index > diag_index) @@ -63,29 +60,21 @@ __global__ void fvc_grad_internal_face_Y(int num_cells, int neighbor_cell_id = csr_col_index[row_index + inner_index]; double neighbor_cell_Y = species[neighbor_cell_id]; double face_Y = w * (own_cell_Y - neighbor_cell_Y) + neighbor_cell_Y; - grad_bx_upper += face_Y * sfx; - grad_by_upper += face_Y * sfy; - grad_bz_upper += face_Y * sfz; + grad_bx += face_Y * sfx; + grad_by += face_Y * sfy; + grad_bz += face_Y * sfz; } } double vol = volume[index]; - - sumYDiffError_output[index * 3 + 0] = sumYDiffError[index * 3 + 0] + (grad_bx_upper + grad_bx_lower); - sumYDiffError_output[index * 3 + 1] = sumYDiffError[index * 3 + 1] + (grad_by_upper + grad_by_lower); - sumYDiffError_output[index * 3 + 2] = sumYDiffError[index * 3 + 2] + (grad_bz_upper + grad_bz_lower); - - // if (index == 0) - // { - // printf("grad_bz_upper = %e\n", grad_bz_upper); - // printf("grad_bz_lower = %e\n", grad_bz_lower); - // printf("(grad_bz_upper + grad_bz_lower) = %e\n", (grad_bz_upper + grad_bz_lower)); - // } + grady[index * 3 + 0] = grad_bx / vol; + grady[index * 3 + 1] = grad_by / vol; + grady[index * 3 + 2] = grad_bz / vol; } -__global__ void fvc_grad_boundary_face_Y(int num_cells, int num_boundary_cells, - const int *boundary_cell_offset, const int *boundary_cell_id, const double *rhoD, const int *bouPermedIndex, - const double *boundary_face_vector, const double *boundary_species_init, double *boundary_species, - const double *volume, const double *sumYDiffError, double *sumYDiffError_output) +__global__ void fvc_grad_boundary(int num_boundary_cells, + const int *boundary_cell_offset, const int *boundary_cell_id, const int *bouPermedIndex, + const double *boundary_face_vector, const double *boundary_species_init, + const double *volume, double *grady) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_boundary_cells) @@ -106,62 +95,21 @@ __global__ void fvc_grad_boundary_face_Y(int num_cells, int num_boundary_cells, double sfz = boundary_face_vector[i * 3 + 2]; int permute_index = bouPermedIndex[i]; double face_Y = boundary_species_init[permute_index]; - boundary_species[i] = face_Y; grad_bx += face_Y * sfx; grad_by += face_Y * sfy; grad_bz += face_Y * sfz; - // if (index == 0) - // { - // printf("face_Y = %e\n", face_Y); - // printf("sfz = %e\n", sfz); - // printf("face_Y * sfz = %e\n", face_Y * sfz); - // } } - //// correct the boundary gradient - // double nx = boundary_face_vector[face_index * 3 + 0] / magSf[face_index]; - // double ny = boundary_face_vector[face_index * 3 + 1] / magSf[face_index]; - // double nz = boundary_face_vector[face_index * 3 + 2] / magSf[face_index]; - // double sn_grad = 0; - // double grad_correction = sn_grad * volume[cell_index] - (nx * grad_bx + ny * grad_by + nz * grad_bz); - // grad_bx += nx * grad_correction; - // grad_by += ny * grad_correction; - // grad_bz += nz * grad_correction; - - sumYDiffError_output[cell_index * 3 + 0] = sumYDiffError[cell_index * 3 + 0] + grad_bx; - sumYDiffError_output[cell_index * 3 + 1] = sumYDiffError[cell_index * 3 + 1] + grad_by; - sumYDiffError_output[cell_index * 3 + 2] = sumYDiffError[cell_index * 3 + 2] + grad_bz; -} - -__global__ void sumError(int num_cells, const double *volume, const double *rhoD, - const double *sumYDiffErrorTmp, double *sumYDiffError_output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - sumYDiffError_output[index * 3 + 0] = sumYDiffError_output[index * 3 + 0] + sumYDiffErrorTmp[index * 3 + 0] * rhoD[index]; - sumYDiffError_output[index * 3 + 1] = sumYDiffError_output[index * 3 + 1] + sumYDiffErrorTmp[index * 3 + 1] * rhoD[index]; - sumYDiffError_output[index * 3 + 2] = sumYDiffError_output[index * 3 + 2] + sumYDiffErrorTmp[index * 3 + 2] * rhoD[index]; -} - -__global__ void divide_vol(int num_cells, const double *volume, - const double *sumYDiffError, double *sumYDiffError_output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; double vol = volume[index]; - - sumYDiffError_output[index * 3 + 0] = sumYDiffError[index * 3 + 0] / vol; - sumYDiffError_output[index * 3 + 1] = sumYDiffError[index * 3 + 1] / vol; - sumYDiffError_output[index * 3 + 2] = sumYDiffError[index * 3 + 2] / vol; + grady[cell_index * 3 + 0] += grad_bx / vol; + grady[cell_index * 3 + 1] += grad_by / vol; + grady[cell_index * 3 + 2] += grad_bz / vol; } -__global__ void correct_boundary_conditions_vec(int num_boundary_cells, +__global__ void correct_boundary_conditions(int num_boundary_cells, const int *boundary_cell_offset, const int *boundary_cell_id, const double *boundary_sf, const double *mag_sf, - double *boundary_sumYDiffError, double *sumYDiffError) + const double *grady, double* boundary_grady) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_boundary_cells) @@ -172,9 +120,9 @@ __global__ void correct_boundary_conditions_vec(int num_boundary_cells, int cell_index = boundary_cell_id[cell_offset]; // initialize boundary_sumYDiffError - double sumYDiffError_x = sumYDiffError[cell_index * 3 + 0]; - double sumYDiffError_y = sumYDiffError[cell_index * 3 + 1]; - double sumYDiffError_z = sumYDiffError[cell_index * 3 + 2]; + double grady_x = grady[cell_index * 3 + 0]; + double grady_y = grady[cell_index * 3 + 1]; + double grady_z = grady[cell_index * 3 + 2]; for (int i = cell_offset; i < next_cell_offset; i++) { @@ -182,15 +130,67 @@ __global__ void correct_boundary_conditions_vec(int num_boundary_cells, double n_y = boundary_sf[i * 3 + 1] / mag_sf[i]; double n_z = boundary_sf[i * 3 + 2] / mag_sf[i]; double sn_grad = 0; - double grad_correction = sn_grad - (n_x * sumYDiffError_x + n_y * sumYDiffError_y + n_z * sumYDiffError_z); - boundary_sumYDiffError[i * 3 + 0] = sumYDiffError_x + grad_correction * n_x; - boundary_sumYDiffError[i * 3 + 1] = sumYDiffError_y + grad_correction * n_y; - boundary_sumYDiffError[i * 3 + 2] = sumYDiffError_z + grad_correction * n_z; + double grad_correction = sn_grad - (n_x * grady_x + n_y * grady_y + n_z * grady_z); + boundary_grady[i * 3 + 0] = grady_x + grad_correction * n_x; + boundary_grady[i * 3 + 1] = grady_y + grad_correction * n_y; + boundary_grady[i * 3 + 2] = grady_z + grad_correction * n_z; } } -__global__ void calculate_phiUc(int num_cells, const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, - const double *face_vector, const double *weight, const double *sumYDiffError, double *phiUc) +__global__ void sumError_internal(int num_cells, + const double *hai, const double *rhoD, const double *y, const double *grady, + double *sum_hai_rhoD_grady, double *sum_rhoD_grady, double *sum_hai_y) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + sum_hai_rhoD_grady[index * 3 + 0] += hai[index] * rhoD[index] * grady[index * 3 + 0]; + sum_hai_rhoD_grady[index * 3 + 1] += hai[index] * rhoD[index] * grady[index * 3 + 1]; + sum_hai_rhoD_grady[index * 3 + 2] += hai[index] * rhoD[index] * grady[index * 3 + 2]; + + sum_rhoD_grady[index * 3 + 0] += rhoD[index] * grady[index * 3 + 0]; + sum_rhoD_grady[index * 3 + 1] += rhoD[index] * grady[index * 3 + 1]; + sum_rhoD_grady[index * 3 + 2] += rhoD[index] * grady[index * 3 + 2]; + + sum_hai_y[index] += hai[index] * y[index]; +} + +__global__ void sumError_boundary(int num_boundary_faces, const int *bouPermedIndex, + const double *boundary_hai, const double *boundary_rhoD, const double *boundary_y, const double *boundary_grady, + double *sum_boundary_hai_rhoD_grady, double *sum_boundary_rhoD_grady, double *sum_boundary_hai_y) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_faces) + return; + + int permute_index = bouPermedIndex[index]; + sum_boundary_hai_rhoD_grady[index * 3 + 0] += boundary_hai[permute_index] * boundary_rhoD[permute_index] * boundary_grady[index * 3 + 0]; + sum_boundary_hai_rhoD_grady[index * 3 + 1] += boundary_hai[permute_index] * boundary_rhoD[permute_index] * boundary_grady[index * 3 + 1]; + sum_boundary_hai_rhoD_grady[index * 3 + 2] += boundary_hai[permute_index] * boundary_rhoD[permute_index] * boundary_grady[index * 3 + 2]; + + sum_boundary_rhoD_grady[index * 3 + 0] += boundary_rhoD[permute_index] * boundary_grady[index * 3 + 0]; + sum_boundary_rhoD_grady[index * 3 + 1] += boundary_rhoD[permute_index] * boundary_grady[index * 3 + 1]; + sum_boundary_rhoD_grady[index * 3 + 2] += boundary_rhoD[permute_index] * boundary_grady[index * 3 + 2]; + + sum_boundary_hai_y[index] += boundary_hai[permute_index] * boundary_y[permute_index]; +} + +__global__ void calculate_hDiffCorrFlux(int num, + const double *sum_hai_rhoD_grady, const double *sum_rhoD_grady, const double *sum_hai_y, double *hDiffCorrFlux) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num) + return; + + hDiffCorrFlux[index * 3 + 0] += (sum_hai_rhoD_grady[index * 3 + 0] - sum_hai_y[index] * sum_rhoD_grady[index * 3 + 0]); + hDiffCorrFlux[index * 3 + 1] += (sum_hai_rhoD_grady[index * 3 + 1] - sum_hai_y[index] * sum_rhoD_grady[index * 3 + 1]); + hDiffCorrFlux[index * 3 + 2] += (sum_hai_rhoD_grady[index * 3 + 2] - sum_hai_y[index] * sum_rhoD_grady[index * 3 + 2]); +} + +__global__ void calculate_phiUc_internal(int num_cells, + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *face_vector, const double *weight, const double *sumYDiffError, double *phiUc) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_cells) @@ -456,6 +456,55 @@ __global__ void fvm_laplacian_uncorrected_scalar_boundary(int num_cells, int num b_output[cell_index] = b_input[cell_index] + boundary_coeffs * sign; } +__global__ void fvc_laplacian_internal(int num_cells, + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *alpha, const double *hai, const double* y, + const double *weight, const double *magsf, const double *distance, + const double* volume, double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int row_elements = csr_row_index[index + 1] - row_index; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double own_vf = y[index]; + double own_coeff = alpha[index] * hai[index]; + double sum = 0; + // lower + for (int i = 0; i < diag_index; i++) + { + int neighbor_index = neighbor_offset + i; + int neighbor_cell_id = csr_col_index[i + row_index]; + double w = weight[neighbor_index]; + double nei_vf = y[neighbor_cell_id]; + double nei_coeff = alpha[neighbor_cell_id] * hai[neighbor_cell_id]; + double face_gamma = (1 - w) * own_coeff + w * nei_coeff; + double sngrad = distance[neighbor_index] * (own_vf - nei_vf); + double value = face_gamma * sngrad * magsf[neighbor_index]; + sum -= value; + } + // upper + for (int i = diag_index + 1; i < row_elements; i++) + { + int neighbor_index = neighbor_offset + i - 1; + int neighbor_cell_id = csr_col_index[i + row_index]; + double w = weight[neighbor_index]; + double nei_vf = y[neighbor_cell_id]; + double nei_coeff = alpha[neighbor_cell_id] * hai[neighbor_cell_id]; + double face_gamma = w * own_coeff + (1 - w) * nei_coeff; + double sngrad = distance[neighbor_index] * (nei_vf - own_vf); + double value = face_gamma * sngrad * magsf[neighbor_index]; + sum += value; + } + double vol = volume[index]; + output[index] += sum / vol; +} + dfYEqn::dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile, const int inertIndex) : dataBase_(dataBase), inertIndex(inertIndex) { @@ -466,6 +515,7 @@ dfYEqn::dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std num_boundary_cells = dataBase_.num_boundary_cells; num_boundary_faces = dataBase_.num_boundary_faces; cell_bytes = dataBase_.cell_bytes; + boundary_face_bytes = dataBase_.boundary_face_bytes; YSolverSet.resize(num_species - 1); // consider inert species for (auto &solver : YSolverSet) @@ -482,22 +532,44 @@ dfYEqn::dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std checkCudaErrors(cudaMalloc((void **)&d_A_csr, (num_cells + num_faces) * (num_species - 1) * sizeof(double))); checkCudaErrors(cudaMalloc((void **)&d_b, cell_bytes * (num_species - 1))); checkCudaErrors(cudaMalloc((void **)&d_psi, cell_bytes * (num_species - 1))); - checkCudaErrors(cudaMalloc((void **)&d_sumYDiffError, 3 * cell_bytes)); - checkCudaErrors(cudaMalloc((void **)&d_sumYDiffError_tmp, 3 * cell_bytes)); checkCudaErrors(cudaMalloc((void **)&d_phiUc, num_faces * sizeof(double))); - checkCudaErrors(cudaMalloc((void **)&d_phiUc, num_faces * sizeof(double))); - checkCudaErrors(cudaMalloc((void **)&d_sumYDiffError_boundary, 3 * num_boundary_faces * sizeof(double))); checkCudaErrors(cudaMalloc((void **)&d_phiUc_boundary, num_boundary_faces * sizeof(double))); checkCudaErrors(cudaMalloc((void **)&d_mut_Sct, cell_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_boundary_mut_sct, boundary_face_bytes)); + + checkCudaErrors(cudaMalloc((void **)&d_boundary_Y, boundary_face_bytes)); + + checkCudaErrors(cudaMalloc((void **)&d_hai, cell_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_boundary_hai, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_rhoD, cell_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_boundary_rhoD, boundary_face_bytes)); + + checkCudaErrors(cudaMalloc((void **)&d_sum_rhoD_grady, 3 * cell_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_sum_boundary_rhoD_grady, 3 * boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_sum_hai_rhoD_grady, 3 * cell_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_sum_boundary_hai_rhoD_grady, 3 * boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_sum_hai_y, cell_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_sum_boundary_hai_y, boundary_face_bytes)); + + checkCudaErrors(cudaMalloc((void **)&d_grady, 3 * cell_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_boundary_grady, 3 * boundary_face_bytes)); + + checkCudaErrors(cudaMalloc((void **)&d_alpha, cell_bytes)); + + d_Y_old.resize(num_species); + for (size_t i = 0; i < num_species; i++) + { + checkCudaErrors(cudaMalloc((void **)&d_Y_old[i], cell_bytes)); + } checkCudaErrors(cudaStreamCreate(&stream)); // zeroGradient for (size_t i = 0; i < num_species; i++) { - checkCudaErrors(cudaMemsetAsync(dataBase_.d_internal_coeffs_Y_vector[i], 1, dataBase_.boundary_face_bytes, stream)); - checkCudaErrors(cudaMemsetAsync(dataBase_.d_boundary_coeffs_Y_vector[i], 0, dataBase_.boundary_face_bytes, stream)); - checkCudaErrors(cudaMemsetAsync(dataBase_.d_laplac_internal_coeffs_Y_vector[i], 0, dataBase_.boundary_face_bytes, stream)); - checkCudaErrors(cudaMemsetAsync(dataBase_.d_laplac_boundary_coeffs_Y_vector[i], 0, dataBase_.boundary_face_bytes, stream)); + checkCudaErrors(cudaMemsetAsync(dataBase_.d_internal_coeffs_Y_vector[i], 1, boundary_face_bytes, stream)); + checkCudaErrors(cudaMemsetAsync(dataBase_.d_boundary_coeffs_Y_vector[i], 0, boundary_face_bytes, stream)); + checkCudaErrors(cudaMemsetAsync(dataBase_.d_laplac_internal_coeffs_Y_vector[i], 0, boundary_face_bytes, stream)); + checkCudaErrors(cudaMemsetAsync(dataBase_.d_laplac_boundary_coeffs_Y_vector[i], 0, boundary_face_bytes, stream)); } } @@ -508,50 +580,77 @@ void dfYEqn::upwindWeight() getUpwindWeight<<>>(num_faces, dataBase_.d_phi, dataBase_.d_weight_upwind); } -void dfYEqn::correctVelocity(std::vector Y_old, std::vector boundary_Y_init, std::vector rhoD_GPU) +void dfYEqn::correctVelocity(std::vector Y_old, std::vector boundary_Y, + std::vector hai, std::vector boundary_hai, + std::vector rhoD, std::vector boundary_rhoD) { // initialize variables in each time step - checkCudaErrors(cudaMemsetAsync(d_sumYDiffError, 0, 3 * cell_bytes, stream)); - checkCudaErrors(cudaMemsetAsync(d_phiUc, 0, num_faces * sizeof(double), stream)); + checkCudaErrors(cudaMemsetAsync(d_sum_rhoD_grady, 0, 3 * cell_bytes, stream)); + checkCudaErrors(cudaMemsetAsync(d_sum_boundary_rhoD_grady, 0, 3 * boundary_face_bytes, stream)); + checkCudaErrors(cudaMemsetAsync(d_sum_hai_rhoD_grady, 0, 3 * cell_bytes, stream)); + checkCudaErrors(cudaMemsetAsync(d_sum_boundary_hai_rhoD_grady, 0, 3 * boundary_face_bytes, stream)); + checkCudaErrors(cudaMemsetAsync(d_sum_hai_y, 0, cell_bytes, stream)); + checkCudaErrors(cudaMemsetAsync(d_sum_boundary_hai_y, 0, boundary_face_bytes, stream)); + checkCudaErrors(cudaMemsetAsync(dataBase_.d_hDiffCorrFlux, 0, 3 * cell_bytes, stream)); + checkCudaErrors(cudaMemsetAsync(dataBase_.d_boundary_hDiffCorrFlux, 0, 3 * boundary_face_bytes, stream)); size_t threads_per_block, blocks_per_grid; for (size_t i = 0; i < num_species; ++i) { - checkCudaErrors(cudaMemsetAsync(d_sumYDiffError_tmp, 0, 3 * cell_bytes, stream)); - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_Y_old_vector[i], Y_old[i], cell_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_Y_init_vector[i], boundary_Y_init[i], dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_rhoD_vector[i], rhoD_GPU[i], cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_Y_old[i], Y_old[i], cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_Y, boundary_Y[i], boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_hai, hai[i], cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_hai, boundary_hai[i], boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_rhoD, rhoD[i], cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_rhoD, boundary_rhoD[i], boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + + checkCudaErrors(cudaMemsetAsync(d_grady, 0, 3 * cell_bytes, stream)); + checkCudaErrors(cudaMemsetAsync(d_boundary_grady, 0, 3 * boundary_face_bytes, stream)); // launch cuda kernel threads_per_block = 1024; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - fvc_grad_internal_face_Y<<>>(num_cells, - d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, dataBase_.d_volume, - dataBase_.d_face_vector, dataBase_.d_weight, dataBase_.d_Y_old_vector[i], - dataBase_.d_rhoD_vector[i], d_sumYDiffError_tmp, d_sumYDiffError_tmp); + fvc_grad_internal<<>>(num_cells, + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + dataBase_.d_face_vector, dataBase_.d_weight, d_Y_old[i], + dataBase_.d_volume, d_grady); + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_boundary<<>>(num_boundary_cells, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_bouPermedIndex, + dataBase_.d_boundary_face_vector, d_boundary_Y, + dataBase_.d_volume, d_grady); blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; - fvc_grad_boundary_face_Y<<>>(num_cells, num_boundary_cells, dataBase_.d_boundary_cell_offset, - dataBase_.d_boundary_cell_id, dataBase_.d_rhoD_vector[i], dataBase_.d_bouPermedIndex, - dataBase_.d_boundary_face_vector, dataBase_.d_boundary_Y_init_vector[i], dataBase_.d_boundary_Y_vector[i], - dataBase_.d_volume, d_sumYDiffError_tmp, d_sumYDiffError_tmp); + correct_boundary_conditions<<>>(num_boundary_cells, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_boundary_face_vector, dataBase_.d_boundary_face, + d_grady, d_boundary_grady); + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - sumError<<>>(num_cells, dataBase_.d_volume, dataBase_.d_rhoD_vector[i], d_sumYDiffError_tmp, d_sumYDiffError); + sumError_internal<<>>(num_cells, + d_hai, d_rhoD, d_Y_old[i], d_grady, + d_sum_hai_rhoD_grady, d_sum_rhoD_grady, d_sum_hai_y); + blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block; + sumError_boundary<<>>(num_boundary_faces, + dataBase_.d_bouPermedIndex, + d_boundary_hai, d_boundary_rhoD, d_boundary_Y, d_boundary_grady, + d_sum_boundary_hai_rhoD_grady, d_sum_boundary_rhoD_grady, d_sum_boundary_hai_y); } - blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - divide_vol<<>>(num_cells, dataBase_.d_volume, d_sumYDiffError, d_sumYDiffError); - - blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; - correct_boundary_conditions_vec<<>>(num_boundary_cells, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - dataBase_.d_boundary_face_vector, dataBase_.d_boundary_face, d_sumYDiffError_boundary, d_sumYDiffError); blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - calculate_phiUc<<>>(num_cells, - d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, dataBase_.d_face_vector, - dataBase_.d_weight, d_sumYDiffError, d_phiUc); + calculate_phiUc_internal<<>>(num_cells, + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + dataBase_.d_face_vector, dataBase_.d_weight, d_sum_rhoD_grady, d_phiUc); + blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block; + calculate_phiUc_boundary<<>>(num_boundary_faces, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_boundary_face_vector, d_sum_boundary_rhoD_grady, d_phiUc_boundary); + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + calculate_hDiffCorrFlux<<>>(num_cells, + d_sum_hai_rhoD_grady, d_sum_rhoD_grady, d_sum_hai_y, dataBase_.d_hDiffCorrFlux); blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block; - calculate_phiUc_boundary<<>>(num_boundary_faces, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - dataBase_.d_boundary_face_vector, d_sumYDiffError_boundary, d_phiUc_boundary); + calculate_hDiffCorrFlux<<>>(num_boundary_faces, + d_sum_boundary_hai_rhoD_grady, d_sum_boundary_rhoD_grady, d_sum_boundary_hai_y, dataBase_.d_boundary_hDiffCorrFlux); } void dfYEqn::fvm_ddt() @@ -572,11 +671,11 @@ void dfYEqn::fvm_ddt() threads_per_block = 1024; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvm_ddt_kernel_scalar<<>>(num_cells, num_faces, dataBase_.rdelta_t, - d_A_csr_row_index, d_A_csr_diag_index, - dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, dataBase_.d_Y_old_vector[i], - d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, - d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, - d_psi + mtxIndex * num_cells); + d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, d_Y_old[i], + d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, + d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, + d_psi + mtxIndex * num_cells); ++mtxIndex; } } @@ -636,32 +735,34 @@ void dfYEqn::fvm_div_phiUc() } } -void dfYEqn::fvm_laplacian(double *mut_Sct, double *boundary_mut_Sct, std::vector boundary_rhoD) +void dfYEqn::fvm_laplacian(const double *mut_Sct, const double *boundary_mut_Sct, + std::vector rhoD, std::vector boundary_rhoD) { size_t threads_per_block, blocks_per_grid; int mtxIndex = 0; - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_mut_sct, boundary_mut_Sct, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_mut_sct, boundary_mut_Sct, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_mut_Sct, mut_Sct, cell_bytes, cudaMemcpyHostToDevice, stream)); for (size_t i = 0; i < num_species; ++i) { if (i == inertIndex) continue; - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_rhoD_vector[i], boundary_rhoD[i], dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_rhoD, rhoD[i], cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_rhoD, boundary_rhoD[i], boundary_face_bytes, cudaMemcpyHostToDevice, stream)); // launch cuda kernel threads_per_block = 1024; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvm_laplacian_uncorrected_scalar_internal<<>>(num_cells, d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - d_mut_Sct, dataBase_.d_rhoD_vector[i], dataBase_.d_weight, dataBase_.d_face, + d_mut_Sct, d_rhoD, dataBase_.d_weight, dataBase_.d_face, dataBase_.d_deltaCoeffs, -1., d_A_csr + mtxIndex * (num_cells + num_faces), d_A_csr + mtxIndex * (num_cells + num_faces)); blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; fvm_laplacian_uncorrected_scalar_boundary<<>>(num_cells, num_boundary_cells, d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, - dataBase_.d_boundary_cell_id, dataBase_.d_boundary_rhoD_vector[i], - dataBase_.d_boundary_mut_sct, dataBase_.d_boundary_face, dataBase_.d_bouPermedIndex, + dataBase_.d_boundary_cell_id, d_boundary_rhoD, + d_boundary_mut_sct, dataBase_.d_boundary_face, dataBase_.d_bouPermedIndex, dataBase_.d_laplac_internal_coeffs_Y_vector[i], dataBase_.d_laplac_boundary_coeffs_Y_vector[i], -1., d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells); @@ -759,6 +860,24 @@ void dfYEqn::updatePsi(double *Psi, int speciesIndex) memcpy(Psi, h_psi + speciesIndex * num_cells, cell_bytes); } +void dfYEqn::compute_diffAlphaD(const double *alpha, std::vector hai) +{ + checkCudaErrors(cudaMemsetAsync(dataBase_.d_diffAlphaD, 0, cell_bytes, stream)); + checkCudaErrors(cudaMemcpyAsync(d_alpha, alpha, cell_bytes, cudaMemcpyHostToDevice, stream)); + size_t threads_per_block, blocks_per_grid; + for (size_t i = 0; i < num_species; ++i) + { + checkCudaErrors(cudaMemcpyAsync(d_hai, hai[i], cell_bytes, cudaMemcpyHostToDevice, stream)); + threads_per_block = 1024; + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvc_laplacian_internal<<>>(num_cells, + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + d_alpha, d_hai, d_Y_old[i], + dataBase_.d_weight, dataBase_.d_face, dataBase_.d_deltaCoeffs, + dataBase_.d_volume, dataBase_.d_diffAlphaD); + } +} + dfYEqn::~dfYEqn() { -} \ No newline at end of file +} From b1580807f3e4e494d1724006413009d00a0b5067 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Thu, 8 Jun 2023 22:14:48 +0800 Subject: [PATCH 2/6] refactors: same stream cross equations, initializeTimeStep for each equation, profile the CPU correctBoundaryConditions --- applications/solvers/dfLowMachFoam/EEqn.H | 4 +++- applications/solvers/dfLowMachFoam/YEqn.H | 1 + .../solvers/dfLowMachFoam/dfLowMachFoam.C | 3 +++ applications/solvers/dfLowMachFoam/rhoEqn.H | 1 + src_gpu/dfEEqn.H | 2 ++ src_gpu/dfEEqn.cu | 10 ++++++++-- src_gpu/dfMatrixDataBase.H | 14 +++++++------- src_gpu/dfRhoEqn.H | 5 ++++- src_gpu/dfRhoEqn.cu | 10 +++++++--- src_gpu/dfUEqn.cu | 4 ++-- src_gpu/dfYEqn.H | 2 ++ src_gpu/dfYEqn.cu | 17 +++++++++++------ 12 files changed, 51 insertions(+), 22 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H index 764b6be10..d84da50a1 100644 --- a/applications/solvers/dfLowMachFoam/EEqn.H +++ b/applications/solvers/dfLowMachFoam/EEqn.H @@ -58,6 +58,8 @@ end1 = std::clock(); time_monitor_EEqn_mtxAssembly_CPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); + EEqn_GPU.initializeTimeStep(); + // prepare data on GPU start1 = std::clock(); EEqn_GPU.prepare_data(&he.oldTime()[0], &K[0], &K.oldTime()[0], &alphaEff[0], @@ -67,7 +69,6 @@ time_monitor_EEqn_mtxAssembly_GPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); start1 = std::clock(); - EEqn_GPU.initializeTimeStep(); EEqn_GPU.fvm_ddt(); EEqn_GPU.fvm_div(); EEqn_GPU.fvm_laplacian(); @@ -99,6 +100,7 @@ he.correctBoundaryConditions(); end1 = std::clock(); time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly_CPU_CorrectBC += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); #endif } diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H index baa12633e..d8c69d20b 100644 --- a/applications/solvers/dfLowMachFoam/YEqn.H +++ b/applications/solvers/dfLowMachFoam/YEqn.H @@ -80,6 +80,7 @@ time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); end2 = std::clock(); time_monitor_YEqn_mtxAssembly_CPU_Prepare += double(end2 - start2) / double(CLOCKS_PER_SEC); + YEqn_GPU.initializeTimeStep(); YEqn_GPU.upwindWeight(); YEqn_GPU.correctVelocity(Y_old, boundary_Y, hai, boundary_hai, rhoD, boundary_rhoD); YEqn_GPU.fvm_ddt(); diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C index c2a62797a..6f54e0a95 100644 --- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C +++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C @@ -108,6 +108,7 @@ int main(int argc, char *argv[]) double time_monitor_EEqn_mtxAssembly_CPU_Prepare; double time_monitor_EEqn_mtxAssembly_GPU_Prepare; double time_monitor_EEqn_mtxAssembly_GPU_Run; + double time_monitor_EEqn_mtxAssembly_CPU_CorrectBC; double time_monitor_EEqn_Solve=0; double time_monitor_EEqn_sum=0; double time_monitor_YEqn=0; @@ -259,6 +260,7 @@ int main(int argc, char *argv[]) Info<< "EEqn assamble(CPU prepare) = " << time_monitor_EEqn_mtxAssembly_CPU_Prepare << " s" << endl; Info<< "EEqn assamble(GPU prepare) = " << time_monitor_EEqn_mtxAssembly_GPU_Prepare << " s" << endl; Info<< "EEqn assamble(GPU run) = " << time_monitor_EEqn_mtxAssembly_GPU_Run << " s" << endl; + Info<< "EEqn assamble(CPU correct) = " << time_monitor_EEqn_mtxAssembly_CPU_CorrectBC << " s" << endl; Info<< "EEqn Time solve = " << time_monitor_EEqn_Solve << " s" << endl; Info<< "YEqn Time = " << time_monitor_YEqn << " s" << endl; Info<< "YEqn Time assamble Mtx = " << time_monitor_YEqn_mtxAssembly << " s" << endl; @@ -278,6 +280,7 @@ int main(int argc, char *argv[]) time_monitor_EEqn_mtxAssembly_CPU_Prepare = 0; time_monitor_EEqn_mtxAssembly_GPU_Prepare = 0; time_monitor_EEqn_mtxAssembly_GPU_Run = 0; + time_monitor_EEqn_mtxAssembly_CPU_CorrectBC = 0; time_monitor_EEqn_Solve = 0; time_monitor_chem = 0; time_monitor_Y = 0; diff --git a/applications/solvers/dfLowMachFoam/rhoEqn.H b/applications/solvers/dfLowMachFoam/rhoEqn.H index 1c0652dca..1d0793384 100644 --- a/applications/solvers/dfLowMachFoam/rhoEqn.H +++ b/applications/solvers/dfLowMachFoam/rhoEqn.H @@ -40,6 +40,7 @@ Description memcpy(boundary_phi_init+offset, &patchFlux[0], patchSize*sizeof(double)); offset += patchSize; } + rhoEqn_GPU.initializeTimeStep(); rhoEqn_GPU.fvc_div(&phi[0], boundary_phi_init); rhoEqn_GPU.fvm_ddt(&rho.oldTime()[0]); rhoEqn_GPU.updatePsi(&rho.primitiveFieldRef()[0]); diff --git a/src_gpu/dfEEqn.H b/src_gpu/dfEEqn.H index 732e32ca7..f15a44a49 100644 --- a/src_gpu/dfEEqn.H +++ b/src_gpu/dfEEqn.H @@ -9,6 +9,8 @@ class dfEEqn private: dfMatrixDataBase &dataBase_; cudaStream_t stream; + //cudaEvent_t event; + AmgXSolver *ESolver = nullptr; int num_iteration; diff --git a/src_gpu/dfEEqn.cu b/src_gpu/dfEEqn.cu index 48d6a19dc..f1aa3d14e 100644 --- a/src_gpu/dfEEqn.cu +++ b/src_gpu/dfEEqn.cu @@ -439,6 +439,9 @@ dfEEqn::dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std { ESolver = new AmgXSolver(modeStr, cfgFile); + stream = dataBase_.stream; + //checkCudaErrors(cudaEventCreate(&event)); + num_cells = dataBase_.num_cells; cell_bytes = dataBase_.cell_bytes; num_faces = dataBase_.num_faces; @@ -479,8 +482,6 @@ dfEEqn::dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std checkCudaErrors(cudaMalloc((void **)&d_value_boundary_coeffs, boundary_face_bytes)); checkCudaErrors(cudaMalloc((void **)&d_gradient_internal_coeffs, boundary_face_bytes)); checkCudaErrors(cudaMalloc((void **)&d_gradient_boundary_coeffs, boundary_face_bytes)); - - checkCudaErrors(cudaStreamCreate(&stream)); } void dfEEqn::prepare_data(const double *he_old, const double *K, const double *K_old, const double *alphaEff, @@ -688,6 +689,7 @@ void dfEEqn::solve() num_iteration++; checkCudaErrors(cudaMemcpyAsync(h_he_new, d_he_old, cell_bytes, cudaMemcpyDeviceToHost, stream)); + //checkCudaErrors(cudaEventRecord(event, stream)); // checkCudaErrors(cudaStreamSynchronize(stream)); // for (size_t i = 0; i < num_cells; i++) // fprintf(stderr, "h_he_after[%d]: %.16lf\n", i, h_he_new[i]); @@ -700,6 +702,8 @@ void dfEEqn::sync() void dfEEqn::updatePsi(double *Psi) { + checkCudaErrors(cudaStreamSynchronize(stream)); + //checkCudaErrors(cudaEventSynchronize(event)); memcpy(Psi, h_he_new, cell_bytes); } @@ -732,4 +736,6 @@ dfEEqn::~dfEEqn() checkCudaErrors(cudaFree(d_value_boundary_coeffs)); checkCudaErrors(cudaFree(d_gradient_internal_coeffs)); checkCudaErrors(cudaFree(d_gradient_boundary_coeffs)); + + //checkCudaErrors(cudaEventDestroy(event)); } diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 778d7c56c..ed9915438 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -43,6 +43,9 @@ inline void checkVectorEqual(int count, const double* basevec, double* vec, doub struct dfMatrixDataBase { + // - cuda resource + cudaStream_t stream; + // - number of cell size int num_cells; // - number of face size @@ -185,11 +188,6 @@ struct dfMatrixDataBase // double *h_A_csr = nullptr, *h_b = nullptr, *h_psi = nullptr; // double *d_A_csr = nullptr, *d_b = nullptr, *d_psi = nullptr; - /** - * @brief cuda functions - */ - cudaStream_t stream; - int num_iteration; double time_monitor_CPU; @@ -208,6 +206,9 @@ struct dfMatrixDataBase : num_cells(num_cells), num_faces(num_surfaces*2), num_surfaces(num_surfaces), num_species(num_species), num_iteration(0), num_boundary_faces(num_boundary_faces), h_volume(volume) { + // create cuda stream + checkCudaErrors(cudaStreamCreate(&stream)); + // allocate field pointer in pin memory cudaMallocHost(&h_phi_init, num_faces * sizeof(double)); cudaMallocHost(&h_rho_old, num_cells * sizeof(double)); @@ -573,8 +574,6 @@ struct dfMatrixDataBase fprintf(stderr, "Total bytes malloc on GPU: %.2fMB\n", total_bytes * 1.0 / 1024 / 1024); - checkCudaErrors(cudaStreamCreate(&stream)); - checkCudaErrors(cudaMemcpyAsync(d_A_csr_row_index, h_A_csr_row_index, csr_row_index_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_A_csr_col_index, h_A_csr_col_index, csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_A_csr_diag_index, h_A_csr_diag_index, cell_index_bytes, cudaMemcpyHostToDevice, stream)); @@ -594,6 +593,7 @@ struct dfMatrixDataBase checkCudaErrors(cudaMemcpyAsync(d_bouPermedIndex, boundPermutationList.data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_tmpPermutatedList, tmpPermutatedList.data(), csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); }; + ~dfMatrixDataBase(){ std::cout << "Destructor called." << std::endl; // TODO: free pointers diff --git a/src_gpu/dfRhoEqn.H b/src_gpu/dfRhoEqn.H index 35885a517..75d40225d 100644 --- a/src_gpu/dfRhoEqn.H +++ b/src_gpu/dfRhoEqn.H @@ -16,6 +16,7 @@ class dfRhoEqn private: dfMatrixDataBase& dataBase_; cudaStream_t stream; + int num_iteration; double time_monitor_CPU; double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test; @@ -33,6 +34,8 @@ public: dfRhoEqn(dfMatrixDataBase& dataBase); ~dfRhoEqn(); + void initializeTimeStep(); + void checkValue(bool print); void fvc_div(double *phi, double *boundary_phi_init); @@ -40,4 +43,4 @@ public: void fvm_ddt(double *rho_old); void updatePsi(double* Psi); -}; \ No newline at end of file +}; diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu/dfRhoEqn.cu index fc933f95b..a18c79a03 100644 --- a/src_gpu/dfRhoEqn.cu +++ b/src_gpu/dfRhoEqn.cu @@ -82,6 +82,7 @@ __global__ void fvm_ddt_rho(int num_cells, const double rdelta_t, dfRhoEqn::dfRhoEqn(dfMatrixDataBase &dataBase) : dataBase_(dataBase) { + stream = dataBase_.stream; num_cells = dataBase_.num_cells; cell_bytes = dataBase_.cell_bytes; num_surfaces = dataBase_.num_surfaces; @@ -94,13 +95,16 @@ dfRhoEqn::dfRhoEqn(dfMatrixDataBase &dataBase) checkCudaErrors(cudaMalloc((void **)&d_b, cell_bytes)); checkCudaErrors(cudaMalloc((void **)&d_psi, cell_bytes)); +} - checkCudaErrors(cudaStreamCreate(&stream)); +void dfRhoEqn::initializeTimeStep() +{ + // initialize matrix value + checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_bytes, stream)); } void dfRhoEqn::fvc_div(double *phi, double *boundary_phi_init) { - checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_bytes, stream)); clock_t start = std::clock(); memcpy(dataBase_.h_phi_init, phi, num_surfaces * sizeof(double)); @@ -138,4 +142,4 @@ void dfRhoEqn::updatePsi(double *Psi) for (size_t i = 0; i < num_cells; i++) Psi[i] = h_psi[i]; } -dfRhoEqn::~dfRhoEqn(){} \ No newline at end of file +dfRhoEqn::~dfRhoEqn(){} diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 7eb7cbe31..a82d85c76 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -1005,6 +1005,8 @@ __global__ void ueqn_update_BoundaryCoeffs_kernel(int num_boundary_faces, const dfUEqn::dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile) : dataBase_(dataBase) { + stream = dataBase_.stream; + UxSolver = new AmgXSolver(modeStr, cfgFile); UySolver = new AmgXSolver(modeStr, cfgFile); UzSolver = new AmgXSolver(modeStr, cfgFile); @@ -1034,8 +1036,6 @@ dfUEqn::dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std checkCudaErrors(cudaMalloc((void **)&d_A, cell_bytes)); checkCudaErrors(cudaMalloc((void **)&d_ueqn_internal_coeffs, cell_vec_bytes)); checkCudaErrors(cudaMalloc((void **)&d_ueqn_boundary_coeffs, cell_vec_bytes)); - - checkCudaErrors(cudaStreamCreate(&stream)); } void dfUEqn::fvm_ddt(double *vector_old) diff --git a/src_gpu/dfYEqn.H b/src_gpu/dfYEqn.H index 6e2a408e4..2903d5934 100644 --- a/src_gpu/dfYEqn.H +++ b/src_gpu/dfYEqn.H @@ -45,6 +45,8 @@ public: ~dfYEqn(); + void initializeTimeStep(); + void checkValue(bool print, char *filename); void correctVelocity(std::vector Y_old, std::vector boundary_Y, diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu index a9ede60ec..0dbfdba8c 100644 --- a/src_gpu/dfYEqn.cu +++ b/src_gpu/dfYEqn.cu @@ -508,6 +508,7 @@ __global__ void fvc_laplacian_internal(int num_cells, dfYEqn::dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile, const int inertIndex) : dataBase_(dataBase), inertIndex(inertIndex) { + stream = dataBase_.stream; num_species = dataBase_.num_species; num_cells = dataBase_.num_cells; num_faces = dataBase_.num_faces; @@ -562,7 +563,6 @@ dfYEqn::dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std checkCudaErrors(cudaMalloc((void **)&d_Y_old[i], cell_bytes)); } - checkCudaErrors(cudaStreamCreate(&stream)); // zeroGradient for (size_t i = 0; i < num_species; i++) { @@ -573,6 +573,16 @@ dfYEqn::dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std } } +void dfYEqn::initializeTimeStep() +{ + // consider inert species + // initialize matrix value + checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, (num_cells + num_faces) * (num_species - 1) * sizeof(double), stream)); + checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_bytes * (num_species - 1), stream)); + // initialize variables in each time step + checkCudaErrors(cudaMemsetAsync(d_psi, 0, cell_bytes * (num_species - 1), stream)); +} + void dfYEqn::upwindWeight() { size_t threads_per_block = 1024; @@ -655,11 +665,6 @@ void dfYEqn::correctVelocity(std::vector Y_old, std::vector void dfYEqn::fvm_ddt() { - // initialize variables in each time step - checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, (num_cells + num_faces) * (num_species - 1) * sizeof(double), stream)); // consider inert species - checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_bytes * (num_species - 1), stream)); - checkCudaErrors(cudaMemsetAsync(d_psi, 0, cell_bytes * (num_species - 1), stream)); - size_t threads_per_block, blocks_per_grid; int mtxIndex = 0; for (size_t i = 0; i < num_species; ++i) From 9316c680d0a3398077d8fde3c6d47086f09c2e2f Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Fri, 9 Jun 2023 00:05:46 +0800 Subject: [PATCH 3/6] refactor YEqn: minimize memcpy for species of rhoD and hai --- applications/solvers/dfLowMachFoam/YEqn.H | 5 +- src_gpu/dfMatrixDataBase.H | 11 +- src_gpu/dfYEqn.H | 11 +- src_gpu/dfYEqn.cu | 181 ++++++++++------------ 4 files changed, 89 insertions(+), 119 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H index d8c69d20b..2e106fca0 100644 --- a/applications/solvers/dfLowMachFoam/YEqn.H +++ b/applications/solvers/dfLowMachFoam/YEqn.H @@ -82,13 +82,12 @@ time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); YEqn_GPU.initializeTimeStep(); YEqn_GPU.upwindWeight(); - YEqn_GPU.correctVelocity(Y_old, boundary_Y, hai, boundary_hai, rhoD, boundary_rhoD); + YEqn_GPU.fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(Y_old, boundary_Y, + hai, boundary_hai, rhoD, boundary_rhoD, &mut_sct[0], boundary_mutsct, &thermo.alpha()[0]); YEqn_GPU.fvm_ddt(); YEqn_GPU.fvm_div_phi(); YEqn_GPU.fvm_div_phiUc(); - YEqn_GPU.fvm_laplacian(&mut_sct[0], boundary_mutsct, rhoD, boundary_rhoD); - YEqn_GPU.compute_diffAlphaD(&thermo.alpha()[0], hai); end1 = std::clock(); time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index ed9915438..81c82a9d8 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -90,9 +90,8 @@ struct dfMatrixDataBase // - the device pointer to rho_new, rho_old, velocity_old, pressure and volume list double *d_rho_new = nullptr, *d_rho_old = nullptr, *d_velocity_old = nullptr, *d_pressure = nullptr, *d_volume = nullptr; - // - the device pointer to Y_new and Y_old - std::vector d_Y_new_vector; - std::vector d_Y_old_vector; + // - the device pointer to Y(vector Yi) + std::vector d_Y; // - the device pointer to the pre-permutated and post-permutated interpolation weight list double *d_weight_init = nullptr, *d_weight = nullptr; double *d_weight_upwind = nullptr; @@ -479,8 +478,7 @@ struct dfMatrixDataBase checkCudaErrors(cudaMalloc((void**)&d_A_csr_diag_index, cell_index_bytes)); total_bytes += (csr_row_index_bytes + csr_col_index_bytes + cell_index_bytes); - d_Y_new_vector.resize(num_species); - d_Y_old_vector.resize(num_species); + d_Y.resize(num_species); d_rhoD_vector.resize(num_species); d_boundary_Y_vector.resize(num_species); d_boundary_Y_init_vector.resize(num_species); @@ -491,8 +489,7 @@ struct dfMatrixDataBase d_boundary_rhoD_vector.resize(num_species); for (size_t i = 0; i < num_species; ++i){ - checkCudaErrors(cudaMalloc((void**)&d_Y_new_vector[i], cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_Y_old_vector[i], cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_Y[i], cell_bytes)); checkCudaErrors(cudaMalloc((void**)&d_rhoD_vector[i], cell_bytes)); } checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_bytes)); diff --git a/src_gpu/dfYEqn.H b/src_gpu/dfYEqn.H index 2903d5934..f41cd9799 100644 --- a/src_gpu/dfYEqn.H +++ b/src_gpu/dfYEqn.H @@ -29,7 +29,6 @@ private: double *d_mut_Sct = nullptr; double *d_boundary_mut_sct = nullptr; - std::vector d_Y_old; double *d_boundary_Y = nullptr; double *d_hai, *d_boundary_hai = nullptr; double *d_rhoD, *d_boundary_rhoD = nullptr; @@ -49,9 +48,10 @@ public: void checkValue(bool print, char *filename); - void correctVelocity(std::vector Y_old, std::vector boundary_Y, + void fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(std::vector Y_old, std::vector boundary_Y, std::vector hai, std::vector boundary_hai, - std::vector rhoD, std::vector boundary_rhoD); + std::vector rhoD, std::vector boundary_rhoD, + const double *mut_Sct, const double *boundary_mut_Sct, const double *alpha); void upwindWeight(); @@ -61,11 +61,6 @@ public: void fvm_div_phiUc(); - void fvm_laplacian(const double *mut_Sct, const double *boundary_mut_Sct, - std::vectorrhoD, std::vectorboundary_rhoD); - - void compute_diffAlphaD(const double *alpha, std::vector hai); - void solve(); void updatePsi(double *Psi, int speciesIndex); diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu index 0dbfdba8c..41987ee26 100644 --- a/src_gpu/dfYEqn.cu +++ b/src_gpu/dfYEqn.cu @@ -420,12 +420,12 @@ __global__ void fvm_laplacian_uncorrected_scalar_internal(int num_cells, A_csr_output[row_index + diag_index] = A_csr_input[row_index + diag_index] + sum_diag * sign; } -__global__ void fvm_laplacian_uncorrected_scalar_boundary(int num_cells, int num_boundary_cells, - const int *csr_row_index, const int *csr_diag_index, const int *boundary_cell_offset, - const int *boundary_cell_id, const double *boundary_scalar0, const double *boundary_scalar1, - const double *boundary_magsf, const int *bouPermedIndex, - const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, - const double sign, const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output) +__global__ void fvm_laplacian_uncorrected_scalar_boundary(int num_boundary_cells, + const int *csr_row_index, const int *csr_diag_index, const int *boundary_cell_offset, + const int *boundary_cell_id, const double *boundary_scalar0, const double *boundary_scalar1, + const double *boundary_magsf, const int *bouPermedIndex, + const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, + const double sign, const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_boundary_cells) @@ -557,12 +557,6 @@ dfYEqn::dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std checkCudaErrors(cudaMalloc((void **)&d_alpha, cell_bytes)); - d_Y_old.resize(num_species); - for (size_t i = 0; i < num_species; i++) - { - checkCudaErrors(cudaMalloc((void **)&d_Y_old[i], cell_bytes)); - } - // zeroGradient for (size_t i = 0; i < num_species; i++) { @@ -590,11 +584,16 @@ void dfYEqn::upwindWeight() getUpwindWeight<<>>(num_faces, dataBase_.d_phi, dataBase_.d_weight_upwind); } -void dfYEqn::correctVelocity(std::vector Y_old, std::vector boundary_Y, +void dfYEqn::fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(std::vector Y_old, std::vector boundary_Y, std::vector hai, std::vector boundary_hai, - std::vector rhoD, std::vector boundary_rhoD) + std::vector rhoD, std::vector boundary_rhoD, + const double *mut_Sct, const double *boundary_mut_Sct, const double *alpha) { // initialize variables in each time step + checkCudaErrors(cudaMemcpyAsync(d_boundary_mut_sct, boundary_mut_Sct, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_mut_Sct, mut_Sct, cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_alpha, alpha, cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemsetAsync(d_sum_rhoD_grady, 0, 3 * cell_bytes, stream)); checkCudaErrors(cudaMemsetAsync(d_sum_boundary_rhoD_grady, 0, 3 * boundary_face_bytes, stream)); checkCudaErrors(cudaMemsetAsync(d_sum_hai_rhoD_grady, 0, 3 * cell_bytes, stream)); @@ -603,11 +602,13 @@ void dfYEqn::correctVelocity(std::vector Y_old, std::vector checkCudaErrors(cudaMemsetAsync(d_sum_boundary_hai_y, 0, boundary_face_bytes, stream)); checkCudaErrors(cudaMemsetAsync(dataBase_.d_hDiffCorrFlux, 0, 3 * cell_bytes, stream)); checkCudaErrors(cudaMemsetAsync(dataBase_.d_boundary_hDiffCorrFlux, 0, 3 * boundary_face_bytes, stream)); + checkCudaErrors(cudaMemsetAsync(dataBase_.d_diffAlphaD, 0, cell_bytes, stream)); size_t threads_per_block, blocks_per_grid; + int mtxIndex = 0; for (size_t i = 0; i < num_species; ++i) { - checkCudaErrors(cudaMemcpyAsync(d_Y_old[i], Y_old[i], cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_Y[i], Y_old[i], cell_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_boundary_Y, boundary_Y[i], boundary_face_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_hai, hai[i], cell_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_boundary_hai, boundary_hai[i], boundary_face_bytes, cudaMemcpyHostToDevice, stream)); @@ -617,12 +618,12 @@ void dfYEqn::correctVelocity(std::vector Y_old, std::vector checkCudaErrors(cudaMemsetAsync(d_grady, 0, 3 * cell_bytes, stream)); checkCudaErrors(cudaMemsetAsync(d_boundary_grady, 0, 3 * boundary_face_bytes, stream)); - // launch cuda kernel + // fvc::grad(Yi) threads_per_block = 1024; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvc_grad_internal<<>>(num_cells, d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - dataBase_.d_face_vector, dataBase_.d_weight, d_Y_old[i], + dataBase_.d_face_vector, dataBase_.d_weight, dataBase_.d_Y[i], dataBase_.d_volume, d_grady); blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; fvc_grad_boundary<<>>(num_boundary_cells, @@ -635,25 +636,46 @@ void dfYEqn::correctVelocity(std::vector Y_old, std::vector dataBase_.d_boundary_face_vector, dataBase_.d_boundary_face, d_grady, d_boundary_grady); + // sum(chemistry->hai(i)*chemistry->rhoD(i)*fvc::grad(Yi)) + // sum(chemistry->rhoD(i)*fvc::grad(Yi)), also be called sumYDiffError + // sum(chemistry->hai(i)*Yi) blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; sumError_internal<<>>(num_cells, - d_hai, d_rhoD, d_Y_old[i], d_grady, + d_hai, d_rhoD, dataBase_.d_Y[i], d_grady, d_sum_hai_rhoD_grady, d_sum_rhoD_grady, d_sum_hai_y); blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block; sumError_boundary<<>>(num_boundary_faces, dataBase_.d_bouPermedIndex, d_boundary_hai, d_boundary_rhoD, d_boundary_Y, d_boundary_grady, d_sum_boundary_hai_rhoD_grady, d_sum_boundary_rhoD_grady, d_sum_boundary_hai_y); - } - blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - calculate_phiUc_internal<<>>(num_cells, - d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - dataBase_.d_face_vector, dataBase_.d_weight, d_sum_rhoD_grady, d_phiUc); - blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block; - calculate_phiUc_boundary<<>>(num_boundary_faces, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - dataBase_.d_boundary_face_vector, d_sum_boundary_rhoD_grady, d_phiUc_boundary); + // compute diffAlphaD + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvc_laplacian_internal<<>>(num_cells, + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + d_alpha, d_hai, dataBase_.d_Y[i], + dataBase_.d_weight, dataBase_.d_face, dataBase_.d_deltaCoeffs, + dataBase_.d_volume, dataBase_.d_diffAlphaD); + + // fvm::laplacian + if (i != inertIndex) + { + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvm_laplacian_uncorrected_scalar_internal<<>>(num_cells, + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + d_mut_Sct, d_rhoD, dataBase_.d_weight, dataBase_.d_face, dataBase_.d_deltaCoeffs, + -1., d_A_csr + mtxIndex * (num_cells + num_faces), d_A_csr + mtxIndex * (num_cells + num_faces)); + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvm_laplacian_uncorrected_scalar_boundary<<>>(num_boundary_cells, + d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + d_boundary_rhoD, d_boundary_mut_sct, dataBase_.d_boundary_face, dataBase_.d_bouPermedIndex, + dataBase_.d_laplac_internal_coeffs_Y_vector[i], dataBase_.d_laplac_boundary_coeffs_Y_vector[i], + -1., d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, + d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells); + } + ++mtxIndex; + } blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; calculate_hDiffCorrFlux<<>>(num_cells, @@ -677,7 +699,7 @@ void dfYEqn::fvm_ddt() blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvm_ddt_kernel_scalar<<>>(num_cells, num_faces, dataBase_.rdelta_t, d_A_csr_row_index, d_A_csr_diag_index, - dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, d_Y_old[i], + dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, dataBase_.d_Y[i], d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, d_psi + mtxIndex * num_cells); @@ -694,21 +716,20 @@ void dfYEqn::fvm_div_phi() if (i == inertIndex) continue; - // launch cuda kernel + // mvConvection->fvmDiv(phi, Yi) threads_per_block = 512; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvm_div_internal_scalar<<>>(num_cells, num_faces, - d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_weight_upwind, dataBase_.d_phi, - d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, - d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells); - + d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_weight_upwind, dataBase_.d_phi, + d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, + d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells); blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; fvm_div_boundary_scalar<<>>(num_cells, num_faces, num_boundary_cells, - d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_phi, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - dataBase_.d_internal_coeffs_Y_vector[i], dataBase_.d_boundary_coeffs_Y_vector[i], - d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, - d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells); + d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_phi, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_internal_coeffs_Y_vector[i], dataBase_.d_boundary_coeffs_Y_vector[i], + d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, + d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells); ++mtxIndex; } } @@ -716,61 +737,37 @@ void dfYEqn::fvm_div_phi() void dfYEqn::fvm_div_phiUc() { size_t threads_per_block, blocks_per_grid; + + // compue phiUc + threads_per_block = 512; + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + calculate_phiUc_internal<<>>(num_cells, + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + dataBase_.d_face_vector, dataBase_.d_weight, d_sum_rhoD_grady, d_phiUc); + blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block; + calculate_phiUc_boundary<<>>(num_boundary_faces, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_boundary_face_vector, d_sum_boundary_rhoD_grady, d_phiUc_boundary); + + // mvConvection->fvmDiv(phiUc, Yi) int mtxIndex = 0; for (size_t i = 0; i < num_species; ++i) { if (i == inertIndex) continue; - // launch cuda kernel - threads_per_block = 512; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvm_div_internal_scalar<<>>(num_cells, num_faces, - d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_weight_upwind, d_phiUc, - d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, - d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells); + d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_weight_upwind, d_phiUc, + d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, + d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells); blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; fvm_div_boundary_scalar<<>>(num_cells, num_faces, num_boundary_cells, - d_A_csr_row_index, d_A_csr_diag_index, d_phiUc_boundary, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - dataBase_.d_internal_coeffs_Y_vector[i], dataBase_.d_boundary_coeffs_Y_vector[i], - d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, - d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells); - ++mtxIndex; - } -} - -void dfYEqn::fvm_laplacian(const double *mut_Sct, const double *boundary_mut_Sct, - std::vector rhoD, std::vector boundary_rhoD) -{ - size_t threads_per_block, blocks_per_grid; - int mtxIndex = 0; - - checkCudaErrors(cudaMemcpyAsync(d_boundary_mut_sct, boundary_mut_Sct, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_mut_Sct, mut_Sct, cell_bytes, cudaMemcpyHostToDevice, stream)); - - for (size_t i = 0; i < num_species; ++i) - { - if (i == inertIndex) - continue; - checkCudaErrors(cudaMemcpyAsync(d_rhoD, rhoD[i], cell_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_rhoD, boundary_rhoD[i], boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - - // launch cuda kernel - threads_per_block = 1024; - blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - fvm_laplacian_uncorrected_scalar_internal<<>>(num_cells, d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - d_mut_Sct, d_rhoD, dataBase_.d_weight, dataBase_.d_face, - dataBase_.d_deltaCoeffs, -1., d_A_csr + mtxIndex * (num_cells + num_faces), - d_A_csr + mtxIndex * (num_cells + num_faces)); - blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; - fvm_laplacian_uncorrected_scalar_boundary<<>>(num_cells, num_boundary_cells, - d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, - dataBase_.d_boundary_cell_id, d_boundary_rhoD, - d_boundary_mut_sct, dataBase_.d_boundary_face, dataBase_.d_bouPermedIndex, - dataBase_.d_laplac_internal_coeffs_Y_vector[i], dataBase_.d_laplac_boundary_coeffs_Y_vector[i], -1., - d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, - d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells); + d_A_csr_row_index, d_A_csr_diag_index, d_phiUc_boundary, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_internal_coeffs_Y_vector[i], dataBase_.d_boundary_coeffs_Y_vector[i], + d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, + d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells); ++mtxIndex; } } @@ -865,24 +862,6 @@ void dfYEqn::updatePsi(double *Psi, int speciesIndex) memcpy(Psi, h_psi + speciesIndex * num_cells, cell_bytes); } -void dfYEqn::compute_diffAlphaD(const double *alpha, std::vector hai) -{ - checkCudaErrors(cudaMemsetAsync(dataBase_.d_diffAlphaD, 0, cell_bytes, stream)); - checkCudaErrors(cudaMemcpyAsync(d_alpha, alpha, cell_bytes, cudaMemcpyHostToDevice, stream)); - size_t threads_per_block, blocks_per_grid; - for (size_t i = 0; i < num_species; ++i) - { - checkCudaErrors(cudaMemcpyAsync(d_hai, hai[i], cell_bytes, cudaMemcpyHostToDevice, stream)); - threads_per_block = 1024; - blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - fvc_laplacian_internal<<>>(num_cells, - d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - d_alpha, d_hai, d_Y_old[i], - dataBase_.d_weight, dataBase_.d_face, dataBase_.d_deltaCoeffs, - dataBase_.d_volume, dataBase_.d_diffAlphaD); - } -} - dfYEqn::~dfYEqn() { } From 5a0918ba03da72cae269f4bfce81abd2b4c25657 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Sat, 10 Jun 2023 00:50:09 +0800 Subject: [PATCH 4/6] refactor profile, add permute_psi for UEqn, add sync for each equation --- applications/solvers/dfLowMachFoam/EEqn.H | 52 ++-- applications/solvers/dfLowMachFoam/UEqn.H | 14 +- applications/solvers/dfLowMachFoam/YEqn.H | 40 ++- .../solvers/dfLowMachFoam/dfLowMachFoam.C | 242 ++++++++++++------ applications/solvers/dfLowMachFoam/pEqn.H | 40 +++ applications/solvers/dfLowMachFoam/rhoEqn.H | 25 ++ src_gpu/dfRhoEqn.H | 2 + src_gpu/dfRhoEqn.cu | 5 + src_gpu/dfUEqn.H | 12 +- src_gpu/dfUEqn.cu | 127 ++++----- src_gpu/dfYEqn.H | 30 +-- src_gpu/dfYEqn.cu | 5 + 12 files changed, 362 insertions(+), 232 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H index d84da50a1..bcab9affb 100644 --- a/applications/solvers/dfLowMachFoam/EEqn.H +++ b/applications/solvers/dfLowMachFoam/EEqn.H @@ -2,13 +2,12 @@ volScalarField& he = thermo.he(); #ifdef GPUSolver_ - start1 = std::clock(); - UEqn_GPU.updatePsi(&U[0][0]); - K = 0.5*magSqr(U); - end1 = std::clock(); - time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC); + start1 = std::clock(); + UEqn_GPU.updatePsi(&U[0][0]); + K = 0.5*magSqr(U); + end1 = std::clock(); + time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); #endif #ifdef CPUSolver_ @@ -32,7 +31,7 @@ EEqn.solve(); end1 = std::clock(); time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_EEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); #endif #ifdef GPUSolver_ @@ -41,10 +40,12 @@ start2 = std::clock(); const tmp alphaEff_tmp(turbulence->alphaEff()); const volScalarField& alphaEff = alphaEff_tmp(); + end2 = std::clock(); int eeqn_offset = 0; + int patchNum = 0; forAll(he.boundaryField(), patchi) { - const fvsPatchScalarField& patchFlux = phi.boundaryField()[patchi]; + patchNum++; const fvsPatchScalarField& pw = mesh.surfaceInterpolation::weights().boundaryField()[patchi]; int patchSize = pw.size(); @@ -56,19 +57,25 @@ eeqn_offset += patchSize; } end1 = std::clock(); - time_monitor_EEqn_mtxAssembly_CPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); - - EEqn_GPU.initializeTimeStep(); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); + fprintf(stderr, "time_monitor_EEqn_mtxAssembly_CPU_prepare: %lf, build alphaEff time: %lf, patchNum: %d\n", + time_monitor_EEqn_mtxAssembly_CPU_prepare, + double(end2 - start2) / double(CLOCKS_PER_SEC), patchNum); // prepare data on GPU start1 = std::clock(); EEqn_GPU.prepare_data(&he.oldTime()[0], &K[0], &K.oldTime()[0], &alphaEff[0], &dpdt[0], boundary_K, boundary_alphaEff); - if (doSync) EEqn_GPU.sync(); + EEqn_GPU.sync(); end1 = std::clock(); - time_monitor_EEqn_mtxAssembly_GPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly_GPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); start1 = std::clock(); + EEqn_GPU.initializeTimeStep(); EEqn_GPU.fvm_ddt(); EEqn_GPU.fvm_div(); EEqn_GPU.fvm_laplacian(); @@ -76,31 +83,26 @@ EEqn_GPU.fvc_div_phi_scalar(); EEqn_GPU.fvc_div_vector(); EEqn_GPU.add_to_source(); - if (doSync) EEqn_GPU.sync(); - end1 = std::clock(); - time_monitor_EEqn_mtxAssembly_GPU_Run += double(end1 - start1) / double(CLOCKS_PER_SEC); - EEqn_GPU.sync(); - end2 = std::clock(); - time_monitor_EEqn += double(end2 - start2) / double(CLOCKS_PER_SEC); - time_monitor_EEqn_mtxAssembly += double(end2 - start2) / double(CLOCKS_PER_SEC); + end1 = std::clock(); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC); // check value of mtxAssembly, no time monitor // EEqn_GPU.checkValue(false); start1 = std::clock(); EEqn_GPU.solve(); - if (doSync) EEqn_GPU.sync(); end1 = std::clock(); time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_EEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); start1 = std::clock(); EEqn_GPU.updatePsi(&he[0]); he.correctBoundaryConditions(); end1 = std::clock(); time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_EEqn_mtxAssembly_CPU_CorrectBC += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); #endif } diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H index 4427ece1b..190636f6f 100644 --- a/applications/solvers/dfLowMachFoam/UEqn.H +++ b/applications/solvers/dfLowMachFoam/UEqn.H @@ -1,9 +1,6 @@ // Solve the Momentum equation #ifdef GPUSolver_ start1 = std::clock(); - UEqn_GPU.initializeTimeStep(); - UEqn_GPU.fvm_ddt(&U.oldTime()[0][0]); - start2 = std::clock(); int offset = 0; const tmp nuEff_tmp(turbulence->nuEff()); const volScalarField& nuEff = nuEff_tmp(); @@ -27,21 +24,24 @@ offset += patchSize; } end1 = std::clock(); - end2 = std::clock(); - time_monitor_UEqn_CPU += double(end2 - start2) / double(CLOCKS_PER_SEC); time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); start1 = std::clock(); + UEqn_GPU.initializeTimeStep(); + UEqn_GPU.fvm_ddt(&U.oldTime()[0][0]); UEqn_GPU.fvm_div(boundary_pressure_init, boundary_velocity_init, boundary_nuEff_init, boundary_rho_init); UEqn_GPU.fvc_grad(&p[0]); UEqn_GPU.fvc_grad_vector(); UEqn_GPU.dev2T(); UEqn_GPU.fvc_div_tensor(&nuEff[0]); UEqn_GPU.fvm_laplacian(); + UEqn_GPU.sync(); end1 = std::clock(); time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC); // start2 = std::clock(); // fvVectorMatrix turb_source @@ -88,7 +88,7 @@ } end1 = std::clock(); time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); #endif // start1 = std::clock(); @@ -96,7 +96,7 @@ // UEqn_GPU.solve(); // end1 = std::clock(); // time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); -// time_monitor_UEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); +// time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); // start1 = std::clock(); // // // t.join(); diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H index 2e106fca0..a547e266d 100644 --- a/applications/solvers/dfLowMachFoam/YEqn.H +++ b/applications/solvers/dfLowMachFoam/YEqn.H @@ -1,4 +1,3 @@ -start = std::clock(); hDiffCorrFlux = Zero; diffAlphaD = Zero; sumYDiffError = Zero; @@ -23,21 +22,17 @@ forAll(Y, i) const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf(); start1 = std::clock(); time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); -time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); +time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); #endif #ifdef GPUSolver_ start1 = std::clock(); - // // std::thread t(&dfMatrix::solve, &UEqn_GPU); UEqn_GPU.solve(); end1 = std::clock(); time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); -#endif + time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); -#ifdef GPUSolver_ start1 = std::clock(); - start2 = std::clock(); std::vector Y_old(Y.size()), boundary_Y(Y.size()), boundary_hai(Y.size()), boundary_rhoD(Y.size()); std::vector hai(Y.size()), rhoD(Y.size()); for (size_t i = 0; i < Y.size(); ++i) @@ -65,7 +60,6 @@ time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); offset += patchSize; } } - volScalarField mut_sct = turbulence->mut().ref()/Sct; double *boundary_mutsct = nullptr; cudaMallocHost(&boundary_mutsct, num_boundary_faces*sizeof(double)); @@ -77,9 +71,13 @@ time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); memcpy(boundary_mutsct + offset, &patchMut_sct[0], patchSize*sizeof(double)); offset += patchSize; } - end2 = std::clock(); - time_monitor_YEqn_mtxAssembly_CPU_Prepare += double(end2 - start2) / double(CLOCKS_PER_SEC); + end1 = std::clock(); + time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); + fprintf(stderr, "time_monitor_YEqn_mtxAssembly_CPU_prepare: %lf\n", time_monitor_YEqn_mtxAssembly_CPU_prepare); + start1 = std::clock(); YEqn_GPU.initializeTimeStep(); YEqn_GPU.upwindWeight(); YEqn_GPU.fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(Y_old, boundary_Y, @@ -87,24 +85,23 @@ time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); YEqn_GPU.fvm_ddt(); YEqn_GPU.fvm_div_phi(); YEqn_GPU.fvm_div_phiUc(); - + YEqn_GPU.sync(); end1 = std::clock(); time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC); start1 = std::clock(); YEqn_GPU.solve(); end1 = std::clock(); time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); #endif //MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); label flag_mpi_init; MPI_Initialized(&flag_mpi_init); if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); -end = std::clock(); -time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC); { if (!splitting) @@ -119,9 +116,8 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC); time_monitor_chem += processingTime.count(); } + start2 = std::clock(); volScalarField Yt(0.0*Y[0]); - - start = std::clock(); int speciesIndex = 0; forAll(Y, i) { @@ -137,9 +133,7 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC); YEqn_GPU.updatePsi(&Yi[0], speciesIndex); Yi.correctBoundaryConditions(); end1 = std::clock(); - time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_YEqn_mtxAssembly_CPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); #else start1 = std::clock(); tmp DEff = chemistry->rhoD(i) + turbulence->mut()/Sct; @@ -156,15 +150,13 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC); ) ); end1 = std::clock(); - time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); // YiEqn.relax(); start1 = std::clock(); YiEqn.solve("Yi"); end1 = std::clock(); - time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); #endif Yi.max(0.0); @@ -176,6 +168,6 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC); Y[inertIndex] = scalar(1) - Yt; Y[inertIndex].max(0.0); - end = std::clock(); - time_monitor_Y += double(end - start) / double(CLOCKS_PER_SEC); + end2 = std::clock(); + time_monitor_YEqn += double(end2 - start2) / double(CLOCKS_PER_SEC); } diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C index 6f54e0a95..c47a03c0d 100644 --- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C +++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C @@ -96,32 +96,56 @@ int main(int argc, char *argv[]) #include "createFields.H" #include "createRhoUfIfPresent.H" - bool doSync = true; - double time_monitor_flow=0; - double time_monitor_UEqn=0; - double time_monitor_UEqn_mtxAssembly=0; - double time_monitor_UEqn_Solve=0; - double time_monitor_UEqn_sum=0; - double time_monitor_UEqn_CPU=0; - double time_monitor_EEqn=0; - double time_monitor_EEqn_mtxAssembly=0; - double time_monitor_EEqn_mtxAssembly_CPU_Prepare; - double time_monitor_EEqn_mtxAssembly_GPU_Prepare; - double time_monitor_EEqn_mtxAssembly_GPU_Run; - double time_monitor_EEqn_mtxAssembly_CPU_CorrectBC; - double time_monitor_EEqn_Solve=0; - double time_monitor_EEqn_sum=0; - double time_monitor_YEqn=0; - double time_monitor_YEqn_mtxAssembly=0; - double time_monitor_YEqn_mtxAssembly_CPU_Prepare=0; - double time_monitor_YEqn_Solve=0; - double time_monitor_chem=0; - double time_monitor_Y=0; - double time_monitor_E=0; - double time_monitor_corrThermo=0; - double time_monitor_corrDiff=0; - double time_monitor_CPU=0; - double time_monitor_UinE=0; + double time_monitor_init = 0; + + double time_monitor_other = 0; + double time_monitor_rho = 0; + double time_monitor_U = 0; + double time_monitor_Y = 0; + double time_monitor_E = 0; + double time_monitor_p = 0; + double time_monitor_chemistry_correctThermo = 0; + double time_monitor_turbulence_correct = 0; + double time_monitor_chem = 0; // combustion correct + + double time_monitor_rhoEqn = 0; + double time_monitor_rhoEqn_mtxAssembly = 0; + double time_monitor_rhoEqn_mtxAssembly_CPU_prepare = 0; + double time_monitor_rhoEqn_mtxAssembly_GPU_run = 0; + double time_monitor_rhoEqn_solve = 0; + double time_monitor_rhoEqn_correctBC = 0; + + double time_monitor_UEqn = 0; + double time_monitor_UEqn_mtxAssembly = 0; + double time_monitor_UEqn_mtxAssembly_CPU_prepare = 0; + double time_monitor_UEqn_mtxAssembly_GPU_run = 0; + double time_monitor_UEqn_solve = 0; + double time_monitor_UEqn_correctBC = 0; + double time_monitor_UEqn_H = 0; + double time_monitor_UEqn_H_GPU_run = 0; + double time_monitor_UEqn_H_correctBC = 0; + double time_monitor_UEqn_A = 0; + double time_monitor_UEqn_A_GPU_run = 0; + double time_monitor_UEqn_A_correctBC = 0; + + double time_monitor_YEqn = 0; + double time_monitor_YEqn_mtxAssembly = 0; + double time_monitor_YEqn_mtxAssembly_CPU_prepare = 0; + double time_monitor_YEqn_mtxAssembly_GPU_run = 0; + double time_monitor_YEqn_solve = 0; + double time_monitor_YEqn_correctBC = 0; + + double time_monitor_EEqn = 0; + double time_monitor_EEqn_mtxAssembly = 0; + double time_monitor_EEqn_mtxAssembly_CPU_prepare = 0; + double time_monitor_EEqn_mtxAssembly_GPU_prepare = 0; + double time_monitor_EEqn_mtxAssembly_GPU_run = 0; + double time_monitor_EEqn_solve = 0; + double time_monitor_EEqn_correctBC = 0; + + double time_monitor_pEqn = 0; + double time_monitor_pEqn_solve = 0; + label timeIndex = 0; clock_t start, end, start1, end1, start2, end2; @@ -136,9 +160,7 @@ int main(int argc, char *argv[]) start1 = std::clock(); #include "createdfSolver.H" end1 = std::clock(); - // time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - // time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - // double time_UEqn_initial = time_monitor_UEqn; + time_monitor_init += double(end1 - start1) / double(CLOCKS_PER_SEC); // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // @@ -164,9 +186,11 @@ int main(int argc, char *argv[]) Info<< "Time = " << runTime.timeName() << nl << endl; + clock_t loop_start = std::clock(); // --- Pressure-velocity PIMPLE corrector loop while (pimple.loop()) { + start = std::clock(); if (splitting) { #include "YEqn_RR.H" @@ -180,20 +204,28 @@ int main(int argc, char *argv[]) rhoU = new volVectorField("rhoU", rho*U); } } + end = std::clock(); + time_monitor_other += double(end - start) / double(CLOCKS_PER_SEC); + start = std::clock(); if (pimple.firstPimpleIter() && !pimple.simpleRho()) { #include "rhoEqn.H" } + end = std::clock(); + time_monitor_rho += double(end - start) / double(CLOCKS_PER_SEC); start = std::clock(); #include "UEqn.H" end = std::clock(); - time_monitor_flow += double(end - start) / double(CLOCKS_PER_SEC); + time_monitor_U += double(end - start) / double(CLOCKS_PER_SEC); if(combModelName!="ESF" && combModelName!="flareFGM" ) { + start = std::clock(); #include "YEqn.H" + end = std::clock(); + time_monitor_Y += double(end - start) / double(CLOCKS_PER_SEC); start = std::clock(); #include "EEqn.H" @@ -203,7 +235,7 @@ int main(int argc, char *argv[]) start = std::clock(); chemistry->correctThermo(); end = std::clock(); - time_monitor_corrThermo += double(end - start) / double(CLOCKS_PER_SEC); + time_monitor_chemistry_correctThermo += double(end - start) / double(CLOCKS_PER_SEC); } else { @@ -227,73 +259,131 @@ int main(int argc, char *argv[]) } } end = std::clock(); - time_monitor_flow += double(end - start) / double(CLOCKS_PER_SEC); + time_monitor_p += double(end - start) / double(CLOCKS_PER_SEC); + start = std::clock(); if (pimple.turbCorr()) { turbulence->correct(); } + end = std::clock(); + time_monitor_turbulence_correct += double(end - start) / double(CLOCKS_PER_SEC); } + clock_t loop_end = std::clock(); + double loop_time = double(loop_end - loop_start) / double(CLOCKS_PER_SEC); rho = thermo.rho(); runTime.write(); - time_monitor_UEqn_sum += time_monitor_UEqn; - time_monitor_EEqn_sum += time_monitor_EEqn; Info << "output time index " << runTime.timeIndex() << endl; + Info<< "Init Time(createdfSolver) = " << time_monitor_init << " s" << endl; + Info<< "========Time Spent in diffenet parts========"<< endl; - Info<< "Chemical sources = " << time_monitor_chem << " s" << endl; - Info<< "Species Equations = " << time_monitor_Y << " s" << endl; - Info<< "U & p Equations = " << time_monitor_flow << " s" << endl; - Info<< "Energy Equations = " << time_monitor_E << " s" << endl; - Info<< "thermo & Trans Properties = " << time_monitor_corrThermo << " s" << endl; - Info<< "Diffusion Correction Time = " << time_monitor_corrDiff << " s" << endl; - Info<< "UEqn Time = " << time_monitor_UEqn << " s" << endl; - Info<< "UEqn Time assamble Mtx = " << time_monitor_UEqn_mtxAssembly << " s" << endl; - Info<< "UEqn Time solve = " << time_monitor_UEqn_Solve << " s" << endl; - // Info<< "UEqn sum Time = " << time_monitor_UEqn_sum << " s" << endl; - // Info<< "UEqn sum Time - overhead = " << time_monitor_UEqn_sum - time_UEqn_initial << " s" << endl; - Info<< "EEqn Time = " << time_monitor_EEqn << " s" << endl; - Info<< "EEqn Time assamble Mtx = " << time_monitor_EEqn_mtxAssembly << " s" << endl; - Info<< "EEqn assamble(CPU prepare) = " << time_monitor_EEqn_mtxAssembly_CPU_Prepare << " s" << endl; - Info<< "EEqn assamble(GPU prepare) = " << time_monitor_EEqn_mtxAssembly_GPU_Prepare << " s" << endl; - Info<< "EEqn assamble(GPU run) = " << time_monitor_EEqn_mtxAssembly_GPU_Run << " s" << endl; - Info<< "EEqn assamble(CPU correct) = " << time_monitor_EEqn_mtxAssembly_CPU_CorrectBC << " s" << endl; - Info<< "EEqn Time solve = " << time_monitor_EEqn_Solve << " s" << endl; - Info<< "YEqn Time = " << time_monitor_YEqn << " s" << endl; - Info<< "YEqn Time assamble Mtx = " << time_monitor_YEqn_mtxAssembly << " s" << endl; - Info<< "YEqn assamble(CPU prepare) = " << time_monitor_YEqn_mtxAssembly_CPU_Prepare << " s" << endl; - Info<< "YEqn Time solve = " << time_monitor_YEqn_Solve << " s" << endl; - Info<< "sum Time = " << (time_monitor_chem + time_monitor_Y + time_monitor_flow + time_monitor_E + time_monitor_corrThermo + time_monitor_corrDiff) << " s" << endl; - Info<< "CPU Time (get turb souce) = " << time_monitor_CPU << " s" << endl; + Info<< "loop Time = " << loop_time << " s" << endl; + Info<< "other Time = " << time_monitor_other << " s" << endl; + Info<< "rho Equations = " << time_monitor_rho << " s" << endl; + Info<< "U Equations = " << time_monitor_U << " s" << endl; + Info<< "Y Equations = " << time_monitor_Y << " s" << endl; + Info<< "E Equations = " << time_monitor_E << " s" << endl; + Info<< "p Equations = " << time_monitor_p << " s" << endl; + Info<< "chemistry correctThermo = " << time_monitor_chemistry_correctThermo << " s" << endl; + Info<< "turbulence correct = " << time_monitor_turbulence_correct << " s" << endl; + Info<< "combustion correct(in Y) = " << time_monitor_chem << " s" << endl; + + Info<< "========Time details of each equation======="<< endl; + + Info<< "rhoEqn Time = " << time_monitor_rhoEqn << " s" << endl; + Info<< "rhoEqn assamble = " << time_monitor_rhoEqn_mtxAssembly << " s" << endl; + Info<< "rhoEqn assamble(CPU prepare) = " << time_monitor_rhoEqn_mtxAssembly_CPU_prepare << " s" << endl; + Info<< "rhoEqn assamble(GPU run) = " << time_monitor_rhoEqn_mtxAssembly_GPU_run << " s" << endl; + Info<< "rhoEqn solve = " << time_monitor_rhoEqn_solve << " s" << endl; + Info<< "rhoEqn correct boundary = " << time_monitor_rhoEqn_correctBC << " s" << endl; + + Info<< "UEqn Time = " << time_monitor_UEqn << " s" << endl; + Info<< "UEqn assamble = " << time_monitor_UEqn_mtxAssembly << " s" << endl; + Info<< "UEqn assamble(CPU prepare) = " << time_monitor_UEqn_mtxAssembly_CPU_prepare << " s" << endl; + Info<< "UEqn assamble(GPU run) = " << time_monitor_UEqn_mtxAssembly_GPU_run << " s" << endl; + Info<< "UEqn solve = " << time_monitor_UEqn_solve << " s" << endl; + Info<< "UEqn correct boundary = " << time_monitor_UEqn_correctBC << " s" << endl; + Info<< "UEqn H = " << time_monitor_UEqn_H << " s" << endl; + Info<< "UEqn H(GPU run) = " << time_monitor_UEqn_H_GPU_run << " s" << endl; + Info<< "UEqn H(correct boundary) = " << time_monitor_UEqn_H_correctBC << " s" << endl; + Info<< "UEqn A = " << time_monitor_UEqn_A << " s" << endl; + Info<< "UEqn A(GPU run) = " << time_monitor_UEqn_A_GPU_run << " s" << endl; + Info<< "UEqn A(correct boundary) = " << time_monitor_UEqn_A_correctBC << " s" << endl; + + Info<< "YEqn Time = " << time_monitor_YEqn << " s" << endl; + Info<< "YEqn assamble = " << time_monitor_YEqn_mtxAssembly << " s" << endl; + Info<< "YEqn assamble(CPU prepare) = " << time_monitor_YEqn_mtxAssembly_CPU_prepare << " s" << endl; + Info<< "YEqn assamble(GPU run) = " << time_monitor_YEqn_mtxAssembly_GPU_run << " s" << endl; + Info<< "YEqn solve = " << time_monitor_YEqn_solve << " s" << endl; + Info<< "YEqn correct boundary = " << time_monitor_YEqn_correctBC << " s" << endl; + + Info<< "EEqn Time = " << time_monitor_EEqn << " s" << endl; + Info<< "EEqn assamble = " << time_monitor_EEqn_mtxAssembly << " s" << endl; + Info<< "EEqn assamble(CPU prepare) = " << time_monitor_EEqn_mtxAssembly_CPU_prepare << " s" << endl; + Info<< "EEqn assamble(GPU prepare) = " << time_monitor_EEqn_mtxAssembly_GPU_prepare << " s" << endl; + Info<< "EEqn assamble(GPU run) = " << time_monitor_EEqn_mtxAssembly_GPU_run << " s" << endl; + Info<< "EEqn solve = " << time_monitor_EEqn_solve << " s" << endl; + Info<< "EEqn correct boundary = " << time_monitor_EEqn_correctBC << " s" << endl; + + Info<< "pEqn Time = " << time_monitor_pEqn << " s" << endl; + Info<< "pEqn Time solve = " << time_monitor_pEqn_solve << " s" << endl; + Info<< "============================================"<= num_cells) + return; + + output[index * 3 + 0] = input[num_cells * 0 + index]; + output[index * 3 + 1] = input[num_cells * 1 + index]; + output[index * 3 + 2] = input[num_cells * 2 + index]; +} + +__global__ void permute_psi_h2d(int num_cells, const double *input, double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + output[num_cells * 0 + index] = input[index * 3 + 0]; + output[num_cells * 1 + index] = input[index * 3 + 1]; + output[num_cells * 2 + index] = input[index * 3 + 2]; +} + __global__ void lduMatrix_H(int num_cells, const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, const double *volume, const double *psi, const double *A_csr, const double *b, @@ -1032,7 +1054,9 @@ dfUEqn::dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std checkCudaErrors(cudaMalloc((void **)&d_A_csr, csr_value_vec_bytes)); checkCudaErrors(cudaMalloc((void **)&d_b, cell_vec_bytes)); checkCudaErrors(cudaMalloc((void **)&d_psi, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_psi_permute, cell_vec_bytes)); checkCudaErrors(cudaMalloc((void **)&d_H, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_H_permute, cell_vec_bytes)); checkCudaErrors(cudaMalloc((void **)&d_A, cell_bytes)); checkCudaErrors(cudaMalloc((void **)&d_ueqn_internal_coeffs, cell_vec_bytes)); checkCudaErrors(cudaMalloc((void **)&d_ueqn_boundary_coeffs, cell_vec_bytes)); @@ -1041,42 +1065,29 @@ dfUEqn::dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std void dfUEqn::fvm_ddt(double *vector_old) { // Copy the host input array in host memory to the device input array in device memory - clock_t start = std::clock(); checkCudaErrors(cudaMemcpyAsync(dataBase_.d_velocity_old, vector_old, cell_vec_bytes, cudaMemcpyHostToDevice, stream)); - clock_t end = std::clock(); - time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC); - - // launch cuda kernel - start = std::clock(); size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvm_ddt_kernel<<>>(num_cells, num_faces, dataBase_.rdelta_t, - d_A_csr_row_index, d_A_csr_diag_index, - dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, dataBase_.d_velocity_old, d_A_csr, d_b, d_A_csr, d_b, d_psi); - end = std::clock(); - time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); + d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, dataBase_.d_velocity_old, d_A_csr, d_b, d_A_csr, d_b, d_psi); } void dfUEqn::fvm_div(double *boundary_pressure_init, double *boundary_velocity_init, double *boundary_nuEff_init, double *boundary_rho_init) { // copy and permutate boundary variable - clock_t start = std::clock(); checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_velocity_init, boundary_velocity_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_pressure_init, boundary_pressure_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_nuEff_init, boundary_nuEff_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_rho_init, boundary_rho_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - clock_t end = std::clock(); - time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC); - start = std::clock(); size_t threads_per_block = 1024; size_t blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block; boundaryPermutation<<>>(dataBase_.num_boundary_faces, dataBase_.d_bouPermedIndex, dataBase_.d_boundary_pressure_init, - dataBase_.d_boundary_velocity_init, dataBase_.d_boundary_pressure, dataBase_.d_boundary_velocity, - dataBase_.d_boundary_nuEff_init, dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho_init, dataBase_.d_boundary_rho); + dataBase_.d_boundary_velocity_init, dataBase_.d_boundary_pressure, dataBase_.d_boundary_velocity, + dataBase_.d_boundary_nuEff_init, dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho_init, dataBase_.d_boundary_rho); - // launch cuda kernel blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvm_div_internal<<>>(num_cells, num_faces, d_A_csr_row_index, d_A_csr_diag_index, @@ -1087,20 +1098,14 @@ void dfUEqn::fvm_div(double *boundary_pressure_init, double *boundary_velocity_i dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs, d_A_csr, d_b, d_A_csr, d_b, d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs); - end = std::clock(); - time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } void dfUEqn::fvc_grad(double *pressure) { // Copy the host input array in host memory to the device input array in device memory - clock_t start = std::clock(); checkCudaErrors(cudaMemcpyAsync(dataBase_.d_pressure, pressure, cell_bytes, cudaMemcpyHostToDevice, stream)); - clock_t end = std::clock(); - time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC); // launch cuda kernel - start = std::clock(); size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvc_grad_internal_face<<>>(num_cells, @@ -1110,14 +1115,10 @@ void dfUEqn::fvc_grad(double *pressure) fvc_grad_boundary_face<<>>(num_cells, num_boundary_cells, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_face_vector, dataBase_.d_boundary_pressure, d_b, d_b); - end = std::clock(); - time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } void dfUEqn::fvc_grad_vector() { - clock_t start = std::clock(); - // launch CUDA kernal size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvc_grad_vector_internal<<>>(num_cells, @@ -1132,33 +1133,21 @@ void dfUEqn::fvc_grad_vector() correct_boundary_conditions<<>>(num_boundary_cells, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_face_vector, dataBase_.d_boundary_face, dataBase_.d_grad_boundary_init, dataBase_.d_grad_boundary); - clock_t end = std::clock(); - time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } void dfUEqn::dev2T() { - clock_t start = std::clock(); - // launch CUDA kernal size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; dev2_t_tensor<<>>(num_cells, dataBase_.d_grad); blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block; dev2_t_tensor<<>>(dataBase_.num_boundary_faces, dataBase_.d_grad_boundary); - clock_t end = std::clock(); - time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } void dfUEqn::fvc_div_tensor(const double *nuEff) { - clock_t start = std::clock(); checkCudaErrors(cudaMemcpyAsync(dataBase_.d_nuEff, nuEff, cell_bytes, cudaMemcpyHostToDevice, stream)); - clock_t end = std::clock(); - time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC); - - // launch cuda kernel - start = std::clock(); size_t threads_per_block = 512; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvc_div_tensor_internal<<>>(num_cells, @@ -1171,14 +1160,10 @@ void dfUEqn::fvc_div_tensor(const double *nuEff) dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_face_vector, dataBase_.d_grad_boundary, dataBase_.d_volume, 1., d_b, d_b); - end = std::clock(); - time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } void dfUEqn::fvm_laplacian() { - clock_t start = std::clock(); - // launch CUDA kernels size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvm_laplacian_uncorrected_vector_internal<<>>(num_cells, num_faces, @@ -1190,8 +1175,6 @@ void dfUEqn::fvm_laplacian() d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_face, dataBase_.d_laplac_internal_coeffs, dataBase_.d_laplac_boundary_coeffs, -1., d_A_csr, d_b, d_A_csr, d_b, d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs); - clock_t end = std::clock(); - time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } void dfUEqn::A(double *Psi) @@ -1209,8 +1192,7 @@ void dfUEqn::A(double *Psi) checkCudaErrors(cudaMemcpyAsync(h_A, d_A, cell_bytes, cudaMemcpyDeviceToHost, stream)); checkCudaErrors(cudaStreamSynchronize(stream)); - for (size_t i = 0; i < num_cells; i++) - Psi[i] = h_A[i]; + memcpy(Psi, h_A, cell_bytes); } void dfUEqn::H(double *Psi) @@ -1227,18 +1209,13 @@ void dfUEqn::H(double *Psi) lduMatrix_H<<>>(num_cells, d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, dataBase_.d_volume, d_psi, d_A_csr, d_b, d_ueqn_boundary_coeffs, d_H); - checkCudaErrors(cudaMemcpyAsync(h_H, d_H, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + permute_psi_d2h<<>>(num_cells, d_H, d_H_permute); - for (size_t i = 0; i < num_cells; i++) - { - Psi[i * 3] = h_H[i]; - Psi[i * 3 + 1] = h_H[num_cells + i]; - Psi[i * 3 + 2] = h_H[num_cells * 2 + i]; - } + checkCudaErrors(cudaMemcpyAsync(h_H, d_H_permute, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); - // for (int i = 0; i < num_cells; i++) - // fprintf(stderr, "h_H_GPU[%d]: (%.5e, %.5e, %.5e)\n", i, h_H[i], h_H[num_cells + i], h_H[num_cells * 2 + i]); + memcpy(Psi, h_H, cell_vec_bytes); } void dfUEqn::initializeTimeStep() @@ -1341,12 +1318,6 @@ void dfUEqn::solve() // checkCudaErrors(cudaMemcpyAsync(h_psi, d_psi, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); checkCudaErrors(cudaStreamSynchronize(stream)); - printf("CPU Time (copy&permutate) = %.6lf s\n", time_monitor_CPU); - printf("GPU Time (kernel launch) = %.6lf s\n", time_monitor_GPU_kernel); - printf("GPU Time (memcpy) = %.6lf s\n", time_monitor_GPU_memcpy); - time_monitor_CPU = 0; - time_monitor_GPU_kernel = 0; - time_monitor_GPU_memcpy = 0; // nvtxRangePush("solve"); @@ -1369,33 +1340,35 @@ void dfUEqn::solve() UzSolver->solve(num_cells, d_psi + 2 * num_cells, d_b + 2 * num_cells); num_iteration++; - checkCudaErrors(cudaMemcpyAsync(h_psi, d_psi, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + permute_psi_d2h<<>>(num_cells, d_psi, d_psi_permute); + checkCudaErrors(cudaMemcpyAsync(h_psi, d_psi_permute, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); // for (size_t i = 0; i < num_cells; i++) // fprintf(stderr, "h_velocity_after[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_psi[i], // h_psi[num_cells + i], h_psi[num_cells*2 + i]); } +void dfUEqn::sync() +{ + checkCudaErrors(cudaStreamSynchronize(stream)); +} + void dfUEqn::updatePsi(double *Psi) { checkCudaErrors(cudaStreamSynchronize(stream)); - for (size_t i = 0; i < num_cells; i++) - { - Psi[i * 3] = h_psi[i]; - Psi[i * 3 + 1] = h_psi[num_cells + i]; - Psi[i * 3 + 2] = h_psi[num_cells * 2 + i]; - } + memcpy(Psi, h_psi, cell_vec_bytes); } // correct volecity in pEqn void dfUEqn::correctPsi(double *Psi) { - for (size_t i = 0; i < num_cells; i++) - { - h_psi[i] = Psi[i * 3]; - h_psi[num_cells + i] = Psi[i * 3 + 1]; - h_psi[num_cells * 2 + i] = Psi[i * 3 + 2]; - } - checkCudaErrors(cudaMemcpyAsync(d_psi, h_psi, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); + memcpy(h_psi, Psi, cell_vec_bytes); + checkCudaErrors(cudaMemcpyAsync(d_psi_permute, h_psi, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + permute_psi_h2d<<>>(num_cells, d_psi_permute, d_psi); } dfUEqn::~dfUEqn() diff --git a/src_gpu/dfYEqn.H b/src_gpu/dfYEqn.H index f41cd9799..2bc645cdd 100644 --- a/src_gpu/dfYEqn.H +++ b/src_gpu/dfYEqn.H @@ -12,8 +12,6 @@ private: std::vector YSolverSet; int num_iteration = 0; - double time_monitor_CPU; - double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test; // common variables int num_cells, cell_bytes, num_faces, num_surfaces, num_boundary_cells, num_boundary_faces, num_species, boundary_face_bytes, inertIndex; @@ -22,23 +20,18 @@ private: // Matrix variables double *d_A_csr, *d_b, *d_psi = nullptr; double *h_A_csr, *h_b, *h_psi = nullptr; - double *d_sumYDiffError = nullptr, *d_sumYDiffError_tmp = nullptr; - double *d_sumYDiffError_boundary = nullptr; - double *d_phiUc_boundary = nullptr; - double *d_phiUc = nullptr; - double *d_mut_Sct = nullptr; - double *d_boundary_mut_sct = nullptr; - double *d_boundary_Y = nullptr; + double *d_alpha = nullptr; + double *d_mut_Sct, *d_boundary_mut_sct = nullptr; double *d_hai, *d_boundary_hai = nullptr; double *d_rhoD, *d_boundary_rhoD = nullptr; - double *d_sum_rhoD_grady, *d_sum_boundary_rhoD_grady = nullptr; double *d_sum_hai_rhoD_grady, *d_sum_boundary_hai_rhoD_grady = nullptr; + double *d_sum_rhoD_grady, *d_sum_boundary_rhoD_grady = nullptr; double *d_sum_hai_y, *d_sum_boundary_hai_y = nullptr; + double *d_phiUc, *d_phiUc_boundary = nullptr; + double *d_boundary_Y = nullptr; double *d_grady, *d_boundary_grady = nullptr; - double *d_alpha = nullptr; - public: dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile, const int inertIndex); @@ -48,13 +41,14 @@ public: void checkValue(bool print, char *filename); - void fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(std::vector Y_old, std::vector boundary_Y, - std::vector hai, std::vector boundary_hai, - std::vector rhoD, std::vector boundary_rhoD, - const double *mut_Sct, const double *boundary_mut_Sct, const double *alpha); - void upwindWeight(); + void fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux( + std::vector Y_old, std::vector boundary_Y, + std::vector hai, std::vector boundary_hai, + std::vector rhoD, std::vector boundary_rhoD, + const double *mut_Sct, const double *boundary_mut_Sct, const double *alpha); + void fvm_ddt(); void fvm_div_phi(); @@ -63,5 +57,7 @@ public: void solve(); + void sync(); + void updatePsi(double *Psi, int speciesIndex); }; diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu index 41987ee26..1e31bf0ed 100644 --- a/src_gpu/dfYEqn.cu +++ b/src_gpu/dfYEqn.cu @@ -856,6 +856,11 @@ void dfYEqn::solve() // fprintf(stderr, "h_species_gpu[%d]: %.5e\n", i, h_psi[i + 0 * num_cells]); } +void dfYEqn::sync() +{ + checkCudaErrors(cudaStreamSynchronize(stream)); +} + void dfYEqn::updatePsi(double *Psi, int speciesIndex) { checkCudaErrors(cudaStreamSynchronize(stream)); From 4f97772f651833a0dfb5373ecebd367fbf8b4757 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Sat, 10 Jun 2023 23:37:54 +0800 Subject: [PATCH 5/6] move Yt to GPU --- applications/solvers/dfLowMachFoam/YEqn.H | 62 ++++++++++++----------- src_gpu/dfMatrixDataBase.H | 8 +-- src_gpu/dfRhoEqn.cu | 6 --- src_gpu/dfYEqn.cu | 55 ++++++++++++++------ 4 files changed, 77 insertions(+), 54 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H index a547e266d..94fb6cf8d 100644 --- a/applications/solvers/dfLowMachFoam/YEqn.H +++ b/applications/solvers/dfLowMachFoam/YEqn.H @@ -116,48 +116,50 @@ if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); time_monitor_chem += processingTime.count(); } +#ifdef GPUSolver_ + start1 = std::clock(); + forAll(Y, i) + { + volScalarField& Yi = Y[i]; + YEqn_GPU.updatePsi(&Yi[0], i); + Yi.correctBoundaryConditions(); + } + end1 = std::clock(); + time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); +#else start2 = std::clock(); volScalarField Yt(0.0*Y[0]); int speciesIndex = 0; forAll(Y, i) { volScalarField& Yi = Y[i]; -#ifdef CPUSolver_ hDiffCorrFlux += chemistry->hai(i)*(chemistry->rhoD(i)*fvc::grad(Yi) - Yi*sumYDiffError); diffAlphaD += fvc::laplacian(thermo.alpha()*chemistry->hai(i), Yi); -#endif if (i != inertIndex) { - #ifdef GPUSolver_ - start1 = std::clock(); - YEqn_GPU.updatePsi(&Yi[0], speciesIndex); - Yi.correctBoundaryConditions(); - end1 = std::clock(); - time_monitor_YEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); - #else - start1 = std::clock(); - tmp DEff = chemistry->rhoD(i) + turbulence->mut()/Sct; - fvScalarMatrix YiEqn + start1 = std::clock(); + tmp DEff = chemistry->rhoD(i) + turbulence->mut()/Sct; + fvScalarMatrix YiEqn ( - fvm::ddt(rho, Yi) - + mvConvection->fvmDiv(phi, Yi) - + mvConvection->fvmDiv(phiUc, Yi) - == - ( - splitting - ? fvm::laplacian(DEff(), Yi) - : (fvm::laplacian(DEff(), Yi) + combustion->R(Yi)) - ) + fvm::ddt(rho, Yi) + + mvConvection->fvmDiv(phi, Yi) + + mvConvection->fvmDiv(phiUc, Yi) + == + ( + splitting + ? fvm::laplacian(DEff(), Yi) + : (fvm::laplacian(DEff(), Yi) + combustion->R(Yi)) + ) ); - end1 = std::clock(); - time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - // YiEqn.relax(); + end1 = std::clock(); + time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + // YiEqn.relax(); - start1 = std::clock(); - YiEqn.solve("Yi"); - end1 = std::clock(); - time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); - #endif + start1 = std::clock(); + YiEqn.solve("Yi"); + end1 = std::clock(); + time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); Yi.max(0.0); Yt += Yi; @@ -167,7 +169,7 @@ if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); Y[inertIndex] = scalar(1) - Yt; Y[inertIndex].max(0.0); - end2 = std::clock(); time_monitor_YEqn += double(end2 - start2) / double(CLOCKS_PER_SEC); +#endif } diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 81c82a9d8..8ed3dcd32 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -91,7 +91,8 @@ struct dfMatrixDataBase double *d_rho_new = nullptr, *d_rho_old = nullptr, *d_velocity_old = nullptr, *d_pressure = nullptr, *d_volume = nullptr; // - the device pointer to Y(vector Yi) - std::vector d_Y; + //std::vector d_Y; + double *d_Y = nullptr; // - the device pointer to the pre-permutated and post-permutated interpolation weight list double *d_weight_init = nullptr, *d_weight = nullptr; double *d_weight_upwind = nullptr; @@ -478,7 +479,7 @@ struct dfMatrixDataBase checkCudaErrors(cudaMalloc((void**)&d_A_csr_diag_index, cell_index_bytes)); total_bytes += (csr_row_index_bytes + csr_col_index_bytes + cell_index_bytes); - d_Y.resize(num_species); + //d_Y.resize(num_species); d_rhoD_vector.resize(num_species); d_boundary_Y_vector.resize(num_species); d_boundary_Y_init_vector.resize(num_species); @@ -489,9 +490,10 @@ struct dfMatrixDataBase d_boundary_rhoD_vector.resize(num_species); for (size_t i = 0; i < num_species; ++i){ - checkCudaErrors(cudaMalloc((void**)&d_Y[i], cell_bytes)); + //checkCudaErrors(cudaMalloc((void**)&d_Y[i], cell_bytes)); checkCudaErrors(cudaMalloc((void**)&d_rhoD_vector[i], cell_bytes)); } + checkCudaErrors(cudaMalloc((void**)&d_Y, cell_bytes * num_species)); checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_bytes)); checkCudaErrors(cudaMalloc((void**)&d_rho_new, cell_bytes)); checkCudaErrors(cudaMalloc((void**)&d_volume, cell_bytes)); diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu/dfRhoEqn.cu index 112a376ec..7c60f3bea 100644 --- a/src_gpu/dfRhoEqn.cu +++ b/src_gpu/dfRhoEqn.cu @@ -105,17 +105,11 @@ void dfRhoEqn::initializeTimeStep() void dfRhoEqn::fvc_div(double *phi, double *boundary_phi_init) { - clock_t start = std::clock(); memcpy(dataBase_.h_phi_init, phi, num_surfaces * sizeof(double)); - clock_t end = std::clock(); - time_monitor_CPU += double(end - start) / double(CLOCKS_PER_SEC); - - start = std::clock(); checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi_init, dataBase_.h_phi_init, num_surfaces * sizeof(double), cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi_init + num_surfaces, dataBase_.d_phi_init, num_surfaces * sizeof(double), cudaMemcpyDeviceToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_phi_init, boundary_phi_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - end = std::clock(); size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu index 1e31bf0ed..85db544da 100644 --- a/src_gpu/dfYEqn.cu +++ b/src_gpu/dfYEqn.cu @@ -273,7 +273,7 @@ __global__ void calculate_phiUc_boundary(int num_boundary_faces, __global__ void fvm_ddt_kernel_scalar(int num_cells, int num_faces, const double rdelta_t, const int *csr_row_index, const int *csr_diag_index, const double *rho_old, const double *rho_new, const double *volume, const double *species_old, - const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output, double *psi) + const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_cells) @@ -289,8 +289,24 @@ __global__ void fvm_ddt_kernel_scalar(int num_cells, int num_faces, const double double ddt_part_term = rdelta_t * rho_old[index] * volume[index]; b_output[index] = b_input[index] + ddt_part_term * species_old[index]; +} + +__global__ void compute_inertIndex_y(int num_cells, int num_species, int inertIndex, double *y) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; - psi[index] = species_old[index]; + double sum_yi = 0; + for (int i = 0; i < num_species; i++) + { + if (i == inertIndex) continue; + + double yi = y[num_cells * i + index]; + sum_yi += yi > 0 ? yi : 0; + } + sum_yi = 1 - sum_yi; + y[num_cells * inertIndex + index] = (sum_yi > 0 ? sum_yi : 0); } __global__ void fvm_div_internal_scalar(int num_cells, int num_faces, @@ -528,7 +544,7 @@ dfYEqn::dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std h_A_csr = new double[(num_cells + num_faces) * (num_species - 1)]; h_b = new double[num_cells * (num_species - 1)]; - cudaMallocHost(&h_psi, num_cells * (num_species - 1) * sizeof(double)); + cudaMallocHost(&h_psi, num_cells * num_species * sizeof(double)); checkCudaErrors(cudaMalloc((void **)&d_A_csr, (num_cells + num_faces) * (num_species - 1) * sizeof(double))); checkCudaErrors(cudaMalloc((void **)&d_b, cell_bytes * (num_species - 1))); @@ -608,7 +624,7 @@ void dfYEqn::fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(std::vecto int mtxIndex = 0; for (size_t i = 0; i < num_species; ++i) { - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_Y[i], Y_old[i], cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_Y + i * num_cells, Y_old[i], cell_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_boundary_Y, boundary_Y[i], boundary_face_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_hai, hai[i], cell_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_boundary_hai, boundary_hai[i], boundary_face_bytes, cudaMemcpyHostToDevice, stream)); @@ -623,7 +639,7 @@ void dfYEqn::fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(std::vecto blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvc_grad_internal<<>>(num_cells, d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - dataBase_.d_face_vector, dataBase_.d_weight, dataBase_.d_Y[i], + dataBase_.d_face_vector, dataBase_.d_weight, dataBase_.d_Y + i * num_cells, dataBase_.d_volume, d_grady); blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; fvc_grad_boundary<<>>(num_boundary_cells, @@ -641,7 +657,7 @@ void dfYEqn::fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(std::vecto // sum(chemistry->hai(i)*Yi) blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; sumError_internal<<>>(num_cells, - d_hai, d_rhoD, dataBase_.d_Y[i], d_grady, + d_hai, d_rhoD, dataBase_.d_Y + i * num_cells, d_grady, d_sum_hai_rhoD_grady, d_sum_rhoD_grady, d_sum_hai_y); blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block; sumError_boundary<<>>(num_boundary_faces, @@ -653,7 +669,7 @@ void dfYEqn::fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(std::vecto blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvc_laplacian_internal<<>>(num_cells, d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - d_alpha, d_hai, dataBase_.d_Y[i], + d_alpha, d_hai, dataBase_.d_Y + i * num_cells, dataBase_.d_weight, dataBase_.d_face, dataBase_.d_deltaCoeffs, dataBase_.d_volume, dataBase_.d_diffAlphaD); @@ -699,10 +715,10 @@ void dfYEqn::fvm_ddt() blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvm_ddt_kernel_scalar<<>>(num_cells, num_faces, dataBase_.rdelta_t, d_A_csr_row_index, d_A_csr_diag_index, - dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, dataBase_.d_Y[i], + dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, dataBase_.d_Y + i * num_cells, d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, - d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, - d_psi + mtxIndex * num_cells); + d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells); + //d_psi + mtxIndex * num_cells); ++mtxIndex; } } @@ -843,14 +859,23 @@ void dfYEqn::solve() ++solverIndex; } } - int solverIndex = 0; - for (auto &solver : YSolverSet) + int mtxIndex = 0; + for (size_t i = 0; i < num_species; ++i) { - solver->solve(num_cells, d_psi + solverIndex * num_cells, d_b + solverIndex * num_cells); - ++solverIndex; + if (i == inertIndex) + continue; + + YSolverSet[mtxIndex]->solve(num_cells, dataBase_.d_Y + i * num_cells, d_b + mtxIndex * num_cells); + ++mtxIndex; } + + size_t threads_per_block, blocks_per_grid; + threads_per_block = 1024; + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + compute_inertIndex_y<<>>(num_cells, num_species, inertIndex, dataBase_.d_Y); + checkCudaErrors(cudaMemcpyAsync(h_psi, dataBase_.d_Y, num_species * cell_bytes, cudaMemcpyDeviceToHost, stream)); + num_iteration++; - checkCudaErrors(cudaMemcpyAsync(h_psi, d_psi, num_cells * (num_species - 1) * sizeof(double), cudaMemcpyDeviceToHost, stream)); // checkCudaErrors(cudaStreamSynchronize(stream)); // for (size_t i = 0; i < num_cells; i++) // fprintf(stderr, "h_species_gpu[%d]: %.5e\n", i, h_psi[i + 0 * num_cells]); From 192994709739bc023b563f9b5b30c044c92fddf8 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Sun, 11 Jun 2023 15:29:58 +0800 Subject: [PATCH 6/6] small refactor of time profiling --- applications/solvers/dfLowMachFoam/EEqn.H | 18 +++++++++--------- applications/solvers/dfLowMachFoam/pEqn.H | 4 ---- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H index bcab9affb..93e60e7bf 100644 --- a/applications/solvers/dfLowMachFoam/EEqn.H +++ b/applications/solvers/dfLowMachFoam/EEqn.H @@ -1,15 +1,6 @@ { volScalarField& he = thermo.he(); -#ifdef GPUSolver_ - start1 = std::clock(); - UEqn_GPU.updatePsi(&U[0][0]); - K = 0.5*magSqr(U); - end1 = std::clock(); - time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); -#endif - #ifdef CPUSolver_ start1 = std::clock(); fvScalarMatrix EEqn @@ -34,6 +25,15 @@ time_monitor_EEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); #endif +#ifdef GPUSolver_ + start1 = std::clock(); + UEqn_GPU.updatePsi(&U[0][0]); + K = 0.5*magSqr(U); + end1 = std::clock(); + time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); +#endif + #ifdef GPUSolver_ // prepare data on CPU start1 = std::clock(); diff --git a/applications/solvers/dfLowMachFoam/pEqn.H b/applications/solvers/dfLowMachFoam/pEqn.H index 913bee4fa..34925327f 100644 --- a/applications/solvers/dfLowMachFoam/pEqn.H +++ b/applications/solvers/dfLowMachFoam/pEqn.H @@ -69,13 +69,9 @@ const volScalarField psip0(psi*p); start2 = std::clock(); #ifdef GPUSolver_ - start1 = std::clock(); volScalarField rAU(1.0/UEqn_A); surfaceScalarField rhorAUf("rhorAUf", fvc::interpolate(rho*rAU)); volVectorField HbyA(constrainHbyA(rAU*UEqn_H, U, p)); - end1 = std::clock(); - time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_A += double(end1 - start1) / double(CLOCKS_PER_SEC); #else volScalarField rAU(1.0/UEqn.A()); surfaceScalarField rhorAUf("rhorAUf", fvc::interpolate(rho*rAU));