From 3ea932de2780066b1cd9ecd233e563f00a5a22ea Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Tue, 6 Jun 2023 13:13:26 +0800 Subject: [PATCH 1/2] GPU version of UEqn.H and UEqn.A --- applications/solvers/dfLowMachFoam/UEqn.H | 54 +++-- applications/solvers/dfLowMachFoam/YEqn.H | 10 +- applications/solvers/dfLowMachFoam/pEqn.H | 51 ++++- applications/solvers/dfLowMachFoam/rhoEqn.H | 22 +- src_gpu/dfUEqn.H | 11 +- src_gpu/dfUEqn.cu | 234 +++++++++++++++++++- 6 files changed, 322 insertions(+), 60 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H index 3ceca690b..5a089e0c0 100644 --- a/applications/solvers/dfLowMachFoam/UEqn.H +++ b/applications/solvers/dfLowMachFoam/UEqn.H @@ -1,33 +1,4 @@ // Solve the Momentum equation -start1 = std::clock(); -tmp tUEqn -( - fvm::ddt(rho, U) + fvm::div(phi, U) - + turbulence->divDevRhoReff(U) - == -fvc::grad(p) -); -fvVectorMatrix& UEqn = tUEqn.ref(); - -end1 = std::clock(); -#ifdef CPUSolver_ - time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); -#endif - -#ifdef CPUSolver_ - // UEqn.relax(); - start1 = std::clock(); - if (pimple.momentumPredictor()) - { - solve(UEqn); - - K = 0.5*magSqr(U); - } - end1 = std::clock(); - time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); -#endif - #ifdef GPUSolver_ start1 = std::clock(); UEqn_GPU.fvm_ddt(&U.oldTime()[0][0]); @@ -111,6 +82,31 @@ end1 = std::clock(); // K = 0.5*magSqr(U); // } // UEqn_GPU.checkValue(false); +#else + start1 = std::clock(); + tmp tUEqn + ( + fvm::ddt(rho, U) + fvm::div(phi, U) + + turbulence->divDevRhoReff(U) + == -fvc::grad(p) + ); + fvVectorMatrix& UEqn = tUEqn.ref(); + + end1 = std::clock(); + time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_mtxAssembly += double(end1 - 
start1) / double(CLOCKS_PER_SEC); + + // UEqn.relax(); + start1 = std::clock(); + if (pimple.momentumPredictor()) + { + solve(UEqn); + + K = 0.5*magSqr(U); + } + end1 = std::clock(); + time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); #endif // start1 = std::clock(); diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H index 5e44d9b3f..eeaea40e7 100644 --- a/applications/solvers/dfLowMachFoam/YEqn.H +++ b/applications/solvers/dfLowMachFoam/YEqn.H @@ -107,7 +107,10 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC); if (i != inertIndex) { - #ifdef CPUSolver_ + #ifdef GPUSolver_ + YEqn_GPU.updatePsi(&Yi[0], speciesIndex); + Yi.correctBoundaryConditions(); + #else tmp DEff = chemistry->rhoD(i) + turbulence->mut()/Sct; fvScalarMatrix YiEqn ( @@ -127,11 +130,6 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC); YiEqn.solve("Yi"); #endif - #ifdef GPUSolver_ - YEqn_GPU.updatePsi(&Yi[0], speciesIndex); - Yi.correctBoundaryConditions(); - #endif - Yi.max(0.0); Yt += Yi; ++speciesIndex; diff --git a/applications/solvers/dfLowMachFoam/pEqn.H b/applications/solvers/dfLowMachFoam/pEqn.H index ea653d261..f8e7763be 100644 --- a/applications/solvers/dfLowMachFoam/pEqn.H +++ b/applications/solvers/dfLowMachFoam/pEqn.H @@ -7,9 +7,53 @@ if (!pimple.simpleRho()) // pressure solution const volScalarField psip0(psi*p); -volScalarField rAU(1.0/UEqn.A()); -surfaceScalarField rhorAUf("rhorAUf", fvc::interpolate(rho*rAU)); -volVectorField HbyA(constrainHbyA(rAU*UEqn.H(), U, p)); +#ifdef GPUSolver_ + // UEqn.H() + volVectorField UEqn_H + ( + IOobject + ( + "H("+U.name()+')', + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedVector(dimensionSet(1,-2,-2,0,0,0,0), Zero), + extrapolatedCalculatedFvPatchScalarField::typeName + ); + UEqn_GPU.H(&UEqn_H[0][0]); + 
UEqn_H.correctBoundaryConditions(); + + // UEqn.A() + volScalarField UEqn_A + ( + IOobject + ( + "A("+U.name()+')', + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedScalar(dimensionSet(1,-3,-1,0,0,0,0), Zero), + extrapolatedCalculatedFvPatchScalarField::typeName + ); + UEqn_GPU.A(&UEqn_A[0]); + UEqn_A.correctBoundaryConditions(); +#endif + +#ifdef GPUSolver_ + volScalarField rAU(1.0/UEqn_A); + surfaceScalarField rhorAUf("rhorAUf", fvc::interpolate(rho*rAU)); + volVectorField HbyA(constrainHbyA(rAU*UEqn_H, U, p)); +#else + volScalarField rAU(1.0/UEqn.A()); + surfaceScalarField rhorAUf("rhorAUf", fvc::interpolate(rho*rAU)); + volVectorField HbyA(constrainHbyA(rAU*UEqn.H(), U, p)); +#endif if (pimple.nCorrPiso() <= 1) { @@ -99,6 +143,9 @@ p.relax(); U = HbyA - rAU*fvc::grad(p); U.correctBoundaryConditions(); K = 0.5*magSqr(U); +#ifdef GPUSolver_ +UEqn_GPU.correctPsi(&U[0][0]); +#endif if (pimple.simpleRho()) { diff --git a/applications/solvers/dfLowMachFoam/rhoEqn.H b/applications/solvers/dfLowMachFoam/rhoEqn.H index e49ef9e5b..a7d5552fe 100644 --- a/applications/solvers/dfLowMachFoam/rhoEqn.H +++ b/applications/solvers/dfLowMachFoam/rhoEqn.H @@ -28,18 +28,6 @@ Description Solve the continuity for density. 
\*---------------------------------------------------------------------------*/ - -#ifdef CPUSolver_ -{ - fvScalarMatrix rhoEqn - ( - fvm::ddt(rho) - + fvc::div(phi) - ); - - rhoEqn.solve(); -} -#endif #ifdef GPUSolver_ { rho.oldTime(); @@ -57,6 +45,16 @@ Description rhoEqn_GPU.updatePsi(&rho.primitiveFieldRef()[0]); rho.correctBoundaryConditions(); } +#else +{ + fvScalarMatrix rhoEqn + ( + fvm::ddt(rho) + + fvc::div(phi) + ); + + rhoEqn.solve(); +} #endif // ************************************************************************* // diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index fb79a1dcf..8c6ed9c1e 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -19,8 +19,8 @@ private: int *d_A_csr_row_index, *d_A_csr_diag_index, *d_A_csr_col_index; // Matrix variables - double *d_A_csr, *d_b, *d_psi = nullptr; - double *h_A_csr, *h_b, *h_psi = nullptr; + double *d_A_csr, *d_b, *d_psi, *d_H, *d_A, *d_ueqn_internal_coeffs, *d_ueqn_boundary_coeffs= nullptr; + double *h_A_csr, *h_b, *h_psi, *h_H, *h_A = nullptr; public: dfUEqn(); @@ -48,6 +48,13 @@ public: void add_fvMatrix(double *turbSrc_low, double *turbSrc_diag, double *turbSrc_upp, double *turbSrc_source); + void A(double *Psi); + + void H(double *Psi); + void solve(); + void updatePsi(double *Psi); + + void correctPsi(double *Psi); }; \ No newline at end of file diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 4afebd8c1..eb2c45c9a 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -86,7 +86,8 @@ __global__ void fvm_div_boundary(int num_cells, int num_faces, int num_boundary_ const int *csr_row_index, const int *csr_diag_index, const int *boundary_cell_offset, const int *boundary_cell_id, const double *internal_coeffs, const double *boundary_coeffs, - const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output) + const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output, + double *ueqn_internal_coeffs, double *ueqn_boundary_coeffs) { int 
index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_boundary_cells) @@ -120,6 +121,13 @@ __global__ void fvm_div_boundary(int num_cells, int num_faces, int num_boundary_ boundary_coeffs_y += boundary_coeffs[(cell_offset + i) * 3 + 1]; boundary_coeffs_z += boundary_coeffs[(cell_offset + i) * 3 + 2]; } + ueqn_internal_coeffs[cell_index * 3 + 0] = internal_coeffs_x; + ueqn_internal_coeffs[cell_index * 3 + 1] = internal_coeffs_y; + ueqn_internal_coeffs[cell_index * 3 + 2] = internal_coeffs_z; + ueqn_boundary_coeffs[cell_index * 3 + 0] = boundary_coeffs_x; + ueqn_boundary_coeffs[cell_index * 3 + 1] = boundary_coeffs_y; + ueqn_boundary_coeffs[cell_index * 3 + 2] = boundary_coeffs_z; + A_csr_output[csr_dim * 0 + csr_index] = A_csr_input[csr_dim * 0 + csr_index] + internal_coeffs_x; A_csr_output[csr_dim * 1 + csr_index] = A_csr_input[csr_dim * 1 + csr_index] + internal_coeffs_y; A_csr_output[csr_dim * 2 + csr_index] = A_csr_input[csr_dim * 2 + csr_index] + internal_coeffs_z; @@ -799,7 +807,8 @@ __global__ void fvm_laplacian_uncorrected_vector_boundary(int num_cells, int num const int *boundary_cell_offset, const int *boundary_cell_id, const double *boundary_scalar0, const double *boundary_scalar1, const double *boundary_magsf, const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, - const double sign, const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output) + const double sign, const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output, + double *ueqn_internal_coeffs, double *ueqn_boundary_coeffs) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_boundary_cells) @@ -846,6 +855,13 @@ __global__ void fvm_laplacian_uncorrected_vector_boundary(int num_cells, int num boundary_coeffs_z += gamma_magsf * gradient_boundary_coeffs[i * 3 + 2]; } + ueqn_internal_coeffs[cell_index * 3 + 0] += internal_coeffs_x * sign; + ueqn_internal_coeffs[cell_index * 3 + 1] += 
internal_coeffs_y * sign; + ueqn_internal_coeffs[cell_index * 3 + 2] += internal_coeffs_z * sign; + ueqn_boundary_coeffs[cell_index * 3 + 0] += boundary_coeffs_x * sign; + ueqn_boundary_coeffs[cell_index * 3 + 1] += boundary_coeffs_y * sign; + ueqn_boundary_coeffs[cell_index * 3 + 2] += boundary_coeffs_z * sign; + A_csr_output[csr_dim * 0 + csr_index] = A_csr_input[csr_dim * 0 + csr_index] + internal_coeffs_x * sign; A_csr_output[csr_dim * 1 + csr_index] = A_csr_input[csr_dim * 1 + csr_index] + internal_coeffs_y * sign; A_csr_output[csr_dim * 2 + csr_index] = A_csr_input[csr_dim * 2 + csr_index] + internal_coeffs_z * sign; @@ -854,6 +870,138 @@ __global__ void fvm_laplacian_uncorrected_vector_boundary(int num_cells, int num b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + boundary_coeffs_z * sign; } +__global__ void addBoundaryDiag(int num_cells, int num_boundary_cells, + const int *csr_row_index, const int *csr_diag_index, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *ueqn_internal_coeffs, const double *ueqn_boundary_coeffs, + const double *psi, double *H) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + // addBoundaryDiag(boundaryDiagCmpt, cmpt); // add internal coeffs + // boundaryDiagCmpt.negate(); + double internal_x = ueqn_internal_coeffs[cell_index * 3 + 0]; + double internal_y = ueqn_internal_coeffs[cell_index * 3 + 1]; + double internal_z = ueqn_internal_coeffs[cell_index * 3 + 2]; + + // addCmptAvBoundaryDiag(boundaryDiagCmpt); + double ave_internal = (internal_x + internal_y + internal_z) / 3; + + H[num_cells * 0 + cell_index] = (-internal_x + ave_internal) * psi[num_cells * 0 + cell_index]; + H[num_cells * 1 + cell_index] = (-internal_y + ave_internal) * psi[num_cells * 1 + 
cell_index]; + H[num_cells * 2 + cell_index] = (-internal_z + ave_internal) * psi[num_cells * 2 + cell_index]; +} + +__global__ void lduMatrix_H(int num_cells, + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *volume, const double *psi, const double *A_csr, const double *b, + const double *ueqn_boundary_coeffs, double *H) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int row_elements = csr_row_index[index + 1] - row_index; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double APsi_x = 0.; + double APsi_y = 0.; + double APsi_z = 0.; + // lower + for (int i = 0; i < diag_index; i++) + { + int neighbor_cell_id = csr_col_index[i + row_index]; + APsi_x += A_csr[row_index + i] * psi[num_cells * 0 + neighbor_cell_id]; + APsi_y += A_csr[row_index + i] * psi[num_cells * 1 + neighbor_cell_id]; + APsi_z += A_csr[row_index + i] * psi[num_cells * 2 + neighbor_cell_id]; + } + // upper + for (int i = diag_index + 1; i < row_elements; i++) + { + int neighbor_cell_id = csr_col_index[i + row_index]; + APsi_x += A_csr[row_index + i] * psi[num_cells * 0 + neighbor_cell_id]; + APsi_y += A_csr[row_index + i] * psi[num_cells * 1 + neighbor_cell_id]; + APsi_z += A_csr[row_index + i] * psi[num_cells * 2 + neighbor_cell_id]; + } + + H[num_cells * 0 + index] = H[num_cells * 0 + index] - APsi_x + b[num_cells * 0 + index]; + H[num_cells * 1 + index] = H[num_cells * 1 + index] - APsi_y + b[num_cells * 1 + index]; + H[num_cells * 2 + index] = H[num_cells * 2 + index] - APsi_z + b[num_cells * 2 + index]; + + double vol = volume[index]; + H[num_cells * 0 + index] = H[num_cells * 0 + index] / vol; + H[num_cells * 1 + index] = H[num_cells * 1 + index] / vol; + H[num_cells * 2 + index] = H[num_cells * 2 + index] / vol; +} + +__global__ void addBoundarySource(int 
num_cells, int num_boundary_cells, + const int *csr_row_index, const int *csr_diag_index, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *ueqn_internal_coeffs, const double *ueqn_boundary_coeffs, + const double *volume, double *H) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int cell_index = boundary_cell_id[cell_offset]; + + double vol = volume[cell_index]; + + H[num_cells * 0 + cell_index] = H[num_cells * 0 + cell_index] + ueqn_boundary_coeffs[cell_index * 3 + 0] / vol; + H[num_cells * 1 + cell_index] = H[num_cells * 1 + cell_index] + ueqn_boundary_coeffs[cell_index * 3 + 1] / vol; + H[num_cells * 2 + cell_index] = H[num_cells * 2 + cell_index] + ueqn_boundary_coeffs[cell_index * 3 + 2] / vol; +} + +__global__ void addAveInternaltoDiag(int num_cells, int num_boundary_cells, + const int *csr_row_index, const int *csr_diag_index, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *ueqn_internal_coeffs, const double *ueqn_boundary_coeffs, double *A) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + double internal_x = ueqn_internal_coeffs[cell_index * 3 + 0]; + double internal_y = ueqn_internal_coeffs[cell_index * 3 + 1]; + double internal_z = ueqn_internal_coeffs[cell_index * 3 + 2]; + + double ave_internal = (internal_x + internal_y + internal_z) / 3; + + A[cell_index] = ave_internal; +} + +__global__ void addDiagDivVolume(int num_cells, const int *csr_row_index, + const int *csr_diag_index, const double *A_csr, const double *volume, + double *ueqn_internal_coeffs, const double *A_input, double *A_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + int row_index = 
csr_row_index[index]; + int diag_index = csr_diag_index[index]; + int csr_index = row_index + diag_index; + + double vol = volume[index]; + + A_output[index] = (A_input[index] + A_csr[csr_index] - ueqn_internal_coeffs[index * 3]) / vol; +} + // constructor dfUEqn::dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile) : dataBase_(dataBase) @@ -877,10 +1025,16 @@ dfUEqn::dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std h_A_csr = new double[(num_cells + num_faces) * 3]; h_b = new double[num_cells * 3]; cudaMallocHost(&h_psi, cell_vec_bytes); + cudaMallocHost(&h_H, cell_vec_bytes); + cudaMallocHost(&h_A, cell_bytes); checkCudaErrors(cudaMalloc((void **)&d_A_csr, csr_value_vec_bytes)); checkCudaErrors(cudaMalloc((void **)&d_b, cell_vec_bytes)); checkCudaErrors(cudaMalloc((void **)&d_psi, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_H, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_A, cell_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_ueqn_internal_coeffs, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_ueqn_boundary_coeffs, cell_vec_bytes)); checkCudaErrors(cudaStreamCreate(&stream)); } @@ -964,7 +1118,8 @@ void dfUEqn::fvm_div(double *phi, double *ueqn_internalCoeffs_init, double *ueqn fvm_div_boundary<<>>(num_cells, num_faces, num_boundary_cells, d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs, d_A_csr, d_b, d_A_csr, d_b); + dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs, d_A_csr, d_b, d_A_csr, d_b, + d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs); end = std::clock(); time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } @@ -1067,11 +1222,62 @@ void dfUEqn::fvm_laplacian() fvm_laplacian_uncorrected_vector_boundary<<>>(num_cells, num_faces, num_boundary_cells, d_A_csr_row_index, d_A_csr_diag_index, 
dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_face, dataBase_.d_laplac_internal_coeffs, - dataBase_.d_laplac_boundary_coeffs, -1., d_A_csr, d_b, d_A_csr, d_b); + dataBase_.d_laplac_boundary_coeffs, -1., d_A_csr, d_b, d_A_csr, d_b, d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs); clock_t end = std::clock(); time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } +void dfUEqn::A(double *Psi) +{ + // tmp tdiag(new scalarField(diag())); + // addCmptAvBoundaryDiag(tdiag.ref()); + checkCudaErrors(cudaMemsetAsync(d_A, 0, cell_bytes, stream)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + addAveInternaltoDiag<<>>(num_cells, num_boundary_cells, d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs, d_A); + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + addDiagDivVolume<<>>(num_cells, d_A_csr_row_index, d_A_csr_diag_index, d_A_csr, + dataBase_.d_volume, d_ueqn_internal_coeffs, d_A, d_A); + + checkCudaErrors(cudaMemcpyAsync(h_A, d_A, cell_bytes, cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + for (size_t i = 0; i < num_cells; i++) + Psi[i] = h_A[i]; +} + +void dfUEqn::H(double *Psi) +{ + checkCudaErrors(cudaMemsetAsync(d_H, 0, cell_vec_bytes, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + addBoundaryDiag<<>>(num_cells, num_boundary_cells, d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs, + d_psi, d_H); + + blocks_per_grid = (num_cells + threads_per_block - 1) / 
threads_per_block; + lduMatrix_H<<>>(num_cells, d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + dataBase_.d_volume, d_psi, d_A_csr, d_b, d_ueqn_boundary_coeffs, d_H); + + checkCudaErrors(cudaMemcpyAsync(h_H, d_H, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + + for (size_t i = 0; i < num_cells; i++) + { + Psi[i * 3] = h_H[i]; + Psi[i * 3 + 1] = h_H[num_cells + i]; + Psi[i * 3 + 2] = h_H[num_cells * 2 + i]; + } + + // for (int i = 0; i < num_cells; i++) + // fprintf(stderr, "h_H_GPU[%d]: (%.5e, %.5e, %.5e)\n", i, h_H[i], h_H[num_cells + i], h_H[num_cells * 2 + i]); +} + void dfUEqn::checkValue(bool print) { checkCudaErrors(cudaMemcpyAsync(h_A_csr, d_A_csr, csr_value_vec_bytes, cudaMemcpyDeviceToHost, stream)); @@ -1082,9 +1288,9 @@ void dfUEqn::checkValue(bool print) if (print) { for (int i = 0; i < (num_faces + num_cells); i++) - fprintf(stderr, "h_A_csr[%d]: %.15lf\n", i, h_A_csr[i]); + fprintf(stderr, "h_A_csr[%d]: %.5e\n", i, h_A_csr[i]); for (int i = 0; i < num_cells * 3; i++) - fprintf(stderr, "h_b[%d]: %.15lf\n", i, h_b[i]); + fprintf(stderr, "h_b[%d]: %.5e\n", i, h_b[i]); } char *input_file = "of_output.txt"; @@ -1136,9 +1342,9 @@ void dfUEqn::checkValue(bool print) if (print) { for (int i = 0; i < (num_faces + num_cells); i++) - fprintf(stderr, "h_A_of_vec_1mtx[%d]: %.15lf\n", i, h_A_of_vec_1mtx[i]); + fprintf(stderr, "h_A_of_vec_1mtx[%d]: %.5e\n", i, h_A_of_vec_1mtx[i]); for (int i = 0; i < 3 * num_cells; i++) - fprintf(stderr, "h_b_of_vec[%d]: %.15lf\n", i, h_b_of_vec[i]); + fprintf(stderr, "h_b_of_vec[%d]: %.5e\n", i, h_b_of_vec[i]); } // check @@ -1204,6 +1410,18 @@ void dfUEqn::updatePsi(double *Psi) } } +// correct volecity in pEqn +void dfUEqn::correctPsi(double *Psi) +{ + for (size_t i = 0; i < num_cells; i++) + { + h_psi[i] = Psi[i * 3]; + h_psi[num_cells + i] = Psi[i * 3 + 1]; + h_psi[num_cells * 2 + i] = Psi[i * 3 + 2]; + } + checkCudaErrors(cudaMemcpyAsync(d_psi, h_psi, 
cell_vec_bytes, cudaMemcpyHostToDevice, stream)); +} + dfUEqn::~dfUEqn() { } \ No newline at end of file From 7775dd97da9351160e87e088de8916e098764eec Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Tue, 6 Jun 2023 13:15:05 +0800 Subject: [PATCH 2/2] upwind scheme --- .../H2/cvodeIntegrator_gpu/system/fvSchemes | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/dfLowMachFoam/threeD_reactingTGV/H2/cvodeIntegrator_gpu/system/fvSchemes b/examples/dfLowMachFoam/threeD_reactingTGV/H2/cvodeIntegrator_gpu/system/fvSchemes index c5c9136e5..e2c0058cd 100644 --- a/examples/dfLowMachFoam/threeD_reactingTGV/H2/cvodeIntegrator_gpu/system/fvSchemes +++ b/examples/dfLowMachFoam/threeD_reactingTGV/H2/cvodeIntegrator_gpu/system/fvSchemes @@ -30,15 +30,15 @@ divSchemes default none; div(phi,U) Gauss linear; - div(phi,Yi) Gauss limitedLinear01 1; - div(phi,h) Gauss limitedLinear 1; - div(phi,ha) Gauss limitedLinear 1; - div(phi,K) Gauss limitedLinear 1; - div(phid,p) Gauss limitedLinear 1; - div(phi,epsilon) Gauss limitedLinear 1; - div(phi,Yi_h) Gauss limitedLinear01 1; - div(phi,k) Gauss limitedLinear 1; - div(hDiffCorrFlux) Gauss cubic; + div(phi,Yi) Gauss linear; + div(phi,h) Gauss linear; + div(phi,ha) Gauss linear; + div(phi,K) Gauss linear; + div(phid,p) Gauss linear; + div(phi,epsilon) Gauss linear; + div(phi,Yi_h) Gauss upwind; + div(phi,k) Gauss linear; + div(hDiffCorrFlux) Gauss linear; div(((rho*nuEff)*dev2(T(grad(U))))) Gauss linear; }