From a3e1a1d81bbd35d6d2f5ecd3eb5ec150dfcc6416 Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Fri, 2 Jun 2023 11:52:09 +0800 Subject: [PATCH 1/2] preliminary GPU version of YEqn --- applications/solvers/dfLowMachFoam/YEqn.H | 124 ++- .../solvers/dfLowMachFoam/createdfSolver.H | 5 +- .../solvers/dfLowMachFoam/dfLowMachFoam.C | 2 + src_gpu/CMakeLists.txt | 9 +- src_gpu/dfMatrixDataBase.H | 52 +- src_gpu/dfYEqn.H | 54 ++ src_gpu/dfYEqn.cu | 831 ++++++++++++++++++ 7 files changed, 1052 insertions(+), 25 deletions(-) create mode 100644 src_gpu/dfYEqn.H create mode 100644 src_gpu/dfYEqn.cu diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H index 94550fcdc..e5cbd15d7 100644 --- a/applications/solvers/dfLowMachFoam/YEqn.H +++ b/applications/solvers/dfLowMachFoam/YEqn.H @@ -15,12 +15,49 @@ tmp> mvConvection ) ); +upwind mvConvectionInterpolation(mesh, phi); +tmp t_upwind_weight = mvConvectionInterpolation.weights(); +surfaceScalarField upwind_weight = t_upwind_weight.ref(); + +volVectorField sumYGrad +( + IOobject + ( + "sumYGrad", + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedVector(dimensionSet(0,-1,0,0,0,0,0), Zero) +); + forAll(Y, i) { sumYDiffError += chemistry->rhoD(i)*fvc::grad(Y[i]); } const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf(); +// for (size_t i = 0; i < num_cells; i++) +// { +// printf("sumYDiffError[%d] = (%e, %e, %e)\n", i, sumYDiffError[i][0], sumYDiffError[i][1], +// sumYDiffError[i][2]); +// // printf("fvc::grad[%d] = (%e, %e, %e)\n", i, test_grad[i][0], test_grad[i][1], +// // test_grad[i][2]); +// } + +const surfaceVectorField test_interpolate = linearInterpolate(sumYDiffError).ref(); + +// for (size_t i = 0; i < num_surfaces; i++) +// { +// // printf("test_interpolate[%d] = (%e, %e, %e)\n", i, test_interpolate[i][0], test_interpolate[i][1], +// // test_interpolate[i][2]); +// // printf("surface vector[%d] = (%e, %e, %e)\n", i, mesh.Sf()[i][0], mesh.Sf()[i][1], +// // mesh.Sf()[i][2]); +// printf("phiUcOf[%d] = %e\n", i, phiUc[i]); +// } + #ifdef GPUSolver_ start1 = std::clock(); // // std::thread t(&dfMatrix::solve, &UEqn_GPU); @@ -30,6 +67,54 @@ const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf(); time_monitor_UEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); #endif +std::vector Y_new(Y.size()), Y_old(Y.size()), boundary_Y_init(Y.size()), + boundary_rhoD_init(Y.size()); +std::vector rhoD_GPU(Y.size()); +for (size_t i = 0; i < Y.size(); ++i) +{ + volScalarField& Yi = Y[i]; + const volScalarField& rhoDi = chemistry->rhoD(i); + Y_new[i] = &Yi[0]; + Y_old[i] = &Yi.oldTime()[0]; + rhoD_GPU[i] = &chemistry->rhoD(i)[0]; + cudaMallocHost(&boundary_Y_init[i], num_boundary_faces*sizeof(double)); + cudaMallocHost(&boundary_rhoD_init[i], num_boundary_faces*sizeof(double)); + int offset = 0; + forAll(Yi.boundaryField(), patchi) + { + const scalarField& patchYi = Yi.boundaryField()[patchi]; + const scalarField& patchRhoDi = rhoDi.boundaryField()[patchi]; + int patchSize = patchYi.size(); + + memcpy(boundary_Y_init[i]+offset, &patchYi[0], patchSize*sizeof(double)); + memcpy(boundary_rhoD_init[i]+offset, &patchRhoDi[0], patchSize*sizeof(double)); + offset += patchSize; + } +} + +volScalarField mut_sct = turbulence->mut().ref()/Sct; +std::vector boundary_sumYDiffError, boundary_phiUc, boundary_mutsct; +forAll(p.boundaryField(), patchi) +{ + const vectorField& patchErri = sumYDiffError.boundaryField()[patchi]; + const scalarField& patchPhiUc = phiUc.boundaryField()[patchi]; + const scalarField& patchMut_sct = mut_sct.boundaryField()[patchi]; + + int patchSize = patchErri.size(); + boundary_sumYDiffError.insert(boundary_sumYDiffError.end(), &patchErri[0][0], &patchErri[0][0]+3*patchSize); + boundary_phiUc.insert(boundary_phiUc.end(), &patchPhiUc[0], &patchPhiUc[0] + patchSize); + boundary_mutsct.insert(boundary_mutsct.end(), &patchMut_sct[0], &patchMut_sct[0] + patchSize); +} + +YEqn_GPU.upwindWeight(&upwind_weight[0]); +YEqn_GPU.correctVelocity(Y_new, boundary_Y_init, rhoD_GPU, &phiUc[0], boundary_sumYDiffError.data(), boundary_phiUc.data()); +YEqn_GPU.fvm_ddt(Y_old); +YEqn_GPU.fvm_div_phi(); +YEqn_GPU.fvm_div_phiUc(); +YEqn_GPU.fvm_laplacian(&mut_sct[0], boundary_mutsct.data(), boundary_rhoD_init); + +YEqn_GPU.solve(); + //MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); label flag_mpi_init; MPI_Initialized(&flag_mpi_init); @@ -53,6 +138,7 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC); volScalarField Yt(0.0*Y[0]); start = std::clock(); + int speciesIndex = 0; forAll(Y, i) { volScalarField& Yi = Y[i]; @@ -61,26 +147,30 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC); if (i != inertIndex) { - tmp DEff = chemistry->rhoD(i) + turbulence->mut()/Sct; - fvScalarMatrix YiEqn - ( - fvm::ddt(rho, Yi) - + mvConvection->fvmDiv(phi, Yi) - + mvConvection->fvmDiv(phiUc, Yi) - == - ( - splitting - ? fvm::laplacian(DEff(), Yi) - : (fvm::laplacian(DEff(), Yi) + combustion->R(Yi)) - ) - ); - - YiEqn.relax(); - - YiEqn.solve("Yi"); + // tmp DEff = chemistry->rhoD(i) + turbulence->mut()/Sct; + // fvScalarMatrix YiEqn + // ( + // fvm::ddt(rho, Yi) + // + mvConvection->fvmDiv(phi, Yi) + // + mvConvection->fvmDiv(phiUc, Yi) + // == + // ( + // splitting + // ? fvm::laplacian(DEff(), Yi) + // : (fvm::laplacian(DEff(), Yi) + combustion->R(Yi)) + // ) + // ); + + // YiEqn.relax(); + + // YiEqn.solve("Yi"); + + YEqn_GPU.updatePsi(&Yi[0], speciesIndex); + Yi.correctBoundaryConditions(); Yi.max(0.0); Yt += Yi; + ++speciesIndex; } } diff --git a/applications/solvers/dfLowMachFoam/createdfSolver.H b/applications/solvers/dfLowMachFoam/createdfSolver.H index ed87aceb5..b60307463 100644 --- a/applications/solvers/dfLowMachFoam/createdfSolver.H +++ b/applications/solvers/dfLowMachFoam/createdfSolver.H @@ -28,11 +28,12 @@ int num_boundary_cells; string settingPath; settingPath = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string("")); -dfMatrixDataBase dfDataBase(num_surfaces, num_cells, num_boundary_faces, num_boundary_cells, &neighbour[0], &owner[0], &mesh.V()[0], &mesh.surfaceInterpolation::weights()[0], +dfMatrixDataBase dfDataBase(num_surfaces, num_cells, num_boundary_faces, Y.size(), num_boundary_cells, &neighbour[0], &owner[0], &mesh.V()[0], &mesh.surfaceInterpolation::weights()[0], &mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.nonOrthDeltaCoeffs()[0], boundary_face_vector_init, boundary_face_init, boundary_deltaCoeffs_init, boundaryCellIndex); -dfUEqn UEqn_GPU(dfDataBase, "dDDI", settingPath); dfRhoEqn rhoEqn_GPU(dfDataBase); +dfUEqn UEqn_GPU(dfDataBase, "dDDI", settingPath); +dfYEqn YEqn_GPU(dfDataBase, "dDDI", settingPath, inertIndex); double *ueqn_internalCoeffs_init, *ueqn_boundaryCoeffs_init, *boundary_pressure_init, *boundary_velocity_init, *boundary_nuEff_init, *boundary_rho_init, *ueqn_laplac_internalCoeffs_init, *ueqn_laplac_boundaryCoeffs_init, *boundary_phi_init; diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C index 8012df850..d875b607d 100644 --- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C +++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C @@ -60,9 +60,11 @@ Description #include "CombustionModel.H" #include "dfUEqn.H" +#include "dfYEqn.H" #include "dfRhoEqn.H" #include #include +#include "upwind.H" #define GPUSolver_ diff --git a/src_gpu/CMakeLists.txt b/src_gpu/CMakeLists.txt index 2c17c56ed..c749eaa42 100644 --- a/src_gpu/CMakeLists.txt +++ b/src_gpu/CMakeLists.txt @@ -18,14 +18,19 @@ include_directories( $ENV{AMGX_DIR}/include ) -add_library(${PROJECT_NAME} SHARED dfUEqn.cu dfRhoEqn.cu AmgXSolver.cu) +add_library(${PROJECT_NAME} + SHARED + dfUEqn.cu + dfRhoEqn.cu + dfYEqn.cu + AmgXSolver.cu) target_link_libraries(${PROJECT_NAME} ${MPI_LIBRARIES} ${CUDA_LIBRARIES} ${LIBAMGXSH} ) - +target_compile_options(dfMatrix PUBLIC -g) option(DFMATRIX_ENABLE_DETAILED_DEBUG "Enable detailed debug build" OFF) if (DFMATRIX_ENABLE_DETAILED_DEBUG) target_compile_definitions(${PROJECT_NAME} PRIVATE DEBUG) diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 971fdc6bb..2fa79b13f 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -34,7 +34,8 @@ inline void checkVectorEqual(int count, const double* basevec, double* vec, doub { double abs_diff = fabs(basevec[i] - vec[i]); double rel_diff = fabs(basevec[i] - vec[i]) / fabs(basevec[i]); - if (abs_diff > 1e-12 && rel_diff > max_relative_error && !std::isinf(rel_diff)) + // if (abs_diff > 1e-12 && rel_diff > max_relative_error && !std::isinf(rel_diff)) + if (abs_diff > 1e-15 && rel_diff > max_relative_error) fprintf(stderr, "mismatch index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff); } } @@ -53,6 +54,8 @@ struct dfMatrixDataBase // - number of boundary faces int num_boundary_faces; + int num_species; + // - mesh variables // - csr_row_index int *h_A_csr_row_index=nullptr, *d_A_csr_row_index=nullptr; @@ -84,14 +87,19 @@ struct dfMatrixDataBase // - the device pointer to rho_new, rho_old, velocity_old, pressure and volume list double *d_rho_new = nullptr, *d_rho_old = nullptr, *d_velocity_old = nullptr, *d_pressure = nullptr, *d_volume = nullptr; + // - the device pointer to Y_new and Y_old + std::vector d_Y_new_vector; + std::vector d_Y_old_vector; // - the device pointer to the pre-permutated and post-permutated interpolation weight list double *d_weight_init = nullptr, *d_weight = nullptr; + double *d_weight_upwind = nullptr; // - the device pointer to the pre-permutated and post-permutated flux (phi) list double *d_phi_init = nullptr, *d_phi = nullptr; // - the device pointer to the pre-permutated and post-permutated cell face vector list double *d_face_vector_init = nullptr, *d_face_vector = nullptr; double *d_face_init = nullptr, *d_face = nullptr; double *d_deltaCoeffs_init = nullptr, *d_deltaCoeffs = nullptr; + std::vector d_rhoD_vector; double rdelta_t = 1/1e-6; @@ -114,6 +122,14 @@ struct dfMatrixDataBase *d_boundary_velocity = nullptr, *d_boundary_velocity_init = nullptr, *d_boundary_nuEff = nullptr, *d_boundary_nuEff_init = nullptr, *d_boundary_rho = nullptr, *d_boundary_rho_init = nullptr; + std::vector d_boundary_Y_vector; + std::vector d_boundary_Y_init_vector; + std::vector d_internal_coeffs_Y_vector; + std::vector d_boundary_coeffs_Y_vector; + std::vector d_laplac_internal_coeffs_Y_vector; + std::vector d_laplac_boundary_coeffs_Y_vector; + std::vector d_boundary_rhoD_vector; + double *d_boundary_mut_sct = nullptr; std::vector boundPermutationList; std::vector ueqn_internalCoeffs, ueqn_boundaryCoeffs; @@ -180,11 +196,11 @@ struct dfMatrixDataBase // constructor dfMatrixDataBase(); - dfMatrixDataBase(int num_surfaces, int num_cells, int num_boundary_faces, int & num_boundary_cells_output, + dfMatrixDataBase(int num_surfaces, int num_cells, int num_boundary_faces, int num_species, int & num_boundary_cells_output, const int *neighbour, const int *owner, const double* volume, const double* weight, const double* face_vector, const double* face, const double* deltaCoeffs, std::vector boundary_face_vector_init, std::vector boundary_face_init, std::vector boundary_deltaCoeffs_init, std::vector boundary_cell_id_init) - : num_cells(num_cells), num_faces(num_surfaces*2), num_surfaces(num_surfaces), num_iteration(0), + : num_cells(num_cells), num_faces(num_surfaces*2), num_surfaces(num_surfaces), num_species(num_species), num_iteration(0), num_boundary_faces(num_boundary_faces), h_volume(volume) { // allocate field pointer in pin memory @@ -457,19 +473,36 @@ struct dfMatrixDataBase checkCudaErrors(cudaMalloc((void**)&d_A_csr_diag_index, cell_index_bytes)); total_bytes += (csr_row_index_bytes + csr_col_index_bytes + cell_index_bytes); + d_Y_new_vector.resize(num_species); + d_Y_old_vector.resize(num_species); + d_rhoD_vector.resize(num_species); + d_boundary_Y_vector.resize(num_species); + d_boundary_Y_init_vector.resize(num_species); + d_internal_coeffs_Y_vector.resize(num_species); + d_boundary_coeffs_Y_vector.resize(num_species); + d_laplac_internal_coeffs_Y_vector.resize(num_species); + d_laplac_boundary_coeffs_Y_vector.resize(num_species); + d_boundary_rhoD_vector.resize(num_species); + + for (size_t i = 0; i < num_species; ++i){ + checkCudaErrors(cudaMalloc((void**)&d_Y_new_vector[i], cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_Y_old_vector[i], cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_rhoD_vector[i], cell_bytes)); + } checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_bytes)); checkCudaErrors(cudaMalloc((void**)&d_rho_new, cell_bytes)); checkCudaErrors(cudaMalloc((void**)&d_volume, cell_bytes)); checkCudaErrors(cudaMalloc((void**)&d_pressure, cell_bytes)); checkCudaErrors(cudaMalloc((void**)&d_velocity_old, cell_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_weight, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_weight_upwind, face_bytes)); checkCudaErrors(cudaMalloc((void**)&d_face, face_bytes)); checkCudaErrors(cudaMalloc((void**)&d_deltaCoeffs, face_bytes)); checkCudaErrors(cudaMalloc((void**)&d_phi, face_bytes)); checkCudaErrors(cudaMalloc((void**)&d_phi_init, face_bytes)); checkCudaErrors(cudaMalloc((void**)&d_face_vector, face_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_nuEff, cell_bytes)); - total_bytes += (cell_bytes * 5 + face_bytes * 5 + cell_vec_bytes + face_vec_bytes); + total_bytes += (cell_bytes * (5 + 2*num_species) + face_bytes * 6 + cell_vec_bytes + face_vec_bytes); checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int))); checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_id, boundary_face_index_bytes)); @@ -496,6 +529,17 @@ struct dfMatrixDataBase checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff_init, boundary_face_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_rho, boundary_face_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_init, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_mut_sct, boundary_face_bytes)); + for (size_t i = 0; i < num_species; ++i){ + checkCudaErrors(cudaMalloc((void**)&d_boundary_Y_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_Y_init_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_Y_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_Y_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_Y_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_Y_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_rhoD_vector[i], boundary_face_bytes)); + } + total_bytes += (boundary_face_bytes*10 + boundary_face_vec_bytes * 11); // checkCudaErrors(cudaMalloc((void**)&d_A_csr, csr_value_vec_bytes)); diff --git a/src_gpu/dfYEqn.H b/src_gpu/dfYEqn.H new file mode 100644 index 000000000..85f4493a6 --- /dev/null +++ b/src_gpu/dfYEqn.H @@ -0,0 +1,54 @@ +#pragma once + +#include "AmgXSolver.H" +#include +#include "dfMatrixDataBase.H" + +class dfYEqn +{ +private: + dfMatrixDataBase &dataBase_; + cudaStream_t stream; + + std::vector YSolverSet; + int num_iteration = 0; + double time_monitor_CPU; + double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test; + + // common variables + int num_cells, cell_bytes, num_faces, num_surfaces, num_boundary_cells, num_boundary_faces, num_species, inertIndex; + int *d_A_csr_row_index, *d_A_csr_diag_index, *d_A_csr_col_index; + + // Matrix variables + double *d_A_csr, *d_b, *d_psi = nullptr; + double *h_A_csr, *h_b, *h_psi = nullptr; + double *d_sumYDiffError = nullptr, *d_sumYDiffError_tmp = nullptr; + double *d_sumYDiffError_boundary = nullptr; + double *d_phiUc_boundary = nullptr; + double *d_phiUc = nullptr; + double *d_mut_Sct = nullptr; + +public: + dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile, const int inertIndex); + + ~dfYEqn(); + + void checkValue(bool print, char *filename); + + void correctVelocity(std::vector Y_new, std::vector boundary_Y_init, std::vector rhoD_GPU, + const double *phiUcRef, double *boundary_sumYDiffError_ref, double *phiUcBouRef); + + void upwindWeight(double *refWeight); + + void fvm_ddt(std::vector Y_old); + + void fvm_div_phi(); + + void fvm_div_phiUc(); + + void fvm_laplacian(double *mut_Sct, double *boundary_mut_Sct, std::vectorboundary_rhoD); + + void solve(); + + void updatePsi(double *Psi, int speciesIndex); +}; diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu new file mode 100644 index 000000000..d25efb862 --- /dev/null +++ b/src_gpu/dfYEqn.cu @@ -0,0 +1,831 @@ +#include "dfYEqn.H" + +// kernel functions +__global__ void getUpwindWeight(int num_faces, double *phi, double *weight) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_faces) + return; + if (phi[index] >= 0) + weight[index] = 1.; + else + weight[index] = 0.; +} + +__global__ void fvc_grad_internal_face_Y(int num_cells, + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, const double *volume, + const double *face_vector, const double *weight, const double *species, const double *rhoD, + const double *sumYDiffError, double *sumYDiffError_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int next_row_index = csr_row_index[index + 1]; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double own_cell_Y = species[index]; + double grad_bx_upper = 0; + double grad_by_upper = 0; + double grad_bz_upper = 0; + double grad_bx_lower = 0; + double grad_by_lower = 0; + double grad_bz_lower = 0; + for (int i = row_index; i < next_row_index; i++) + { + int inner_index = i - row_index; + // lower + if (inner_index < diag_index) + { + int neighbor_index = neighbor_offset + inner_index; + double w = weight[neighbor_index]; + double sfx = face_vector[neighbor_index * 3 + 0]; + double sfy = face_vector[neighbor_index * 3 + 1]; + double sfz = face_vector[neighbor_index * 3 + 2]; + int neighbor_cell_id = csr_col_index[row_index + inner_index]; + double neighbor_cell_Y = species[neighbor_cell_id]; + double face_Y = w * (neighbor_cell_Y - own_cell_Y) + own_cell_Y; + grad_bx_lower -= face_Y * sfx; + grad_by_lower -= face_Y * sfy; + grad_bz_lower -= face_Y * sfz; + } + // upper + if (inner_index > diag_index) + { + int neighbor_index = neighbor_offset + inner_index - 1; + double w = weight[neighbor_index]; + double sfx = face_vector[neighbor_index * 3 + 0]; + double sfy = face_vector[neighbor_index * 3 + 1]; + double sfz = face_vector[neighbor_index * 3 + 2]; + int neighbor_cell_id = csr_col_index[row_index + inner_index]; + double neighbor_cell_Y = species[neighbor_cell_id]; + double face_Y = w * (own_cell_Y - neighbor_cell_Y) + neighbor_cell_Y; + grad_bx_upper += face_Y * sfx; + grad_by_upper += face_Y * sfy; + grad_bz_upper += face_Y * sfz; + } + } + double vol = volume[index]; + + sumYDiffError_output[index * 3 + 0] = sumYDiffError[index * 3 + 0] + (grad_bx_upper + grad_bx_lower); + sumYDiffError_output[index * 3 + 1] = sumYDiffError[index * 3 + 1] + (grad_by_upper + grad_by_lower); + sumYDiffError_output[index * 3 + 2] = sumYDiffError[index * 3 + 2] + (grad_bz_upper + grad_bz_lower); + + // if (index == 0) + // { + // printf("grad_bz_upper = %e\n", grad_bz_upper); + // printf("grad_bz_lower = %e\n", grad_bz_lower); + // printf("(grad_bz_upper + grad_bz_lower) = %e\n", (grad_bz_upper + grad_bz_lower)); + // } +} + +__global__ void fvc_grad_boundary_face_Y(int num_cells, int num_boundary_cells, + const int *boundary_cell_offset, const int *boundary_cell_id, const double *rhoD, const int *bouPermedIndex, + const double *boundary_face_vector, const double *boundary_species_init, double *boundary_species, + const double *volume, const double *sumYDiffError, double *sumYDiffError_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + // compute boundary gradient + double grad_bx = 0; + double grad_by = 0; + double grad_bz = 0; + for (int i = cell_offset; i < next_cell_offset; i++) + { + double sfx = boundary_face_vector[i * 3 + 0]; + double sfy = boundary_face_vector[i * 3 + 1]; + double sfz = boundary_face_vector[i * 3 + 2]; + int permute_index = bouPermedIndex[i]; + double face_Y = boundary_species_init[permute_index]; + boundary_species[i] = face_Y; + grad_bx += face_Y * sfx; + grad_by += face_Y * sfy; + grad_bz += face_Y * sfz; + // if (index == 0) + // { + // printf("face_Y = %e\n", face_Y); + // printf("sfz = %e\n", sfz); + // printf("face_Y * sfz = %e\n", face_Y * sfz); + // } + } + + //// correct the boundary gradient + // double nx = boundary_face_vector[face_index * 3 + 0] / magSf[face_index]; + // double ny = boundary_face_vector[face_index * 3 + 1] / magSf[face_index]; + // double nz = boundary_face_vector[face_index * 3 + 2] / magSf[face_index]; + // double sn_grad = 0; + // double grad_correction = sn_grad * volume[cell_index] - (nx * grad_bx + ny * grad_by + nz * grad_bz); + // grad_bx += nx * grad_correction; + // grad_by += ny * grad_correction; + // grad_bz += nz * grad_correction; + + sumYDiffError_output[cell_index * 3 + 0] = sumYDiffError[cell_index * 3 + 0] + grad_bx; + sumYDiffError_output[cell_index * 3 + 1] = sumYDiffError[cell_index * 3 + 1] + grad_by; + sumYDiffError_output[cell_index * 3 + 2] = sumYDiffError[cell_index * 3 + 2] + grad_bz; +} + +__global__ void sumError(int num_cells, const double *volume, const double *rhoD, + const double *sumYDiffErrorTmp, double *sumYDiffError_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + sumYDiffError_output[index * 3 + 0] = sumYDiffError_output[index * 3 + 0] + sumYDiffErrorTmp[index * 3 + 0] * rhoD[index]; + sumYDiffError_output[index * 3 + 1] = sumYDiffError_output[index * 3 + 1] + sumYDiffErrorTmp[index * 3 + 1] * rhoD[index]; + sumYDiffError_output[index * 3 + 2] = sumYDiffError_output[index * 3 + 2] + sumYDiffErrorTmp[index * 3 + 2] * rhoD[index]; +} + +__global__ void divide_vol(int num_cells, const double *volume, + const double *sumYDiffError, double *sumYDiffError_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + double vol = volume[index]; + + sumYDiffError_output[index * 3 + 0] = sumYDiffError[index * 3 + 0] / vol; + sumYDiffError_output[index * 3 + 1] = sumYDiffError[index * 3 + 1] / vol; + sumYDiffError_output[index * 3 + 2] = sumYDiffError[index * 3 + 2] / vol; +} + +__global__ void correct_boundary_conditions_vec(int num_boundary_cells, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_sf, const double *mag_sf, + double *boundary_sumYDiffError, double *sumYDiffError) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + // initialize boundary_sumYDiffError + double sumYDiffError_x = sumYDiffError[cell_index * 3 + 0]; + double sumYDiffError_y = sumYDiffError[cell_index * 3 + 1]; + double sumYDiffError_z = sumYDiffError[cell_index * 3 + 2]; + + for (int i = cell_offset; i < next_cell_offset; i++) + { + double n_x = boundary_sf[i * 3 + 0] / mag_sf[i]; + double n_y = boundary_sf[i * 3 + 1] / mag_sf[i]; + double n_z = boundary_sf[i * 3 + 2] / mag_sf[i]; + double sn_grad = 0; + double grad_correction = sn_grad - (n_x * sumYDiffError_x + n_y * sumYDiffError_y + n_z * sumYDiffError_z); + boundary_sumYDiffError[i * 3 + 0] = sumYDiffError_x + grad_correction * n_x; + boundary_sumYDiffError[i * 3 + 1] = sumYDiffError_y + grad_correction * n_y; + boundary_sumYDiffError[i * 3 + 2] = sumYDiffError_z + grad_correction * n_z; + } +} + +__global__ void calculate_phiUc(int num_cells, const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *face_vector, const double *weight, const double *sumYDiffError, double *phiUc) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int row_elements = csr_row_index[index + 1] - row_index; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double own_cell_sumYDiffError_x = sumYDiffError[index * 3 + 0]; + double own_cell_sumYDiffError_y = sumYDiffError[index * 3 + 1]; + double own_cell_sumYDiffError_z = sumYDiffError[index * 3 + 2]; + + // lower + for (int i = 0; i < diag_index; i++) + { + double phiUc_face = 0; + + int neighbor_index = neighbor_offset + i; + int neighbor_cell_id = csr_col_index[row_index + i]; + double w = weight[neighbor_index]; + double sfx = face_vector[neighbor_index * 3 + 0]; + double sfy = face_vector[neighbor_index * 3 + 1]; + double sfz = face_vector[neighbor_index * 3 + 2]; + double neighbor_cell_sumYDiffError_x = sumYDiffError[neighbor_cell_id * 3 + 0]; + double neighbor_cell_sumYDiffError_y = sumYDiffError[neighbor_cell_id * 3 + 1]; + double neighbor_cell_sumYDiffError_z = sumYDiffError[neighbor_cell_id * 3 + 2]; + double face_x = w * (neighbor_cell_sumYDiffError_x - own_cell_sumYDiffError_x) + own_cell_sumYDiffError_x; + double face_y = w * (neighbor_cell_sumYDiffError_y - own_cell_sumYDiffError_y) + own_cell_sumYDiffError_y; + double face_z = w * (neighbor_cell_sumYDiffError_z - own_cell_sumYDiffError_z) + own_cell_sumYDiffError_z; + + phiUc_face = face_x * sfx + face_y * sfy + face_z * sfz; + phiUc[neighbor_index] = phiUc_face; + } + // upper + for (int i = diag_index + 1; i < row_elements; i++) + { + double phiUc_face = 0; + + int neighbor_index = neighbor_offset + i - 1; + int neighbor_cell_id = csr_col_index[row_index + i]; + double w = weight[neighbor_index]; + double sfx = face_vector[neighbor_index * 3 + 0]; + double sfy = face_vector[neighbor_index * 3 + 1]; + double sfz = face_vector[neighbor_index * 3 + 2]; + double neighbor_cell_sumYDiffError_x = sumYDiffError[neighbor_cell_id * 3 + 0]; + double neighbor_cell_sumYDiffError_y = sumYDiffError[neighbor_cell_id * 3 + 1]; + double neighbor_cell_sumYDiffError_z = sumYDiffError[neighbor_cell_id * 3 + 2]; + double face_x = w * (own_cell_sumYDiffError_x - neighbor_cell_sumYDiffError_x) + neighbor_cell_sumYDiffError_x; + double face_y = w * (own_cell_sumYDiffError_y - neighbor_cell_sumYDiffError_y) + neighbor_cell_sumYDiffError_y; + double face_z = w * (own_cell_sumYDiffError_z - neighbor_cell_sumYDiffError_z) + neighbor_cell_sumYDiffError_z; + + phiUc_face = face_x * sfx + face_y * sfy + face_z * sfz; + phiUc[neighbor_index] = phiUc_face; + } +} + +__global__ void calculate_phiUc_boundary(int num_boundary_faces, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_sf, const double *boundary_sumYDiffError, + double *boundary_phiUc) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_faces) + return; + + double n_x = boundary_sf[index * 3 + 0]; + double n_y = boundary_sf[index * 3 + 1]; + double n_z = boundary_sf[index * 3 + 2]; + + double err_x = boundary_sumYDiffError[index * 3 + 0]; + double err_y = boundary_sumYDiffError[index * 3 + 1]; + double err_z = boundary_sumYDiffError[index * 3 + 2]; + + boundary_phiUc[index] = n_x * err_x + n_y * err_y + n_z * err_z; +} + +__global__ void fvm_ddt_kernel_scalar(int num_cells, int num_faces, const double rdelta_t, + const int *csr_row_index, const int *csr_diag_index, + const double *rho_old, const double *rho_new, const double *volume, const double *species_old, + const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output, double *psi) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int diag_index = csr_diag_index[index]; + int csr_index = row_index + diag_index; + + double ddt_diag = rdelta_t * rho_new[index] * volume[index]; + A_csr_output[csr_index] = A_csr_input[csr_index] + ddt_diag; + + double ddt_part_term = rdelta_t * rho_old[index] * volume[index]; + b_output[index] = b_input[index] + ddt_part_term * species_old[index]; + + psi[index] = species_old[index]; +} + +__global__ void fvm_div_internal_scalar(int num_cells, int num_faces, + const int *csr_row_index, const int *csr_diag_index, + const double *div_weight, const double *phi, + const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int next_row_index = csr_row_index[index + 1]; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double div_diag = 0; + for (int i = row_index; i < next_row_index; i++) + { + int inner_index = i - row_index; + // lower + if (inner_index < diag_index) + { + int neighbor_index = neighbor_offset + inner_index; + double w = div_weight[neighbor_index]; + double f = phi[neighbor_index]; + A_csr_output[i] = A_csr_input[i] + (-w) * f; + // lower neighbors contribute to sum of -1 + div_diag += (w - 1) * f; + } + // upper + if (inner_index > diag_index) + { + // upper, index - 1, consider of diag + int neighbor_index = neighbor_offset + inner_index - 1; + double w = div_weight[neighbor_index]; + double f = phi[neighbor_index]; + A_csr_output[i] = A_csr_input[i] + (1 - w) * f; + // upper neighbors contribute to sum of 1 + div_diag += w * f; + } + } + A_csr_output[row_index + diag_index] = A_csr_input[row_index + diag_index] + div_diag; // diag +} + +__global__ void fvm_div_boundary_scalar(int num_cells, int num_faces, int num_boundary_cells, + const int *csr_row_index, const int *csr_diag_index, const double *boundary_phi, + const int *boundary_cell_offset, const int *boundary_cell_id, + double *internal_coeffs, const double *boundary_coeffs, + const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int cell_index = boundary_cell_id[cell_offset]; + int loop_size = boundary_cell_offset[index + 1] - cell_offset; + + int row_index = csr_row_index[cell_index]; + int diag_index = csr_diag_index[cell_index]; + int csr_dim = num_cells + num_faces; + int csr_index = row_index + diag_index; + + // construct internalCoeffs & boundaryCoeffs + double internal_coeffs_own = 0; + double boundary_coeffs_own = 0; + for (int i = 0; i < loop_size; i++) + { + // zeroGradient + internal_coeffs[cell_offset + i] = 1.; + internal_coeffs_own += boundary_phi[cell_offset + i] * internal_coeffs[cell_offset + i]; + boundary_coeffs_own += -boundary_phi[cell_offset + i] * boundary_coeffs[cell_offset + i]; + } + A_csr_output[csr_index] = A_csr_input[csr_index] + internal_coeffs_own; + b_output[cell_index] = b_input[cell_index] + boundary_coeffs_own; +} + +__global__ void fvm_laplacian_uncorrected_scalar_internal(int num_cells, + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *scalar0, const double *scalar1, const double *weight, + const double *magsf, const double *distance, + const double sign, const double *A_csr_input, double *A_csr_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int row_elements = csr_row_index[index + 1] - row_index; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double own_coeff = scalar0[index] + scalar1[index]; + double sum_diag = 0; + // lower + for (int i = 0; i < diag_index; i++) + { + int neighbor_index = neighbor_offset + i; + int neighbor_cell_id = csr_col_index[i + row_index]; + double w = weight[neighbor_index]; + double nei_coeff = scalar0[neighbor_cell_id] + scalar1[neighbor_cell_id]; + double gamma = w * (nei_coeff - own_coeff) + own_coeff; + double gamma_magsf = gamma * magsf[neighbor_index]; + double coeff = gamma_magsf * distance[neighbor_index]; + A_csr_output[row_index + i] = A_csr_input[row_index + i] + coeff * sign; + + sum_diag += (-coeff); + } + // upper + for (int i = diag_index + 1; i < row_elements; i++) + { + int neighbor_index = neighbor_offset + i - 1; + int neighbor_cell_id = csr_col_index[i + row_index]; + double w = weight[neighbor_index]; + double nei_coeff = scalar0[neighbor_cell_id] + scalar1[neighbor_cell_id]; + double gamma = w * (own_coeff - nei_coeff) + nei_coeff; + double gamma_magsf = gamma * magsf[neighbor_index]; + double coeff = gamma_magsf * distance[neighbor_index]; + A_csr_output[row_index + i] = A_csr_input[row_index + i] + coeff * sign; + + sum_diag += (-coeff); + } + // diag + A_csr_output[row_index + diag_index] = A_csr_input[row_index + diag_index] + sum_diag * sign; +} + +__global__ void fvm_laplacian_uncorrected_scalar_boundary(int num_cells, int num_boundary_cells, + const int *csr_row_index, const int *csr_diag_index, const int *boundary_cell_offset, + const int *boundary_cell_id, const double *boundary_scalar0, const double *boundary_scalar1, + const double *boundary_magsf, const int *bouPermedIndex, + const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, + const double sign, const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + int row_index = csr_row_index[cell_index]; + int diag_index = csr_diag_index[cell_index]; + int csr_index = row_index + diag_index; + + double internal_coeffs = 0; + double boundary_coeffs = 0; + for (int i = cell_offset; i < next_cell_offset; i++) + { + int permute_index = bouPermedIndex[i]; + double gamma = boundary_scalar0[permute_index] + boundary_scalar1[permute_index]; + double gamma_magsf = gamma * boundary_magsf[i]; + // internal_coeffs += gamma_magsf * gradient_internal_coeffs[i * 3 + 0]; + // boundary_coeffs += gamma_magsf * gradient_boundary_coeffs[i * 3 + 0]; + internal_coeffs += gamma_magsf * 0.; + boundary_coeffs += gamma_magsf * 0.; + } + + A_csr_output[csr_index] = A_csr_input[csr_index] + internal_coeffs * sign; + b_output[cell_index] = b_input[cell_index] + boundary_coeffs * sign; +} + +dfYEqn::dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile, const int inertIndex) + : dataBase_(dataBase), inertIndex(inertIndex) +{ + num_species = dataBase_.num_species; + num_cells = dataBase_.num_cells; + num_faces = dataBase_.num_faces; + num_surfaces = dataBase_.num_surfaces; + num_boundary_cells = dataBase_.num_boundary_cells; + num_boundary_faces = dataBase_.num_boundary_faces; + cell_bytes = dataBase_.cell_bytes; + + YSolverSet.resize(num_species - 1); // consider inert species + for (auto &solver : YSolverSet) + solver = new AmgXSolver(modeStr, cfgFile); + + d_A_csr_row_index = dataBase_.d_A_csr_row_index; + d_A_csr_diag_index = dataBase_.d_A_csr_diag_index; + d_A_csr_col_index = dataBase_.d_A_csr_col_index; + + h_A_csr = new double[(num_cells + num_faces) * (num_species - 1)]; + h_b = new double[num_cells * (num_species - 1)]; + cudaMallocHost(&h_psi, num_cells * (num_species - 1) * sizeof(double)); + + checkCudaErrors(cudaMalloc((void **)&d_A_csr, (num_cells + num_faces) * (num_species - 1) * sizeof(double))); + checkCudaErrors(cudaMalloc((void **)&d_b, cell_bytes * (num_species - 1))); + checkCudaErrors(cudaMalloc((void **)&d_psi, cell_bytes * (num_species - 1))); + checkCudaErrors(cudaMalloc((void **)&d_sumYDiffError, 3 * cell_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_sumYDiffError_tmp, 3 * cell_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_phiUc, num_faces * sizeof(double))); + checkCudaErrors(cudaMalloc((void **)&d_phiUc, num_faces * sizeof(double))); + checkCudaErrors(cudaMalloc((void **)&d_sumYDiffError_boundary, 3 * num_boundary_faces * sizeof(double))); + checkCudaErrors(cudaMalloc((void **)&d_phiUc_boundary, num_boundary_faces * sizeof(double))); + checkCudaErrors(cudaMalloc((void **)&d_mut_Sct, cell_bytes)); + + checkCudaErrors(cudaStreamCreate(&stream)); + // zeroGradient + for (size_t i = 0; i < num_species; i++) + { + checkCudaErrors(cudaMemsetAsync(dataBase_.d_internal_coeffs_Y_vector[i], 1, dataBase_.boundary_face_bytes, stream)); + checkCudaErrors(cudaMemsetAsync(dataBase_.d_boundary_coeffs_Y_vector[i], 0, dataBase_.boundary_face_bytes, stream)); + checkCudaErrors(cudaMemsetAsync(dataBase_.d_laplac_internal_coeffs_Y_vector[i], 0, dataBase_.boundary_face_bytes, stream)); + checkCudaErrors(cudaMemsetAsync(dataBase_.d_laplac_boundary_coeffs_Y_vector[i], 0, dataBase_.boundary_face_bytes, stream)); + } +} + +void dfYEqn::upwindWeight(double *refWeight) +{ + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_faces + threads_per_block - 1) / threads_per_block; + getUpwindWeight<<>>(num_faces, dataBase_.d_phi, dataBase_.d_weight_upwind); + +#ifdef _CHECK_ + double *h_refWeight_init = new double[num_faces]; + double *h_refWeight = new double[num_faces]; + double *h_weight_upwind = new double[num_faces]; + cudaMemcpy(h_weight_upwind, dataBase_.d_weight_upwind, num_faces * sizeof(double), cudaMemcpyDeviceToHost); + memcpy(h_refWeight_init, refWeight, num_surfaces * sizeof(double)); + memcpy(h_refWeight_init + num_surfaces, refWeight, num_surfaces * sizeof(double)); + for (size_t i = 0; i < num_faces; i++) + h_refWeight[i] = h_refWeight_init[dataBase_.permedIndex[i]]; + for (size_t i = 0; i < num_faces; i++) + printf("h_weight_upwind[%d] = %lf\n", i, h_weight_upwind[i]); + for (size_t i = 0; i < num_faces; i++) + printf("h_refWeight[%d] = %lf\n", i, h_refWeight[i]); +#endif +} + +void dfYEqn::correctVelocity(std::vector Y_new, std::vector boundary_Y_init, std::vector rhoD_GPU, + const double *phiUcRef, double *boundary_sumYDiffError_ref, double *phiUcBouRef) +{ + // initialize variables in each time step + checkCudaErrors(cudaMemsetAsync(d_sumYDiffError, 0, 3 * cell_bytes, stream)); + checkCudaErrors(cudaMemsetAsync(d_phiUc, 0, num_faces * sizeof(double), stream)); + + size_t threads_per_block, blocks_per_grid; + for (size_t i = 0; i < num_species; ++i) + { + checkCudaErrors(cudaMemsetAsync(d_sumYDiffError_tmp, 0, 3 * cell_bytes, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_Y_new_vector[i], Y_new[i], cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_Y_init_vector[i], boundary_Y_init[i], dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_rhoD_vector[i], rhoD_GPU[i], cell_bytes, cudaMemcpyHostToDevice, stream)); + + // launch cuda kernel + threads_per_block = 1024; + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_internal_face_Y<<>>(num_cells, + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, dataBase_.d_volume, + dataBase_.d_face_vector, dataBase_.d_weight, dataBase_.d_Y_new_vector[i], + dataBase_.d_rhoD_vector[i], d_sumYDiffError_tmp, d_sumYDiffError_tmp); + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_boundary_face_Y<<>>(num_cells, num_boundary_cells, dataBase_.d_boundary_cell_offset, + dataBase_.d_boundary_cell_id, dataBase_.d_rhoD_vector[i], dataBase_.d_bouPermedIndex, + dataBase_.d_boundary_face_vector, dataBase_.d_boundary_Y_init_vector[i], dataBase_.d_boundary_Y_vector[i], + dataBase_.d_volume, d_sumYDiffError_tmp, d_sumYDiffError_tmp); + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + sumError<<>>(num_cells, dataBase_.d_volume, dataBase_.d_rhoD_vector[i], d_sumYDiffError_tmp, d_sumYDiffError); + } + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + divide_vol<<>>(num_cells, dataBase_.d_volume, d_sumYDiffError, d_sumYDiffError); + + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + correct_boundary_conditions_vec<<>>(num_boundary_cells, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_boundary_face_vector, dataBase_.d_boundary_face, d_sumYDiffError_boundary, d_sumYDiffError); + + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + calculate_phiUc<<>>(num_cells, + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, dataBase_.d_face_vector, + dataBase_.d_weight, d_sumYDiffError, d_phiUc); + + blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block; + calculate_phiUc_boundary<<>>(num_boundary_faces, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_boundary_face_vector, d_sumYDiffError_boundary, d_phiUc_boundary); + +#ifdef _CHECK_ + checkCudaErrors(cudaStreamSynchronize(stream)); + double *h_sumYDiffError = new double[num_cells * 3]; + double *h_phiUc = new double[num_faces]; + double *h_phiUc_ref_init = new double[num_faces]; + double *h_phiUc_ref = new double[num_faces]; + double *h_sumYDiffError_boundary = new double[num_boundary_faces * 3]; + double *h_sumYDiffError_boundary_gpu = new double[num_boundary_faces * 3]; + double *h_phiUc_boundary = new double[num_boundary_faces]; + double *h_phiUc_boundary_gpu = new double[num_boundary_faces]; + + memcpy(h_phiUc_ref_init, phiUcRef, num_surfaces * sizeof(double)); + memcpy(h_phiUc_ref_init + num_surfaces, phiUcRef, num_surfaces * sizeof(double)); + for (int i = 0; i < num_faces; i++) + { + h_phiUc_ref[i] = h_phiUc_ref_init[dataBase_.permedIndex[i]]; + } + for (int i = 0; i < num_boundary_faces; i++) + { + h_sumYDiffError_boundary[3 * i] = boundary_sumYDiffError_ref[3 * dataBase_.boundPermutationList[i]]; + h_sumYDiffError_boundary[3 * i + 1] = boundary_sumYDiffError_ref[3 * dataBase_.boundPermutationList[i] + 1]; + h_sumYDiffError_boundary[3 * i + 2] = boundary_sumYDiffError_ref[3 * dataBase_.boundPermutationList[i] + 2]; + h_phiUc_boundary[i] = phiUcBouRef[dataBase_.boundPermutationList[i]]; + } + cudaMemcpy(h_sumYDiffError, d_sumYDiffError, num_cells * sizeof(double) * 3, cudaMemcpyDeviceToHost); + cudaMemcpy(h_phiUc, d_phiUc, num_faces * sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(h_sumYDiffError_boundary_gpu, d_sumYDiffError_boundary, 3 * num_boundary_faces * sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(h_phiUc_boundary_gpu, d_phiUc_boundary, num_boundary_faces * sizeof(double), cudaMemcpyDeviceToHost); + // for (size_t i = 0; i < num_cells; i++) + // printf("h_sumYDiffError[%d] = (%e, %e, %e)\n", i, h_sumYDiffError[num_cells * 0 + i], + // h_sumYDiffError[num_cells * 1 + i], h_sumYDiffError[num_cells * 2 + i]); + // for (size_t i = 0; i < num_faces; i++) + // printf("phiUc[%d] = %e\n", i, h_phiUc[i]); + // for (size_t i = 0; i < num_faces; i++) + // printf("phiUcof[%d] = %e\n", i, h_phiUc_ref[i]); + for (int i = 0; i < num_boundary_faces; i++) + { + // printf("boundary_sumYDiffError_of[%d] = (%e, %e, %e)\n", i, h_sumYDiffError_boundary[3 * i], + // h_sumYDiffError_boundary[3 * i + 1], h_sumYDiffError_boundary[3 * i + 2]); + printf("boundary_phiUc_of[%d] = %e\n", i, h_phiUc_boundary[i]); + } + for (int i = 0; i < num_boundary_faces; i++) + { + // printf("boundary_phiUc_gpu[%d] = (%e, %e, %e)\n", i, h_sumYDiffError_boundary_gpu[3 * i], + // h_sumYDiffError_boundary_gpu[3 * i + 1], h_sumYDiffError_boundary_gpu[3 * i + 2]); + printf("boundary_phiUc_gpu[%d] = %e\n", i, h_phiUc_boundary_gpu[i]); + } +#endif +} + +void dfYEqn::fvm_ddt(std::vector Y_old) +{ + // initialize variables in each time step + checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, (num_cells + num_faces) * (num_species - 1) * sizeof(double), stream)); // consider inert species + checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_bytes * (num_species - 1), stream)); + checkCudaErrors(cudaMemsetAsync(d_psi, 0, cell_bytes * (num_species - 1), stream)); + + size_t threads_per_block, blocks_per_grid; + int mtxIndex = 0; + for (size_t i = 0; i < num_species; ++i) + { + if (i == inertIndex) + continue; + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_Y_old_vector[i], Y_old[i], cell_bytes, cudaMemcpyHostToDevice, stream)); + + // launch cuda kernel + threads_per_block = 1024; + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvm_ddt_kernel_scalar<<>>(num_cells, num_faces, dataBase_.rdelta_t, + d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, dataBase_.d_Y_old_vector[i], + d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, + d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, + d_psi + mtxIndex * num_cells); + ++mtxIndex; + } +} + +void dfYEqn::fvm_div_phi() +{ + size_t threads_per_block, blocks_per_grid; + int mtxIndex = 0; + for (size_t i = 0; i < num_species; ++i) + { + if (i == inertIndex) + continue; + + // launch cuda kernel + threads_per_block = 512; + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvm_div_internal_scalar<<>>(num_cells, num_faces, + d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_weight_upwind, dataBase_.d_phi, + d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, + d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells); + + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvm_div_boundary_scalar<<>>(num_cells, num_faces, num_boundary_cells, + d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_phi, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_internal_coeffs_Y_vector[i], dataBase_.d_boundary_coeffs_Y_vector[i], + d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, + d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells); + ++mtxIndex; + } +} + +void dfYEqn::fvm_div_phiUc() +{ + size_t threads_per_block, blocks_per_grid; + int mtxIndex = 0; + for (size_t i = 0; i < num_species; ++i) + { + if (i == inertIndex) + continue; + // launch cuda kernel + threads_per_block = 512; + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvm_div_internal_scalar<<>>(num_cells, num_faces, + d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_weight_upwind, d_phiUc, + d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, + d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells); + + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvm_div_boundary_scalar<<>>(num_cells, num_faces, num_boundary_cells, + d_A_csr_row_index, d_A_csr_diag_index, d_phiUc_boundary, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_internal_coeffs_Y_vector[i], dataBase_.d_boundary_coeffs_Y_vector[i], + d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, + d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells); + ++mtxIndex; + } +} + +void dfYEqn::fvm_laplacian(double *mut_Sct, double *boundary_mut_Sct, std::vector boundary_rhoD) +{ + size_t threads_per_block, blocks_per_grid; + int mtxIndex = 0; + + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_mut_sct, boundary_mut_Sct, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_mut_Sct, mut_Sct, cell_bytes, cudaMemcpyHostToDevice, stream)); + + for (size_t i = 0; i < num_species; ++i) + { + if (i == inertIndex) + continue; + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_rhoD_vector[i], boundary_rhoD[i], dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + + // launch cuda kernel + threads_per_block = 1024; + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvm_laplacian_uncorrected_scalar_internal<<>>(num_cells, d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + d_mut_Sct, dataBase_.d_rhoD_vector[i], dataBase_.d_weight, dataBase_.d_face, + dataBase_.d_deltaCoeffs, -1., d_A_csr + mtxIndex * (num_cells + num_faces), + d_A_csr + mtxIndex * (num_cells + num_faces)); + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvm_laplacian_uncorrected_scalar_boundary<<>>(num_cells, num_boundary_cells, + d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, + dataBase_.d_boundary_cell_id, dataBase_.d_boundary_rhoD_vector[i], + dataBase_.d_boundary_mut_sct, dataBase_.d_boundary_face, dataBase_.d_bouPermedIndex, + dataBase_.d_laplac_internal_coeffs_Y_vector[i], dataBase_.d_laplac_boundary_coeffs_Y_vector[i], -1., + d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells, + d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells); + ++mtxIndex; + } +} + +void dfYEqn::checkValue(bool print, char *filename) +{ + checkCudaErrors(cudaMemcpyAsync(h_A_csr, d_A_csr, (num_cells + num_faces) * sizeof(double), cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaMemcpyAsync(h_b, d_b, num_cells * sizeof(double), cudaMemcpyDeviceToHost, stream)); + + // Synchronize stream + checkCudaErrors(cudaStreamSynchronize(stream)); + if (print) + { + for (int i = 0; i < (num_faces + num_cells); i++) + fprintf(stderr, "h_A_csr[%d]: %.15lf\n", i, h_A_csr[i]); + for (int i = 0; i < num_cells; i++) + fprintf(stderr, "h_b[%d]: %.15lf\n", i, h_b[i]); + } + + char *input_file = filename; + FILE *fp = fopen(input_file, "rb+"); + if (fp == NULL) + { + fprintf(stderr, "Failed to open input file: %s!\n", input_file); + } + + int readfile = 0; + double *of_b = new double[num_cells]; + double *of_A = new double[num_faces + num_cells]; + readfile = fread(of_b, num_cells * sizeof(double), 1, fp); + readfile = fread(of_A, (num_faces + num_cells) * sizeof(double), 1, fp); + + std::vector h_A_of_vec_1mtx(num_faces + num_cells, 0); + for (int i = 0; i < num_faces + num_cells; i++) + { + h_A_of_vec_1mtx[i] = of_A[dataBase_.tmpPermutatedList[i]]; + } + if (print) + { + for (int i = 0; i < (num_faces + num_cells); i++) + printf("h_A_of_vec_1mtx[%d]: %.15lf\n", i, h_A_of_vec_1mtx[i]); + for (int i = 0; i < num_cells; i++) + printf("h_b_of_vec[%d]: %.15lf\n", i, of_b[i]); + } + + fprintf(stderr, "check of h_A_csr\n"); + checkVectorEqual(num_faces + num_cells, h_A_of_vec_1mtx.data(), h_A_csr, 1e-5); + fprintf(stderr, "check of h_b\n"); + checkVectorEqual(num_cells, of_b, h_b, 1e-5); +} + +void dfYEqn::solve() +{ + checkCudaErrors(cudaStreamSynchronize(stream)); + + int nNz = num_cells + num_faces; // matrix entries + if (num_iteration == 0) // first interation + { + printf("Initializing AmgX Linear Solver\n"); + int solverIndex = 0; + for (auto &solver : YSolverSet) + { + solver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr + solverIndex * nNz); + ++solverIndex; + } + } + else + { + int solverIndex = 0; + for (auto &solver : YSolverSet) + { + solver->updateOperator(num_cells, nNz, d_A_csr + solverIndex * nNz); + ++solverIndex; + } + } + int solverIndex = 0; + for (auto &solver : YSolverSet) + { + solver->solve(num_cells, d_psi + solverIndex * num_cells, d_b + solverIndex * num_cells); + ++solverIndex; + } + num_iteration++; + checkCudaErrors(cudaMemcpyAsync(h_psi, d_psi, num_cells * (num_species - 1) * sizeof(double), cudaMemcpyDeviceToHost, stream)); + // checkCudaErrors(cudaStreamSynchronize(stream)); + // for (size_t i = 0; i < num_cells; i++) + // fprintf(stderr, "h_species_gpu[%d]: %.5e\n", i, h_psi[i + 0 * num_cells]); +} + +void dfYEqn::updatePsi(double *Psi, int speciesIndex) +{ + checkCudaErrors(cudaStreamSynchronize(stream)); + for (size_t i = 0; i < num_cells; i++) + Psi[i] = h_psi[i + speciesIndex * num_cells]; +} + +dfYEqn::~dfYEqn() +{ +} \ No newline at end of file From 5cd1e536394ae7967ac5c792f4d6b28d5faca6b3 Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Fri, 2 Jun 2023 12:59:23 +0800 Subject: [PATCH 2/2] clean version --- applications/solvers/dfLowMachFoam/YEqn.H | 158 +++++++++------------- src_gpu/dfYEqn.H | 5 +- src_gpu/dfYEqn.cu | 69 +--------- 3 files changed, 65 insertions(+), 167 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H index e5cbd15d7..5e44d9b3f 100644 --- a/applications/solvers/dfLowMachFoam/YEqn.H +++ b/applications/solvers/dfLowMachFoam/YEqn.H @@ -15,23 +15,6 @@ tmp> mvConvection ) ); -upwind mvConvectionInterpolation(mesh, phi); -tmp t_upwind_weight = mvConvectionInterpolation.weights(); -surfaceScalarField upwind_weight = t_upwind_weight.ref(); - -volVectorField sumYGrad -( - IOobject - ( - "sumYGrad", - runTime.timeName(), - mesh, - IOobject::NO_READ, - IOobject::NO_WRITE - ), - mesh, - dimensionedVector(dimensionSet(0,-1,0,0,0,0,0), Zero) -); forAll(Y, i) { @@ -39,25 +22,6 @@ forAll(Y, i) } const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf(); -// for (size_t i = 0; i < num_cells; i++) -// { -// printf("sumYDiffError[%d] = (%e, %e, %e)\n", i, sumYDiffError[i][0], sumYDiffError[i][1], -// sumYDiffError[i][2]); -// // printf("fvc::grad[%d] = (%e, %e, %e)\n", i, test_grad[i][0], test_grad[i][1], -// // test_grad[i][2]); -// } - -const surfaceVectorField test_interpolate = linearInterpolate(sumYDiffError).ref(); - -// for (size_t i = 0; i < num_surfaces; i++) -// { -// // printf("test_interpolate[%d] = (%e, %e, %e)\n", i, test_interpolate[i][0], test_interpolate[i][1], -// // test_interpolate[i][2]); -// // printf("surface vector[%d] = (%e, %e, %e)\n", i, mesh.Sf()[i][0], mesh.Sf()[i][1], -// // mesh.Sf()[i][2]); -// printf("phiUcOf[%d] = %e\n", i, phiUc[i]); -// } - #ifdef GPUSolver_ start1 = std::clock(); // // std::thread t(&dfMatrix::solve, &UEqn_GPU); @@ -67,53 +31,49 @@ const surfaceVectorField test_interpolate = linearInterpolate(sumYDiffError).ref time_monitor_UEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); #endif -std::vector Y_new(Y.size()), Y_old(Y.size()), boundary_Y_init(Y.size()), - boundary_rhoD_init(Y.size()); -std::vector rhoD_GPU(Y.size()); -for (size_t i = 0; i < Y.size(); ++i) -{ - volScalarField& Yi = Y[i]; - const volScalarField& rhoDi = chemistry->rhoD(i); - Y_new[i] = &Yi[0]; - Y_old[i] = &Yi.oldTime()[0]; - rhoD_GPU[i] = &chemistry->rhoD(i)[0]; - cudaMallocHost(&boundary_Y_init[i], num_boundary_faces*sizeof(double)); - cudaMallocHost(&boundary_rhoD_init[i], num_boundary_faces*sizeof(double)); - int offset = 0; - forAll(Yi.boundaryField(), patchi) +#ifdef GPUSolver_ + std::vector Y_new(Y.size()), Y_old(Y.size()), boundary_Y_init(Y.size()), boundary_rhoD_init(Y.size()); + std::vector rhoD_GPU(Y.size()); + for (size_t i = 0; i < Y.size(); ++i) { - const scalarField& patchYi = Yi.boundaryField()[patchi]; - const scalarField& patchRhoDi = rhoDi.boundaryField()[patchi]; - int patchSize = patchYi.size(); + volScalarField& Yi = Y[i]; + const volScalarField& rhoDi = chemistry->rhoD(i); + Y_new[i] = &Yi[0]; + Y_old[i] = &Yi.oldTime()[0]; + rhoD_GPU[i] = &chemistry->rhoD(i)[0]; + cudaMallocHost(&boundary_Y_init[i], num_boundary_faces*sizeof(double)); + cudaMallocHost(&boundary_rhoD_init[i], num_boundary_faces*sizeof(double)); + int offset = 0; + forAll(Yi.boundaryField(), patchi) + { + const scalarField& patchYi = Yi.boundaryField()[patchi]; + const scalarField& patchRhoDi = rhoDi.boundaryField()[patchi]; + int patchSize = patchYi.size(); - memcpy(boundary_Y_init[i]+offset, &patchYi[0], patchSize*sizeof(double)); - memcpy(boundary_rhoD_init[i]+offset, &patchRhoDi[0], patchSize*sizeof(double)); - offset += patchSize; + memcpy(boundary_Y_init[i]+offset, &patchYi[0], patchSize*sizeof(double)); + memcpy(boundary_rhoD_init[i]+offset, &patchRhoDi[0], patchSize*sizeof(double)); + offset += patchSize; + } } -} -volScalarField mut_sct = turbulence->mut().ref()/Sct; -std::vector boundary_sumYDiffError, boundary_phiUc, boundary_mutsct; -forAll(p.boundaryField(), patchi) -{ - const vectorField& patchErri = sumYDiffError.boundaryField()[patchi]; - const scalarField& patchPhiUc = phiUc.boundaryField()[patchi]; - const scalarField& patchMut_sct = mut_sct.boundaryField()[patchi]; - - int patchSize = patchErri.size(); - boundary_sumYDiffError.insert(boundary_sumYDiffError.end(), &patchErri[0][0], &patchErri[0][0]+3*patchSize); - boundary_phiUc.insert(boundary_phiUc.end(), &patchPhiUc[0], &patchPhiUc[0] + patchSize); - boundary_mutsct.insert(boundary_mutsct.end(), &patchMut_sct[0], &patchMut_sct[0] + patchSize); -} + volScalarField mut_sct = turbulence->mut().ref()/Sct; + std::vector boundary_mutsct; + forAll(p.boundaryField(), patchi) + { + const scalarField& patchMut_sct = mut_sct.boundaryField()[patchi]; + int patchSize = patchMut_sct.size(); + boundary_mutsct.insert(boundary_mutsct.end(), &patchMut_sct[0], &patchMut_sct[0] + patchSize); + } -YEqn_GPU.upwindWeight(&upwind_weight[0]); -YEqn_GPU.correctVelocity(Y_new, boundary_Y_init, rhoD_GPU, &phiUc[0], boundary_sumYDiffError.data(), boundary_phiUc.data()); -YEqn_GPU.fvm_ddt(Y_old); -YEqn_GPU.fvm_div_phi(); -YEqn_GPU.fvm_div_phiUc(); -YEqn_GPU.fvm_laplacian(&mut_sct[0], boundary_mutsct.data(), boundary_rhoD_init); + YEqn_GPU.upwindWeight(); + YEqn_GPU.correctVelocity(Y_new, boundary_Y_init, rhoD_GPU); + YEqn_GPU.fvm_ddt(Y_old); + YEqn_GPU.fvm_div_phi(); + YEqn_GPU.fvm_div_phiUc(); + YEqn_GPU.fvm_laplacian(&mut_sct[0], boundary_mutsct.data(), boundary_rhoD_init); -YEqn_GPU.solve(); + YEqn_GPU.solve(); +#endif //MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); label flag_mpi_init; @@ -147,26 +107,30 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC); if (i != inertIndex) { - // tmp DEff = chemistry->rhoD(i) + turbulence->mut()/Sct; - // fvScalarMatrix YiEqn - // ( - // fvm::ddt(rho, Yi) - // + mvConvection->fvmDiv(phi, Yi) - // + mvConvection->fvmDiv(phiUc, Yi) - // == - // ( - // splitting - // ? fvm::laplacian(DEff(), Yi) - // : (fvm::laplacian(DEff(), Yi) + combustion->R(Yi)) - // ) - // ); - - // YiEqn.relax(); - - // YiEqn.solve("Yi"); - - YEqn_GPU.updatePsi(&Yi[0], speciesIndex); - Yi.correctBoundaryConditions(); + #ifdef CPUSolver_ + tmp DEff = chemistry->rhoD(i) + turbulence->mut()/Sct; + fvScalarMatrix YiEqn + ( + fvm::ddt(rho, Yi) + + mvConvection->fvmDiv(phi, Yi) + + mvConvection->fvmDiv(phiUc, Yi) + == + ( + splitting + ? fvm::laplacian(DEff(), Yi) + : (fvm::laplacian(DEff(), Yi) + combustion->R(Yi)) + ) + ); + + YiEqn.relax(); + + YiEqn.solve("Yi"); + #endif + + #ifdef GPUSolver_ + YEqn_GPU.updatePsi(&Yi[0], speciesIndex); + Yi.correctBoundaryConditions(); + #endif Yi.max(0.0); Yt += Yi; diff --git a/src_gpu/dfYEqn.H b/src_gpu/dfYEqn.H index 85f4493a6..e477d0d2e 100644 --- a/src_gpu/dfYEqn.H +++ b/src_gpu/dfYEqn.H @@ -35,10 +35,9 @@ public: void checkValue(bool print, char *filename); - void correctVelocity(std::vector Y_new, std::vector boundary_Y_init, std::vector rhoD_GPU, - const double *phiUcRef, double *boundary_sumYDiffError_ref, double *phiUcBouRef); + void correctVelocity(std::vector Y_new, std::vector boundary_Y_init, std::vector rhoD_GPU); - void upwindWeight(double *refWeight); + void upwindWeight(); void fvm_ddt(std::vector Y_old); diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu index d25efb862..b507763a0 100644 --- a/src_gpu/dfYEqn.cu +++ b/src_gpu/dfYEqn.cu @@ -501,30 +501,14 @@ dfYEqn::dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std } } -void dfYEqn::upwindWeight(double *refWeight) +void dfYEqn::upwindWeight() { size_t threads_per_block = 1024; size_t blocks_per_grid = (num_faces + threads_per_block - 1) / threads_per_block; getUpwindWeight<<>>(num_faces, dataBase_.d_phi, dataBase_.d_weight_upwind); - -#ifdef _CHECK_ - double *h_refWeight_init = new double[num_faces]; - double *h_refWeight = new double[num_faces]; - double *h_weight_upwind = new double[num_faces]; - cudaMemcpy(h_weight_upwind, dataBase_.d_weight_upwind, num_faces * sizeof(double), cudaMemcpyDeviceToHost); - memcpy(h_refWeight_init, refWeight, num_surfaces * sizeof(double)); - memcpy(h_refWeight_init + num_surfaces, refWeight, num_surfaces * sizeof(double)); - for (size_t i = 0; i < num_faces; i++) - h_refWeight[i] = h_refWeight_init[dataBase_.permedIndex[i]]; - for (size_t i = 0; i < num_faces; i++) - printf("h_weight_upwind[%d] = %lf\n", i, h_weight_upwind[i]); - for (size_t i = 0; i < num_faces; i++) - printf("h_refWeight[%d] = %lf\n", i, h_refWeight[i]); -#endif } -void dfYEqn::correctVelocity(std::vector Y_new, std::vector boundary_Y_init, std::vector rhoD_GPU, - const double *phiUcRef, double *boundary_sumYDiffError_ref, double *phiUcBouRef) +void dfYEqn::correctVelocity(std::vector Y_new, std::vector boundary_Y_init, std::vector rhoD_GPU) { // initialize variables in each time step checkCudaErrors(cudaMemsetAsync(d_sumYDiffError, 0, 3 * cell_bytes, stream)); @@ -568,55 +552,6 @@ void dfYEqn::correctVelocity(std::vector Y_new, std::vector blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block; calculate_phiUc_boundary<<>>(num_boundary_faces, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_face_vector, d_sumYDiffError_boundary, d_phiUc_boundary); - -#ifdef _CHECK_ - checkCudaErrors(cudaStreamSynchronize(stream)); - double *h_sumYDiffError = new double[num_cells * 3]; - double *h_phiUc = new double[num_faces]; - double *h_phiUc_ref_init = new double[num_faces]; - double *h_phiUc_ref = new double[num_faces]; - double *h_sumYDiffError_boundary = new double[num_boundary_faces * 3]; - double *h_sumYDiffError_boundary_gpu = new double[num_boundary_faces * 3]; - double *h_phiUc_boundary = new double[num_boundary_faces]; - double *h_phiUc_boundary_gpu = new double[num_boundary_faces]; - - memcpy(h_phiUc_ref_init, phiUcRef, num_surfaces * sizeof(double)); - memcpy(h_phiUc_ref_init + num_surfaces, phiUcRef, num_surfaces * sizeof(double)); - for (int i = 0; i < num_faces; i++) - { - h_phiUc_ref[i] = h_phiUc_ref_init[dataBase_.permedIndex[i]]; - } - for (int i = 0; i < num_boundary_faces; i++) - { - h_sumYDiffError_boundary[3 * i] = boundary_sumYDiffError_ref[3 * dataBase_.boundPermutationList[i]]; - h_sumYDiffError_boundary[3 * i + 1] = boundary_sumYDiffError_ref[3 * dataBase_.boundPermutationList[i] + 1]; - h_sumYDiffError_boundary[3 * i + 2] = boundary_sumYDiffError_ref[3 * dataBase_.boundPermutationList[i] + 2]; - h_phiUc_boundary[i] = phiUcBouRef[dataBase_.boundPermutationList[i]]; - } - cudaMemcpy(h_sumYDiffError, d_sumYDiffError, num_cells * sizeof(double) * 3, cudaMemcpyDeviceToHost); - cudaMemcpy(h_phiUc, d_phiUc, num_faces * sizeof(double), cudaMemcpyDeviceToHost); - cudaMemcpy(h_sumYDiffError_boundary_gpu, d_sumYDiffError_boundary, 3 * num_boundary_faces * sizeof(double), cudaMemcpyDeviceToHost); - cudaMemcpy(h_phiUc_boundary_gpu, d_phiUc_boundary, num_boundary_faces * sizeof(double), cudaMemcpyDeviceToHost); - // for (size_t i = 0; i < num_cells; i++) - // printf("h_sumYDiffError[%d] = (%e, %e, %e)\n", i, h_sumYDiffError[num_cells * 0 + i], - // h_sumYDiffError[num_cells * 1 + i], h_sumYDiffError[num_cells * 2 + i]); - // for (size_t i = 0; i < num_faces; i++) - // printf("phiUc[%d] = %e\n", i, h_phiUc[i]); - // for (size_t i = 0; i < num_faces; i++) - // printf("phiUcof[%d] = %e\n", i, h_phiUc_ref[i]); - for (int i = 0; i < num_boundary_faces; i++) - { - // printf("boundary_sumYDiffError_of[%d] = (%e, %e, %e)\n", i, h_sumYDiffError_boundary[3 * i], - // h_sumYDiffError_boundary[3 * i + 1], h_sumYDiffError_boundary[3 * i + 2]); - printf("boundary_phiUc_of[%d] = %e\n", i, h_phiUc_boundary[i]); - } - for (int i = 0; i < num_boundary_faces; i++) - { - // printf("boundary_phiUc_gpu[%d] = (%e, %e, %e)\n", i, h_sumYDiffError_boundary_gpu[3 * i], - // h_sumYDiffError_boundary_gpu[3 * i + 1], h_sumYDiffError_boundary_gpu[3 * i + 2]); - printf("boundary_phiUc_gpu[%d] = %e\n", i, h_phiUc_boundary_gpu[i]); - } -#endif } void dfYEqn::fvm_ddt(std::vector Y_old)