diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H index 93e60e7bf..c9cbd1e68 100644 --- a/applications/solvers/dfLowMachFoam/EEqn.H +++ b/applications/solvers/dfLowMachFoam/EEqn.H @@ -28,6 +28,8 @@ #ifdef GPUSolver_ start1 = std::clock(); UEqn_GPU.updatePsi(&U[0][0]); + UEqn_GPU.correctBoundaryConditions(); + U.correctBoundaryConditions(); K = 0.5*magSqr(U); end1 = std::clock(); time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); @@ -51,8 +53,10 @@ const scalarField& patchK = K.boundaryField()[patchi]; const scalarField& patchAlphaEff = alphaEff.boundaryField()[patchi]; + const scalarField patchGrad = he.boundaryField()[patchi].gradientBoundaryCoeffs(); // gradient_ (copy: gradientBoundaryCoeffs() returns a tmp; a reference would dangle) memcpy(boundary_K + eeqn_offset, &patchK[0], patchSize*sizeof(double)); memcpy(boundary_alphaEff + eeqn_offset, &patchAlphaEff[0], patchSize*sizeof(double)); + memcpy(boundary_gradient + eeqn_offset, &patchGrad[0], patchSize*sizeof(double)); eeqn_offset += patchSize; } @@ -67,7 +71,7 @@ // prepare data on GPU start1 = std::clock(); EEqn_GPU.prepare_data(&he.oldTime()[0], &K[0], &K.oldTime()[0], &alphaEff[0], - &dpdt[0], boundary_K, boundary_alphaEff); + &dpdt[0], boundary_K, boundary_alphaEff, boundary_gradient); EEqn_GPU.sync(); end1 = std::clock(); time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H index 94fb6cf8d..6e6c9876a 100644 --- a/applications/solvers/dfLowMachFoam/YEqn.H +++ b/applications/solvers/dfLowMachFoam/YEqn.H @@ -39,7 +39,10 @@ time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); { volScalarField& Yi = Y[i]; Y_old[i] = &Yi.oldTime()[0]; - cudaMallocHost(&boundary_Y[i], num_boundary_faces*sizeof(double)); + if (updateBoundaryFields) + { + cudaMallocHost(&boundary_Y[i], num_boundary_faces*sizeof(double)); + } const volScalarField& haii = chemistry->hai(i);
const volScalarField& rhoDi = chemistry->rhoD(i); hai[i] = &haii[0]; @@ -54,12 +58,16 @@ time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); const scalarField& patchRhoDi = rhoDi.boundaryField()[patchi]; int patchSize = patchYi.size(); - memcpy(boundary_Y[i] + offset, &patchYi[0], patchSize*sizeof(double)); + if (updateBoundaryFields) + { + memcpy(boundary_Y[i] + offset, &patchYi[0], patchSize*sizeof(double)); + } memcpy(boundary_hai[i] + offset, &patchHaii[0], patchSize*sizeof(double)); memcpy(boundary_rhoD[i] + offset, &patchRhoDi[0], patchSize*sizeof(double)); offset += patchSize; } } + updateBoundaryFields = false; volScalarField mut_sct = turbulence->mut().ref()/Sct; double *boundary_mutsct = nullptr; cudaMallocHost(&boundary_mutsct, num_boundary_faces*sizeof(double)); @@ -124,6 +132,7 @@ if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); YEqn_GPU.updatePsi(&Yi[0], i); Yi.correctBoundaryConditions(); } + YEqn_GPU.correctBoundaryConditions(); end1 = std::clock(); time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_YEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); diff --git a/applications/solvers/dfLowMachFoam/createdfSolver.H b/applications/solvers/dfLowMachFoam/createdfSolver.H index e56376b28..0707dc629 100644 --- a/applications/solvers/dfLowMachFoam/createdfSolver.H +++ b/applications/solvers/dfLowMachFoam/createdfSolver.H @@ -7,6 +7,8 @@ std::vector boundaryCellIndex; std::vector boundary_face_vector_init; std::vector boundary_face_init; std::vector boundary_deltaCoeffs_init; +std::vector> patchTypes; +std::vector patchTypeU, patchTypeY; int num_boundary_faces = 0; int patchSize; forAll(mesh.boundary(), patchi) @@ -22,14 +24,20 @@ forAll(mesh.boundary(), patchi) boundary_face_init.insert(boundary_face_init.end(), &pMagSf[0], &pMagSf[0]+patchSize); boundary_deltaCoeffs_init.insert(boundary_deltaCoeffs_init.end(), &pDeltaCoeffs[0], &pDeltaCoeffs[0]+patchSize); num_boundary_faces 
+= patchSize; + + constructBoundarySelector(patchTypeU, U.boundaryField()[patchi].type(), patchSize); + constructBoundarySelector(patchTypeY, Y[0].boundaryField()[patchi].type(), patchSize); } +patchTypes.emplace_back(patchTypeU); +patchTypes.emplace_back(patchTypeY); + int num_boundary_cells; string settingPath; settingPath = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string("")); dfMatrixDataBase dfDataBase(num_surfaces, num_cells, num_boundary_faces, Y.size(), num_boundary_cells, &neighbour[0], &owner[0], &mesh.V()[0], &mesh.surfaceInterpolation::weights()[0], -&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.nonOrthDeltaCoeffs()[0], boundary_face_vector_init, boundary_face_init, boundary_deltaCoeffs_init, boundaryCellIndex); +&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.nonOrthDeltaCoeffs()[0], boundary_face_vector_init, boundary_face_init, boundary_deltaCoeffs_init, boundaryCellIndex, patchTypes); #ifdef GPUSolver_ dfRhoEqn rhoEqn_GPU(dfDataBase); @@ -49,12 +57,10 @@ cudaMallocHost(&boundary_nuEff_init, num_boundary_faces*sizeof(double)); cudaMallocHost(&boundary_rho_init, num_boundary_faces*sizeof(double)); cudaMallocHost(&boundary_phi_init, num_boundary_faces*sizeof(double)); -double *boundary_alphaEff, *boundary_K; -double *eeqn_valueInternalCoeffs, *eeqn_valueBoundaryCoeffs, *eeqn_gradientInternalCoeffs, *eeqn_gradientBoundaryCoeffs; +double *boundary_alphaEff, *boundary_K, *boundary_gradient; cudaMallocHost(&boundary_K, num_boundary_faces*sizeof(double)); cudaMallocHost(&boundary_alphaEff, num_boundary_faces*sizeof(double)); -cudaMallocHost(&eeqn_valueInternalCoeffs, num_boundary_faces*sizeof(double)); -cudaMallocHost(&eeqn_valueBoundaryCoeffs, num_boundary_faces*sizeof(double)); -cudaMallocHost(&eeqn_gradientInternalCoeffs, num_boundary_faces*sizeof(double)); -cudaMallocHost(&eeqn_gradientBoundaryCoeffs, num_boundary_faces*sizeof(double)); +cudaMallocHost(&boundary_gradient, num_boundary_faces * sizeof(double)); + +bool 
updateBoundaryFields = true; // make sure that the boundary fields do H2D copy at 1st timestep #endif diff --git a/src_gpu/CMakeLists.txt b/src_gpu/CMakeLists.txt index 1040cee14..686607bae 100644 --- a/src_gpu/CMakeLists.txt +++ b/src_gpu/CMakeLists.txt @@ -24,7 +24,8 @@ add_library(${PROJECT_NAME} dfRhoEqn.cu dfYEqn.cu dfEEqn.cu - AmgXSolver.cu) + AmgXSolver.cu + dfMatrixDataBase.cpp) target_link_libraries(${PROJECT_NAME} ${MPI_LIBRARIES} diff --git a/src_gpu/dfEEqn.H b/src_gpu/dfEEqn.H index f15a44a49..61b2612cf 100644 --- a/src_gpu/dfEEqn.H +++ b/src_gpu/dfEEqn.H @@ -42,6 +42,8 @@ private: double *d_value_boundary_coeffs = nullptr; double *d_gradient_internal_coeffs = nullptr; double *d_gradient_boundary_coeffs = nullptr; + double *d_boundary_gradient_init = nullptr; + double *d_boundary_gradient = nullptr; public: dfEEqn(); @@ -49,7 +51,7 @@ public: ~dfEEqn(); void prepare_data(const double *he_old, const double *K, const double *K_old, const double *alphaEff, - const double *dpdt, const double *boundary_K, const double *boundary_alphaEff); + const double *dpdt, const double *boundary_K, const double *boundary_alphaEff, const double *boundary_gradient); void initializeTimeStep(); diff --git a/src_gpu/dfEEqn.cu b/src_gpu/dfEEqn.cu index f1aa3d14e..9cf1efe6c 100644 --- a/src_gpu/dfEEqn.cu +++ b/src_gpu/dfEEqn.cu @@ -399,9 +399,10 @@ __global__ void eeqn_add_to_source_kernel(int num_cells, __global__ void eeqn_boundaryPermutation(const int num_boundary_faces, const int *bouPermedIndex, const double *boundary_K_init, - const double *boundary_alphaEff_init, + const double *boundary_alphaEff_init, const double *boundary_gradient_init, double *boundary_K, - double *boundary_alphaEff) + double *boundary_alphaEff, + double *boundary_gradient) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_boundary_faces) @@ -411,9 +412,12 @@ __global__ void eeqn_boundaryPermutation(const int num_boundary_faces, const int boundary_K[index] = boundary_K_init[p]; 
boundary_alphaEff[index] = boundary_alphaEff_init[p]; + boundary_gradient[index] = boundary_gradient_init[p]; } -__global__ void eeqn_update_BoundaryCoeffs_kernel(int num_boundary_faces, const double *boundary_phi, double *internal_coeffs, +__global__ void eeqn_update_BoundaryCoeffs_kernel(int num_boundary_faces, const double *boundary_phi, + double *gradient, const double *boundary_deltaCoeffs, + double *internal_coeffs, double *boundary_coeffs, double *laplac_internal_coeffs, double *laplac_boundary_coeffs) { @@ -421,11 +425,12 @@ __global__ void eeqn_update_BoundaryCoeffs_kernel(int num_boundary_faces, const if (index >= num_boundary_faces) return; - // zeroGradient + double grad = gradient[index]; + // energyGradient double valueInternalCoeffs = 1.; - double valueBoundaryCoeffs = 0.; + double valueBoundaryCoeffs = grad / boundary_deltaCoeffs[index]; double gradientInternalCoeffs = 0.; - double gradientBoundaryCoeffs = 0.; + double gradientBoundaryCoeffs = grad; internal_coeffs[index] = boundary_phi[index] * valueInternalCoeffs; boundary_coeffs[index] = -boundary_phi[index] * valueBoundaryCoeffs; @@ -440,7 +445,7 @@ dfEEqn::dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std ESolver = new AmgXSolver(modeStr, cfgFile); stream = dataBase_.stream; - //checkCudaErrors(cudaEventCreate(&event)); + // checkCudaErrors(cudaEventCreate(&event)); num_cells = dataBase_.num_cells; cell_bytes = dataBase_.cell_bytes; @@ -473,6 +478,8 @@ dfEEqn::dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std checkCudaErrors(cudaMalloc((void **)&d_boundary_K, boundary_face_bytes)); checkCudaErrors(cudaMalloc((void **)&d_boundary_alphaEff_init, boundary_face_bytes)); checkCudaErrors(cudaMalloc((void **)&d_boundary_alphaEff, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_boundary_gradient_init, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_boundary_gradient, boundary_face_bytes)); checkCudaErrors(cudaMalloc((void 
**)&d_value_internal_coeffs_init, boundary_face_bytes)); checkCudaErrors(cudaMalloc((void **)&d_value_boundary_coeffs_init, boundary_face_bytes)); @@ -485,7 +492,7 @@ dfEEqn::dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std } void dfEEqn::prepare_data(const double *he_old, const double *K, const double *K_old, const double *alphaEff, - const double *dpdt, const double *boundary_K, const double *boundary_alphaEff) + const double *dpdt, const double *boundary_K, const double *boundary_alphaEff, const double *boundary_gradient) { // TODO not real async copy now, because some host array are not in pinned memory. @@ -499,12 +506,13 @@ void dfEEqn::prepare_data(const double *he_old, const double *K, const double *K // copy and permutate boundary variable checkCudaErrors(cudaMemcpyAsync(d_boundary_K_init, boundary_K, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_boundary_alphaEff_init, boundary_alphaEff, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_gradient_init, boundary_gradient, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); size_t threads_per_block = 1024; size_t blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block; eeqn_boundaryPermutation<<>>(num_boundary_faces, dataBase_.d_bouPermedIndex, - d_boundary_K_init, d_boundary_alphaEff_init, - d_boundary_K, d_boundary_alphaEff); + d_boundary_K_init, d_boundary_alphaEff_init, d_boundary_gradient_init, + d_boundary_K, d_boundary_alphaEff, d_boundary_gradient); } void dfEEqn::initializeTimeStep() @@ -516,6 +524,7 @@ void dfEEqn::initializeTimeStep() size_t threads_per_block = 1024; size_t blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block; eeqn_update_BoundaryCoeffs_kernel<<>>(dataBase_.num_boundary_faces, dataBase_.d_boundary_phi, + d_boundary_gradient, dataBase_.d_boundary_deltaCoeffs, d_value_internal_coeffs, 
d_value_boundary_coeffs, d_gradient_internal_coeffs, d_gradient_boundary_coeffs); } @@ -577,14 +586,14 @@ void dfEEqn::fvc_div_vector() size_t threads_per_block = 512; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; eeqn_fvc_div_vector_internal<<>>(num_cells, - d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - dataBase_.d_face_vector, dataBase_.d_hDiffCorrFlux, dataBase_.d_weight, - 1., d_b, d_b); + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + dataBase_.d_face_vector, dataBase_.d_hDiffCorrFlux, dataBase_.d_weight, + 1., d_b, d_b); blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; eeqn_fvc_div_vector_boundary<<>>(num_boundary_cells, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - dataBase_.d_boundary_face_vector, dataBase_.d_boundary_hDiffCorrFlux, - 1., d_b, d_b); + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_boundary_face_vector, dataBase_.d_boundary_hDiffCorrFlux, + 1., d_b, d_b); } void dfEEqn::fvc_div_phi_scalar() @@ -611,8 +620,8 @@ void dfEEqn::add_to_source() size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; // " + fvc::ddt(rho,K)" is on the left side of "==", thus should minus from source. 
eeqn_add_to_source_kernel<<>>(num_cells, - //1., d_dpdt, -1., d_diffAlphaD, dataBase_.d_volume, d_b, d_b); - 1., d_dpdt, -1., dataBase_.d_diffAlphaD, dataBase_.d_volume, d_b, d_b); + // 1., d_dpdt, -1., d_diffAlphaD, dataBase_.d_volume, d_b, d_b); + 1., d_dpdt, -1., dataBase_.d_diffAlphaD, dataBase_.d_volume, d_b, d_b); } void dfEEqn::checkValue(bool print) @@ -689,10 +698,10 @@ void dfEEqn::solve() num_iteration++; checkCudaErrors(cudaMemcpyAsync(h_he_new, d_he_old, cell_bytes, cudaMemcpyDeviceToHost, stream)); - //checkCudaErrors(cudaEventRecord(event, stream)); - // checkCudaErrors(cudaStreamSynchronize(stream)); - // for (size_t i = 0; i < num_cells; i++) - // fprintf(stderr, "h_he_after[%d]: %.16lf\n", i, h_he_new[i]); + // checkCudaErrors(cudaEventRecord(event, stream)); + // checkCudaErrors(cudaStreamSynchronize(stream)); + // for (size_t i = 0; i < num_cells; i++) + // fprintf(stderr, "h_he_after[%d]: %.16lf\n", i, h_he_new[i]); } void dfEEqn::sync() @@ -703,7 +712,7 @@ void dfEEqn::sync() void dfEEqn::updatePsi(double *Psi) { checkCudaErrors(cudaStreamSynchronize(stream)); - //checkCudaErrors(cudaEventSynchronize(event)); + // checkCudaErrors(cudaEventSynchronize(event)); memcpy(Psi, h_he_new, cell_bytes); } @@ -727,6 +736,8 @@ dfEEqn::~dfEEqn() checkCudaErrors(cudaFree(d_boundary_K)); checkCudaErrors(cudaFree(d_boundary_alphaEff_init)); checkCudaErrors(cudaFree(d_boundary_alphaEff)); + checkCudaErrors(cudaFree(d_boundary_gradient_init)); + checkCudaErrors(cudaFree(d_boundary_gradient)); checkCudaErrors(cudaFree(d_value_internal_coeffs_init)); checkCudaErrors(cudaFree(d_value_boundary_coeffs_init)); @@ -737,5 +748,5 @@ dfEEqn::~dfEEqn() checkCudaErrors(cudaFree(d_gradient_internal_coeffs)); checkCudaErrors(cudaFree(d_gradient_boundary_coeffs)); - //checkCudaErrors(cudaEventDestroy(event)); + // checkCudaErrors(cudaEventDestroy(event)); } diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 8ed3dcd32..3fd5eb69e 100644 --- 
a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -11,6 +11,7 @@ #include #include #include +#include static const char *_cudaGetErrorEnum(cudaError_t error) { @@ -40,6 +41,13 @@ inline void checkVectorEqual(int count, const double* basevec, double* vec, doub } } +enum boundaryConditions{ + zeroGradient, + fixedValue, + coupled +}; + +void constructBoundarySelector(std::vector& patchTypeSelector, const std::string& patchTypeStr, const int patchSize); struct dfMatrixDataBase { @@ -138,6 +146,8 @@ struct dfMatrixDataBase double *d_boundary_mut_sct = nullptr; double *d_boundary_hDiffCorrFlux = nullptr; + int *d_boundary_UpatchType = nullptr; + int *d_boundary_YpatchType = nullptr; std::vector boundPermutationList; std::vector ueqn_internalCoeffs, ueqn_boundaryCoeffs; @@ -145,6 +155,8 @@ struct dfMatrixDataBase std::vector boundary_pressure; std::vector boundary_face; std::vector boundary_deltaCoeffs; + std::vector> patch_type_init; + std::vector> patch_type; // - the device pointer to the permutated index list std::vector permedIndex; @@ -202,9 +214,9 @@ struct dfMatrixDataBase dfMatrixDataBase(int num_surfaces, int num_cells, int num_boundary_faces, int num_species, int & num_boundary_cells_output, const int *neighbour, const int *owner, const double* volume, const double* weight, const double* face_vector, const double* face, const double* deltaCoeffs, std::vector boundary_face_vector_init, std::vector boundary_face_init, - std::vector boundary_deltaCoeffs_init, std::vector boundary_cell_id_init) + std::vector boundary_deltaCoeffs_init, std::vector boundary_cell_id_init, std::vector> patch_type_init) : num_cells(num_cells), num_faces(num_surfaces*2), num_surfaces(num_surfaces), num_species(num_species), num_iteration(0), - num_boundary_faces(num_boundary_faces), h_volume(volume) + num_boundary_faces(num_boundary_faces), h_volume(volume), patch_type_init(patch_type_init) { // create cuda stream checkCudaErrors(cudaStreamCreate(&stream)); @@ -403,6 
+415,10 @@ struct dfMatrixDataBase boundary_face.resize(num_boundary_faces); boundary_deltaCoeffs.resize(num_boundary_faces); + patch_type.resize(2); + patch_type[0].resize(num_boundary_faces); + patch_type[1].resize(num_boundary_faces); + /** * 4. permutation list for field variables */ @@ -466,6 +482,8 @@ struct dfMatrixDataBase boundary_face_vector[3*i+2] = boundary_face_vector_init[3*boundPermutationList[i]+2]; boundary_face[i] = boundary_face_init[boundPermutationList[i]]; boundary_deltaCoeffs[i] = boundary_deltaCoeffs_init[boundPermutationList[i]]; + patch_type[0][i] = patch_type_init[0][boundPermutationList[i]]; + patch_type[1][i] = patch_type_init[1][boundPermutationList[i]]; } h_boundary_face_vector = boundary_face_vector.data(); h_boundary_face = boundary_face.data(); @@ -535,6 +553,8 @@ struct dfMatrixDataBase checkCudaErrors(cudaMalloc((void**)&d_boundary_rho, boundary_face_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_init, boundary_face_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_mut_sct, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_UpatchType, boundary_face_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_YpatchType, boundary_face_index_bytes)); for (size_t i = 0; i < num_species; ++i){ checkCudaErrors(cudaMalloc((void**)&d_boundary_Y_vector[i], boundary_face_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_Y_init_vector[i], boundary_face_bytes)); @@ -587,6 +607,8 @@ struct dfMatrixDataBase checkCudaErrors(cudaMemcpyAsync(d_boundary_face_vector, h_boundary_face_vector, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_boundary_face, h_boundary_face, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_boundary_deltaCoeffs, h_boundary_deltaCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_UpatchType, patch_type[0].data(), 
boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_YpatchType, patch_type[1].data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_permedIndex, permedIndex.data(), face_index_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_bouPermedIndex, boundPermutationList.data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); diff --git a/src_gpu/dfMatrixDataBase.cpp b/src_gpu/dfMatrixDataBase.cpp new file mode 100644 index 000000000..9c9f9a02a --- /dev/null +++ b/src_gpu/dfMatrixDataBase.cpp @@ -0,0 +1,41 @@ +#include "dfMatrixDataBase.H" + + +void constructBoundarySelector(std::vector& patchTypeSelector, const std::string& patchTypeStr, + const int patchSize) +{ + boundaryConditions patchCondition; + std::vector tmpSelector; + static std::map BCMap = { + {"zeroGradient", zeroGradient}, + {"fixedValue", fixedValue}, + {"coupled", coupled} + }; + auto iter = BCMap.find(patchTypeStr); + if (iter != BCMap.end()) { + patchCondition = iter->second; + } else { + throw std::runtime_error("Unknown boundary condition: " + patchTypeStr); + } + // zeroGradient labeled as 0, fixedValue labeled as 1, coupled labeled as 2 + switch (patchCondition){ + case zeroGradient: + { + tmpSelector.resize(patchSize, 0); + patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); + break; + } + case fixedValue: + { + tmpSelector.resize(patchSize, 1); + patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); + break; + } + case coupled: + { + tmpSelector.resize(patchSize, 2); + patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); + break; + } + } +} diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index 14288209b..ec739db5e 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -54,6 +54,8 @@ public: void updatePsi(double *Psi); + void 
correctBoundaryConditions(); + void correctPsi(double *Psi); void initializeTimeStep(); diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 6aa479796..0a4d80756 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -997,17 +997,26 @@ __global__ void addDiagDivVolume(int num_cells, const int *csr_row_index, __global__ void ueqn_update_BoundaryCoeffs_kernel(int num_boundary_faces, const double *boundary_phi, double *internal_coeffs, double *boundary_coeffs, double *laplac_internal_coeffs, - double *laplac_boundary_coeffs) + double *laplac_boundary_coeffs, const int *U_patch_type) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_boundary_faces) return; - // zeroGradient - double valueInternalCoeffs = 1.; - double valueBoundaryCoeffs = 0.; - double gradientInternalCoeffs = 0.; - double gradientBoundaryCoeffs = 0.; + int patchIndex = U_patch_type[index]; + double valueInternalCoeffs = 1., valueBoundaryCoeffs = 0., gradientInternalCoeffs = 0., gradientBoundaryCoeffs = 0.; // default to zeroGradient so unhandled patch types are well-defined + switch (patchIndex) + { + case 0: // zeroGradient + { + valueInternalCoeffs = 1.; + valueBoundaryCoeffs = 0.; + gradientInternalCoeffs = 0.; + gradientBoundaryCoeffs = 0.; + break; + } + // TODO implement coupled and fixedValue conditions + } internal_coeffs[index * 3 + 0] = boundary_phi[index] * valueInternalCoeffs; internal_coeffs[index * 3 + 1] = boundary_phi[index] * valueInternalCoeffs; @@ -1023,6 +1032,37 @@ __global__ void ueqn_update_BoundaryCoeffs_kernel(int num_boundary_faces, const laplac_boundary_coeffs[index * 3 + 2] = gradientBoundaryCoeffs; } +__global__ void ueqn_correct_BoundaryConditions_kernel(int num_cells, int num_boundary_cells, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *velocity, double *boundary_velocity, const int *U_patch_type) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index +
1]; + int cell_index = boundary_cell_id[cell_offset]; + + for (int i = cell_offset; i < next_cell_offset; i++) + { + int patchIndex = U_patch_type[i]; + switch (patchIndex) + { + case 0: // zeroGradient + { + boundary_velocity[i * 3 + 0] = velocity[cell_index]; + boundary_velocity[i * 3 + 1] = velocity[num_cells * 1 + cell_index]; + boundary_velocity[i * 3 + 2] = velocity[num_cells * 2 + cell_index]; + break; + } + case 1: + break; + // TODO implement coupled conditions + } + } +} + // constructor dfUEqn::dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile) : dataBase_(dataBase) @@ -1228,7 +1268,8 @@ void dfUEqn::initializeTimeStep() size_t blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block; ueqn_update_BoundaryCoeffs_kernel<<>>(dataBase_.num_boundary_faces, dataBase_.d_boundary_phi, dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs, - dataBase_.d_laplac_internal_coeffs, dataBase_.d_laplac_boundary_coeffs); + dataBase_.d_laplac_internal_coeffs, dataBase_.d_laplac_boundary_coeffs, + dataBase_.d_boundary_UpatchType); } void dfUEqn::checkValue(bool print) @@ -1360,6 +1401,15 @@ void dfUEqn::updatePsi(double *Psi) memcpy(Psi, h_psi, cell_vec_bytes); } +void dfUEqn::correctBoundaryConditions() +{ + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + ueqn_correct_BoundaryConditions_kernel<<>>(num_cells, num_boundary_cells, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + d_psi, dataBase_.d_boundary_velocity, dataBase_.d_boundary_UpatchType); +} + // correct volecity in pEqn void dfUEqn::correctPsi(double *Psi) { diff --git a/src_gpu/dfYEqn.H b/src_gpu/dfYEqn.H index 2bc645cdd..4d1c87457 100644 --- a/src_gpu/dfYEqn.H +++ b/src_gpu/dfYEqn.H @@ -32,6 +32,8 @@ private: double *d_boundary_Y = nullptr; double *d_grady, *d_boundary_grady = nullptr; + bool uploadBoundaryY = true; + public: 
dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile, const int inertIndex); @@ -57,6 +59,8 @@ public: void solve(); + void correctBoundaryConditions(); + void sync(); void updatePsi(double *Psi, int speciesIndex); diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu index 85db544da..22fe5e1aa 100644 --- a/src_gpu/dfYEqn.cu +++ b/src_gpu/dfYEqn.cu @@ -74,7 +74,7 @@ __global__ void fvc_grad_internal(int num_cells, __global__ void fvc_grad_boundary(int num_boundary_cells, const int *boundary_cell_offset, const int *boundary_cell_id, const int *bouPermedIndex, const double *boundary_face_vector, const double *boundary_species_init, - const double *volume, double *grady) + const double *volume, double *grady, bool uploadBoundaryY) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_boundary_cells) @@ -93,8 +93,16 @@ __global__ void fvc_grad_boundary(int num_boundary_cells, double sfx = boundary_face_vector[i * 3 + 0]; double sfy = boundary_face_vector[i * 3 + 1]; double sfz = boundary_face_vector[i * 3 + 2]; - int permute_index = bouPermedIndex[i]; - double face_Y = boundary_species_init[permute_index]; + double face_Y; + if (!uploadBoundaryY) + { + face_Y = boundary_species_init[i]; + } + else + { + int permute_index = bouPermedIndex[i]; + face_Y = boundary_species_init[permute_index]; + } grad_bx += face_Y * sfx; grad_by += face_Y * sfy; grad_bz += face_Y * sfz; @@ -158,13 +166,21 @@ __global__ void sumError_internal(int num_cells, __global__ void sumError_boundary(int num_boundary_faces, const int *bouPermedIndex, const double *boundary_hai, const double *boundary_rhoD, const double *boundary_y, const double *boundary_grady, - double *sum_boundary_hai_rhoD_grady, double *sum_boundary_rhoD_grady, double *sum_boundary_hai_y) + double *sum_boundary_hai_rhoD_grady, double *sum_boundary_rhoD_grady, double *sum_boundary_hai_y, bool uploadBoundaryY) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= 
num_boundary_faces) return; - int permute_index = bouPermedIndex[index]; + int permute_index; + if (!uploadBoundaryY) + { + permute_index = index; + } + else + { + permute_index = bouPermedIndex[index]; + } sum_boundary_hai_rhoD_grady[index * 3 + 0] += boundary_hai[permute_index] * boundary_rhoD[permute_index] * boundary_grady[index * 3 + 0]; sum_boundary_hai_rhoD_grady[index * 3 + 1] += boundary_hai[permute_index] * boundary_rhoD[permute_index] * boundary_grady[index * 3 + 1]; sum_boundary_hai_rhoD_grady[index * 3 + 2] += boundary_hai[permute_index] * boundary_rhoD[permute_index] * boundary_grady[index * 3 + 2]; @@ -377,8 +393,6 @@ __global__ void fvm_div_boundary_scalar(int num_cells, int num_faces, int num_bo double boundary_coeffs_own = 0; for (int i = 0; i < loop_size; i++) { - // zeroGradient - internal_coeffs[cell_offset + i] = 1.; internal_coeffs_own += boundary_phi[cell_offset + i] * internal_coeffs[cell_offset + i]; boundary_coeffs_own += -boundary_phi[cell_offset + i] * boundary_coeffs[cell_offset + i]; } @@ -462,10 +476,8 @@ __global__ void fvm_laplacian_uncorrected_scalar_boundary(int num_boundary_cells int permute_index = bouPermedIndex[i]; double gamma = boundary_scalar0[permute_index] + boundary_scalar1[permute_index]; double gamma_magsf = gamma * boundary_magsf[i]; - // internal_coeffs += gamma_magsf * gradient_internal_coeffs[i * 3 + 0]; - // boundary_coeffs += gamma_magsf * gradient_boundary_coeffs[i * 3 + 0]; - internal_coeffs += gamma_magsf * 0.; - boundary_coeffs += gamma_magsf * 0.; + internal_coeffs += gamma_magsf * gradient_internal_coeffs[i]; + boundary_coeffs += gamma_magsf * gradient_boundary_coeffs[i]; } A_csr_output[csr_index] = A_csr_input[csr_index] + internal_coeffs * sign; @@ -521,6 +533,67 @@ __global__ void fvc_laplacian_internal(int num_cells, output[index] += sum / vol; } +__global__ void yeqn_update_BoundaryCoeffs_kernel(int num_boundary_faces, const double *boundary_phi, double *internal_coeffs, + double *boundary_coeffs, 
double *laplac_internal_coeffs, + double *laplac_boundary_coeffs, const int *U_patch_type) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_faces) + return; + + int patchIndex = U_patch_type[index]; + double valueInternalCoeffs = 1., valueBoundaryCoeffs = 0., gradientInternalCoeffs = 0., gradientBoundaryCoeffs = 0.; // default to zeroGradient so unhandled patch types are well-defined + switch (patchIndex) + { + case 0: // zeroGradient + { + valueInternalCoeffs = 1.; + valueBoundaryCoeffs = 0.; + gradientInternalCoeffs = 0.; + gradientBoundaryCoeffs = 0.; + break; + } + // TODO implement coupled and fixedValue conditions + } + + internal_coeffs[index] = valueInternalCoeffs; + boundary_coeffs[index] = valueBoundaryCoeffs; + laplac_internal_coeffs[index] = gradientInternalCoeffs; + laplac_boundary_coeffs[index] = gradientBoundaryCoeffs; +} + +__global__ void yeqn_correct_BoundaryConditions_kernel(int num_cells, int num_boundary_cells, int num_boundary_faces, int num_species, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *species, double *boundary_species, const int *Y_patch_type) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + for (int i = cell_offset; i < next_cell_offset; i++) + { + int patchIndex = Y_patch_type[i]; + switch (patchIndex) + { + case 0: // zeroGradient + { + for (int speciesID = 0; speciesID < num_species; speciesID++) + { + boundary_species[speciesID * num_boundary_faces + i] = species[speciesID * num_cells + cell_index]; + } + break; + } + case 1: + break; + // TODO implement coupled conditions + } + } +} + dfYEqn::dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile, const int inertIndex) : dataBase_(dataBase), inertIndex(inertIndex) { @@ -554,7 +627,7 @@ dfYEqn::dfYEqn(dfMatrixDataBase &dataBase, const std::string
&modeStr, const std checkCudaErrors(cudaMalloc((void **)&d_mut_Sct, cell_bytes)); checkCudaErrors(cudaMalloc((void **)&d_boundary_mut_sct, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void **)&d_boundary_Y, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_boundary_Y, boundary_face_bytes * num_species)); checkCudaErrors(cudaMalloc((void **)&d_hai, cell_bytes)); checkCudaErrors(cudaMalloc((void **)&d_boundary_hai, boundary_face_bytes)); @@ -591,6 +664,17 @@ void dfYEqn::initializeTimeStep() checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_bytes * (num_species - 1), stream)); // initialize variables in each time step checkCudaErrors(cudaMemsetAsync(d_psi, 0, cell_bytes * (num_species - 1), stream)); + + // initialize boundary coeffs + size_t threads_per_block = 1024; + size_t blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block; + for (int i = 0; i < num_species; i++) + { + yeqn_update_BoundaryCoeffs_kernel<<>>(dataBase_.num_boundary_faces, dataBase_.d_boundary_phi, + dataBase_.d_internal_coeffs_Y_vector[i], dataBase_.d_boundary_coeffs_Y_vector[i], + dataBase_.d_laplac_internal_coeffs_Y_vector[i], dataBase_.d_laplac_boundary_coeffs_Y_vector[i], + dataBase_.d_boundary_YpatchType); + } } void dfYEqn::upwindWeight() @@ -622,10 +706,14 @@ void dfYEqn::fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(std::vecto size_t threads_per_block, blocks_per_grid; int mtxIndex = 0; + for (size_t i = 0; i < num_species; ++i) { checkCudaErrors(cudaMemcpyAsync(dataBase_.d_Y + i * num_cells, Y_old[i], cell_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_Y, boundary_Y[i], boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + if (uploadBoundaryY) + { + checkCudaErrors(cudaMemcpyAsync(d_boundary_Y + i * num_boundary_faces, boundary_Y[i], boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + } checkCudaErrors(cudaMemcpyAsync(d_hai, hai[i], cell_bytes, cudaMemcpyHostToDevice, 
stream)); checkCudaErrors(cudaMemcpyAsync(d_boundary_hai, boundary_hai[i], boundary_face_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_rhoD, rhoD[i], cell_bytes, cudaMemcpyHostToDevice, stream)); @@ -644,8 +732,8 @@ void dfYEqn::fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(std::vecto blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; fvc_grad_boundary<<>>(num_boundary_cells, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_bouPermedIndex, - dataBase_.d_boundary_face_vector, d_boundary_Y, - dataBase_.d_volume, d_grady); + dataBase_.d_boundary_face_vector, d_boundary_Y + i * num_boundary_faces, + dataBase_.d_volume, d_grady, uploadBoundaryY); blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; correct_boundary_conditions<<>>(num_boundary_cells, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, @@ -662,8 +750,8 @@ void dfYEqn::fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(std::vecto blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block; sumError_boundary<<>>(num_boundary_faces, dataBase_.d_bouPermedIndex, - d_boundary_hai, d_boundary_rhoD, d_boundary_Y, d_boundary_grady, - d_sum_boundary_hai_rhoD_grady, d_sum_boundary_rhoD_grady, d_sum_boundary_hai_y); + d_boundary_hai, d_boundary_rhoD, d_boundary_Y + i * num_boundary_faces, d_boundary_grady, + d_sum_boundary_hai_rhoD_grady, d_sum_boundary_rhoD_grady, d_sum_boundary_hai_y, uploadBoundaryY); // compute diffAlphaD blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; @@ -692,6 +780,7 @@ void dfYEqn::fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(std::vecto } ++mtxIndex; } + uploadBoundaryY = false; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; calculate_hDiffCorrFlux<<>>(num_cells, @@ -892,6 +981,21 @@ void dfYEqn::updatePsi(double *Psi, int speciesIndex) memcpy(Psi, h_psi +
speciesIndex * num_cells, cell_bytes); } +void dfYEqn::correctBoundaryConditions() +{ + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + yeqn_correct_BoundaryConditions_kernel<<>>(num_cells, num_boundary_cells, num_boundary_faces, num_species, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_Y, d_boundary_Y, dataBase_.d_boundary_YpatchType); + // double *h_boundary_Y = new double[num_boundary_faces]; + // cudaMemcpy(h_boundary_Y, d_boundary_Y, num_boundary_faces * sizeof(double), cudaMemcpyDeviceToHost); + // for (int i = 0; i < num_boundary_faces; i++) + // { + // printf("h_boundary_GPU[%d] = %e\n", i, h_boundary_Y[i]); + // } +} + dfYEqn::~dfYEqn() { }