deepmodeling · maorz1998 · Jun 11, 2023 · Jun 11, 2023
diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H
@@ -28,6 +28,8 @@
 #ifdef GPUSolver_
     start1 = std::clock();
     UEqn_GPU.updatePsi(&U[0][0]);
+    UEqn_GPU.correctBoundaryConditions();
+    U.correctBoundaryConditions();
     K = 0.5*magSqr(U);
     end1 = std::clock();
     time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
@@ -51,8 +53,10 @@
 
         const scalarField& patchK = K.boundaryField()[patchi];
         const scalarField& patchAlphaEff = alphaEff.boundaryField()[patchi];
+        const scalarField& patchGrad = he.boundaryField()[patchi].gradientBoundaryCoeffs(); // gradient_
         memcpy(boundary_K + eeqn_offset, &patchK[0], patchSize*sizeof(double));
         memcpy(boundary_alphaEff + eeqn_offset, &patchAlphaEff[0], patchSize*sizeof(double));
+        memcpy(boundary_gradient + eeqn_offset, &patchGrad[0], patchSize*sizeof(double));
 
         eeqn_offset += patchSize;
     }
@@ -67,7 +71,7 @@
     // prepare data on GPU
     start1 = std::clock();
     EEqn_GPU.prepare_data(&he.oldTime()[0], &K[0], &K.oldTime()[0], &alphaEff[0],
-            &dpdt[0], boundary_K, boundary_alphaEff);
+            &dpdt[0], boundary_K, boundary_alphaEff, boundary_gradient);
     EEqn_GPU.sync();
     end1 = std::clock();
     time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);

diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H
@@ -39,7 +39,11 @@ time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
     {
         volScalarField& Yi = Y[i];
         Y_old[i] = &Yi.oldTime()[0];
-        cudaMallocHost(&boundary_Y[i], num_boundary_faces*sizeof(double));
+        if (updateBoundaryFields)
+        {
+            cudaMallocHost(&boundary_Y[i], num_boundary_faces*sizeof(double));
+            Info << "here" << endl;
+        }
         const volScalarField& haii = chemistry->hai(i);
         const volScalarField& rhoDi = chemistry->rhoD(i);
         hai[i] = &haii[0];
@@ -54,12 +58,16 @@ time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
             const scalarField& patchRhoDi = rhoDi.boundaryField()[patchi];
             int patchSize = patchYi.size();
 
-            memcpy(boundary_Y[i] + offset, &patchYi[0], patchSize*sizeof(double));
+            if (updateBoundaryFields)
+            {
+                memcpy(boundary_Y[i] + offset, &patchYi[0], patchSize*sizeof(double));
+            }
             memcpy(boundary_hai[i] + offset, &patchHaii[0], patchSize*sizeof(double));
             memcpy(boundary_rhoD[i] + offset, &patchRhoDi[0], patchSize*sizeof(double));
             offset += patchSize;
         }
     }
+    updateBoundaryFields = false;
     volScalarField mut_sct = turbulence->mut().ref()/Sct;
     double *boundary_mutsct = nullptr;
     cudaMallocHost(&boundary_mutsct, num_boundary_faces*sizeof(double));
@@ -124,6 +132,7 @@ if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM);
         YEqn_GPU.updatePsi(&Yi[0], i);
         Yi.correctBoundaryConditions();
     }
+    YEqn_GPU.correctBoundaryConditions();
     end1 = std::clock();
     time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
     time_monitor_YEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC);

diff --git a/applications/solvers/dfLowMachFoam/createdfSolver.H b/applications/solvers/dfLowMachFoam/createdfSolver.H
@@ -7,6 +7,8 @@ std::vector<int> boundaryCellIndex;
 std::vector<double> boundary_face_vector_init;
 std::vector<double> boundary_face_init;
 std::vector<double> boundary_deltaCoeffs_init;
+std::vector<std::vector<int>> patchTypes;
+std::vector<int> patchTypeU, patchTypeY;
 int num_boundary_faces = 0;
 int patchSize;
 forAll(mesh.boundary(), patchi)
@@ -22,14 +24,20 @@ forAll(mesh.boundary(), patchi)
     boundary_face_init.insert(boundary_face_init.end(), &pMagSf[0], &pMagSf[0]+patchSize);
     boundary_deltaCoeffs_init.insert(boundary_deltaCoeffs_init.end(), &pDeltaCoeffs[0], &pDeltaCoeffs[0]+patchSize);
     num_boundary_faces += patchSize;
+
+    constructBoundarySelector(patchTypeU, U.boundaryField()[patchi].type(), patchSize);
+    constructBoundarySelector(patchTypeY, Y[0].boundaryField()[patchi].type(), patchSize);
 }
+patchTypes.emplace_back(patchTypeU);
+patchTypes.emplace_back(patchTypeY);
+
 int num_boundary_cells;
 
 string settingPath;
 settingPath = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string(""));
 
 dfMatrixDataBase dfDataBase(num_surfaces, num_cells, num_boundary_faces, Y.size(), num_boundary_cells, &neighbour[0], &owner[0], &mesh.V()[0], &mesh.surfaceInterpolation::weights()[0], 
-&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.nonOrthDeltaCoeffs()[0], boundary_face_vector_init, boundary_face_init, boundary_deltaCoeffs_init, boundaryCellIndex);
+&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.nonOrthDeltaCoeffs()[0], boundary_face_vector_init, boundary_face_init, boundary_deltaCoeffs_init, boundaryCellIndex, patchTypes);
 
 #ifdef GPUSolver_
 dfRhoEqn rhoEqn_GPU(dfDataBase);
@@ -49,12 +57,10 @@ cudaMallocHost(&boundary_nuEff_init, num_boundary_faces*sizeof(double));
 cudaMallocHost(&boundary_rho_init, num_boundary_faces*sizeof(double));
 cudaMallocHost(&boundary_phi_init, num_boundary_faces*sizeof(double));
 
-double *boundary_alphaEff, *boundary_K;
-double *eeqn_valueInternalCoeffs, *eeqn_valueBoundaryCoeffs, *eeqn_gradientInternalCoeffs, *eeqn_gradientBoundaryCoeffs;
+double *boundary_alphaEff, *boundary_K, *boundary_gradient;
 cudaMallocHost(&boundary_K, num_boundary_faces*sizeof(double));
 cudaMallocHost(&boundary_alphaEff, num_boundary_faces*sizeof(double));
-cudaMallocHost(&eeqn_valueInternalCoeffs, num_boundary_faces*sizeof(double));
-cudaMallocHost(&eeqn_valueBoundaryCoeffs, num_boundary_faces*sizeof(double));
-cudaMallocHost(&eeqn_gradientInternalCoeffs, num_boundary_faces*sizeof(double));
-cudaMallocHost(&eeqn_gradientBoundaryCoeffs, num_boundary_faces*sizeof(double));
+cudaMallocHost(&boundary_gradient, num_boundary_faces * sizeof(double));
+
+bool updateBoundaryFields = true; // make sure that the boundary fields do H2D copy at 1st timestep
 #endif
diff --git a/src_gpu/CMakeLists.txt b/src_gpu/CMakeLists.txt
@@ -24,7 +24,8 @@ add_library(${PROJECT_NAME}
         dfRhoEqn.cu 
         dfYEqn.cu
         dfEEqn.cu
-        AmgXSolver.cu)
+        AmgXSolver.cu
+        dfMatrixDataBase.cpp)
 
 target_link_libraries(${PROJECT_NAME}
     ${MPI_LIBRARIES}

diff --git a/src_gpu/dfEEqn.H b/src_gpu/dfEEqn.H
@@ -42,14 +42,16 @@ private:
     double *d_value_boundary_coeffs = nullptr;
     double *d_gradient_internal_coeffs = nullptr;
     double *d_gradient_boundary_coeffs = nullptr;
+    double *d_boundary_gradient_init = nullptr;
+    double *d_boundary_gradient = nullptr;
 
 public:
     dfEEqn();
     dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile);
     ~dfEEqn();
 
     void prepare_data(const double *he_old, const double *K, const double *K_old, const double *alphaEff,
-            const double *dpdt, const double *boundary_K, const double *boundary_alphaEff);
+            const double *dpdt, const double *boundary_K, const double *boundary_alphaEff, const double *boundary_gradient);
 
     void initializeTimeStep();
 

diff --git a/src_gpu/dfEEqn.cu b/src_gpu/dfEEqn.cu
@@ -399,9 +399,10 @@ __global__ void eeqn_add_to_source_kernel(int num_cells,
 
 __global__ void eeqn_boundaryPermutation(const int num_boundary_faces, const int *bouPermedIndex,
                                          const double *boundary_K_init,
-                                         const double *boundary_alphaEff_init,
+                                         const double *boundary_alphaEff_init, const double *boundary_gradient_init,
                                          double *boundary_K,
-                                         double *boundary_alphaEff)
+                                         double *boundary_alphaEff,
+                                         double *boundary_gradient)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
     if (index >= num_boundary_faces)
@@ -411,21 +412,25 @@ __global__ void eeqn_boundaryPermutation(const int num_boundary_faces, const int
 
     boundary_K[index] = boundary_K_init[p];
     boundary_alphaEff[index] = boundary_alphaEff_init[p];
+    boundary_gradient[index] = boundary_gradient_init[p];
 }
 
-__global__ void eeqn_update_BoundaryCoeffs_kernel(int num_boundary_faces, const double *boundary_phi, double *internal_coeffs,
+__global__ void eeqn_update_BoundaryCoeffs_kernel(int num_boundary_faces, const double *boundary_phi,
+                                                  double *gradient, const double *boundary_deltaCoeffs,
+                                                  double *internal_coeffs,
                                                   double *boundary_coeffs, double *laplac_internal_coeffs,
                                                   double *laplac_boundary_coeffs)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
     if (index >= num_boundary_faces)
         return;
 
-    // zeroGradient
+    double grad = gradient[index];
+    // energyGradient
     double valueInternalCoeffs = 1.;
-    double valueBoundaryCoeffs = 0.;
+    double valueBoundaryCoeffs = grad / boundary_deltaCoeffs[index];
     double gradientInternalCoeffs = 0.;
-    double gradientBoundaryCoeffs = 0.;
+    double gradientBoundaryCoeffs = grad;
 
     internal_coeffs[index] = boundary_phi[index] * valueInternalCoeffs;
     boundary_coeffs[index] = -boundary_phi[index] * valueBoundaryCoeffs;
@@ -440,7 +445,7 @@ dfEEqn::dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std
     ESolver = new AmgXSolver(modeStr, cfgFile);
 
     stream = dataBase_.stream;
-    //checkCudaErrors(cudaEventCreate(&event));
+    // checkCudaErrors(cudaEventCreate(&event));
 
     num_cells = dataBase_.num_cells;
     cell_bytes = dataBase_.cell_bytes;
@@ -473,6 +478,8 @@ dfEEqn::dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std
     checkCudaErrors(cudaMalloc((void **)&d_boundary_K, boundary_face_bytes));
     checkCudaErrors(cudaMalloc((void **)&d_boundary_alphaEff_init, boundary_face_bytes));
     checkCudaErrors(cudaMalloc((void **)&d_boundary_alphaEff, boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_boundary_gradient_init, boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_boundary_gradient, boundary_face_bytes));
 
     checkCudaErrors(cudaMalloc((void **)&d_value_internal_coeffs_init, boundary_face_bytes));
     checkCudaErrors(cudaMalloc((void **)&d_value_boundary_coeffs_init, boundary_face_bytes));
@@ -485,7 +492,7 @@ dfEEqn::dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std
 }
 
 void dfEEqn::prepare_data(const double *he_old, const double *K, const double *K_old, const double *alphaEff,
-                          const double *dpdt, const double *boundary_K, const double *boundary_alphaEff)
+                          const double *dpdt, const double *boundary_K, const double *boundary_alphaEff, const double *boundary_gradient)
 {
     // TODO not real async copy now, because some host array are not in pinned memory.
 
@@ -499,12 +506,13 @@ void dfEEqn::prepare_data(const double *he_old, const double *K, const double *K
     // copy and permutate boundary variable
     checkCudaErrors(cudaMemcpyAsync(d_boundary_K_init, boundary_K, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
     checkCudaErrors(cudaMemcpyAsync(d_boundary_alphaEff_init, boundary_alphaEff, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_boundary_gradient_init, boundary_gradient, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
 
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block;
     eeqn_boundaryPermutation<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_faces, dataBase_.d_bouPermedIndex,
-            d_boundary_K_init, d_boundary_alphaEff_init,
-            d_boundary_K, d_boundary_alphaEff);
+                                                                                d_boundary_K_init, d_boundary_alphaEff_init, d_boundary_gradient_init,
+                                                                                d_boundary_K, d_boundary_alphaEff, d_boundary_gradient);
 }
 
 void dfEEqn::initializeTimeStep()
@@ -516,6 +524,7 @@ void dfEEqn::initializeTimeStep()
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block;
     eeqn_update_BoundaryCoeffs_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(dataBase_.num_boundary_faces, dataBase_.d_boundary_phi,
+                                                                                         d_boundary_gradient, dataBase_.d_boundary_deltaCoeffs,
                                                                                          d_value_internal_coeffs, d_value_boundary_coeffs,
                                                                                          d_gradient_internal_coeffs, d_gradient_boundary_coeffs);
 }
@@ -577,14 +586,14 @@ void dfEEqn::fvc_div_vector()
     size_t threads_per_block = 512;
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     eeqn_fvc_div_vector_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-            d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-            dataBase_.d_face_vector, dataBase_.d_hDiffCorrFlux, dataBase_.d_weight,
-            1., d_b, d_b);
+                                                                                    d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
+                                                                                    dataBase_.d_face_vector, dataBase_.d_hDiffCorrFlux, dataBase_.d_weight,
+                                                                                    1., d_b, d_b);
     blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
     eeqn_fvc_div_vector_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
-            dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-            dataBase_.d_boundary_face_vector, dataBase_.d_boundary_hDiffCorrFlux,
-            1., d_b, d_b);
+                                                                                    dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+                                                                                    dataBase_.d_boundary_face_vector, dataBase_.d_boundary_hDiffCorrFlux,
+                                                                                    1., d_b, d_b);
 }
 
 void dfEEqn::fvc_div_phi_scalar()
@@ -611,8 +620,8 @@ void dfEEqn::add_to_source()
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     // " + fvc::ddt(rho，K)" is on the left side of "==", thus should minus from source.
     eeqn_add_to_source_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-            //1., d_dpdt, -1., d_diffAlphaD, dataBase_.d_volume, d_b, d_b);
-            1., d_dpdt, -1., dataBase_.d_diffAlphaD, dataBase_.d_volume, d_b, d_b);
+                                                                                 // 1., d_dpdt, -1., d_diffAlphaD, dataBase_.d_volume, d_b, d_b);
+                                                                                 1., d_dpdt, -1., dataBase_.d_diffAlphaD, dataBase_.d_volume, d_b, d_b);
 }
 
 void dfEEqn::checkValue(bool print)
@@ -689,10 +698,10 @@ void dfEEqn::solve()
     num_iteration++;
 
     checkCudaErrors(cudaMemcpyAsync(h_he_new, d_he_old, cell_bytes, cudaMemcpyDeviceToHost, stream));
-    //checkCudaErrors(cudaEventRecord(event, stream));
-    // checkCudaErrors(cudaStreamSynchronize(stream));
-    // for (size_t i = 0; i < num_cells; i++)
-    //     fprintf(stderr, "h_he_after[%d]: %.16lf\n", i, h_he_new[i]);
+    // checkCudaErrors(cudaEventRecord(event, stream));
+    //  checkCudaErrors(cudaStreamSynchronize(stream));
+    //  for (size_t i = 0; i < num_cells; i++)
+    //      fprintf(stderr, "h_he_after[%d]: %.16lf\n", i, h_he_new[i]);
 }
 
 void dfEEqn::sync()
@@ -703,7 +712,7 @@ void dfEEqn::sync()
 void dfEEqn::updatePsi(double *Psi)
 {
     checkCudaErrors(cudaStreamSynchronize(stream));
-    //checkCudaErrors(cudaEventSynchronize(event));
+    // checkCudaErrors(cudaEventSynchronize(event));
     memcpy(Psi, h_he_new, cell_bytes);
 }
 
@@ -727,6 +736,8 @@ dfEEqn::~dfEEqn()
     checkCudaErrors(cudaFree(d_boundary_K));
     checkCudaErrors(cudaFree(d_boundary_alphaEff_init));
     checkCudaErrors(cudaFree(d_boundary_alphaEff));
+    checkCudaErrors(cudaFree(d_boundary_gradient_init));
+    checkCudaErrors(cudaFree(d_boundary_gradient));
 
     checkCudaErrors(cudaFree(d_value_internal_coeffs_init));
     checkCudaErrors(cudaFree(d_value_boundary_coeffs_init));
@@ -737,5 +748,5 @@ dfEEqn::~dfEEqn()
     checkCudaErrors(cudaFree(d_gradient_internal_coeffs));
     checkCudaErrors(cudaFree(d_gradient_boundary_coeffs));
 
-    //checkCudaErrors(cudaEventDestroy(event));
+    // checkCudaErrors(cudaEventDestroy(event));
 }