deepmodeling · maorz1998 · May 29, 2023 · May 29, 2023
diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H
@@ -30,7 +30,7 @@ end1 = std::clock();
 
 #ifdef GPUSolver_
     start1 = std::clock();
-    UEqn_GPU.fvm_ddt(&rho.oldTime()[0], &rho[0], &U.oldTime()[0][0]);
+    UEqn_GPU.fvm_ddt(&U.oldTime()[0][0]);
     start2 = std::clock();
     int offset = 0;
     const tmp<volScalarField> nuEff_tmp(turbulence->nuEff());
@@ -69,24 +69,6 @@ end1 = std::clock();
         memcpy(boundary_nuEff_init+offset, &patchNuEff[0], patchSize*sizeof(double));
         // boundary rho
         memcpy(boundary_rho_init+offset, &patchRho[0], patchSize*sizeof(double));
-
-        // // only need to construct once
-        // std::copy(&ueqn_internalCoeffs_vec[0][0], &ueqn_internalCoeffs_vec[0][0]+3*patchSize, ueqn_internalCoeffs_init + 3*offset);
-        // // need to construct every time step
-        // std::copy(&ueqn_boundaryCoeffs_vec[0][0], &ueqn_boundaryCoeffs_vec[0][0]+3*patchSize, ueqn_boundaryCoeffs_init + 3*offset);
-        // // laplacian internalCoeffs
-        // std::copy(&ueqn_laplac_internalCoeffs_vec[0][0], &ueqn_laplac_internalCoeffs_vec[0][0]+3*patchSize, ueqn_laplac_internalCoeffs_init + 3*offset);
-        // // laplacian boundaryCoeffs
-        // std::copy(&ueqn_laplac_boundaryCoeffs_vec[0][0], &ueqn_laplac_boundaryCoeffs_vec[0][0]+3*patchSize, ueqn_laplac_boundaryCoeffs_init + 3*offset);
-        // // boundary pressure
-        // std::copy(&patchP[0], &patchP[0]+patchSize, boundary_pressure_init+offset);
-        // // boundary velocity
-        // std::copy(&patchU[0][0], &patchU[0][0]+3*patchSize, boundary_velocity_init+3*offset);
-        // // boundary nuEff
-        // std::copy(&patchNuEff[0], &patchNuEff[0]+patchSize, boundary_nuEff_init+offset);
-        // // boundary rho
-        // std::copy(&patchRho[0], &patchRho[0]+patchSize, boundary_rho_init+offset);
-
         offset += patchSize;
     }
     end1 = std::clock();
@@ -97,8 +79,8 @@ end1 = std::clock();
 
     start1 = std::clock();
     UEqn_GPU.fvm_div(&phi[0], ueqn_internalCoeffs_init, ueqn_boundaryCoeffs_init, 
-    ueqn_laplac_internalCoeffs_init, ueqn_laplac_boundaryCoeffs_init, 
-    boundary_pressure_init, boundary_velocity_init, boundary_nuEff_init, boundary_rho_init);
+        ueqn_laplac_internalCoeffs_init, ueqn_laplac_boundaryCoeffs_init, 
+        boundary_pressure_init, boundary_velocity_init, boundary_nuEff_init, boundary_rho_init);
     UEqn_GPU.fvc_grad(&p[0]);
     UEqn_GPU.fvc_grad_vector();
     UEqn_GPU.dev2T();

diff --git a/applications/solvers/dfLowMachFoam/createdfSolver.H b/applications/solvers/dfLowMachFoam/createdfSolver.H
@@ -32,14 +32,16 @@ dfMatrixDataBase dfDataBase(num_surfaces, num_cells, num_boundary_faces, num_bou
 &mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.nonOrthDeltaCoeffs()[0], boundary_face_vector_init, boundary_face_init, boundary_deltaCoeffs_init, boundaryCellIndex);
 
 dfUEqn UEqn_GPU(dfDataBase, "dDDI", settingPath);
+dfRhoEqn rhoEqn_GPU(dfDataBase);
 
 double *ueqn_internalCoeffs_init, *ueqn_boundaryCoeffs_init, *boundary_pressure_init, *boundary_velocity_init,
-    *boundary_nuEff_init, *boundary_rho_init, *ueqn_laplac_internalCoeffs_init, *ueqn_laplac_boundaryCoeffs_init;
+    *boundary_nuEff_init, *boundary_rho_init, *ueqn_laplac_internalCoeffs_init, *ueqn_laplac_boundaryCoeffs_init, *boundary_phi_init;
 cudaMallocHost(&ueqn_internalCoeffs_init, 3*num_boundary_faces*sizeof(double));
 cudaMallocHost(&ueqn_boundaryCoeffs_init, 3*num_boundary_faces*sizeof(double));
 cudaMallocHost(&ueqn_laplac_internalCoeffs_init, 3*num_boundary_faces*sizeof(double));
 cudaMallocHost(&ueqn_laplac_boundaryCoeffs_init, 3*num_boundary_faces*sizeof(double));
 cudaMallocHost(&boundary_velocity_init, 3*num_boundary_faces*sizeof(double));
 cudaMallocHost(&boundary_pressure_init, num_boundary_faces*sizeof(double));
 cudaMallocHost(&boundary_nuEff_init, num_boundary_faces*sizeof(double));
-cudaMallocHost(&boundary_rho_init, num_boundary_faces*sizeof(double));
+cudaMallocHost(&boundary_rho_init, num_boundary_faces*sizeof(double));
+cudaMallocHost(&boundary_phi_init, num_boundary_faces*sizeof(double));
diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
@@ -60,6 +60,7 @@ Description
 #include "CombustionModel.H"
 
 #include "dfUEqn.H"
+#include "dfRhoEqn.H"
 #include <cuda_runtime.h>
 #include <thread>
 

diff --git a/applications/solvers/dfLowMachFoam/rhoEqn.H b/applications/solvers/dfLowMachFoam/rhoEqn.H
@@ -29,6 +29,7 @@ Description
 
 \*---------------------------------------------------------------------------*/
 
+#ifdef CPUSolver_
 {
     fvScalarMatrix rhoEqn
     (
@@ -38,5 +39,24 @@ Description
 
     rhoEqn.solve();
 }
+#endif
+#ifdef GPUSolver_
+{
+    rho.oldTime();
+
+    int offset = 0;
+    forAll(U.boundaryField(), patchi)
+    {
+        const fvsPatchScalarField& patchFlux = phi.boundaryField()[patchi];
+        int patchSize = patchFlux.size();
+        memcpy(boundary_phi_init+offset, &patchFlux[0], patchSize*sizeof(double));
+        offset += patchSize;
+    }
+    rhoEqn_GPU.fvc_div(&phi[0], boundary_phi_init);
+    rhoEqn_GPU.fvm_ddt(&rho.oldTime()[0], &rho[0]);
+    rhoEqn_GPU.updatePsi(&rho.primitiveFieldRef()[0]);
+    rho.correctBoundaryConditions();
+}
+#endif
 
 // ************************************************************************* //
diff --git a/src_gpu/CMakeLists.txt b/src_gpu/CMakeLists.txt
@@ -18,7 +18,7 @@ include_directories(
     $ENV{AMGX_DIR}/include
 )
 
-add_library(${PROJECT_NAME} SHARED dfUEqn.cu AmgXSolver.cu)
+add_library(${PROJECT_NAME} SHARED dfUEqn.cu dfRhoEqn.cu AmgXSolver.cu)
 
 target_link_libraries(${PROJECT_NAME}
     ${MPI_LIBRARIES}

diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H
@@ -29,7 +29,7 @@ void check(T result, char const *const func, const char *const file,
 
 #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
 
-void checkVectorEqual(int count, const double* basevec, double* vec, double max_relative_error) {
+inline void checkVectorEqual(int count, const double* basevec, double* vec, double max_relative_error) {
     for (size_t i = 0; i < count; ++i)
     {
         double abs_diff = fabs(basevec[i] - vec[i]);
@@ -110,6 +110,7 @@ struct dfMatrixDataBase
     *d_laplac_internal_coeffs_init = nullptr, *d_laplac_boundary_coeffs_init = nullptr,
     *d_boundary_pressure = nullptr, *d_boundary_face_vector = nullptr,
     *d_boundary_pressure_init = nullptr,
+    *d_boundary_phi = nullptr, *d_boundary_phi_init = nullptr, 
     *d_boundary_velocity = nullptr, *d_boundary_velocity_init = nullptr,
     *d_boundary_nuEff = nullptr, *d_boundary_nuEff_init = nullptr,
     *d_boundary_rho = nullptr, *d_boundary_rho_init = nullptr;
@@ -167,10 +168,7 @@ struct dfMatrixDataBase
      * @brief cuda functions
      */
     cudaStream_t stream;
-    /**
-     * @brief AmgX functions
-     */
-    AmgXSolver* UxSolver = nullptr, *UySolver = nullptr, *UzSolver = nullptr;
+
     int num_iteration;
 
     double time_monitor_CPU;
@@ -191,6 +189,7 @@ struct dfMatrixDataBase
     {
         // allocate field pointer in pin memory
         cudaMallocHost(&h_phi_init, num_faces * sizeof(double));
+        cudaMallocHost(&h_rho_old, num_cells * sizeof(double));
 
         h_weight_vec_init.resize(num_faces);
         h_weight_vec.resize(num_faces);
@@ -478,6 +477,8 @@ struct dfMatrixDataBase
 
         checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure_init, boundary_face_bytes));
         checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure, boundary_face_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_boundary_phi_init, boundary_face_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_boundary_phi, boundary_face_bytes));
         checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity_init, boundary_face_vec_bytes));
         checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity, boundary_face_vec_bytes));
         checkCudaErrors(cudaMalloc((void**)&d_boundary_face_vector, boundary_face_vec_bytes));
@@ -495,7 +496,7 @@ struct dfMatrixDataBase
         checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff_init, boundary_face_bytes));
         checkCudaErrors(cudaMalloc((void**)&d_boundary_rho, boundary_face_bytes));
         checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_init, boundary_face_bytes));
-        total_bytes += (boundary_face_bytes*8 + boundary_face_vec_bytes * 11);
+        total_bytes += (boundary_face_bytes*10 + boundary_face_vec_bytes * 11);
 
         // checkCudaErrors(cudaMalloc((void**)&d_A_csr, csr_value_vec_bytes));
         // checkCudaErrors(cudaMalloc((void**)&d_b, cell_vec_bytes));

diff --git a/src_gpu/dfRhoEqn.H b/src_gpu/dfRhoEqn.H
@@ -0,0 +1,43 @@
+#pragma once
+#include "dfMatrixDataBase.H"
+
+/*
+    fvScalarMatrix rhoEqn
+    (
+        fvm::ddt(rho)
+        + fvc::div(phi)
+    );
+
+    rhoEqn.solve();
+*/
+
+class dfRhoEqn
+{
+private:
+    dfMatrixDataBase& dataBase_;
+    cudaStream_t stream;
+    int num_iteration;
+    double time_monitor_CPU;
+    double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test;
+
+    // common variables
+    int num_cells, cell_bytes, num_surfaces, num_boundary_cells;
+    int *d_A_csr_row_index, *d_A_csr_diag_index;
+
+    // Matrix variables
+    double *d_b, *d_psi = nullptr;
+    double *h_b, *h_psi = nullptr;
+
+public:
+    dfRhoEqn();
+    dfRhoEqn(dfMatrixDataBase& dataBase);
+    ~dfRhoEqn();
+
+    void checkValue(bool print);
+
+    void fvc_div(double *phi, double *boundary_phi_init);
+
+    void fvm_ddt(double *rho_old, double *rho_new);
+
+    void updatePsi(double* Psi);
+};
diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu/dfRhoEqn.cu
@@ -0,0 +1,141 @@
+#include "dfRhoEqn.H"
+
+// kernel functions
+__global__ void fvc_div_internal_rho(int num_cells, const int *csr_row_index,
+                                     const int *csr_diag_index, const int *permedIndex, const double *phi_init,
+                                     double *phi_out, const double sign, const double *b_input, double *b_output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    // A_csr has one more element in each row: itself
+    int row_index = csr_row_index[index];
+    int row_elements = csr_row_index[index + 1] - row_index;
+    int diag_index = csr_diag_index[index];
+    int neighbor_offset = csr_row_index[index] - index;
+
+    double sum = 0;
+
+    // lower
+    for (int i = 0; i < diag_index; i++)
+    {
+        int neighbor_index = neighbor_offset + i;
+        int permute_index = permedIndex[neighbor_index];
+        double phi = phi_init[permute_index];
+        phi_out[neighbor_index] = phi;
+        sum -= phi;
+    }
+    // upper
+    for (int i = diag_index + 1; i < row_elements; i++)
+    {
+        int neighbor_index = neighbor_offset + i - 1;
+        int permute_index = permedIndex[neighbor_index];
+        double phi = phi_init[permute_index];
+        phi_out[neighbor_index] = phi;
+        sum += phi;
+    }
+
+    b_output[index] = b_input[index] + sum * sign;
+}
+
+__global__ void fvc_div_boundary_rho(int num_cells, int num_boundary_cells, const int *boundary_cell_offset,
+                                     const int *boundary_cell_id, const int *bouPermedIndex, const double *boundary_phi_init,
+                                     double *boundary_phi, const double sign, const double *b_input, double *b_output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_boundary_cells)
+        return;
+
+    int cell_offset = boundary_cell_offset[index];
+    int next_cell_offset = boundary_cell_offset[index + 1];
+    int cell_index = boundary_cell_id[cell_offset];
+
+    double sum = 0;
+
+    for (int i = cell_offset; i < next_cell_offset; i++)
+    {
+        int permute_index = bouPermedIndex[i];
+        double phi = boundary_phi_init[permute_index];
+        boundary_phi[i] = phi;
+        sum += phi;
+    }
+
+    b_output[cell_index] = b_input[cell_index] + sum * sign;
+}
+
+__global__ void fvm_ddt_rho(int num_cells, const double rdelta_t,
+                            const double *rho_old, double *rho_new, const double *volume, const double *b)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    double ddt_diag = rdelta_t * volume[index];
+    double ddt_source = rdelta_t * rho_old[index] * volume[index];
+    double source_sum = ddt_source - b[index];
+
+    rho_new[index] = source_sum / ddt_diag;
+}
+
+// constructor
+dfRhoEqn::dfRhoEqn(dfMatrixDataBase &dataBase)
+    : dataBase_(dataBase)
+{
+    num_cells = dataBase_.num_cells;
+    cell_bytes = dataBase_.cell_bytes;
+    num_surfaces = dataBase_.num_surfaces;
+    num_boundary_cells = dataBase_.num_boundary_cells;
+
+    d_A_csr_row_index = dataBase_.d_A_csr_row_index;
+    d_A_csr_diag_index = dataBase_.d_A_csr_diag_index;
+
+    cudaMallocHost(&h_psi, cell_bytes);
+
+    checkCudaErrors(cudaMalloc((void **)&d_b, cell_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_psi, cell_bytes));
+
+    checkCudaErrors(cudaStreamCreate(&stream));
+}
+
+void dfRhoEqn::fvc_div(double *phi, double *boundary_phi_init)
+{
+    checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_bytes, stream));
+    clock_t start = std::clock();
+    memcpy(dataBase_.h_phi_init, phi, num_surfaces * sizeof(double));
+
+    clock_t end = std::clock();
+    time_monitor_CPU += double(end - start) / double(CLOCKS_PER_SEC);
+
+    start = std::clock();
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi_init, dataBase_.h_phi_init, num_surfaces * sizeof(double), cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi_init + num_surfaces, dataBase_.d_phi_init, num_surfaces * sizeof(double), cudaMemcpyDeviceToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_phi_init, boundary_phi_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+    end = std::clock();
+
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    fvc_div_internal_rho<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_permedIndex,
+                                                                            dataBase_.d_phi_init, dataBase_.d_phi, 1., d_b, d_b);
+
+    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
+    fvc_div_boundary_rho<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_cells, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+                                                                            dataBase_.d_bouPermedIndex, dataBase_.d_boundary_phi_init, dataBase_.d_boundary_phi, 1., d_b, d_b);
+}
+
+void dfRhoEqn::fvm_ddt(double *rho_old, double *rho_new)
+{
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_rho_old, rho_old, cell_bytes, cudaMemcpyHostToDevice, stream));
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    fvm_ddt_rho<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, dataBase_.rdelta_t, dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, d_b);
+    checkCudaErrors(cudaMemcpyAsync(h_psi, dataBase_.d_rho_new, cell_bytes, cudaMemcpyDeviceToHost, stream));
+}
+
+void dfRhoEqn::updatePsi(double *Psi)
+{
+    checkCudaErrors(cudaStreamSynchronize(stream));
+    for (size_t i = 0; i < num_cells; i++)
+        Psi[i] = h_psi[i];
+}
+dfRhoEqn::~dfRhoEqn(){}