From 3ea932de2780066b1cd9ecd233e563f00a5a22ea Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Tue, 6 Jun 2023 13:13:26 +0800 Subject: [PATCH 1/2] GPU version of UEqn.H and UEqn.A --- applications/solvers/dfLowMachFoam/UEqn.H | 54 +++-- applications/solvers/dfLowMachFoam/YEqn.H | 10 +- applications/solvers/dfLowMachFoam/pEqn.H | 51 ++++- applications/solvers/dfLowMachFoam/rhoEqn.H | 22 +- src_gpu/dfUEqn.H | 11 +- src_gpu/dfUEqn.cu | 234 +++++++++++++++++++- 6 files changed, 322 insertions(+), 60 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H index 3ceca690b..5a089e0c0 100644 --- a/applications/solvers/dfLowMachFoam/UEqn.H +++ b/applications/solvers/dfLowMachFoam/UEqn.H @@ -1,33 +1,4 @@ // Solve the Momentum equation -start1 = std::clock(); -tmp tUEqn -( - fvm::ddt(rho, U) + fvm::div(phi, U) - + turbulence->divDevRhoReff(U) - == -fvc::grad(p) -); -fvVectorMatrix& UEqn = tUEqn.ref(); - -end1 = std::clock(); -#ifdef CPUSolver_ - time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); -#endif - -#ifdef CPUSolver_ - // UEqn.relax(); - start1 = std::clock(); - if (pimple.momentumPredictor()) - { - solve(UEqn); - - K = 0.5*magSqr(U); - } - end1 = std::clock(); - time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); -#endif - #ifdef GPUSolver_ start1 = std::clock(); UEqn_GPU.fvm_ddt(&U.oldTime()[0][0]); @@ -111,6 +82,31 @@ end1 = std::clock(); // K = 0.5*magSqr(U); // } // UEqn_GPU.checkValue(false); +#else + start1 = std::clock(); + tmp tUEqn + ( + fvm::ddt(rho, U) + fvm::div(phi, U) + + turbulence->divDevRhoReff(U) + == -fvc::grad(p) + ); + fvVectorMatrix& UEqn = tUEqn.ref(); + + end1 = std::clock(); + time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_mtxAssembly += double(end1 - 
start1) / double(CLOCKS_PER_SEC); + + // UEqn.relax(); + start1 = std::clock(); + if (pimple.momentumPredictor()) + { + solve(UEqn); + + K = 0.5*magSqr(U); + } + end1 = std::clock(); + time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); #endif // start1 = std::clock(); diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H index 5e44d9b3f..eeaea40e7 100644 --- a/applications/solvers/dfLowMachFoam/YEqn.H +++ b/applications/solvers/dfLowMachFoam/YEqn.H @@ -107,7 +107,10 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC); if (i != inertIndex) { - #ifdef CPUSolver_ + #ifdef GPUSolver_ + YEqn_GPU.updatePsi(&Yi[0], speciesIndex); + Yi.correctBoundaryConditions(); + #else tmp DEff = chemistry->rhoD(i) + turbulence->mut()/Sct; fvScalarMatrix YiEqn ( @@ -127,11 +130,6 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC); YiEqn.solve("Yi"); #endif - #ifdef GPUSolver_ - YEqn_GPU.updatePsi(&Yi[0], speciesIndex); - Yi.correctBoundaryConditions(); - #endif - Yi.max(0.0); Yt += Yi; ++speciesIndex; diff --git a/applications/solvers/dfLowMachFoam/pEqn.H b/applications/solvers/dfLowMachFoam/pEqn.H index ea653d261..f8e7763be 100644 --- a/applications/solvers/dfLowMachFoam/pEqn.H +++ b/applications/solvers/dfLowMachFoam/pEqn.H @@ -7,9 +7,53 @@ if (!pimple.simpleRho()) // pressure solution const volScalarField psip0(psi*p); -volScalarField rAU(1.0/UEqn.A()); -surfaceScalarField rhorAUf("rhorAUf", fvc::interpolate(rho*rAU)); -volVectorField HbyA(constrainHbyA(rAU*UEqn.H(), U, p)); +#ifdef GPUSolver_ + // UEqn.H() + volVectorField UEqn_H + ( + IOobject + ( + "H("+U.name()+')', + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedVector(dimensionSet(1,-2,-2,0,0,0,0), Zero), + extrapolatedCalculatedFvPatchScalarField::typeName + ); + UEqn_GPU.H(&UEqn_H[0][0]); + 
UEqn_H.correctBoundaryConditions(); + + // UEqn.A() + volScalarField UEqn_A + ( + IOobject + ( + "A("+U.name()+')', + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedScalar(dimensionSet(1,-3,-1,0,0,0,0), Zero), + extrapolatedCalculatedFvPatchScalarField::typeName + ); + UEqn_GPU.A(&UEqn_A[0]); + UEqn_A.correctBoundaryConditions(); +#endif + +#ifdef GPUSolver_ + volScalarField rAU(1.0/UEqn_A); + surfaceScalarField rhorAUf("rhorAUf", fvc::interpolate(rho*rAU)); + volVectorField HbyA(constrainHbyA(rAU*UEqn_H, U, p)); +#else + volScalarField rAU(1.0/UEqn.A()); + surfaceScalarField rhorAUf("rhorAUf", fvc::interpolate(rho*rAU)); + volVectorField HbyA(constrainHbyA(rAU*UEqn.H(), U, p)); +#endif if (pimple.nCorrPiso() <= 1) { @@ -99,6 +143,9 @@ p.relax(); U = HbyA - rAU*fvc::grad(p); U.correctBoundaryConditions(); K = 0.5*magSqr(U); +#ifdef GPUSolver_ +UEqn_GPU.correctPsi(&U[0][0]); +#endif if (pimple.simpleRho()) { diff --git a/applications/solvers/dfLowMachFoam/rhoEqn.H b/applications/solvers/dfLowMachFoam/rhoEqn.H index e49ef9e5b..a7d5552fe 100644 --- a/applications/solvers/dfLowMachFoam/rhoEqn.H +++ b/applications/solvers/dfLowMachFoam/rhoEqn.H @@ -28,18 +28,6 @@ Description Solve the continuity for density. 
\*---------------------------------------------------------------------------*/ - -#ifdef CPUSolver_ -{ - fvScalarMatrix rhoEqn - ( - fvm::ddt(rho) - + fvc::div(phi) - ); - - rhoEqn.solve(); -} -#endif #ifdef GPUSolver_ { rho.oldTime(); @@ -57,6 +45,16 @@ Description rhoEqn_GPU.updatePsi(&rho.primitiveFieldRef()[0]); rho.correctBoundaryConditions(); } +#else +{ + fvScalarMatrix rhoEqn + ( + fvm::ddt(rho) + + fvc::div(phi) + ); + + rhoEqn.solve(); +} #endif // ************************************************************************* // diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index fb79a1dcf..8c6ed9c1e 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -19,8 +19,8 @@ private: int *d_A_csr_row_index, *d_A_csr_diag_index, *d_A_csr_col_index; // Matrix variables - double *d_A_csr, *d_b, *d_psi = nullptr; - double *h_A_csr, *h_b, *h_psi = nullptr; + double *d_A_csr, *d_b, *d_psi, *d_H, *d_A, *d_ueqn_internal_coeffs, *d_ueqn_boundary_coeffs= nullptr; + double *h_A_csr, *h_b, *h_psi, *h_H, *h_A = nullptr; public: dfUEqn(); @@ -48,6 +48,13 @@ public: void add_fvMatrix(double *turbSrc_low, double *turbSrc_diag, double *turbSrc_upp, double *turbSrc_source); + void A(double *Psi); + + void H(double *Psi); + void solve(); + void updatePsi(double *Psi); + + void correctPsi(double *Psi); }; \ No newline at end of file diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 4afebd8c1..eb2c45c9a 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -86,7 +86,8 @@ __global__ void fvm_div_boundary(int num_cells, int num_faces, int num_boundary_ const int *csr_row_index, const int *csr_diag_index, const int *boundary_cell_offset, const int *boundary_cell_id, const double *internal_coeffs, const double *boundary_coeffs, - const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output) + const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output, + double *ueqn_internal_coeffs, double *ueqn_boundary_coeffs) { int 
index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_boundary_cells) @@ -120,6 +121,13 @@ __global__ void fvm_div_boundary(int num_cells, int num_faces, int num_boundary_ boundary_coeffs_y += boundary_coeffs[(cell_offset + i) * 3 + 1]; boundary_coeffs_z += boundary_coeffs[(cell_offset + i) * 3 + 2]; } + ueqn_internal_coeffs[cell_index * 3 + 0] = internal_coeffs_x; + ueqn_internal_coeffs[cell_index * 3 + 1] = internal_coeffs_y; + ueqn_internal_coeffs[cell_index * 3 + 2] = internal_coeffs_z; + ueqn_boundary_coeffs[cell_index * 3 + 0] = boundary_coeffs_x; + ueqn_boundary_coeffs[cell_index * 3 + 1] = boundary_coeffs_y; + ueqn_boundary_coeffs[cell_index * 3 + 2] = boundary_coeffs_z; + A_csr_output[csr_dim * 0 + csr_index] = A_csr_input[csr_dim * 0 + csr_index] + internal_coeffs_x; A_csr_output[csr_dim * 1 + csr_index] = A_csr_input[csr_dim * 1 + csr_index] + internal_coeffs_y; A_csr_output[csr_dim * 2 + csr_index] = A_csr_input[csr_dim * 2 + csr_index] + internal_coeffs_z; @@ -799,7 +807,8 @@ __global__ void fvm_laplacian_uncorrected_vector_boundary(int num_cells, int num const int *boundary_cell_offset, const int *boundary_cell_id, const double *boundary_scalar0, const double *boundary_scalar1, const double *boundary_magsf, const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, - const double sign, const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output) + const double sign, const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output, + double *ueqn_internal_coeffs, double *ueqn_boundary_coeffs) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_boundary_cells) @@ -846,6 +855,13 @@ __global__ void fvm_laplacian_uncorrected_vector_boundary(int num_cells, int num boundary_coeffs_z += gamma_magsf * gradient_boundary_coeffs[i * 3 + 2]; } + ueqn_internal_coeffs[cell_index * 3 + 0] += internal_coeffs_x * sign; + ueqn_internal_coeffs[cell_index * 3 + 1] += 
internal_coeffs_y * sign; + ueqn_internal_coeffs[cell_index * 3 + 2] += internal_coeffs_z * sign; + ueqn_boundary_coeffs[cell_index * 3 + 0] += boundary_coeffs_x * sign; + ueqn_boundary_coeffs[cell_index * 3 + 1] += boundary_coeffs_y * sign; + ueqn_boundary_coeffs[cell_index * 3 + 2] += boundary_coeffs_z * sign; + A_csr_output[csr_dim * 0 + csr_index] = A_csr_input[csr_dim * 0 + csr_index] + internal_coeffs_x * sign; A_csr_output[csr_dim * 1 + csr_index] = A_csr_input[csr_dim * 1 + csr_index] + internal_coeffs_y * sign; A_csr_output[csr_dim * 2 + csr_index] = A_csr_input[csr_dim * 2 + csr_index] + internal_coeffs_z * sign; @@ -854,6 +870,138 @@ __global__ void fvm_laplacian_uncorrected_vector_boundary(int num_cells, int num b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + boundary_coeffs_z * sign; } +__global__ void addBoundaryDiag(int num_cells, int num_boundary_cells, + const int *csr_row_index, const int *csr_diag_index, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *ueqn_internal_coeffs, const double *ueqn_boundary_coeffs, + const double *psi, double *H) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + // addBoundaryDiag(boundaryDiagCmpt, cmpt); // add internal coeffs + // boundaryDiagCmpt.negate(); + double internal_x = ueqn_internal_coeffs[cell_index * 3 + 0]; + double internal_y = ueqn_internal_coeffs[cell_index * 3 + 1]; + double internal_z = ueqn_internal_coeffs[cell_index * 3 + 2]; + + // addCmptAvBoundaryDiag(boundaryDiagCmpt); + double ave_internal = (internal_x + internal_y + internal_z) / 3; + + H[num_cells * 0 + cell_index] = (-internal_x + ave_internal) * psi[num_cells * 0 + cell_index]; + H[num_cells * 1 + cell_index] = (-internal_y + ave_internal) * psi[num_cells * 1 + 
cell_index]; + H[num_cells * 2 + cell_index] = (-internal_z + ave_internal) * psi[num_cells * 2 + cell_index]; +} + +__global__ void lduMatrix_H(int num_cells, + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *volume, const double *psi, const double *A_csr, const double *b, + const double *ueqn_boundary_coeffs, double *H) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int row_elements = csr_row_index[index + 1] - row_index; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double APsi_x = 0.; + double APsi_y = 0.; + double APsi_z = 0.; + // lower + for (int i = 0; i < diag_index; i++) + { + int neighbor_cell_id = csr_col_index[i + row_index]; + APsi_x += A_csr[row_index + i] * psi[num_cells * 0 + neighbor_cell_id]; + APsi_y += A_csr[row_index + i] * psi[num_cells * 1 + neighbor_cell_id]; + APsi_z += A_csr[row_index + i] * psi[num_cells * 2 + neighbor_cell_id]; + } + // upper + for (int i = diag_index + 1; i < row_elements; i++) + { + int neighbor_cell_id = csr_col_index[i + row_index]; + APsi_x += A_csr[row_index + i] * psi[num_cells * 0 + neighbor_cell_id]; + APsi_y += A_csr[row_index + i] * psi[num_cells * 1 + neighbor_cell_id]; + APsi_z += A_csr[row_index + i] * psi[num_cells * 2 + neighbor_cell_id]; + } + + H[num_cells * 0 + index] = H[num_cells * 0 + index] - APsi_x + b[num_cells * 0 + index]; + H[num_cells * 1 + index] = H[num_cells * 1 + index] - APsi_y + b[num_cells * 1 + index]; + H[num_cells * 2 + index] = H[num_cells * 2 + index] - APsi_z + b[num_cells * 2 + index]; + + double vol = volume[index]; + H[num_cells * 0 + index] = H[num_cells * 0 + index] / vol; + H[num_cells * 1 + index] = H[num_cells * 1 + index] / vol; + H[num_cells * 2 + index] = H[num_cells * 2 + index] / vol; +} + +__global__ void addBoundarySource(int 
num_cells, int num_boundary_cells, + const int *csr_row_index, const int *csr_diag_index, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *ueqn_internal_coeffs, const double *ueqn_boundary_coeffs, + const double *volume, double *H) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int cell_index = boundary_cell_id[cell_offset]; + + double vol = volume[cell_index]; + + H[num_cells * 0 + cell_index] = H[num_cells * 0 + cell_index] + ueqn_boundary_coeffs[cell_index * 3 + 0] / vol; + H[num_cells * 1 + cell_index] = H[num_cells * 1 + cell_index] + ueqn_boundary_coeffs[cell_index * 3 + 1] / vol; + H[num_cells * 2 + cell_index] = H[num_cells * 2 + cell_index] + ueqn_boundary_coeffs[cell_index * 3 + 2] / vol; +} + +__global__ void addAveInternaltoDiag(int num_cells, int num_boundary_cells, + const int *csr_row_index, const int *csr_diag_index, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *ueqn_internal_coeffs, const double *ueqn_boundary_coeffs, double *A) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + double internal_x = ueqn_internal_coeffs[cell_index * 3 + 0]; + double internal_y = ueqn_internal_coeffs[cell_index * 3 + 1]; + double internal_z = ueqn_internal_coeffs[cell_index * 3 + 2]; + + double ave_internal = (internal_x + internal_y + internal_z) / 3; + + A[cell_index] = ave_internal; +} + +__global__ void addDiagDivVolume(int num_cells, const int *csr_row_index, + const int *csr_diag_index, const double *A_csr, const double *volume, + double *ueqn_internal_coeffs, const double *A_input, double *A_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + int row_index = 
csr_row_index[index]; + int diag_index = csr_diag_index[index]; + int csr_index = row_index + diag_index; + + double vol = volume[index]; + + A_output[index] = (A_input[index] + A_csr[csr_index] - ueqn_internal_coeffs[index * 3]) / vol; +} + // constructor dfUEqn::dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile) : dataBase_(dataBase) @@ -877,10 +1025,16 @@ dfUEqn::dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std h_A_csr = new double[(num_cells + num_faces) * 3]; h_b = new double[num_cells * 3]; cudaMallocHost(&h_psi, cell_vec_bytes); + cudaMallocHost(&h_H, cell_vec_bytes); + cudaMallocHost(&h_A, cell_bytes); checkCudaErrors(cudaMalloc((void **)&d_A_csr, csr_value_vec_bytes)); checkCudaErrors(cudaMalloc((void **)&d_b, cell_vec_bytes)); checkCudaErrors(cudaMalloc((void **)&d_psi, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_H, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_A, cell_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_ueqn_internal_coeffs, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_ueqn_boundary_coeffs, cell_vec_bytes)); checkCudaErrors(cudaStreamCreate(&stream)); } @@ -964,7 +1118,8 @@ void dfUEqn::fvm_div(double *phi, double *ueqn_internalCoeffs_init, double *ueqn fvm_div_boundary<<>>(num_cells, num_faces, num_boundary_cells, d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs, d_A_csr, d_b, d_A_csr, d_b); + dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs, d_A_csr, d_b, d_A_csr, d_b, + d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs); end = std::clock(); time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } @@ -1067,11 +1222,62 @@ void dfUEqn::fvm_laplacian() fvm_laplacian_uncorrected_vector_boundary<<>>(num_cells, num_faces, num_boundary_cells, d_A_csr_row_index, d_A_csr_diag_index, 
dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_face, dataBase_.d_laplac_internal_coeffs, - dataBase_.d_laplac_boundary_coeffs, -1., d_A_csr, d_b, d_A_csr, d_b); + dataBase_.d_laplac_boundary_coeffs, -1., d_A_csr, d_b, d_A_csr, d_b, d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs); clock_t end = std::clock(); time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } +void dfUEqn::A(double *Psi) +{ + // tmp tdiag(new scalarField(diag())); + // addCmptAvBoundaryDiag(tdiag.ref()); + checkCudaErrors(cudaMemsetAsync(d_A, 0, cell_bytes, stream)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + addAveInternaltoDiag<<>>(num_cells, num_boundary_cells, d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs, d_A); + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + addDiagDivVolume<<>>(num_cells, d_A_csr_row_index, d_A_csr_diag_index, d_A_csr, + dataBase_.d_volume, d_ueqn_internal_coeffs, d_A, d_A); + + checkCudaErrors(cudaMemcpyAsync(h_A, d_A, cell_bytes, cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + for (size_t i = 0; i < num_cells; i++) + Psi[i] = h_A[i]; +} + +void dfUEqn::H(double *Psi) +{ + checkCudaErrors(cudaMemsetAsync(d_H, 0, cell_vec_bytes, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + addBoundaryDiag<<>>(num_cells, num_boundary_cells, d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs, + d_psi, d_H); + + blocks_per_grid = (num_cells + threads_per_block - 1) / 
threads_per_block; + lduMatrix_H<<>>(num_cells, d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + dataBase_.d_volume, d_psi, d_A_csr, d_b, d_ueqn_boundary_coeffs, d_H); + + checkCudaErrors(cudaMemcpyAsync(h_H, d_H, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + + for (size_t i = 0; i < num_cells; i++) + { + Psi[i * 3] = h_H[i]; + Psi[i * 3 + 1] = h_H[num_cells + i]; + Psi[i * 3 + 2] = h_H[num_cells * 2 + i]; + } + + // for (int i = 0; i < num_cells; i++) + // fprintf(stderr, "h_H_GPU[%d]: (%.5e, %.5e, %.5e)\n", i, h_H[i], h_H[num_cells + i], h_H[num_cells * 2 + i]); +} + void dfUEqn::checkValue(bool print) { checkCudaErrors(cudaMemcpyAsync(h_A_csr, d_A_csr, csr_value_vec_bytes, cudaMemcpyDeviceToHost, stream)); @@ -1082,9 +1288,9 @@ void dfUEqn::checkValue(bool print) if (print) { for (int i = 0; i < (num_faces + num_cells); i++) - fprintf(stderr, "h_A_csr[%d]: %.15lf\n", i, h_A_csr[i]); + fprintf(stderr, "h_A_csr[%d]: %.5e\n", i, h_A_csr[i]); for (int i = 0; i < num_cells * 3; i++) - fprintf(stderr, "h_b[%d]: %.15lf\n", i, h_b[i]); + fprintf(stderr, "h_b[%d]: %.5e\n", i, h_b[i]); } char *input_file = "of_output.txt"; @@ -1136,9 +1342,9 @@ void dfUEqn::checkValue(bool print) if (print) { for (int i = 0; i < (num_faces + num_cells); i++) - fprintf(stderr, "h_A_of_vec_1mtx[%d]: %.15lf\n", i, h_A_of_vec_1mtx[i]); + fprintf(stderr, "h_A_of_vec_1mtx[%d]: %.5e\n", i, h_A_of_vec_1mtx[i]); for (int i = 0; i < 3 * num_cells; i++) - fprintf(stderr, "h_b_of_vec[%d]: %.15lf\n", i, h_b_of_vec[i]); + fprintf(stderr, "h_b_of_vec[%d]: %.5e\n", i, h_b_of_vec[i]); } // check @@ -1204,6 +1410,18 @@ void dfUEqn::updatePsi(double *Psi) } } +// correct volecity in pEqn +void dfUEqn::correctPsi(double *Psi) +{ + for (size_t i = 0; i < num_cells; i++) + { + h_psi[i] = Psi[i * 3]; + h_psi[num_cells + i] = Psi[i * 3 + 1]; + h_psi[num_cells * 2 + i] = Psi[i * 3 + 2]; + } + checkCudaErrors(cudaMemcpyAsync(d_psi, h_psi, 
cell_vec_bytes, cudaMemcpyHostToDevice, stream)); +} + dfUEqn::~dfUEqn() { } \ No newline at end of file From 7775dd97da9351160e87e088de8916e098764eec Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Tue, 6 Jun 2023 13:15:05 +0800 Subject: [PATCH 2/2] upwind scheme --- .../H2/cvodeIntegrator_gpu/system/fvSchemes | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/dfLowMachFoam/threeD_reactingTGV/H2/cvodeIntegrator_gpu/system/fvSchemes b/examples/dfLowMachFoam/threeD_reactingTGV/H2/cvodeIntegrator_gpu/system/fvSchemes index c5c9136e5..e2c0058cd 100644 --- a/examples/dfLowMachFoam/threeD_reactingTGV/H2/cvodeIntegrator_gpu/system/fvSchemes +++ b/examples/dfLowMachFoam/threeD_reactingTGV/H2/cvodeIntegrator_gpu/system/fvSchemes @@ -30,15 +30,15 @@ divSchemes default none; div(phi,U) Gauss linear; - div(phi,Yi) Gauss limitedLinear01 1; - div(phi,h) Gauss limitedLinear 1; - div(phi,ha) Gauss limitedLinear 1; - div(phi,K) Gauss limitedLinear 1; - div(phid,p) Gauss limitedLinear 1; - div(phi,epsilon) Gauss limitedLinear 1; - div(phi,Yi_h) Gauss limitedLinear01 1; - div(phi,k) Gauss limitedLinear 1; - div(hDiffCorrFlux) Gauss cubic; + div(phi,Yi) Gauss linear; + div(phi,h) Gauss linear; + div(phi,ha) Gauss linear; + div(phi,K) Gauss linear; + div(phid,p) Gauss linear; + div(phi,epsilon) Gauss linear; + div(phi,Yi_h) Gauss upwind; + div(phi,k) Gauss linear; + div(hDiffCorrFlux) Gauss linear; div(((rho*nuEff)*dev2(T(grad(U))))) Gauss linear; }