From 7ee80e549d5f5dfad6dfc97dc85515ddfea6e38a Mon Sep 17 00:00:00 2001
From: STwangyingrui <wangyingrui@sensetime.com>
Date: Fri, 2 Jun 2023 14:01:48 +0800
Subject: [PATCH 1/3] implement eeqn, fvm_div debugging

---
 applications/solvers/dfLowMachFoam/EEqn.H     | 128 ++-
 .../solvers/dfLowMachFoam/createdfSolver.H    |  14 +-
 .../solvers/dfLowMachFoam/dfLowMachFoam.C     |   6 +
 src_gpu/CMakeLists.txt                        |   1 +
 src_gpu/dfEEqn.H                              |  75 ++
 src_gpu/dfEEqn.cu                             | 772 ++++++++++++++++++
 6 files changed, 986 insertions(+), 10 deletions(-)
 create mode 100644 src_gpu/dfEEqn.H
 create mode 100644 src_gpu/dfEEqn.cu
diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H
index 6a49d9ced..3b8d38c8e 100644
--- a/applications/solvers/dfLowMachFoam/EEqn.H
+++ b/applications/solvers/dfLowMachFoam/EEqn.H
@@ -1,4 +1,5 @@
 {
+    fprintf(stderr, "\n\n=====================enter EEqn=====================\n\n\n");
     volScalarField& he = thermo.he();
     #ifdef GPUSolver_
         // start1 = std::clock();
@@ -14,23 +15,132 @@
         // // t.join();
         UEqn_GPU.updatePsi(&U[0][0]);
         K = 0.5*magSqr(U);
+        // K = 0.5*magSqr(U) * 10000000000;
         end1 = std::clock();
         time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
         time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
         time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC);
     #endif
+#ifdef CPUSolver_
+    //fvScalarMatrix EEqn
+    //(
+    //    fvm::ddt(rho, he) + fvm::div(phi, he)
+    //  + fvc::ddt(rho, K) + fvc::div(phi, K)
+    //  - dpdt
+    //  - fvm::laplacian(turbulence->alphaEff(), he)
+    //  + diffAlphaD
+    // ==
+    //    fvc::div(hDiffCorrFlux)
+    //);
+
+    //EEqn.relax();
+
+    //EEqn.solve();
+
     fvScalarMatrix EEqn
     (
-        fvm::ddt(rho, he) + fvm::div(phi, he)
-      + fvc::ddt(rho, K) + fvc::div(phi, K)
-      - dpdt
-      - fvm::laplacian(turbulence->alphaEff(), he)
-      + diffAlphaD
-     ==
-        fvc::div(hDiffCorrFlux)
+        //fvm::ddt(rho, he) + fvm::div(phi, he)
+        //fvm::ddt(rho, he) + fvc::ddt(rho, K)
+        //fvm::ddt(rho, he) + fvc::div(phi, K) * 10000000000
+        //fvm::ddt(rho, he) + fvc::div(phi, K)
+        //fvm::laplacian(turbulence->alphaEff(), he)
+        fvm::div(phi, he)
+        //fvm::laplacian(turbulence->alphaEff(), he) - dpdt + diffAlphaD
+        //fvm::laplacian(turbulence->alphaEff(), he) - fvc::div(hDiffCorrFlux)
+        //fvm::laplacian(turbulence->alphaEff(), he) + fvc::div(phi, K) * 10000000000
+        //==
+        //fvc::div(hDiffCorrFlux)
     );
-
     EEqn.relax();
-
     EEqn.solve();
+#endif
+
+#ifdef GPUSolver_
+    start2 = std::clock();
+    // prepare data on CPU
+    const tmp<volScalarField> alphaEff_tmp(turbulence->alphaEff());
+    const volScalarField& alphaEff = alphaEff_tmp();
+
+    int idx = 0;
+    offset = 0;
+    forAll(he.boundaryField(), patchi)
+    {
+        const fvsPatchScalarField& patchFlux = phi.boundaryField()[patchi];
+        const fvsPatchScalarField& pw = mesh.surfaceInterpolation::weights().boundaryField()[patchi];
+        int patchSize = pw.size();
+
+        const scalarField& patchK = K.boundaryField()[patchi];
+        const scalarField& patchAlphaEff = alphaEff.boundaryField()[patchi];
+        memcpy(boundary_K + offset, &patchK[0], patchSize*sizeof(double));
+        memcpy(boundary_alphaEff + offset, &patchAlphaEff[0], patchSize*sizeof(double));
+
+        Field<scalar> valueInternalCoeffs = patchFlux*he.boundaryField()[patchi].valueInternalCoeffs(pw);
+        Field<scalar> valueBoundaryCoeffs = -patchFlux*he.boundaryField()[patchi].valueBoundaryCoeffs(pw);
+        Field<scalar> gradientInternalCoeffs = he.boundaryField()[patchi].gradientInternalCoeffs();
+        Field<scalar> gradientBoundaryCoeffs = he.boundaryField()[patchi].gradientBoundaryCoeffs();
+        memcpy(eeqn_valueInternalCoeffs + offset, &valueInternalCoeffs[0], patchSize*sizeof(double));
+        memcpy(eeqn_valueBoundaryCoeffs + offset, &valueBoundaryCoeffs[0], patchSize*sizeof(double));
+        memcpy(eeqn_gradientInternalCoeffs + offset, &gradientInternalCoeffs[0], patchSize*sizeof(double));
+        memcpy(eeqn_gradientBoundaryCoeffs + offset, &gradientBoundaryCoeffs[0], patchSize*sizeof(double));
+
+        //for (int t = 0; t < patchSize; t++) {
+        //    fprintf(stderr, "gpu valueInternalCoeffs[%d][%d]: %.10lf\n", patchi, t, eeqn_valueInternalCoeffs[offset + t]);
+        //    if (idx == 64 || idx == 4160 || idx == 8385)
+        //        fprintf(stderr, "eeqn_value_internal_coeffs[%d](other index: %d %d): %.10lf(other value: %.10lf)\n", idx, offset, t, eeqn_valueInternalCoeffs[idx], eeqn_valueInternalCoeffs[offset + t]);
+        //    idx++;
+        //}
+        offset += patchSize;
+    }
+    offset = 0;
+    end2 = std::clock();
+    time_monitor_CPU += double(end2 - start2) / double(CLOCKS_PER_SEC);
+    start1 = std::clock();
+    // prepare data on GPU
+    EEqn_GPU.prepare_data(&he.oldTime()[0], &K[0], &K.oldTime()[0], &alphaEff[0],
+            &dpdt[0], &diffAlphaD[0], &hDiffCorrFlux[0][0],
+            boundary_K, boundary_alphaEff,
+            eeqn_valueInternalCoeffs, eeqn_valueBoundaryCoeffs,
+            eeqn_gradientInternalCoeffs, eeqn_gradientBoundaryCoeffs);
+    end1 = std::clock();
+    time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    fprintf(stderr, "\n\n=====================EEqn GPUSolver_ part2=====================\n\n\n");
+
+    start1 = std::clock();
+    EEqn_GPU.resetAb();
+    //EEqn_GPU.fvm_ddt();
+    EEqn_GPU.fvm_div();
+    //EEqn_GPU.fvm_laplacian();
+    //EEqn_GPU.fvc_ddt();
+    //EEqn_GPU.fvc_div_phi_scalar();
+    //EEqn_GPU.fvc_div_vector();
+    //EEqn_GPU.add_to_source();
+    EEqn_GPU.sync();
+    end1 = std::clock();
+    time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    fprintf(stderr, "\n\n=====================EEqn GPUSolver_ part3=====================\n\n\n");
+
+    // check value of mtxAssembly
+    EEqn_GPU.checkValue(false);
+    //EEqn_GPU.checkValue(true);
+    fprintf(stderr, "\n\n=====================EEqn GPUSolver_ part4=====================\n\n\n");
+
+    start1 = std::clock();
+    EEqn_GPU.solve();
+    end1 = std::clock();
+    time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    fprintf(stderr, "\n\n=====================EEqn GPUSolver_ part5=====================\n\n\n");
+
+    start1 = std::clock();
+    EEqn_GPU.updatePsi(&he[0]);
+    end1 = std::clock();
+    time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    fprintf(stderr, "\n\n=====================EEqn GPUSolver_ part6=====================\n\n\n");
+#endif
+    fprintf(stderr, "\n\n=====================leave EEqn=====================\n\n\n");
 }
diff --git a/applications/solvers/dfLowMachFoam/createdfSolver.H b/applications/solvers/dfLowMachFoam/createdfSolver.H
index b60307463..e56376b28 100644
--- a/applications/solvers/dfLowMachFoam/createdfSolver.H
+++ b/applications/solvers/dfLowMachFoam/createdfSolver.H
@@ -31,9 +31,11 @@ settingPath = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UE
 dfMatrixDataBase dfDataBase(num_surfaces, num_cells, num_boundary_faces, Y.size(), num_boundary_cells, &neighbour[0], &owner[0], &mesh.V()[0], &mesh.surfaceInterpolation::weights()[0], 
 &mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.nonOrthDeltaCoeffs()[0], boundary_face_vector_init, boundary_face_init, boundary_deltaCoeffs_init, boundaryCellIndex);
 
+#ifdef GPUSolver_
 dfRhoEqn rhoEqn_GPU(dfDataBase);
 dfUEqn UEqn_GPU(dfDataBase, "dDDI", settingPath);
 dfYEqn YEqn_GPU(dfDataBase, "dDDI", settingPath, inertIndex);
+dfEEqn EEqn_GPU(dfDataBase, "dDDI", settingPath);
 
 double *ueqn_internalCoeffs_init, *ueqn_boundaryCoeffs_init, *boundary_pressure_init, *boundary_velocity_init,
     *boundary_nuEff_init, *boundary_rho_init, *ueqn_laplac_internalCoeffs_init, *ueqn_laplac_boundaryCoeffs_init, *boundary_phi_init;
@@ -45,4 +47,14 @@ cudaMallocHost(&boundary_velocity_init, 3*num_boundary_faces*sizeof(double));
 cudaMallocHost(&boundary_pressure_init, num_boundary_faces*sizeof(double));
 cudaMallocHost(&boundary_nuEff_init, num_boundary_faces*sizeof(double));
 cudaMallocHost(&boundary_rho_init, num_boundary_faces*sizeof(double));
-cudaMallocHost(&boundary_phi_init, num_boundary_faces*sizeof(double));
\ No newline at end of file
+cudaMallocHost(&boundary_phi_init, num_boundary_faces*sizeof(double));
+
+double *boundary_alphaEff, *boundary_K;
+double *eeqn_valueInternalCoeffs, *eeqn_valueBoundaryCoeffs, *eeqn_gradientInternalCoeffs, *eeqn_gradientBoundaryCoeffs;
+cudaMallocHost(&boundary_K, num_boundary_faces*sizeof(double));
+cudaMallocHost(&boundary_alphaEff, num_boundary_faces*sizeof(double));
+cudaMallocHost(&eeqn_valueInternalCoeffs, num_boundary_faces*sizeof(double));
+cudaMallocHost(&eeqn_valueBoundaryCoeffs, num_boundary_faces*sizeof(double));
+cudaMallocHost(&eeqn_gradientInternalCoeffs, num_boundary_faces*sizeof(double));
+cudaMallocHost(&eeqn_gradientBoundaryCoeffs, num_boundary_faces*sizeof(double));
+#endif
diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
index d875b607d..7b6ba15ee 100644
--- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
+++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
@@ -62,11 +62,13 @@ Description
 #include "dfUEqn.H"
 #include "dfYEqn.H"
 #include "dfRhoEqn.H"
+#include "dfEEqn.H"
 #include <cuda_runtime.h>
 #include <thread>
 #include "upwind.H"
 
 #define GPUSolver_
+//#define CPUSolver_
 
 // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * //
 
@@ -99,6 +101,10 @@ int main(int argc, char *argv[])
     double time_monitor_UEqn_mtxAssembly=0;
     double time_monitor_UEqn_Solve=0;
     double time_monitor_UEqn_sum=0;
+    double time_monitor_EEqn=0;
+    double time_monitor_EEqn_mtxAssembly=0;
+    double time_monitor_EEqn_Solve=0;
+    double time_monitor_EEqn_sum=0;
     double time_monitor_chem=0;
     double time_monitor_Y=0;
     double time_monitor_E=0;
diff --git a/src_gpu/CMakeLists.txt b/src_gpu/CMakeLists.txt
index c749eaa42..1040cee14 100644
--- a/src_gpu/CMakeLists.txt
+++ b/src_gpu/CMakeLists.txt
@@ -23,6 +23,7 @@ add_library(${PROJECT_NAME}
         dfUEqn.cu 
         dfRhoEqn.cu 
         dfYEqn.cu
+        dfEEqn.cu
         AmgXSolver.cu)
 
 target_link_libraries(${PROJECT_NAME}
diff --git a/src_gpu/dfEEqn.H b/src_gpu/dfEEqn.H
new file mode 100644
index 000000000..10fc3569a
--- /dev/null
+++ b/src_gpu/dfEEqn.H
@@ -0,0 +1,75 @@
+#pragma once
+
+#include "AmgXSolver.H"
+#include <amgx_c.h>
+#include "dfMatrixDataBase.H"
+
+class dfEEqn
+{
+private:
+    dfMatrixDataBase &dataBase_;
+    cudaStream_t stream;
+    AmgXSolver *ESolver = nullptr;
+    int num_iteration;
+    double time_monitor_CPU;
+    double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test;
+
+    // common variables
+    int num_cells, cell_bytes, num_faces, num_surfaces, cell_vec_bytes, csr_value_vec_bytes, num_boundary_cells;
+    int num_boundary_faces, boundary_face_bytes;
+    int *d_A_csr_row_index, *d_A_csr_diag_index, *d_A_csr_col_index;
+
+    // Matrix variables
+    double *d_A_csr, *d_b = nullptr;
+    double *h_A_csr, *h_b = nullptr;
+    double *d_he_old = nullptr;
+    double *h_he_new = nullptr;
+
+    // fields used by EEqn
+    double *d_alphaEff = nullptr;
+    double *d_K = nullptr;
+    double *d_K_old = nullptr;
+    double *d_dpdt = nullptr;
+    double *d_diffAlphaD = nullptr;
+    double *d_hDiffCorrFlux = nullptr;
+    double *d_boundary_K_init = nullptr;
+    double *d_boundary_K = nullptr;
+    double *d_boundary_alphaEff_init = nullptr;
+    double *d_boundary_alphaEff = nullptr;
+    double *d_value_internal_coeffs_init = nullptr;
+    double *d_value_boundary_coeffs_init = nullptr;
+    double *d_gradient_internal_coeffs_init = nullptr;
+    double *d_gradient_boundary_coeffs_init = nullptr;
+    double *d_value_internal_coeffs = nullptr;
+    double *d_value_boundary_coeffs = nullptr;
+    double *d_gradient_internal_coeffs = nullptr;
+    double *d_gradient_boundary_coeffs = nullptr;
+
+public:
+    dfEEqn();
+    dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile);
+    ~dfEEqn();
+
+    void prepare_data(const double *he_old, const double *K, const double *K_old, const double *alphaEff,
+            const double *hDiffCorrFlux, const double *dpdt, const double *diffAlphaD,
+            const double *boundary_K, const double *boundary_alphaEff,
+            const double *valueInternalCoeffs, const double *valueBoundaryCoeffs,
+            const double *gradientInternalCoeffs, const double *gradientBoundaryCoeffs);
+
+    void resetAb();
+
+    void fvm_ddt();
+    void fvm_div();
+    void fvm_laplacian();
+
+    void fvc_ddt();
+    void fvc_div_phi_scalar();
+    void fvc_div_vector();
+    void add_to_source();
+
+    void solve();
+    void checkValue(bool print);
+    void updatePsi(double *Psi);
+
+    void sync();
+};
diff --git a/src_gpu/dfEEqn.cu b/src_gpu/dfEEqn.cu
new file mode 100644
index 000000000..6f3d58acb
--- /dev/null
+++ b/src_gpu/dfEEqn.cu
@@ -0,0 +1,772 @@
+#include "dfEEqn.H"
+
+// kernel functions
+
+__global__ void eeqn_fvm_ddt_kernel(int num_cells, const double rdelta_t,
+                                    const int *csr_row_index, const int *csr_diag_index,
+                                    const double *rho_old, const double *rho_new, const double *volume, const double *he_old,
+                                    const double sign, const double *A_csr_input, const double *b_input,
+                                    double *A_csr_output, double *b_output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    // A_csr has one more element in each row: itself
+    int row_index = csr_row_index[index];
+    int diag_index = csr_diag_index[index];
+    int csr_index = row_index + diag_index;
+    double ddt_diag = rdelta_t * rho_new[index] * volume[index];
+    A_csr_output[csr_index] = A_csr_input[csr_index] + ddt_diag * sign;
+
+    double ddt_part_term = rdelta_t * rho_old[index] * volume[index];
+    b_output[index] = b_input[index] + ddt_part_term * he_old[index] * sign;
+}
+
+__global__ void eeqn_fvm_div_internal(int num_cells,
+                                      const int *csr_row_index, const int *csr_diag_index,
+                                      const double *weight, const double *phi,
+                                      const double sign, const double *A_csr_input, const double *b_input,
+                                      double *A_csr_output, double *b_output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    // A_csr has one more element in each row: itself
+    int row_index = csr_row_index[index];
+    int next_row_index = csr_row_index[index + 1];
+    int diag_index = csr_diag_index[index];
+    int neighbor_offset = csr_row_index[index] - index;
+
+    double div_diag = 0;
+    for (int i = row_index; i < next_row_index; i++)
+    {
+        int inner_index = i - row_index;
+        // lower
+        if (inner_index < diag_index)
+        {
+            int neighbor_index = neighbor_offset + inner_index;
+            double w = weight[neighbor_index];
+            double f = phi[neighbor_index];
+            //if (i == 267) printf("lower A[%d], %.10lf + %.10lf = %.10lf\n", i, A_csr_input[i], (-w) * f * sign, A_csr_input[i] + (1 - w) * f * sign);
+            A_csr_output[i] = A_csr_input[i] + (-w) * f * sign;
+            // lower neighbors contribute to sum of -1
+            div_diag += (w - 1) * f;
+        }
+        // upper
+        if (inner_index > diag_index)
+        {
+            // upper, index - 1, consider of diag
+            int neighbor_index = neighbor_offset + inner_index - 1;
+            double w = weight[neighbor_index];
+            double f = phi[neighbor_index];
+            //if (i == 267) printf("upper A[%d], %.10lf + %.10lf = %.10lf\n", i, A_csr_input[i], (-w) * f * sign, A_csr_input[i] + (1 - w) * f * sign);
+            A_csr_output[i] = A_csr_input[i] + (1 - w) * f * sign;
+            // upper neighbors contribute to sum of 1
+            div_diag += w * f;
+        }
+    }
+    //if (row_index + diag_index == 267) printf("diag A[%d], %.10lf + %.10lf = %.10lf\n", row_index + diag_index, A_csr_input[row_index + diag_index], div_diag * sign, A_csr_input[row_index + diag_index] + div_diag * sign);
+    A_csr_output[row_index + diag_index] = A_csr_input[row_index + diag_index] + div_diag * sign; // diag
+}
+
+__global__ void eeqn_fvm_div_boundary(int num_boundary_cells,
+                                      const int *csr_row_index, const int *csr_diag_index,
+                                      const int *boundary_cell_offset, const int *boundary_cell_id,
+                                      const double *value_internal_coeffs, const double *value_boundary_coeffs,
+                                      const double sign, const double *A_csr_input, const double *b_input,
+                                      double *A_csr_output, double *b_output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_boundary_cells)
+        return;
+
+    int cell_offset = boundary_cell_offset[index];
+    int cell_index = boundary_cell_id[cell_offset];
+    int loop_size = boundary_cell_offset[index + 1] - cell_offset;
+
+    int row_index = csr_row_index[cell_index];
+    int diag_index = csr_diag_index[cell_index];
+    int csr_index = row_index + diag_index;
+
+    // construct internalCoeffs & boundaryCoeffs
+    double internal_coeffs = 0;
+    double boundary_coeffs = 0;
+    for (int i = 0; i < loop_size; i++)
+    {
+        //if (cell_index == 1) printf("term[%d], src index: %d, src value: %.10lf\n", i, cell_offset + i, value_internal_coeffs[cell_offset + i]);
+        internal_coeffs += value_internal_coeffs[cell_offset + i];
+        boundary_coeffs += value_boundary_coeffs[cell_offset + i];
+    }
+    //if (cell_index == 1) printf("boundary A[%d], cell_index: %d, loop_size: %d, %.10lf + %.10lf = %.10lf, \n", csr_index, cell_index, loop_size, A_csr_input[csr_index], internal_coeffs * sign, A_csr_input[csr_index] + internal_coeffs * sign);
+    A_csr_output[csr_index] = A_csr_input[csr_index] + internal_coeffs * sign;
+    b_output[cell_index] = b_input[cell_index] + boundary_coeffs * sign;
+}
+
+__global__ void eeqn_fvm_laplacian_uncorrected_internal(int num_cells,
+                                                        const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
+                                                        const double *alphaEff, const double *weight,
+                                                        const double *magsf, const double *distance,
+                                                        const double sign, const double *A_csr_input, double *A_csr_output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    // A_csr has one more element in each row: itself
+    int row_index = csr_row_index[index];
+    int row_elements = csr_row_index[index + 1] - row_index;
+    int diag_index = csr_diag_index[index];
+    int neighbor_offset = csr_row_index[index] - index;
+
+    double own_alphaEff = alphaEff[index];
+    // fvm.upper() = deltaCoeffs.primitiveField()*gammaMagSf.primitiveField();
+    // fvm.negSumDiag();
+    double sum_diag = 0;
+    // lower
+    for (int i = 0; i < diag_index; i++)
+    {
+        int neighbor_index = neighbor_offset + i;
+        int neighbor_cell_id = csr_col_index[i + row_index];
+        double w = weight[neighbor_index];
+        double nei_alphaEff = alphaEff[neighbor_cell_id];
+        double gamma = w * (nei_alphaEff - own_alphaEff) + own_alphaEff;
+        double gamma_magsf = gamma * magsf[neighbor_index];
+        double coeff = gamma_magsf * distance[neighbor_index];
+        A_csr_output[row_index + i] = A_csr_input[row_index + i] + coeff * sign;
+        sum_diag += (-coeff);
+    }
+    // upper
+    for (int i = diag_index + 1; i < row_elements; i++)
+    {
+        int neighbor_index = neighbor_offset + i - 1;
+        int neighbor_cell_id = csr_col_index[i + row_index];
+        double w = weight[neighbor_index];
+        double nei_alphaEff = alphaEff[neighbor_cell_id];
+        double gamma = w * (nei_alphaEff - own_alphaEff) + own_alphaEff;
+        double gamma_magsf = gamma * magsf[neighbor_index];
+        double coeff = gamma_magsf * distance[neighbor_index];
+        A_csr_output[row_index + i] = A_csr_input[row_index + i] + coeff * sign;
+        sum_diag += (-coeff);
+    }
+    A_csr_output[row_index + diag_index] = A_csr_input[row_index + diag_index] + sum_diag * sign; // diag
+}
+
+__global__ void eeqn_fvm_laplacian_uncorrected_boundary(int num_boundary_cells,
+                                                               const int *csr_row_index, const int *csr_diag_index,
+                                                               const int *boundary_cell_offset, const int *boundary_cell_id,
+                                                               const double *boundary_alphaEff, const double *boundary_magsf,
+                                                               const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs,
+                                                               const double sign, const double *A_csr_input, const double *b_input,
+                                                               double *A_csr_output, double *b_output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_boundary_cells)
+        return;
+
+    int cell_offset = boundary_cell_offset[index];
+    int next_cell_offset = boundary_cell_offset[index + 1];
+    int cell_index = boundary_cell_id[cell_offset];
+
+    int row_index = csr_row_index[cell_index];
+    int diag_index = csr_diag_index[cell_index];
+    int csr_index = row_index + diag_index;
+
+    // OpenFoam code
+    // if (pvf.coupled())
+    // {
+    //     fvm.internalCoeffs()[patchi] =
+    //         pGamma*pvf.gradientInternalCoeffs(pDeltaCoeffs);
+    //     fvm.boundaryCoeffs()[patchi] =
+    //         -pGamma*pvf.gradientBoundaryCoeffs(pDeltaCoeffs);
+    // }
+    // else
+    // {
+    //     fvm.internalCoeffs()[patchi] = pGamma*pvf.gradientInternalCoeffs();
+    //     fvm.boundaryCoeffs()[patchi] = -
+    //         pGamma*pvf.gradientBoundaryCoeffs();
+    // }
+    double internal_coeffs = 0;
+    double boundary_coeffs = 0;
+    for (int i = cell_offset; i < next_cell_offset; i++)
+    {
+        double gamma = boundary_alphaEff[i];
+        double gamma_magsf = gamma * boundary_magsf[i];
+        internal_coeffs += gamma_magsf * gradient_internal_coeffs[i];
+        boundary_coeffs += gamma_magsf * gradient_boundary_coeffs[i];
+    }
+
+    A_csr_output[csr_index] = A_csr_input[csr_index] + internal_coeffs * sign;
+    b_output[cell_index] = b_input[cell_index] + boundary_coeffs * sign;
+}
+
+__global__ void eeqn_fvc_ddt_kernel(int num_cells, const double rdelta_t,
+                                    const double *rho_old, const double *rho_new,
+                                    const double *K_old, const double *K,
+                                    const double sign, const double *b_input, double *b_output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    double fvc_ddt_term = rdelta_t * (rho_new[index] * K[index] - rho_old[index] * K_old[index]);
+    b_output[index] = b_input[index] + fvc_ddt_term * sign;
+}
+
+__global__ void eeqn_fvc_div_vector_internal(int num_cells,
+        const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
+        const double *sf, const double *vf, const double *tlambdas,
+        const double sign, const double *b_input, double *b_output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    // A_csr has one more element in each row: itself
+    int row_index = csr_row_index[index];
+    int row_elements = csr_row_index[index + 1] - row_index;
+    int diag_index = csr_diag_index[index];
+    int neighbor_offset = csr_row_index[index] - index;
+
+    double own_vf_x = vf[index * 3 + 0];
+    double own_vf_y = vf[index * 3 + 1];
+    double own_vf_z = vf[index * 3 + 2];
+    double sum = 0;
+    // lower
+    for (int i = 0; i < diag_index; i++)
+    {
+        int neighbor_index = neighbor_offset + i;
+        int neighbor_cell_id = csr_col_index[row_index + i];
+        double w = tlambdas[neighbor_index];
+        double sf_x = sf[neighbor_index * 3 + 0];
+        double sf_y = sf[neighbor_index * 3 + 1];
+        double sf_z = sf[neighbor_index * 3 + 2];
+        double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0];
+        double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1];
+        double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2];
+        double face_x = (1 - w) * own_vf_x + w * neighbor_vf_x;
+        double face_y = (1 - w) * own_vf_y + w * neighbor_vf_y;
+        double face_z = (1 - w) * own_vf_z + w * neighbor_vf_z;
+        sum -= sf_x * face_x + sf_y * face_y + sf_z * face_z;
+    }
+    // upper
+    for (int i = diag_index + 1; i < row_elements; i++)
+    {
+        int neighbor_index = neighbor_offset + i - 1;
+        int neighbor_cell_id = csr_col_index[row_index + i];
+        double w = tlambdas[neighbor_index];
+        double sf_x = sf[neighbor_index * 3 + 0];
+        double sf_y = sf[neighbor_index * 3 + 1];
+        double sf_z = sf[neighbor_index * 3 + 2];
+        double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0];
+        double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1];
+        double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2];
+        double face_x = w * own_vf_x + (1 - w) * neighbor_vf_x;
+        double face_y = w * own_vf_y + (1 - w) * neighbor_vf_y;
+        double face_z = w * own_vf_z + (1 - w) * neighbor_vf_z;
+        sum += sf_x * face_x + sf_y * face_y + sf_z * face_z;
+    }
+    b_output[index] = b_input[index] + sum * sign;
+}
+
+__global__ void eeqn_fvc_div_vector_boundary(int num_boundary_cells,
+        const int *boundary_cell_offset, const int *boundary_cell_id,
+        const double *boundary_sf, const double *boundary_vf,
+        const double sign, const double *b_input, double *b_output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_boundary_cells)
+        return;
+
+    int cell_offset = boundary_cell_offset[index];
+    int next_cell_offset = boundary_cell_offset[index + 1];
+    int cell_index = boundary_cell_id[cell_offset];
+
+    // OpenFoam code
+    // Foam::surfaceInterpolationScheme<Type>::dotInterpolate
+    // if (vf.boundaryField()[pi].coupled())
+    // {
+    //     psf =
+    //         pSf
+    //         & (
+    //                 pLambda*vf.boundaryField()[pi].patchInternalField()
+    //                 + (1.0 - pLambda)*vf.boundaryField()[pi].patchNeighbourField()
+    //           );
+    // }
+    // else
+    // {
+    //     psf = pSf & vf.boundaryField()[pi];
+    // }
+    // tmp<GeometricField<Type, fvPatchField, volMesh>> surfaceIntegrate
+    // forAll(mesh.boundary()[patchi], facei)
+    // {
+    //     ivf[pFaceCells[facei]] += pssf[facei];
+    // }
+    double sum = 0;
+    for (int i = cell_offset; i < next_cell_offset; i++)
+    {
+        double sf_x = boundary_sf[i * 3 + 0];
+        double sf_y = boundary_sf[i * 3 + 1];
+        double sf_z = boundary_sf[i * 3 + 2];
+        double face_x = boundary_vf[i * 3 + 0];
+        double face_y = boundary_vf[i * 3 + 1];
+        double face_z = boundary_vf[i * 3 + 2];
+
+        // if not coupled
+        sum += (sf_x * face_x + sf_y * face_y + sf_z * face_z);
+    }
+    b_output[cell_index] = b_input[cell_index] + sum * sign;
+}
+
+__global__ void eeqn_fvc_div_phi_scalar_internal(int num_cells,
+                                       const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
+                                       const double *weight, const double *phi, const double *K,
+                                       const double sign, const double *b_input, double *b_output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    // A_csr has one more element in each row: itself
+    int row_index = csr_row_index[index];
+    int next_row_index = csr_row_index[index + 1];
+    int diag_index = csr_diag_index[index];
+    int neighbor_offset = csr_row_index[index] - index;
+
+    double own_cell_k = K[index];
+    double interp = 0;
+    for (int i = row_index; i < next_row_index; i++)
+    {
+        int inner_index = i - row_index;
+        // lower
+        if (inner_index < diag_index)
+        {
+            int neighbor_index = neighbor_offset + inner_index;
+            double w = weight[neighbor_index];
+            int neighbor_cell_id = csr_col_index[row_index + inner_index];
+            double neighbor_cell_k = K[neighbor_cell_id];
+            double face_k = (1 - w) * own_cell_k + w * neighbor_cell_k;
+            interp -= phi[i] * face_k;
+        }
+        // upper
+        if (inner_index > diag_index)
+        {
+            int neighbor_index = neighbor_offset + inner_index - 1;
+            double w = weight[neighbor_index];
+            int neighbor_cell_id = csr_col_index[row_index + inner_index];
+            double neighbor_cell_k = K[neighbor_cell_id];
+            double face_k = (1 - w) * own_cell_k + w * neighbor_cell_k;
+            interp += phi[i] * face_k;
+        }
+    }
+    b_output[index] = b_input[index] + interp * sign;
+}
+
+__global__ void eeqn_fvc_div_phi_scalar_boundary(int num_boundary_cells,
+                                       const int *boundary_cell_offset, const int *boundary_cell_id,
+                                       const double *boundary_phi, const double* boundary_K,
+                                       const double sign, const double *b_input, double *b_output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_boundary_cells)
+        return;
+
+    int cell_offset = boundary_cell_offset[index];
+    int next_cell_offset = boundary_cell_offset[index + 1];
+    int cell_index = boundary_cell_id[cell_offset];
+
+    // boundary interplate
+    double interp = 0;
+    for (int i = cell_offset; i < next_cell_offset; i++)
+    {
+        interp += boundary_phi[i] * boundary_K[i];
+    }
+
+    b_output[cell_index] = b_input[cell_index] + interp * sign;
+}
+
+
+__global__ void eeqn_add_to_source_kernel(int num_cells,
+                                    const double sign_dpdt, const double *dpdt,
+                                    const double sign_diffAlphaD, const double *diffAlphaD,
+                                    const double *volume,
+                                    const double *b_input, double *b_output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    b_output[index] = b_input[index] + sign_dpdt * dpdt[index] * volume[index] + sign_diffAlphaD * diffAlphaD[index] * volume[index];
+}
+
+__global__ void eeqn_boundaryPermutation(const int num_boundary_faces, const int *bouPermedIndex,
+                                    const double *boundary_K_init, const double *boundary_alphaEff_init,
+                                    const double *value_internal_coeffs_init, const double *value_boundary_coeffs_init,
+                                    const double *gradient_internal_coeffs_init, const double *gradient_boundary_coeffs_init,
+                                    double *boundary_K, double *boundary_alphaEff,
+                                    double *value_internal_coeffs, double *value_boundary_coeffs,
+                                    double *gradient_internal_coeffs, double *gradient_boundary_coeffs)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_boundary_faces)
+        return;
+
+    int p = bouPermedIndex[index];
+
+    //if (index == 4 || index == 5 || index == 6) {
+    //    printf("dst index: %d, src index: %d, value: %.10lf\n", index, p, value_internal_coeffs_init[p]);
+    //}
+    boundary_K[index] = boundary_K_init[p];
+    boundary_alphaEff[index] = boundary_alphaEff_init[p];
+    value_internal_coeffs[index] = value_internal_coeffs_init[p];
+    value_boundary_coeffs[index] = value_boundary_coeffs_init[p];
+    gradient_internal_coeffs[index] = gradient_internal_coeffs_init[p];
+    gradient_boundary_coeffs[index] = gradient_boundary_coeffs_init[p];
+}
+
+// constructor
+dfEEqn::dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile)
+    : dataBase_(dataBase)
+{
+    ESolver = new AmgXSolver(modeStr, cfgFile);
+
+    num_cells = dataBase_.num_cells;
+    cell_bytes = dataBase_.cell_bytes;
+    num_faces = dataBase_.num_faces;
+    cell_vec_bytes = dataBase_.cell_vec_bytes;
+    csr_value_vec_bytes = dataBase_.csr_value_vec_bytes;
+    num_boundary_cells = dataBase_.num_boundary_cells;
+    num_surfaces = dataBase_.num_surfaces;
+    num_boundary_faces = dataBase_.num_boundary_faces;
+    boundary_face_bytes = dataBase_.boundary_face_bytes;
+
+    d_A_csr_row_index = dataBase_.d_A_csr_row_index;
+    d_A_csr_diag_index = dataBase_.d_A_csr_diag_index;
+    d_A_csr_col_index = dataBase_.d_A_csr_col_index;
+
+    h_A_csr = new double[(num_cells + num_faces) * 3];
+    h_b = new double[num_cells * 3];
+    cudaMallocHost(&h_he_new, cell_bytes);
+
+    checkCudaErrors(cudaMalloc((void **)&d_A_csr, csr_value_vec_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_b, cell_vec_bytes));
+
+    checkCudaErrors(cudaMalloc((void **)&d_he_old, cell_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_K, cell_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_K_old, cell_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_alphaEff, cell_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_dpdt, cell_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_diffAlphaD, cell_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_hDiffCorrFlux, cell_bytes * 3));
+
+    checkCudaErrors(cudaMalloc((void **)&d_boundary_K_init, boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_boundary_K, boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_boundary_alphaEff_init, boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_boundary_alphaEff, boundary_face_bytes));
+
+    checkCudaErrors(cudaMalloc((void **)&d_value_internal_coeffs_init, boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_value_boundary_coeffs_init, boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_gradient_internal_coeffs_init, boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_gradient_boundary_coeffs_init, boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_value_internal_coeffs, boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_value_boundary_coeffs, boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_gradient_internal_coeffs, boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_gradient_boundary_coeffs, boundary_face_bytes));
+
+    checkCudaErrors(cudaStreamCreate(&stream));
+}
+
+void dfEEqn::prepare_data(const double *he_old, const double *K, const double *K_old, const double *alphaEff,
+        const double *dpdt, const double *diffAlphaD, const double *hDiffCorrFlux,
+        const double *boundary_K, const double *boundary_alphaEff,
+        const double *valueInternalCoeffs, const double *valueBoundaryCoeffs,
+        const double *gradientInternalCoeffs, const double *gradientBoundaryCoeffs) {
+    // TODO not real async copy now, because some host array are not in pinned memory.
+
+    // copy the host input array in host memory to the device input array in device memory
+    clock_t start = std::clock();
+
+    checkCudaErrors(cudaMemcpyAsync(d_he_old, he_old, cell_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_K, K, cell_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_K_old, K_old, cell_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_alphaEff, alphaEff, cell_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_dpdt, dpdt, cell_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_diffAlphaD, diffAlphaD, cell_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_hDiffCorrFlux, hDiffCorrFlux, cell_bytes * 3, cudaMemcpyHostToDevice, stream));
+
+    // copy and permutate boundary variable
+    checkCudaErrors(cudaMemcpyAsync(d_boundary_K_init, boundary_K, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_boundary_alphaEff_init, boundary_alphaEff, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_value_internal_coeffs_init, valueInternalCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_value_boundary_coeffs_init, valueBoundaryCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_gradient_internal_coeffs_init, gradientInternalCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_gradient_boundary_coeffs_init, gradientBoundaryCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block;
+    eeqn_boundaryPermutation<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_faces, dataBase_.d_bouPermedIndex,
+            d_boundary_K_init, d_boundary_alphaEff_init,
+            d_value_internal_coeffs_init, d_value_boundary_coeffs_init,
+            d_gradient_internal_coeffs_init, d_gradient_boundary_coeffs_init,
+            d_boundary_K, d_boundary_alphaEff,
+            d_value_internal_coeffs, d_value_boundary_coeffs,
+            d_gradient_internal_coeffs, d_gradient_boundary_coeffs);
+    clock_t end = std::clock();
+    time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC);
+}
+
+void dfEEqn::resetAb() {
+    // initialize matrix value
+    checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, csr_value_vec_bytes, stream));
+    checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream));
+}
+
+#define TRACE fprintf(stderr, "%s %d\n", __FILE__, __LINE__);
+
+void dfEEqn::fvm_ddt()
+{
+    clock_t start = std::clock();
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    eeqn_fvm_ddt_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, dataBase_.rdelta_t,
+            d_A_csr_row_index, d_A_csr_diag_index,
+            dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, d_he_old,
+            1., d_A_csr, d_b, d_A_csr, d_b);
+    clock_t end = std::clock();
+    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
+}
+
+void dfEEqn::fvm_div()
+{
+    clock_t start = std::clock();
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    eeqn_fvm_div_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
+            d_A_csr_row_index, d_A_csr_diag_index,
+            dataBase_.d_weight, dataBase_.d_phi,
+            1., d_A_csr, d_b, d_A_csr, d_b);
+    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
+    eeqn_fvm_div_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
+            d_A_csr_row_index, d_A_csr_diag_index,
+            dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+            d_value_internal_coeffs, d_value_boundary_coeffs,
+            1., d_A_csr, d_b, d_A_csr, d_b);
+    clock_t end = std::clock();
+    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
+}
+
+void dfEEqn::fvm_laplacian()
+{
+    clock_t start = std::clock();
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    eeqn_fvm_laplacian_uncorrected_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
+            d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, d_alphaEff, dataBase_.d_weight,
+            dataBase_.d_face, dataBase_.d_deltaCoeffs,
+            1., d_A_csr, d_A_csr);
+
+    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
+    eeqn_fvm_laplacian_uncorrected_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
+            d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+            d_boundary_alphaEff, dataBase_.d_boundary_face, d_gradient_internal_coeffs, d_gradient_boundary_coeffs,
+            1., d_A_csr, d_b, d_A_csr, d_b);
+    clock_t end = std::clock();
+    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
+}
+
+void dfEEqn::fvc_ddt()
+{
+    // " + fvc::ddt(rho，K)" is on the left side of "==", thus should minus from source.
+    clock_t start = std::clock();
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    eeqn_fvc_ddt_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, dataBase_.rdelta_t,
+            dataBase_.d_rho_old, dataBase_.d_rho_new, d_K_old, d_K,
+            -1., d_b, d_b);
+    clock_t end = std::clock();
+    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
+}
+
+void dfEEqn::fvc_div_vector()
+{
+    // " + fvc::div(hDiffCorrFlux)" is on the right side of "==", thus should add to source.
+    clock_t start = std::clock();
+    size_t threads_per_block = 512;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    eeqn_fvc_div_vector_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
+            d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
+            dataBase_.d_face_vector, dataBase_.d_grad, dataBase_.d_weight,
+            1., d_b, d_b);
+    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
+    eeqn_fvc_div_vector_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
+            dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+            dataBase_.d_boundary_face_vector, dataBase_.d_grad_boundary,
+            1., d_b, d_b);
+    clock_t end = std::clock();
+    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
+}
+
+void dfEEqn::fvc_div_phi_scalar()
+{
+    // " + fvc::div(phi，K)" is on the left side of "==", thus should minus from source.
+    clock_t start = std::clock();
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    eeqn_fvc_div_phi_scalar_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
+            d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
+            dataBase_.d_weight, dataBase_.d_phi, d_K,
+            -1., d_b, d_b);
+    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
+    eeqn_fvc_div_phi_scalar_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
+            dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+            dataBase_.d_boundary_phi, d_boundary_K,
+            -1., d_b, d_b);
+    clock_t end = std::clock();
+    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
+}
+
+void dfEEqn::add_to_source()
+{
+    // " - dpdt" is on the left side of "==", thus should add to source.
+    // "+ diffAlphaD" is on the left side of "==", thus should minus from source.
+    clock_t start = std::clock();
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    // " + fvc::ddt(rho，K)" is on the left side of "==", thus should minus from source.
+    eeqn_add_to_source_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
+            1., d_dpdt, -1., d_diffAlphaD, dataBase_.d_volume, d_b, d_b);
+    clock_t end = std::clock();
+    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
+}
+
+void dfEEqn::checkValue(bool print)
+{
+    checkCudaErrors(cudaMemcpyAsync(h_A_csr, d_A_csr, csr_value_vec_bytes, cudaMemcpyDeviceToHost, stream));
+    checkCudaErrors(cudaMemcpyAsync(h_b, d_b, cell_vec_bytes, cudaMemcpyDeviceToHost, stream));
+
+    // Synchronize stream
+    checkCudaErrors(cudaStreamSynchronize(stream));
+    if (print)
+    {
+        for (int i = 0; i < (num_faces + num_cells); i++)
+            fprintf(stderr, "h_A_csr[%d]: %.15lf\n", i, h_A_csr[i]);
+        for (int i = 0; i < num_cells * 3; i++)
+            fprintf(stderr, "h_b[%d]: %.15lf\n", i, h_b[i]);
+    }
+
+    char *input_file = "of_output.txt";
+    FILE *fp = fopen(input_file, "rb+");
+    if (fp == NULL)
+    {
+        fprintf(stderr, "Failed to open input file: %s!\n", input_file);
+    }
+    int readfile = 0;
+    double *of_b = new double[num_cells];
+    double *of_A = new double[num_faces + num_cells];
+    readfile = fread(of_b, num_cells * sizeof(double), 1, fp);
+    readfile = fread(of_A, (num_faces + num_cells) * sizeof(double), 1, fp);
+
+    std::vector<double> h_A_of_init_vec(num_cells + num_faces);
+    std::copy(of_A, of_A + num_cells + num_faces, h_A_of_init_vec.begin());
+
+    std::vector<double> h_A_of_vec_1mtx(num_faces + num_cells, 0);
+    for (int i = 0; i < num_faces + num_cells; i++)
+    {
+        h_A_of_vec_1mtx[i] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i]];
+    }
+
+    // b
+    std::vector<double> h_b_of_vec(num_cells);
+    std::copy(of_b, of_b + num_cells, h_b_of_vec.begin());
+
+    if (print)
+    {
+        for (int i = 0; i < (num_faces + num_cells); i++)
+            fprintf(stderr, "h_A_of_vec_1mtx[%d]: %.15lf\n", i, h_A_of_vec_1mtx[i]);
+        for (int i = 0; i < num_cells; i++)
+            fprintf(stderr, "h_b_of_vec[%d]: %.15lf\n", i, h_b_of_vec[i]);
+    }
+
+    // check
+    fprintf(stderr, "check of h_A_csr\n");
+    checkVectorEqual(num_faces + num_cells, h_A_of_vec_1mtx.data(), h_A_csr, 1e-5);
+    fprintf(stderr, "check of h_b\n");
+    checkVectorEqual(num_cells, h_b_of_vec.data(), h_b, 1e-5);
+}
+
+void dfEEqn::solve()
+{
+    checkCudaErrors(cudaStreamSynchronize(stream));
+    printf("CPU Time (copy&permutate)  = %.6lf s\n", time_monitor_CPU);
+    printf("GPU Time (kernel launch)   = %.6lf s\n", time_monitor_GPU_kernel);
+    printf("GPU Time (memcpy)          = %.6lf s\n", time_monitor_GPU_memcpy);
+    time_monitor_CPU = 0;
+    time_monitor_GPU_kernel = 0;
+    time_monitor_GPU_memcpy = 0;
+
+    // nvtxRangePush("solve");
+
+    int nNz = num_cells + num_faces; // matrix entries
+    if (num_iteration == 0)          // first interation
+    {
+        printf("Initializing AmgX Linear Solver\n");
+        ESolver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr);
+    }
+    else
+    {
+        ESolver->updateOperator(num_cells, nNz, d_A_csr);
+    }
+    ESolver->solve(num_cells, d_he_old, d_b);
+    num_iteration++;
+
+    checkCudaErrors(cudaMemcpyAsync(h_he_new, d_he_old, cell_bytes, cudaMemcpyDeviceToHost, stream));
+    // for (size_t i = 0; i < num_cells; i++)
+    //     fprintf(stderr, "h_he_after[%d]: %.15lf\n", i, h_he_new[i]);
+}
+
+void dfEEqn::sync()
+{
+    checkCudaErrors(cudaStreamSynchronize(stream));
+}
+
+void dfEEqn::updatePsi(double *Psi)
+{
+    for (size_t i = 0; i < num_cells; i++)
+    {
+        Psi[i] = h_he_new[i];
+    }
+}
+
+dfEEqn::~dfEEqn()
+{
+    delete h_A_csr;
+    delete h_b;
+
+    checkCudaErrors(cudaFree(d_A_csr));
+    checkCudaErrors(cudaFree(d_b));
+
+    checkCudaErrors(cudaFree(d_he_old));
+    checkCudaErrors(cudaFree(h_he_new));
+
+    checkCudaErrors(cudaFree(d_K));
+    checkCudaErrors(cudaFree(d_K_old));
+    checkCudaErrors(cudaFree(d_alphaEff));
+    checkCudaErrors(cudaFree(d_dpdt));
+    checkCudaErrors(cudaFree(d_diffAlphaD));
+    checkCudaErrors(cudaFree(d_hDiffCorrFlux));
+
+    checkCudaErrors(cudaFree(d_boundary_K_init));
+    checkCudaErrors(cudaFree(d_boundary_K));
+    checkCudaErrors(cudaFree(d_boundary_alphaEff_init));
+    checkCudaErrors(cudaFree(d_boundary_alphaEff));
+
+    checkCudaErrors(cudaFree(d_value_internal_coeffs_init));
+    checkCudaErrors(cudaFree(d_value_boundary_coeffs_init));
+    checkCudaErrors(cudaFree(d_gradient_internal_coeffs_init));
+    checkCudaErrors(cudaFree(d_gradient_boundary_coeffs_init));
+    checkCudaErrors(cudaFree(d_value_internal_coeffs));
+    checkCudaErrors(cudaFree(d_value_boundary_coeffs));
+    checkCudaErrors(cudaFree(d_gradient_internal_coeffs));
+    checkCudaErrors(cudaFree(d_gradient_boundary_coeffs));
+}

From e847450106ca42e99b4ca432eb1c64851cdcc512 Mon Sep 17 00:00:00 2001
From: STwangyingrui <wangyingrui@sensetime.com>
Date: Mon, 5 Jun 2023 14:14:33 +0800
Subject: [PATCH 2/3] fix bug in fvc_grad_internal_face

---
 .../solvers/dfLowMachFoam/createdfSolver.H     |  3 ++-
 .../H2/cvodeIntegrator_gpu/system/fvSchemes    | 18 +++++++++---------
 src_gpu/dfUEqn.cu                              |  4 ++--
 3 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/applications/solvers/dfLowMachFoam/createdfSolver.H b/applications/solvers/dfLowMachFoam/createdfSolver.H
index e56376b28..880ae674c 100644
--- a/applications/solvers/dfLowMachFoam/createdfSolver.H
+++ b/applications/solvers/dfLowMachFoam/createdfSolver.H
@@ -49,9 +49,10 @@ cudaMallocHost(&boundary_nuEff_init, num_boundary_faces*sizeof(double));
 cudaMallocHost(&boundary_rho_init, num_boundary_faces*sizeof(double));
 cudaMallocHost(&boundary_phi_init, num_boundary_faces*sizeof(double));
 
-double *boundary_alphaEff, *boundary_K;
+double *boundary_alphaEff, *boundary_K, *boundary_hDiffCorrFlux;
 double *eeqn_valueInternalCoeffs, *eeqn_valueBoundaryCoeffs, *eeqn_gradientInternalCoeffs, *eeqn_gradientBoundaryCoeffs;
 cudaMallocHost(&boundary_K, num_boundary_faces*sizeof(double));
+cudaMallocHost(&boundary_hDiffCorrFlux, 3 * num_boundary_faces*sizeof(double));
 cudaMallocHost(&boundary_alphaEff, num_boundary_faces*sizeof(double));
 cudaMallocHost(&eeqn_valueInternalCoeffs, num_boundary_faces*sizeof(double));
 cudaMallocHost(&eeqn_valueBoundaryCoeffs, num_boundary_faces*sizeof(double));
diff --git a/examples/dfLowMachFoam/threeD_reactingTGV/H2/cvodeIntegrator_gpu/system/fvSchemes b/examples/dfLowMachFoam/threeD_reactingTGV/H2/cvodeIntegrator_gpu/system/fvSchemes
index c5c9136e5..e2c0058cd 100644
--- a/examples/dfLowMachFoam/threeD_reactingTGV/H2/cvodeIntegrator_gpu/system/fvSchemes
+++ b/examples/dfLowMachFoam/threeD_reactingTGV/H2/cvodeIntegrator_gpu/system/fvSchemes
@@ -30,15 +30,15 @@ divSchemes
     default             none;
 
     div(phi,U)          Gauss linear;
-    div(phi,Yi)         Gauss limitedLinear01 1;
-    div(phi,h)          Gauss limitedLinear 1;
-    div(phi,ha)          Gauss limitedLinear 1;
-    div(phi,K)          Gauss limitedLinear 1;
-    div(phid,p)         Gauss limitedLinear 1;
-    div(phi,epsilon)    Gauss limitedLinear 1;
-    div(phi,Yi_h)       Gauss limitedLinear01 1;
-    div(phi,k)          Gauss limitedLinear 1;
-    div(hDiffCorrFlux)             Gauss cubic;
+    div(phi,Yi)         Gauss linear;
+    div(phi,h)          Gauss linear;
+    div(phi,ha)          Gauss linear;
+    div(phi,K)          Gauss linear;
+    div(phid,p)         Gauss linear;
+    div(phi,epsilon)    Gauss linear;
+    div(phi,Yi_h)       Gauss upwind;
+    div(phi,k)          Gauss linear;
+    div(hDiffCorrFlux)             Gauss linear;
     div(((rho*nuEff)*dev2(T(grad(U)))))     Gauss linear;
 }
 
diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu
index 4afebd8c1..350befb50 100644
--- a/src_gpu/dfUEqn.cu
+++ b/src_gpu/dfUEqn.cu
@@ -181,7 +181,7 @@ __global__ void fvc_grad_internal_face(int num_cells,
             double sfz = face_vector[neighbor_index * 3 + 2];
             int neighbor_cell_id = csr_col_index[row_index + inner_index];
             double neighbor_cell_p = pressure[neighbor_cell_id];
-            double face_p = (1 - w) * own_cell_p + w * neighbor_cell_p;
+            double face_p = w * own_cell_p + (1 - w) * neighbor_cell_p;
             grad_bx_upp += face_p * sfx;
             grad_by_upp += face_p * sfy;
             grad_bz_upp += face_p * sfz;
@@ -1206,4 +1206,4 @@ void dfUEqn::updatePsi(double *Psi)
 
 dfUEqn::~dfUEqn()
 {
-}
\ No newline at end of file
+}

From 56f23a335014c6aadd62672e956210aae8c07772 Mon Sep 17 00:00:00 2001
From: STwangyingrui <wangyingrui@sensetime.com>
Date: Mon, 5 Jun 2023 17:13:26 +0800
Subject: [PATCH 3/3] fix EEqn_GPU

---
 applications/solvers/dfLowMachFoam/EEqn.H     | 137 +++++++-----------
 .../solvers/dfLowMachFoam/dfLowMachFoam.C     |  17 +++
 src_gpu/dfEEqn.H                              |   6 +-
 src_gpu/dfEEqn.cu                             | 105 +++++---------
 4 files changed, 112 insertions(+), 153 deletions(-)

diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H
index 3b8d38c8e..bdef9043f 100644
--- a/applications/solvers/dfLowMachFoam/EEqn.H
+++ b/applications/solvers/dfLowMachFoam/EEqn.H
@@ -1,68 +1,44 @@
 {
-    fprintf(stderr, "\n\n=====================enter EEqn=====================\n\n\n");
     volScalarField& he = thermo.he();
-    #ifdef GPUSolver_
-        // start1 = std::clock();
-        // t.join();
-        // UEqn_GPU.updatePsi(&U[0][0]);
-        // K = 0.5*magSqr(U);
-        // end1 = std::clock();
-        // time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-        // time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
-        // time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC);
-        // time_monitor_UinE += double(end1 - start1) / double(CLOCKS_PER_SEC);
+
+#ifdef GPUSolver_
         start1 = std::clock();
-        // // t.join();
         UEqn_GPU.updatePsi(&U[0][0]);
         K = 0.5*magSqr(U);
-        // K = 0.5*magSqr(U) * 10000000000;
         end1 = std::clock();
         time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
         time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
         time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    #endif
-#ifdef CPUSolver_
-    //fvScalarMatrix EEqn
-    //(
-    //    fvm::ddt(rho, he) + fvm::div(phi, he)
-    //  + fvc::ddt(rho, K) + fvc::div(phi, K)
-    //  - dpdt
-    //  - fvm::laplacian(turbulence->alphaEff(), he)
-    //  + diffAlphaD
-    // ==
-    //    fvc::div(hDiffCorrFlux)
-    //);
-
-    //EEqn.relax();
-
-    //EEqn.solve();
+#endif
 
+#ifdef CPUSolver_
+    start1 = std::clock();
     fvScalarMatrix EEqn
     (
-        //fvm::ddt(rho, he) + fvm::div(phi, he)
-        //fvm::ddt(rho, he) + fvc::ddt(rho, K)
-        //fvm::ddt(rho, he) + fvc::div(phi, K) * 10000000000
-        //fvm::ddt(rho, he) + fvc::div(phi, K)
-        //fvm::laplacian(turbulence->alphaEff(), he)
-        fvm::div(phi, he)
-        //fvm::laplacian(turbulence->alphaEff(), he) - dpdt + diffAlphaD
-        //fvm::laplacian(turbulence->alphaEff(), he) - fvc::div(hDiffCorrFlux)
-        //fvm::laplacian(turbulence->alphaEff(), he) + fvc::div(phi, K) * 10000000000
-        //==
-        //fvc::div(hDiffCorrFlux)
+        fvm::ddt(rho, he) + fvm::div(phi, he)
+      + fvc::ddt(rho, K) + fvc::div(phi, K)
+      - dpdt
+      - fvm::laplacian(turbulence->alphaEff(), he)
+      + diffAlphaD
+     ==
+        fvc::div(hDiffCorrFlux)
     );
+
     EEqn.relax();
+
     EEqn.solve();
+
+    end1 = std::clock();
+    time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
 #endif
 
 #ifdef GPUSolver_
-    start2 = std::clock();
     // prepare data on CPU
+    start1 = std::clock();
+    start2 = std::clock();
     const tmp<volScalarField> alphaEff_tmp(turbulence->alphaEff());
     const volScalarField& alphaEff = alphaEff_tmp();
-
-    int idx = 0;
-    offset = 0;
+    int eeqn_offset = 0;
     forAll(he.boundaryField(), patchi)
     {
         const fvsPatchScalarField& patchFlux = phi.boundaryField()[patchi];
@@ -70,77 +46,72 @@
         int patchSize = pw.size();
 
         const scalarField& patchK = K.boundaryField()[patchi];
+        const vectorField& patchhDiffCorrFlux = hDiffCorrFlux.boundaryField()[patchi];
         const scalarField& patchAlphaEff = alphaEff.boundaryField()[patchi];
-        memcpy(boundary_K + offset, &patchK[0], patchSize*sizeof(double));
-        memcpy(boundary_alphaEff + offset, &patchAlphaEff[0], patchSize*sizeof(double));
+        memcpy(boundary_K + eeqn_offset, &patchK[0], patchSize*sizeof(double));
+        memcpy(boundary_hDiffCorrFlux + eeqn_offset * 3, &patchhDiffCorrFlux[0][0], 3 * patchSize*sizeof(double));
+        memcpy(boundary_alphaEff + eeqn_offset, &patchAlphaEff[0], patchSize*sizeof(double));
 
         Field<scalar> valueInternalCoeffs = patchFlux*he.boundaryField()[patchi].valueInternalCoeffs(pw);
         Field<scalar> valueBoundaryCoeffs = -patchFlux*he.boundaryField()[patchi].valueBoundaryCoeffs(pw);
         Field<scalar> gradientInternalCoeffs = he.boundaryField()[patchi].gradientInternalCoeffs();
         Field<scalar> gradientBoundaryCoeffs = he.boundaryField()[patchi].gradientBoundaryCoeffs();
-        memcpy(eeqn_valueInternalCoeffs + offset, &valueInternalCoeffs[0], patchSize*sizeof(double));
-        memcpy(eeqn_valueBoundaryCoeffs + offset, &valueBoundaryCoeffs[0], patchSize*sizeof(double));
-        memcpy(eeqn_gradientInternalCoeffs + offset, &gradientInternalCoeffs[0], patchSize*sizeof(double));
-        memcpy(eeqn_gradientBoundaryCoeffs + offset, &gradientBoundaryCoeffs[0], patchSize*sizeof(double));
+        memcpy(eeqn_valueInternalCoeffs + eeqn_offset, &valueInternalCoeffs[0], patchSize*sizeof(double));
+        memcpy(eeqn_valueBoundaryCoeffs + eeqn_offset, &valueBoundaryCoeffs[0], patchSize*sizeof(double));
+        memcpy(eeqn_gradientInternalCoeffs + eeqn_offset, &gradientInternalCoeffs[0], patchSize*sizeof(double));
+        memcpy(eeqn_gradientBoundaryCoeffs + eeqn_offset, &gradientBoundaryCoeffs[0], patchSize*sizeof(double));
 
-        //for (int t = 0; t < patchSize; t++) {
-        //    fprintf(stderr, "gpu valueInternalCoeffs[%d][%d]: %.10lf\n", patchi, t, eeqn_valueInternalCoeffs[offset + t]);
-        //    if (idx == 64 || idx == 4160 || idx == 8385)
-        //        fprintf(stderr, "eeqn_value_internal_coeffs[%d](other index: %d %d): %.10lf(other value: %.10lf)\n", idx, offset, t, eeqn_valueInternalCoeffs[idx], eeqn_valueInternalCoeffs[offset + t]);
-        //    idx++;
-        //}
-        offset += patchSize;
+        eeqn_offset += patchSize;
     }
-    offset = 0;
-    end2 = std::clock();
-    time_monitor_CPU += double(end2 - start2) / double(CLOCKS_PER_SEC);
-    start1 = std::clock();
+    eeqn_offset = 0;
+    end1 = std::clock();
+    time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_mtxAssembly_CPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);
+
     // prepare data on GPU
+    start1 = std::clock();
     EEqn_GPU.prepare_data(&he.oldTime()[0], &K[0], &K.oldTime()[0], &alphaEff[0],
             &dpdt[0], &diffAlphaD[0], &hDiffCorrFlux[0][0],
-            boundary_K, boundary_alphaEff,
+            boundary_K, boundary_hDiffCorrFlux, boundary_alphaEff,
             eeqn_valueInternalCoeffs, eeqn_valueBoundaryCoeffs,
             eeqn_gradientInternalCoeffs, eeqn_gradientBoundaryCoeffs);
+    if (doSync) EEqn_GPU.sync();
     end1 = std::clock();
-    time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    fprintf(stderr, "\n\n=====================EEqn GPUSolver_ part2=====================\n\n\n");
+    time_monitor_EEqn_mtxAssembly_GPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
     start1 = std::clock();
     EEqn_GPU.resetAb();
-    //EEqn_GPU.fvm_ddt();
+    EEqn_GPU.fvm_ddt();
     EEqn_GPU.fvm_div();
-    //EEqn_GPU.fvm_laplacian();
-    //EEqn_GPU.fvc_ddt();
-    //EEqn_GPU.fvc_div_phi_scalar();
-    //EEqn_GPU.fvc_div_vector();
-    //EEqn_GPU.add_to_source();
-    EEqn_GPU.sync();
+    EEqn_GPU.fvm_laplacian();
+    EEqn_GPU.fvc_ddt();
+    EEqn_GPU.fvc_div_phi_scalar();
+    EEqn_GPU.fvc_div_vector();
+    EEqn_GPU.add_to_source();
+    if (doSync) EEqn_GPU.sync();
     end1 = std::clock();
-    time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    fprintf(stderr, "\n\n=====================EEqn GPUSolver_ part3=====================\n\n\n");
+    time_monitor_EEqn_mtxAssembly_GPU_Run += double(end1 - start1) / double(CLOCKS_PER_SEC);
+
+    EEqn_GPU.sync();
+    end2 = std::clock();
+    time_monitor_EEqn += double(end2 - start2) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_mtxAssembly += double(end2 - start2) / double(CLOCKS_PER_SEC);
 
-    // check value of mtxAssembly
+    // check value of mtxAssembly, no time monitor
     EEqn_GPU.checkValue(false);
-    //EEqn_GPU.checkValue(true);
-    fprintf(stderr, "\n\n=====================EEqn GPUSolver_ part4=====================\n\n\n");
 
     start1 = std::clock();
     EEqn_GPU.solve();
+    if (doSync) EEqn_GPU.sync();
     end1 = std::clock();
     time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
     time_monitor_EEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    fprintf(stderr, "\n\n=====================EEqn GPUSolver_ part5=====================\n\n\n");
 
     start1 = std::clock();
     EEqn_GPU.updatePsi(&he[0]);
     end1 = std::clock();
+    time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC);
     time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
     time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    fprintf(stderr, "\n\n=====================EEqn GPUSolver_ part6=====================\n\n\n");
 #endif
-    fprintf(stderr, "\n\n=====================leave EEqn=====================\n\n\n");
 }
diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
index 7b6ba15ee..80bf0bcd8 100644
--- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
+++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
@@ -96,6 +96,7 @@ int main(int argc, char *argv[])
     #include "createFields.H"
     #include "createRhoUfIfPresent.H"
 
+    bool doSync = true;
     double time_monitor_flow=0;
     double time_monitor_UEqn=0;
     double time_monitor_UEqn_mtxAssembly=0;
@@ -103,6 +104,9 @@ int main(int argc, char *argv[])
     double time_monitor_UEqn_sum=0;
     double time_monitor_EEqn=0;
     double time_monitor_EEqn_mtxAssembly=0;
+    double time_monitor_EEqn_mtxAssembly_CPU_Prepare;
+    double time_monitor_EEqn_mtxAssembly_GPU_Prepare;
+    double time_monitor_EEqn_mtxAssembly_GPU_Run;
     double time_monitor_EEqn_Solve=0;
     double time_monitor_EEqn_sum=0;
     double time_monitor_chem=0;
@@ -229,6 +233,7 @@ int main(int argc, char *argv[])
 
         runTime.write();
         time_monitor_UEqn_sum += time_monitor_UEqn;
+        time_monitor_EEqn_sum += time_monitor_EEqn;
 
         Info << "output time index " << runTime.timeIndex() << endl;
 
@@ -244,6 +249,12 @@ int main(int argc, char *argv[])
         Info<< "UEqn Time solve            = " << time_monitor_UEqn_Solve << " s" << endl;
         // Info<< "UEqn sum Time              = " << time_monitor_UEqn_sum << " s" << endl;
         // Info<< "UEqn sum Time - overhead   = " << time_monitor_UEqn_sum - time_UEqn_initial << " s" << endl;
+        Info<< "EEqn Time                  = " << time_monitor_EEqn << " s" << endl;
+        Info<< "EEqn Time assamble Mtx     = " << time_monitor_EEqn_mtxAssembly << " s" << endl;
+        Info<< "EEqn assamble(CPU prepare) = " << time_monitor_EEqn_mtxAssembly_CPU_Prepare << " s" << endl;
+        Info<< "EEqn assamble(GPU prepare) = " << time_monitor_EEqn_mtxAssembly_GPU_Prepare << " s" << endl;
+        Info<< "EEqn assamble(GPU run)     = " << time_monitor_EEqn_mtxAssembly_GPU_Run << " s" << endl;
+        Info<< "EEqn Time solve            = " << time_monitor_EEqn_Solve << " s" << endl;
         Info<< "sum Time                   = " << (time_monitor_chem + time_monitor_Y + time_monitor_flow + time_monitor_E + time_monitor_corrThermo + time_monitor_corrDiff) << " s" << endl;
         Info<< "CPU Time (get turb souce)  = " << time_monitor_CPU << " s" << endl;
         Info<< "UEqn time in EEqn          = " << time_monitor_UinE << " s" << endl;
@@ -254,6 +265,12 @@ int main(int argc, char *argv[])
         time_monitor_UEqn = 0;
         time_monitor_UEqn_mtxAssembly = 0;
         time_monitor_UEqn_Solve = 0;
+        time_monitor_EEqn = 0;
+        time_monitor_EEqn_mtxAssembly = 0;
+        time_monitor_EEqn_mtxAssembly_CPU_Prepare = 0;
+        time_monitor_EEqn_mtxAssembly_GPU_Prepare = 0;
+        time_monitor_EEqn_mtxAssembly_GPU_Run = 0;
+        time_monitor_EEqn_Solve = 0;
         time_monitor_chem = 0;
         time_monitor_Y = 0;
         time_monitor_E = 0;
diff --git a/src_gpu/dfEEqn.H b/src_gpu/dfEEqn.H
index 10fc3569a..b606e5f28 100644
--- a/src_gpu/dfEEqn.H
+++ b/src_gpu/dfEEqn.H
@@ -11,8 +11,6 @@ private:
     cudaStream_t stream;
     AmgXSolver *ESolver = nullptr;
     int num_iteration;
-    double time_monitor_CPU;
-    double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test;
 
     // common variables
     int num_cells, cell_bytes, num_faces, num_surfaces, cell_vec_bytes, csr_value_vec_bytes, num_boundary_cells;
@@ -34,6 +32,8 @@ private:
     double *d_hDiffCorrFlux = nullptr;
     double *d_boundary_K_init = nullptr;
     double *d_boundary_K = nullptr;
+    double *d_boundary_hDiffCorrFlux_init = nullptr;
+    double *d_boundary_hDiffCorrFlux = nullptr;
     double *d_boundary_alphaEff_init = nullptr;
     double *d_boundary_alphaEff = nullptr;
     double *d_value_internal_coeffs_init = nullptr;
@@ -52,7 +52,7 @@ public:
 
     void prepare_data(const double *he_old, const double *K, const double *K_old, const double *alphaEff,
             const double *hDiffCorrFlux, const double *dpdt, const double *diffAlphaD,
-            const double *boundary_K, const double *boundary_alphaEff,
+            const double *boundary_K, const double *boundary_hDiffCorrFlux, const double *boundary_alphaEff,
             const double *valueInternalCoeffs, const double *valueBoundaryCoeffs,
             const double *gradientInternalCoeffs, const double *gradientBoundaryCoeffs);
 
diff --git a/src_gpu/dfEEqn.cu b/src_gpu/dfEEqn.cu
index 6f3d58acb..6907ed4cd 100644
--- a/src_gpu/dfEEqn.cu
+++ b/src_gpu/dfEEqn.cu
@@ -49,7 +49,6 @@ __global__ void eeqn_fvm_div_internal(int num_cells,
             int neighbor_index = neighbor_offset + inner_index;
             double w = weight[neighbor_index];
             double f = phi[neighbor_index];
-            //if (i == 267) printf("lower A[%d], %.10lf + %.10lf = %.10lf\n", i, A_csr_input[i], (-w) * f * sign, A_csr_input[i] + (1 - w) * f * sign);
             A_csr_output[i] = A_csr_input[i] + (-w) * f * sign;
             // lower neighbors contribute to sum of -1
             div_diag += (w - 1) * f;
@@ -61,13 +60,11 @@ __global__ void eeqn_fvm_div_internal(int num_cells,
             int neighbor_index = neighbor_offset + inner_index - 1;
             double w = weight[neighbor_index];
             double f = phi[neighbor_index];
-            //if (i == 267) printf("upper A[%d], %.10lf + %.10lf = %.10lf\n", i, A_csr_input[i], (-w) * f * sign, A_csr_input[i] + (1 - w) * f * sign);
             A_csr_output[i] = A_csr_input[i] + (1 - w) * f * sign;
             // upper neighbors contribute to sum of 1
             div_diag += w * f;
         }
     }
-    //if (row_index + diag_index == 267) printf("diag A[%d], %.10lf + %.10lf = %.10lf\n", row_index + diag_index, A_csr_input[row_index + diag_index], div_diag * sign, A_csr_input[row_index + diag_index] + div_diag * sign);
     A_csr_output[row_index + diag_index] = A_csr_input[row_index + diag_index] + div_diag * sign; // diag
 }
 
@@ -95,11 +92,9 @@ __global__ void eeqn_fvm_div_boundary(int num_boundary_cells,
     double boundary_coeffs = 0;
     for (int i = 0; i < loop_size; i++)
     {
-        //if (cell_index == 1) printf("term[%d], src index: %d, src value: %.10lf\n", i, cell_offset + i, value_internal_coeffs[cell_offset + i]);
         internal_coeffs += value_internal_coeffs[cell_offset + i];
         boundary_coeffs += value_boundary_coeffs[cell_offset + i];
     }
-    //if (cell_index == 1) printf("boundary A[%d], cell_index: %d, loop_size: %d, %.10lf + %.10lf = %.10lf, \n", csr_index, cell_index, loop_size, A_csr_input[csr_index], internal_coeffs * sign, A_csr_input[csr_index] + internal_coeffs * sign);
     A_csr_output[csr_index] = A_csr_input[csr_index] + internal_coeffs * sign;
     b_output[cell_index] = b_input[cell_index] + boundary_coeffs * sign;
 }
@@ -204,13 +199,14 @@ __global__ void eeqn_fvm_laplacian_uncorrected_boundary(int num_boundary_cells,
 __global__ void eeqn_fvc_ddt_kernel(int num_cells, const double rdelta_t,
                                     const double *rho_old, const double *rho_new,
                                     const double *K_old, const double *K,
+                                    const double *volume,
                                     const double sign, const double *b_input, double *b_output)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
     if (index >= num_cells)
         return;
 
-    double fvc_ddt_term = rdelta_t * (rho_new[index] * K[index] - rho_old[index] * K_old[index]);
+    double fvc_ddt_term = rdelta_t * (rho_new[index] * K[index] - rho_old[index] * K_old[index]) * volume[index];
     b_output[index] = b_input[index] + fvc_ddt_term * sign;
 }
 
@@ -344,20 +340,22 @@ __global__ void eeqn_fvc_div_phi_scalar_internal(int num_cells,
         {
             int neighbor_index = neighbor_offset + inner_index;
             double w = weight[neighbor_index];
+            double p = phi[neighbor_index];
             int neighbor_cell_id = csr_col_index[row_index + inner_index];
             double neighbor_cell_k = K[neighbor_cell_id];
             double face_k = (1 - w) * own_cell_k + w * neighbor_cell_k;
-            interp -= phi[i] * face_k;
+            interp -= p * face_k;
         }
         // upper
         if (inner_index > diag_index)
         {
             int neighbor_index = neighbor_offset + inner_index - 1;
             double w = weight[neighbor_index];
+            double p = phi[neighbor_index];
             int neighbor_cell_id = csr_col_index[row_index + inner_index];
             double neighbor_cell_k = K[neighbor_cell_id];
-            double face_k = (1 - w) * own_cell_k + w * neighbor_cell_k;
-            interp += phi[i] * face_k;
+            double face_k = w * own_cell_k + (1 - w) * neighbor_cell_k;
+            interp += p * face_k;
         }
     }
     b_output[index] = b_input[index] + interp * sign;
@@ -401,10 +399,12 @@ __global__ void eeqn_add_to_source_kernel(int num_cells,
 }
 
 __global__ void eeqn_boundaryPermutation(const int num_boundary_faces, const int *bouPermedIndex,
-                                    const double *boundary_K_init, const double *boundary_alphaEff_init,
+                                    const double *boundary_K_init, const double *boundary_hDiffCorrFlux_init,
+                                    const double *boundary_alphaEff_init,
                                     const double *value_internal_coeffs_init, const double *value_boundary_coeffs_init,
                                     const double *gradient_internal_coeffs_init, const double *gradient_boundary_coeffs_init,
-                                    double *boundary_K, double *boundary_alphaEff,
+                                    double *boundary_K, double *boundary_hDiffCorrFlux,
+                                    double *boundary_alphaEff,
                                     double *value_internal_coeffs, double *value_boundary_coeffs,
                                     double *gradient_internal_coeffs, double *gradient_boundary_coeffs)
 {
@@ -414,10 +414,10 @@ __global__ void eeqn_boundaryPermutation(const int num_boundary_faces, const int
 
     int p = bouPermedIndex[index];
 
-    //if (index == 4 || index == 5 || index == 6) {
-    //    printf("dst index: %d, src index: %d, value: %.10lf\n", index, p, value_internal_coeffs_init[p]);
-    //}
     boundary_K[index] = boundary_K_init[p];
+    boundary_hDiffCorrFlux[index * 3 + 0] = boundary_hDiffCorrFlux_init[p * 3 + 0];
+    boundary_hDiffCorrFlux[index * 3 + 1] = boundary_hDiffCorrFlux_init[p * 3 + 1];
+    boundary_hDiffCorrFlux[index * 3 + 2] = boundary_hDiffCorrFlux_init[p * 3 + 2];
     boundary_alphaEff[index] = boundary_alphaEff_init[p];
     value_internal_coeffs[index] = value_internal_coeffs_init[p];
     value_boundary_coeffs[index] = value_boundary_coeffs_init[p];
@@ -462,6 +462,8 @@ dfEEqn::dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std
 
     checkCudaErrors(cudaMalloc((void **)&d_boundary_K_init, boundary_face_bytes));
     checkCudaErrors(cudaMalloc((void **)&d_boundary_K, boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_boundary_hDiffCorrFlux_init, boundary_face_bytes * 3));
+    checkCudaErrors(cudaMalloc((void **)&d_boundary_hDiffCorrFlux, boundary_face_bytes * 3));
     checkCudaErrors(cudaMalloc((void **)&d_boundary_alphaEff_init, boundary_face_bytes));
     checkCudaErrors(cudaMalloc((void **)&d_boundary_alphaEff, boundary_face_bytes));
 
@@ -479,14 +481,12 @@ dfEEqn::dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std
 
 void dfEEqn::prepare_data(const double *he_old, const double *K, const double *K_old, const double *alphaEff,
         const double *dpdt, const double *diffAlphaD, const double *hDiffCorrFlux,
-        const double *boundary_K, const double *boundary_alphaEff,
+        const double *boundary_K, const double *boundary_hDiffCorrFlux, const double *boundary_alphaEff,
         const double *valueInternalCoeffs, const double *valueBoundaryCoeffs,
         const double *gradientInternalCoeffs, const double *gradientBoundaryCoeffs) {
     // TODO not real async copy now, because some host array are not in pinned memory.
 
     // copy the host input array in host memory to the device input array in device memory
-    clock_t start = std::clock();
-
     checkCudaErrors(cudaMemcpyAsync(d_he_old, he_old, cell_bytes, cudaMemcpyHostToDevice, stream));
     checkCudaErrors(cudaMemcpyAsync(d_K, K, cell_bytes, cudaMemcpyHostToDevice, stream));
     checkCudaErrors(cudaMemcpyAsync(d_K_old, K_old, cell_bytes, cudaMemcpyHostToDevice, stream));
@@ -497,6 +497,7 @@ void dfEEqn::prepare_data(const double *he_old, const double *K, const double *K
 
     // copy and permutate boundary variable
     checkCudaErrors(cudaMemcpyAsync(d_boundary_K_init, boundary_K, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_boundary_hDiffCorrFlux_init, boundary_hDiffCorrFlux, boundary_face_bytes * 3, cudaMemcpyHostToDevice, stream));
     checkCudaErrors(cudaMemcpyAsync(d_boundary_alphaEff_init, boundary_alphaEff, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
     checkCudaErrors(cudaMemcpyAsync(d_value_internal_coeffs_init, valueInternalCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
     checkCudaErrors(cudaMemcpyAsync(d_value_boundary_coeffs_init, valueBoundaryCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
@@ -506,14 +507,12 @@ void dfEEqn::prepare_data(const double *he_old, const double *K, const double *K
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block;
     eeqn_boundaryPermutation<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_faces, dataBase_.d_bouPermedIndex,
-            d_boundary_K_init, d_boundary_alphaEff_init,
+            d_boundary_K_init, d_boundary_hDiffCorrFlux_init, d_boundary_alphaEff_init,
             d_value_internal_coeffs_init, d_value_boundary_coeffs_init,
             d_gradient_internal_coeffs_init, d_gradient_boundary_coeffs_init,
-            d_boundary_K, d_boundary_alphaEff,
+            d_boundary_K, d_boundary_hDiffCorrFlux, d_boundary_alphaEff,
             d_value_internal_coeffs, d_value_boundary_coeffs,
             d_gradient_internal_coeffs, d_gradient_boundary_coeffs);
-    clock_t end = std::clock();
-    time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC);
 }
 
 void dfEEqn::resetAb() {
@@ -522,24 +521,18 @@ void dfEEqn::resetAb() {
     checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream));
 }
 
-#define TRACE fprintf(stderr, "%s %d\n", __FILE__, __LINE__);
-
 void dfEEqn::fvm_ddt()
 {
-    clock_t start = std::clock();
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     eeqn_fvm_ddt_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, dataBase_.rdelta_t,
             d_A_csr_row_index, d_A_csr_diag_index,
             dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, d_he_old,
             1., d_A_csr, d_b, d_A_csr, d_b);
-    clock_t end = std::clock();
-    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
 }
 
 void dfEEqn::fvm_div()
 {
-    clock_t start = std::clock();
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     eeqn_fvm_div_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
@@ -552,65 +545,52 @@ void dfEEqn::fvm_div()
             dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
             d_value_internal_coeffs, d_value_boundary_coeffs,
             1., d_A_csr, d_b, d_A_csr, d_b);
-    clock_t end = std::clock();
-    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
 }
 
 void dfEEqn::fvm_laplacian()
 {
-    clock_t start = std::clock();
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     eeqn_fvm_laplacian_uncorrected_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
             d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, d_alphaEff, dataBase_.d_weight,
             dataBase_.d_face, dataBase_.d_deltaCoeffs,
-            1., d_A_csr, d_A_csr);
-
+            -1., d_A_csr, d_A_csr);
     blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
     eeqn_fvm_laplacian_uncorrected_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
             d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
             d_boundary_alphaEff, dataBase_.d_boundary_face, d_gradient_internal_coeffs, d_gradient_boundary_coeffs,
-            1., d_A_csr, d_b, d_A_csr, d_b);
-    clock_t end = std::clock();
-    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
+            -1., d_A_csr, d_b, d_A_csr, d_b);
 }
 
 void dfEEqn::fvc_ddt()
 {
     // " + fvc::ddt(rho，K)" is on the left side of "==", thus should minus from source.
-    clock_t start = std::clock();
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     eeqn_fvc_ddt_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, dataBase_.rdelta_t,
-            dataBase_.d_rho_old, dataBase_.d_rho_new, d_K_old, d_K,
+            dataBase_.d_rho_old, dataBase_.d_rho_new, d_K_old, d_K, dataBase_.d_volume,
             -1., d_b, d_b);
-    clock_t end = std::clock();
-    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
 }
 
 void dfEEqn::fvc_div_vector()
 {
     // " + fvc::div(hDiffCorrFlux)" is on the right side of "==", thus should add to source.
-    clock_t start = std::clock();
     size_t threads_per_block = 512;
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     eeqn_fvc_div_vector_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
             d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-            dataBase_.d_face_vector, dataBase_.d_grad, dataBase_.d_weight,
+            dataBase_.d_face_vector, d_hDiffCorrFlux, dataBase_.d_weight,
             1., d_b, d_b);
     blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
     eeqn_fvc_div_vector_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
             dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-            dataBase_.d_boundary_face_vector, dataBase_.d_grad_boundary,
+            dataBase_.d_boundary_face_vector, d_boundary_hDiffCorrFlux,
             1., d_b, d_b);
-    clock_t end = std::clock();
-    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
 }
 
 void dfEEqn::fvc_div_phi_scalar()
 {
     // " + fvc::div(phi，K)" is on the left side of "==", thus should minus from source.
-    clock_t start = std::clock();
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     eeqn_fvc_div_phi_scalar_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
@@ -622,22 +602,17 @@ void dfEEqn::fvc_div_phi_scalar()
             dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
             dataBase_.d_boundary_phi, d_boundary_K,
             -1., d_b, d_b);
-    clock_t end = std::clock();
-    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
 }
 
 void dfEEqn::add_to_source()
 {
     // " - dpdt" is on the left side of "==", thus should add to source.
     // "+ diffAlphaD" is on the left side of "==", thus should minus from source.
-    clock_t start = std::clock();
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     // " + fvc::ddt(rho，K)" is on the left side of "==", thus should minus from source.
     eeqn_add_to_source_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
             1., d_dpdt, -1., d_diffAlphaD, dataBase_.d_volume, d_b, d_b);
-    clock_t end = std::clock();
-    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
 }
 
 void dfEEqn::checkValue(bool print)
@@ -650,9 +625,9 @@ void dfEEqn::checkValue(bool print)
     if (print)
     {
         for (int i = 0; i < (num_faces + num_cells); i++)
-            fprintf(stderr, "h_A_csr[%d]: %.15lf\n", i, h_A_csr[i]);
-        for (int i = 0; i < num_cells * 3; i++)
-            fprintf(stderr, "h_b[%d]: %.15lf\n", i, h_b[i]);
+            fprintf(stderr, "h_A_csr[%d]: %.16lf\n", i, h_A_csr[i]);
+        for (int i = 0; i < num_cells; i++)
+            fprintf(stderr, "h_b[%d]: %.16lf\n", i, h_b[i]);
     }
 
     char *input_file = "of_output.txt";
@@ -683,28 +658,21 @@ void dfEEqn::checkValue(bool print)
     if (print)
     {
         for (int i = 0; i < (num_faces + num_cells); i++)
-            fprintf(stderr, "h_A_of_vec_1mtx[%d]: %.15lf\n", i, h_A_of_vec_1mtx[i]);
+            fprintf(stderr, "h_A_of_vec_1mtx[%d]: %.16lf\n", i, h_A_of_vec_1mtx[i]);
         for (int i = 0; i < num_cells; i++)
-            fprintf(stderr, "h_b_of_vec[%d]: %.15lf\n", i, h_b_of_vec[i]);
+            fprintf(stderr, "h_b_of_vec[%d]: %.16lf\n", i, h_b_of_vec[i]);
     }
 
     // check
     fprintf(stderr, "check of h_A_csr\n");
-    checkVectorEqual(num_faces + num_cells, h_A_of_vec_1mtx.data(), h_A_csr, 1e-5);
+    checkVectorEqual(num_faces + num_cells, h_A_of_vec_1mtx.data(), h_A_csr, 1e-6);
     fprintf(stderr, "check of h_b\n");
-    checkVectorEqual(num_cells, h_b_of_vec.data(), h_b, 1e-5);
+    checkVectorEqual(num_cells, h_b_of_vec.data(), h_b, 1e-6);
 }
 
 void dfEEqn::solve()
 {
     checkCudaErrors(cudaStreamSynchronize(stream));
-    printf("CPU Time (copy&permutate)  = %.6lf s\n", time_monitor_CPU);
-    printf("GPU Time (kernel launch)   = %.6lf s\n", time_monitor_GPU_kernel);
-    printf("GPU Time (memcpy)          = %.6lf s\n", time_monitor_GPU_memcpy);
-    time_monitor_CPU = 0;
-    time_monitor_GPU_kernel = 0;
-    time_monitor_GPU_memcpy = 0;
-
     // nvtxRangePush("solve");
 
     int nNz = num_cells + num_faces; // matrix entries
@@ -721,8 +689,9 @@ void dfEEqn::solve()
     num_iteration++;
 
     checkCudaErrors(cudaMemcpyAsync(h_he_new, d_he_old, cell_bytes, cudaMemcpyDeviceToHost, stream));
+    // checkCudaErrors(cudaStreamSynchronize(stream));
     // for (size_t i = 0; i < num_cells; i++)
-    //     fprintf(stderr, "h_he_after[%d]: %.15lf\n", i, h_he_new[i]);
+    //     fprintf(stderr, "h_he_after[%d]: %.16lf\n", i, h_he_new[i]);
 }
 
 void dfEEqn::sync()
@@ -743,12 +712,12 @@ dfEEqn::~dfEEqn()
     delete h_A_csr;
     delete h_b;
 
+    checkCudaErrors(cudaFreeHost(h_he_new));
+
     checkCudaErrors(cudaFree(d_A_csr));
     checkCudaErrors(cudaFree(d_b));
 
     checkCudaErrors(cudaFree(d_he_old));
-    checkCudaErrors(cudaFree(h_he_new));
-
     checkCudaErrors(cudaFree(d_K));
     checkCudaErrors(cudaFree(d_K_old));
     checkCudaErrors(cudaFree(d_alphaEff));
@@ -758,6 +727,8 @@ dfEEqn::~dfEEqn()
 
     checkCudaErrors(cudaFree(d_boundary_K_init));
     checkCudaErrors(cudaFree(d_boundary_K));
+    checkCudaErrors(cudaFree(d_boundary_hDiffCorrFlux_init));
+    checkCudaErrors(cudaFree(d_boundary_hDiffCorrFlux));
     checkCudaErrors(cudaFree(d_boundary_alphaEff_init));
     checkCudaErrors(cudaFree(d_boundary_alphaEff));