From f827b9d413f09cec4fa14b8483099bba418b0f74 Mon Sep 17 00:00:00 2001
From: STwangyingrui <wangyingrui@sensetime.com>
Date: Thu, 8 Jun 2023 18:12:33 +0800
Subject: [PATCH 1/6] add GPU version of hDiffCorrFlux and diffAlphaD in YEqn,
 merge branch GPU and fix conflict

---
 applications/solvers/dfLowMachFoam/EEqn.H     |   5 +-
 applications/solvers/dfLowMachFoam/YEqn.H     |  38 +-
 .../solvers/dfLowMachFoam/createdfSolver.H    |   3 +-
 src_gpu/dfEEqn.H                              |   7 +-
 src_gpu/dfEEqn.cu                             |  40 +-
 src_gpu/dfMatrixDataBase.H                    |   8 +
 src_gpu/dfYEqn.H                              |  23 +-
 src_gpu/dfYEqn.cu                             | 385 ++++++++++++------
 8 files changed, 321 insertions(+), 188 deletions(-)
diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H
index 8e8c0bee0..764b6be10 100644
--- a/applications/solvers/dfLowMachFoam/EEqn.H
+++ b/applications/solvers/dfLowMachFoam/EEqn.H
@@ -49,10 +49,8 @@
         int patchSize = pw.size();
 
         const scalarField& patchK = K.boundaryField()[patchi];
-        const vectorField& patchhDiffCorrFlux = hDiffCorrFlux.boundaryField()[patchi];
         const scalarField& patchAlphaEff = alphaEff.boundaryField()[patchi];
         memcpy(boundary_K + eeqn_offset, &patchK[0], patchSize*sizeof(double));
-        memcpy(boundary_hDiffCorrFlux + eeqn_offset * 3, &patchhDiffCorrFlux[0][0], 3 * patchSize*sizeof(double));
         memcpy(boundary_alphaEff + eeqn_offset, &patchAlphaEff[0], patchSize*sizeof(double));
 
         eeqn_offset += patchSize;
@@ -63,8 +61,7 @@
     // prepare data on GPU
     start1 = std::clock();
     EEqn_GPU.prepare_data(&he.oldTime()[0], &K[0], &K.oldTime()[0], &alphaEff[0],
-            &dpdt[0], &diffAlphaD[0], &hDiffCorrFlux[0][0],
-            boundary_K, boundary_hDiffCorrFlux, boundary_alphaEff);
+            &dpdt[0], boundary_K, boundary_alphaEff);
     if (doSync) EEqn_GPU.sync();
     end1 = std::clock();
     time_monitor_EEqn_mtxAssembly_GPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);
diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H
index c741a8f48..baa12633e 100644
--- a/applications/solvers/dfLowMachFoam/YEqn.H
+++ b/applications/solvers/dfLowMachFoam/YEqn.H
@@ -14,6 +14,7 @@ tmp<fv::convectionScheme<scalar>> mvConvection
     )
 );
 
+#ifdef CPUSolver_
 start1 = std::clock();
 forAll(Y, i)
 {
@@ -23,6 +24,7 @@ const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf();
 start1 = std::clock();
 time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
 time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
+#endif
 
 #ifdef GPUSolver_
     start1 = std::clock();
@@ -36,47 +38,56 @@ time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 #ifdef GPUSolver_
     start1 = std::clock();
     start2 = std::clock();
-    std::vector<double*> Y_old(Y.size()), boundary_Y_init(Y.size()), boundary_rhoD_init(Y.size());
-    std::vector<const double*> rhoD_GPU(Y.size());
+    std::vector<double*> Y_old(Y.size()), boundary_Y(Y.size()), boundary_hai(Y.size()), boundary_rhoD(Y.size());
+    std::vector<const double*> hai(Y.size()), rhoD(Y.size());
     for (size_t i = 0; i < Y.size(); ++i)
     {
         volScalarField& Yi = Y[i];
-        const volScalarField& rhoDi = chemistry->rhoD(i);
         Y_old[i] = &Yi.oldTime()[0];
-        rhoD_GPU[i] = &chemistry->rhoD(i)[0];
-        cudaMallocHost(&boundary_Y_init[i], num_boundary_faces*sizeof(double));
-        cudaMallocHost(&boundary_rhoD_init[i], num_boundary_faces*sizeof(double));
+        cudaMallocHost(&boundary_Y[i], num_boundary_faces*sizeof(double));
+        const volScalarField& haii = chemistry->hai(i);
+        const volScalarField& rhoDi = chemistry->rhoD(i);
+        hai[i] = &haii[0];
+        rhoD[i] = &rhoDi[0];
+        cudaMallocHost(&boundary_hai[i], num_boundary_faces*sizeof(double));
+        cudaMallocHost(&boundary_rhoD[i], num_boundary_faces*sizeof(double));
         int offset = 0;
         forAll(Yi.boundaryField(), patchi)
         {
             const scalarField& patchYi = Yi.boundaryField()[patchi];
+            const scalarField& patchHaii = haii.boundaryField()[patchi];
             const scalarField& patchRhoDi = rhoDi.boundaryField()[patchi];
             int patchSize = patchYi.size();
 
-            memcpy(boundary_Y_init[i]+offset, &patchYi[0], patchSize*sizeof(double));
-            memcpy(boundary_rhoD_init[i]+offset, &patchRhoDi[0], patchSize*sizeof(double));
+            memcpy(boundary_Y[i] + offset, &patchYi[0], patchSize*sizeof(double));
+            memcpy(boundary_hai[i] + offset, &patchHaii[0], patchSize*sizeof(double));
+            memcpy(boundary_rhoD[i] + offset, &patchRhoDi[0], patchSize*sizeof(double));
             offset += patchSize;
         }
     }
 
     volScalarField mut_sct = turbulence->mut().ref()/Sct;
-    std::vector<double> boundary_mutsct;
+    double *boundary_mutsct = nullptr;
+    cudaMallocHost(&boundary_mutsct, num_boundary_faces*sizeof(double));
+    int offset = 0;
     forAll(p.boundaryField(), patchi)
     {
         const scalarField& patchMut_sct = mut_sct.boundaryField()[patchi];
         int patchSize = patchMut_sct.size();
-        boundary_mutsct.insert(boundary_mutsct.end(), &patchMut_sct[0], &patchMut_sct[0] + patchSize);
+        memcpy(boundary_mutsct + offset, &patchMut_sct[0], patchSize*sizeof(double));
+        offset += patchSize;
     }
     end2 = std::clock();
     time_monitor_YEqn_mtxAssembly_CPU_Prepare += double(end2 - start2) / double(CLOCKS_PER_SEC);
 
     YEqn_GPU.upwindWeight();
-    YEqn_GPU.correctVelocity(Y_old, boundary_Y_init, rhoD_GPU);
+    YEqn_GPU.correctVelocity(Y_old, boundary_Y, hai, boundary_hai, rhoD, boundary_rhoD);
     YEqn_GPU.fvm_ddt();
     YEqn_GPU.fvm_div_phi();
     YEqn_GPU.fvm_div_phiUc();
-    YEqn_GPU.fvm_laplacian(&mut_sct[0], boundary_mutsct.data(), boundary_rhoD_init);
+    YEqn_GPU.fvm_laplacian(&mut_sct[0], boundary_mutsct, rhoD, boundary_rhoD);
 
+    YEqn_GPU.compute_diffAlphaD(&thermo.alpha()[0], hai);
     end1 = std::clock();
     time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
     time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
@@ -115,9 +126,10 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC);
     forAll(Y, i)
     {
         volScalarField& Yi = Y[i];
+#ifdef CPUSolver_
         hDiffCorrFlux += chemistry->hai(i)*(chemistry->rhoD(i)*fvc::grad(Yi) - Yi*sumYDiffError);
         diffAlphaD += fvc::laplacian(thermo.alpha()*chemistry->hai(i), Yi);
-
+#endif
         if (i != inertIndex)
         {
             #ifdef GPUSolver_
diff --git a/applications/solvers/dfLowMachFoam/createdfSolver.H b/applications/solvers/dfLowMachFoam/createdfSolver.H
index 880ae674c..e56376b28 100644
--- a/applications/solvers/dfLowMachFoam/createdfSolver.H
+++ b/applications/solvers/dfLowMachFoam/createdfSolver.H
@@ -49,10 +49,9 @@ cudaMallocHost(&boundary_nuEff_init, num_boundary_faces*sizeof(double));
 cudaMallocHost(&boundary_rho_init, num_boundary_faces*sizeof(double));
 cudaMallocHost(&boundary_phi_init, num_boundary_faces*sizeof(double));
 
-double *boundary_alphaEff, *boundary_K, *boundary_hDiffCorrFlux;
+double *boundary_alphaEff, *boundary_K;
 double *eeqn_valueInternalCoeffs, *eeqn_valueBoundaryCoeffs, *eeqn_gradientInternalCoeffs, *eeqn_gradientBoundaryCoeffs;
 cudaMallocHost(&boundary_K, num_boundary_faces*sizeof(double));
-cudaMallocHost(&boundary_hDiffCorrFlux, 3 * num_boundary_faces*sizeof(double));
 cudaMallocHost(&boundary_alphaEff, num_boundary_faces*sizeof(double));
 cudaMallocHost(&eeqn_valueInternalCoeffs, num_boundary_faces*sizeof(double));
 cudaMallocHost(&eeqn_valueBoundaryCoeffs, num_boundary_faces*sizeof(double));
diff --git a/src_gpu/dfEEqn.H b/src_gpu/dfEEqn.H
index 483c44bb6..732e32ca7 100644
--- a/src_gpu/dfEEqn.H
+++ b/src_gpu/dfEEqn.H
@@ -28,12 +28,8 @@ private:
     double *d_K = nullptr;
     double *d_K_old = nullptr;
     double *d_dpdt = nullptr;
-    double *d_diffAlphaD = nullptr;
-    double *d_hDiffCorrFlux = nullptr;
     double *d_boundary_K_init = nullptr;
     double *d_boundary_K = nullptr;
-    double *d_boundary_hDiffCorrFlux_init = nullptr;
-    double *d_boundary_hDiffCorrFlux = nullptr;
     double *d_boundary_alphaEff_init = nullptr;
     double *d_boundary_alphaEff = nullptr;
     double *d_value_internal_coeffs_init = nullptr;
@@ -51,8 +47,7 @@ public:
     ~dfEEqn();
 
     void prepare_data(const double *he_old, const double *K, const double *K_old, const double *alphaEff,
-            const double *hDiffCorrFlux, const double *dpdt, const double *diffAlphaD,
-            const double *boundary_K, const double *boundary_hDiffCorrFlux, const double *boundary_alphaEff);
+            const double *dpdt, const double *boundary_K, const double *boundary_alphaEff);
 
     void initializeTimeStep();
 
diff --git a/src_gpu/dfEEqn.cu b/src_gpu/dfEEqn.cu
index 2ef941f3b..48d6a19dc 100644
--- a/src_gpu/dfEEqn.cu
+++ b/src_gpu/dfEEqn.cu
@@ -398,9 +398,9 @@ __global__ void eeqn_add_to_source_kernel(int num_cells,
 }
 
 __global__ void eeqn_boundaryPermutation(const int num_boundary_faces, const int *bouPermedIndex,
-                                         const double *boundary_K_init, const double *boundary_hDiffCorrFlux_init,
+                                         const double *boundary_K_init,
                                          const double *boundary_alphaEff_init,
-                                         double *boundary_K, double *boundary_hDiffCorrFlux,
+                                         double *boundary_K,
                                          double *boundary_alphaEff)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
@@ -410,9 +410,6 @@ __global__ void eeqn_boundaryPermutation(const int num_boundary_faces, const int
     int p = bouPermedIndex[index];
 
     boundary_K[index] = boundary_K_init[p];
-    boundary_hDiffCorrFlux[index * 3 + 0] = boundary_hDiffCorrFlux_init[p * 3 + 0];
-    boundary_hDiffCorrFlux[index * 3 + 1] = boundary_hDiffCorrFlux_init[p * 3 + 1];
-    boundary_hDiffCorrFlux[index * 3 + 2] = boundary_hDiffCorrFlux_init[p * 3 + 2];
     boundary_alphaEff[index] = boundary_alphaEff_init[p];
 }
 
@@ -468,13 +465,9 @@ dfEEqn::dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std
     checkCudaErrors(cudaMalloc((void **)&d_K_old, cell_bytes));
     checkCudaErrors(cudaMalloc((void **)&d_alphaEff, cell_bytes));
     checkCudaErrors(cudaMalloc((void **)&d_dpdt, cell_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_diffAlphaD, cell_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_hDiffCorrFlux, cell_bytes * 3));
 
     checkCudaErrors(cudaMalloc((void **)&d_boundary_K_init, boundary_face_bytes));
     checkCudaErrors(cudaMalloc((void **)&d_boundary_K, boundary_face_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_boundary_hDiffCorrFlux_init, boundary_face_bytes * 3));
-    checkCudaErrors(cudaMalloc((void **)&d_boundary_hDiffCorrFlux, boundary_face_bytes * 3));
     checkCudaErrors(cudaMalloc((void **)&d_boundary_alphaEff_init, boundary_face_bytes));
     checkCudaErrors(cudaMalloc((void **)&d_boundary_alphaEff, boundary_face_bytes));
 
@@ -491,8 +484,7 @@ dfEEqn::dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std
 }
 
 void dfEEqn::prepare_data(const double *he_old, const double *K, const double *K_old, const double *alphaEff,
-                          const double *dpdt, const double *diffAlphaD, const double *hDiffCorrFlux,
-                          const double *boundary_K, const double *boundary_hDiffCorrFlux, const double *boundary_alphaEff)
+                          const double *dpdt, const double *boundary_K, const double *boundary_alphaEff)
 {
     // TODO not real async copy now, because some host array are not in pinned memory.
 
@@ -502,19 +494,16 @@ void dfEEqn::prepare_data(const double *he_old, const double *K, const double *K
     checkCudaErrors(cudaMemcpyAsync(d_K_old, K_old, cell_bytes, cudaMemcpyHostToDevice, stream));
     checkCudaErrors(cudaMemcpyAsync(d_alphaEff, alphaEff, cell_bytes, cudaMemcpyHostToDevice, stream));
     checkCudaErrors(cudaMemcpyAsync(d_dpdt, dpdt, cell_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_diffAlphaD, diffAlphaD, cell_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_hDiffCorrFlux, hDiffCorrFlux, cell_bytes * 3, cudaMemcpyHostToDevice, stream));
 
     // copy and permutate boundary variable
     checkCudaErrors(cudaMemcpyAsync(d_boundary_K_init, boundary_K, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_boundary_hDiffCorrFlux_init, boundary_hDiffCorrFlux, boundary_face_bytes * 3, cudaMemcpyHostToDevice, stream));
     checkCudaErrors(cudaMemcpyAsync(d_boundary_alphaEff_init, boundary_alphaEff, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
 
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block;
     eeqn_boundaryPermutation<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_faces, dataBase_.d_bouPermedIndex,
-                                                                                d_boundary_K_init, d_boundary_hDiffCorrFlux_init, d_boundary_alphaEff_init,
-                                                                                d_boundary_K, d_boundary_hDiffCorrFlux, d_boundary_alphaEff);
+            d_boundary_K_init, d_boundary_alphaEff_init,
+            d_boundary_K, d_boundary_alphaEff);
 }
 
 void dfEEqn::initializeTimeStep()
@@ -587,14 +576,14 @@ void dfEEqn::fvc_div_vector()
     size_t threads_per_block = 512;
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     eeqn_fvc_div_vector_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-                                                                                    d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-                                                                                    dataBase_.d_face_vector, d_hDiffCorrFlux, dataBase_.d_weight,
-                                                                                    1., d_b, d_b);
+            d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
+            dataBase_.d_face_vector, dataBase_.d_hDiffCorrFlux, dataBase_.d_weight,
+            1., d_b, d_b);
     blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
     eeqn_fvc_div_vector_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
-                                                                                    dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-                                                                                    dataBase_.d_boundary_face_vector, d_boundary_hDiffCorrFlux,
-                                                                                    1., d_b, d_b);
+            dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+            dataBase_.d_boundary_face_vector, dataBase_.d_boundary_hDiffCorrFlux,
+            1., d_b, d_b);
 }
 
 void dfEEqn::fvc_div_phi_scalar()
@@ -621,7 +610,8 @@ void dfEEqn::add_to_source()
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     // " + fvc::ddt(rho，K)" is on the left side of "==", thus should minus from source.
     eeqn_add_to_source_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-                                                                                 1., d_dpdt, -1., d_diffAlphaD, dataBase_.d_volume, d_b, d_b);
+            //1., d_dpdt, -1., d_diffAlphaD, dataBase_.d_volume, d_b, d_b);
+            1., d_dpdt, -1., dataBase_.d_diffAlphaD, dataBase_.d_volume, d_b, d_b);
 }
 
 void dfEEqn::checkValue(bool print)
@@ -728,13 +718,9 @@ dfEEqn::~dfEEqn()
     checkCudaErrors(cudaFree(d_K_old));
     checkCudaErrors(cudaFree(d_alphaEff));
     checkCudaErrors(cudaFree(d_dpdt));
-    checkCudaErrors(cudaFree(d_diffAlphaD));
-    checkCudaErrors(cudaFree(d_hDiffCorrFlux));
 
     checkCudaErrors(cudaFree(d_boundary_K_init));
     checkCudaErrors(cudaFree(d_boundary_K));
-    checkCudaErrors(cudaFree(d_boundary_hDiffCorrFlux_init));
-    checkCudaErrors(cudaFree(d_boundary_hDiffCorrFlux));
     checkCudaErrors(cudaFree(d_boundary_alphaEff_init));
     checkCudaErrors(cudaFree(d_boundary_alphaEff));
 
diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H
index 2fa79b13f..778d7c56c 100644
--- a/src_gpu/dfMatrixDataBase.H
+++ b/src_gpu/dfMatrixDataBase.H
@@ -101,6 +101,9 @@ struct dfMatrixDataBase
     double *d_deltaCoeffs_init = nullptr, *d_deltaCoeffs = nullptr;
     std::vector<double*> d_rhoD_vector;
 
+    double *d_hDiffCorrFlux = nullptr;
+    double *d_diffAlphaD = nullptr;
+
     double rdelta_t = 1/1e-6;
 
     /**
@@ -131,6 +134,8 @@ struct dfMatrixDataBase
     std::vector<double*> d_boundary_rhoD_vector;
     double *d_boundary_mut_sct = nullptr;
 
+    double *d_boundary_hDiffCorrFlux = nullptr;
+
     std::vector<int> boundPermutationList;
     std::vector<double> ueqn_internalCoeffs, ueqn_boundaryCoeffs;
     std::vector<double> boundary_face_vector;
@@ -562,6 +567,9 @@ struct dfMatrixDataBase
         checkCudaErrors(cudaMalloc((void**)&d_grad_boundary_init, boundary_cell_bytes * 9));
         total_bytes += (num_cells * 9 * sizeof(double) + boundary_face_bytes * 9 + boundary_cell_bytes * 9); // FIXME: rename
 
+        checkCudaErrors(cudaMalloc((void**)&d_hDiffCorrFlux, 3 * cell_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_diffAlphaD, cell_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_boundary_hDiffCorrFlux, 3 * boundary_face_bytes));
 
         fprintf(stderr, "Total bytes malloc on GPU: %.2fMB\n", total_bytes * 1.0 / 1024 / 1024);
 
diff --git a/src_gpu/dfYEqn.H b/src_gpu/dfYEqn.H
index 260baf4f2..6e2a408e4 100644
--- a/src_gpu/dfYEqn.H
+++ b/src_gpu/dfYEqn.H
@@ -16,7 +16,7 @@ private:
     double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test;
 
     // common variables
-    int num_cells, cell_bytes, num_faces, num_surfaces, num_boundary_cells, num_boundary_faces, num_species, inertIndex;
+    int num_cells, cell_bytes, num_faces, num_surfaces, num_boundary_cells, num_boundary_faces, num_species, boundary_face_bytes, inertIndex;
     int *d_A_csr_row_index, *d_A_csr_diag_index, *d_A_csr_col_index;
 
     // Matrix variables
@@ -27,6 +27,18 @@ private:
     double *d_phiUc_boundary = nullptr;
     double *d_phiUc = nullptr;
     double *d_mut_Sct = nullptr;
+    double *d_boundary_mut_sct = nullptr;
+
+    std::vector<double*> d_Y_old;
+    double *d_boundary_Y = nullptr;
+    double *d_hai, *d_boundary_hai = nullptr;
+    double *d_rhoD, *d_boundary_rhoD = nullptr;
+    double *d_sum_rhoD_grady, *d_sum_boundary_rhoD_grady = nullptr;
+    double *d_sum_hai_rhoD_grady, *d_sum_boundary_hai_rhoD_grady = nullptr; 
+    double *d_sum_hai_y, *d_sum_boundary_hai_y = nullptr;
+    double *d_grady, *d_boundary_grady = nullptr;
+
+    double *d_alpha = nullptr;
 
 public:
     dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile, const int inertIndex);
@@ -35,7 +47,9 @@ public:
 
     void checkValue(bool print, char *filename);
 
-    void correctVelocity(std::vector<double *> Y_old, std::vector<double *> boundary_Y_init, std::vector<const double *> rhoD_GPU);
+    void correctVelocity(std::vector<double *> Y_old, std::vector<double *> boundary_Y,
+        std::vector<const double *> hai, std::vector<double *> boundary_hai,
+        std::vector<const double *> rhoD, std::vector<double *> boundary_rhoD);
 
     void upwindWeight();
 
@@ -45,7 +59,10 @@ public:
 
     void fvm_div_phiUc();
 
-    void fvm_laplacian(double *mut_Sct, double *boundary_mut_Sct, std::vector<double*>boundary_rhoD);
+    void fvm_laplacian(const double *mut_Sct, const double *boundary_mut_Sct,
+            std::vector<const double*>rhoD, std::vector<double*>boundary_rhoD);
+
+    void compute_diffAlphaD(const double *alpha, std::vector<const double *> hai);
 
     void solve();
 
diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu
index 9ef75747f..a9ede60ec 100644
--- a/src_gpu/dfYEqn.cu
+++ b/src_gpu/dfYEqn.cu
@@ -12,10 +12,10 @@ __global__ void getUpwindWeight(int num_faces, double *phi, double *weight)
         weight[index] = 0.;
 }
 
-__global__ void fvc_grad_internal_face_Y(int num_cells,
-                                         const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, const double *volume,
-                                         const double *face_vector, const double *weight, const double *species, const double *rhoD,
-                                         const double *sumYDiffError, double *sumYDiffError_output)
+__global__ void fvc_grad_internal(int num_cells,
+        const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
+        const double *face_vector, const double *weight, const double *species,
+        const double *volume, double *grady)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
     if (index >= num_cells)
@@ -28,12 +28,9 @@ __global__ void fvc_grad_internal_face_Y(int num_cells,
     int neighbor_offset = csr_row_index[index] - index;
 
     double own_cell_Y = species[index];
-    double grad_bx_upper = 0;
-    double grad_by_upper = 0;
-    double grad_bz_upper = 0;
-    double grad_bx_lower = 0;
-    double grad_by_lower = 0;
-    double grad_bz_lower = 0;
+    double grad_bx = 0;
+    double grad_by = 0;
+    double grad_bz = 0;
     for (int i = row_index; i < next_row_index; i++)
     {
         int inner_index = i - row_index;
@@ -48,9 +45,9 @@ __global__ void fvc_grad_internal_face_Y(int num_cells,
             int neighbor_cell_id = csr_col_index[row_index + inner_index];
             double neighbor_cell_Y = species[neighbor_cell_id];
             double face_Y = w * (neighbor_cell_Y - own_cell_Y) + own_cell_Y;
-            grad_bx_lower -= face_Y * sfx;
-            grad_by_lower -= face_Y * sfy;
-            grad_bz_lower -= face_Y * sfz;
+            grad_bx -= face_Y * sfx;
+            grad_by -= face_Y * sfy;
+            grad_bz -= face_Y * sfz;
         }
         // upper
         if (inner_index > diag_index)
@@ -63,29 +60,21 @@ __global__ void fvc_grad_internal_face_Y(int num_cells,
             int neighbor_cell_id = csr_col_index[row_index + inner_index];
             double neighbor_cell_Y = species[neighbor_cell_id];
             double face_Y = w * (own_cell_Y - neighbor_cell_Y) + neighbor_cell_Y;
-            grad_bx_upper += face_Y * sfx;
-            grad_by_upper += face_Y * sfy;
-            grad_bz_upper += face_Y * sfz;
+            grad_bx += face_Y * sfx;
+            grad_by += face_Y * sfy;
+            grad_bz += face_Y * sfz;
         }
     }
     double vol = volume[index];
-
-    sumYDiffError_output[index * 3 + 0] = sumYDiffError[index * 3 + 0] + (grad_bx_upper + grad_bx_lower);
-    sumYDiffError_output[index * 3 + 1] = sumYDiffError[index * 3 + 1] + (grad_by_upper + grad_by_lower);
-    sumYDiffError_output[index * 3 + 2] = sumYDiffError[index * 3 + 2] + (grad_bz_upper + grad_bz_lower);
-
-    // if (index == 0)
-    // {
-    //     printf("grad_bz_upper = %e\n", grad_bz_upper);
-    //     printf("grad_bz_lower = %e\n", grad_bz_lower);
-    //     printf("(grad_bz_upper + grad_bz_lower) = %e\n", (grad_bz_upper + grad_bz_lower));
-    // }
+    grady[index * 3 + 0] = grad_bx / vol;
+    grady[index * 3 + 1] = grad_by / vol;
+    grady[index * 3 + 2] = grad_bz / vol;
 }
 
-__global__ void fvc_grad_boundary_face_Y(int num_cells, int num_boundary_cells,
-                                         const int *boundary_cell_offset, const int *boundary_cell_id, const double *rhoD, const int *bouPermedIndex,
-                                         const double *boundary_face_vector, const double *boundary_species_init, double *boundary_species,
-                                         const double *volume, const double *sumYDiffError, double *sumYDiffError_output)
+__global__ void fvc_grad_boundary(int num_boundary_cells,
+        const int *boundary_cell_offset, const int *boundary_cell_id, const int *bouPermedIndex,
+        const double *boundary_face_vector, const double *boundary_species_init,
+        const double *volume, double *grady)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
     if (index >= num_boundary_cells)
@@ -106,62 +95,21 @@ __global__ void fvc_grad_boundary_face_Y(int num_cells, int num_boundary_cells,
         double sfz = boundary_face_vector[i * 3 + 2];
         int permute_index = bouPermedIndex[i];
         double face_Y = boundary_species_init[permute_index];
-        boundary_species[i] = face_Y;
         grad_bx += face_Y * sfx;
         grad_by += face_Y * sfy;
         grad_bz += face_Y * sfz;
-        // if (index == 0)
-        // {
-        //     printf("face_Y = %e\n", face_Y);
-        //     printf("sfz = %e\n", sfz);
-        //     printf("face_Y * sfz = %e\n", face_Y * sfz);
-        // }
     }
 
-    //// correct the boundary gradient
-    // double nx = boundary_face_vector[face_index * 3 + 0] / magSf[face_index];
-    // double ny = boundary_face_vector[face_index * 3 + 1] / magSf[face_index];
-    // double nz = boundary_face_vector[face_index * 3 + 2] / magSf[face_index];
-    // double sn_grad = 0;
-    // double grad_correction = sn_grad * volume[cell_index] - (nx * grad_bx + ny * grad_by + nz * grad_bz);
-    // grad_bx += nx * grad_correction;
-    // grad_by += ny * grad_correction;
-    // grad_bz += nz * grad_correction;
-
-    sumYDiffError_output[cell_index * 3 + 0] = sumYDiffError[cell_index * 3 + 0] + grad_bx;
-    sumYDiffError_output[cell_index * 3 + 1] = sumYDiffError[cell_index * 3 + 1] + grad_by;
-    sumYDiffError_output[cell_index * 3 + 2] = sumYDiffError[cell_index * 3 + 2] + grad_bz;
-}
-
-__global__ void sumError(int num_cells, const double *volume, const double *rhoD,
-                         const double *sumYDiffErrorTmp, double *sumYDiffError_output)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
-        return;
-
-    sumYDiffError_output[index * 3 + 0] = sumYDiffError_output[index * 3 + 0] + sumYDiffErrorTmp[index * 3 + 0] * rhoD[index];
-    sumYDiffError_output[index * 3 + 1] = sumYDiffError_output[index * 3 + 1] + sumYDiffErrorTmp[index * 3 + 1] * rhoD[index];
-    sumYDiffError_output[index * 3 + 2] = sumYDiffError_output[index * 3 + 2] + sumYDiffErrorTmp[index * 3 + 2] * rhoD[index];
-}
-
-__global__ void divide_vol(int num_cells, const double *volume,
-                           const double *sumYDiffError, double *sumYDiffError_output)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
-        return;
     double vol = volume[index];
-
-    sumYDiffError_output[index * 3 + 0] = sumYDiffError[index * 3 + 0] / vol;
-    sumYDiffError_output[index * 3 + 1] = sumYDiffError[index * 3 + 1] / vol;
-    sumYDiffError_output[index * 3 + 2] = sumYDiffError[index * 3 + 2] / vol;
+    grady[cell_index * 3 + 0] += grad_bx / vol;
+    grady[cell_index * 3 + 1] += grad_by / vol;
+    grady[cell_index * 3 + 2] += grad_bz / vol;
 }
 
-__global__ void correct_boundary_conditions_vec(int num_boundary_cells,
+__global__ void correct_boundary_conditions(int num_boundary_cells,
                                                 const int *boundary_cell_offset, const int *boundary_cell_id,
                                                 const double *boundary_sf, const double *mag_sf,
-                                                double *boundary_sumYDiffError, double *sumYDiffError)
+                                                const double *grady, double* boundary_grady)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
     if (index >= num_boundary_cells)
@@ -172,9 +120,9 @@ __global__ void correct_boundary_conditions_vec(int num_boundary_cells,
     int cell_index = boundary_cell_id[cell_offset];
 
     // initialize boundary_sumYDiffError
-    double sumYDiffError_x = sumYDiffError[cell_index * 3 + 0];
-    double sumYDiffError_y = sumYDiffError[cell_index * 3 + 1];
-    double sumYDiffError_z = sumYDiffError[cell_index * 3 + 2];
+    double grady_x = grady[cell_index * 3 + 0];
+    double grady_y = grady[cell_index * 3 + 1];
+    double grady_z = grady[cell_index * 3 + 2];
 
     for (int i = cell_offset; i < next_cell_offset; i++)
     {
@@ -182,15 +130,67 @@ __global__ void correct_boundary_conditions_vec(int num_boundary_cells,
         double n_y = boundary_sf[i * 3 + 1] / mag_sf[i];
         double n_z = boundary_sf[i * 3 + 2] / mag_sf[i];
         double sn_grad = 0;
-        double grad_correction = sn_grad - (n_x * sumYDiffError_x + n_y * sumYDiffError_y + n_z * sumYDiffError_z);
-        boundary_sumYDiffError[i * 3 + 0] = sumYDiffError_x + grad_correction * n_x;
-        boundary_sumYDiffError[i * 3 + 1] = sumYDiffError_y + grad_correction * n_y;
-        boundary_sumYDiffError[i * 3 + 2] = sumYDiffError_z + grad_correction * n_z;
+        double grad_correction = sn_grad - (n_x * grady_x + n_y * grady_y + n_z * grady_z);
+        boundary_grady[i * 3 + 0] = grady_x + grad_correction * n_x;
+        boundary_grady[i * 3 + 1] = grady_y + grad_correction * n_y;
+        boundary_grady[i * 3 + 2] = grady_z + grad_correction * n_z;
     }
 }
 
-__global__ void calculate_phiUc(int num_cells, const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
-                                const double *face_vector, const double *weight, const double *sumYDiffError, double *phiUc)
+__global__ void sumError_internal(int num_cells,
+        const double *hai, const double *rhoD, const double *y, const double *grady,
+        double *sum_hai_rhoD_grady, double *sum_rhoD_grady, double *sum_hai_y)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    sum_hai_rhoD_grady[index * 3 + 0] += hai[index] * rhoD[index] * grady[index * 3 + 0];
+    sum_hai_rhoD_grady[index * 3 + 1] += hai[index] * rhoD[index] * grady[index * 3 + 1];
+    sum_hai_rhoD_grady[index * 3 + 2] += hai[index] * rhoD[index] * grady[index * 3 + 2];
+
+    sum_rhoD_grady[index * 3 + 0] += rhoD[index] * grady[index * 3 + 0];
+    sum_rhoD_grady[index * 3 + 1] += rhoD[index] * grady[index * 3 + 1];
+    sum_rhoD_grady[index * 3 + 2] += rhoD[index] * grady[index * 3 + 2];
+
+    sum_hai_y[index] += hai[index] * y[index];
+}
+
+__global__ void sumError_boundary(int num_boundary_faces, const int *bouPermedIndex,
+        const double *boundary_hai, const double *boundary_rhoD, const double *boundary_y, const double *boundary_grady,
+        double *sum_boundary_hai_rhoD_grady, double *sum_boundary_rhoD_grady, double *sum_boundary_hai_y)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_boundary_faces)
+        return;
+
+    int permute_index = bouPermedIndex[index];
+    sum_boundary_hai_rhoD_grady[index * 3 + 0] += boundary_hai[permute_index] * boundary_rhoD[permute_index] * boundary_grady[index * 3 + 0];
+    sum_boundary_hai_rhoD_grady[index * 3 + 1] += boundary_hai[permute_index] * boundary_rhoD[permute_index] * boundary_grady[index * 3 + 1];
+    sum_boundary_hai_rhoD_grady[index * 3 + 2] += boundary_hai[permute_index] * boundary_rhoD[permute_index] * boundary_grady[index * 3 + 2];
+
+    sum_boundary_rhoD_grady[index * 3 + 0] += boundary_rhoD[permute_index] * boundary_grady[index * 3 + 0];
+    sum_boundary_rhoD_grady[index * 3 + 1] += boundary_rhoD[permute_index] * boundary_grady[index * 3 + 1];
+    sum_boundary_rhoD_grady[index * 3 + 2] += boundary_rhoD[permute_index] * boundary_grady[index * 3 + 2];
+
+    sum_boundary_hai_y[index] += boundary_hai[permute_index] * boundary_y[permute_index];
+}
+
+__global__ void calculate_hDiffCorrFlux(int num,
+        const double *sum_hai_rhoD_grady, const double *sum_rhoD_grady, const double *sum_hai_y, double *hDiffCorrFlux)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num)
+        return;
+
+    hDiffCorrFlux[index * 3 + 0] += (sum_hai_rhoD_grady[index * 3 + 0] - sum_hai_y[index] * sum_rhoD_grady[index * 3 + 0]);
+    hDiffCorrFlux[index * 3 + 1] += (sum_hai_rhoD_grady[index * 3 + 1] - sum_hai_y[index] * sum_rhoD_grady[index * 3 + 1]);
+    hDiffCorrFlux[index * 3 + 2] += (sum_hai_rhoD_grady[index * 3 + 2] - sum_hai_y[index] * sum_rhoD_grady[index * 3 + 2]);
+}
+
+__global__ void calculate_phiUc_internal(int num_cells,
+        const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
+        const double *face_vector, const double *weight, const double *sumYDiffError, double *phiUc)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
     if (index >= num_cells)
@@ -456,6 +456,55 @@ __global__ void fvm_laplacian_uncorrected_scalar_boundary(int num_cells, int num
     b_output[cell_index] = b_input[cell_index] + boundary_coeffs * sign;
 }
 
+__global__ void fvc_laplacian_internal(int num_cells,
+        const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
+        const double *alpha, const double *hai, const double* y,
+        const double *weight, const double *magsf, const double *distance,
+        const double* volume, double *output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    // A_csr has one more element in each row: itself
+    int row_index = csr_row_index[index];
+    int row_elements = csr_row_index[index + 1] - row_index;
+    int diag_index = csr_diag_index[index];
+    int neighbor_offset = csr_row_index[index] - index;
+
+    double own_vf = y[index];
+    double own_coeff = alpha[index] * hai[index];
+    double sum = 0;
+    // lower
+    for (int i = 0; i < diag_index; i++)
+    {
+        int neighbor_index = neighbor_offset + i;
+        int neighbor_cell_id = csr_col_index[i + row_index];
+        double w = weight[neighbor_index];
+        double nei_vf = y[neighbor_cell_id];
+        double nei_coeff = alpha[neighbor_cell_id] * hai[neighbor_cell_id];
+        double face_gamma = (1 - w) * own_coeff + w * nei_coeff;
+        double sngrad = distance[neighbor_index] * (own_vf - nei_vf);
+        double value = face_gamma * sngrad * magsf[neighbor_index];
+        sum -= value;
+    }
+    // upper
+    for (int i = diag_index + 1; i < row_elements; i++)
+    {
+        int neighbor_index = neighbor_offset + i - 1;
+        int neighbor_cell_id = csr_col_index[i + row_index];
+        double w = weight[neighbor_index];
+        double nei_vf = y[neighbor_cell_id];
+        double nei_coeff = alpha[neighbor_cell_id] * hai[neighbor_cell_id];
+        double face_gamma = w * own_coeff + (1 - w) * nei_coeff;
+        double sngrad = distance[neighbor_index] * (nei_vf - own_vf);
+        double value = face_gamma * sngrad * magsf[neighbor_index];
+        sum += value;
+    }
+    double vol = volume[index];
+    output[index] += sum / vol;
+}
+
 dfYEqn::dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile, const int inertIndex)
     : dataBase_(dataBase), inertIndex(inertIndex)
 {
@@ -466,6 +515,7 @@ dfYEqn::dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std
     num_boundary_cells = dataBase_.num_boundary_cells;
     num_boundary_faces = dataBase_.num_boundary_faces;
     cell_bytes = dataBase_.cell_bytes;
+    boundary_face_bytes = dataBase_.boundary_face_bytes;
 
     YSolverSet.resize(num_species - 1); // consider inert species
     for (auto &solver : YSolverSet)
@@ -482,22 +532,44 @@ dfYEqn::dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std
     checkCudaErrors(cudaMalloc((void **)&d_A_csr, (num_cells + num_faces) * (num_species - 1) * sizeof(double)));
     checkCudaErrors(cudaMalloc((void **)&d_b, cell_bytes * (num_species - 1)));
     checkCudaErrors(cudaMalloc((void **)&d_psi, cell_bytes * (num_species - 1)));
-    checkCudaErrors(cudaMalloc((void **)&d_sumYDiffError, 3 * cell_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_sumYDiffError_tmp, 3 * cell_bytes));
     checkCudaErrors(cudaMalloc((void **)&d_phiUc, num_faces * sizeof(double)));
-    checkCudaErrors(cudaMalloc((void **)&d_phiUc, num_faces * sizeof(double)));
-    checkCudaErrors(cudaMalloc((void **)&d_sumYDiffError_boundary, 3 * num_boundary_faces * sizeof(double)));
     checkCudaErrors(cudaMalloc((void **)&d_phiUc_boundary, num_boundary_faces * sizeof(double)));
     checkCudaErrors(cudaMalloc((void **)&d_mut_Sct, cell_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_boundary_mut_sct, boundary_face_bytes));
+
+    checkCudaErrors(cudaMalloc((void **)&d_boundary_Y, boundary_face_bytes));
+
+    checkCudaErrors(cudaMalloc((void **)&d_hai, cell_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_boundary_hai, boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_rhoD, cell_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_boundary_rhoD, boundary_face_bytes));
+
+    checkCudaErrors(cudaMalloc((void **)&d_sum_rhoD_grady, 3 * cell_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_sum_boundary_rhoD_grady, 3 * boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_sum_hai_rhoD_grady, 3 * cell_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_sum_boundary_hai_rhoD_grady, 3 * boundary_face_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_sum_hai_y, cell_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_sum_boundary_hai_y, boundary_face_bytes));
+
+    checkCudaErrors(cudaMalloc((void **)&d_grady, 3 * cell_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_boundary_grady, 3 * boundary_face_bytes));
+
+    checkCudaErrors(cudaMalloc((void **)&d_alpha, cell_bytes));
+
+    d_Y_old.resize(num_species);
+    for (size_t i = 0; i < num_species; i++)
+    {
+        checkCudaErrors(cudaMalloc((void **)&d_Y_old[i], cell_bytes));
+    }
 
     checkCudaErrors(cudaStreamCreate(&stream));
     // zeroGradient
     for (size_t i = 0; i < num_species; i++)
     {
-        checkCudaErrors(cudaMemsetAsync(dataBase_.d_internal_coeffs_Y_vector[i], 1, dataBase_.boundary_face_bytes, stream));
-        checkCudaErrors(cudaMemsetAsync(dataBase_.d_boundary_coeffs_Y_vector[i], 0, dataBase_.boundary_face_bytes, stream));
-        checkCudaErrors(cudaMemsetAsync(dataBase_.d_laplac_internal_coeffs_Y_vector[i], 0, dataBase_.boundary_face_bytes, stream));
-        checkCudaErrors(cudaMemsetAsync(dataBase_.d_laplac_boundary_coeffs_Y_vector[i], 0, dataBase_.boundary_face_bytes, stream));
+        checkCudaErrors(cudaMemsetAsync(dataBase_.d_internal_coeffs_Y_vector[i], 1, boundary_face_bytes, stream));
+        checkCudaErrors(cudaMemsetAsync(dataBase_.d_boundary_coeffs_Y_vector[i], 0, boundary_face_bytes, stream));
+        checkCudaErrors(cudaMemsetAsync(dataBase_.d_laplac_internal_coeffs_Y_vector[i], 0, boundary_face_bytes, stream));
+        checkCudaErrors(cudaMemsetAsync(dataBase_.d_laplac_boundary_coeffs_Y_vector[i], 0, boundary_face_bytes, stream));
     }
 }
 
@@ -508,50 +580,77 @@ void dfYEqn::upwindWeight()
     getUpwindWeight<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_faces, dataBase_.d_phi, dataBase_.d_weight_upwind);
 }
 
-void dfYEqn::correctVelocity(std::vector<double *> Y_old, std::vector<double *> boundary_Y_init, std::vector<const double *> rhoD_GPU)
+void dfYEqn::correctVelocity(std::vector<double *> Y_old, std::vector<double *> boundary_Y,
+        std::vector<const double *> hai, std::vector<double *> boundary_hai,
+        std::vector<const double *> rhoD, std::vector<double *> boundary_rhoD)
 {
     // initialize variables in each time step
-    checkCudaErrors(cudaMemsetAsync(d_sumYDiffError, 0, 3 * cell_bytes, stream));
-    checkCudaErrors(cudaMemsetAsync(d_phiUc, 0, num_faces * sizeof(double), stream));
+    checkCudaErrors(cudaMemsetAsync(d_sum_rhoD_grady, 0, 3 * cell_bytes, stream));
+    checkCudaErrors(cudaMemsetAsync(d_sum_boundary_rhoD_grady, 0, 3 * boundary_face_bytes, stream));
+    checkCudaErrors(cudaMemsetAsync(d_sum_hai_rhoD_grady, 0, 3 * cell_bytes, stream));
+    checkCudaErrors(cudaMemsetAsync(d_sum_boundary_hai_rhoD_grady, 0, 3 * boundary_face_bytes, stream));
+    checkCudaErrors(cudaMemsetAsync(d_sum_hai_y, 0, cell_bytes, stream));
+    checkCudaErrors(cudaMemsetAsync(d_sum_boundary_hai_y, 0, boundary_face_bytes, stream));
+    checkCudaErrors(cudaMemsetAsync(dataBase_.d_hDiffCorrFlux, 0, 3 * cell_bytes, stream));
+    checkCudaErrors(cudaMemsetAsync(dataBase_.d_boundary_hDiffCorrFlux, 0, 3 * boundary_face_bytes, stream));
 
     size_t threads_per_block, blocks_per_grid;
     for (size_t i = 0; i < num_species; ++i)
     {
-        checkCudaErrors(cudaMemsetAsync(d_sumYDiffError_tmp, 0, 3 * cell_bytes, stream));
-        checkCudaErrors(cudaMemcpyAsync(dataBase_.d_Y_old_vector[i], Y_old[i], cell_bytes, cudaMemcpyHostToDevice, stream));
-        checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_Y_init_vector[i], boundary_Y_init[i], dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream));
-        checkCudaErrors(cudaMemcpyAsync(dataBase_.d_rhoD_vector[i], rhoD_GPU[i], cell_bytes, cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(d_Y_old[i], Y_old[i], cell_bytes, cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(d_boundary_Y, boundary_Y[i], boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(d_hai, hai[i], cell_bytes, cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(d_boundary_hai, boundary_hai[i], boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(d_rhoD, rhoD[i], cell_bytes, cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(d_boundary_rhoD, boundary_rhoD[i], boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+
+        checkCudaErrors(cudaMemsetAsync(d_grady, 0, 3 * cell_bytes, stream));
+        checkCudaErrors(cudaMemsetAsync(d_boundary_grady, 0, 3 * boundary_face_bytes, stream));
 
         // launch cuda kernel
         threads_per_block = 1024;
         blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-        fvc_grad_internal_face_Y<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-                                                                                    d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, dataBase_.d_volume,
-                                                                                    dataBase_.d_face_vector, dataBase_.d_weight, dataBase_.d_Y_old_vector[i],
-                                                                                    dataBase_.d_rhoD_vector[i], d_sumYDiffError_tmp, d_sumYDiffError_tmp);
+        fvc_grad_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
+                d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
+                dataBase_.d_face_vector, dataBase_.d_weight, d_Y_old[i],
+                dataBase_.d_volume, d_grady);
+        blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
+        fvc_grad_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
+                dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_bouPermedIndex,
+                dataBase_.d_boundary_face_vector, d_boundary_Y,
+                dataBase_.d_volume, d_grady);
         blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-        fvc_grad_boundary_face_Y<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_cells, dataBase_.d_boundary_cell_offset,
-                                                                                    dataBase_.d_boundary_cell_id, dataBase_.d_rhoD_vector[i], dataBase_.d_bouPermedIndex,
-                                                                                    dataBase_.d_boundary_face_vector, dataBase_.d_boundary_Y_init_vector[i], dataBase_.d_boundary_Y_vector[i],
-                                                                                    dataBase_.d_volume, d_sumYDiffError_tmp, d_sumYDiffError_tmp);
+        correct_boundary_conditions<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
+                dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+                dataBase_.d_boundary_face_vector, dataBase_.d_boundary_face,
+                d_grady, d_boundary_grady);
+
         blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-        sumError<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, dataBase_.d_volume, dataBase_.d_rhoD_vector[i], d_sumYDiffError_tmp, d_sumYDiffError);
+        sumError_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
+                d_hai, d_rhoD, d_Y_old[i], d_grady,
+                d_sum_hai_rhoD_grady, d_sum_rhoD_grady, d_sum_hai_y);
+        blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block;
+        sumError_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_faces,
+                dataBase_.d_bouPermedIndex,
+                d_boundary_hai, d_boundary_rhoD, d_boundary_Y, d_boundary_grady,
+                d_sum_boundary_hai_rhoD_grady, d_sum_boundary_rhoD_grady, d_sum_boundary_hai_y);
     }
-    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    divide_vol<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, dataBase_.d_volume, d_sumYDiffError, d_sumYDiffError);
-
-    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-    correct_boundary_conditions_vec<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-                                                                                       dataBase_.d_boundary_face_vector, dataBase_.d_boundary_face, d_sumYDiffError_boundary, d_sumYDiffError);
 
     blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    calculate_phiUc<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-                                                                       d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, dataBase_.d_face_vector,
-                                                                       dataBase_.d_weight, d_sumYDiffError, d_phiUc);
+    calculate_phiUc_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
+            d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
+            dataBase_.d_face_vector, dataBase_.d_weight, d_sum_rhoD_grady, d_phiUc);
+    blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block;
+    calculate_phiUc_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_faces,
+            dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+            dataBase_.d_boundary_face_vector, d_sum_boundary_rhoD_grady, d_phiUc_boundary);
 
+    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    calculate_hDiffCorrFlux<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
+            d_sum_hai_rhoD_grady, d_sum_rhoD_grady, d_sum_hai_y, dataBase_.d_hDiffCorrFlux);
     blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block;
-    calculate_phiUc_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_faces, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-                                                                                dataBase_.d_boundary_face_vector, d_sumYDiffError_boundary, d_phiUc_boundary);
+    calculate_hDiffCorrFlux<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_faces,
+            d_sum_boundary_hai_rhoD_grady, d_sum_boundary_rhoD_grady, d_sum_boundary_hai_y, dataBase_.d_boundary_hDiffCorrFlux);
 }
 
 void dfYEqn::fvm_ddt()
@@ -572,11 +671,11 @@ void dfYEqn::fvm_ddt()
         threads_per_block = 1024;
         blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
         fvm_ddt_kernel_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces, dataBase_.rdelta_t,
-                                                                                 d_A_csr_row_index, d_A_csr_diag_index,
-                                                                                 dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, dataBase_.d_Y_old_vector[i],
-                                                                                 d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
-                                                                                 d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
-                                                                                 d_psi + mtxIndex * num_cells);
+                d_A_csr_row_index, d_A_csr_diag_index,
+                dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, d_Y_old[i],
+                d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
+                d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
+                d_psi + mtxIndex * num_cells);
         ++mtxIndex;
     }
 }
@@ -636,32 +735,34 @@ void dfYEqn::fvm_div_phiUc()
     }
 }
 
-void dfYEqn::fvm_laplacian(double *mut_Sct, double *boundary_mut_Sct, std::vector<double *> boundary_rhoD)
+void dfYEqn::fvm_laplacian(const double *mut_Sct, const double *boundary_mut_Sct,
+        std::vector<const double *> rhoD, std::vector<double *> boundary_rhoD)
 {
     size_t threads_per_block, blocks_per_grid;
     int mtxIndex = 0;
 
-    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_mut_sct, boundary_mut_Sct, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_boundary_mut_sct, boundary_mut_Sct, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
     checkCudaErrors(cudaMemcpyAsync(d_mut_Sct, mut_Sct, cell_bytes, cudaMemcpyHostToDevice, stream));
 
     for (size_t i = 0; i < num_species; ++i)
     {
         if (i == inertIndex)
             continue;
-        checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_rhoD_vector[i], boundary_rhoD[i], dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(d_rhoD, rhoD[i], cell_bytes, cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(d_boundary_rhoD, boundary_rhoD[i], boundary_face_bytes, cudaMemcpyHostToDevice, stream));
 
         // launch cuda kernel
         threads_per_block = 1024;
         blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
         fvm_laplacian_uncorrected_scalar_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-                                                                                                     d_mut_Sct, dataBase_.d_rhoD_vector[i], dataBase_.d_weight, dataBase_.d_face,
+                                                                                                     d_mut_Sct, d_rhoD, dataBase_.d_weight, dataBase_.d_face,
                                                                                                      dataBase_.d_deltaCoeffs, -1., d_A_csr + mtxIndex * (num_cells + num_faces),
                                                                                                      d_A_csr + mtxIndex * (num_cells + num_faces));
         blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
         fvm_laplacian_uncorrected_scalar_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_cells,
                                                                                                      d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset,
-                                                                                                     dataBase_.d_boundary_cell_id, dataBase_.d_boundary_rhoD_vector[i],
-                                                                                                     dataBase_.d_boundary_mut_sct, dataBase_.d_boundary_face, dataBase_.d_bouPermedIndex,
+                                                                                                     dataBase_.d_boundary_cell_id, d_boundary_rhoD,
+                                                                                                     d_boundary_mut_sct, dataBase_.d_boundary_face, dataBase_.d_bouPermedIndex,
                                                                                                      dataBase_.d_laplac_internal_coeffs_Y_vector[i], dataBase_.d_laplac_boundary_coeffs_Y_vector[i], -1.,
                                                                                                      d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
                                                                                                      d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells);
@@ -759,6 +860,24 @@ void dfYEqn::updatePsi(double *Psi, int speciesIndex)
     memcpy(Psi, h_psi + speciesIndex * num_cells, cell_bytes);
 }
 
+void dfYEqn::compute_diffAlphaD(const double *alpha, std::vector<const double *> hai)
+{
+    checkCudaErrors(cudaMemsetAsync(dataBase_.d_diffAlphaD, 0, cell_bytes, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_alpha, alpha, cell_bytes, cudaMemcpyHostToDevice, stream));
+    size_t threads_per_block, blocks_per_grid;
+    for (size_t i = 0; i < num_species; ++i)
+    {
+        checkCudaErrors(cudaMemcpyAsync(d_hai, hai[i], cell_bytes, cudaMemcpyHostToDevice, stream));
+        threads_per_block = 1024;
+        blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+        fvc_laplacian_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
+                d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
+                d_alpha, d_hai, d_Y_old[i],
+                dataBase_.d_weight, dataBase_.d_face, dataBase_.d_deltaCoeffs,
+                dataBase_.d_volume, dataBase_.d_diffAlphaD);
+    }
+}
+
 dfYEqn::~dfYEqn()
 {
-}
\ No newline at end of file
+}

From b1580807f3e4e494d1724006413009d00a0b5067 Mon Sep 17 00:00:00 2001
From: STwangyingrui <wangyingrui@sensetime.com>
Date: Thu, 8 Jun 2023 22:14:48 +0800
Subject: [PATCH 2/6] refactors: same stream cross equations,
 initializeTimeStep for each equation, profile the CPU
 correctBoundaryConditions

---
 applications/solvers/dfLowMachFoam/EEqn.H       |  4 +++-
 applications/solvers/dfLowMachFoam/YEqn.H       |  1 +
 .../solvers/dfLowMachFoam/dfLowMachFoam.C       |  3 +++
 applications/solvers/dfLowMachFoam/rhoEqn.H     |  1 +
 src_gpu/dfEEqn.H                                |  2 ++
 src_gpu/dfEEqn.cu                               | 10 ++++++++--
 src_gpu/dfMatrixDataBase.H                      | 14 +++++++-------
 src_gpu/dfRhoEqn.H                              |  5 ++++-
 src_gpu/dfRhoEqn.cu                             | 10 +++++++---
 src_gpu/dfUEqn.cu                               |  4 ++--
 src_gpu/dfYEqn.H                                |  2 ++
 src_gpu/dfYEqn.cu                               | 17 +++++++++++------
 12 files changed, 51 insertions(+), 22 deletions(-)

diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H
index 764b6be10..d84da50a1 100644
--- a/applications/solvers/dfLowMachFoam/EEqn.H
+++ b/applications/solvers/dfLowMachFoam/EEqn.H
@@ -58,6 +58,8 @@
     end1 = std::clock();
     time_monitor_EEqn_mtxAssembly_CPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
+    EEqn_GPU.initializeTimeStep();
+
     // prepare data on GPU
     start1 = std::clock();
     EEqn_GPU.prepare_data(&he.oldTime()[0], &K[0], &K.oldTime()[0], &alphaEff[0],
@@ -67,7 +69,6 @@
     time_monitor_EEqn_mtxAssembly_GPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
     start1 = std::clock();
-    EEqn_GPU.initializeTimeStep();
     EEqn_GPU.fvm_ddt();
     EEqn_GPU.fvm_div();
     EEqn_GPU.fvm_laplacian();
@@ -99,6 +100,7 @@
     he.correctBoundaryConditions();
     end1 = std::clock();
     time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_mtxAssembly_CPU_CorrectBC += double(end1 - start1) / double(CLOCKS_PER_SEC);
     time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
 #endif
 }
diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H
index baa12633e..d8c69d20b 100644
--- a/applications/solvers/dfLowMachFoam/YEqn.H
+++ b/applications/solvers/dfLowMachFoam/YEqn.H
@@ -80,6 +80,7 @@ time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
     end2 = std::clock();
     time_monitor_YEqn_mtxAssembly_CPU_Prepare += double(end2 - start2) / double(CLOCKS_PER_SEC);
 
+    YEqn_GPU.initializeTimeStep();
     YEqn_GPU.upwindWeight();
     YEqn_GPU.correctVelocity(Y_old, boundary_Y, hai, boundary_hai, rhoD, boundary_rhoD);
     YEqn_GPU.fvm_ddt();
diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
index c2a62797a..6f54e0a95 100644
--- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
+++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
@@ -108,6 +108,7 @@ int main(int argc, char *argv[])
     double time_monitor_EEqn_mtxAssembly_CPU_Prepare;
     double time_monitor_EEqn_mtxAssembly_GPU_Prepare;
     double time_monitor_EEqn_mtxAssembly_GPU_Run;
+    double time_monitor_EEqn_mtxAssembly_CPU_CorrectBC;
     double time_monitor_EEqn_Solve=0;
     double time_monitor_EEqn_sum=0;
     double time_monitor_YEqn=0;
@@ -259,6 +260,7 @@ int main(int argc, char *argv[])
         Info<< "EEqn assamble(CPU prepare) = " << time_monitor_EEqn_mtxAssembly_CPU_Prepare << " s" << endl;
         Info<< "EEqn assamble(GPU prepare) = " << time_monitor_EEqn_mtxAssembly_GPU_Prepare << " s" << endl;
         Info<< "EEqn assamble(GPU run)     = " << time_monitor_EEqn_mtxAssembly_GPU_Run << " s" << endl;
+        Info<< "EEqn assamble(CPU correct) = " << time_monitor_EEqn_mtxAssembly_CPU_CorrectBC << " s" << endl;
         Info<< "EEqn Time solve            = " << time_monitor_EEqn_Solve << " s" << endl;
         Info<< "YEqn Time                  = " << time_monitor_YEqn << " s" << endl;
         Info<< "YEqn Time assamble Mtx     = " << time_monitor_YEqn_mtxAssembly << " s" << endl;
@@ -278,6 +280,7 @@ int main(int argc, char *argv[])
         time_monitor_EEqn_mtxAssembly_CPU_Prepare = 0;
         time_monitor_EEqn_mtxAssembly_GPU_Prepare = 0;
         time_monitor_EEqn_mtxAssembly_GPU_Run = 0;
+        time_monitor_EEqn_mtxAssembly_CPU_CorrectBC = 0;
         time_monitor_EEqn_Solve = 0;
         time_monitor_chem = 0;
         time_monitor_Y = 0;
diff --git a/applications/solvers/dfLowMachFoam/rhoEqn.H b/applications/solvers/dfLowMachFoam/rhoEqn.H
index 1c0652dca..1d0793384 100644
--- a/applications/solvers/dfLowMachFoam/rhoEqn.H
+++ b/applications/solvers/dfLowMachFoam/rhoEqn.H
@@ -40,6 +40,7 @@ Description
         memcpy(boundary_phi_init+offset, &patchFlux[0], patchSize*sizeof(double));
         offset += patchSize;
     }
+    rhoEqn_GPU.initializeTimeStep();
     rhoEqn_GPU.fvc_div(&phi[0], boundary_phi_init);
     rhoEqn_GPU.fvm_ddt(&rho.oldTime()[0]);
     rhoEqn_GPU.updatePsi(&rho.primitiveFieldRef()[0]);
diff --git a/src_gpu/dfEEqn.H b/src_gpu/dfEEqn.H
index 732e32ca7..f15a44a49 100644
--- a/src_gpu/dfEEqn.H
+++ b/src_gpu/dfEEqn.H
@@ -9,6 +9,8 @@ class dfEEqn
 private:
     dfMatrixDataBase &dataBase_;
     cudaStream_t stream;
+    //cudaEvent_t event;
+
     AmgXSolver *ESolver = nullptr;
     int num_iteration;
 
diff --git a/src_gpu/dfEEqn.cu b/src_gpu/dfEEqn.cu
index 48d6a19dc..f1aa3d14e 100644
--- a/src_gpu/dfEEqn.cu
+++ b/src_gpu/dfEEqn.cu
@@ -439,6 +439,9 @@ dfEEqn::dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std
 {
     ESolver = new AmgXSolver(modeStr, cfgFile);
 
+    stream = dataBase_.stream;
+    //checkCudaErrors(cudaEventCreate(&event));
+
     num_cells = dataBase_.num_cells;
     cell_bytes = dataBase_.cell_bytes;
     num_faces = dataBase_.num_faces;
@@ -479,8 +482,6 @@ dfEEqn::dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std
     checkCudaErrors(cudaMalloc((void **)&d_value_boundary_coeffs, boundary_face_bytes));
     checkCudaErrors(cudaMalloc((void **)&d_gradient_internal_coeffs, boundary_face_bytes));
     checkCudaErrors(cudaMalloc((void **)&d_gradient_boundary_coeffs, boundary_face_bytes));
-
-    checkCudaErrors(cudaStreamCreate(&stream));
 }
 
 void dfEEqn::prepare_data(const double *he_old, const double *K, const double *K_old, const double *alphaEff,
@@ -688,6 +689,7 @@ void dfEEqn::solve()
     num_iteration++;
 
     checkCudaErrors(cudaMemcpyAsync(h_he_new, d_he_old, cell_bytes, cudaMemcpyDeviceToHost, stream));
+    //checkCudaErrors(cudaEventRecord(event, stream));
     // checkCudaErrors(cudaStreamSynchronize(stream));
     // for (size_t i = 0; i < num_cells; i++)
     //     fprintf(stderr, "h_he_after[%d]: %.16lf\n", i, h_he_new[i]);
@@ -700,6 +702,8 @@ void dfEEqn::sync()
 
 void dfEEqn::updatePsi(double *Psi)
 {
+    checkCudaErrors(cudaStreamSynchronize(stream));
+    //checkCudaErrors(cudaEventSynchronize(event));
     memcpy(Psi, h_he_new, cell_bytes);
 }
 
@@ -732,4 +736,6 @@ dfEEqn::~dfEEqn()
     checkCudaErrors(cudaFree(d_value_boundary_coeffs));
     checkCudaErrors(cudaFree(d_gradient_internal_coeffs));
     checkCudaErrors(cudaFree(d_gradient_boundary_coeffs));
+
+    //checkCudaErrors(cudaEventDestroy(event));
 }
diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H
index 778d7c56c..ed9915438 100644
--- a/src_gpu/dfMatrixDataBase.H
+++ b/src_gpu/dfMatrixDataBase.H
@@ -43,6 +43,9 @@ inline void checkVectorEqual(int count, const double* basevec, double* vec, doub
 
 struct dfMatrixDataBase
 {
+    // - cuda resource
+    cudaStream_t stream;
+
     // - number of cell size
     int num_cells;
     // - number of face size
@@ -185,11 +188,6 @@ struct dfMatrixDataBase
     // double *h_A_csr = nullptr, *h_b = nullptr, *h_psi = nullptr;
     // double *d_A_csr = nullptr, *d_b = nullptr, *d_psi = nullptr;
 
-    /**
-     * @brief cuda functions
-     */
-    cudaStream_t stream;
-
     int num_iteration;
 
     double time_monitor_CPU;
@@ -208,6 +206,9 @@ struct dfMatrixDataBase
     : num_cells(num_cells), num_faces(num_surfaces*2), num_surfaces(num_surfaces), num_species(num_species), num_iteration(0),
       num_boundary_faces(num_boundary_faces), h_volume(volume)
     {
+        // create cuda stream
+        checkCudaErrors(cudaStreamCreate(&stream));
+
         // allocate field pointer in pin memory
         cudaMallocHost(&h_phi_init, num_faces * sizeof(double));
         cudaMallocHost(&h_rho_old, num_cells * sizeof(double));
@@ -573,8 +574,6 @@ struct dfMatrixDataBase
 
         fprintf(stderr, "Total bytes malloc on GPU: %.2fMB\n", total_bytes * 1.0 / 1024 / 1024);
 
-        checkCudaErrors(cudaStreamCreate(&stream));
-
         checkCudaErrors(cudaMemcpyAsync(d_A_csr_row_index, h_A_csr_row_index, csr_row_index_bytes, cudaMemcpyHostToDevice, stream));
         checkCudaErrors(cudaMemcpyAsync(d_A_csr_col_index, h_A_csr_col_index, csr_col_index_bytes, cudaMemcpyHostToDevice, stream));
         checkCudaErrors(cudaMemcpyAsync(d_A_csr_diag_index, h_A_csr_diag_index, cell_index_bytes, cudaMemcpyHostToDevice, stream));
@@ -594,6 +593,7 @@ struct dfMatrixDataBase
         checkCudaErrors(cudaMemcpyAsync(d_bouPermedIndex, boundPermutationList.data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream));
         checkCudaErrors(cudaMemcpyAsync(d_tmpPermutatedList, tmpPermutatedList.data(), csr_col_index_bytes, cudaMemcpyHostToDevice, stream));
     };
+
     ~dfMatrixDataBase(){
         std::cout << "Destructor called." << std::endl;
         // TODO: free pointers
diff --git a/src_gpu/dfRhoEqn.H b/src_gpu/dfRhoEqn.H
index 35885a517..75d40225d 100644
--- a/src_gpu/dfRhoEqn.H
+++ b/src_gpu/dfRhoEqn.H
@@ -16,6 +16,7 @@ class dfRhoEqn
 private:
     dfMatrixDataBase& dataBase_;
     cudaStream_t stream;
+
     int num_iteration;
     double time_monitor_CPU;
     double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test;
@@ -33,6 +34,8 @@ public:
     dfRhoEqn(dfMatrixDataBase& dataBase);
     ~dfRhoEqn();
 
+    void initializeTimeStep();
+
     void checkValue(bool print);
 
     void fvc_div(double *phi, double *boundary_phi_init);
@@ -40,4 +43,4 @@ public:
     void fvm_ddt(double *rho_old);
 
     void updatePsi(double* Psi);
-};
\ No newline at end of file
+};
diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu/dfRhoEqn.cu
index fc933f95b..a18c79a03 100644
--- a/src_gpu/dfRhoEqn.cu
+++ b/src_gpu/dfRhoEqn.cu
@@ -82,6 +82,7 @@ __global__ void fvm_ddt_rho(int num_cells, const double rdelta_t,
 dfRhoEqn::dfRhoEqn(dfMatrixDataBase &dataBase)
     : dataBase_(dataBase)
 {
+    stream = dataBase_.stream;
     num_cells = dataBase_.num_cells;
     cell_bytes = dataBase_.cell_bytes;
     num_surfaces = dataBase_.num_surfaces;
@@ -94,13 +95,16 @@ dfRhoEqn::dfRhoEqn(dfMatrixDataBase &dataBase)
 
     checkCudaErrors(cudaMalloc((void **)&d_b, cell_bytes));
     checkCudaErrors(cudaMalloc((void **)&d_psi, cell_bytes));
+}
 
-    checkCudaErrors(cudaStreamCreate(&stream));
+void dfRhoEqn::initializeTimeStep()
+{
+    // initialize matrix value
+    checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_bytes, stream));
 }
 
 void dfRhoEqn::fvc_div(double *phi, double *boundary_phi_init)
 {
-    checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_bytes, stream));
     clock_t start = std::clock();
     memcpy(dataBase_.h_phi_init, phi, num_surfaces * sizeof(double));
 
@@ -138,4 +142,4 @@ void dfRhoEqn::updatePsi(double *Psi)
     for (size_t i = 0; i < num_cells; i++)
         Psi[i] = h_psi[i];
 }
-dfRhoEqn::~dfRhoEqn(){}
\ No newline at end of file
+dfRhoEqn::~dfRhoEqn(){}
diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu
index 7eb7cbe31..a82d85c76 100644
--- a/src_gpu/dfUEqn.cu
+++ b/src_gpu/dfUEqn.cu
@@ -1005,6 +1005,8 @@ __global__ void ueqn_update_BoundaryCoeffs_kernel(int num_boundary_faces, const
 dfUEqn::dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile)
     : dataBase_(dataBase)
 {
+    stream = dataBase_.stream;
+
     UxSolver = new AmgXSolver(modeStr, cfgFile);
     UySolver = new AmgXSolver(modeStr, cfgFile);
     UzSolver = new AmgXSolver(modeStr, cfgFile);
@@ -1034,8 +1036,6 @@ dfUEqn::dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std
     checkCudaErrors(cudaMalloc((void **)&d_A, cell_bytes));
     checkCudaErrors(cudaMalloc((void **)&d_ueqn_internal_coeffs, cell_vec_bytes));
     checkCudaErrors(cudaMalloc((void **)&d_ueqn_boundary_coeffs, cell_vec_bytes));
-
-    checkCudaErrors(cudaStreamCreate(&stream));
 }
 
 void dfUEqn::fvm_ddt(double *vector_old)
diff --git a/src_gpu/dfYEqn.H b/src_gpu/dfYEqn.H
index 6e2a408e4..2903d5934 100644
--- a/src_gpu/dfYEqn.H
+++ b/src_gpu/dfYEqn.H
@@ -45,6 +45,8 @@ public:
 
     ~dfYEqn();
 
+    void initializeTimeStep();
+
     void checkValue(bool print, char *filename);
 
     void correctVelocity(std::vector<double *> Y_old, std::vector<double *> boundary_Y,
diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu
index a9ede60ec..0dbfdba8c 100644
--- a/src_gpu/dfYEqn.cu
+++ b/src_gpu/dfYEqn.cu
@@ -508,6 +508,7 @@ __global__ void fvc_laplacian_internal(int num_cells,
 dfYEqn::dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile, const int inertIndex)
     : dataBase_(dataBase), inertIndex(inertIndex)
 {
+    stream = dataBase_.stream;
     num_species = dataBase_.num_species;
     num_cells = dataBase_.num_cells;
     num_faces = dataBase_.num_faces;
@@ -562,7 +563,6 @@ dfYEqn::dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std
         checkCudaErrors(cudaMalloc((void **)&d_Y_old[i], cell_bytes));
     }
 
-    checkCudaErrors(cudaStreamCreate(&stream));
     // zeroGradient
     for (size_t i = 0; i < num_species; i++)
     {
@@ -573,6 +573,16 @@ dfYEqn::dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std
     }
 }
 
+void dfYEqn::initializeTimeStep()
+{
+    // consider inert species
+    // initialize matrix value
+    checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, (num_cells + num_faces) * (num_species - 1) * sizeof(double), stream));
+    checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_bytes * (num_species - 1), stream));
+    // initialize variables in each time step
+    checkCudaErrors(cudaMemsetAsync(d_psi, 0, cell_bytes * (num_species - 1), stream));
+}
+
 void dfYEqn::upwindWeight()
 {
     size_t threads_per_block = 1024;
@@ -655,11 +665,6 @@ void dfYEqn::correctVelocity(std::vector<double *> Y_old, std::vector<double *>
 
 void dfYEqn::fvm_ddt()
 {
-    // initialize variables in each time step
-    checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, (num_cells + num_faces) * (num_species - 1) * sizeof(double), stream)); // consider inert species
-    checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_bytes * (num_species - 1), stream));
-    checkCudaErrors(cudaMemsetAsync(d_psi, 0, cell_bytes * (num_species - 1), stream));
-
     size_t threads_per_block, blocks_per_grid;
     int mtxIndex = 0;
     for (size_t i = 0; i < num_species; ++i)

From 9316c680d0a3398077d8fde3c6d47086f09c2e2f Mon Sep 17 00:00:00 2001
From: STwangyingrui <wangyingrui@sensetime.com>
Date: Fri, 9 Jun 2023 00:05:46 +0800
Subject: [PATCH 3/6] refactor YEqn: minimize memcpy for species of rhoD and
 hai

---
 applications/solvers/dfLowMachFoam/YEqn.H |   5 +-
 src_gpu/dfMatrixDataBase.H                |  11 +-
 src_gpu/dfYEqn.H                          |  11 +-
 src_gpu/dfYEqn.cu                         | 181 ++++++++++------------
 4 files changed, 89 insertions(+), 119 deletions(-)

diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H
index d8c69d20b..2e106fca0 100644
--- a/applications/solvers/dfLowMachFoam/YEqn.H
+++ b/applications/solvers/dfLowMachFoam/YEqn.H
@@ -82,13 +82,12 @@ time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
     YEqn_GPU.initializeTimeStep();
     YEqn_GPU.upwindWeight();
-    YEqn_GPU.correctVelocity(Y_old, boundary_Y, hai, boundary_hai, rhoD, boundary_rhoD);
+    YEqn_GPU.fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(Y_old, boundary_Y,
+            hai, boundary_hai, rhoD, boundary_rhoD, &mut_sct[0], boundary_mutsct, &thermo.alpha()[0]);
     YEqn_GPU.fvm_ddt();
     YEqn_GPU.fvm_div_phi();
     YEqn_GPU.fvm_div_phiUc();
-    YEqn_GPU.fvm_laplacian(&mut_sct[0], boundary_mutsct, rhoD, boundary_rhoD);
 
-    YEqn_GPU.compute_diffAlphaD(&thermo.alpha()[0], hai);
     end1 = std::clock();
     time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
     time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H
index ed9915438..81c82a9d8 100644
--- a/src_gpu/dfMatrixDataBase.H
+++ b/src_gpu/dfMatrixDataBase.H
@@ -90,9 +90,8 @@ struct dfMatrixDataBase
     // - the device pointer to rho_new, rho_old, velocity_old, pressure and volume list
     double *d_rho_new = nullptr, *d_rho_old = nullptr, *d_velocity_old = nullptr, 
     *d_pressure = nullptr, *d_volume = nullptr;
-    // - the device pointer to Y_new and Y_old
-    std::vector<double*> d_Y_new_vector;
-    std::vector<double*> d_Y_old_vector;
+    // - the device pointer to Y(vector Yi)
+    std::vector<double*> d_Y;
     // - the device pointer to the pre-permutated and post-permutated interpolation weight list
     double *d_weight_init = nullptr, *d_weight = nullptr;
     double *d_weight_upwind = nullptr;
@@ -479,8 +478,7 @@ struct dfMatrixDataBase
         checkCudaErrors(cudaMalloc((void**)&d_A_csr_diag_index, cell_index_bytes));
         total_bytes += (csr_row_index_bytes + csr_col_index_bytes + cell_index_bytes);
 
-        d_Y_new_vector.resize(num_species);
-        d_Y_old_vector.resize(num_species);
+        d_Y.resize(num_species);
         d_rhoD_vector.resize(num_species);
         d_boundary_Y_vector.resize(num_species);
         d_boundary_Y_init_vector.resize(num_species);
@@ -491,8 +489,7 @@ struct dfMatrixDataBase
         d_boundary_rhoD_vector.resize(num_species);
 
         for (size_t i = 0; i < num_species; ++i){
-            checkCudaErrors(cudaMalloc((void**)&d_Y_new_vector[i], cell_bytes));
-            checkCudaErrors(cudaMalloc((void**)&d_Y_old_vector[i], cell_bytes));
+            checkCudaErrors(cudaMalloc((void**)&d_Y[i], cell_bytes));
             checkCudaErrors(cudaMalloc((void**)&d_rhoD_vector[i], cell_bytes));
         }
         checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_bytes));
diff --git a/src_gpu/dfYEqn.H b/src_gpu/dfYEqn.H
index 2903d5934..f41cd9799 100644
--- a/src_gpu/dfYEqn.H
+++ b/src_gpu/dfYEqn.H
@@ -29,7 +29,6 @@ private:
     double *d_mut_Sct = nullptr;
     double *d_boundary_mut_sct = nullptr;
 
-    std::vector<double*> d_Y_old;
     double *d_boundary_Y = nullptr;
     double *d_hai, *d_boundary_hai = nullptr;
     double *d_rhoD, *d_boundary_rhoD = nullptr;
@@ -49,9 +48,10 @@ public:
 
     void checkValue(bool print, char *filename);
 
-    void correctVelocity(std::vector<double *> Y_old, std::vector<double *> boundary_Y,
+    void fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(std::vector<double *> Y_old, std::vector<double *> boundary_Y,
         std::vector<const double *> hai, std::vector<double *> boundary_hai,
-        std::vector<const double *> rhoD, std::vector<double *> boundary_rhoD);
+        std::vector<const double *> rhoD, std::vector<double *> boundary_rhoD,
+        const double *mut_Sct, const double *boundary_mut_Sct, const double *alpha);
 
     void upwindWeight();
 
@@ -61,11 +61,6 @@ public:
 
     void fvm_div_phiUc();
 
-    void fvm_laplacian(const double *mut_Sct, const double *boundary_mut_Sct,
-            std::vector<const double*>rhoD, std::vector<double*>boundary_rhoD);
-
-    void compute_diffAlphaD(const double *alpha, std::vector<const double *> hai);
-
     void solve();
 
     void updatePsi(double *Psi, int speciesIndex);
diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu
index 0dbfdba8c..41987ee26 100644
--- a/src_gpu/dfYEqn.cu
+++ b/src_gpu/dfYEqn.cu
@@ -420,12 +420,12 @@ __global__ void fvm_laplacian_uncorrected_scalar_internal(int num_cells,
     A_csr_output[row_index + diag_index] = A_csr_input[row_index + diag_index] + sum_diag * sign;
 }
 
-__global__ void fvm_laplacian_uncorrected_scalar_boundary(int num_cells, int num_boundary_cells,
-                                                          const int *csr_row_index, const int *csr_diag_index, const int *boundary_cell_offset,
-                                                          const int *boundary_cell_id, const double *boundary_scalar0, const double *boundary_scalar1,
-                                                          const double *boundary_magsf, const int *bouPermedIndex,
-                                                          const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs,
-                                                          const double sign, const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output)
+__global__ void fvm_laplacian_uncorrected_scalar_boundary(int num_boundary_cells,
+        const int *csr_row_index, const int *csr_diag_index, const int *boundary_cell_offset,
+        const int *boundary_cell_id, const double *boundary_scalar0, const double *boundary_scalar1,
+        const double *boundary_magsf, const int *bouPermedIndex,
+        const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs,
+        const double sign, const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
     if (index >= num_boundary_cells)
@@ -557,12 +557,6 @@ dfYEqn::dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std
 
     checkCudaErrors(cudaMalloc((void **)&d_alpha, cell_bytes));
 
-    d_Y_old.resize(num_species);
-    for (size_t i = 0; i < num_species; i++)
-    {
-        checkCudaErrors(cudaMalloc((void **)&d_Y_old[i], cell_bytes));
-    }
-
     // zeroGradient
     for (size_t i = 0; i < num_species; i++)
     {
@@ -590,11 +584,16 @@ void dfYEqn::upwindWeight()
     getUpwindWeight<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_faces, dataBase_.d_phi, dataBase_.d_weight_upwind);
 }
 
-void dfYEqn::correctVelocity(std::vector<double *> Y_old, std::vector<double *> boundary_Y,
+void dfYEqn::fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(std::vector<double *> Y_old, std::vector<double *> boundary_Y,
         std::vector<const double *> hai, std::vector<double *> boundary_hai,
-        std::vector<const double *> rhoD, std::vector<double *> boundary_rhoD)
+        std::vector<const double *> rhoD, std::vector<double *> boundary_rhoD,
+        const double *mut_Sct, const double *boundary_mut_Sct, const double *alpha)
 {
     // initialize variables in each time step
+    checkCudaErrors(cudaMemcpyAsync(d_boundary_mut_sct, boundary_mut_Sct, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_mut_Sct, mut_Sct, cell_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_alpha, alpha, cell_bytes, cudaMemcpyHostToDevice, stream));
+
     checkCudaErrors(cudaMemsetAsync(d_sum_rhoD_grady, 0, 3 * cell_bytes, stream));
     checkCudaErrors(cudaMemsetAsync(d_sum_boundary_rhoD_grady, 0, 3 * boundary_face_bytes, stream));
     checkCudaErrors(cudaMemsetAsync(d_sum_hai_rhoD_grady, 0, 3 * cell_bytes, stream));
@@ -603,11 +602,13 @@ void dfYEqn::correctVelocity(std::vector<double *> Y_old, std::vector<double *>
     checkCudaErrors(cudaMemsetAsync(d_sum_boundary_hai_y, 0, boundary_face_bytes, stream));
     checkCudaErrors(cudaMemsetAsync(dataBase_.d_hDiffCorrFlux, 0, 3 * cell_bytes, stream));
     checkCudaErrors(cudaMemsetAsync(dataBase_.d_boundary_hDiffCorrFlux, 0, 3 * boundary_face_bytes, stream));
+    checkCudaErrors(cudaMemsetAsync(dataBase_.d_diffAlphaD, 0, cell_bytes, stream));
 
     size_t threads_per_block, blocks_per_grid;
+    int mtxIndex = 0;
     for (size_t i = 0; i < num_species; ++i)
     {
-        checkCudaErrors(cudaMemcpyAsync(d_Y_old[i], Y_old[i], cell_bytes, cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(dataBase_.d_Y[i], Y_old[i], cell_bytes, cudaMemcpyHostToDevice, stream));
         checkCudaErrors(cudaMemcpyAsync(d_boundary_Y, boundary_Y[i], boundary_face_bytes, cudaMemcpyHostToDevice, stream));
         checkCudaErrors(cudaMemcpyAsync(d_hai, hai[i], cell_bytes, cudaMemcpyHostToDevice, stream));
         checkCudaErrors(cudaMemcpyAsync(d_boundary_hai, boundary_hai[i], boundary_face_bytes, cudaMemcpyHostToDevice, stream));
@@ -617,12 +618,12 @@ void dfYEqn::correctVelocity(std::vector<double *> Y_old, std::vector<double *>
         checkCudaErrors(cudaMemsetAsync(d_grady, 0, 3 * cell_bytes, stream));
         checkCudaErrors(cudaMemsetAsync(d_boundary_grady, 0, 3 * boundary_face_bytes, stream));
 
-        // launch cuda kernel
+        // fvc::grad(Yi)
         threads_per_block = 1024;
         blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
         fvc_grad_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
                 d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-                dataBase_.d_face_vector, dataBase_.d_weight, d_Y_old[i],
+                dataBase_.d_face_vector, dataBase_.d_weight, dataBase_.d_Y[i],
                 dataBase_.d_volume, d_grady);
         blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
         fvc_grad_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
@@ -635,25 +636,46 @@ void dfYEqn::correctVelocity(std::vector<double *> Y_old, std::vector<double *>
                 dataBase_.d_boundary_face_vector, dataBase_.d_boundary_face,
                 d_grady, d_boundary_grady);
 
+        // sum(chemistry->hai(i)*chemistry->rhoD(i)*fvc::grad(Yi))
+        // sum(chemistry->rhoD(i)*fvc::grad(Yi)), also be called sumYDiffError
+        // sum(chemistry->hai(i)*Yi)
         blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
         sumError_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-                d_hai, d_rhoD, d_Y_old[i], d_grady,
+                d_hai, d_rhoD, dataBase_.d_Y[i], d_grady,
                 d_sum_hai_rhoD_grady, d_sum_rhoD_grady, d_sum_hai_y);
         blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block;
         sumError_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_faces,
                 dataBase_.d_bouPermedIndex,
                 d_boundary_hai, d_boundary_rhoD, d_boundary_Y, d_boundary_grady,
                 d_sum_boundary_hai_rhoD_grady, d_sum_boundary_rhoD_grady, d_sum_boundary_hai_y);
-    }
 
-    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    calculate_phiUc_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-            d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-            dataBase_.d_face_vector, dataBase_.d_weight, d_sum_rhoD_grady, d_phiUc);
-    blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block;
-    calculate_phiUc_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_faces,
-            dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-            dataBase_.d_boundary_face_vector, d_sum_boundary_rhoD_grady, d_phiUc_boundary);
+        // compute diffAlphaD
+        blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+        fvc_laplacian_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
+                d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
+                d_alpha, d_hai, dataBase_.d_Y[i],
+                dataBase_.d_weight, dataBase_.d_face, dataBase_.d_deltaCoeffs,
+                dataBase_.d_volume, dataBase_.d_diffAlphaD);
+
+        // fvm::laplacian
+        if (i != inertIndex)
+        {
+            blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+            fvm_laplacian_uncorrected_scalar_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
+                    d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
+                    d_mut_Sct, d_rhoD, dataBase_.d_weight, dataBase_.d_face, dataBase_.d_deltaCoeffs,
+                    -1., d_A_csr + mtxIndex * (num_cells + num_faces), d_A_csr + mtxIndex * (num_cells + num_faces));
+            blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
+            fvm_laplacian_uncorrected_scalar_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
+                    d_A_csr_row_index, d_A_csr_diag_index,
+                    dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+                    d_boundary_rhoD, d_boundary_mut_sct, dataBase_.d_boundary_face, dataBase_.d_bouPermedIndex,
+                    dataBase_.d_laplac_internal_coeffs_Y_vector[i], dataBase_.d_laplac_boundary_coeffs_Y_vector[i],
+                    -1., d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
+                    d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells);
+        }
+        ++mtxIndex;
+    }
 
     blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     calculate_hDiffCorrFlux<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
@@ -677,7 +699,7 @@ void dfYEqn::fvm_ddt()
         blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
         fvm_ddt_kernel_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces, dataBase_.rdelta_t,
                 d_A_csr_row_index, d_A_csr_diag_index,
-                dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, d_Y_old[i],
+                dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, dataBase_.d_Y[i],
                 d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
                 d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
                 d_psi + mtxIndex * num_cells);
@@ -694,21 +716,20 @@ void dfYEqn::fvm_div_phi()
         if (i == inertIndex)
             continue;
 
-        // launch cuda kernel
+        // mvConvection->fvmDiv(phi, Yi)
         threads_per_block = 512;
         blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
         fvm_div_internal_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces,
-                                                                                   d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_weight_upwind, dataBase_.d_phi,
-                                                                                   d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
-                                                                                   d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells);
-
+                d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_weight_upwind, dataBase_.d_phi,
+                d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
+                d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells);
         blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
         fvm_div_boundary_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces, num_boundary_cells,
-                                                                                   d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_phi,
-                                                                                   dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-                                                                                   dataBase_.d_internal_coeffs_Y_vector[i], dataBase_.d_boundary_coeffs_Y_vector[i],
-                                                                                   d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
-                                                                                   d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells);
+                d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_phi,
+                dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+                dataBase_.d_internal_coeffs_Y_vector[i], dataBase_.d_boundary_coeffs_Y_vector[i],
+                d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
+                d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells);
         ++mtxIndex;
     }
 }
@@ -716,61 +737,37 @@ void dfYEqn::fvm_div_phi()
 void dfYEqn::fvm_div_phiUc()
 {
     size_t threads_per_block, blocks_per_grid;
+
+    // compue phiUc
+    threads_per_block = 512;
+    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    calculate_phiUc_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
+            d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
+            dataBase_.d_face_vector, dataBase_.d_weight, d_sum_rhoD_grady, d_phiUc);
+    blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block;
+    calculate_phiUc_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_faces,
+            dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+            dataBase_.d_boundary_face_vector, d_sum_boundary_rhoD_grady, d_phiUc_boundary);
+
+    // mvConvection->fvmDiv(phiUc, Yi)
     int mtxIndex = 0;
     for (size_t i = 0; i < num_species; ++i)
     {
         if (i == inertIndex)
             continue;
-        // launch cuda kernel
-        threads_per_block = 512;
         blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
         fvm_div_internal_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces,
-                                                                                   d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_weight_upwind, d_phiUc,
-                                                                                   d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
-                                                                                   d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells);
+                d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_weight_upwind, d_phiUc,
+                d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
+                d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells);
 
         blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
         fvm_div_boundary_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces, num_boundary_cells,
-                                                                                   d_A_csr_row_index, d_A_csr_diag_index, d_phiUc_boundary,
-                                                                                   dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-                                                                                   dataBase_.d_internal_coeffs_Y_vector[i], dataBase_.d_boundary_coeffs_Y_vector[i],
-                                                                                   d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
-                                                                                   d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells);
-        ++mtxIndex;
-    }
-}
-
-void dfYEqn::fvm_laplacian(const double *mut_Sct, const double *boundary_mut_Sct,
-        std::vector<const double *> rhoD, std::vector<double *> boundary_rhoD)
-{
-    size_t threads_per_block, blocks_per_grid;
-    int mtxIndex = 0;
-
-    checkCudaErrors(cudaMemcpyAsync(d_boundary_mut_sct, boundary_mut_Sct, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_mut_Sct, mut_Sct, cell_bytes, cudaMemcpyHostToDevice, stream));
-
-    for (size_t i = 0; i < num_species; ++i)
-    {
-        if (i == inertIndex)
-            continue;
-        checkCudaErrors(cudaMemcpyAsync(d_rhoD, rhoD[i], cell_bytes, cudaMemcpyHostToDevice, stream));
-        checkCudaErrors(cudaMemcpyAsync(d_boundary_rhoD, boundary_rhoD[i], boundary_face_bytes, cudaMemcpyHostToDevice, stream));
-
-        // launch cuda kernel
-        threads_per_block = 1024;
-        blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-        fvm_laplacian_uncorrected_scalar_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-                                                                                                     d_mut_Sct, d_rhoD, dataBase_.d_weight, dataBase_.d_face,
-                                                                                                     dataBase_.d_deltaCoeffs, -1., d_A_csr + mtxIndex * (num_cells + num_faces),
-                                                                                                     d_A_csr + mtxIndex * (num_cells + num_faces));
-        blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-        fvm_laplacian_uncorrected_scalar_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_cells,
-                                                                                                     d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset,
-                                                                                                     dataBase_.d_boundary_cell_id, d_boundary_rhoD,
-                                                                                                     d_boundary_mut_sct, dataBase_.d_boundary_face, dataBase_.d_bouPermedIndex,
-                                                                                                     dataBase_.d_laplac_internal_coeffs_Y_vector[i], dataBase_.d_laplac_boundary_coeffs_Y_vector[i], -1.,
-                                                                                                     d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
-                                                                                                     d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells);
+                d_A_csr_row_index, d_A_csr_diag_index, d_phiUc_boundary,
+                dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+                dataBase_.d_internal_coeffs_Y_vector[i], dataBase_.d_boundary_coeffs_Y_vector[i],
+                d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
+                d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells);
         ++mtxIndex;
     }
 }
@@ -865,24 +862,6 @@ void dfYEqn::updatePsi(double *Psi, int speciesIndex)
     memcpy(Psi, h_psi + speciesIndex * num_cells, cell_bytes);
 }
 
-void dfYEqn::compute_diffAlphaD(const double *alpha, std::vector<const double *> hai)
-{
-    checkCudaErrors(cudaMemsetAsync(dataBase_.d_diffAlphaD, 0, cell_bytes, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_alpha, alpha, cell_bytes, cudaMemcpyHostToDevice, stream));
-    size_t threads_per_block, blocks_per_grid;
-    for (size_t i = 0; i < num_species; ++i)
-    {
-        checkCudaErrors(cudaMemcpyAsync(d_hai, hai[i], cell_bytes, cudaMemcpyHostToDevice, stream));
-        threads_per_block = 1024;
-        blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-        fvc_laplacian_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-                d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-                d_alpha, d_hai, d_Y_old[i],
-                dataBase_.d_weight, dataBase_.d_face, dataBase_.d_deltaCoeffs,
-                dataBase_.d_volume, dataBase_.d_diffAlphaD);
-    }
-}
-
 dfYEqn::~dfYEqn()
 {
 }

From 5a0918ba03da72cae269f4bfce81abd2b4c25657 Mon Sep 17 00:00:00 2001
From: STwangyingrui <wangyingrui@sensetime.com>
Date: Sat, 10 Jun 2023 00:50:09 +0800
Subject: [PATCH 4/6] refactor profile, add permute_psi for UEqn, add sync for
 each equation

---
 applications/solvers/dfLowMachFoam/EEqn.H     |  52 ++--
 applications/solvers/dfLowMachFoam/UEqn.H     |  14 +-
 applications/solvers/dfLowMachFoam/YEqn.H     |  40 ++-
 .../solvers/dfLowMachFoam/dfLowMachFoam.C     | 242 ++++++++++++------
 applications/solvers/dfLowMachFoam/pEqn.H     |  40 +++
 applications/solvers/dfLowMachFoam/rhoEqn.H   |  25 ++
 src_gpu/dfRhoEqn.H                            |   2 +
 src_gpu/dfRhoEqn.cu                           |   5 +
 src_gpu/dfUEqn.H                              |  12 +-
 src_gpu/dfUEqn.cu                             | 127 ++++-----
 src_gpu/dfYEqn.H                              |  30 +--
 src_gpu/dfYEqn.cu                             |   5 +
 12 files changed, 362 insertions(+), 232 deletions(-)

diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H
index d84da50a1..bcab9affb 100644
--- a/applications/solvers/dfLowMachFoam/EEqn.H
+++ b/applications/solvers/dfLowMachFoam/EEqn.H
@@ -2,13 +2,12 @@
     volScalarField& he = thermo.he();
 
 #ifdef GPUSolver_
-        start1 = std::clock();
-        UEqn_GPU.updatePsi(&U[0][0]);
-        K = 0.5*magSqr(U);
-        end1 = std::clock();
-        time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-        time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
-        time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    start1 = std::clock();
+    UEqn_GPU.updatePsi(&U[0][0]);
+    K = 0.5*magSqr(U);
+    end1 = std::clock();
+    time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_UEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC);
 #endif
 
 #ifdef CPUSolver_
@@ -32,7 +31,7 @@
     EEqn.solve();
     end1 = std::clock();
     time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_EEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 #endif
 
 #ifdef GPUSolver_
@@ -41,10 +40,12 @@
     start2 = std::clock();
     const tmp<volScalarField> alphaEff_tmp(turbulence->alphaEff());
     const volScalarField& alphaEff = alphaEff_tmp();
+    end2 = std::clock();
     int eeqn_offset = 0;
+    int patchNum = 0;
     forAll(he.boundaryField(), patchi)
     {
-        const fvsPatchScalarField& patchFlux = phi.boundaryField()[patchi];
+        patchNum++;
         const fvsPatchScalarField& pw = mesh.surfaceInterpolation::weights().boundaryField()[patchi];
         int patchSize = pw.size();
 
@@ -56,19 +57,25 @@
         eeqn_offset += patchSize;
     }
     end1 = std::clock();
-    time_monitor_EEqn_mtxAssembly_CPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);
-
-    EEqn_GPU.initializeTimeStep();
+    time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    fprintf(stderr, "time_monitor_EEqn_mtxAssembly_CPU_prepare: %lf, build alphaEff time: %lf, patchNum: %d\n",
+            time_monitor_EEqn_mtxAssembly_CPU_prepare,
+            double(end2 - start2) / double(CLOCKS_PER_SEC), patchNum);
 
     // prepare data on GPU
     start1 = std::clock();
     EEqn_GPU.prepare_data(&he.oldTime()[0], &K[0], &K.oldTime()[0], &alphaEff[0],
             &dpdt[0], boundary_K, boundary_alphaEff);
-    if (doSync) EEqn_GPU.sync();
+    EEqn_GPU.sync();
     end1 = std::clock();
-    time_monitor_EEqn_mtxAssembly_GPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_mtxAssembly_GPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
     start1 = std::clock();
+    EEqn_GPU.initializeTimeStep();
     EEqn_GPU.fvm_ddt();
     EEqn_GPU.fvm_div();
     EEqn_GPU.fvm_laplacian();
@@ -76,31 +83,26 @@
     EEqn_GPU.fvc_div_phi_scalar();
     EEqn_GPU.fvc_div_vector();
     EEqn_GPU.add_to_source();
-    if (doSync) EEqn_GPU.sync();
-    end1 = std::clock();
-    time_monitor_EEqn_mtxAssembly_GPU_Run += double(end1 - start1) / double(CLOCKS_PER_SEC);
-
     EEqn_GPU.sync();
-    end2 = std::clock();
-    time_monitor_EEqn += double(end2 - start2) / double(CLOCKS_PER_SEC);
-    time_monitor_EEqn_mtxAssembly += double(end2 - start2) / double(CLOCKS_PER_SEC);
+    end1 = std::clock();
+    time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
     // check value of mtxAssembly, no time monitor
     // EEqn_GPU.checkValue(false);
 
     start1 = std::clock();
     EEqn_GPU.solve();
-    if (doSync) EEqn_GPU.sync();
     end1 = std::clock();
     time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_EEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
     start1 = std::clock();
     EEqn_GPU.updatePsi(&he[0]);
     he.correctBoundaryConditions();
     end1 = std::clock();
     time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_EEqn_mtxAssembly_CPU_CorrectBC += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC);
 #endif
 }
diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H
index 4427ece1b..190636f6f 100644
--- a/applications/solvers/dfLowMachFoam/UEqn.H
+++ b/applications/solvers/dfLowMachFoam/UEqn.H
@@ -1,9 +1,6 @@
 // Solve the Momentum equation
 #ifdef GPUSolver_
     start1 = std::clock();
-    UEqn_GPU.initializeTimeStep();
-    UEqn_GPU.fvm_ddt(&U.oldTime()[0][0]);
-    start2 = std::clock();
     int offset = 0;
     const tmp<volScalarField> nuEff_tmp(turbulence->nuEff());
     const volScalarField& nuEff = nuEff_tmp();
@@ -27,21 +24,24 @@
         offset += patchSize;
     }
     end1 = std::clock();
-    end2 = std::clock();
-    time_monitor_UEqn_CPU += double(end2 - start2) / double(CLOCKS_PER_SEC);
     time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
     time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_UEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
     start1 = std::clock();
+    UEqn_GPU.initializeTimeStep();
+    UEqn_GPU.fvm_ddt(&U.oldTime()[0][0]);
     UEqn_GPU.fvm_div(boundary_pressure_init, boundary_velocity_init, boundary_nuEff_init, boundary_rho_init);
     UEqn_GPU.fvc_grad(&p[0]);
     UEqn_GPU.fvc_grad_vector();
     UEqn_GPU.dev2T();
     UEqn_GPU.fvc_div_tensor(&nuEff[0]);
     UEqn_GPU.fvm_laplacian();
+    UEqn_GPU.sync();
     end1 = std::clock();
     time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
     time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_UEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
     // start2 = std::clock();
     // fvVectorMatrix turb_source
@@ -88,7 +88,7 @@
     }
     end1 = std::clock();
     time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_UEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 #endif
 
 // start1 = std::clock();
@@ -96,7 +96,7 @@
 // UEqn_GPU.solve();
 // end1 = std::clock();
 // time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-// time_monitor_UEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
+// time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
 // start1 = std::clock();
 // // // t.join();
diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H
index 2e106fca0..a547e266d 100644
--- a/applications/solvers/dfLowMachFoam/YEqn.H
+++ b/applications/solvers/dfLowMachFoam/YEqn.H
@@ -1,4 +1,3 @@
-start = std::clock();
 hDiffCorrFlux = Zero;
 diffAlphaD = Zero;
 sumYDiffError = Zero;
@@ -23,21 +22,17 @@ forAll(Y, i)
 const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf();
 start1 = std::clock();
 time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
+time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 #endif
 
 #ifdef GPUSolver_
     start1 = std::clock();
-    // // std::thread t(&dfMatrix::solve, &UEqn_GPU);
     UEqn_GPU.solve();
     end1 = std::clock();
     time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_UEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
-#endif
+    time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
-#ifdef GPUSolver_
     start1 = std::clock();
-    start2 = std::clock();
     std::vector<double*> Y_old(Y.size()), boundary_Y(Y.size()), boundary_hai(Y.size()), boundary_rhoD(Y.size());
     std::vector<const double*> hai(Y.size()), rhoD(Y.size());
     for (size_t i = 0; i < Y.size(); ++i)
@@ -65,7 +60,6 @@ time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
             offset += patchSize;
         }
     }
-
     volScalarField mut_sct = turbulence->mut().ref()/Sct;
     double *boundary_mutsct = nullptr;
     cudaMallocHost(&boundary_mutsct, num_boundary_faces*sizeof(double));
@@ -77,9 +71,13 @@ time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
         memcpy(boundary_mutsct + offset, &patchMut_sct[0], patchSize*sizeof(double));
         offset += patchSize;
     }
-    end2 = std::clock();
-    time_monitor_YEqn_mtxAssembly_CPU_Prepare += double(end2 - start2) / double(CLOCKS_PER_SEC);
+    end1 = std::clock();
+    time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_YEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    fprintf(stderr, "time_monitor_YEqn_mtxAssembly_CPU_prepare: %lf\n", time_monitor_YEqn_mtxAssembly_CPU_prepare);
 
+    start1 = std::clock();
     YEqn_GPU.initializeTimeStep();
     YEqn_GPU.upwindWeight();
     YEqn_GPU.fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(Y_old, boundary_Y,
@@ -87,24 +85,23 @@ time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
     YEqn_GPU.fvm_ddt();
     YEqn_GPU.fvm_div_phi();
     YEqn_GPU.fvm_div_phiUc();
-
+    YEqn_GPU.sync();
     end1 = std::clock();
     time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
     time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_YEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
     start1 = std::clock();
     YEqn_GPU.solve();
     end1 = std::clock();
     time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 #endif
 
 //MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM);
 label flag_mpi_init;
 MPI_Initialized(&flag_mpi_init);
 if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM);
-end = std::clock();
-time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC);
 
 {
     if (!splitting)
@@ -119,9 +116,8 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC);
         time_monitor_chem += processingTime.count();
     }
 
+    start2 = std::clock();
     volScalarField Yt(0.0*Y[0]);
-
-    start = std::clock();
     int speciesIndex = 0;
     forAll(Y, i)
     {
@@ -137,9 +133,7 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC);
                 YEqn_GPU.updatePsi(&Yi[0], speciesIndex);
                 Yi.correctBoundaryConditions();
                 end1 = std::clock();
-                time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-                time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
-                time_monitor_YEqn_mtxAssembly_CPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);
+                time_monitor_YEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC);
             #else
                 start1 = std::clock();
                 tmp<volScalarField> DEff = chemistry->rhoD(i) + turbulence->mut()/Sct;
@@ -156,15 +150,13 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC);
                     )
                 );
                 end1 = std::clock();
-                time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
                 time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
                 // YiEqn.relax();
 
                 start1 = std::clock();
                 YiEqn.solve("Yi");
                 end1 = std::clock();
-                time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-                time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
+                time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
             #endif
 
             Yi.max(0.0);
@@ -176,6 +168,6 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC);
     Y[inertIndex] = scalar(1) - Yt;
     Y[inertIndex].max(0.0);
 
-    end = std::clock();
-    time_monitor_Y += double(end - start) / double(CLOCKS_PER_SEC);
+    end2 = std::clock();
+    time_monitor_YEqn += double(end2 - start2) / double(CLOCKS_PER_SEC);
 }
diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
index 6f54e0a95..c47a03c0d 100644
--- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
+++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
@@ -96,32 +96,56 @@ int main(int argc, char *argv[])
     #include "createFields.H"
     #include "createRhoUfIfPresent.H"
 
-    bool doSync = true;
-    double time_monitor_flow=0;
-    double time_monitor_UEqn=0;
-    double time_monitor_UEqn_mtxAssembly=0;
-    double time_monitor_UEqn_Solve=0;
-    double time_monitor_UEqn_sum=0;
-    double time_monitor_UEqn_CPU=0;
-    double time_monitor_EEqn=0;
-    double time_monitor_EEqn_mtxAssembly=0;
-    double time_monitor_EEqn_mtxAssembly_CPU_Prepare;
-    double time_monitor_EEqn_mtxAssembly_GPU_Prepare;
-    double time_monitor_EEqn_mtxAssembly_GPU_Run;
-    double time_monitor_EEqn_mtxAssembly_CPU_CorrectBC;
-    double time_monitor_EEqn_Solve=0;
-    double time_monitor_EEqn_sum=0;
-    double time_monitor_YEqn=0;
-    double time_monitor_YEqn_mtxAssembly=0;
-    double time_monitor_YEqn_mtxAssembly_CPU_Prepare=0;
-    double time_monitor_YEqn_Solve=0;
-    double time_monitor_chem=0;
-    double time_monitor_Y=0;
-    double time_monitor_E=0;
-    double time_monitor_corrThermo=0;
-    double time_monitor_corrDiff=0;
-    double time_monitor_CPU=0;
-    double time_monitor_UinE=0;
+    double time_monitor_init = 0;
+
+    double time_monitor_other = 0;
+    double time_monitor_rho = 0;
+    double time_monitor_U = 0;
+    double time_monitor_Y = 0;
+    double time_monitor_E = 0;
+    double time_monitor_p = 0;
+    double time_monitor_chemistry_correctThermo = 0;
+    double time_monitor_turbulence_correct = 0;
+    double time_monitor_chem = 0; // combustion correct
+
+    double time_monitor_rhoEqn = 0;
+    double time_monitor_rhoEqn_mtxAssembly = 0;
+    double time_monitor_rhoEqn_mtxAssembly_CPU_prepare = 0;
+    double time_monitor_rhoEqn_mtxAssembly_GPU_run = 0;
+    double time_monitor_rhoEqn_solve = 0;
+    double time_monitor_rhoEqn_correctBC = 0;
+
+    double time_monitor_UEqn = 0;
+    double time_monitor_UEqn_mtxAssembly = 0;
+    double time_monitor_UEqn_mtxAssembly_CPU_prepare = 0;
+    double time_monitor_UEqn_mtxAssembly_GPU_run = 0;
+    double time_monitor_UEqn_solve = 0;
+    double time_monitor_UEqn_correctBC = 0;
+    double time_monitor_UEqn_H = 0;
+    double time_monitor_UEqn_H_GPU_run = 0;
+    double time_monitor_UEqn_H_correctBC = 0;
+    double time_monitor_UEqn_A = 0;
+    double time_monitor_UEqn_A_GPU_run = 0;
+    double time_monitor_UEqn_A_correctBC = 0;
+
+    double time_monitor_YEqn = 0;
+    double time_monitor_YEqn_mtxAssembly = 0;
+    double time_monitor_YEqn_mtxAssembly_CPU_prepare = 0;
+    double time_monitor_YEqn_mtxAssembly_GPU_run = 0;
+    double time_monitor_YEqn_solve = 0;
+    double time_monitor_YEqn_correctBC = 0;
+
+    double time_monitor_EEqn = 0;
+    double time_monitor_EEqn_mtxAssembly = 0;
+    double time_monitor_EEqn_mtxAssembly_CPU_prepare = 0;
+    double time_monitor_EEqn_mtxAssembly_GPU_prepare = 0;
+    double time_monitor_EEqn_mtxAssembly_GPU_run = 0;
+    double time_monitor_EEqn_solve = 0;
+    double time_monitor_EEqn_correctBC = 0;
+
+    double time_monitor_pEqn = 0;
+    double time_monitor_pEqn_solve = 0;
+
     label timeIndex = 0;
     clock_t start, end, start1, end1, start2, end2;
 
@@ -136,9 +160,7 @@ int main(int argc, char *argv[])
     start1 = std::clock();
     #include "createdfSolver.H"
     end1 = std::clock();
-    // time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    // time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    // double time_UEqn_initial = time_monitor_UEqn;
+    time_monitor_init += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
     // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * //
 
@@ -164,9 +186,11 @@ int main(int argc, char *argv[])
 
         Info<< "Time = " << runTime.timeName() << nl << endl;
 
+        clock_t loop_start = std::clock();
         // --- Pressure-velocity PIMPLE corrector loop
         while (pimple.loop())
         {
+            start = std::clock();
             if (splitting)
             {
                 #include "YEqn_RR.H"
@@ -180,20 +204,28 @@ int main(int argc, char *argv[])
                     rhoU = new volVectorField("rhoU", rho*U);
                 }
             }
+            end = std::clock();
+            time_monitor_other += double(end - start) / double(CLOCKS_PER_SEC);
 
+            start = std::clock();
             if (pimple.firstPimpleIter() && !pimple.simpleRho())
             {
                 #include "rhoEqn.H"
             }
+            end = std::clock();
+            time_monitor_rho += double(end - start) / double(CLOCKS_PER_SEC);
             
             start = std::clock();
             #include "UEqn.H"
             end = std::clock();
-            time_monitor_flow += double(end - start) / double(CLOCKS_PER_SEC);
+            time_monitor_U += double(end - start) / double(CLOCKS_PER_SEC);
 
             if(combModelName!="ESF" && combModelName!="flareFGM" )
             {
+                start = std::clock();
                 #include "YEqn.H"
+                end = std::clock();
+                time_monitor_Y += double(end - start) / double(CLOCKS_PER_SEC);
 
                 start = std::clock();
                 #include "EEqn.H"
@@ -203,7 +235,7 @@ int main(int argc, char *argv[])
                 start = std::clock();
                 chemistry->correctThermo();
                 end = std::clock();
-                time_monitor_corrThermo += double(end - start) / double(CLOCKS_PER_SEC);
+                time_monitor_chemistry_correctThermo += double(end - start) / double(CLOCKS_PER_SEC);
             }
             else
             {
@@ -227,73 +259,131 @@ int main(int argc, char *argv[])
                 }
             }
             end = std::clock();
-            time_monitor_flow += double(end - start) / double(CLOCKS_PER_SEC);
+            time_monitor_p += double(end - start) / double(CLOCKS_PER_SEC);
 
+            start = std::clock();
             if (pimple.turbCorr())
             {
                 turbulence->correct();
             }
+            end = std::clock();
+            time_monitor_turbulence_correct += double(end - start) / double(CLOCKS_PER_SEC);
         }
+        clock_t loop_end = std::clock();
+        double loop_time = double(loop_end - loop_start) / double(CLOCKS_PER_SEC);
 
         rho = thermo.rho();
 
         runTime.write();
-        time_monitor_UEqn_sum += time_monitor_UEqn;
-        time_monitor_EEqn_sum += time_monitor_EEqn;
 
         Info << "output time index " << runTime.timeIndex() << endl;
 
+        Info<< "Init Time(createdfSolver)    = " << time_monitor_init << " s" << endl;
+
         Info<< "========Time Spent in diffenet parts========"<< endl;
-        Info<< "Chemical sources           = " << time_monitor_chem << " s" << endl;
-        Info<< "Species Equations          = " << time_monitor_Y << " s" << endl;
-        Info<< "U & p Equations            = " << time_monitor_flow << " s" << endl;
-        Info<< "Energy Equations           = " << time_monitor_E << " s" << endl;
-        Info<< "thermo & Trans Properties  = " << time_monitor_corrThermo << " s" << endl;
-        Info<< "Diffusion Correction Time  = " << time_monitor_corrDiff << " s" << endl;
-        Info<< "UEqn Time                  = " << time_monitor_UEqn << " s" << endl;
-        Info<< "UEqn Time assamble Mtx     = " << time_monitor_UEqn_mtxAssembly << " s" << endl;
-        Info<< "UEqn Time solve            = " << time_monitor_UEqn_Solve << " s" << endl;
-        // Info<< "UEqn sum Time              = " << time_monitor_UEqn_sum << " s" << endl;
-        // Info<< "UEqn sum Time - overhead   = " << time_monitor_UEqn_sum - time_UEqn_initial << " s" << endl;
-        Info<< "EEqn Time                  = " << time_monitor_EEqn << " s" << endl;
-        Info<< "EEqn Time assamble Mtx     = " << time_monitor_EEqn_mtxAssembly << " s" << endl;
-        Info<< "EEqn assamble(CPU prepare) = " << time_monitor_EEqn_mtxAssembly_CPU_Prepare << " s" << endl;
-        Info<< "EEqn assamble(GPU prepare) = " << time_monitor_EEqn_mtxAssembly_GPU_Prepare << " s" << endl;
-        Info<< "EEqn assamble(GPU run)     = " << time_monitor_EEqn_mtxAssembly_GPU_Run << " s" << endl;
-        Info<< "EEqn assamble(CPU correct) = " << time_monitor_EEqn_mtxAssembly_CPU_CorrectBC << " s" << endl;
-        Info<< "EEqn Time solve            = " << time_monitor_EEqn_Solve << " s" << endl;
-        Info<< "YEqn Time                  = " << time_monitor_YEqn << " s" << endl;
-        Info<< "YEqn Time assamble Mtx     = " << time_monitor_YEqn_mtxAssembly << " s" << endl;
-        Info<< "YEqn assamble(CPU prepare) = " << time_monitor_YEqn_mtxAssembly_CPU_Prepare << " s" << endl;
-        Info<< "YEqn Time solve            = " << time_monitor_YEqn_Solve << " s" << endl;
-        Info<< "sum Time                   = " << (time_monitor_chem + time_monitor_Y + time_monitor_flow + time_monitor_E + time_monitor_corrThermo + time_monitor_corrDiff) << " s" << endl;
-        Info<< "CPU Time (get turb souce)  = " << time_monitor_CPU << " s" << endl;
+        Info<< "loop Time                    = " << loop_time << " s" << endl;
+        Info<< "other Time                   = " << time_monitor_other << " s" << endl;
+        Info<< "rho Equations                = " << time_monitor_rho << " s" << endl;
+        Info<< "U Equations                  = " << time_monitor_U << " s" << endl;
+        Info<< "Y Equations                  = " << time_monitor_Y << " s" << endl;
+        Info<< "E Equations                  = " << time_monitor_E << " s" << endl;
+        Info<< "p Equations                  = " << time_monitor_p << " s" << endl;
+        Info<< "chemistry correctThermo      = " << time_monitor_chemistry_correctThermo << " s" << endl;
+        Info<< "turbulence correct           = " << time_monitor_turbulence_correct << " s" << endl;
+        Info<< "combustion correct(in Y)     = " << time_monitor_chem << " s" << endl;
+
+        Info<< "========Time details of each equation======="<< endl;
+
+        Info<< "rhoEqn Time                  = " << time_monitor_rhoEqn << " s" << endl;
+        Info<< "rhoEqn assamble              = " << time_monitor_rhoEqn_mtxAssembly << " s" << endl;
+        Info<< "rhoEqn assamble(CPU prepare) = " << time_monitor_rhoEqn_mtxAssembly_CPU_prepare << " s" << endl;
+        Info<< "rhoEqn assamble(GPU run)     = " << time_monitor_rhoEqn_mtxAssembly_GPU_run << " s" << endl;
+        Info<< "rhoEqn solve                 = " << time_monitor_rhoEqn_solve << " s" << endl;
+        Info<< "rhoEqn correct boundary      = " << time_monitor_rhoEqn_correctBC << " s" << endl;
+
+        Info<< "UEqn Time                    = " << time_monitor_UEqn << " s" << endl;
+        Info<< "UEqn assamble                = " << time_monitor_UEqn_mtxAssembly << " s" << endl;
+        Info<< "UEqn assamble(CPU prepare)   = " << time_monitor_UEqn_mtxAssembly_CPU_prepare << " s" << endl;
+        Info<< "UEqn assamble(GPU run)       = " << time_monitor_UEqn_mtxAssembly_GPU_run << " s" << endl;
+        Info<< "UEqn solve                   = " << time_monitor_UEqn_solve << " s" << endl;
+        Info<< "UEqn correct boundary        = " << time_monitor_UEqn_correctBC << " s" << endl;
+        Info<< "UEqn H                       = " << time_monitor_UEqn_H << " s" << endl;
+        Info<< "UEqn H(GPU run)              = " << time_monitor_UEqn_H_GPU_run << " s" << endl;
+        Info<< "UEqn H(correct boundary)     = " << time_monitor_UEqn_H_correctBC << " s" << endl;
+        Info<< "UEqn A                       = " << time_monitor_UEqn_A << " s" << endl;
+        Info<< "UEqn A(GPU run)              = " << time_monitor_UEqn_A_GPU_run << " s" << endl;
+        Info<< "UEqn A(correct boundary)     = " << time_monitor_UEqn_A_correctBC << " s" << endl;
+
+        Info<< "YEqn Time                    = " << time_monitor_YEqn << " s" << endl;
+        Info<< "YEqn assamble                = " << time_monitor_YEqn_mtxAssembly << " s" << endl;
+        Info<< "YEqn assamble(CPU prepare)   = " << time_monitor_YEqn_mtxAssembly_CPU_prepare << " s" << endl;
+        Info<< "YEqn assamble(GPU run)       = " << time_monitor_YEqn_mtxAssembly_GPU_run << " s" << endl;
+        Info<< "YEqn solve                   = " << time_monitor_YEqn_solve << " s" << endl;
+        Info<< "YEqn correct boundary        = " << time_monitor_YEqn_correctBC << " s" << endl;
+
+        Info<< "EEqn Time                    = " << time_monitor_EEqn << " s" << endl;
+        Info<< "EEqn assamble                = " << time_monitor_EEqn_mtxAssembly << " s" << endl;
+        Info<< "EEqn assamble(CPU prepare)   = " << time_monitor_EEqn_mtxAssembly_CPU_prepare << " s" << endl;
+        Info<< "EEqn assamble(GPU prepare)   = " << time_monitor_EEqn_mtxAssembly_GPU_prepare << " s" << endl;
+        Info<< "EEqn assamble(GPU run)       = " << time_monitor_EEqn_mtxAssembly_GPU_run << " s" << endl;
+        Info<< "EEqn solve                   = " << time_monitor_EEqn_solve << " s" << endl;
+        Info<< "EEqn correct boundary        = " << time_monitor_EEqn_correctBC << " s" << endl;
+
+        Info<< "pEqn Time                    = " << time_monitor_pEqn << " s" << endl;
+        Info<< "pEqn Time solve              = " << time_monitor_pEqn_solve << " s" << endl;
+
         Info<< "============================================"<<nl<< endl;
 
         Info<< "ExecutionTime = " << runTime.elapsedCpuTime() << " s"
             << "  ClockTime = " << runTime.elapsedClockTime() << " s" << endl;
+
+        time_monitor_other = 0;
+        time_monitor_rho = 0;
+        time_monitor_U = 0;
+        time_monitor_Y = 0;
+        time_monitor_E = 0;
+        time_monitor_p = 0;
+        time_monitor_chemistry_correctThermo = 0;
+        time_monitor_turbulence_correct = 0;
+        time_monitor_chem = 0;
+
+        time_monitor_rhoEqn = 0;
+        time_monitor_rhoEqn_mtxAssembly = 0;
+        time_monitor_rhoEqn_mtxAssembly_CPU_prepare = 0;
+        time_monitor_rhoEqn_mtxAssembly_GPU_run = 0;
+        time_monitor_rhoEqn_solve = 0;
+        time_monitor_rhoEqn_correctBC = 0;
+
         time_monitor_UEqn = 0;
         time_monitor_UEqn_mtxAssembly = 0;
-        time_monitor_UEqn_Solve = 0;
+        time_monitor_UEqn_mtxAssembly_CPU_prepare = 0;
+        time_monitor_UEqn_mtxAssembly_GPU_run = 0;
+        time_monitor_UEqn_solve = 0;
+        time_monitor_UEqn_correctBC = 0;
+        time_monitor_UEqn_H = 0;
+        time_monitor_UEqn_H_GPU_run = 0;
+        time_monitor_UEqn_H_correctBC = 0;
+        time_monitor_UEqn_A = 0;
+        time_monitor_UEqn_A_GPU_run = 0;
+        time_monitor_UEqn_A_correctBC = 0;
+
+        time_monitor_YEqn = 0;
+        time_monitor_YEqn_mtxAssembly = 0;
+        time_monitor_YEqn_mtxAssembly_CPU_prepare = 0;
+        time_monitor_YEqn_mtxAssembly_GPU_run = 0;
+        time_monitor_YEqn_solve = 0;
+        time_monitor_YEqn_correctBC = 0;
+
         time_monitor_EEqn = 0;
         time_monitor_EEqn_mtxAssembly = 0;
-        time_monitor_EEqn_mtxAssembly_CPU_Prepare = 0;
-        time_monitor_EEqn_mtxAssembly_GPU_Prepare = 0;
-        time_monitor_EEqn_mtxAssembly_GPU_Run = 0;
-        time_monitor_EEqn_mtxAssembly_CPU_CorrectBC = 0;
-        time_monitor_EEqn_Solve = 0;
-        time_monitor_chem = 0;
-        time_monitor_Y = 0;
-        time_monitor_E = 0;
-        time_monitor_flow = 0;
-        time_monitor_corrThermo = 0;
-        time_monitor_corrDiff = 0;
-        time_monitor_CPU = 0;
-        time_monitor_UinE = 0;
-        time_monitor_YEqn=0;
-        time_monitor_YEqn_mtxAssembly=0;
-        time_monitor_YEqn_mtxAssembly_CPU_Prepare=0;
-        time_monitor_YEqn_Solve=0;
+        time_monitor_EEqn_mtxAssembly_CPU_prepare = 0;
+        time_monitor_EEqn_mtxAssembly_GPU_prepare = 0;
+        time_monitor_EEqn_mtxAssembly_GPU_run = 0;
+        time_monitor_EEqn_solve = 0;
+        time_monitor_EEqn_correctBC = 0;
+
+        time_monitor_pEqn = 0;
+        time_monitor_pEqn_solve = 0;
 
 #ifdef USE_PYTORCH
         if (log_ && torch_)
diff --git a/applications/solvers/dfLowMachFoam/pEqn.H b/applications/solvers/dfLowMachFoam/pEqn.H
index de3d8bfd5..913bee4fa 100644
--- a/applications/solvers/dfLowMachFoam/pEqn.H
+++ b/applications/solvers/dfLowMachFoam/pEqn.H
@@ -9,6 +9,7 @@ const volScalarField psip0(psi*p);
 
 #ifdef GPUSolver_
     // UEqn.H()
+    start1 = std::clock();
     volVectorField UEqn_H
     (
         IOobject
@@ -24,9 +25,20 @@ const volScalarField psip0(psi*p);
         extrapolatedCalculatedFvPatchScalarField::typeName
     );
     UEqn_GPU.H(&UEqn_H[0][0]);
+    end1 = std::clock();
+    time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_UEqn_H += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_UEqn_H_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC);
+
+    start1 = std::clock();
     UEqn_H.correctBoundaryConditions();
+    end1 = std::clock();
+    time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_UEqn_H += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_UEqn_H_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
     // UEqn.A()
+    start1 = std::clock();
     volScalarField UEqn_A
     (
         IOobject
@@ -42,13 +54,28 @@ const volScalarField psip0(psi*p);
         extrapolatedCalculatedFvPatchScalarField::typeName
     );
     UEqn_GPU.A(&UEqn_A[0]);
+    end1 = std::clock();
+    time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_UEqn_A += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_UEqn_A_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC);
+
+    start1 = std::clock();
     UEqn_A.correctBoundaryConditions();
+    end1 = std::clock();
+    time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_UEqn_A += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_UEqn_A_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC);
 #endif
 
+start2 = std::clock();
 #ifdef GPUSolver_
+    start1 = std::clock();
     volScalarField rAU(1.0/UEqn_A);
     surfaceScalarField rhorAUf("rhorAUf", fvc::interpolate(rho*rAU));
     volVectorField HbyA(constrainHbyA(rAU*UEqn_H, U, p));
+    end1 = std::clock();
+    time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_UEqn_A += double(end1 - start1) / double(CLOCKS_PER_SEC);
 #else
     volScalarField rAU(1.0/UEqn.A());
     surfaceScalarField rhorAUf("rhorAUf", fvc::interpolate(rho*rAU));
@@ -95,7 +122,10 @@ if (pimple.transonic())
         // Relax the pressure equation to ensure diagonal-dominance
         pEqn.relax();
 
+        start1 = std::clock();
         pEqn.solve();
+        end1 = std::clock();
+        time_monitor_pEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
         if (pimple.finalNonOrthogonalIter())
         {
@@ -115,7 +145,10 @@ else
     {
         fvScalarMatrix pEqn(pDDtEqn - fvm::laplacian(rhorAUf, p));
 
+        start1 = std::clock();
         pEqn.solve();
+        end1 = std::clock();
+        time_monitor_pEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
         if (pimple.finalNonOrthogonalIter())
         {
@@ -145,7 +178,11 @@ U.correctBoundaryConditions();
 K = 0.5*magSqr(U);
 
 #ifdef GPUSolver_
+    start1 = std::clock();
     UEqn_GPU.correctPsi(&U[0][0]);
+    end1 = std::clock();
+    time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_UEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC);
 #endif
 
 if (pimple.simpleRho())
@@ -165,3 +202,6 @@ if (thermo.dpdt())
         dpdt -= fvc::div(fvc::meshPhi(rho, U), p);
     }
 }
+end2 = std::clock();
+time_monitor_pEqn += double(end2 - start2) / double(CLOCKS_PER_SEC);
+
diff --git a/applications/solvers/dfLowMachFoam/rhoEqn.H b/applications/solvers/dfLowMachFoam/rhoEqn.H
index 1d0793384..93965ca52 100644
--- a/applications/solvers/dfLowMachFoam/rhoEqn.H
+++ b/applications/solvers/dfLowMachFoam/rhoEqn.H
@@ -30,6 +30,7 @@ Description
 \*---------------------------------------------------------------------------*/
 #ifdef GPUSolver_
 {
+    start1 = std::clock();
     rho.oldTime();
 
     int offset = 0;
@@ -40,21 +41,45 @@ Description
         memcpy(boundary_phi_init+offset, &patchFlux[0], patchSize*sizeof(double));
         offset += patchSize;
     }
+    end1 = std::clock();
+    time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_rhoEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_rhoEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);
+
+    start1 = std::clock();
     rhoEqn_GPU.initializeTimeStep();
     rhoEqn_GPU.fvc_div(&phi[0], boundary_phi_init);
     rhoEqn_GPU.fvm_ddt(&rho.oldTime()[0]);
+    rhoEqn_GPU.sync();
+    end1 = std::clock();
+    time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_rhoEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_rhoEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC);
+
+    start1 = std::clock();
     rhoEqn_GPU.updatePsi(&rho.primitiveFieldRef()[0]);
     rho.correctBoundaryConditions();
+    end1 = std::clock();
+    time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_rhoEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC);
 }
 #else
 {
+    start1 = std::clock();
     fvScalarMatrix rhoEqn
     (
         fvm::ddt(rho)
       + fvc::div(phi)
     );
+    end1 = std::clock();
+    time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_rhoEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
+    start1 = std::clock();
     rhoEqn.solve();
+    end1 = std::clock();
+    time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_rhoEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 }
 #endif
 
diff --git a/src_gpu/dfRhoEqn.H b/src_gpu/dfRhoEqn.H
index 75d40225d..419072dc6 100644
--- a/src_gpu/dfRhoEqn.H
+++ b/src_gpu/dfRhoEqn.H
@@ -42,5 +42,7 @@ public:
 
     void fvm_ddt(double *rho_old);
 
+    void sync();
+
     void updatePsi(double* Psi);
 };
diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu/dfRhoEqn.cu
index a18c79a03..112a376ec 100644
--- a/src_gpu/dfRhoEqn.cu
+++ b/src_gpu/dfRhoEqn.cu
@@ -136,6 +136,11 @@ void dfRhoEqn::fvm_ddt(double *rho_old)
     checkCudaErrors(cudaMemcpyAsync(h_psi, dataBase_.d_rho_new, cell_bytes, cudaMemcpyDeviceToHost, stream));
 }
 
+void dfRhoEqn::sync()
+{
+    checkCudaErrors(cudaStreamSynchronize(stream));
+}
+
 void dfRhoEqn::updatePsi(double *Psi)
 {
     checkCudaErrors(cudaStreamSynchronize(stream));
diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H
index 3405a5bbc..14288209b 100644
--- a/src_gpu/dfUEqn.H
+++ b/src_gpu/dfUEqn.H
@@ -11,17 +11,17 @@ private:
     cudaStream_t stream;
     AmgXSolver *UxSolver, *UySolver, *UzSolver = nullptr;
     int num_iteration;
-    double time_monitor_CPU;
-    double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test;
 
     // common variables
     int num_cells, cell_bytes, num_faces, num_surfaces, cell_vec_bytes, csr_value_vec_bytes, num_boundary_cells;
     int *d_A_csr_row_index, *d_A_csr_diag_index, *d_A_csr_col_index;
 
     // Matrix variables
-    double *d_A_csr, *d_b, *d_psi, *d_H, *d_A, *d_ueqn_internal_coeffs, *d_ueqn_boundary_coeffs= nullptr;
+    double *d_A_csr, *d_b, *d_psi, *d_psi_permute, *d_H, *d_H_permute, *d_A;
     double *h_A_csr, *h_b, *h_psi, *h_H, *h_A = nullptr;
 
+    double *d_ueqn_internal_coeffs, *d_ueqn_boundary_coeffs= nullptr;
+
 public:
     dfUEqn();
     dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile);
@@ -44,17 +44,17 @@ public:
 
     void fvm_laplacian();
 
-    void add_fvMatrix(double *turbSrc_low, double *turbSrc_diag, double *turbSrc_upp, double *turbSrc_source);
-
     void A(double *Psi);
 
     void H(double *Psi);
 
     void solve();
 
+    void sync();
+
     void updatePsi(double *Psi);
 
     void correctPsi(double *Psi);
 
     void initializeTimeStep();
-};
\ No newline at end of file
+};
diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu
index a82d85c76..6aa479796 100644
--- a/src_gpu/dfUEqn.cu
+++ b/src_gpu/dfUEqn.cu
@@ -869,6 +869,28 @@ __global__ void addBoundaryDiag(int num_cells, int num_boundary_cells,
     H[num_cells * 2 + cell_index] = (-internal_z + ave_internal) * psi[num_cells * 2 + cell_index];
 }
 
+__global__ void permute_psi_d2h(int num_cells, const double *input, double *output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    output[index * 3 + 0] = input[num_cells * 0 + index];
+    output[index * 3 + 1] = input[num_cells * 1 + index];
+    output[index * 3 + 2] = input[num_cells * 2 + index];
+}
+
+__global__ void permute_psi_h2d(int num_cells, const double *input, double *output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    output[num_cells * 0 + index] = input[index * 3 + 0];
+    output[num_cells * 1 + index] = input[index * 3 + 1];
+    output[num_cells * 2 + index] = input[index * 3 + 2];
+}
+
 __global__ void lduMatrix_H(int num_cells,
                             const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
                             const double *volume, const double *psi, const double *A_csr, const double *b,
@@ -1032,7 +1054,9 @@ dfUEqn::dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std
     checkCudaErrors(cudaMalloc((void **)&d_A_csr, csr_value_vec_bytes));
     checkCudaErrors(cudaMalloc((void **)&d_b, cell_vec_bytes));
     checkCudaErrors(cudaMalloc((void **)&d_psi, cell_vec_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_psi_permute, cell_vec_bytes));
     checkCudaErrors(cudaMalloc((void **)&d_H, cell_vec_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_H_permute, cell_vec_bytes));
     checkCudaErrors(cudaMalloc((void **)&d_A, cell_bytes));
     checkCudaErrors(cudaMalloc((void **)&d_ueqn_internal_coeffs, cell_vec_bytes));
     checkCudaErrors(cudaMalloc((void **)&d_ueqn_boundary_coeffs, cell_vec_bytes));
@@ -1041,42 +1065,29 @@ dfUEqn::dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std
 void dfUEqn::fvm_ddt(double *vector_old)
 {
     // Copy the host input array in host memory to the device input array in device memory
-    clock_t start = std::clock();
     checkCudaErrors(cudaMemcpyAsync(dataBase_.d_velocity_old, vector_old, cell_vec_bytes, cudaMemcpyHostToDevice, stream));
-    clock_t end = std::clock();
-    time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC);
-
-    // launch cuda kernel
-    start = std::clock();
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     fvm_ddt_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces, dataBase_.rdelta_t,
-                                                                      d_A_csr_row_index, d_A_csr_diag_index,
-                                                                      dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, dataBase_.d_velocity_old, d_A_csr, d_b, d_A_csr, d_b, d_psi);
-    end = std::clock();
-    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
+            d_A_csr_row_index, d_A_csr_diag_index,
+            dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, dataBase_.d_velocity_old, d_A_csr, d_b, d_A_csr, d_b, d_psi);
 }
 
 void dfUEqn::fvm_div(double *boundary_pressure_init, double *boundary_velocity_init,
                      double *boundary_nuEff_init, double *boundary_rho_init)
 {
     // copy and permutate boundary variable
-    clock_t start = std::clock();
     checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_velocity_init, boundary_velocity_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream));
     checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_pressure_init, boundary_pressure_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream));
     checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_nuEff_init, boundary_nuEff_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream));
     checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_rho_init, boundary_rho_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream));
-    clock_t end = std::clock();
-    time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC);
 
-    start = std::clock();
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block;
     boundaryPermutation<<<blocks_per_grid, threads_per_block, 0, stream>>>(dataBase_.num_boundary_faces, dataBase_.d_bouPermedIndex, dataBase_.d_boundary_pressure_init,
-                                                                           dataBase_.d_boundary_velocity_init, dataBase_.d_boundary_pressure, dataBase_.d_boundary_velocity, 
-                                                                           dataBase_.d_boundary_nuEff_init, dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho_init, dataBase_.d_boundary_rho);
+            dataBase_.d_boundary_velocity_init, dataBase_.d_boundary_pressure, dataBase_.d_boundary_velocity, 
+            dataBase_.d_boundary_nuEff_init, dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho_init, dataBase_.d_boundary_rho);
 
-    // launch cuda kernel
     blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     fvm_div_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces,
                                                                         d_A_csr_row_index, d_A_csr_diag_index,
@@ -1087,20 +1098,14 @@ void dfUEqn::fvm_div(double *boundary_pressure_init, double *boundary_velocity_i
                                                                         dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
                                                                         dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs, d_A_csr, d_b, d_A_csr, d_b,
                                                                         d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs);
-    end = std::clock();
-    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
 }
 
 void dfUEqn::fvc_grad(double *pressure)
 {
     // Copy the host input array in host memory to the device input array in device memory
-    clock_t start = std::clock();
     checkCudaErrors(cudaMemcpyAsync(dataBase_.d_pressure, pressure, cell_bytes, cudaMemcpyHostToDevice, stream));
-    clock_t end = std::clock();
-    time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC);
 
     // launch cuda kernel
-    start = std::clock();
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     fvc_grad_internal_face<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
@@ -1110,14 +1115,10 @@ void dfUEqn::fvc_grad(double *pressure)
     fvc_grad_boundary_face<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_cells,
                                                                               dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
                                                                               dataBase_.d_boundary_face_vector, dataBase_.d_boundary_pressure, d_b, d_b);
-    end = std::clock();
-    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
 }
 
 void dfUEqn::fvc_grad_vector()
 {
-    clock_t start = std::clock();
-    // launch CUDA kernal
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     fvc_grad_vector_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
@@ -1132,33 +1133,21 @@ void dfUEqn::fvc_grad_vector()
     correct_boundary_conditions<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
                                                                                    dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_face_vector, dataBase_.d_boundary_face,
                                                                                    dataBase_.d_grad_boundary_init, dataBase_.d_grad_boundary);
-    clock_t end = std::clock();
-    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
 }
 
 void dfUEqn::dev2T()
 {
-    clock_t start = std::clock();
-    // launch CUDA kernal
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     dev2_t_tensor<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, dataBase_.d_grad);
 
     blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block;
     dev2_t_tensor<<<blocks_per_grid, threads_per_block, 0, stream>>>(dataBase_.num_boundary_faces, dataBase_.d_grad_boundary);
-    clock_t end = std::clock();
-    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
 }
 
 void dfUEqn::fvc_div_tensor(const double *nuEff)
 {
-    clock_t start = std::clock();
     checkCudaErrors(cudaMemcpyAsync(dataBase_.d_nuEff, nuEff, cell_bytes, cudaMemcpyHostToDevice, stream));
-    clock_t end = std::clock();
-    time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC);
-
-    // launch cuda kernel
-    start = std::clock();
     size_t threads_per_block = 512;
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     fvc_div_tensor_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
@@ -1171,14 +1160,10 @@ void dfUEqn::fvc_div_tensor(const double *nuEff)
                                                                                dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
                                                                                dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_face_vector, dataBase_.d_grad_boundary,
                                                                                dataBase_.d_volume, 1., d_b, d_b);
-    end = std::clock();
-    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
 }
 
 void dfUEqn::fvm_laplacian()
 {
-    clock_t start = std::clock();
-    // launch CUDA kernels
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     fvm_laplacian_uncorrected_vector_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces,
@@ -1190,8 +1175,6 @@ void dfUEqn::fvm_laplacian()
                                                                                                  d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
                                                                                                  dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_face, dataBase_.d_laplac_internal_coeffs,
                                                                                                  dataBase_.d_laplac_boundary_coeffs, -1., d_A_csr, d_b, d_A_csr, d_b, d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs);
-    clock_t end = std::clock();
-    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
 }
 
 void dfUEqn::A(double *Psi)
@@ -1209,8 +1192,7 @@ void dfUEqn::A(double *Psi)
 
     checkCudaErrors(cudaMemcpyAsync(h_A, d_A, cell_bytes, cudaMemcpyDeviceToHost, stream));
     checkCudaErrors(cudaStreamSynchronize(stream));
-    for (size_t i = 0; i < num_cells; i++)
-        Psi[i] = h_A[i];
+    memcpy(Psi, h_A, cell_bytes);
 }
 
 void dfUEqn::H(double *Psi)
@@ -1227,18 +1209,13 @@ void dfUEqn::H(double *Psi)
     lduMatrix_H<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
                                                                    dataBase_.d_volume, d_psi, d_A_csr, d_b, d_ueqn_boundary_coeffs, d_H);
 
-    checkCudaErrors(cudaMemcpyAsync(h_H, d_H, cell_vec_bytes, cudaMemcpyDeviceToHost, stream));
-    checkCudaErrors(cudaStreamSynchronize(stream));
+    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    permute_psi_d2h<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, d_H, d_H_permute);
 
-    for (size_t i = 0; i < num_cells; i++)
-    {
-        Psi[i * 3] = h_H[i];
-        Psi[i * 3 + 1] = h_H[num_cells + i];
-        Psi[i * 3 + 2] = h_H[num_cells * 2 + i];
-    }
+    checkCudaErrors(cudaMemcpyAsync(h_H, d_H_permute, cell_vec_bytes, cudaMemcpyDeviceToHost, stream));
+    checkCudaErrors(cudaStreamSynchronize(stream));
 
-    // for (int i = 0; i < num_cells; i++)
-    //     fprintf(stderr, "h_H_GPU[%d]: (%.5e, %.5e, %.5e)\n", i, h_H[i], h_H[num_cells + i], h_H[num_cells * 2 + i]);
+    memcpy(Psi, h_H, cell_vec_bytes);
 }
 
 void dfUEqn::initializeTimeStep()
@@ -1341,12 +1318,6 @@ void dfUEqn::solve()
     // checkCudaErrors(cudaMemcpyAsync(h_psi, d_psi, cell_vec_bytes, cudaMemcpyDeviceToHost, stream));
 
     checkCudaErrors(cudaStreamSynchronize(stream));
-    printf("CPU Time (copy&permutate)  = %.6lf s\n", time_monitor_CPU);
-    printf("GPU Time (kernel launch)   = %.6lf s\n", time_monitor_GPU_kernel);
-    printf("GPU Time (memcpy)          = %.6lf s\n", time_monitor_GPU_memcpy);
-    time_monitor_CPU = 0;
-    time_monitor_GPU_kernel = 0;
-    time_monitor_GPU_memcpy = 0;
 
     // nvtxRangePush("solve");
 
@@ -1369,33 +1340,35 @@ void dfUEqn::solve()
     UzSolver->solve(num_cells, d_psi + 2 * num_cells, d_b + 2 * num_cells);
     num_iteration++;
 
-    checkCudaErrors(cudaMemcpyAsync(h_psi, d_psi, cell_vec_bytes, cudaMemcpyDeviceToHost, stream));
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    permute_psi_d2h<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, d_psi, d_psi_permute);
+    checkCudaErrors(cudaMemcpyAsync(h_psi, d_psi_permute, cell_vec_bytes, cudaMemcpyDeviceToHost, stream));
     // for (size_t i = 0; i < num_cells; i++)
     //     fprintf(stderr, "h_velocity_after[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_psi[i],
     //     h_psi[num_cells + i], h_psi[num_cells*2 + i]);
 }
 
+void dfUEqn::sync()
+{
+    checkCudaErrors(cudaStreamSynchronize(stream));
+}
+
 void dfUEqn::updatePsi(double *Psi)
 {
     checkCudaErrors(cudaStreamSynchronize(stream));
-    for (size_t i = 0; i < num_cells; i++)
-    {
-        Psi[i * 3] = h_psi[i];
-        Psi[i * 3 + 1] = h_psi[num_cells + i];
-        Psi[i * 3 + 2] = h_psi[num_cells * 2 + i];
-    }
+    memcpy(Psi, h_psi, cell_vec_bytes);
 }
 
 // correct volecity in pEqn
 void dfUEqn::correctPsi(double *Psi)
 {
-    for (size_t i = 0; i < num_cells; i++)
-    {
-        h_psi[i] = Psi[i * 3];
-        h_psi[num_cells + i] = Psi[i * 3 + 1];
-        h_psi[num_cells * 2 + i] = Psi[i * 3 + 2];
-    }
-    checkCudaErrors(cudaMemcpyAsync(d_psi, h_psi, cell_vec_bytes, cudaMemcpyDeviceToHost, stream));
+    memcpy(h_psi, Psi, cell_vec_bytes);
+    checkCudaErrors(cudaMemcpyAsync(d_psi_permute, h_psi, cell_vec_bytes, cudaMemcpyDeviceToHost, stream));
+
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    permute_psi_h2d<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, d_psi_permute, d_psi);
 }
 
 dfUEqn::~dfUEqn()
diff --git a/src_gpu/dfYEqn.H b/src_gpu/dfYEqn.H
index f41cd9799..2bc645cdd 100644
--- a/src_gpu/dfYEqn.H
+++ b/src_gpu/dfYEqn.H
@@ -12,8 +12,6 @@ private:
 
     std::vector<AmgXSolver *> YSolverSet;
     int num_iteration = 0;
-    double time_monitor_CPU;
-    double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test;
 
     // common variables
     int num_cells, cell_bytes, num_faces, num_surfaces, num_boundary_cells, num_boundary_faces, num_species, boundary_face_bytes, inertIndex;
@@ -22,23 +20,18 @@ private:
     // Matrix variables
     double *d_A_csr, *d_b, *d_psi = nullptr;
     double *h_A_csr, *h_b, *h_psi = nullptr;
-    double *d_sumYDiffError = nullptr, *d_sumYDiffError_tmp = nullptr;
-    double *d_sumYDiffError_boundary = nullptr;
-    double *d_phiUc_boundary = nullptr;
-    double *d_phiUc = nullptr;
-    double *d_mut_Sct = nullptr;
-    double *d_boundary_mut_sct = nullptr;
 
-    double *d_boundary_Y = nullptr;
+    double *d_alpha = nullptr;
+    double *d_mut_Sct, *d_boundary_mut_sct = nullptr;
     double *d_hai, *d_boundary_hai = nullptr;
     double *d_rhoD, *d_boundary_rhoD = nullptr;
-    double *d_sum_rhoD_grady, *d_sum_boundary_rhoD_grady = nullptr;
     double *d_sum_hai_rhoD_grady, *d_sum_boundary_hai_rhoD_grady = nullptr; 
+    double *d_sum_rhoD_grady, *d_sum_boundary_rhoD_grady = nullptr;
     double *d_sum_hai_y, *d_sum_boundary_hai_y = nullptr;
+    double *d_phiUc, *d_phiUc_boundary = nullptr;
+    double *d_boundary_Y = nullptr;
     double *d_grady, *d_boundary_grady = nullptr;
 
-    double *d_alpha = nullptr;
-
 public:
     dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile, const int inertIndex);
 
@@ -48,13 +41,14 @@ public:
 
     void checkValue(bool print, char *filename);
 
-    void fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(std::vector<double *> Y_old, std::vector<double *> boundary_Y,
-        std::vector<const double *> hai, std::vector<double *> boundary_hai,
-        std::vector<const double *> rhoD, std::vector<double *> boundary_rhoD,
-        const double *mut_Sct, const double *boundary_mut_Sct, const double *alpha);
-
     void upwindWeight();
 
+    void fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(
+            std::vector<double *> Y_old, std::vector<double *> boundary_Y,
+            std::vector<const double *> hai, std::vector<double *> boundary_hai,
+            std::vector<const double *> rhoD, std::vector<double *> boundary_rhoD,
+            const double *mut_Sct, const double *boundary_mut_Sct, const double *alpha);
+
     void fvm_ddt();
 
     void fvm_div_phi();
@@ -63,5 +57,7 @@ public:
 
     void solve();
 
+    void sync();
+
     void updatePsi(double *Psi, int speciesIndex);
 };
diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu
index 41987ee26..1e31bf0ed 100644
--- a/src_gpu/dfYEqn.cu
+++ b/src_gpu/dfYEqn.cu
@@ -856,6 +856,11 @@ void dfYEqn::solve()
     //     fprintf(stderr, "h_species_gpu[%d]: %.5e\n", i, h_psi[i + 0 * num_cells]);
 }
 
+void dfYEqn::sync()
+{
+    checkCudaErrors(cudaStreamSynchronize(stream));
+}
+
 void dfYEqn::updatePsi(double *Psi, int speciesIndex)
 {
     checkCudaErrors(cudaStreamSynchronize(stream));

From 4f97772f651833a0dfb5373ecebd367fbf8b4757 Mon Sep 17 00:00:00 2001
From: STwangyingrui <wangyingrui@sensetime.com>
Date: Sat, 10 Jun 2023 23:37:54 +0800
Subject: [PATCH 5/6] move Yt to GPU

---
 applications/solvers/dfLowMachFoam/YEqn.H | 62 ++++++++++++-----------
 src_gpu/dfMatrixDataBase.H                |  8 +--
 src_gpu/dfRhoEqn.cu                       |  6 ---
 src_gpu/dfYEqn.cu                         | 55 ++++++++++++++------
 4 files changed, 77 insertions(+), 54 deletions(-)

diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H
index a547e266d..94fb6cf8d 100644
--- a/applications/solvers/dfLowMachFoam/YEqn.H
+++ b/applications/solvers/dfLowMachFoam/YEqn.H
@@ -116,48 +116,50 @@ if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM);
         time_monitor_chem += processingTime.count();
     }
 
+#ifdef GPUSolver_
+    start1 = std::clock();
+    forAll(Y, i)
+    {
+        volScalarField& Yi = Y[i];
+        YEqn_GPU.updatePsi(&Yi[0], i);
+        Yi.correctBoundaryConditions();
+    }
+    end1 = std::clock();
+    time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_YEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC);
+#else
     start2 = std::clock();
     volScalarField Yt(0.0*Y[0]);
     int speciesIndex = 0;
     forAll(Y, i)
     {
         volScalarField& Yi = Y[i];
-#ifdef CPUSolver_
         hDiffCorrFlux += chemistry->hai(i)*(chemistry->rhoD(i)*fvc::grad(Yi) - Yi*sumYDiffError);
         diffAlphaD += fvc::laplacian(thermo.alpha()*chemistry->hai(i), Yi);
-#endif
         if (i != inertIndex)
         {
-            #ifdef GPUSolver_
-                start1 = std::clock();
-                YEqn_GPU.updatePsi(&Yi[0], speciesIndex);
-                Yi.correctBoundaryConditions();
-                end1 = std::clock();
-                time_monitor_YEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC);
-            #else
-                start1 = std::clock();
-                tmp<volScalarField> DEff = chemistry->rhoD(i) + turbulence->mut()/Sct;
-                fvScalarMatrix YiEqn
+            start1 = std::clock();
+            tmp<volScalarField> DEff = chemistry->rhoD(i) + turbulence->mut()/Sct;
+            fvScalarMatrix YiEqn
                 (
-                    fvm::ddt(rho, Yi)
-                + mvConvection->fvmDiv(phi, Yi)
-                + mvConvection->fvmDiv(phiUc, Yi)
-                ==
-                    (
-                        splitting
-                    ?   fvm::laplacian(DEff(), Yi)
-                    :  (fvm::laplacian(DEff(), Yi) + combustion->R(Yi))
-                    )
+                 fvm::ddt(rho, Yi)
+                 + mvConvection->fvmDiv(phi, Yi)
+                 + mvConvection->fvmDiv(phiUc, Yi)
+                 ==
+                 (
+                  splitting
+                  ?   fvm::laplacian(DEff(), Yi)
+                  :  (fvm::laplacian(DEff(), Yi) + combustion->R(Yi))
+                 )
                 );
-                end1 = std::clock();
-                time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
-                // YiEqn.relax();
+            end1 = std::clock();
+            time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+            // YiEqn.relax();
 
-                start1 = std::clock();
-                YiEqn.solve("Yi");
-                end1 = std::clock();
-                time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
-            #endif
+            start1 = std::clock();
+            YiEqn.solve("Yi");
+            end1 = std::clock();
+            time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
             Yi.max(0.0);
             Yt += Yi;
@@ -167,7 +169,7 @@ if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM);
 
     Y[inertIndex] = scalar(1) - Yt;
     Y[inertIndex].max(0.0);
-
     end2 = std::clock();
     time_monitor_YEqn += double(end2 - start2) / double(CLOCKS_PER_SEC);
+#endif
 }
diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H
index 81c82a9d8..8ed3dcd32 100644
--- a/src_gpu/dfMatrixDataBase.H
+++ b/src_gpu/dfMatrixDataBase.H
@@ -91,7 +91,8 @@ struct dfMatrixDataBase
     double *d_rho_new = nullptr, *d_rho_old = nullptr, *d_velocity_old = nullptr, 
     *d_pressure = nullptr, *d_volume = nullptr;
     // - the device pointer to Y(vector Yi)
-    std::vector<double*> d_Y;
+    //std::vector<double*> d_Y;
+    double *d_Y = nullptr;
     // - the device pointer to the pre-permutated and post-permutated interpolation weight list
     double *d_weight_init = nullptr, *d_weight = nullptr;
     double *d_weight_upwind = nullptr;
@@ -478,7 +479,7 @@ struct dfMatrixDataBase
         checkCudaErrors(cudaMalloc((void**)&d_A_csr_diag_index, cell_index_bytes));
         total_bytes += (csr_row_index_bytes + csr_col_index_bytes + cell_index_bytes);
 
-        d_Y.resize(num_species);
+        //d_Y.resize(num_species);
         d_rhoD_vector.resize(num_species);
         d_boundary_Y_vector.resize(num_species);
         d_boundary_Y_init_vector.resize(num_species);
@@ -489,9 +490,10 @@ struct dfMatrixDataBase
         d_boundary_rhoD_vector.resize(num_species);
 
         for (size_t i = 0; i < num_species; ++i){
-            checkCudaErrors(cudaMalloc((void**)&d_Y[i], cell_bytes));
+            //checkCudaErrors(cudaMalloc((void**)&d_Y[i], cell_bytes));
             checkCudaErrors(cudaMalloc((void**)&d_rhoD_vector[i], cell_bytes));
         }
+        checkCudaErrors(cudaMalloc((void**)&d_Y, cell_bytes * num_species));
         checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_bytes));
         checkCudaErrors(cudaMalloc((void**)&d_rho_new, cell_bytes));
         checkCudaErrors(cudaMalloc((void**)&d_volume, cell_bytes));
diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu/dfRhoEqn.cu
index 112a376ec..7c60f3bea 100644
--- a/src_gpu/dfRhoEqn.cu
+++ b/src_gpu/dfRhoEqn.cu
@@ -105,17 +105,11 @@ void dfRhoEqn::initializeTimeStep()
 
 void dfRhoEqn::fvc_div(double *phi, double *boundary_phi_init)
 {
-    clock_t start = std::clock();
     memcpy(dataBase_.h_phi_init, phi, num_surfaces * sizeof(double));
 
-    clock_t end = std::clock();
-    time_monitor_CPU += double(end - start) / double(CLOCKS_PER_SEC);
-
-    start = std::clock();
     checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi_init, dataBase_.h_phi_init, num_surfaces * sizeof(double), cudaMemcpyHostToDevice, stream));
     checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi_init + num_surfaces, dataBase_.d_phi_init, num_surfaces * sizeof(double), cudaMemcpyDeviceToDevice, stream));
     checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_phi_init, boundary_phi_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream));
-    end = std::clock();
 
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu
index 1e31bf0ed..85db544da 100644
--- a/src_gpu/dfYEqn.cu
+++ b/src_gpu/dfYEqn.cu
@@ -273,7 +273,7 @@ __global__ void calculate_phiUc_boundary(int num_boundary_faces,
 __global__ void fvm_ddt_kernel_scalar(int num_cells, int num_faces, const double rdelta_t,
                                       const int *csr_row_index, const int *csr_diag_index,
                                       const double *rho_old, const double *rho_new, const double *volume, const double *species_old,
-                                      const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output, double *psi)
+                                      const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
     if (index >= num_cells)
@@ -289,8 +289,24 @@ __global__ void fvm_ddt_kernel_scalar(int num_cells, int num_faces, const double
 
     double ddt_part_term = rdelta_t * rho_old[index] * volume[index];
     b_output[index] = b_input[index] + ddt_part_term * species_old[index];
+}
+
+__global__ void compute_inertIndex_y(int num_cells, int num_species, int inertIndex, double *y)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
 
-    psi[index] = species_old[index];
+    double sum_yi = 0;
+    for (int i = 0; i < num_species; i++)
+    {
+        if (i == inertIndex) continue;
+
+        double yi = y[num_cells * i + index];
+        sum_yi += yi > 0 ? yi : 0;
+    }
+    sum_yi = 1 - sum_yi;
+    y[num_cells * inertIndex + index] = (sum_yi > 0 ? sum_yi : 0);
 }
 
 __global__ void fvm_div_internal_scalar(int num_cells, int num_faces,
@@ -528,7 +544,7 @@ dfYEqn::dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std
 
     h_A_csr = new double[(num_cells + num_faces) * (num_species - 1)];
     h_b = new double[num_cells * (num_species - 1)];
-    cudaMallocHost(&h_psi, num_cells * (num_species - 1) * sizeof(double));
+    cudaMallocHost(&h_psi, num_cells * num_species * sizeof(double));
 
     checkCudaErrors(cudaMalloc((void **)&d_A_csr, (num_cells + num_faces) * (num_species - 1) * sizeof(double)));
     checkCudaErrors(cudaMalloc((void **)&d_b, cell_bytes * (num_species - 1)));
@@ -608,7 +624,7 @@ void dfYEqn::fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(std::vecto
     int mtxIndex = 0;
     for (size_t i = 0; i < num_species; ++i)
     {
-        checkCudaErrors(cudaMemcpyAsync(dataBase_.d_Y[i], Y_old[i], cell_bytes, cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(dataBase_.d_Y + i * num_cells, Y_old[i], cell_bytes, cudaMemcpyHostToDevice, stream));
         checkCudaErrors(cudaMemcpyAsync(d_boundary_Y, boundary_Y[i], boundary_face_bytes, cudaMemcpyHostToDevice, stream));
         checkCudaErrors(cudaMemcpyAsync(d_hai, hai[i], cell_bytes, cudaMemcpyHostToDevice, stream));
         checkCudaErrors(cudaMemcpyAsync(d_boundary_hai, boundary_hai[i], boundary_face_bytes, cudaMemcpyHostToDevice, stream));
@@ -623,7 +639,7 @@ void dfYEqn::fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(std::vecto
         blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
         fvc_grad_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
                 d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-                dataBase_.d_face_vector, dataBase_.d_weight, dataBase_.d_Y[i],
+                dataBase_.d_face_vector, dataBase_.d_weight, dataBase_.d_Y + i * num_cells,
                 dataBase_.d_volume, d_grady);
         blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
         fvc_grad_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
@@ -641,7 +657,7 @@ void dfYEqn::fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(std::vecto
         // sum(chemistry->hai(i)*Yi)
         blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
         sumError_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-                d_hai, d_rhoD, dataBase_.d_Y[i], d_grady,
+                d_hai, d_rhoD, dataBase_.d_Y + i * num_cells, d_grady,
                 d_sum_hai_rhoD_grady, d_sum_rhoD_grady, d_sum_hai_y);
         blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block;
         sumError_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_faces,
@@ -653,7 +669,7 @@ void dfYEqn::fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(std::vecto
         blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
         fvc_laplacian_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
                 d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-                d_alpha, d_hai, dataBase_.d_Y[i],
+                d_alpha, d_hai, dataBase_.d_Y + i * num_cells,
                 dataBase_.d_weight, dataBase_.d_face, dataBase_.d_deltaCoeffs,
                 dataBase_.d_volume, dataBase_.d_diffAlphaD);
 
@@ -699,10 +715,10 @@ void dfYEqn::fvm_ddt()
         blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
         fvm_ddt_kernel_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces, dataBase_.rdelta_t,
                 d_A_csr_row_index, d_A_csr_diag_index,
-                dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, dataBase_.d_Y[i],
+                dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, dataBase_.d_Y + i * num_cells,
                 d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
-                d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
-                d_psi + mtxIndex * num_cells);
+                d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells);
+                //d_psi + mtxIndex * num_cells);
         ++mtxIndex;
     }
 }
@@ -843,14 +859,23 @@ void dfYEqn::solve()
             ++solverIndex;
         }
     }
-    int solverIndex = 0;
-    for (auto &solver : YSolverSet)
+    int mtxIndex = 0;
+    for (size_t i = 0; i < num_species; ++i)
     {
-        solver->solve(num_cells, d_psi + solverIndex * num_cells, d_b + solverIndex * num_cells);
-        ++solverIndex;
+        if (i == inertIndex)
+            continue;
+
+        YSolverSet[mtxIndex]->solve(num_cells, dataBase_.d_Y + i * num_cells, d_b + mtxIndex * num_cells);
+        ++mtxIndex;
     }
+
+    size_t threads_per_block, blocks_per_grid;
+    threads_per_block = 1024;
+    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    compute_inertIndex_y<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_species, inertIndex, dataBase_.d_Y);
+    checkCudaErrors(cudaMemcpyAsync(h_psi, dataBase_.d_Y, num_species * cell_bytes, cudaMemcpyDeviceToHost, stream));
+
     num_iteration++;
-    checkCudaErrors(cudaMemcpyAsync(h_psi, d_psi, num_cells * (num_species - 1) * sizeof(double), cudaMemcpyDeviceToHost, stream));
     // checkCudaErrors(cudaStreamSynchronize(stream));
     // for (size_t i = 0; i < num_cells; i++)
     //     fprintf(stderr, "h_species_gpu[%d]: %.5e\n", i, h_psi[i + 0 * num_cells]);

From 192994709739bc023b563f9b5b30c044c92fddf8 Mon Sep 17 00:00:00 2001
From: STwangyingrui <wangyingrui@sensetime.com>
Date: Sun, 11 Jun 2023 15:29:58 +0800
Subject: [PATCH 6/6] small refactor of time profiling

---
 applications/solvers/dfLowMachFoam/EEqn.H | 18 +++++++++---------
 applications/solvers/dfLowMachFoam/pEqn.H |  4 ----
 2 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H
index bcab9affb..93e60e7bf 100644
--- a/applications/solvers/dfLowMachFoam/EEqn.H
+++ b/applications/solvers/dfLowMachFoam/EEqn.H
@@ -1,15 +1,6 @@
 {
     volScalarField& he = thermo.he();
 
-#ifdef GPUSolver_
-    start1 = std::clock();
-    UEqn_GPU.updatePsi(&U[0][0]);
-    K = 0.5*magSqr(U);
-    end1 = std::clock();
-    time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_UEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC);
-#endif
-
 #ifdef CPUSolver_
     start1 = std::clock();
     fvScalarMatrix EEqn
@@ -34,6 +25,15 @@
     time_monitor_EEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 #endif
 
+#ifdef GPUSolver_
+    start1 = std::clock();
+    UEqn_GPU.updatePsi(&U[0][0]);
+    K = 0.5*magSqr(U);
+    end1 = std::clock();
+    time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_UEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC);
+#endif
+
 #ifdef GPUSolver_
     // prepare data on CPU
     start1 = std::clock();
diff --git a/applications/solvers/dfLowMachFoam/pEqn.H b/applications/solvers/dfLowMachFoam/pEqn.H
index 913bee4fa..34925327f 100644
--- a/applications/solvers/dfLowMachFoam/pEqn.H
+++ b/applications/solvers/dfLowMachFoam/pEqn.H
@@ -69,13 +69,9 @@ const volScalarField psip0(psi*p);
 
 start2 = std::clock();
 #ifdef GPUSolver_
-    start1 = std::clock();
     volScalarField rAU(1.0/UEqn_A);
     surfaceScalarField rhorAUf("rhorAUf", fvc::interpolate(rho*rAU));
     volVectorField HbyA(constrainHbyA(rAU*UEqn_H, U, p));
-    end1 = std::clock();
-    time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_UEqn_A += double(end1 - start1) / double(CLOCKS_PER_SEC);
 #else
     volScalarField rAU(1.0/UEqn.A());
     surfaceScalarField rhorAUf("rhorAUf", fvc::interpolate(rho*rAU));