diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H
index bdef9043f..8e8c0bee0 100644
--- a/applications/solvers/dfLowMachFoam/EEqn.H
+++ b/applications/solvers/dfLowMachFoam/EEqn.H
@@ -23,13 +23,16 @@
      ==
         fvc::div(hDiffCorrFlux)
     );
+    end1 = std::clock();
+    time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
-    EEqn.relax();
-
+    // EEqn.relax();
+    start1 = std::clock();
     EEqn.solve();
-
     end1 = std::clock();
     time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_EEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 #endif
 
 #ifdef GPUSolver_
@@ -52,35 +55,22 @@
         memcpy(boundary_hDiffCorrFlux + eeqn_offset * 3, &patchhDiffCorrFlux[0][0], 3 * patchSize*sizeof(double));
         memcpy(boundary_alphaEff + eeqn_offset, &patchAlphaEff[0], patchSize*sizeof(double));
 
-        Field<scalar> valueInternalCoeffs = patchFlux*he.boundaryField()[patchi].valueInternalCoeffs(pw);
-        Field<scalar> valueBoundaryCoeffs = -patchFlux*he.boundaryField()[patchi].valueBoundaryCoeffs(pw);
-        Field<scalar> gradientInternalCoeffs = he.boundaryField()[patchi].gradientInternalCoeffs();
-        Field<scalar> gradientBoundaryCoeffs = he.boundaryField()[patchi].gradientBoundaryCoeffs();
-        memcpy(eeqn_valueInternalCoeffs + eeqn_offset, &valueInternalCoeffs[0], patchSize*sizeof(double));
-        memcpy(eeqn_valueBoundaryCoeffs + eeqn_offset, &valueBoundaryCoeffs[0], patchSize*sizeof(double));
-        memcpy(eeqn_gradientInternalCoeffs + eeqn_offset, &gradientInternalCoeffs[0], patchSize*sizeof(double));
-        memcpy(eeqn_gradientBoundaryCoeffs + eeqn_offset, &gradientBoundaryCoeffs[0], patchSize*sizeof(double));
-
         eeqn_offset += patchSize;
     }
-    eeqn_offset = 0;
     end1 = std::clock();
-    time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC);
     time_monitor_EEqn_mtxAssembly_CPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
     // prepare data on GPU
     start1 = std::clock();
     EEqn_GPU.prepare_data(&he.oldTime()[0], &K[0], &K.oldTime()[0], &alphaEff[0],
             &dpdt[0], &diffAlphaD[0], &hDiffCorrFlux[0][0],
-            boundary_K, boundary_hDiffCorrFlux, boundary_alphaEff,
-            eeqn_valueInternalCoeffs, eeqn_valueBoundaryCoeffs,
-            eeqn_gradientInternalCoeffs, eeqn_gradientBoundaryCoeffs);
+            boundary_K, boundary_hDiffCorrFlux, boundary_alphaEff);
     if (doSync) EEqn_GPU.sync();
     end1 = std::clock();
     time_monitor_EEqn_mtxAssembly_GPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
     start1 = std::clock();
-    EEqn_GPU.resetAb();
+    EEqn_GPU.initializeTimeStep();
     EEqn_GPU.fvm_ddt();
     EEqn_GPU.fvm_div();
     EEqn_GPU.fvm_laplacian();
@@ -98,7 +88,7 @@
     time_monitor_EEqn_mtxAssembly += double(end2 - start2) / double(CLOCKS_PER_SEC);
 
     // check value of mtxAssembly, no time monitor
-    EEqn_GPU.checkValue(false);
+    // EEqn_GPU.checkValue(false);
 
     start1 = std::clock();
     EEqn_GPU.solve();
@@ -109,8 +99,8 @@
 
     start1 = std::clock();
     EEqn_GPU.updatePsi(&he[0]);
+    he.correctBoundaryConditions();
     end1 = std::clock();
-    time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC);
     time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
     time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
 #endif
diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H
index 5a089e0c0..4427ece1b 100644
--- a/applications/solvers/dfLowMachFoam/UEqn.H
+++ b/applications/solvers/dfLowMachFoam/UEqn.H
@@ -1,6 +1,7 @@
 // Solve the Momentum equation
 #ifdef GPUSolver_
     start1 = std::clock();
+    UEqn_GPU.initializeTimeStep();
     UEqn_GPU.fvm_ddt(&U.oldTime()[0][0]);
     start2 = std::clock();
     int offset = 0;
@@ -8,30 +9,13 @@
     const volScalarField& nuEff = nuEff_tmp();
     forAll(U.boundaryField(), patchi)
     {
-        const fvsPatchScalarField& patchFlux = phi.boundaryField()[patchi];
-        const fvsPatchScalarField& pw = mesh.surfaceInterpolation::weights().boundaryField()[patchi];
-
         const scalarField& patchP = p.boundaryField()[patchi];
-        const vectorField& pSf = mesh.Sf().boundaryField()[patchi];
         const vectorField& patchU = U.boundaryField()[patchi];
         const scalarField& patchRho = rho.boundaryField()[patchi];
         const scalarField& patchNuEff = nuEff.boundaryField()[patchi];
 
-        int patchSize = pw.size();
-
-        Field<vector> ueqn_internalCoeffs_vec = patchFlux*U.boundaryField()[patchi].valueInternalCoeffs(pw);
-        Field<vector> ueqn_boundaryCoeffs_vec = -patchFlux*U.boundaryField()[patchi].valueBoundaryCoeffs(pw); 
-        Field<vector> ueqn_laplac_internalCoeffs_vec = U.boundaryField()[patchi].gradientInternalCoeffs();
-        Field<vector> ueqn_laplac_boundaryCoeffs_vec = U.boundaryField()[patchi].gradientBoundaryCoeffs();
+        int patchSize = patchP.size();
 
-        // only need to construct once
-        memcpy(ueqn_internalCoeffs_init + 3*offset, &ueqn_internalCoeffs_vec[0][0], 3*patchSize*sizeof(double));
-        // need to construct every time step
-        memcpy(ueqn_boundaryCoeffs_init + 3*offset, &ueqn_boundaryCoeffs_vec[0][0], 3*patchSize*sizeof(double));
-        // laplacian internalCoeffs
-        memcpy(ueqn_laplac_internalCoeffs_init + 3*offset, &ueqn_laplac_internalCoeffs_vec[0][0], 3*patchSize*sizeof(double));
-        // laplacian boundaryCoeffs
-        memcpy(ueqn_laplac_boundaryCoeffs_init + 3*offset, &ueqn_laplac_boundaryCoeffs_vec[0][0], 3*patchSize*sizeof(double));
         // boundary pressure
         memcpy(boundary_pressure_init+offset, &patchP[0], patchSize*sizeof(double));
         // boundary velocity
@@ -44,14 +28,12 @@
     }
     end1 = std::clock();
     end2 = std::clock();
-    time_monitor_CPU += double(end2 - start2) / double(CLOCKS_PER_SEC);
+    time_monitor_UEqn_CPU += double(end2 - start2) / double(CLOCKS_PER_SEC);
     time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
     time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
     start1 = std::clock();
-    UEqn_GPU.fvm_div(&phi[0], ueqn_internalCoeffs_init, ueqn_boundaryCoeffs_init, 
-        ueqn_laplac_internalCoeffs_init, ueqn_laplac_boundaryCoeffs_init, 
-        boundary_pressure_init, boundary_velocity_init, boundary_nuEff_init, boundary_rho_init);
+    UEqn_GPU.fvm_div(boundary_pressure_init, boundary_velocity_init, boundary_nuEff_init, boundary_rho_init);
     UEqn_GPU.fvc_grad(&p[0]);
     UEqn_GPU.fvc_grad_vector();
     UEqn_GPU.dev2T();
diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H
index eeaea40e7..c741a8f48 100644
--- a/applications/solvers/dfLowMachFoam/YEqn.H
+++ b/applications/solvers/dfLowMachFoam/YEqn.H
@@ -1,5 +1,4 @@
 start = std::clock();
-cudaSetDevice(0);
 hDiffCorrFlux = Zero;
 diffAlphaD = Zero;
 sumYDiffError = Zero;
@@ -15,12 +14,15 @@ tmp<fv::convectionScheme<scalar>> mvConvection
     )
 );
 
-
+start1 = std::clock();
 forAll(Y, i)
 {
     sumYDiffError += chemistry->rhoD(i)*fvc::grad(Y[i]);
 }
 const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf();
+start1 = std::clock();
+time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
 #ifdef GPUSolver_
     start1 = std::clock();
@@ -32,13 +34,14 @@ const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf();
 #endif
 
 #ifdef GPUSolver_
-    std::vector<double*> Y_new(Y.size()), Y_old(Y.size()), boundary_Y_init(Y.size()), boundary_rhoD_init(Y.size());
+    start1 = std::clock();
+    start2 = std::clock();
+    std::vector<double*> Y_old(Y.size()), boundary_Y_init(Y.size()), boundary_rhoD_init(Y.size());
     std::vector<const double*> rhoD_GPU(Y.size());
     for (size_t i = 0; i < Y.size(); ++i)
     {
         volScalarField& Yi = Y[i];
         const volScalarField& rhoDi = chemistry->rhoD(i);
-        Y_new[i] = &Yi[0];
         Y_old[i] = &Yi.oldTime()[0];
         rhoD_GPU[i] = &chemistry->rhoD(i)[0];
         cudaMallocHost(&boundary_Y_init[i], num_boundary_faces*sizeof(double));
@@ -64,15 +67,25 @@ const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf();
         int patchSize = patchMut_sct.size();
         boundary_mutsct.insert(boundary_mutsct.end(), &patchMut_sct[0], &patchMut_sct[0] + patchSize);
     }
+    end2 = std::clock();
+    time_monitor_YEqn_mtxAssembly_CPU_Prepare += double(end2 - start2) / double(CLOCKS_PER_SEC);
 
     YEqn_GPU.upwindWeight();
-    YEqn_GPU.correctVelocity(Y_new, boundary_Y_init, rhoD_GPU);
-    YEqn_GPU.fvm_ddt(Y_old);
+    YEqn_GPU.correctVelocity(Y_old, boundary_Y_init, rhoD_GPU);
+    YEqn_GPU.fvm_ddt();
     YEqn_GPU.fvm_div_phi();
     YEqn_GPU.fvm_div_phiUc();
     YEqn_GPU.fvm_laplacian(&mut_sct[0], boundary_mutsct.data(), boundary_rhoD_init);
 
+    end1 = std::clock();
+    time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+
+    start1 = std::clock();
     YEqn_GPU.solve();
+    end1 = std::clock();
+    time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 #endif
 
 //MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM);
@@ -108,9 +121,15 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC);
         if (i != inertIndex)
         {
             #ifdef GPUSolver_
+                start1 = std::clock();
                 YEqn_GPU.updatePsi(&Yi[0], speciesIndex);
                 Yi.correctBoundaryConditions();
+                end1 = std::clock();
+                time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+                time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+                time_monitor_YEqn_mtxAssembly_CPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);
             #else
+                start1 = std::clock();
                 tmp<volScalarField> DEff = chemistry->rhoD(i) + turbulence->mut()/Sct;
                 fvScalarMatrix YiEqn
                 (
@@ -124,10 +143,16 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC);
                     :  (fvm::laplacian(DEff(), Yi) + combustion->R(Yi))
                     )
                 );
+                end1 = std::clock();
+                time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+                time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+                // YiEqn.relax();
 
-                YiEqn.relax();
-
+                start1 = std::clock();
                 YiEqn.solve("Yi");
+                end1 = std::clock();
+                time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+                time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
             #endif
 
             Yi.max(0.0);
@@ -141,5 +166,4 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC);
 
     end = std::clock();
     time_monitor_Y += double(end - start) / double(CLOCKS_PER_SEC);
-    cudaSetDevice(0);
 }
diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
index 80bf0bcd8..c2a62797a 100644
--- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
+++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
@@ -102,6 +102,7 @@ int main(int argc, char *argv[])
     double time_monitor_UEqn_mtxAssembly=0;
     double time_monitor_UEqn_Solve=0;
     double time_monitor_UEqn_sum=0;
+    double time_monitor_UEqn_CPU=0;
     double time_monitor_EEqn=0;
     double time_monitor_EEqn_mtxAssembly=0;
     double time_monitor_EEqn_mtxAssembly_CPU_Prepare;
@@ -109,6 +110,10 @@ int main(int argc, char *argv[])
     double time_monitor_EEqn_mtxAssembly_GPU_Run;
     double time_monitor_EEqn_Solve=0;
     double time_monitor_EEqn_sum=0;
+    double time_monitor_YEqn=0;
+    double time_monitor_YEqn_mtxAssembly=0;
+    double time_monitor_YEqn_mtxAssembly_CPU_Prepare=0;
+    double time_monitor_YEqn_Solve=0;
     double time_monitor_chem=0;
     double time_monitor_Y=0;
     double time_monitor_E=0;
@@ -213,7 +218,7 @@ int main(int argc, char *argv[])
             {
                 if (pimple.consistent())
                 {
-                    #include "pcEqn.H"
+                    // #include "pcEqn.H"
                 }
                 else
                 {
@@ -255,9 +260,12 @@ int main(int argc, char *argv[])
         Info<< "EEqn assamble(GPU prepare) = " << time_monitor_EEqn_mtxAssembly_GPU_Prepare << " s" << endl;
         Info<< "EEqn assamble(GPU run)     = " << time_monitor_EEqn_mtxAssembly_GPU_Run << " s" << endl;
         Info<< "EEqn Time solve            = " << time_monitor_EEqn_Solve << " s" << endl;
+        Info<< "YEqn Time                  = " << time_monitor_YEqn << " s" << endl;
+        Info<< "YEqn Time assamble Mtx     = " << time_monitor_YEqn_mtxAssembly << " s" << endl;
+        Info<< "YEqn assamble(CPU prepare) = " << time_monitor_YEqn_mtxAssembly_CPU_Prepare << " s" << endl;
+        Info<< "YEqn Time solve            = " << time_monitor_YEqn_Solve << " s" << endl;
         Info<< "sum Time                   = " << (time_monitor_chem + time_monitor_Y + time_monitor_flow + time_monitor_E + time_monitor_corrThermo + time_monitor_corrDiff) << " s" << endl;
         Info<< "CPU Time (get turb souce)  = " << time_monitor_CPU << " s" << endl;
-        Info<< "UEqn time in EEqn          = " << time_monitor_UinE << " s" << endl;
         Info<< "============================================"<<nl<< endl;
 
         Info<< "ExecutionTime = " << runTime.elapsedCpuTime() << " s"
@@ -279,6 +287,10 @@ int main(int argc, char *argv[])
         time_monitor_corrDiff = 0;
         time_monitor_CPU = 0;
         time_monitor_UinE = 0;
+        time_monitor_YEqn=0;
+        time_monitor_YEqn_mtxAssembly=0;
+        time_monitor_YEqn_mtxAssembly_CPU_Prepare=0;
+        time_monitor_YEqn_Solve=0;
 
 #ifdef USE_PYTORCH
         if (log_ && torch_)
diff --git a/applications/solvers/dfLowMachFoam/pEqn.H b/applications/solvers/dfLowMachFoam/pEqn.H
index f8e7763be..de3d8bfd5 100644
--- a/applications/solvers/dfLowMachFoam/pEqn.H
+++ b/applications/solvers/dfLowMachFoam/pEqn.H
@@ -53,12 +53,12 @@ const volScalarField psip0(psi*p);
     volScalarField rAU(1.0/UEqn.A());
     surfaceScalarField rhorAUf("rhorAUf", fvc::interpolate(rho*rAU));
     volVectorField HbyA(constrainHbyA(rAU*UEqn.H(), U, p));
-#endif
 
-if (pimple.nCorrPiso() <= 1)
-{
-    tUEqn.clear();
-}
+    if (pimple.nCorrPiso() <= 1)
+    {
+        tUEqn.clear();
+    }
+#endif
 
 surfaceScalarField phiHbyA
 (
@@ -143,7 +143,10 @@ p.relax();
 U = HbyA - rAU*fvc::grad(p);
 U.correctBoundaryConditions();
 K = 0.5*magSqr(U);
-UEqn_GPU.correctPsi(&U[0][0]);
+
+#ifdef GPUSolver_
+    UEqn_GPU.correctPsi(&U[0][0]);
+#endif
 
 if (pimple.simpleRho())
 {
diff --git a/applications/solvers/dfLowMachFoam/rhoEqn.H b/applications/solvers/dfLowMachFoam/rhoEqn.H
index a7d5552fe..1c0652dca 100644
--- a/applications/solvers/dfLowMachFoam/rhoEqn.H
+++ b/applications/solvers/dfLowMachFoam/rhoEqn.H
@@ -41,7 +41,7 @@ Description
         offset += patchSize;
     }
     rhoEqn_GPU.fvc_div(&phi[0], boundary_phi_init);
-    rhoEqn_GPU.fvm_ddt(&rho.oldTime()[0], &rho[0]);
+    rhoEqn_GPU.fvm_ddt(&rho.oldTime()[0]);
     rhoEqn_GPU.updatePsi(&rho.primitiveFieldRef()[0]);
     rho.correctBoundaryConditions();
 }
diff --git a/src_gpu/dfEEqn.H b/src_gpu/dfEEqn.H
index b606e5f28..483c44bb6 100644
--- a/src_gpu/dfEEqn.H
+++ b/src_gpu/dfEEqn.H
@@ -52,11 +52,9 @@ public:
 
     void prepare_data(const double *he_old, const double *K, const double *K_old, const double *alphaEff,
             const double *hDiffCorrFlux, const double *dpdt, const double *diffAlphaD,
-            const double *boundary_K, const double *boundary_hDiffCorrFlux, const double *boundary_alphaEff,
-            const double *valueInternalCoeffs, const double *valueBoundaryCoeffs,
-            const double *gradientInternalCoeffs, const double *gradientBoundaryCoeffs);
+            const double *boundary_K, const double *boundary_hDiffCorrFlux, const double *boundary_alphaEff);
 
-    void resetAb();
+    void initializeTimeStep();
 
     void fvm_ddt();
     void fvm_div();
diff --git a/src_gpu/dfEEqn.cu b/src_gpu/dfEEqn.cu
index 6907ed4cd..2ef941f3b 100644
--- a/src_gpu/dfEEqn.cu
+++ b/src_gpu/dfEEqn.cu
@@ -149,12 +149,12 @@ __global__ void eeqn_fvm_laplacian_uncorrected_internal(int num_cells,
 }
 
 __global__ void eeqn_fvm_laplacian_uncorrected_boundary(int num_boundary_cells,
-                                                               const int *csr_row_index, const int *csr_diag_index,
-                                                               const int *boundary_cell_offset, const int *boundary_cell_id,
-                                                               const double *boundary_alphaEff, const double *boundary_magsf,
-                                                               const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs,
-                                                               const double sign, const double *A_csr_input, const double *b_input,
-                                                               double *A_csr_output, double *b_output)
+                                                        const int *csr_row_index, const int *csr_diag_index,
+                                                        const int *boundary_cell_offset, const int *boundary_cell_id,
+                                                        const double *boundary_alphaEff, const double *boundary_magsf,
+                                                        const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs,
+                                                        const double sign, const double *A_csr_input, const double *b_input,
+                                                        double *A_csr_output, double *b_output)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
     if (index >= num_boundary_cells)
@@ -211,9 +211,9 @@ __global__ void eeqn_fvc_ddt_kernel(int num_cells, const double rdelta_t,
 }
 
 __global__ void eeqn_fvc_div_vector_internal(int num_cells,
-        const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
-        const double *sf, const double *vf, const double *tlambdas,
-        const double sign, const double *b_input, double *b_output)
+                                             const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
+                                             const double *sf, const double *vf, const double *tlambdas,
+                                             const double sign, const double *b_input, double *b_output)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
     if (index >= num_cells)
@@ -267,9 +267,9 @@ __global__ void eeqn_fvc_div_vector_internal(int num_cells,
 }
 
 __global__ void eeqn_fvc_div_vector_boundary(int num_boundary_cells,
-        const int *boundary_cell_offset, const int *boundary_cell_id,
-        const double *boundary_sf, const double *boundary_vf,
-        const double sign, const double *b_input, double *b_output)
+                                             const int *boundary_cell_offset, const int *boundary_cell_id,
+                                             const double *boundary_sf, const double *boundary_vf,
+                                             const double sign, const double *b_input, double *b_output)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
     if (index >= num_boundary_cells)
@@ -316,9 +316,9 @@ __global__ void eeqn_fvc_div_vector_boundary(int num_boundary_cells,
 }
 
 __global__ void eeqn_fvc_div_phi_scalar_internal(int num_cells,
-                                       const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
-                                       const double *weight, const double *phi, const double *K,
-                                       const double sign, const double *b_input, double *b_output)
+                                                 const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
+                                                 const double *weight, const double *phi, const double *K,
+                                                 const double sign, const double *b_input, double *b_output)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
     if (index >= num_cells)
@@ -362,9 +362,9 @@ __global__ void eeqn_fvc_div_phi_scalar_internal(int num_cells,
 }
 
 __global__ void eeqn_fvc_div_phi_scalar_boundary(int num_boundary_cells,
-                                       const int *boundary_cell_offset, const int *boundary_cell_id,
-                                       const double *boundary_phi, const double* boundary_K,
-                                       const double sign, const double *b_input, double *b_output)
+                                                 const int *boundary_cell_offset, const int *boundary_cell_id,
+                                                 const double *boundary_phi, const double *boundary_K,
+                                                 const double sign, const double *b_input, double *b_output)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
     if (index >= num_boundary_cells)
@@ -384,12 +384,11 @@ __global__ void eeqn_fvc_div_phi_scalar_boundary(int num_boundary_cells,
     b_output[cell_index] = b_input[cell_index] + interp * sign;
 }
 
-
 __global__ void eeqn_add_to_source_kernel(int num_cells,
-                                    const double sign_dpdt, const double *dpdt,
-                                    const double sign_diffAlphaD, const double *diffAlphaD,
-                                    const double *volume,
-                                    const double *b_input, double *b_output)
+                                          const double sign_dpdt, const double *dpdt,
+                                          const double sign_diffAlphaD, const double *diffAlphaD,
+                                          const double *volume,
+                                          const double *b_input, double *b_output)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
     if (index >= num_cells)
@@ -399,14 +398,10 @@ __global__ void eeqn_add_to_source_kernel(int num_cells,
 }
 
 __global__ void eeqn_boundaryPermutation(const int num_boundary_faces, const int *bouPermedIndex,
-                                    const double *boundary_K_init, const double *boundary_hDiffCorrFlux_init,
-                                    const double *boundary_alphaEff_init,
-                                    const double *value_internal_coeffs_init, const double *value_boundary_coeffs_init,
-                                    const double *gradient_internal_coeffs_init, const double *gradient_boundary_coeffs_init,
-                                    double *boundary_K, double *boundary_hDiffCorrFlux,
-                                    double *boundary_alphaEff,
-                                    double *value_internal_coeffs, double *value_boundary_coeffs,
-                                    double *gradient_internal_coeffs, double *gradient_boundary_coeffs)
+                                         const double *boundary_K_init, const double *boundary_hDiffCorrFlux_init,
+                                         const double *boundary_alphaEff_init,
+                                         double *boundary_K, double *boundary_hDiffCorrFlux,
+                                         double *boundary_alphaEff)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
     if (index >= num_boundary_faces)
@@ -419,10 +414,26 @@ __global__ void eeqn_boundaryPermutation(const int num_boundary_faces, const int
     boundary_hDiffCorrFlux[index * 3 + 1] = boundary_hDiffCorrFlux_init[p * 3 + 1];
     boundary_hDiffCorrFlux[index * 3 + 2] = boundary_hDiffCorrFlux_init[p * 3 + 2];
     boundary_alphaEff[index] = boundary_alphaEff_init[p];
-    value_internal_coeffs[index] = value_internal_coeffs_init[p];
-    value_boundary_coeffs[index] = value_boundary_coeffs_init[p];
-    gradient_internal_coeffs[index] = gradient_internal_coeffs_init[p];
-    gradient_boundary_coeffs[index] = gradient_boundary_coeffs_init[p];
+}
+
+__global__ void eeqn_update_BoundaryCoeffs_kernel(int num_boundary_faces, const double *boundary_phi, double *internal_coeffs,
+                                                  double *boundary_coeffs, double *laplac_internal_coeffs,
+                                                  double *laplac_boundary_coeffs)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_boundary_faces)
+        return;
+
+    // zeroGradient
+    double valueInternalCoeffs = 1.;
+    double valueBoundaryCoeffs = 0.;
+    double gradientInternalCoeffs = 0.;
+    double gradientBoundaryCoeffs = 0.;
+
+    internal_coeffs[index] = boundary_phi[index] * valueInternalCoeffs;
+    boundary_coeffs[index] = -boundary_phi[index] * valueBoundaryCoeffs;
+    laplac_internal_coeffs[index] = gradientInternalCoeffs;
+    laplac_boundary_coeffs[index] = gradientBoundaryCoeffs;
 }
 
 // constructor
@@ -480,10 +491,9 @@ dfEEqn::dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std
 }
 
 void dfEEqn::prepare_data(const double *he_old, const double *K, const double *K_old, const double *alphaEff,
-        const double *dpdt, const double *diffAlphaD, const double *hDiffCorrFlux,
-        const double *boundary_K, const double *boundary_hDiffCorrFlux, const double *boundary_alphaEff,
-        const double *valueInternalCoeffs, const double *valueBoundaryCoeffs,
-        const double *gradientInternalCoeffs, const double *gradientBoundaryCoeffs) {
+                          const double *dpdt, const double *diffAlphaD, const double *hDiffCorrFlux,
+                          const double *boundary_K, const double *boundary_hDiffCorrFlux, const double *boundary_alphaEff)
+{
     // TODO not real async copy now, because some host array are not in pinned memory.
 
     // copy the host input array in host memory to the device input array in device memory
@@ -499,26 +509,25 @@ void dfEEqn::prepare_data(const double *he_old, const double *K, const double *K
     checkCudaErrors(cudaMemcpyAsync(d_boundary_K_init, boundary_K, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
     checkCudaErrors(cudaMemcpyAsync(d_boundary_hDiffCorrFlux_init, boundary_hDiffCorrFlux, boundary_face_bytes * 3, cudaMemcpyHostToDevice, stream));
     checkCudaErrors(cudaMemcpyAsync(d_boundary_alphaEff_init, boundary_alphaEff, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_value_internal_coeffs_init, valueInternalCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_value_boundary_coeffs_init, valueBoundaryCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_gradient_internal_coeffs_init, gradientInternalCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_gradient_boundary_coeffs_init, gradientBoundaryCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
 
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block;
     eeqn_boundaryPermutation<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_faces, dataBase_.d_bouPermedIndex,
-            d_boundary_K_init, d_boundary_hDiffCorrFlux_init, d_boundary_alphaEff_init,
-            d_value_internal_coeffs_init, d_value_boundary_coeffs_init,
-            d_gradient_internal_coeffs_init, d_gradient_boundary_coeffs_init,
-            d_boundary_K, d_boundary_hDiffCorrFlux, d_boundary_alphaEff,
-            d_value_internal_coeffs, d_value_boundary_coeffs,
-            d_gradient_internal_coeffs, d_gradient_boundary_coeffs);
+                                                                                d_boundary_K_init, d_boundary_hDiffCorrFlux_init, d_boundary_alphaEff_init,
+                                                                                d_boundary_K, d_boundary_hDiffCorrFlux, d_boundary_alphaEff);
 }
 
-void dfEEqn::resetAb() {
+void dfEEqn::initializeTimeStep()
+{
     // initialize matrix value
     checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, csr_value_vec_bytes, stream));
     checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream));
+    // initialize boundary coeffs
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block;
+    eeqn_update_BoundaryCoeffs_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(dataBase_.num_boundary_faces, dataBase_.d_boundary_phi,
+                                                                                         d_value_internal_coeffs, d_value_boundary_coeffs,
+                                                                                         d_gradient_internal_coeffs, d_gradient_boundary_coeffs);
 }
 
 void dfEEqn::fvm_ddt()
@@ -526,9 +535,9 @@ void dfEEqn::fvm_ddt()
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     eeqn_fvm_ddt_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, dataBase_.rdelta_t,
-            d_A_csr_row_index, d_A_csr_diag_index,
-            dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, d_he_old,
-            1., d_A_csr, d_b, d_A_csr, d_b);
+                                                                           d_A_csr_row_index, d_A_csr_diag_index,
+                                                                           dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, d_he_old,
+                                                                           1., d_A_csr, d_b, d_A_csr, d_b);
 }
 
 void dfEEqn::fvm_div()
@@ -536,15 +545,15 @@ void dfEEqn::fvm_div()
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     eeqn_fvm_div_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-            d_A_csr_row_index, d_A_csr_diag_index,
-            dataBase_.d_weight, dataBase_.d_phi,
-            1., d_A_csr, d_b, d_A_csr, d_b);
+                                                                             d_A_csr_row_index, d_A_csr_diag_index,
+                                                                             dataBase_.d_weight, dataBase_.d_phi,
+                                                                             1., d_A_csr, d_b, d_A_csr, d_b);
     blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
     eeqn_fvm_div_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
-            d_A_csr_row_index, d_A_csr_diag_index,
-            dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-            d_value_internal_coeffs, d_value_boundary_coeffs,
-            1., d_A_csr, d_b, d_A_csr, d_b);
+                                                                             d_A_csr_row_index, d_A_csr_diag_index,
+                                                                             dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+                                                                             d_value_internal_coeffs, d_value_boundary_coeffs,
+                                                                             1., d_A_csr, d_b, d_A_csr, d_b);
 }
 
 void dfEEqn::fvm_laplacian()
@@ -552,14 +561,14 @@ void dfEEqn::fvm_laplacian()
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     eeqn_fvm_laplacian_uncorrected_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-            d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, d_alphaEff, dataBase_.d_weight,
-            dataBase_.d_face, dataBase_.d_deltaCoeffs,
-            -1., d_A_csr, d_A_csr);
+                                                                                               d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, d_alphaEff, dataBase_.d_weight,
+                                                                                               dataBase_.d_face, dataBase_.d_deltaCoeffs,
+                                                                                               -1., d_A_csr, d_A_csr);
     blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
     eeqn_fvm_laplacian_uncorrected_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
-            d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-            d_boundary_alphaEff, dataBase_.d_boundary_face, d_gradient_internal_coeffs, d_gradient_boundary_coeffs,
-            -1., d_A_csr, d_b, d_A_csr, d_b);
+                                                                                               d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+                                                                                               d_boundary_alphaEff, dataBase_.d_boundary_face, d_gradient_internal_coeffs, d_gradient_boundary_coeffs,
+                                                                                               -1., d_A_csr, d_b, d_A_csr, d_b);
 }
 
 void dfEEqn::fvc_ddt()
@@ -568,8 +577,8 @@ void dfEEqn::fvc_ddt()
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     eeqn_fvc_ddt_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, dataBase_.rdelta_t,
-            dataBase_.d_rho_old, dataBase_.d_rho_new, d_K_old, d_K, dataBase_.d_volume,
-            -1., d_b, d_b);
+                                                                           dataBase_.d_rho_old, dataBase_.d_rho_new, d_K_old, d_K, dataBase_.d_volume,
+                                                                           -1., d_b, d_b);
 }
 
 void dfEEqn::fvc_div_vector()
@@ -578,14 +587,14 @@ void dfEEqn::fvc_div_vector()
     size_t threads_per_block = 512;
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     eeqn_fvc_div_vector_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-            d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-            dataBase_.d_face_vector, d_hDiffCorrFlux, dataBase_.d_weight,
-            1., d_b, d_b);
+                                                                                    d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
+                                                                                    dataBase_.d_face_vector, d_hDiffCorrFlux, dataBase_.d_weight,
+                                                                                    1., d_b, d_b);
     blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
     eeqn_fvc_div_vector_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
-            dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-            dataBase_.d_boundary_face_vector, d_boundary_hDiffCorrFlux,
-            1., d_b, d_b);
+                                                                                    dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+                                                                                    dataBase_.d_boundary_face_vector, d_boundary_hDiffCorrFlux,
+                                                                                    1., d_b, d_b);
 }
 
 void dfEEqn::fvc_div_phi_scalar()
@@ -594,14 +603,14 @@ void dfEEqn::fvc_div_phi_scalar()
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     eeqn_fvc_div_phi_scalar_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-            d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-            dataBase_.d_weight, dataBase_.d_phi, d_K,
-            -1., d_b, d_b);
+                                                                                        d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
+                                                                                        dataBase_.d_weight, dataBase_.d_phi, d_K,
+                                                                                        -1., d_b, d_b);
     blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
     eeqn_fvc_div_phi_scalar_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
-            dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-            dataBase_.d_boundary_phi, d_boundary_K,
-            -1., d_b, d_b);
+                                                                                        dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+                                                                                        dataBase_.d_boundary_phi, d_boundary_K,
+                                                                                        -1., d_b, d_b);
 }
 
 void dfEEqn::add_to_source()
@@ -612,7 +621,7 @@ void dfEEqn::add_to_source()
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     // " + fvc::ddt(rho，K)" is on the left side of "==", thus should minus from source.
     eeqn_add_to_source_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-            1., d_dpdt, -1., d_diffAlphaD, dataBase_.d_volume, d_b, d_b);
+                                                                                 1., d_dpdt, -1., d_diffAlphaD, dataBase_.d_volume, d_b, d_b);
 }
 
 void dfEEqn::checkValue(bool print)
@@ -701,10 +710,7 @@ void dfEEqn::sync()
 
 void dfEEqn::updatePsi(double *Psi)
 {
-    for (size_t i = 0; i < num_cells; i++)
-    {
-        Psi[i] = h_he_new[i];
-    }
+    memcpy(Psi, h_he_new, cell_bytes);
 }
 
 dfEEqn::~dfEEqn()
diff --git a/src_gpu/dfRhoEqn.H b/src_gpu/dfRhoEqn.H
index 24bb834d2..35885a517 100644
--- a/src_gpu/dfRhoEqn.H
+++ b/src_gpu/dfRhoEqn.H
@@ -37,7 +37,7 @@ public:
 
     void fvc_div(double *phi, double *boundary_phi_init);
 
-    void fvm_ddt(double *rho_old, double *rho_new);
+    void fvm_ddt(double *rho_old);
 
     void updatePsi(double* Psi);
 };
\ No newline at end of file
diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu/dfRhoEqn.cu
index 89aa4ae56..fc933f95b 100644
--- a/src_gpu/dfRhoEqn.cu
+++ b/src_gpu/dfRhoEqn.cu
@@ -123,7 +123,7 @@ void dfRhoEqn::fvc_div(double *phi, double *boundary_phi_init)
                                                                             dataBase_.d_bouPermedIndex, dataBase_.d_boundary_phi_init, dataBase_.d_boundary_phi, 1., d_b, d_b);
 }
 
-void dfRhoEqn::fvm_ddt(double *rho_old, double *rho_new)
+void dfRhoEqn::fvm_ddt(double *rho_old)
 {
     checkCudaErrors(cudaMemcpyAsync(dataBase_.d_rho_old, rho_old, cell_bytes, cudaMemcpyHostToDevice, stream));
     size_t threads_per_block = 1024;
diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H
index 8c6ed9c1e..3405a5bbc 100644
--- a/src_gpu/dfUEqn.H
+++ b/src_gpu/dfUEqn.H
@@ -31,9 +31,7 @@ public:
 
     void fvm_ddt(double *vector_old);
 
-    void fvm_div(double *phi, double *ueqn_internalCoeffs_init, double *ueqn_boundaryCoeffs_init,
-                 double *ueqn_laplac_internalCoeffs_init, double *ueqn_laplac_boundaryCoeffs_init,
-                 double *boundary_pressure_init, double *boundary_velocity_init,
+    void fvm_div(double *boundary_pressure_init, double *boundary_velocity_init,
                  double *boundary_nuEff_init, double *boundary_rho_init);
 
     void fvc_grad(double *pressure);
@@ -57,4 +55,6 @@ public:
     void updatePsi(double *Psi);
 
     void correctPsi(double *Psi);
+
+    void initializeTimeStep();
 };
\ No newline at end of file
diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu
index 5e664cde4..7eb7cbe31 100644
--- a/src_gpu/dfUEqn.cu
+++ b/src_gpu/dfUEqn.cu
@@ -281,11 +281,7 @@ __global__ void offdiagPermutation(const int num_faces, const int *permedIndex,
 }
 
 __global__ void boundaryPermutation(const int num_boundary_faces, const int *bouPermedIndex,
-                                    const double *ueqn_internalCoeffs_init, const double *ueqn_boundaryCoeffs_init,
-                                    const double *ueqn_laplac_internalCoeffs_init, const double *ueqn_laplac_boundaryCoeffs_init,
                                     const double *boundary_pressure_init, const double *boundary_velocity_init,
-                                    double *ueqn_internalCoeffs, double *ueqn_boundaryCoeffs,
-                                    double *ueqn_laplac_internalCoeffs, double *ueqn_laplac_boundaryCoeffs,
                                     double *boundary_pressure, double *boundary_velocity,
                                     double *boundary_nuEff_init, double *boundary_nuEff,
                                     double *boundary_rho_init, double *boundary_rho)
@@ -295,20 +291,6 @@ __global__ void boundaryPermutation(const int num_boundary_faces, const int *bou
         return;
 
     int p = bouPermedIndex[index];
-    ueqn_internalCoeffs[3 * index] = ueqn_internalCoeffs_init[3 * p];
-    ueqn_internalCoeffs[3 * index + 1] = ueqn_internalCoeffs_init[3 * p + 1];
-    ueqn_internalCoeffs[3 * index + 2] = ueqn_internalCoeffs_init[3 * p + 2];
-    ueqn_boundaryCoeffs[3 * index] = ueqn_boundaryCoeffs_init[3 * p];
-    ueqn_boundaryCoeffs[3 * index + 1] = ueqn_boundaryCoeffs_init[3 * p + 1];
-    ueqn_boundaryCoeffs[3 * index + 2] = ueqn_boundaryCoeffs_init[3 * p + 2];
-
-    ueqn_laplac_internalCoeffs[3 * index] = ueqn_laplac_internalCoeffs_init[3 * p];
-    ueqn_laplac_internalCoeffs[3 * index + 1] = ueqn_laplac_internalCoeffs_init[3 * p + 1];
-    ueqn_laplac_internalCoeffs[3 * index + 2] = ueqn_laplac_internalCoeffs_init[3 * p + 2];
-    ueqn_laplac_boundaryCoeffs[3 * index] = ueqn_laplac_boundaryCoeffs_init[3 * p];
-    ueqn_laplac_boundaryCoeffs[3 * index + 1] = ueqn_laplac_boundaryCoeffs_init[3 * p + 1];
-    ueqn_laplac_boundaryCoeffs[3 * index + 2] = ueqn_laplac_boundaryCoeffs_init[3 * p + 2];
-
     boundary_velocity[3 * index] = boundary_velocity_init[3 * p];
     boundary_velocity[3 * index + 1] = boundary_velocity_init[3 * p + 1];
     boundary_velocity[3 * index + 2] = boundary_velocity_init[3 * p + 2];
@@ -317,17 +299,6 @@ __global__ void boundaryPermutation(const int num_boundary_faces, const int *bou
     boundary_nuEff[index] = boundary_nuEff_init[p];
 }
 
-__global__ void csrPermutation(const int num_Nz, const int *csrPermedIndex,
-                               const double *d_turbSrc_A_init, double *d_turbSrc_A)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_Nz)
-        return;
-
-    int p = csrPermedIndex[index];
-    d_turbSrc_A[index] = d_turbSrc_A_init[p];
-}
-
 __global__ void fvc_grad_vector_internal(int num_cells,
                                          const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
                                          const double *sf, const double *vf, const double *tlambdas, const double *volume,
@@ -999,7 +970,35 @@ __global__ void addDiagDivVolume(int num_cells, const int *csr_row_index,
 
     double vol = volume[index];
 
-    A_output[index] = (A_input[index] + A_csr[csr_index] - ueqn_internal_coeffs[index * 3]) / vol; 
+    A_output[index] = (A_input[index] + A_csr[csr_index] - ueqn_internal_coeffs[index * 3]) / vol;
+}
+
+__global__ void ueqn_update_BoundaryCoeffs_kernel(int num_boundary_faces, const double *boundary_phi, double *internal_coeffs,
+                                                  double *boundary_coeffs, double *laplac_internal_coeffs,
+                                                  double *laplac_boundary_coeffs)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_boundary_faces)
+        return;
+
+    // zeroGradient
+    double valueInternalCoeffs = 1.;
+    double valueBoundaryCoeffs = 0.;
+    double gradientInternalCoeffs = 0.;
+    double gradientBoundaryCoeffs = 0.;
+
+    internal_coeffs[index * 3 + 0] = boundary_phi[index] * valueInternalCoeffs;
+    internal_coeffs[index * 3 + 1] = boundary_phi[index] * valueInternalCoeffs;
+    internal_coeffs[index * 3 + 2] = boundary_phi[index] * valueInternalCoeffs;
+    boundary_coeffs[index * 3 + 0] = -boundary_phi[index] * valueBoundaryCoeffs;
+    boundary_coeffs[index * 3 + 1] = -boundary_phi[index] * valueBoundaryCoeffs;
+    boundary_coeffs[index * 3 + 2] = -boundary_phi[index] * valueBoundaryCoeffs;
+    laplac_internal_coeffs[index * 3 + 0] = gradientInternalCoeffs;
+    laplac_internal_coeffs[index * 3 + 1] = gradientInternalCoeffs;
+    laplac_internal_coeffs[index * 3 + 2] = gradientInternalCoeffs;
+    laplac_boundary_coeffs[index * 3 + 0] = gradientBoundaryCoeffs;
+    laplac_boundary_coeffs[index * 3 + 1] = gradientBoundaryCoeffs;
+    laplac_boundary_coeffs[index * 3 + 2] = gradientBoundaryCoeffs;
 }
 
 // constructor
@@ -1041,9 +1040,6 @@ dfUEqn::dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std
 
 void dfUEqn::fvm_ddt(double *vector_old)
 {
-    // initialize matrix value
-    checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, csr_value_vec_bytes, stream));
-    checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream));
     // Copy the host input array in host memory to the device input array in device memory
     clock_t start = std::clock();
     checkCudaErrors(cudaMemcpyAsync(dataBase_.d_velocity_old, vector_old, cell_vec_bytes, cudaMemcpyHostToDevice, stream));
@@ -1061,53 +1057,24 @@ void dfUEqn::fvm_ddt(double *vector_old)
     time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
 }
 
-void dfUEqn::fvm_div(double *phi, double *ueqn_internalCoeffs_init, double *ueqn_boundaryCoeffs_init,
-                     double *ueqn_laplac_internalCoeffs_init, double *ueqn_laplac_boundaryCoeffs_init,
-                     double *boundary_pressure_init, double *boundary_velocity_init,
+void dfUEqn::fvm_div(double *boundary_pressure_init, double *boundary_velocity_init,
                      double *boundary_nuEff_init, double *boundary_rho_init)
 {
-    // copy and permutate face variables
-    clock_t start = std::clock();
-    // std::copy(phi, phi + num_surfaces, h_phi_init);
-    // std::copy(phi, phi + num_surfaces, h_phi_init + num_surfaces);
-    memcpy(dataBase_.h_phi_init, phi, num_surfaces * sizeof(double));
-    // memcpy(h_phi_init + num_surfaces, phi, num_surfaces*sizeof(double));
-
-    clock_t end = std::clock();
-    time_monitor_CPU += double(end - start) / double(CLOCKS_PER_SEC);
-
-    start = std::clock();
-    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi_init, dataBase_.h_phi_init, num_surfaces * sizeof(double), cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi_init + num_surfaces, dataBase_.d_phi_init, num_surfaces * sizeof(double), cudaMemcpyDeviceToDevice, stream));
-    end = std::clock();
-    time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC);
-
-    start = std::clock();
-    size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_faces + threads_per_block - 1) / threads_per_block;
-    offdiagPermutation<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_faces, dataBase_.d_permedIndex, dataBase_.d_phi_init, dataBase_.d_phi);
-    end = std::clock();
-    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
-
     // copy and permutate boundary variable
-    start = std::clock();
-    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_internal_coeffs_init, ueqn_internalCoeffs_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_coeffs_init, ueqn_boundaryCoeffs_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_laplac_internal_coeffs_init, ueqn_laplac_internalCoeffs_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_laplac_boundary_coeffs_init, ueqn_laplac_boundaryCoeffs_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream));
+    clock_t start = std::clock();
     checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_velocity_init, boundary_velocity_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream));
     checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_pressure_init, boundary_pressure_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream));
     checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_nuEff_init, boundary_nuEff_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream));
     checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_rho_init, boundary_rho_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream));
-    end = std::clock();
+    clock_t end = std::clock();
     time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC);
 
     start = std::clock();
-    blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block;
-    boundaryPermutation<<<blocks_per_grid, threads_per_block, 0, stream>>>(dataBase_.num_boundary_faces, dataBase_.d_bouPermedIndex, dataBase_.d_internal_coeffs_init,
-                                                                           dataBase_.d_boundary_coeffs_init, dataBase_.d_laplac_internal_coeffs_init, dataBase_.d_laplac_boundary_coeffs_init, dataBase_.d_boundary_pressure_init,
-                                                                           dataBase_.d_boundary_velocity_init, dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs, dataBase_.d_laplac_internal_coeffs, dataBase_.d_laplac_boundary_coeffs,
-                                                                           dataBase_.d_boundary_pressure, dataBase_.d_boundary_velocity, dataBase_.d_boundary_nuEff_init, dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho_init, dataBase_.d_boundary_rho);
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block;
+    boundaryPermutation<<<blocks_per_grid, threads_per_block, 0, stream>>>(dataBase_.num_boundary_faces, dataBase_.d_bouPermedIndex, dataBase_.d_boundary_pressure_init,
+                                                                           dataBase_.d_boundary_velocity_init, dataBase_.d_boundary_pressure, dataBase_.d_boundary_velocity, 
+                                                                           dataBase_.d_boundary_nuEff_init, dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho_init, dataBase_.d_boundary_rho);
 
     // launch cuda kernel
     blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
@@ -1229,8 +1196,6 @@ void dfUEqn::fvm_laplacian()
 
 void dfUEqn::A(double *Psi)
 {
-    // tmp<scalarField> tdiag(new scalarField(diag()));
-    // addCmptAvBoundaryDiag(tdiag.ref());
     checkCudaErrors(cudaMemsetAsync(d_A, 0, cell_bytes, stream));
 
     size_t threads_per_block = 1024;
@@ -1241,7 +1206,7 @@ void dfUEqn::A(double *Psi)
     blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     addDiagDivVolume<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, d_A_csr_row_index, d_A_csr_diag_index, d_A_csr,
                                                                         dataBase_.d_volume, d_ueqn_internal_coeffs, d_A, d_A);
-    
+
     checkCudaErrors(cudaMemcpyAsync(h_A, d_A, cell_bytes, cudaMemcpyDeviceToHost, stream));
     checkCudaErrors(cudaStreamSynchronize(stream));
     for (size_t i = 0; i < num_cells; i++)
@@ -1250,9 +1215,7 @@ void dfUEqn::A(double *Psi)
 
 void dfUEqn::H(double *Psi)
 {
-    checkCudaErrors(cudaMemsetAsync(d_H, 0, cell_vec_bytes, stream));
-    checkCudaErrors(cudaStreamSynchronize(stream));
-
+    checkCudaErrors(cudaMemsetAsync(d_H, 0, cell_bytes * 3, stream));
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
     addBoundaryDiag<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_cells, d_A_csr_row_index, d_A_csr_diag_index,
@@ -1275,7 +1238,20 @@ void dfUEqn::H(double *Psi)
     }
 
     // for (int i = 0; i < num_cells; i++)
-    // fprintf(stderr, "h_H_GPU[%d]: (%.5e, %.5e, %.5e)\n", i, h_H[i], h_H[num_cells + i], h_H[num_cells * 2 + i]);
+    //     fprintf(stderr, "h_H_GPU[%d]: (%.5e, %.5e, %.5e)\n", i, h_H[i], h_H[num_cells + i], h_H[num_cells * 2 + i]);
+}
+
+void dfUEqn::initializeTimeStep()
+{
+    // initialize matrix value
+    checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, csr_value_vec_bytes, stream));
+    checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream));
+    // initialize boundary coeffs
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block;
+    ueqn_update_BoundaryCoeffs_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(dataBase_.num_boundary_faces, dataBase_.d_boundary_phi,
+                                                                                         dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs,
+                                                                                         dataBase_.d_laplac_internal_coeffs, dataBase_.d_laplac_boundary_coeffs);
 }
 
 void dfUEqn::checkValue(bool print)
diff --git a/src_gpu/dfYEqn.H b/src_gpu/dfYEqn.H
index e477d0d2e..260baf4f2 100644
--- a/src_gpu/dfYEqn.H
+++ b/src_gpu/dfYEqn.H
@@ -35,11 +35,11 @@ public:
 
     void checkValue(bool print, char *filename);
 
-    void correctVelocity(std::vector<double *> Y_new, std::vector<double *> boundary_Y_init, std::vector<const double *> rhoD_GPU);
+    void correctVelocity(std::vector<double *> Y_old, std::vector<double *> boundary_Y_init, std::vector<const double *> rhoD_GPU);
 
     void upwindWeight();
 
-    void fvm_ddt(std::vector<double *> Y_old);
+    void fvm_ddt();
 
     void fvm_div_phi();
 
diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu
index b507763a0..9ef75747f 100644
--- a/src_gpu/dfYEqn.cu
+++ b/src_gpu/dfYEqn.cu
@@ -508,7 +508,7 @@ void dfYEqn::upwindWeight()
     getUpwindWeight<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_faces, dataBase_.d_phi, dataBase_.d_weight_upwind);
 }
 
-void dfYEqn::correctVelocity(std::vector<double *> Y_new, std::vector<double *> boundary_Y_init, std::vector<const double *> rhoD_GPU)
+void dfYEqn::correctVelocity(std::vector<double *> Y_old, std::vector<double *> boundary_Y_init, std::vector<const double *> rhoD_GPU)
 {
     // initialize variables in each time step
     checkCudaErrors(cudaMemsetAsync(d_sumYDiffError, 0, 3 * cell_bytes, stream));
@@ -518,7 +518,7 @@ void dfYEqn::correctVelocity(std::vector<double *> Y_new, std::vector<double *>
     for (size_t i = 0; i < num_species; ++i)
     {
         checkCudaErrors(cudaMemsetAsync(d_sumYDiffError_tmp, 0, 3 * cell_bytes, stream));
-        checkCudaErrors(cudaMemcpyAsync(dataBase_.d_Y_new_vector[i], Y_new[i], cell_bytes, cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(dataBase_.d_Y_old_vector[i], Y_old[i], cell_bytes, cudaMemcpyHostToDevice, stream));
         checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_Y_init_vector[i], boundary_Y_init[i], dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream));
         checkCudaErrors(cudaMemcpyAsync(dataBase_.d_rhoD_vector[i], rhoD_GPU[i], cell_bytes, cudaMemcpyHostToDevice, stream));
 
@@ -527,7 +527,7 @@ void dfYEqn::correctVelocity(std::vector<double *> Y_new, std::vector<double *>
         blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
         fvc_grad_internal_face_Y<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
                                                                                     d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, dataBase_.d_volume,
-                                                                                    dataBase_.d_face_vector, dataBase_.d_weight, dataBase_.d_Y_new_vector[i],
+                                                                                    dataBase_.d_face_vector, dataBase_.d_weight, dataBase_.d_Y_old_vector[i],
                                                                                     dataBase_.d_rhoD_vector[i], d_sumYDiffError_tmp, d_sumYDiffError_tmp);
         blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
         fvc_grad_boundary_face_Y<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_cells, dataBase_.d_boundary_cell_offset,
@@ -554,7 +554,7 @@ void dfYEqn::correctVelocity(std::vector<double *> Y_new, std::vector<double *>
                                                                                 dataBase_.d_boundary_face_vector, d_sumYDiffError_boundary, d_phiUc_boundary);
 }
 
-void dfYEqn::fvm_ddt(std::vector<double *> Y_old)
+void dfYEqn::fvm_ddt()
 {
     // initialize variables in each time step
     checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, (num_cells + num_faces) * (num_species - 1) * sizeof(double), stream)); // consider inert species
@@ -567,7 +567,6 @@ void dfYEqn::fvm_ddt(std::vector<double *> Y_old)
     {
         if (i == inertIndex)
             continue;
-        checkCudaErrors(cudaMemcpyAsync(dataBase_.d_Y_old_vector[i], Y_old[i], cell_bytes, cudaMemcpyHostToDevice, stream));
 
         // launch cuda kernel
         threads_per_block = 1024;
@@ -757,8 +756,7 @@ void dfYEqn::solve()
 void dfYEqn::updatePsi(double *Psi, int speciesIndex)
 {
     checkCudaErrors(cudaStreamSynchronize(stream));
-    for (size_t i = 0; i < num_cells; i++)
-        Psi[i] = h_psi[i + speciesIndex * num_cells];
+    memcpy(Psi, h_psi + speciesIndex * num_cells, cell_bytes);
 }
 
 dfYEqn::~dfYEqn()