From a3e1a1d81bbd35d6d2f5ecd3eb5ec150dfcc6416 Mon Sep 17 00:00:00 2001
From: maorz1998 <maorz1998@stu.pku.edu.cn>
Date: Fri, 2 Jun 2023 11:52:09 +0800
Subject: [PATCH 1/2] preliminary GPU version of YEqn

---
 applications/solvers/dfLowMachFoam/YEqn.H     | 124 ++-
 .../solvers/dfLowMachFoam/createdfSolver.H    |   5 +-
 .../solvers/dfLowMachFoam/dfLowMachFoam.C     |   2 +
 src_gpu/CMakeLists.txt                        |   9 +-
 src_gpu/dfMatrixDataBase.H                    |  52 +-
 src_gpu/dfYEqn.H                              |  54 ++
 src_gpu/dfYEqn.cu                             | 831 ++++++++++++++++++
 7 files changed, 1052 insertions(+), 25 deletions(-)
 create mode 100644 src_gpu/dfYEqn.H
 create mode 100644 src_gpu/dfYEqn.cu
diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H
index 94550fcdc..e5cbd15d7 100644
--- a/applications/solvers/dfLowMachFoam/YEqn.H
+++ b/applications/solvers/dfLowMachFoam/YEqn.H
@@ -15,12 +15,49 @@ tmp<fv::convectionScheme<scalar>> mvConvection
     )
 );
 
+upwind<scalar> mvConvectionInterpolation(mesh, phi);
+tmp<surfaceScalarField> t_upwind_weight = mvConvectionInterpolation.weights();
+surfaceScalarField upwind_weight = t_upwind_weight.ref();
+
+volVectorField sumYGrad
+(
+    IOobject
+    (
+        "sumYGrad",
+        runTime.timeName(),
+        mesh,
+        IOobject::NO_READ,
+        IOobject::NO_WRITE
+    ),
+    mesh,
+    dimensionedVector(dimensionSet(0,-1,0,0,0,0,0), Zero)
+);
+
 forAll(Y, i)
 {
     sumYDiffError += chemistry->rhoD(i)*fvc::grad(Y[i]);
 }
 const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf();
 
+// for (size_t i = 0; i < num_cells; i++)
+// {
+//     printf("sumYDiffError[%d] = (%e, %e, %e)\n", i, sumYDiffError[i][0], sumYDiffError[i][1], 
+//     sumYDiffError[i][2]);
+//     // printf("fvc::grad[%d] = (%e, %e, %e)\n", i, test_grad[i][0], test_grad[i][1], 
+//     // test_grad[i][2]);
+// }
+
+const surfaceVectorField test_interpolate = linearInterpolate(sumYDiffError).ref();
+
+// for (size_t i = 0; i < num_surfaces; i++)
+// {
+//     // printf("test_interpolate[%d] = (%e, %e, %e)\n", i, test_interpolate[i][0], test_interpolate[i][1], 
+//     // test_interpolate[i][2]);
+//     // printf("surface vector[%d] = (%e, %e, %e)\n", i, mesh.Sf()[i][0], mesh.Sf()[i][1], 
+//     // mesh.Sf()[i][2]);
+//     printf("phiUcOf[%d] = %e\n", i, phiUc[i]);
+// }
+
 #ifdef GPUSolver_
     start1 = std::clock();
     // // std::thread t(&dfMatrix::solve, &UEqn_GPU);
@@ -30,6 +67,54 @@ const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf();
     time_monitor_UEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 #endif
 
+std::vector<double*> Y_new(Y.size()), Y_old(Y.size()), boundary_Y_init(Y.size()), 
+    boundary_rhoD_init(Y.size());
+std::vector<const double*> rhoD_GPU(Y.size());
+for (size_t i = 0; i < Y.size(); ++i)
+{
+    volScalarField& Yi = Y[i];
+    const volScalarField& rhoDi = chemistry->rhoD(i);
+    Y_new[i] = &Yi[0];
+    Y_old[i] = &Yi.oldTime()[0];
+    rhoD_GPU[i] = &chemistry->rhoD(i)[0];
+    cudaMallocHost(&boundary_Y_init[i], num_boundary_faces*sizeof(double));
+    cudaMallocHost(&boundary_rhoD_init[i], num_boundary_faces*sizeof(double));
+    int offset = 0;
+    forAll(Yi.boundaryField(), patchi)
+    {
+        const scalarField& patchYi = Yi.boundaryField()[patchi];
+        const scalarField& patchRhoDi = rhoDi.boundaryField()[patchi];
+        int patchSize = patchYi.size();
+
+        memcpy(boundary_Y_init[i]+offset, &patchYi[0], patchSize*sizeof(double));
+        memcpy(boundary_rhoD_init[i]+offset, &patchRhoDi[0], patchSize*sizeof(double));
+        offset += patchSize;
+    }
+}
+
+volScalarField mut_sct = turbulence->mut().ref()/Sct;
+std::vector<double> boundary_sumYDiffError, boundary_phiUc, boundary_mutsct;
+forAll(p.boundaryField(), patchi)
+{
+    const vectorField& patchErri = sumYDiffError.boundaryField()[patchi];
+    const scalarField& patchPhiUc = phiUc.boundaryField()[patchi];
+    const scalarField& patchMut_sct = mut_sct.boundaryField()[patchi];
+
+    int patchSize = patchErri.size();
+    boundary_sumYDiffError.insert(boundary_sumYDiffError.end(), &patchErri[0][0], &patchErri[0][0]+3*patchSize);
+    boundary_phiUc.insert(boundary_phiUc.end(), &patchPhiUc[0], &patchPhiUc[0] + patchSize);
+    boundary_mutsct.insert(boundary_mutsct.end(), &patchMut_sct[0], &patchMut_sct[0] + patchSize);
+}
+
+YEqn_GPU.upwindWeight(&upwind_weight[0]);
+YEqn_GPU.correctVelocity(Y_new, boundary_Y_init, rhoD_GPU, &phiUc[0], boundary_sumYDiffError.data(), boundary_phiUc.data());
+YEqn_GPU.fvm_ddt(Y_old);
+YEqn_GPU.fvm_div_phi();
+YEqn_GPU.fvm_div_phiUc();
+YEqn_GPU.fvm_laplacian(&mut_sct[0], boundary_mutsct.data(), boundary_rhoD_init);
+
+YEqn_GPU.solve();
+
 //MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM);
 label flag_mpi_init;
 MPI_Initialized(&flag_mpi_init);
@@ -53,6 +138,7 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC);
     volScalarField Yt(0.0*Y[0]);
 
     start = std::clock();
+    int speciesIndex = 0;
     forAll(Y, i)
     {
         volScalarField& Yi = Y[i];
@@ -61,26 +147,30 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC);
 
         if (i != inertIndex)
         {
-            tmp<volScalarField> DEff = chemistry->rhoD(i) + turbulence->mut()/Sct;
-            fvScalarMatrix YiEqn
-            (
-                fvm::ddt(rho, Yi)
-              + mvConvection->fvmDiv(phi, Yi)
-              + mvConvection->fvmDiv(phiUc, Yi)
-             ==
-                (
-                    splitting
-                ?   fvm::laplacian(DEff(), Yi)
-                :  (fvm::laplacian(DEff(), Yi) + combustion->R(Yi))
-                )
-            );
-
-            YiEqn.relax();
-
-            YiEqn.solve("Yi");
+            // tmp<volScalarField> DEff = chemistry->rhoD(i) + turbulence->mut()/Sct;
+            // fvScalarMatrix YiEqn
+            // (
+            //     fvm::ddt(rho, Yi)
+            // + mvConvection->fvmDiv(phi, Yi)
+            // + mvConvection->fvmDiv(phiUc, Yi)
+            // ==
+            //     (
+            //         splitting
+            //     ?   fvm::laplacian(DEff(), Yi)
+            //     :  (fvm::laplacian(DEff(), Yi) + combustion->R(Yi))
+            //     )
+            // );
+
+            // YiEqn.relax();
+
+            // YiEqn.solve("Yi");
+
+            YEqn_GPU.updatePsi(&Yi[0], speciesIndex);
+            Yi.correctBoundaryConditions();
 
             Yi.max(0.0);
             Yt += Yi;
+            ++speciesIndex;
         }
     }
 
diff --git a/applications/solvers/dfLowMachFoam/createdfSolver.H b/applications/solvers/dfLowMachFoam/createdfSolver.H
index ed87aceb5..b60307463 100644
--- a/applications/solvers/dfLowMachFoam/createdfSolver.H
+++ b/applications/solvers/dfLowMachFoam/createdfSolver.H
@@ -28,11 +28,12 @@ int num_boundary_cells;
 string settingPath;
 settingPath = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string(""));
 
-dfMatrixDataBase dfDataBase(num_surfaces, num_cells, num_boundary_faces, num_boundary_cells, &neighbour[0], &owner[0], &mesh.V()[0], &mesh.surfaceInterpolation::weights()[0], 
+dfMatrixDataBase dfDataBase(num_surfaces, num_cells, num_boundary_faces, Y.size(), num_boundary_cells, &neighbour[0], &owner[0], &mesh.V()[0], &mesh.surfaceInterpolation::weights()[0], 
 &mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.nonOrthDeltaCoeffs()[0], boundary_face_vector_init, boundary_face_init, boundary_deltaCoeffs_init, boundaryCellIndex);
 
-dfUEqn UEqn_GPU(dfDataBase, "dDDI", settingPath);
 dfRhoEqn rhoEqn_GPU(dfDataBase);
+dfUEqn UEqn_GPU(dfDataBase, "dDDI", settingPath);
+dfYEqn YEqn_GPU(dfDataBase, "dDDI", settingPath, inertIndex);
 
 double *ueqn_internalCoeffs_init, *ueqn_boundaryCoeffs_init, *boundary_pressure_init, *boundary_velocity_init,
     *boundary_nuEff_init, *boundary_rho_init, *ueqn_laplac_internalCoeffs_init, *ueqn_laplac_boundaryCoeffs_init, *boundary_phi_init;
diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
index 8012df850..d875b607d 100644
--- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
+++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
@@ -60,9 +60,11 @@ Description
 #include "CombustionModel.H"
 
 #include "dfUEqn.H"
+#include "dfYEqn.H"
 #include "dfRhoEqn.H"
 #include <cuda_runtime.h>
 #include <thread>
+#include "upwind.H"
 
 #define GPUSolver_
 
diff --git a/src_gpu/CMakeLists.txt b/src_gpu/CMakeLists.txt
index 2c17c56ed..c749eaa42 100644
--- a/src_gpu/CMakeLists.txt
+++ b/src_gpu/CMakeLists.txt
@@ -18,14 +18,19 @@ include_directories(
     $ENV{AMGX_DIR}/include
 )
 
-add_library(${PROJECT_NAME} SHARED dfUEqn.cu dfRhoEqn.cu AmgXSolver.cu)
+add_library(${PROJECT_NAME} 
+    SHARED 
+        dfUEqn.cu 
+        dfRhoEqn.cu 
+        dfYEqn.cu
+        AmgXSolver.cu)
 
 target_link_libraries(${PROJECT_NAME}
     ${MPI_LIBRARIES}
     ${CUDA_LIBRARIES}
     ${LIBAMGXSH}
 )
-
+target_compile_options(dfMatrix PUBLIC -g)
 option(DFMATRIX_ENABLE_DETAILED_DEBUG "Enable detailed debug build" OFF)
 if (DFMATRIX_ENABLE_DETAILED_DEBUG)
     target_compile_definitions(${PROJECT_NAME} PRIVATE DEBUG)
diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H
index 971fdc6bb..2fa79b13f 100644
--- a/src_gpu/dfMatrixDataBase.H
+++ b/src_gpu/dfMatrixDataBase.H
@@ -34,7 +34,8 @@ inline void checkVectorEqual(int count, const double* basevec, double* vec, doub
     {
         double abs_diff = fabs(basevec[i] - vec[i]);
         double rel_diff = fabs(basevec[i] - vec[i]) / fabs(basevec[i]);
-        if (abs_diff > 1e-12 && rel_diff > max_relative_error && !std::isinf(rel_diff))
+        // if (abs_diff > 1e-12 && rel_diff > max_relative_error && !std::isinf(rel_diff))
+        if (abs_diff > 1e-15 && rel_diff > max_relative_error)
             fprintf(stderr, "mismatch index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff);
     }
 }
@@ -53,6 +54,8 @@ struct dfMatrixDataBase
     // - number of boundary faces
     int num_boundary_faces;
 
+    int num_species;
+
     // - mesh variables
     // - csr_row_index
     int *h_A_csr_row_index=nullptr, *d_A_csr_row_index=nullptr;
@@ -84,14 +87,19 @@ struct dfMatrixDataBase
     // - the device pointer to rho_new, rho_old, velocity_old, pressure and volume list
     double *d_rho_new = nullptr, *d_rho_old = nullptr, *d_velocity_old = nullptr, 
     *d_pressure = nullptr, *d_volume = nullptr;
+    // - the device pointer to Y_new and Y_old
+    std::vector<double*> d_Y_new_vector;
+    std::vector<double*> d_Y_old_vector;
     // - the device pointer to the pre-permutated and post-permutated interpolation weight list
     double *d_weight_init = nullptr, *d_weight = nullptr;
+    double *d_weight_upwind = nullptr;
     // - the device pointer to the pre-permutated and post-permutated flux (phi) list
     double *d_phi_init = nullptr, *d_phi = nullptr;
     // - the device pointer to the pre-permutated and post-permutated cell face vector list
     double *d_face_vector_init = nullptr, *d_face_vector = nullptr;
     double *d_face_init = nullptr, *d_face = nullptr;
     double *d_deltaCoeffs_init = nullptr, *d_deltaCoeffs = nullptr;
+    std::vector<double*> d_rhoD_vector;
 
     double rdelta_t = 1/1e-6;
 
@@ -114,6 +122,14 @@ struct dfMatrixDataBase
     *d_boundary_velocity = nullptr, *d_boundary_velocity_init = nullptr,
     *d_boundary_nuEff = nullptr, *d_boundary_nuEff_init = nullptr,
     *d_boundary_rho = nullptr, *d_boundary_rho_init = nullptr;
+    std::vector<double*> d_boundary_Y_vector;
+    std::vector<double*> d_boundary_Y_init_vector;
+    std::vector<double*> d_internal_coeffs_Y_vector;
+    std::vector<double*> d_boundary_coeffs_Y_vector;
+    std::vector<double*> d_laplac_internal_coeffs_Y_vector;
+    std::vector<double*> d_laplac_boundary_coeffs_Y_vector;
+    std::vector<double*> d_boundary_rhoD_vector;
+    double *d_boundary_mut_sct = nullptr;
 
     std::vector<int> boundPermutationList;
     std::vector<double> ueqn_internalCoeffs, ueqn_boundaryCoeffs;
@@ -180,11 +196,11 @@ struct dfMatrixDataBase
 
     // constructor
     dfMatrixDataBase();
-    dfMatrixDataBase(int num_surfaces, int num_cells, int num_boundary_faces, int & num_boundary_cells_output,
+    dfMatrixDataBase(int num_surfaces, int num_cells, int num_boundary_faces, int num_species, int & num_boundary_cells_output,
         const int *neighbour, const int *owner, const double* volume, const double* weight, const double* face_vector, const double* face, 
         const double* deltaCoeffs, std::vector<double> boundary_face_vector_init, std::vector<double> boundary_face_init, 
         std::vector<double> boundary_deltaCoeffs_init, std::vector<int> boundary_cell_id_init)
-    : num_cells(num_cells), num_faces(num_surfaces*2), num_surfaces(num_surfaces), num_iteration(0),
+    : num_cells(num_cells), num_faces(num_surfaces*2), num_surfaces(num_surfaces), num_species(num_species), num_iteration(0),
       num_boundary_faces(num_boundary_faces), h_volume(volume)
     {
         // allocate field pointer in pin memory
@@ -457,19 +473,36 @@ struct dfMatrixDataBase
         checkCudaErrors(cudaMalloc((void**)&d_A_csr_diag_index, cell_index_bytes));
         total_bytes += (csr_row_index_bytes + csr_col_index_bytes + cell_index_bytes);
 
+        d_Y_new_vector.resize(num_species);
+        d_Y_old_vector.resize(num_species);
+        d_rhoD_vector.resize(num_species);
+        d_boundary_Y_vector.resize(num_species);
+        d_boundary_Y_init_vector.resize(num_species);
+        d_internal_coeffs_Y_vector.resize(num_species);
+        d_boundary_coeffs_Y_vector.resize(num_species);
+        d_laplac_internal_coeffs_Y_vector.resize(num_species);
+        d_laplac_boundary_coeffs_Y_vector.resize(num_species);
+        d_boundary_rhoD_vector.resize(num_species);
+
+        for (size_t i = 0; i < num_species; ++i){
+            checkCudaErrors(cudaMalloc((void**)&d_Y_new_vector[i], cell_bytes));
+            checkCudaErrors(cudaMalloc((void**)&d_Y_old_vector[i], cell_bytes));
+            checkCudaErrors(cudaMalloc((void**)&d_rhoD_vector[i], cell_bytes));
+        }
         checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_bytes));
         checkCudaErrors(cudaMalloc((void**)&d_rho_new, cell_bytes));
         checkCudaErrors(cudaMalloc((void**)&d_volume, cell_bytes));
         checkCudaErrors(cudaMalloc((void**)&d_pressure, cell_bytes));
         checkCudaErrors(cudaMalloc((void**)&d_velocity_old, cell_vec_bytes));
         checkCudaErrors(cudaMalloc((void**)&d_weight, face_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_weight_upwind, face_bytes));
         checkCudaErrors(cudaMalloc((void**)&d_face, face_bytes));
         checkCudaErrors(cudaMalloc((void**)&d_deltaCoeffs, face_bytes));
         checkCudaErrors(cudaMalloc((void**)&d_phi, face_bytes));
         checkCudaErrors(cudaMalloc((void**)&d_phi_init, face_bytes));
         checkCudaErrors(cudaMalloc((void**)&d_face_vector, face_vec_bytes));
         checkCudaErrors(cudaMalloc((void**)&d_nuEff, cell_bytes));
-        total_bytes += (cell_bytes * 5 + face_bytes * 5 + cell_vec_bytes + face_vec_bytes);
+        total_bytes += (cell_bytes * (5 + 2*num_species) + face_bytes * 6 + cell_vec_bytes + face_vec_bytes);
 
         checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int)));
         checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_id, boundary_face_index_bytes));
@@ -496,6 +529,17 @@ struct dfMatrixDataBase
         checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff_init, boundary_face_bytes));
         checkCudaErrors(cudaMalloc((void**)&d_boundary_rho, boundary_face_bytes));
         checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_init, boundary_face_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_boundary_mut_sct, boundary_face_bytes));
+        for (size_t i = 0; i < num_species; ++i){
+            checkCudaErrors(cudaMalloc((void**)&d_boundary_Y_vector[i], boundary_face_bytes));
+            checkCudaErrors(cudaMalloc((void**)&d_boundary_Y_init_vector[i], boundary_face_bytes));
+            checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_Y_vector[i], boundary_face_bytes));
+            checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_Y_vector[i], boundary_face_bytes));
+            checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_Y_vector[i], boundary_face_bytes));
+            checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_Y_vector[i], boundary_face_bytes));
+            checkCudaErrors(cudaMalloc((void**)&d_boundary_rhoD_vector[i], boundary_face_bytes));
+        }
+        
         total_bytes += (boundary_face_bytes*10 + boundary_face_vec_bytes * 11);
 
         // checkCudaErrors(cudaMalloc((void**)&d_A_csr, csr_value_vec_bytes));
diff --git a/src_gpu/dfYEqn.H b/src_gpu/dfYEqn.H
new file mode 100644
index 000000000..85f4493a6
--- /dev/null
+++ b/src_gpu/dfYEqn.H
@@ -0,0 +1,54 @@
+#pragma once
+
+#include "AmgXSolver.H"
+#include <amgx_c.h>
+#include "dfMatrixDataBase.H"
+
+class dfYEqn
+{
+private:
+    dfMatrixDataBase &dataBase_;
+    cudaStream_t stream;
+
+    std::vector<AmgXSolver *> YSolverSet;
+    int num_iteration = 0;
+    double time_monitor_CPU;
+    double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test;
+
+    // common variables
+    int num_cells, cell_bytes, num_faces, num_surfaces, num_boundary_cells, num_boundary_faces, num_species, inertIndex;
+    int *d_A_csr_row_index, *d_A_csr_diag_index, *d_A_csr_col_index;
+
+    // Matrix variables
+    double *d_A_csr, *d_b, *d_psi = nullptr;
+    double *h_A_csr, *h_b, *h_psi = nullptr;
+    double *d_sumYDiffError = nullptr, *d_sumYDiffError_tmp = nullptr;
+    double *d_sumYDiffError_boundary = nullptr;
+    double *d_phiUc_boundary = nullptr;
+    double *d_phiUc = nullptr;
+    double *d_mut_Sct = nullptr;
+
+public:
+    dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile, const int inertIndex);
+
+    ~dfYEqn();
+
+    void checkValue(bool print, char *filename);
+
+    void correctVelocity(std::vector<double *> Y_new, std::vector<double *> boundary_Y_init, std::vector<const double *> rhoD_GPU,
+                         const double *phiUcRef, double *boundary_sumYDiffError_ref, double *phiUcBouRef);
+
+    void upwindWeight(double *refWeight);
+
+    void fvm_ddt(std::vector<double *> Y_old);
+
+    void fvm_div_phi();
+
+    void fvm_div_phiUc();
+
+    void fvm_laplacian(double *mut_Sct, double *boundary_mut_Sct, std::vector<double*>boundary_rhoD);
+
+    void solve();
+
+    void updatePsi(double *Psi, int speciesIndex);
+};
diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu
new file mode 100644
index 000000000..d25efb862
--- /dev/null
+++ b/src_gpu/dfYEqn.cu
@@ -0,0 +1,831 @@
+#include "dfYEqn.H"
+
+// kernel functions
+__global__ void getUpwindWeight(int num_faces, double *phi, double *weight)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_faces)
+        return;
+    if (phi[index] >= 0)
+        weight[index] = 1.;
+    else
+        weight[index] = 0.;
+}
+
+__global__ void fvc_grad_internal_face_Y(int num_cells,
+                                         const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, const double *volume,
+                                         const double *face_vector, const double *weight, const double *species, const double *rhoD,
+                                         const double *sumYDiffError, double *sumYDiffError_output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    // A_csr has one more element in each row: itself
+    int row_index = csr_row_index[index];
+    int next_row_index = csr_row_index[index + 1];
+    int diag_index = csr_diag_index[index];
+    int neighbor_offset = csr_row_index[index] - index;
+
+    double own_cell_Y = species[index];
+    double grad_bx_upper = 0;
+    double grad_by_upper = 0;
+    double grad_bz_upper = 0;
+    double grad_bx_lower = 0;
+    double grad_by_lower = 0;
+    double grad_bz_lower = 0;
+    for (int i = row_index; i < next_row_index; i++)
+    {
+        int inner_index = i - row_index;
+        // lower
+        if (inner_index < diag_index)
+        {
+            int neighbor_index = neighbor_offset + inner_index;
+            double w = weight[neighbor_index];
+            double sfx = face_vector[neighbor_index * 3 + 0];
+            double sfy = face_vector[neighbor_index * 3 + 1];
+            double sfz = face_vector[neighbor_index * 3 + 2];
+            int neighbor_cell_id = csr_col_index[row_index + inner_index];
+            double neighbor_cell_Y = species[neighbor_cell_id];
+            double face_Y = w * (neighbor_cell_Y - own_cell_Y) + own_cell_Y;
+            grad_bx_lower -= face_Y * sfx;
+            grad_by_lower -= face_Y * sfy;
+            grad_bz_lower -= face_Y * sfz;
+        }
+        // upper
+        if (inner_index > diag_index)
+        {
+            int neighbor_index = neighbor_offset + inner_index - 1;
+            double w = weight[neighbor_index];
+            double sfx = face_vector[neighbor_index * 3 + 0];
+            double sfy = face_vector[neighbor_index * 3 + 1];
+            double sfz = face_vector[neighbor_index * 3 + 2];
+            int neighbor_cell_id = csr_col_index[row_index + inner_index];
+            double neighbor_cell_Y = species[neighbor_cell_id];
+            double face_Y = w * (own_cell_Y - neighbor_cell_Y) + neighbor_cell_Y;
+            grad_bx_upper += face_Y * sfx;
+            grad_by_upper += face_Y * sfy;
+            grad_bz_upper += face_Y * sfz;
+        }
+    }
+    double vol = volume[index];
+
+    sumYDiffError_output[index * 3 + 0] = sumYDiffError[index * 3 + 0] + (grad_bx_upper + grad_bx_lower);
+    sumYDiffError_output[index * 3 + 1] = sumYDiffError[index * 3 + 1] + (grad_by_upper + grad_by_lower);
+    sumYDiffError_output[index * 3 + 2] = sumYDiffError[index * 3 + 2] + (grad_bz_upper + grad_bz_lower);
+
+    // if (index == 0)
+    // {
+    //     printf("grad_bz_upper = %e\n", grad_bz_upper);
+    //     printf("grad_bz_lower = %e\n", grad_bz_lower);
+    //     printf("(grad_bz_upper + grad_bz_lower) = %e\n", (grad_bz_upper + grad_bz_lower));
+    // }
+}
+
+__global__ void fvc_grad_boundary_face_Y(int num_cells, int num_boundary_cells,
+                                         const int *boundary_cell_offset, const int *boundary_cell_id, const double *rhoD, const int *bouPermedIndex,
+                                         const double *boundary_face_vector, const double *boundary_species_init, double *boundary_species,
+                                         const double *volume, const double *sumYDiffError, double *sumYDiffError_output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_boundary_cells)
+        return;
+
+    int cell_offset = boundary_cell_offset[index];
+    int next_cell_offset = boundary_cell_offset[index + 1];
+    int cell_index = boundary_cell_id[cell_offset];
+
+    // compute boundary gradient
+    double grad_bx = 0;
+    double grad_by = 0;
+    double grad_bz = 0;
+    for (int i = cell_offset; i < next_cell_offset; i++)
+    {
+        double sfx = boundary_face_vector[i * 3 + 0];
+        double sfy = boundary_face_vector[i * 3 + 1];
+        double sfz = boundary_face_vector[i * 3 + 2];
+        int permute_index = bouPermedIndex[i];
+        double face_Y = boundary_species_init[permute_index];
+        boundary_species[i] = face_Y;
+        grad_bx += face_Y * sfx;
+        grad_by += face_Y * sfy;
+        grad_bz += face_Y * sfz;
+        // if (index == 0)
+        // {
+        //     printf("face_Y = %e\n", face_Y);
+        //     printf("sfz = %e\n", sfz);
+        //     printf("face_Y * sfz = %e\n", face_Y * sfz);
+        // }
+    }
+
+    //// correct the boundary gradient
+    // double nx = boundary_face_vector[face_index * 3 + 0] / magSf[face_index];
+    // double ny = boundary_face_vector[face_index * 3 + 1] / magSf[face_index];
+    // double nz = boundary_face_vector[face_index * 3 + 2] / magSf[face_index];
+    // double sn_grad = 0;
+    // double grad_correction = sn_grad * volume[cell_index] - (nx * grad_bx + ny * grad_by + nz * grad_bz);
+    // grad_bx += nx * grad_correction;
+    // grad_by += ny * grad_correction;
+    // grad_bz += nz * grad_correction;
+
+    sumYDiffError_output[cell_index * 3 + 0] = sumYDiffError[cell_index * 3 + 0] + grad_bx;
+    sumYDiffError_output[cell_index * 3 + 1] = sumYDiffError[cell_index * 3 + 1] + grad_by;
+    sumYDiffError_output[cell_index * 3 + 2] = sumYDiffError[cell_index * 3 + 2] + grad_bz;
+}
+
+__global__ void sumError(int num_cells, const double *volume, const double *rhoD,
+                         const double *sumYDiffErrorTmp, double *sumYDiffError_output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    sumYDiffError_output[index * 3 + 0] = sumYDiffError_output[index * 3 + 0] + sumYDiffErrorTmp[index * 3 + 0] * rhoD[index];
+    sumYDiffError_output[index * 3 + 1] = sumYDiffError_output[index * 3 + 1] + sumYDiffErrorTmp[index * 3 + 1] * rhoD[index];
+    sumYDiffError_output[index * 3 + 2] = sumYDiffError_output[index * 3 + 2] + sumYDiffErrorTmp[index * 3 + 2] * rhoD[index];
+}
+
+__global__ void divide_vol(int num_cells, const double *volume,
+                           const double *sumYDiffError, double *sumYDiffError_output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+    double vol = volume[index];
+
+    sumYDiffError_output[index * 3 + 0] = sumYDiffError[index * 3 + 0] / vol;
+    sumYDiffError_output[index * 3 + 1] = sumYDiffError[index * 3 + 1] / vol;
+    sumYDiffError_output[index * 3 + 2] = sumYDiffError[index * 3 + 2] / vol;
+}
+
+__global__ void correct_boundary_conditions_vec(int num_boundary_cells,
+                                                const int *boundary_cell_offset, const int *boundary_cell_id,
+                                                const double *boundary_sf, const double *mag_sf,
+                                                double *boundary_sumYDiffError, double *sumYDiffError)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_boundary_cells)
+        return;
+
+    int cell_offset = boundary_cell_offset[index];
+    int next_cell_offset = boundary_cell_offset[index + 1];
+    int cell_index = boundary_cell_id[cell_offset];
+
+    // initialize boundary_sumYDiffError
+    double sumYDiffError_x = sumYDiffError[cell_index * 3 + 0];
+    double sumYDiffError_y = sumYDiffError[cell_index * 3 + 1];
+    double sumYDiffError_z = sumYDiffError[cell_index * 3 + 2];
+
+    for (int i = cell_offset; i < next_cell_offset; i++)
+    {
+        double n_x = boundary_sf[i * 3 + 0] / mag_sf[i];
+        double n_y = boundary_sf[i * 3 + 1] / mag_sf[i];
+        double n_z = boundary_sf[i * 3 + 2] / mag_sf[i];
+        double sn_grad = 0;
+        double grad_correction = sn_grad - (n_x * sumYDiffError_x + n_y * sumYDiffError_y + n_z * sumYDiffError_z);
+        boundary_sumYDiffError[i * 3 + 0] = sumYDiffError_x + grad_correction * n_x;
+        boundary_sumYDiffError[i * 3 + 1] = sumYDiffError_y + grad_correction * n_y;
+        boundary_sumYDiffError[i * 3 + 2] = sumYDiffError_z + grad_correction * n_z;
+    }
+}
+
+__global__ void calculate_phiUc(int num_cells, const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
+                                const double *face_vector, const double *weight, const double *sumYDiffError, double *phiUc)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    // A_csr has one more element in each row: itself
+    int row_index = csr_row_index[index];
+    int row_elements = csr_row_index[index + 1] - row_index;
+    int diag_index = csr_diag_index[index];
+    int neighbor_offset = csr_row_index[index] - index;
+
+    double own_cell_sumYDiffError_x = sumYDiffError[index * 3 + 0];
+    double own_cell_sumYDiffError_y = sumYDiffError[index * 3 + 1];
+    double own_cell_sumYDiffError_z = sumYDiffError[index * 3 + 2];
+
+    // lower
+    for (int i = 0; i < diag_index; i++)
+    {
+        double phiUc_face = 0;
+
+        int neighbor_index = neighbor_offset + i;
+        int neighbor_cell_id = csr_col_index[row_index + i];
+        double w = weight[neighbor_index];
+        double sfx = face_vector[neighbor_index * 3 + 0];
+        double sfy = face_vector[neighbor_index * 3 + 1];
+        double sfz = face_vector[neighbor_index * 3 + 2];
+        double neighbor_cell_sumYDiffError_x = sumYDiffError[neighbor_cell_id * 3 + 0];
+        double neighbor_cell_sumYDiffError_y = sumYDiffError[neighbor_cell_id * 3 + 1];
+        double neighbor_cell_sumYDiffError_z = sumYDiffError[neighbor_cell_id * 3 + 2];
+        double face_x = w * (neighbor_cell_sumYDiffError_x - own_cell_sumYDiffError_x) + own_cell_sumYDiffError_x;
+        double face_y = w * (neighbor_cell_sumYDiffError_y - own_cell_sumYDiffError_y) + own_cell_sumYDiffError_y;
+        double face_z = w * (neighbor_cell_sumYDiffError_z - own_cell_sumYDiffError_z) + own_cell_sumYDiffError_z;
+
+        phiUc_face = face_x * sfx + face_y * sfy + face_z * sfz;
+        phiUc[neighbor_index] = phiUc_face;
+    }
+    // upper
+    for (int i = diag_index + 1; i < row_elements; i++)
+    {
+        double phiUc_face = 0;
+
+        int neighbor_index = neighbor_offset + i - 1;
+        int neighbor_cell_id = csr_col_index[row_index + i];
+        double w = weight[neighbor_index];
+        double sfx = face_vector[neighbor_index * 3 + 0];
+        double sfy = face_vector[neighbor_index * 3 + 1];
+        double sfz = face_vector[neighbor_index * 3 + 2];
+        double neighbor_cell_sumYDiffError_x = sumYDiffError[neighbor_cell_id * 3 + 0];
+        double neighbor_cell_sumYDiffError_y = sumYDiffError[neighbor_cell_id * 3 + 1];
+        double neighbor_cell_sumYDiffError_z = sumYDiffError[neighbor_cell_id * 3 + 2];
+        double face_x = w * (own_cell_sumYDiffError_x - neighbor_cell_sumYDiffError_x) + neighbor_cell_sumYDiffError_x;
+        double face_y = w * (own_cell_sumYDiffError_y - neighbor_cell_sumYDiffError_y) + neighbor_cell_sumYDiffError_y;
+        double face_z = w * (own_cell_sumYDiffError_z - neighbor_cell_sumYDiffError_z) + neighbor_cell_sumYDiffError_z;
+
+        phiUc_face = face_x * sfx + face_y * sfy + face_z * sfz;
+        phiUc[neighbor_index] = phiUc_face;
+    }
+}
+
+__global__ void calculate_phiUc_boundary(int num_boundary_faces,
+                                         const int *boundary_cell_offset, const int *boundary_cell_id,
+                                         const double *boundary_sf, const double *boundary_sumYDiffError,
+                                         double *boundary_phiUc)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_boundary_faces)
+        return;
+
+    double n_x = boundary_sf[index * 3 + 0];
+    double n_y = boundary_sf[index * 3 + 1];
+    double n_z = boundary_sf[index * 3 + 2];
+
+    double err_x = boundary_sumYDiffError[index * 3 + 0];
+    double err_y = boundary_sumYDiffError[index * 3 + 1];
+    double err_z = boundary_sumYDiffError[index * 3 + 2];
+
+    boundary_phiUc[index] = n_x * err_x + n_y * err_y + n_z * err_z;
+}
+
+__global__ void fvm_ddt_kernel_scalar(int num_cells, int num_faces, const double rdelta_t,
+                                      const int *csr_row_index, const int *csr_diag_index,
+                                      const double *rho_old, const double *rho_new, const double *volume, const double *species_old,
+                                      const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output, double *psi)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    // A_csr has one more element in each row: itself
+    int row_index = csr_row_index[index];
+    int diag_index = csr_diag_index[index];
+    int csr_index = row_index + diag_index;
+
+    double ddt_diag = rdelta_t * rho_new[index] * volume[index];
+    A_csr_output[csr_index] = A_csr_input[csr_index] + ddt_diag;
+
+    double ddt_part_term = rdelta_t * rho_old[index] * volume[index];
+    b_output[index] = b_input[index] + ddt_part_term * species_old[index];
+
+    psi[index] = species_old[index];
+}
+
+__global__ void fvm_div_internal_scalar(int num_cells, int num_faces,
+                                        const int *csr_row_index, const int *csr_diag_index,
+                                        const double *div_weight, const double *phi,
+                                        const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    // A_csr has one more element in each row: itself
+    int row_index = csr_row_index[index];
+    int next_row_index = csr_row_index[index + 1];
+    int diag_index = csr_diag_index[index];
+    int neighbor_offset = csr_row_index[index] - index;
+
+    double div_diag = 0;
+    for (int i = row_index; i < next_row_index; i++)
+    {
+        int inner_index = i - row_index;
+        // lower
+        if (inner_index < diag_index)
+        {
+            int neighbor_index = neighbor_offset + inner_index;
+            double w = div_weight[neighbor_index];
+            double f = phi[neighbor_index];
+            A_csr_output[i] = A_csr_input[i] + (-w) * f;
+            // lower neighbors contribute to sum of -1
+            div_diag += (w - 1) * f;
+        }
+        // upper
+        if (inner_index > diag_index)
+        {
+            // upper, index - 1, consider of diag
+            int neighbor_index = neighbor_offset + inner_index - 1;
+            double w = div_weight[neighbor_index];
+            double f = phi[neighbor_index];
+            A_csr_output[i] = A_csr_input[i] + (1 - w) * f;
+            // upper neighbors contribute to sum of 1
+            div_diag += w * f;
+        }
+    }
+    A_csr_output[row_index + diag_index] = A_csr_input[row_index + diag_index] + div_diag; // diag
+}
+
+__global__ void fvm_div_boundary_scalar(int num_cells, int num_faces, int num_boundary_cells,
+                                        const int *csr_row_index, const int *csr_diag_index, const double *boundary_phi,
+                                        const int *boundary_cell_offset, const int *boundary_cell_id,
+                                        double *internal_coeffs, const double *boundary_coeffs,
+                                        const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_boundary_cells)
+        return;
+
+    int cell_offset = boundary_cell_offset[index];
+    int cell_index = boundary_cell_id[cell_offset];
+    int loop_size = boundary_cell_offset[index + 1] - cell_offset;
+
+    int row_index = csr_row_index[cell_index];
+    int diag_index = csr_diag_index[cell_index];
+    int csr_dim = num_cells + num_faces;
+    int csr_index = row_index + diag_index;
+
+    // construct internalCoeffs & boundaryCoeffs
+    double internal_coeffs_own = 0;
+    double boundary_coeffs_own = 0;
+    for (int i = 0; i < loop_size; i++)
+    {
+        // zeroGradient
+        internal_coeffs[cell_offset + i] = 1.;
+        internal_coeffs_own += boundary_phi[cell_offset + i] * internal_coeffs[cell_offset + i];
+        boundary_coeffs_own += -boundary_phi[cell_offset + i] * boundary_coeffs[cell_offset + i];
+    }
+    A_csr_output[csr_index] = A_csr_input[csr_index] + internal_coeffs_own;
+    b_output[cell_index] = b_input[cell_index] + boundary_coeffs_own;
+}
+
+__global__ void fvm_laplacian_uncorrected_scalar_internal(int num_cells,
+                                                          const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
+                                                          const double *scalar0, const double *scalar1, const double *weight,
+                                                          const double *magsf, const double *distance,
+                                                          const double sign, const double *A_csr_input, double *A_csr_output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    // A_csr has one more element in each row: itself
+    int row_index = csr_row_index[index];
+    int row_elements = csr_row_index[index + 1] - row_index;
+    int diag_index = csr_diag_index[index];
+    int neighbor_offset = csr_row_index[index] - index;
+
+    double own_coeff = scalar0[index] + scalar1[index];
+    double sum_diag = 0;
+    // lower
+    for (int i = 0; i < diag_index; i++)
+    {
+        int neighbor_index = neighbor_offset + i;
+        int neighbor_cell_id = csr_col_index[i + row_index];
+        double w = weight[neighbor_index];
+        double nei_coeff = scalar0[neighbor_cell_id] + scalar1[neighbor_cell_id];
+        double gamma = w * (nei_coeff - own_coeff) + own_coeff;
+        double gamma_magsf = gamma * magsf[neighbor_index];
+        double coeff = gamma_magsf * distance[neighbor_index];
+        A_csr_output[row_index + i] = A_csr_input[row_index + i] + coeff * sign;
+
+        sum_diag += (-coeff);
+    }
+    // upper
+    for (int i = diag_index + 1; i < row_elements; i++)
+    {
+        int neighbor_index = neighbor_offset + i - 1;
+        int neighbor_cell_id = csr_col_index[i + row_index];
+        double w = weight[neighbor_index];
+        double nei_coeff = scalar0[neighbor_cell_id] + scalar1[neighbor_cell_id];
+        double gamma = w * (own_coeff - nei_coeff) + nei_coeff;
+        double gamma_magsf = gamma * magsf[neighbor_index];
+        double coeff = gamma_magsf * distance[neighbor_index];
+        A_csr_output[row_index + i] = A_csr_input[row_index + i] + coeff * sign;
+
+        sum_diag += (-coeff);
+    }
+    // diag
+    A_csr_output[row_index + diag_index] = A_csr_input[row_index + diag_index] + sum_diag * sign;
+}
+
+__global__ void fvm_laplacian_uncorrected_scalar_boundary(int num_cells, int num_boundary_cells,
+                                                          const int *csr_row_index, const int *csr_diag_index, const int *boundary_cell_offset,
+                                                          const int *boundary_cell_id, const double *boundary_scalar0, const double *boundary_scalar1,
+                                                          const double *boundary_magsf, const int *bouPermedIndex,
+                                                          const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs,
+                                                          const double sign, const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_boundary_cells)
+        return;
+
+    int cell_offset = boundary_cell_offset[index];
+    int next_cell_offset = boundary_cell_offset[index + 1];
+    int cell_index = boundary_cell_id[cell_offset];
+
+    int row_index = csr_row_index[cell_index];
+    int diag_index = csr_diag_index[cell_index];
+    int csr_index = row_index + diag_index;
+
+    double internal_coeffs = 0;
+    double boundary_coeffs = 0;
+    for (int i = cell_offset; i < next_cell_offset; i++)
+    {
+        int permute_index = bouPermedIndex[i];
+        double gamma = boundary_scalar0[permute_index] + boundary_scalar1[permute_index];
+        double gamma_magsf = gamma * boundary_magsf[i];
+        // internal_coeffs += gamma_magsf * gradient_internal_coeffs[i * 3 + 0];
+        // boundary_coeffs += gamma_magsf * gradient_boundary_coeffs[i * 3 + 0];
+        internal_coeffs += gamma_magsf * 0.;
+        boundary_coeffs += gamma_magsf * 0.;
+    }
+
+    A_csr_output[csr_index] = A_csr_input[csr_index] + internal_coeffs * sign;
+    b_output[cell_index] = b_input[cell_index] + boundary_coeffs * sign;
+}
+
+dfYEqn::dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile, const int inertIndex)
+    : dataBase_(dataBase), inertIndex(inertIndex)
+{
+    num_species = dataBase_.num_species;
+    num_cells = dataBase_.num_cells;
+    num_faces = dataBase_.num_faces;
+    num_surfaces = dataBase_.num_surfaces;
+    num_boundary_cells = dataBase_.num_boundary_cells;
+    num_boundary_faces = dataBase_.num_boundary_faces;
+    cell_bytes = dataBase_.cell_bytes;
+
+    YSolverSet.resize(num_species - 1); // consider inert species
+    for (auto &solver : YSolverSet)
+        solver = new AmgXSolver(modeStr, cfgFile);
+
+    d_A_csr_row_index = dataBase_.d_A_csr_row_index;
+    d_A_csr_diag_index = dataBase_.d_A_csr_diag_index;
+    d_A_csr_col_index = dataBase_.d_A_csr_col_index;
+
+    h_A_csr = new double[(num_cells + num_faces) * (num_species - 1)];
+    h_b = new double[num_cells * (num_species - 1)];
+    cudaMallocHost(&h_psi, num_cells * (num_species - 1) * sizeof(double));
+
+    checkCudaErrors(cudaMalloc((void **)&d_A_csr, (num_cells + num_faces) * (num_species - 1) * sizeof(double)));
+    checkCudaErrors(cudaMalloc((void **)&d_b, cell_bytes * (num_species - 1)));
+    checkCudaErrors(cudaMalloc((void **)&d_psi, cell_bytes * (num_species - 1)));
+    checkCudaErrors(cudaMalloc((void **)&d_sumYDiffError, 3 * cell_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_sumYDiffError_tmp, 3 * cell_bytes));
+    checkCudaErrors(cudaMalloc((void **)&d_phiUc, num_faces * sizeof(double)));
+    checkCudaErrors(cudaMalloc((void **)&d_phiUc, num_faces * sizeof(double)));
+    checkCudaErrors(cudaMalloc((void **)&d_sumYDiffError_boundary, 3 * num_boundary_faces * sizeof(double)));
+    checkCudaErrors(cudaMalloc((void **)&d_phiUc_boundary, num_boundary_faces * sizeof(double)));
+    checkCudaErrors(cudaMalloc((void **)&d_mut_Sct, cell_bytes));
+
+    checkCudaErrors(cudaStreamCreate(&stream));
+    // zeroGradient
+    for (size_t i = 0; i < num_species; i++)
+    {
+        checkCudaErrors(cudaMemsetAsync(dataBase_.d_internal_coeffs_Y_vector[i], 1, dataBase_.boundary_face_bytes, stream));
+        checkCudaErrors(cudaMemsetAsync(dataBase_.d_boundary_coeffs_Y_vector[i], 0, dataBase_.boundary_face_bytes, stream));
+        checkCudaErrors(cudaMemsetAsync(dataBase_.d_laplac_internal_coeffs_Y_vector[i], 0, dataBase_.boundary_face_bytes, stream));
+        checkCudaErrors(cudaMemsetAsync(dataBase_.d_laplac_boundary_coeffs_Y_vector[i], 0, dataBase_.boundary_face_bytes, stream));
+    }
+}
+
+void dfYEqn::upwindWeight(double *refWeight)
+{
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_faces + threads_per_block - 1) / threads_per_block;
+    getUpwindWeight<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_faces, dataBase_.d_phi, dataBase_.d_weight_upwind);
+
+#ifdef _CHECK_
+    double *h_refWeight_init = new double[num_faces];
+    double *h_refWeight = new double[num_faces];
+    double *h_weight_upwind = new double[num_faces];
+    cudaMemcpy(h_weight_upwind, dataBase_.d_weight_upwind, num_faces * sizeof(double), cudaMemcpyDeviceToHost);
+    memcpy(h_refWeight_init, refWeight, num_surfaces * sizeof(double));
+    memcpy(h_refWeight_init + num_surfaces, refWeight, num_surfaces * sizeof(double));
+    for (size_t i = 0; i < num_faces; i++)
+        h_refWeight[i] = h_refWeight_init[dataBase_.permedIndex[i]];
+    for (size_t i = 0; i < num_faces; i++)
+        printf("h_weight_upwind[%d] = %lf\n", i, h_weight_upwind[i]);
+    for (size_t i = 0; i < num_faces; i++)
+        printf("h_refWeight[%d] = %lf\n", i, h_refWeight[i]);
+#endif
+}
+
+void dfYEqn::correctVelocity(std::vector<double *> Y_new, std::vector<double *> boundary_Y_init, std::vector<const double *> rhoD_GPU,
+                             const double *phiUcRef, double *boundary_sumYDiffError_ref, double *phiUcBouRef)
+{
+    // initialize variables in each time step
+    checkCudaErrors(cudaMemsetAsync(d_sumYDiffError, 0, 3 * cell_bytes, stream));
+    checkCudaErrors(cudaMemsetAsync(d_phiUc, 0, num_faces * sizeof(double), stream));
+
+    size_t threads_per_block, blocks_per_grid;
+    for (size_t i = 0; i < num_species; ++i)
+    {
+        checkCudaErrors(cudaMemsetAsync(d_sumYDiffError_tmp, 0, 3 * cell_bytes, stream));
+        checkCudaErrors(cudaMemcpyAsync(dataBase_.d_Y_new_vector[i], Y_new[i], cell_bytes, cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_Y_init_vector[i], boundary_Y_init[i], dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(dataBase_.d_rhoD_vector[i], rhoD_GPU[i], cell_bytes, cudaMemcpyHostToDevice, stream));
+
+        // launch cuda kernel
+        threads_per_block = 1024;
+        blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+        fvc_grad_internal_face_Y<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
+                                                                                    d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, dataBase_.d_volume,
+                                                                                    dataBase_.d_face_vector, dataBase_.d_weight, dataBase_.d_Y_new_vector[i],
+                                                                                    dataBase_.d_rhoD_vector[i], d_sumYDiffError_tmp, d_sumYDiffError_tmp);
+        blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
+        fvc_grad_boundary_face_Y<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_cells, dataBase_.d_boundary_cell_offset,
+                                                                                    dataBase_.d_boundary_cell_id, dataBase_.d_rhoD_vector[i], dataBase_.d_bouPermedIndex,
+                                                                                    dataBase_.d_boundary_face_vector, dataBase_.d_boundary_Y_init_vector[i], dataBase_.d_boundary_Y_vector[i],
+                                                                                    dataBase_.d_volume, d_sumYDiffError_tmp, d_sumYDiffError_tmp);
+        blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+        sumError<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, dataBase_.d_volume, dataBase_.d_rhoD_vector[i], d_sumYDiffError_tmp, d_sumYDiffError);
+    }
+    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    divide_vol<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, dataBase_.d_volume, d_sumYDiffError, d_sumYDiffError);
+
+    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
+    correct_boundary_conditions_vec<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+                                                                                       dataBase_.d_boundary_face_vector, dataBase_.d_boundary_face, d_sumYDiffError_boundary, d_sumYDiffError);
+
+    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    calculate_phiUc<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
+                                                                       d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, dataBase_.d_face_vector,
+                                                                       dataBase_.d_weight, d_sumYDiffError, d_phiUc);
+
+    blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block;
+    calculate_phiUc_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_faces, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+                                                                                dataBase_.d_boundary_face_vector, d_sumYDiffError_boundary, d_phiUc_boundary);
+
+#ifdef _CHECK_
+    checkCudaErrors(cudaStreamSynchronize(stream));
+    double *h_sumYDiffError = new double[num_cells * 3];
+    double *h_phiUc = new double[num_faces];
+    double *h_phiUc_ref_init = new double[num_faces];
+    double *h_phiUc_ref = new double[num_faces];
+    double *h_sumYDiffError_boundary = new double[num_boundary_faces * 3];
+    double *h_sumYDiffError_boundary_gpu = new double[num_boundary_faces * 3];
+    double *h_phiUc_boundary = new double[num_boundary_faces];
+    double *h_phiUc_boundary_gpu = new double[num_boundary_faces];
+
+    memcpy(h_phiUc_ref_init, phiUcRef, num_surfaces * sizeof(double));
+    memcpy(h_phiUc_ref_init + num_surfaces, phiUcRef, num_surfaces * sizeof(double));
+    for (int i = 0; i < num_faces; i++)
+    {
+        h_phiUc_ref[i] = h_phiUc_ref_init[dataBase_.permedIndex[i]];
+    }
+    for (int i = 0; i < num_boundary_faces; i++)
+    {
+        h_sumYDiffError_boundary[3 * i] = boundary_sumYDiffError_ref[3 * dataBase_.boundPermutationList[i]];
+        h_sumYDiffError_boundary[3 * i + 1] = boundary_sumYDiffError_ref[3 * dataBase_.boundPermutationList[i] + 1];
+        h_sumYDiffError_boundary[3 * i + 2] = boundary_sumYDiffError_ref[3 * dataBase_.boundPermutationList[i] + 2];
+        h_phiUc_boundary[i] = phiUcBouRef[dataBase_.boundPermutationList[i]];
+    }
+    cudaMemcpy(h_sumYDiffError, d_sumYDiffError, num_cells * sizeof(double) * 3, cudaMemcpyDeviceToHost);
+    cudaMemcpy(h_phiUc, d_phiUc, num_faces * sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(h_sumYDiffError_boundary_gpu, d_sumYDiffError_boundary, 3 * num_boundary_faces * sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(h_phiUc_boundary_gpu, d_phiUc_boundary, num_boundary_faces * sizeof(double), cudaMemcpyDeviceToHost);
+    // for (size_t i = 0; i < num_cells; i++)
+    //     printf("h_sumYDiffError[%d] = (%e, %e, %e)\n", i, h_sumYDiffError[num_cells * 0 + i],
+    //            h_sumYDiffError[num_cells * 1 + i], h_sumYDiffError[num_cells * 2 + i]);
+    // for (size_t i = 0; i < num_faces; i++)
+    //     printf("phiUc[%d] = %e\n", i, h_phiUc[i]);
+    // for (size_t i = 0; i < num_faces; i++)
+    //     printf("phiUcof[%d] = %e\n", i, h_phiUc_ref[i]);
+    for (int i = 0; i < num_boundary_faces; i++)
+    {
+        // printf("boundary_sumYDiffError_of[%d] = (%e, %e, %e)\n", i, h_sumYDiffError_boundary[3 * i],
+        //        h_sumYDiffError_boundary[3 * i + 1], h_sumYDiffError_boundary[3 * i + 2]);
+        printf("boundary_phiUc_of[%d] = %e\n", i, h_phiUc_boundary[i]);
+    }
+    for (int i = 0; i < num_boundary_faces; i++)
+    {
+        // printf("boundary_phiUc_gpu[%d] = (%e, %e, %e)\n", i, h_sumYDiffError_boundary_gpu[3 * i],
+        //        h_sumYDiffError_boundary_gpu[3 * i + 1], h_sumYDiffError_boundary_gpu[3 * i + 2]);
+        printf("boundary_phiUc_gpu[%d] = %e\n", i, h_phiUc_boundary_gpu[i]);
+    }
+#endif
+}
+
+void dfYEqn::fvm_ddt(std::vector<double *> Y_old)
+{
+    // initialize variables in each time step
+    checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, (num_cells + num_faces) * (num_species - 1) * sizeof(double), stream)); // consider inert species
+    checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_bytes * (num_species - 1), stream));
+    checkCudaErrors(cudaMemsetAsync(d_psi, 0, cell_bytes * (num_species - 1), stream));
+
+    size_t threads_per_block, blocks_per_grid;
+    int mtxIndex = 0;
+    for (size_t i = 0; i < num_species; ++i)
+    {
+        if (i == inertIndex)
+            continue;
+        checkCudaErrors(cudaMemcpyAsync(dataBase_.d_Y_old_vector[i], Y_old[i], cell_bytes, cudaMemcpyHostToDevice, stream));
+
+        // launch cuda kernel
+        threads_per_block = 1024;
+        blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+        fvm_ddt_kernel_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces, dataBase_.rdelta_t,
+                                                                                 d_A_csr_row_index, d_A_csr_diag_index,
+                                                                                 dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, dataBase_.d_Y_old_vector[i],
+                                                                                 d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
+                                                                                 d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
+                                                                                 d_psi + mtxIndex * num_cells);
+        ++mtxIndex;
+    }
+}
+
+void dfYEqn::fvm_div_phi()
+{
+    size_t threads_per_block, blocks_per_grid;
+    int mtxIndex = 0;
+    for (size_t i = 0; i < num_species; ++i)
+    {
+        if (i == inertIndex)
+            continue;
+
+        // launch cuda kernel
+        threads_per_block = 512;
+        blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+        fvm_div_internal_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces,
+                                                                                   d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_weight_upwind, dataBase_.d_phi,
+                                                                                   d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
+                                                                                   d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells);
+
+        blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
+        fvm_div_boundary_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces, num_boundary_cells,
+                                                                                   d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_phi,
+                                                                                   dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+                                                                                   dataBase_.d_internal_coeffs_Y_vector[i], dataBase_.d_boundary_coeffs_Y_vector[i],
+                                                                                   d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
+                                                                                   d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells);
+        ++mtxIndex;
+    }
+}
+
+void dfYEqn::fvm_div_phiUc()
+{
+    size_t threads_per_block, blocks_per_grid;
+    int mtxIndex = 0;
+    for (size_t i = 0; i < num_species; ++i)
+    {
+        if (i == inertIndex)
+            continue;
+        // launch cuda kernel
+        threads_per_block = 512;
+        blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+        fvm_div_internal_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces,
+                                                                                   d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_weight_upwind, d_phiUc,
+                                                                                   d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
+                                                                                   d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells);
+
+        blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
+        fvm_div_boundary_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces, num_boundary_cells,
+                                                                                   d_A_csr_row_index, d_A_csr_diag_index, d_phiUc_boundary,
+                                                                                   dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+                                                                                   dataBase_.d_internal_coeffs_Y_vector[i], dataBase_.d_boundary_coeffs_Y_vector[i],
+                                                                                   d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
+                                                                                   d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells);
+        ++mtxIndex;
+    }
+}
+
+void dfYEqn::fvm_laplacian(double *mut_Sct, double *boundary_mut_Sct, std::vector<double *> boundary_rhoD)
+{
+    size_t threads_per_block, blocks_per_grid;
+    int mtxIndex = 0;
+
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_mut_sct, boundary_mut_Sct, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_mut_Sct, mut_Sct, cell_bytes, cudaMemcpyHostToDevice, stream));
+
+    for (size_t i = 0; i < num_species; ++i)
+    {
+        if (i == inertIndex)
+            continue;
+        checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_rhoD_vector[i], boundary_rhoD[i], dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+
+        // launch cuda kernel
+        threads_per_block = 1024;
+        blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+        fvm_laplacian_uncorrected_scalar_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
+                                                                                                     d_mut_Sct, dataBase_.d_rhoD_vector[i], dataBase_.d_weight, dataBase_.d_face,
+                                                                                                     dataBase_.d_deltaCoeffs, -1., d_A_csr + mtxIndex * (num_cells + num_faces),
+                                                                                                     d_A_csr + mtxIndex * (num_cells + num_faces));
+        blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
+        fvm_laplacian_uncorrected_scalar_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_cells,
+                                                                                                     d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset,
+                                                                                                     dataBase_.d_boundary_cell_id, dataBase_.d_boundary_rhoD_vector[i],
+                                                                                                     dataBase_.d_boundary_mut_sct, dataBase_.d_boundary_face, dataBase_.d_bouPermedIndex,
+                                                                                                     dataBase_.d_laplac_internal_coeffs_Y_vector[i], dataBase_.d_laplac_boundary_coeffs_Y_vector[i], -1.,
+                                                                                                     d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells,
+                                                                                                     d_A_csr + mtxIndex * (num_cells + num_faces), d_b + mtxIndex * num_cells);
+        ++mtxIndex;
+    }
+}
+
+void dfYEqn::checkValue(bool print, char *filename)
+{
+    checkCudaErrors(cudaMemcpyAsync(h_A_csr, d_A_csr, (num_cells + num_faces) * sizeof(double), cudaMemcpyDeviceToHost, stream));
+    checkCudaErrors(cudaMemcpyAsync(h_b, d_b, num_cells * sizeof(double), cudaMemcpyDeviceToHost, stream));
+
+    // Synchronize stream
+    checkCudaErrors(cudaStreamSynchronize(stream));
+    if (print)
+    {
+        for (int i = 0; i < (num_faces + num_cells); i++)
+            fprintf(stderr, "h_A_csr[%d]: %.15lf\n", i, h_A_csr[i]);
+        for (int i = 0; i < num_cells; i++)
+            fprintf(stderr, "h_b[%d]: %.15lf\n", i, h_b[i]);
+    }
+
+    char *input_file = filename;
+    FILE *fp = fopen(input_file, "rb+");
+    if (fp == NULL)
+    {
+        fprintf(stderr, "Failed to open input file: %s!\n", input_file);
+    }
+
+    int readfile = 0;
+    double *of_b = new double[num_cells];
+    double *of_A = new double[num_faces + num_cells];
+    readfile = fread(of_b, num_cells * sizeof(double), 1, fp);
+    readfile = fread(of_A, (num_faces + num_cells) * sizeof(double), 1, fp);
+
+    std::vector<double> h_A_of_vec_1mtx(num_faces + num_cells, 0);
+    for (int i = 0; i < num_faces + num_cells; i++)
+    {
+        h_A_of_vec_1mtx[i] = of_A[dataBase_.tmpPermutatedList[i]];
+    }
+    if (print)
+    {
+        for (int i = 0; i < (num_faces + num_cells); i++)
+            printf("h_A_of_vec_1mtx[%d]: %.15lf\n", i, h_A_of_vec_1mtx[i]);
+        for (int i = 0; i < num_cells; i++)
+            printf("h_b_of_vec[%d]: %.15lf\n", i, of_b[i]);
+    }
+
+    fprintf(stderr, "check of h_A_csr\n");
+    checkVectorEqual(num_faces + num_cells, h_A_of_vec_1mtx.data(), h_A_csr, 1e-5);
+    fprintf(stderr, "check of h_b\n");
+    checkVectorEqual(num_cells, of_b, h_b, 1e-5);
+}
+
+void dfYEqn::solve()
+{
+    checkCudaErrors(cudaStreamSynchronize(stream));
+
+    int nNz = num_cells + num_faces; // matrix entries
+    if (num_iteration == 0)          // first interation
+    {
+        printf("Initializing AmgX Linear Solver\n");
+        int solverIndex = 0;
+        for (auto &solver : YSolverSet)
+        {
+            solver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr + solverIndex * nNz);
+            ++solverIndex;
+        }
+    }
+    else
+    {
+        int solverIndex = 0;
+        for (auto &solver : YSolverSet)
+        {
+            solver->updateOperator(num_cells, nNz, d_A_csr + solverIndex * nNz);
+            ++solverIndex;
+        }
+    }
+    int solverIndex = 0;
+    for (auto &solver : YSolverSet)
+    {
+        solver->solve(num_cells, d_psi + solverIndex * num_cells, d_b + solverIndex * num_cells);
+        ++solverIndex;
+    }
+    num_iteration++;
+    checkCudaErrors(cudaMemcpyAsync(h_psi, d_psi, num_cells * (num_species - 1) * sizeof(double), cudaMemcpyDeviceToHost, stream));
+    // checkCudaErrors(cudaStreamSynchronize(stream));
+    // for (size_t i = 0; i < num_cells; i++)
+    //     fprintf(stderr, "h_species_gpu[%d]: %.5e\n", i, h_psi[i + 0 * num_cells]);
+}
+
+void dfYEqn::updatePsi(double *Psi, int speciesIndex)
+{
+    checkCudaErrors(cudaStreamSynchronize(stream));
+    for (size_t i = 0; i < num_cells; i++)
+        Psi[i] = h_psi[i + speciesIndex * num_cells];
+}
+
+dfYEqn::~dfYEqn()
+{
+}
\ No newline at end of file

From 5cd1e536394ae7967ac5c792f4d6b28d5faca6b3 Mon Sep 17 00:00:00 2001
From: maorz1998 <maorz1998@stu.pku.edu.cn>
Date: Fri, 2 Jun 2023 12:59:23 +0800
Subject: [PATCH 2/2] clean version

---
 applications/solvers/dfLowMachFoam/YEqn.H | 158 +++++++++-------------
 src_gpu/dfYEqn.H                          |   5 +-
 src_gpu/dfYEqn.cu                         |  69 +---------
 3 files changed, 65 insertions(+), 167 deletions(-)

diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H
index e5cbd15d7..5e44d9b3f 100644
--- a/applications/solvers/dfLowMachFoam/YEqn.H
+++ b/applications/solvers/dfLowMachFoam/YEqn.H
@@ -15,23 +15,6 @@ tmp<fv::convectionScheme<scalar>> mvConvection
     )
 );
 
-upwind<scalar> mvConvectionInterpolation(mesh, phi);
-tmp<surfaceScalarField> t_upwind_weight = mvConvectionInterpolation.weights();
-surfaceScalarField upwind_weight = t_upwind_weight.ref();
-
-volVectorField sumYGrad
-(
-    IOobject
-    (
-        "sumYGrad",
-        runTime.timeName(),
-        mesh,
-        IOobject::NO_READ,
-        IOobject::NO_WRITE
-    ),
-    mesh,
-    dimensionedVector(dimensionSet(0,-1,0,0,0,0,0), Zero)
-);
 
 forAll(Y, i)
 {
@@ -39,25 +22,6 @@ forAll(Y, i)
 }
 const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf();
 
-// for (size_t i = 0; i < num_cells; i++)
-// {
-//     printf("sumYDiffError[%d] = (%e, %e, %e)\n", i, sumYDiffError[i][0], sumYDiffError[i][1], 
-//     sumYDiffError[i][2]);
-//     // printf("fvc::grad[%d] = (%e, %e, %e)\n", i, test_grad[i][0], test_grad[i][1], 
-//     // test_grad[i][2]);
-// }
-
-const surfaceVectorField test_interpolate = linearInterpolate(sumYDiffError).ref();
-
-// for (size_t i = 0; i < num_surfaces; i++)
-// {
-//     // printf("test_interpolate[%d] = (%e, %e, %e)\n", i, test_interpolate[i][0], test_interpolate[i][1], 
-//     // test_interpolate[i][2]);
-//     // printf("surface vector[%d] = (%e, %e, %e)\n", i, mesh.Sf()[i][0], mesh.Sf()[i][1], 
-//     // mesh.Sf()[i][2]);
-//     printf("phiUcOf[%d] = %e\n", i, phiUc[i]);
-// }
-
 #ifdef GPUSolver_
     start1 = std::clock();
     // // std::thread t(&dfMatrix::solve, &UEqn_GPU);
@@ -67,53 +31,49 @@ const surfaceVectorField test_interpolate = linearInterpolate(sumYDiffError).ref
     time_monitor_UEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 #endif
 
-std::vector<double*> Y_new(Y.size()), Y_old(Y.size()), boundary_Y_init(Y.size()), 
-    boundary_rhoD_init(Y.size());
-std::vector<const double*> rhoD_GPU(Y.size());
-for (size_t i = 0; i < Y.size(); ++i)
-{
-    volScalarField& Yi = Y[i];
-    const volScalarField& rhoDi = chemistry->rhoD(i);
-    Y_new[i] = &Yi[0];
-    Y_old[i] = &Yi.oldTime()[0];
-    rhoD_GPU[i] = &chemistry->rhoD(i)[0];
-    cudaMallocHost(&boundary_Y_init[i], num_boundary_faces*sizeof(double));
-    cudaMallocHost(&boundary_rhoD_init[i], num_boundary_faces*sizeof(double));
-    int offset = 0;
-    forAll(Yi.boundaryField(), patchi)
+#ifdef GPUSolver_
+    std::vector<double*> Y_new(Y.size()), Y_old(Y.size()), boundary_Y_init(Y.size()), boundary_rhoD_init(Y.size());
+    std::vector<const double*> rhoD_GPU(Y.size());
+    for (size_t i = 0; i < Y.size(); ++i)
     {
-        const scalarField& patchYi = Yi.boundaryField()[patchi];
-        const scalarField& patchRhoDi = rhoDi.boundaryField()[patchi];
-        int patchSize = patchYi.size();
+        volScalarField& Yi = Y[i];
+        const volScalarField& rhoDi = chemistry->rhoD(i);
+        Y_new[i] = &Yi[0];
+        Y_old[i] = &Yi.oldTime()[0];
+        rhoD_GPU[i] = &chemistry->rhoD(i)[0];
+        cudaMallocHost(&boundary_Y_init[i], num_boundary_faces*sizeof(double));
+        cudaMallocHost(&boundary_rhoD_init[i], num_boundary_faces*sizeof(double));
+        int offset = 0;
+        forAll(Yi.boundaryField(), patchi)
+        {
+            const scalarField& patchYi = Yi.boundaryField()[patchi];
+            const scalarField& patchRhoDi = rhoDi.boundaryField()[patchi];
+            int patchSize = patchYi.size();
 
-        memcpy(boundary_Y_init[i]+offset, &patchYi[0], patchSize*sizeof(double));
-        memcpy(boundary_rhoD_init[i]+offset, &patchRhoDi[0], patchSize*sizeof(double));
-        offset += patchSize;
+            memcpy(boundary_Y_init[i]+offset, &patchYi[0], patchSize*sizeof(double));
+            memcpy(boundary_rhoD_init[i]+offset, &patchRhoDi[0], patchSize*sizeof(double));
+            offset += patchSize;
+        }
     }
-}
 
-volScalarField mut_sct = turbulence->mut().ref()/Sct;
-std::vector<double> boundary_sumYDiffError, boundary_phiUc, boundary_mutsct;
-forAll(p.boundaryField(), patchi)
-{
-    const vectorField& patchErri = sumYDiffError.boundaryField()[patchi];
-    const scalarField& patchPhiUc = phiUc.boundaryField()[patchi];
-    const scalarField& patchMut_sct = mut_sct.boundaryField()[patchi];
-
-    int patchSize = patchErri.size();
-    boundary_sumYDiffError.insert(boundary_sumYDiffError.end(), &patchErri[0][0], &patchErri[0][0]+3*patchSize);
-    boundary_phiUc.insert(boundary_phiUc.end(), &patchPhiUc[0], &patchPhiUc[0] + patchSize);
-    boundary_mutsct.insert(boundary_mutsct.end(), &patchMut_sct[0], &patchMut_sct[0] + patchSize);
-}
+    volScalarField mut_sct = turbulence->mut().ref()/Sct;
+    std::vector<double> boundary_mutsct;
+    forAll(p.boundaryField(), patchi)
+    {
+        const scalarField& patchMut_sct = mut_sct.boundaryField()[patchi];
+        int patchSize = patchMut_sct.size();
+        boundary_mutsct.insert(boundary_mutsct.end(), &patchMut_sct[0], &patchMut_sct[0] + patchSize);
+    }
 
-YEqn_GPU.upwindWeight(&upwind_weight[0]);
-YEqn_GPU.correctVelocity(Y_new, boundary_Y_init, rhoD_GPU, &phiUc[0], boundary_sumYDiffError.data(), boundary_phiUc.data());
-YEqn_GPU.fvm_ddt(Y_old);
-YEqn_GPU.fvm_div_phi();
-YEqn_GPU.fvm_div_phiUc();
-YEqn_GPU.fvm_laplacian(&mut_sct[0], boundary_mutsct.data(), boundary_rhoD_init);
+    YEqn_GPU.upwindWeight();
+    YEqn_GPU.correctVelocity(Y_new, boundary_Y_init, rhoD_GPU);
+    YEqn_GPU.fvm_ddt(Y_old);
+    YEqn_GPU.fvm_div_phi();
+    YEqn_GPU.fvm_div_phiUc();
+    YEqn_GPU.fvm_laplacian(&mut_sct[0], boundary_mutsct.data(), boundary_rhoD_init);
 
-YEqn_GPU.solve();
+    YEqn_GPU.solve();
+#endif
 
 //MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM);
 label flag_mpi_init;
@@ -147,26 +107,30 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC);
 
         if (i != inertIndex)
         {
-            // tmp<volScalarField> DEff = chemistry->rhoD(i) + turbulence->mut()/Sct;
-            // fvScalarMatrix YiEqn
-            // (
-            //     fvm::ddt(rho, Yi)
-            // + mvConvection->fvmDiv(phi, Yi)
-            // + mvConvection->fvmDiv(phiUc, Yi)
-            // ==
-            //     (
-            //         splitting
-            //     ?   fvm::laplacian(DEff(), Yi)
-            //     :  (fvm::laplacian(DEff(), Yi) + combustion->R(Yi))
-            //     )
-            // );
-
-            // YiEqn.relax();
-
-            // YiEqn.solve("Yi");
-
-            YEqn_GPU.updatePsi(&Yi[0], speciesIndex);
-            Yi.correctBoundaryConditions();
+            #ifdef CPUSolver_
+                tmp<volScalarField> DEff = chemistry->rhoD(i) + turbulence->mut()/Sct;
+                fvScalarMatrix YiEqn
+                (
+                    fvm::ddt(rho, Yi)
+                + mvConvection->fvmDiv(phi, Yi)
+                + mvConvection->fvmDiv(phiUc, Yi)
+                ==
+                    (
+                        splitting
+                    ?   fvm::laplacian(DEff(), Yi)
+                    :  (fvm::laplacian(DEff(), Yi) + combustion->R(Yi))
+                    )
+                );
+
+                YiEqn.relax();
+
+                YiEqn.solve("Yi");
+            #endif
+
+            #ifdef GPUSolver_
+                YEqn_GPU.updatePsi(&Yi[0], speciesIndex);
+                Yi.correctBoundaryConditions();
+            #endif
 
             Yi.max(0.0);
             Yt += Yi;
diff --git a/src_gpu/dfYEqn.H b/src_gpu/dfYEqn.H
index 85f4493a6..e477d0d2e 100644
--- a/src_gpu/dfYEqn.H
+++ b/src_gpu/dfYEqn.H
@@ -35,10 +35,9 @@ public:
 
     void checkValue(bool print, char *filename);
 
-    void correctVelocity(std::vector<double *> Y_new, std::vector<double *> boundary_Y_init, std::vector<const double *> rhoD_GPU,
-                         const double *phiUcRef, double *boundary_sumYDiffError_ref, double *phiUcBouRef);
+    void correctVelocity(std::vector<double *> Y_new, std::vector<double *> boundary_Y_init, std::vector<const double *> rhoD_GPU);
 
-    void upwindWeight(double *refWeight);
+    void upwindWeight();
 
     void fvm_ddt(std::vector<double *> Y_old);
 
diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu
index d25efb862..b507763a0 100644
--- a/src_gpu/dfYEqn.cu
+++ b/src_gpu/dfYEqn.cu
@@ -501,30 +501,14 @@ dfYEqn::dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std
     }
 }
 
-void dfYEqn::upwindWeight(double *refWeight)
+void dfYEqn::upwindWeight()
 {
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_faces + threads_per_block - 1) / threads_per_block;
     getUpwindWeight<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_faces, dataBase_.d_phi, dataBase_.d_weight_upwind);
-
-#ifdef _CHECK_
-    double *h_refWeight_init = new double[num_faces];
-    double *h_refWeight = new double[num_faces];
-    double *h_weight_upwind = new double[num_faces];
-    cudaMemcpy(h_weight_upwind, dataBase_.d_weight_upwind, num_faces * sizeof(double), cudaMemcpyDeviceToHost);
-    memcpy(h_refWeight_init, refWeight, num_surfaces * sizeof(double));
-    memcpy(h_refWeight_init + num_surfaces, refWeight, num_surfaces * sizeof(double));
-    for (size_t i = 0; i < num_faces; i++)
-        h_refWeight[i] = h_refWeight_init[dataBase_.permedIndex[i]];
-    for (size_t i = 0; i < num_faces; i++)
-        printf("h_weight_upwind[%d] = %lf\n", i, h_weight_upwind[i]);
-    for (size_t i = 0; i < num_faces; i++)
-        printf("h_refWeight[%d] = %lf\n", i, h_refWeight[i]);
-#endif
 }
 
-void dfYEqn::correctVelocity(std::vector<double *> Y_new, std::vector<double *> boundary_Y_init, std::vector<const double *> rhoD_GPU,
-                             const double *phiUcRef, double *boundary_sumYDiffError_ref, double *phiUcBouRef)
+void dfYEqn::correctVelocity(std::vector<double *> Y_new, std::vector<double *> boundary_Y_init, std::vector<const double *> rhoD_GPU)
 {
     // initialize variables in each time step
     checkCudaErrors(cudaMemsetAsync(d_sumYDiffError, 0, 3 * cell_bytes, stream));
@@ -568,55 +552,6 @@ void dfYEqn::correctVelocity(std::vector<double *> Y_new, std::vector<double *>
     blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block;
     calculate_phiUc_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_faces, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
                                                                                 dataBase_.d_boundary_face_vector, d_sumYDiffError_boundary, d_phiUc_boundary);
-
-#ifdef _CHECK_
-    checkCudaErrors(cudaStreamSynchronize(stream));
-    double *h_sumYDiffError = new double[num_cells * 3];
-    double *h_phiUc = new double[num_faces];
-    double *h_phiUc_ref_init = new double[num_faces];
-    double *h_phiUc_ref = new double[num_faces];
-    double *h_sumYDiffError_boundary = new double[num_boundary_faces * 3];
-    double *h_sumYDiffError_boundary_gpu = new double[num_boundary_faces * 3];
-    double *h_phiUc_boundary = new double[num_boundary_faces];
-    double *h_phiUc_boundary_gpu = new double[num_boundary_faces];
-
-    memcpy(h_phiUc_ref_init, phiUcRef, num_surfaces * sizeof(double));
-    memcpy(h_phiUc_ref_init + num_surfaces, phiUcRef, num_surfaces * sizeof(double));
-    for (int i = 0; i < num_faces; i++)
-    {
-        h_phiUc_ref[i] = h_phiUc_ref_init[dataBase_.permedIndex[i]];
-    }
-    for (int i = 0; i < num_boundary_faces; i++)
-    {
-        h_sumYDiffError_boundary[3 * i] = boundary_sumYDiffError_ref[3 * dataBase_.boundPermutationList[i]];
-        h_sumYDiffError_boundary[3 * i + 1] = boundary_sumYDiffError_ref[3 * dataBase_.boundPermutationList[i] + 1];
-        h_sumYDiffError_boundary[3 * i + 2] = boundary_sumYDiffError_ref[3 * dataBase_.boundPermutationList[i] + 2];
-        h_phiUc_boundary[i] = phiUcBouRef[dataBase_.boundPermutationList[i]];
-    }
-    cudaMemcpy(h_sumYDiffError, d_sumYDiffError, num_cells * sizeof(double) * 3, cudaMemcpyDeviceToHost);
-    cudaMemcpy(h_phiUc, d_phiUc, num_faces * sizeof(double), cudaMemcpyDeviceToHost);
-    cudaMemcpy(h_sumYDiffError_boundary_gpu, d_sumYDiffError_boundary, 3 * num_boundary_faces * sizeof(double), cudaMemcpyDeviceToHost);
-    cudaMemcpy(h_phiUc_boundary_gpu, d_phiUc_boundary, num_boundary_faces * sizeof(double), cudaMemcpyDeviceToHost);
-    // for (size_t i = 0; i < num_cells; i++)
-    //     printf("h_sumYDiffError[%d] = (%e, %e, %e)\n", i, h_sumYDiffError[num_cells * 0 + i],
-    //            h_sumYDiffError[num_cells * 1 + i], h_sumYDiffError[num_cells * 2 + i]);
-    // for (size_t i = 0; i < num_faces; i++)
-    //     printf("phiUc[%d] = %e\n", i, h_phiUc[i]);
-    // for (size_t i = 0; i < num_faces; i++)
-    //     printf("phiUcof[%d] = %e\n", i, h_phiUc_ref[i]);
-    for (int i = 0; i < num_boundary_faces; i++)
-    {
-        // printf("boundary_sumYDiffError_of[%d] = (%e, %e, %e)\n", i, h_sumYDiffError_boundary[3 * i],
-        //        h_sumYDiffError_boundary[3 * i + 1], h_sumYDiffError_boundary[3 * i + 2]);
-        printf("boundary_phiUc_of[%d] = %e\n", i, h_phiUc_boundary[i]);
-    }
-    for (int i = 0; i < num_boundary_faces; i++)
-    {
-        // printf("boundary_phiUc_gpu[%d] = (%e, %e, %e)\n", i, h_sumYDiffError_boundary_gpu[3 * i],
-        //        h_sumYDiffError_boundary_gpu[3 * i + 1], h_sumYDiffError_boundary_gpu[3 * i + 2]);
-        printf("boundary_phiUc_gpu[%d] = %e\n", i, h_phiUc_boundary_gpu[i]);
-    }
-#endif
 }
 
 void dfYEqn::fvm_ddt(std::vector<double *> Y_old)