diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H
index e676f28cc..a2e351074 100644
--- a/applications/solvers/dfLowMachFoam/EEqn.H
+++ b/applications/solvers/dfLowMachFoam/EEqn.H
@@ -30,9 +30,6 @@
      ==
         fvc::div(hDiffCorrFlux)
     );
-    printf("EEqn = %.15lf\n", EEqn.boundaryCoeffs()[0][64]);
-    printf("EEqn = %.15lf\n", EEqn.boundaryCoeffs()[1][64]);
-    printf("EEqn = %.15lf\n", EEqn.boundaryCoeffs()[5][1]);
     end1 = std::clock();
     time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
     time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
@@ -67,23 +64,6 @@
     int eeqn_offset = 0;
     int patchNum = 0;
 
-    //debug
-    // {
-    //     const fvPatchScalarField& hew = he.boundaryField()[5];
-    //     const basicThermo& bThermo = basicThermo::lookupThermo(hew);
-    //     const scalarField& pw = bThermo.p().boundaryField()[5];
-    //     fvPatchScalarField& Tw =
-    //         const_cast<fvPatchScalarField&>(bThermo.T().boundaryField()[5]);
-    //     scalarField& Tw_v = Tw;
-
-    //     Tw.evaluate();
-
-    //     Info << "internal field" <<bThermo.he(pw, Tw, mesh.boundary()[5].faceCells()) << endl;
-    //     Info << "boundary field" <<bThermo.he(pw, Tw, 5) << endl;
-    //     Info << "calculated grad" << 
-    //         mesh.boundary()[5].deltaCoeffs() * (bThermo.he(pw, Tw, 5) - bThermo.he(pw, Tw, mesh.boundary()[5].faceCells())) << endl;
-    // }
-
     forAll(he.boundaryField(), patchi)
     {
         patchNum++;
@@ -105,49 +85,17 @@
         const scalarField patchGradMau = patchDeltaCoeff * (heInternal - heBoundary);
 
         const scalarField& patchK = K.boundaryField()[patchi];
-        // const scalarField& patchAlphaEff = alphaEff.boundaryField()[patchi];
-        const scalarField& patchGrad = he.boundaryField()[patchi].gradientBoundaryCoeffs(); // gradient_
+        // const scalarField& patchAlphaEff = alphaEff.boundaryField()[patchi]; // not H2Dcopy when use UnityLewis
+        // const scalarField& patchGrad = he.boundaryField()[patchi].gradientBoundaryCoeffs(); // gradient_
+
+        // const DimensionedField<scalar, volMesh>& patchHa_ = he.boundaryField()[patchi];
+        // const gradientEnergyFvPatchScalarField patchHa(mesh.boundary()[patchi], patchHa_);
+        // const scalarField& patchGrad = patchHa.gradient(); // gradient_
         memcpy(boundary_K + eeqn_offset, &patchK[0], patchSize*sizeof(double));
         // memcpy(boundary_alphaEff + eeqn_offset, &patchAlphaEff[0], patchSize*sizeof(double));
-        memcpy(boundary_gradient + eeqn_offset, &patchGrad[0], patchSize*sizeof(double));
+        memcpy(boundary_gradient + eeqn_offset, &patchGradMau[0], patchSize*sizeof(double));
 
         eeqn_offset += patchSize;
-
-        // debug
-        // const fvsPatchScalarField& patchFlux = phi.boundaryField()[patchi];
-        // // Field<scalar> valueInternalCoeffs = patchFlux*he.boundaryField()[patchi].valueInternalCoeffs(pw);
-        // // Field<scalar> valueBoundaryCoeffs = -patchFlux*he.boundaryField()[patchi].valueBoundaryCoeffs(pw);
-        // Field<scalar> valueInternalCoeffs = patchFlux*he.boundaryField()[patchi].valueInternalCoeffs(pw);
-        // Field<scalar> valueBoundaryCoeffs = -patchFlux*he.boundaryField()[patchi].valueBoundaryCoeffs(pw);
-        // Field<scalar> gradientInternalCoeffs = he.boundaryField()[patchi].gradientInternalCoeffs();
-        // Field<scalar> gradientBoundaryCoeffs = he.boundaryField()[patchi].gradientBoundaryCoeffs();
-        // labelUList sub_boundary = mesh.boundary()[patchi].faceCells();
-
-        // calculate grad manually
-        // const fvPatchScalarField& hew = he.boundaryField()[patchi];
-        // const basicThermo& bThermo = basicThermo::lookupThermo(hew);
-        // const scalarField& ppw = bThermo.p().boundaryField()[patchi];
-        // fvPatchScalarField& Tw =
-        //     const_cast<fvPatchScalarField&>(bThermo.T().boundaryField()[patchi]);
-        // scalarField& Tw_v = Tw;
-
-        // Tw.evaluate();
-        // const scalarField& patchDeltaCoeff = mesh.boundary()[patchi].deltaCoeffs();
-        // const scalarField heInternal = bThermo.he(ppw, Tw, patchi)();
-        // const scalarField heBoundary = bThermo.he(ppw, Tw, mesh.boundary()[patchi].faceCells())();
-        // const scalarField patchGradMau = patchDeltaCoeff * (heInternal - heBoundary);
-                
-        // forAll(sub_boundary, i){
-        //     if (sub_boundary[i] == 1)
-        //     {
-        //         printf("\npatchFlux = %.10lf\n", patchFlux[i]);
-        //         printf("valueBoundaryCoeffs = %.10lf\n", he.boundaryField()[patchi].valueBoundaryCoeffs(pw)()[i]);
-        //         printf("valueBoundaryCoeffs_CPU = %.10lf\n", valueBoundaryCoeffs[i]);
-        //         printf("patchGrad = %.10lf\n", patchGrad[i]);
-        //         printf("patchGradMau = %.10lf\n", patchGradMau[i]);
-        //         printf("patch = %d, cellID = %d\n", patchi, i);
-        //     }
-        // }
     }
     end1 = std::clock();
     time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
@@ -159,6 +107,8 @@
 
     // prepare data on GPU
     start1 = std::clock();
+    he.oldTime();
+    K.oldTime();
     EEqn_GPU.prepare_data(&he.oldTime()[0], &K[0], &K.oldTime()[0], alphaEff,
             &dpdt[0], boundary_K, boundary_alphaEff, boundary_gradient);
     EEqn_GPU.sync();
@@ -194,6 +144,7 @@
     start1 = std::clock();
     EEqn_GPU.updatePsi(&he[0]);
     he.correctBoundaryConditions();
+    he.write();
     end1 = std::clock();
     time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
     time_monitor_EEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC);
diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H
index 190636f6f..65bc7e788 100644
--- a/applications/solvers/dfLowMachFoam/UEqn.H
+++ b/applications/solvers/dfLowMachFoam/UEqn.H
@@ -30,6 +30,7 @@
 
     start1 = std::clock();
     UEqn_GPU.initializeTimeStep();
+    U.oldTime();
     UEqn_GPU.fvm_ddt(&U.oldTime()[0][0]);
     UEqn_GPU.fvm_div(boundary_pressure_init, boundary_velocity_init, boundary_nuEff_init, boundary_rho_init);
     UEqn_GPU.fvc_grad(&p[0]);
@@ -57,13 +58,44 @@
     // time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
     // check value
+    // U.oldTime();
+    // tmp<fvVectorMatrix> tUEqn
+    // (
+        // fvm::ddt(rho, U) 
+        // + 
+        // fvm::div(phi, U)
+        // + 
+        // turbulence->divDevRhoReff(U) 
+        // == -fvc::grad(p)
+    // );
+    // fvVectorMatrix& UEqn = tUEqn.ref();
+    // printf("b_cpu = %e\n", UEqn.source()[1][1]);
+    // forAll(U.boundaryField(), patchi){
+        // labelUList sub_boundary = mesh.boundary()[patchi].faceCells();
+        // forAll(sub_boundary, i){
+        //     if (sub_boundary[i] == 1){
+        //         printf("b_cpu_bou = %e\n", UEqn.boundaryCoeffs()[patchi][i][1]);
+        //         printf("patchi = %d, i = %d\n", patchi, i);
+        //     }
+        // }
+    // }
+    // const tmp<volTensorField> tgradU(fvc::grad(U));
+    // const volTensorField& gradU = tgradU();
+    // Pout << "gradU_of[1]\n" << gradU[1] << nl;
+    // Pout << "gradU_of[0][64]\n" << gradU.boundaryField()[0][64] << nl;
+    // Pout << "gradU_of[1][64]\n" << gradU.boundaryField()[1][64] << nl;
+    // Pout << "gradU_of[5][1]\n" << gradU.boundaryField()[5][1] << nl;
+    // Pout << "U[1][1]\n" << U[1] << nl;
+    // Pout << "Ubou[0][64]\n" << U.boundaryField()[0][64] << nl;
+    // Pout << "Ubou[1][64]\n" << U.boundaryField()[1][64] << nl;
+    // Pout << "Ubou[5][1]\n" << U.boundaryField()[5][1] << nl;
     // if (pimple.momentumPredictor())
     // {
     //     solve(UEqn);
-
+    //     Info << "U_CPU\n" << U << endl;
     //     K = 0.5*magSqr(U);
     // }
-    // UEqn_GPU.checkValue(false);
+    // UEqn_GPU.checkValue(true);
 #else
     start1 = std::clock();
     tmp<fvVectorMatrix> tUEqn
diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H
index 689a151d4..78a20397a 100644
--- a/applications/solvers/dfLowMachFoam/YEqn.H
+++ b/applications/solvers/dfLowMachFoam/YEqn.H
@@ -39,6 +39,7 @@ time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
     for (size_t i = 0; i < Y.size(); ++i)
     {
         volScalarField& Yi = Y[i];
+        Yi.oldTime();
         Y_old[i] = &Yi.oldTime()[0];
         if (updateBoundaryFields)
         {
diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
index 9982f3989..284dc88f0 100644
--- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
+++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
@@ -67,8 +67,8 @@ Description
 #include <thread>
 #include "upwind.H"
 
-// #define GPUSolver_
-#define CPUSolver_
+#define GPUSolver_
+// #define CPUSolver_
 
 // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * //
 
diff --git a/src/dfChemistryModel/dfChemistryModel.C b/src/dfChemistryModel/dfChemistryModel.C
index 990161187..5c735b5d6 100644
--- a/src/dfChemistryModel/dfChemistryModel.C
+++ b/src/dfChemistryModel/dfChemistryModel.C
@@ -387,8 +387,10 @@ void Foam::dfChemistryModel<ThermoType>::correctThermo()
         psi_[celli] = CanteraGas_->meanMolecularWeight()/CanteraGas_->RT();
 
         mu_[celli] = mixture_.CanteraTransport()->viscosity(); // Pa-s
+        
 
-        alpha_[celli] = mixture_.CanteraTransport()->thermalConductivity()/(CanteraGas_->cp_mass()); // kg/(m*s)
+        // alpha_[celli] = mixture_.CanteraTransport()->thermalConductivity()/(CanteraGas_->cp_mass()); // kg/(m*s) mu/alpha = 0.7
+        alpha_[celli] = mu_[celli] / 0.7;
         // thermalConductivity() W/m/K
         // cp_mass()   J/kg/K
 
@@ -457,7 +459,8 @@ void Foam::dfChemistryModel<ThermoType>::correctThermo()
 
                 pmu[facei] = mixture_.CanteraTransport()->viscosity();
 
-                palpha[facei] = mixture_.CanteraTransport()->thermalConductivity()/(CanteraGas_->cp_mass());
+                // palpha[facei] = mixture_.CanteraTransport()->thermalConductivity()/(CanteraGas_->cp_mass());
+                palpha[facei] = pmu[facei] / 0.7;
 
                 if (mixture_.transportModelName() == "UnityLewis")
                 {
diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu
index 8e39d423f..56983e038 100644
--- a/src_gpu/dfUEqn.cu
+++ b/src_gpu/dfUEqn.cu
@@ -101,10 +101,7 @@ __global__ void fvm_div_boundary(int num_cells, int num_faces, int num_boundary_
     int diag_index = csr_diag_index[cell_index];
     int csr_dim = num_cells + num_faces;
     int csr_index = row_index + diag_index;
-    if (index == 24570)
-    {
-        printf("csr_index = %d\n", csr_index);
-    }
+
     // construct internalCoeffs & boundaryCoeffs
     double internal_coeffs_x = 0;
     double internal_coeffs_y = 0;
@@ -456,12 +453,18 @@ __global__ void fvc_grad_vector_boundary(int num_cells, int num_boundary_cells,
     grad_boundary_init[index * 9 + 6] = grad[cell_index * 9 + 6];
     grad_boundary_init[index * 9 + 7] = grad[cell_index * 9 + 7];
     grad_boundary_init[index * 9 + 8] = grad[cell_index * 9 + 8];
+    // if (index == 1)
+    // {
+    //     printf("grad[1] = (%lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf)\n", grad[index * 9 + 0], grad[index * 9 + 1], grad[index * 9 + 2],
+    //             grad[index * 9 + 3], grad[index * 9 + 4], grad[index * 9 + 5], grad[index * 9 + 6], grad[index * 9 + 7], grad[index * 9 + 8]);
+    // }
 }
 
 __global__ void correct_boundary_conditions(int num_boundary_cells,
                                             const int *boundary_cell_offset, const int *boundary_cell_id,
                                             const double *boundary_sf, const double *mag_sf,
-                                            double *boundary_grad_init, double *boundary_grad)
+                                            double *boundary_grad_init, double *boundary_grad, const double *boundary_deltaCoeffs,
+                                            const double *internal_velocity, const double *boundary_velocity, const int *U_patch_type)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
     if (index >= num_boundary_cells)
@@ -469,6 +472,7 @@ __global__ void correct_boundary_conditions(int num_boundary_cells,
 
     int cell_offset = boundary_cell_offset[index];
     int next_cell_offset = boundary_cell_offset[index + 1];
+    int cell_index = boundary_cell_id[cell_offset];
 
     // initialize boundary_grad
     double grad_xx = boundary_grad_init[index * 9 + 0];
@@ -481,6 +485,10 @@ __global__ void correct_boundary_conditions(int num_boundary_cells,
     double grad_zy = boundary_grad_init[index * 9 + 7];
     double grad_zz = boundary_grad_init[index * 9 + 8];
 
+    double internal_U_x = internal_velocity[cell_index * 3 + 0];
+    double internal_U_y = internal_velocity[cell_index * 3 + 1];
+    double internal_U_z = internal_velocity[cell_index * 3 + 2];
+
     for (int i = cell_offset; i < next_cell_offset; i++)
     {
         // OpenFoam code
@@ -494,13 +502,35 @@ __global__ void correct_boundary_conditions(int num_boundary_cells,
         //      vsf.boundaryField()[patchi].snGrad()
         //      - (n & gGradbf[patchi])
         //     );
+        // template<class Type> // fixedValue
+        // Foam::tmp<Foam::Field<Type>> Foam::fvPatchField<Type>::snGrad() const
+        // {
+        //     return patch_.deltaCoeffs()*(*this - patchInternalField());
+        // }
 
         double n_x = boundary_sf[i * 3 + 0] / mag_sf[i];
         double n_y = boundary_sf[i * 3 + 1] / mag_sf[i];
         double n_z = boundary_sf[i * 3 + 2] / mag_sf[i];
-        double sn_grad_x = 0;
-        double sn_grad_y = 0;
-        double sn_grad_z = 0;
+
+        double sn_grad_x, sn_grad_y, sn_grad_z;
+        int patchIndex = U_patch_type[i];
+        if (patchIndex == 0) { // zeroGradient
+            sn_grad_x = 0;
+            sn_grad_y = 0;
+            sn_grad_z = 0;
+        } else if (patchIndex == 1) { // fixedValue
+            sn_grad_x = boundary_deltaCoeffs[i] * (boundary_velocity[i * 3 + 0] - internal_velocity[cell_index * 3 + 0]);
+            sn_grad_y = boundary_deltaCoeffs[i] * (boundary_velocity[i * 3 + 1] - internal_velocity[cell_index * 3 + 1]);
+            sn_grad_z = boundary_deltaCoeffs[i] * (boundary_velocity[i * 3 + 2] - internal_velocity[cell_index * 3 + 2]);
+            // if (index == 1)
+            // {
+            //     printf("cell_index = %d\n", cell_index);
+            //     printf("boundary_velocity = %e\n", boundary_velocity[i * 3 + 1]);
+            //     printf("internal_velocity = %e\n", internal_velocity[cell_index * 3 + 0]);
+            // }
+            
+        }
+        // TODO: implement other BCs
         double grad_correction_x = sn_grad_x - (n_x * grad_xx + n_y * grad_yx + n_z * grad_zx);
         double grad_correction_y = sn_grad_y - (n_x * grad_xy + n_y * grad_yy + n_z * grad_zy);
         double grad_correction_z = sn_grad_z - (n_x * grad_xz + n_y * grad_yz + n_z * grad_zz);
@@ -513,6 +543,12 @@ __global__ void correct_boundary_conditions(int num_boundary_cells,
         boundary_grad[i * 9 + 6] = grad_zx + n_z * grad_correction_x;
         boundary_grad[i * 9 + 7] = grad_zy + n_z * grad_correction_y;
         boundary_grad[i * 9 + 8] = grad_zz + n_z * grad_correction_z;
+        // if (index == 1)
+        // {
+        //     printf("boundary_grad = (%lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf)\n", boundary_grad[i * 9 + 0], boundary_grad[i * 9 + 1], boundary_grad[i * 9 + 2],
+        //         boundary_grad[i * 9 + 3], boundary_grad[i * 9 + 4], boundary_grad[i * 9 + 5], boundary_grad[i * 9 + 6], boundary_grad[i * 9 + 7], boundary_grad[i * 9 + 8]);
+        // }
+        
     }
 }
 
@@ -639,7 +675,6 @@ __global__ void fvc_div_tensor_internal(int num_cells,
         sum_z += sf_x * face_xz + sf_y * face_yz + sf_z * face_zz;
     }
     double vol = volume[index];
-
     b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + sum_x * sign;
     b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + sum_y * sign;
     b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + sum_z * sign;
@@ -1026,8 +1061,8 @@ __global__ void ueqn_update_BoundaryCoeffs_kernel(int num_boundary_faces, const
         internal_coeffs[index * 3 + 1] = bouPhi * 0.;
         internal_coeffs[index * 3 + 2] = bouPhi * 0.;
         boundary_coeffs[index * 3 + 0] = -bouPhi * boundary_velocity[index * 3 + 0]; // valueBoundaryCoeffs = boundaryValue
-        boundary_coeffs[index * 3 + 1] = -bouPhi * boundary_velocity[index * 3 + 0];
-        boundary_coeffs[index * 3 + 2] = -bouPhi * boundary_velocity[index * 3 + 0];
+        boundary_coeffs[index * 3 + 1] = -bouPhi * boundary_velocity[index * 3 + 1];
+        boundary_coeffs[index * 3 + 2] = -bouPhi * boundary_velocity[index * 3 + 2];
         laplac_internal_coeffs[index * 3 + 0] = -1 * bouDeltaCoeffs; // gradientInternalCoeffs = -1 * boundaryDeltaCoeffs
         laplac_internal_coeffs[index * 3 + 1] = -1 * bouDeltaCoeffs;
         laplac_internal_coeffs[index * 3 + 2] = -1 * bouDeltaCoeffs;
@@ -1189,7 +1224,7 @@ void dfUEqn::fvc_grad(double *pressure)
 
 void dfUEqn::fvc_grad_vector()
 {
-    size_t threads_per_block = 1024;
+    size_t threads_per_block = 512;
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     fvc_grad_vector_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
                                                                                 d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
@@ -1202,7 +1237,8 @@ void dfUEqn::fvc_grad_vector()
 
     correct_boundary_conditions<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
                                                                                    dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_face_vector, dataBase_.d_boundary_face,
-                                                                                   dataBase_.d_grad_boundary_init, dataBase_.d_grad_boundary);
+                                                                                   dataBase_.d_grad_boundary_init, dataBase_.d_grad_boundary, dataBase_.d_boundary_deltaCoeffs, dataBase_.d_velocity_old,
+                                                                                   dataBase_.d_boundary_velocity, dataBase_.d_boundary_UpatchType);
 }
 
 void dfUEqn::dev2T()
@@ -1305,9 +1341,9 @@ void dfUEqn::checkValue(bool print)
     if (print)
     {
         for (int i = 0; i < (num_faces + num_cells); i++)
-            fprintf(stderr, "h_A_csr[%d]: %.5e\n", i, h_A_csr[i]);
-        for (int i = 0; i < num_cells * 3; i++)
-            fprintf(stderr, "h_b[%d]: %.5e\n", i, h_b[i]);
+            fprintf(stderr, "h_A_csr[%d]: (%.10lf, %.10lf, %.10lf)\n", i, h_A_csr[i], h_A_csr[i + (num_faces + num_cells)], h_A_csr[i + 2 * (num_faces + num_cells)]);
+        for (int i = 0; i < num_cells; i++)
+            fprintf(stderr, "h_b[%d]: (%.10lf, %.10lf, %.10lf)\n", i, h_b[i], h_b[i + num_cells], h_b[i + 2 * num_cells]);
     }
 
     char *input_file = "of_output.txt";
@@ -1318,23 +1354,19 @@ void dfUEqn::checkValue(bool print)
     }
     int readfile = 0;
     double *of_b = new double[3 * num_cells];
-    double *of_A = new double[num_faces + num_cells];
+    double *of_A = new double[3 * (num_faces + num_cells)];
     readfile = fread(of_b, num_cells * 3 * sizeof(double), 1, fp);
-    readfile = fread(of_A, (num_faces + num_cells) * sizeof(double), 1, fp);
+    readfile = fread(of_A, (num_faces + num_cells) * sizeof(double) * 3, 1, fp);
 
-    std::vector<double> h_A_of_init_vec(num_cells + num_faces);
-    std::copy(of_A, of_A + num_cells + num_faces, h_A_of_init_vec.begin());
+    std::vector<double> h_A_of_init_vec(3 * (num_cells + num_faces));
+    std::copy(of_A, of_A + (num_cells + num_faces) * 3, h_A_of_init_vec.begin());
 
-    std::vector<double> h_A_of_vec_1mtx(num_faces + num_cells, 0);
+    std::vector<double> h_A_of_vec_perm(3 * (num_faces + num_cells), 0);
     for (int i = 0; i < num_faces + num_cells; i++)
     {
-        h_A_of_vec_1mtx[i] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i]];
-    }
-
-    std::vector<double> h_A_of_vec((num_faces + num_cells) * 3);
-    for (int i = 0; i < 3; i++)
-    {
-        std::copy(h_A_of_vec_1mtx.begin(), h_A_of_vec_1mtx.end(), h_A_of_vec.begin() + i * (num_faces + num_cells));
+        h_A_of_vec_perm[i] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i]];
+        h_A_of_vec_perm[i + num_faces + num_cells] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i] + num_faces + num_cells];
+        h_A_of_vec_perm[i + 2 * (num_faces + num_cells)] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i] + 2 * (num_faces + num_cells)];
     }
 
     // b
@@ -1359,16 +1391,16 @@ void dfUEqn::checkValue(bool print)
     if (print)
     {
         for (int i = 0; i < (num_faces + num_cells); i++)
-            fprintf(stderr, "h_A_of_vec_1mtx[%d]: %.5e\n", i, h_A_of_vec_1mtx[i]);
-        for (int i = 0; i < 3 * num_cells; i++)
-            fprintf(stderr, "h_b_of_vec[%d]: %.5e\n", i, h_b_of_vec[i]);
+            printf("h_A_of_vec[%d]:(%.10lf, %.10lf, %.10lf)\n", i, h_A_of_vec_perm[i], h_A_of_vec_perm[i + (num_faces + num_cells)], h_A_of_vec_perm[i + (num_faces + num_cells) * 2]);
+        for (int i = 0; i < num_cells; i++)
+            printf("h_b_of_vec[%d]: (%.10lf, %.10lf, %.10lf)\n", i, of_b[i * 3], of_b[i * 3 + 1], of_b[i * 3 + 2]);
     }
 
     // check
-    fprintf(stderr, "check of h_A_csr\n");
-    checkVectorEqual(num_faces + num_cells, h_A_of_vec_1mtx.data(), h_A_csr, 1e-5);
-    fprintf(stderr, "check of h_b\n");
-    checkVectorEqual(3 * num_cells, h_b_of_vec.data(), h_b, 1e-5);
+    // fprintf(stderr, "check of h_A_csr\n");
+    // checkVectorEqual(num_faces + num_cells, h_A_of_vec_1mtx.data(), h_A_csr, 1e-5);
+    // fprintf(stderr, "check of h_b\n");
+    // checkVectorEqual(3 * num_cells, h_b_of_vec.data(), h_b, 1e-5);
 }
 
 void dfUEqn::solve()
diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu
index 7da385a5d..990a96d1d 100644
--- a/src_gpu/dfYEqn.cu
+++ b/src_gpu/dfYEqn.cu
@@ -732,8 +732,8 @@ __global__ void yeqn_calculate_rhoD_alpha_via_nuEff_internal(int num_cells, int
     alpha[index] = rhoD[index];
 }
 
-__global__ void yeqn_calculate_rhoD_alpha_via_nuEff_boundary(int num_boundary_face, int num_species, double *boundary_rhoD, const double *boundary_nuEff,
-        const double *boundary_rho, double *boundary_alpha)    
+__global__ void yeqn_calculate_rhoD_alpha_via_nuEff_boundary(int num_boundary_face, int num_species, int *permutIndex,
+        double *boundary_rhoD, const double *boundary_nuEff, const double *boundary_rho, double *boundary_alpha)    
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
     if (index >= num_boundary_face)
@@ -743,7 +743,7 @@ __global__ void yeqn_calculate_rhoD_alpha_via_nuEff_boundary(int num_boundary_fa
     //     boundary_rhoD[i * num_boundary_face + index] = boundary_nuEff[index] * boundary_rho[index] / 0.7;
     // }
 
-    boundary_alpha[index] = boundary_rhoD[index];
+    boundary_alpha[index] = boundary_rhoD[permutIndex[index]];
 }
 
 dfYEqn::dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile, const int inertIndex)
@@ -916,7 +916,7 @@ void dfYEqn::fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(std::vecto
 
     blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block;
     yeqn_calculate_rhoD_alpha_via_nuEff_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(
-            num_boundary_faces, num_species,
+            num_boundary_faces, num_species, dataBase_.d_bouPermedIndex,
             d_boundary_rhoD, dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_alpha);
     clock_t end = std::clock();
     fprintf(stderr, "GPU memcpy time in YEqn = %lf s\n", double(end - start) / double(CLOCKS_PER_SEC));