From 03744a4d059d0264fc71eaf78a9960d0ac5d6dce Mon Sep 17 00:00:00 2001
From: maorz1998 <maorz1998@stu.pku.edu.cn>
Date: Tue, 22 Aug 2023 18:07:59 +0800
Subject: [PATCH 1/3] fix bugs in UEqn

---
 applications/solvers/dfLowMachFoam/UEqn.H     |  40 +++---
 .../solvers/dfLowMachFoam/dfLowMachFoam.C     |   2 +-
 applications/solvers/dfLowMachFoam/new_UEqn.H | 116 ------------------
 .../solvers/dfLowMachFoam/new_dfLowMachFoam.C | 113 -----------------
 src_gpu/dfUEqn.cu                             |   2 +-
 5 files changed, 24 insertions(+), 249 deletions(-)
 delete mode 100644 applications/solvers/dfLowMachFoam/new_UEqn.H
 delete mode 100644 applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C
diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H
index 38934abdb..4e8c090ad 100644
--- a/applications/solvers/dfLowMachFoam/UEqn.H
+++ b/applications/solvers/dfLowMachFoam/UEqn.H
@@ -177,30 +177,34 @@
     // postProcess
     TICK_START;
     UEqn_GPU.postProcess(h_u);
+    memcpy(&U[0][0], h_u, dfDataBase.cell_value_vec_bytes);
     U.correctBoundaryConditions();
+    K = 0.5*magSqr(U);
     DEBUG_TRACE;
     TICK_STOP(post process time);
 
     // checkResult
     // TODO: for temp, now we compare ldu, finally we compare csr
-    std::vector<double> h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3);
-    std::vector<double> h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3);
-    offset = 0;
-    for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++)
-    {
-        int patchsize = dfDataBase.patch_size[patchi];
-        const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0];
-        const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0];
-        memcpy(h_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double));
-        memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double));
-        offset += patchsize;
-    }
-    bool printFlag = false;
-    UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0],
-            h_internal_coeffs.data(), h_boundary_coeffs.data(), 
-            // &DivTensor[0][0], 
-            printFlag);
-    DEBUG_TRACE;
+    // std::vector<double> h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3);
+    // std::vector<double> h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3);
+    // Info << "location 0 " << endl;
+    // offset = 0;
+    // for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++)
+    // {
+    //     int patchsize = dfDataBase.patch_size[patchi];
+    //     const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0];
+    //     const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0];
+    //     memcpy(h_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double));
+    //     memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double));
+    //     offset += patchsize;
+    // }
+    // bool printFlag = false;
+    // UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0],
+    //         h_internal_coeffs.data(), h_boundary_coeffs.data(), 
+    //         // &DivTensor[0][0], 
+    //         printFlag);
+    // DEBUG_TRACE;
+
 #else
     start1 = std::clock();
     tmp<fvVectorMatrix> tUEqn
diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
index 6ea4251af..4a17ab215 100644
--- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
+++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
@@ -60,7 +60,7 @@ Description
 #include "basicThermo.H"
 #include "CombustionModel.H"
 
-#define GPUSolverNew_
+// #define GPUSolverNew_
 #define TIME
 
 #ifdef GPUSolverNew_
diff --git a/applications/solvers/dfLowMachFoam/new_UEqn.H b/applications/solvers/dfLowMachFoam/new_UEqn.H
deleted file mode 100644
index 41b804a4b..000000000
--- a/applications/solvers/dfLowMachFoam/new_UEqn.H
+++ /dev/null
@@ -1,116 +0,0 @@
-#ifdef GPUSolver_
-const tmp<volScalarField> nuEff_tmp(turbulence->nuEff());
-const volScalarField& nuEff = nuEff_tmp();
-
-// run CPU, for temp
-tmp<fvVectorMatrix> tUEqn
-(
-    fvm::ddt(rho, U) + fvm::div(phi, U)
-    +  turbulence->divDevRhoReff(U)
-    == -fvc::grad(p)
-);
-// tmp<fvVectorMatrix> tUEqn_ref // test turbulence->divDevRhoReff(U)
-// (
-//     - fvc::div((turbulence->rho()*turbulence->nuEff())*dev2(Foam::T(fvc::grad(U))))
-//     - fvm::laplacian(turbulence->rho()*turbulence->nuEff(), U)
-// );
-
-fvVectorMatrix& UEqn = tUEqn.ref();
-
-// run GPU
-// preProcess
-// TODO: preProcessForRhoEqn for temp, now we only transfer phi(instead of rhoEqn) used by fvm::div(phi, U)
-UEqn_GPU.sync();
-double *h_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::internal);
-double *h_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::internal);
-double *h_boundary_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::boundary);
-memcpy(h_rho, &rho[0], dfDataBase.cell_value_bytes);
-memcpy(h_phi, &phi[0], dfDataBase.surface_value_bytes);
-int offset = 0;
-forAll(phi.boundaryField(), patchi)
-{
-    const fvsPatchScalarField& patchPhi = phi.boundaryField()[patchi];
-    int patchsize = patchPhi.size();
-    memcpy(h_boundary_phi + offset, &patchPhi[0], patchsize * sizeof(double));
-    offset += patchsize;
-}
-UEqn_GPU.preProcessForRhoEqn(h_rho, h_phi, h_boundary_phi);
-DEBUG_TRACE;
-clock_t start = std::clock();
-// preparing u, p, nu_eff, and rho.boundary used by UEqn_GPU.preProcess()
-double *h_u = dfDataBase.getFieldPointer("u", location::cpu, position::internal);
-double *h_boundary_u = dfDataBase.getFieldPointer("u", location::cpu, position::boundary);
-double *h_p = dfDataBase.getFieldPointer("p", location::cpu, position::internal);
-double *h_boundary_p = dfDataBase.getFieldPointer("p", location::cpu, position::boundary);
-double *h_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::internal);
-double *h_boundary_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::boundary);
-double *h_boundary_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::boundary);
-double end = std::clock();
-Info << "get pointer" << double(end - start) / double(CLOCKS_PER_SEC) << endl;
-
-start = std::clock();
-memcpy(h_u, &U.oldTime()[0][0], dfDataBase.cell_value_vec_bytes);
-memcpy(h_p, &p[0], dfDataBase.cell_value_bytes);
-memcpy(h_nu_eff, &nuEff[0], dfDataBase.cell_value_bytes);
-end = std::clock();
-Info << "copy to pinned memory" << double(end - start) / double(CLOCKS_PER_SEC) << endl;
-
-start = std::clock();
-offset = 0;
-forAll(U.boundaryField(), patchi)
-{
-    const fvPatchVectorField& patchU = U.boundaryField()[patchi];
-    const fvPatchScalarField& patchP = p.boundaryField()[patchi];
-    const fvPatchScalarField& patchNuEff = nuEff.boundaryField()[patchi];
-    const fvPatchScalarField& patchRho = rho.boundaryField()[patchi];
-    int patchsize = patchU.size();
-    memcpy(h_boundary_u + offset * 3, &patchU[0][0], patchsize * 3 * sizeof(double));
-    memcpy(h_boundary_p + offset, &patchP[0], patchsize * sizeof(double));
-    memcpy(h_boundary_nu_eff + offset, &patchNuEff[0], patchsize * sizeof(double));
-    memcpy(h_boundary_rho + offset, &patchRho[0], patchsize * sizeof(double));
-    offset += patchsize;
-}
-end = std::clock();
-Info << "CPU prepare boundary time" << double(end - start) / double(CLOCKS_PER_SEC) << endl;
-
-start = std::clock();
-UEqn_GPU.preProcess(h_u, h_boundary_u, h_p, h_boundary_p, h_nu_eff, h_boundary_nu_eff, h_boundary_rho);
-DEBUG_TRACE;
-UEqn_GPU.sync();
-end = std::clock();
-Info << "GPU preProcess time" << double(end - start) / double(CLOCKS_PER_SEC) << endl;
-
-// process
-start = std::clock();
-UEqn_GPU.process();
-end = std::clock();
-DEBUG_TRACE;
-UEqn_GPU.sync();
-// end = std::clock();
-Info << "GPU process time" << double(end - start) / double(CLOCKS_PER_SEC) << endl;
-
-// postProcess
-UEqn_GPU.postProcess(h_u);
-DEBUG_TRACE;
-
-// checkResult
-// TODO: for temp, now we compare ldu, finally we compare csr
-std::vector<double> h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3);
-std::vector<double> h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3);
-offset = 0;
-for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++)
-{
-    int patchsize = dfDataBase.patch_size[patchi];
-    const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0];
-    const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0];
-    memcpy(h_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double));
-    memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double));
-    offset += patchsize;
-}
-bool printFlag = true;
-UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0],
-        h_internal_coeffs.data(), h_boundary_coeffs.data(), 
-        // &DivTensor[0][0], 
-        printFlag);
-DEBUG_TRACE;
-#endif
diff --git a/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C
deleted file mode 100644
index 7d867687f..000000000
--- a/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C
+++ /dev/null
@@ -1,113 +0,0 @@
-/*---------------------------------------------------------------------------*\
-  =========                 |
-  \\      /  F ield         | OpenFOAM: The Open Source CFD Toolbox
-   \\    /   O peration     | Website:  https://openfoam.org
-    \\  /    A nd           | Copyright (C) 2011-2019 OpenFOAM Foundation
-     \\/     M anipulation  |
--------------------------------------------------------------------------------
-License
-    This file is part of OpenFOAM.
-
-    OpenFOAM is free software: you can redistribute it and/or modify it
-    under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    OpenFOAM is distributed in the hope that it will be useful, but WITHOUT
-    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-    for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with OpenFOAM.  If not, see <http://www.gnu.org/licenses/>.
-
-Application
-    unittest
-
-Description
-    GPU unittest
-
-\*---------------------------------------------------------------------------*/
-
-#include "dfChemistryModel.H"
-#include "CanteraMixture.H"
-// #include "hePsiThermo.H"
-#include "heRhoThermo.H"
-
-#include "fvCFD.H"
-#include "fluidThermo.H"
-#include "turbulentFluidThermoModel.H"
-#include "pimpleControl.H"
-#include "pressureControl.H"
-#include "localEulerDdtScheme.H"
-#include "fvcSmooth.H"
-#include "PstreamGlobals.H"
-#include "basicThermo.H"
-#include "CombustionModel.H"
-
-#include <cuda_runtime.h>
-#include <thread>
-#include "upwind.H"
-
-#include "dfMatrixDataBase.H"
-#include "dfMatrixOpBase.H"
-#include "GenFvMatrix.H"
-#include "dfUEqn.H"
-#include "createGPUSolver.H"
-
-#define GPUSolver_
-
-int main(int argc, char *argv[])
-{
-#ifdef USE_PYTORCH
-    pybind11::scoped_interpreter guard{};//start python interpreter
-#endif
-    #include "postProcess.H"
-
-    // #include "setRootCaseLists.H"
-    #include "listOptions.H"
-    #include "setRootCase2.H"
-    #include "listOutput.H"
-
-    #include "createTime.H"
-    #include "createMesh.H"
-    #include "createDyMControls.H"
-    #include "initContinuityErrs.H"
-    #include "createFields.H"
-    #include "createRhoUfIfPresent.H"
-
-    turbulence->validate();
-
-    if (!LTS)
-    {
-        #include "compressibleCourantNo.H"
-        #include "setInitialDeltaT.H"
-    }
-
-    // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * //
-    {
-        #include "readDyMControls.H"
-
-        if (LTS)
-        {
-            #include "setRDeltaT.H"
-        }
-        else
-        {
-            #include "compressibleCourantNo.H"
-            #include "setDeltaT.H"
-        }
-
-        createGPUBase(mesh, Y);
-        createGPUUEqn(CanteraTorchProperties, U);
-
-        // for (int timestep = 0; timestep < 10; timestep++) {
-            dfDataBase.preTimeStep(&rho.oldTime()[0]);
-            #include "new_UEqn.H"
-            dfDataBase.postTimeStep();
-        // }
-    }
-    return 0;
-}
-
-
diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu
index d30c06131..5b97d259d 100644
--- a/src_gpu/dfUEqn.cu
+++ b/src_gpu/dfUEqn.cu
@@ -203,7 +203,7 @@ void dfUEqn::solve() {
 
 void dfUEqn::postProcess(double *h_u) { // TODO: Here may be a bug
     permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, d_permute, dataBase_.d_u);
-    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_u, d_permute, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream));
+    checkCudaErrors(cudaMemcpyAsync(h_u, dataBase_.d_u, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream));
     checkCudaErrors(cudaStreamSynchronize(dataBase_.stream));
 
     // some boundary conditions may also need vf.boundary, deltaCoeffs.boundary, and weight.boundary

From 5c7daa58cc2198962077f8a8ec8cd7d8c111fe7e Mon Sep 17 00:00:00 2001
From: maorz1998 <maorz1998@stu.pku.edu.cn>
Date: Tue, 22 Aug 2023 21:11:04 +0800
Subject: [PATCH 2/3] opt ldu2csr

---
 src_gpu/dfMatrixOpBase.H  |  2 +-
 src_gpu/dfMatrixOpBase.cu | 49 +++++++++++++++++++++------------------
 src_gpu/dfUEqn.cu         | 13 +++++------
 3 files changed, 33 insertions(+), 31 deletions(-)

diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H
index 71dd82c38..7a98a685d 100644
--- a/src_gpu/dfMatrixOpBase.H
+++ b/src_gpu/dfMatrixOpBase.H
@@ -15,7 +15,7 @@ void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int num_bo
         const int* boundary_cell_face, const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index,
         const double *lower, const double *upper, const double *diag, const double *source,
         const double *internal_coeffs, const double *boundary_coeffs,
-        double *A, double *b, double *diag_vec);
+        double *A, double *b);
 
 void update_boundary_coeffs_vector(cudaStream_t stream, int num_boundary_surfaces, int num_patches,
         const int *patch_size, const int *patch_type,
diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu
index e3616fac3..54a9b6893 100644
--- a/src_gpu/dfMatrixOpBase.cu
+++ b/src_gpu/dfMatrixOpBase.cu
@@ -805,14 +805,17 @@ __global__ void constructVecDiag(int num_cells, const double *diag, double *diag
     b[num_cells * 2 + index] = source[num_cells * 2 + index];
 }
 
-__global__ void addBoundaryDiagSrc(int num_cells, int num_boundary_surfaces, const int *face2Cells, 
-        const double *internal_coeffs, const double *boundary_coeffs, double *diag_vec, double *b)
+__global__ void addBoundaryDiagSrc(int num_cells, int num_surfaces, int num_boundary_surfaces, const int *face2Cells, 
+        const double *internal_coeffs, const double *boundary_coeffs, const int *diagCSRIndex, 
+        double *A_csr, double *b)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
     if (index >= num_boundary_surfaces)
         return;
     
     int cellIndex = face2Cells[index];
+    int diagIndex = diagCSRIndex[cellIndex];
+    int nNz = num_cells + 2 * num_surfaces;
 
     double internalCoeffx = internal_coeffs[num_boundary_surfaces * 0 + index];
     double internalCoeffy = internal_coeffs[num_boundary_surfaces * 1 + index];
@@ -822,9 +825,9 @@ __global__ void addBoundaryDiagSrc(int num_cells, int num_boundary_surfaces, con
     double boundaryCoeffy = boundary_coeffs[num_boundary_surfaces * 1 + index];
     double boundaryCoeffz = boundary_coeffs[num_boundary_surfaces * 2 + index];
 
-    atomicAdd(&diag_vec[num_cells * 0 + cellIndex], internalCoeffx);
-    atomicAdd(&diag_vec[num_cells * 1 + cellIndex], internalCoeffy);
-    atomicAdd(&diag_vec[num_cells * 2 + cellIndex], internalCoeffz);
+    atomicAdd(&A_csr[nNz * 0 + diagIndex], internalCoeffx);
+    atomicAdd(&A_csr[nNz * 1 + diagIndex], internalCoeffy);
+    atomicAdd(&A_csr[nNz * 2 + diagIndex], internalCoeffz);
 
     atomicAdd(&b[num_cells * 0 + cellIndex], boundaryCoeffx);
     atomicAdd(&b[num_cells * 1 + cellIndex], boundaryCoeffy);
@@ -842,8 +845,8 @@ __global__ void ldu_to_csr_offDiag(int num_cells, int num_surfaces,
 
     int uppIndex = uppCSRIndex[index];
     int lowIndex = lowCSRIndex[index];
-    int upp = upper[index];
-    int low = lower[index];
+    double upp = upper[index];
+    double low = lower[index];
     A_csr[(num_cells + 2 * num_surfaces) * 0 + uppIndex] = upper[index];
     A_csr[(num_cells + 2 * num_surfaces) * 1 + uppIndex] = upper[index];
     A_csr[(num_cells + 2 * num_surfaces) * 2 + uppIndex] = upper[index];
@@ -854,17 +857,22 @@ __global__ void ldu_to_csr_offDiag(int num_cells, int num_surfaces,
 }
 
 __global__ void ldu_to_csr_Diag(int num_cells, int num_surfaces, 
-        const int *diagCSRIndex, const double *diag_vec,
-        double *A_csr)
+        const int *diagCSRIndex, const double *diag, const double *source, 
+        double *A_csr, double *b)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
     if (index >= num_cells)
         return;
 
     int diagIndex = diagCSRIndex[index];
-    A_csr[(num_cells + 2 * num_surfaces) * 0 + diagIndex] = diag_vec[num_cells * 0 + index];
-    A_csr[(num_cells + 2 * num_surfaces) * 1 + diagIndex] = diag_vec[num_cells * 1 + index];
-    A_csr[(num_cells + 2 * num_surfaces) * 2 + diagIndex] = diag_vec[num_cells * 2 + index];
+    double diagVal = diag[index];
+    A_csr[(num_cells + 2 * num_surfaces) * 0 + diagIndex] = diagVal;
+    A_csr[(num_cells + 2 * num_surfaces) * 1 + diagIndex] = diagVal;
+    A_csr[(num_cells + 2 * num_surfaces) * 2 + diagIndex] = diagVal;
+
+    b[num_cells * 0 + index] = source[num_cells * 0 + index];
+    b[num_cells * 1 + index] = source[num_cells * 1 + index];
+    b[num_cells * 2 + index] = source[num_cells * 2 + index];
 }
 
 
@@ -907,28 +915,23 @@ void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int num_bo
         const int* boundary_cell_face, const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index,
         const double *lower, const double *upper, const double *diag, const double *source,
         const double *internal_coeffs, const double *boundary_coeffs,
-        double *A, double *b, double *diag_vec)
+        double *A, double *b)
 {
-    // construct new diag with size of 3*num_cells
+    // construct diag
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    constructVecDiag<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, diag, diag_vec, source, b);
+    ldu_to_csr_Diag<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_surfaces, 
+            diag_to_csr_index, diag, source, A, b);
 
     // add coeff to source and diagnal
     blocks_per_grid = (num_boundary_surface + threads_per_block - 1) / threads_per_block;
-    addBoundaryDiagSrc<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_surface, 
-            boundary_cell_face, internal_coeffs, boundary_coeffs, diag_vec, b);
+    addBoundaryDiagSrc<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_surfaces, num_boundary_surface, 
+            boundary_cell_face, internal_coeffs, boundary_coeffs, diag_to_csr_index, A, b);
     
     // convert offdiag
     blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block;
     ldu_to_csr_offDiag<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_surfaces, 
             lower_to_csr_index, upper_to_csr_index, lower, upper, A);
-
-    // convert diag
-    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    ldu_to_csr_Diag<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_surfaces, 
-            diag_to_csr_index, diag_vec, A);
-
 }
 
 void update_boundary_coeffs_vector(cudaStream_t stream, int num_boundary_surfaces, int num_patches,
diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu
index 5b97d259d..837a7587a 100644
--- a/src_gpu/dfUEqn.cu
+++ b/src_gpu/dfUEqn.cu
@@ -50,7 +50,6 @@ void dfUEqn::createNonConstantLduAndCsrFields() {
   checkCudaErrors(cudaMalloc((void**)&d_upper, dataBase_.surface_value_bytes));
   checkCudaErrors(cudaMalloc((void**)&d_diag, dataBase_.cell_value_bytes));
   checkCudaErrors(cudaMalloc((void**)&d_source, dataBase_.cell_value_vec_bytes));
-  checkCudaErrors(cudaMalloc((void**)&d_diag_vector, dataBase_.cell_value_vec_bytes));
   checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes));
   checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes));
   checkCudaErrors(cudaMalloc((void**)&d_A, dataBase_.csr_value_vec_bytes));
@@ -88,7 +87,6 @@ void dfUEqn::preProcess(const double *h_u, const double *h_boundary_u, const dou
   checkCudaErrors(cudaMemsetAsync(d_boundary_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream));
   checkCudaErrors(cudaMemsetAsync(d_A, 0, dataBase_.csr_value_vec_bytes, dataBase_.stream));
   checkCudaErrors(cudaMemsetAsync(d_b, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream));
-  checkCudaErrors(cudaMemsetAsync(d_diag_vector, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); // TODO: maybe a better way
 }
 
 void dfUEqn::process() {
@@ -140,15 +138,11 @@ void dfUEqn::process() {
                dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_source, // end for internal
                dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(),
                dataBase_.d_boundary_face_cell, d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume);
-        // fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells,
-        //        dataBase_.d_volume, d_fvc_output, d_source);
         fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces,
                 dataBase_.d_owner, dataBase_.d_neighbor,
                 dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_source,
                 dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(),
                 dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume, -1.);
-        // fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells,
-        //         dataBase_.d_volume, d_fvc_output, d_source);
 
 #ifndef TIME_GPU
         checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph));
@@ -177,11 +171,16 @@ void dfUEqn::solve() {
     ldu_to_csr(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces,
             dataBase_.d_boundary_face_cell,
             dataBase_.d_lower_to_csr_index, dataBase_.d_upper_to_csr_index, dataBase_.d_diag_to_csr_index,
-            d_lower, d_upper, d_diag, d_source, d_internal_coeffs, d_boundary_coeffs, d_A, d_b, d_diag_vector);
+            d_lower, d_upper, d_diag, d_source, d_internal_coeffs, d_boundary_coeffs, d_A, d_b);
 
     int nNz = dataBase_.num_cells + dataBase_.num_surfaces * 2; // matrix entries
     sync();
 
+    // double *h_A_csr = new double[nNz * 3];
+    // checkCudaErrors(cudaMemcpy(h_A_csr, d_A, dataBase_.csr_value_vec_bytes, cudaMemcpyDeviceToHost));
+    // for (int i = 0; i < nNz; i++)
+    //     fprintf(stderr, "h_A_csr[%d]: (%.10lf, %.10lf, %.10lf)\n", i, h_A_csr[i], h_A_csr[i + nNz], h_A_csr[i + 2 * nNz]);
+
     if (num_iteration == 0)                                     // first interation
     {
         printf("Initializing AmgX Linear Solver\n");

From 920fc84d8402a21c3c9338684be91549fdb11e9f Mon Sep 17 00:00:00 2001
From: maorz1998 <maorz1998@stu.pku.edu.cn>
Date: Sun, 27 Aug 2023 16:44:38 +0800
Subject: [PATCH 3/3] fvMatrix.A(), fvMatrix.H(), opt ldu2csr

---
 applications/solvers/dfLowMachFoam/UEqn.H     |   2 +-
 .../solvers/dfLowMachFoam/dfLowMachFoam.C     |   2 +-
 applications/solvers/dfLowMachFoam/pEqn.H     |  43 ++++
 src_gpu/dfMatrixDataBase.H                    |   1 +
 src_gpu/dfMatrixDataBase.cu                   |  36 ++-
 src_gpu/dfMatrixOpBase.H                      |  19 +-
 src_gpu/dfMatrixOpBase.cu                     | 241 ++++++++++++++----
 src_gpu/dfUEqn.H                              |  10 +
 src_gpu/dfUEqn.cu                             |  54 +++-
 9 files changed, 335 insertions(+), 73 deletions(-)

diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H
index 4e8c090ad..ea15c8b43 100644
--- a/applications/solvers/dfLowMachFoam/UEqn.H
+++ b/applications/solvers/dfLowMachFoam/UEqn.H
@@ -135,7 +135,7 @@
 
     TICK_START;
     U.oldTime();
-    memcpy(h_u, &U.oldTime()[0][0], dfDataBase.cell_value_vec_bytes);
+    memcpy(h_u, &U.oldTime()[0][0], dfDataBase.cell_value_vec_bytes); 
     memcpy(h_p, &p[0], dfDataBase.cell_value_bytes);
     memcpy(h_nu_eff, &nuEff[0], dfDataBase.cell_value_bytes);
     TICK_STOP(copy to pinned memory);
diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
index 4a17ab215..6ea4251af 100644
--- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
+++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
@@ -60,7 +60,7 @@ Description
 #include "basicThermo.H"
 #include "CombustionModel.H"
 
-// #define GPUSolverNew_
+#define GPUSolverNew_
 #define TIME
 
 #ifdef GPUSolverNew_
diff --git a/applications/solvers/dfLowMachFoam/pEqn.H b/applications/solvers/dfLowMachFoam/pEqn.H
index 34925327f..cc35d93fe 100644
--- a/applications/solvers/dfLowMachFoam/pEqn.H
+++ b/applications/solvers/dfLowMachFoam/pEqn.H
@@ -67,11 +67,52 @@ const volScalarField psip0(psi*p);
     time_monitor_UEqn_A_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC);
 #endif
 
+#ifdef GPUSolverNew_
+volVectorField UEqn_H
+(
+    IOobject
+    (
+        "H("+U.name()+')',
+        runTime.timeName(),
+        mesh,
+        IOobject::NO_READ,
+        IOobject::NO_WRITE
+    ),
+    mesh,
+    dimensionedVector(dimensionSet(1,-2,-2,0,0,0,0), Zero),
+    extrapolatedCalculatedFvPatchScalarField::typeName
+);
+UEqn_GPU.H(&UEqn_H[0][0]);
+UEqn_H.correctBoundaryConditions();
+
+volScalarField UEqn_A
+(
+    IOobject
+    (
+        "A("+U.name()+')',
+        runTime.timeName(),
+        mesh,
+        IOobject::NO_READ,
+        IOobject::NO_WRITE
+    ),
+    mesh,
+    dimensionedScalar(dimensionSet(1,-3,-1,0,0,0,0), Zero),
+    extrapolatedCalculatedFvPatchScalarField::typeName
+);
+UEqn_GPU.A(&UEqn_A[0]);
+UEqn_A.correctBoundaryConditions();
+#endif
+
+
 start2 = std::clock();
 #ifdef GPUSolver_
     volScalarField rAU(1.0/UEqn_A);
     surfaceScalarField rhorAUf("rhorAUf", fvc::interpolate(rho*rAU));
     volVectorField HbyA(constrainHbyA(rAU*UEqn_H, U, p));
+#elif defined GPUSolverNew_
+    volScalarField rAU(1.0/UEqn_A);
+    surfaceScalarField rhorAUf("rhorAUf", fvc::interpolate(rho*rAU));
+    volVectorField HbyA(constrainHbyA(rAU*UEqn_H, U, p));
 #else
     volScalarField rAU(1.0/UEqn.A());
     surfaceScalarField rhorAUf("rhorAUf", fvc::interpolate(rho*rAU));
@@ -179,6 +220,8 @@ K = 0.5*magSqr(U);
     end1 = std::clock();
     time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
     time_monitor_UEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC);
+#elif defined GPUSolverNew_
+    UEqn_GPU.correctPsi(&U[0][0]);
 #endif
 
 if (pimple.simpleRho())
diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H
index cac7264a8..a4eeee70a 100644
--- a/src_gpu/dfMatrixDataBase.H
+++ b/src_gpu/dfMatrixDataBase.H
@@ -103,6 +103,7 @@ struct dfMatrixDataBase
     int *d_lower_to_csr_index = nullptr;
     int *d_diag_to_csr_index= nullptr;
     int *d_upper_to_csr_index= nullptr;
+    int *d_ldu_to_csr_index = nullptr;
     int *d_csr_row_index= nullptr;
     int *d_csr_col_index= nullptr;
 
diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu
index 4e49faf99..1c1ee5ce0 100644
--- a/src_gpu/dfMatrixDataBase.cu
+++ b/src_gpu/dfMatrixDataBase.cu
@@ -160,14 +160,48 @@ void dfMatrixDataBase::setConstantIndexes(const int *owner, const int *neighbor)
         diagCSRIndex.push_back(diagIndexInCSR);
         CSRColIndex[diagIndexInCSR] = i; // fill diag entry in CSRColIndex
     }
-    CSRRowIndex.push_back(2 * num_surfaces + num_cells);
+    int nNz = 2 * num_surfaces + num_cells;
+    CSRRowIndex.push_back(nNz);
+
+    // get reverseIndex from csr to ldu (low + diag + upp)
+    std::vector<int> CSRIndex;
+    CSRIndex.insert(CSRIndex.end(), lowCSRIndex.begin(), lowCSRIndex.end());
+    CSRIndex.insert(CSRIndex.end(), diagCSRIndex.begin(), diagCSRIndex.end());
+    CSRIndex.insert(CSRIndex.end(), uppCSRIndex.begin(), uppCSRIndex.end());
+
+    std::vector<int> lduCSRIndexPermInit(nNz);
+    std::iota(lduCSRIndexPermInit.begin(), lduCSRIndexPermInit.end(), 0);
+
+    std::multimap<int,int> IndexPermutation;
+    for (int i = 0; i < nNz; ++i){
+        IndexPermutation.insert(std::make_pair(CSRIndex[i], lduCSRIndexPermInit[i]));
+    }
+    std::vector<std::pair<int, int>> IndexPermPair(IndexPermutation.begin(), IndexPermutation.end());
+    std::sort(IndexPermPair.begin(), IndexPermPair.end(), []
+    (const std::pair<int, int>& pair1, const std::pair<int, int>& pair2){
+        return pair1.first < pair2.first;});
+
+    std::vector<int> lduCSRIndex;
+    std::transform(IndexPermPair.begin(), IndexPermPair.end(), std::back_inserter(lduCSRIndex), []
+        (const std::pair<int, int>& pair) {
+        return pair.second;
+    });
+
+    std::vector<int> test;
+    std::transform(IndexPermPair.begin(), IndexPermPair.end(), std::back_inserter(test), []
+        (const std::pair<int, int>& pair) {
+        return pair.first;
+    });
+    
 
     checkCudaErrors(cudaMalloc((void**)&d_lower_to_csr_index, surface_index_bytes));
     checkCudaErrors(cudaMalloc((void**)&d_diag_to_csr_index, cell_index_bytes));
     checkCudaErrors(cudaMalloc((void**)&d_upper_to_csr_index, surface_index_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_ldu_to_csr_index, csr_col_index_bytes));
     checkCudaErrors(cudaMemcpyAsync(d_lower_to_csr_index, lowCSRIndex.data(), surface_index_bytes, cudaMemcpyHostToDevice, stream));
     checkCudaErrors(cudaMemcpyAsync(d_diag_to_csr_index, diagCSRIndex.data(), cell_index_bytes, cudaMemcpyHostToDevice, stream));
     checkCudaErrors(cudaMemcpyAsync(d_upper_to_csr_index, uppCSRIndex.data(), surface_index_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_ldu_to_csr_index, lduCSRIndex.data(), csr_col_index_bytes, cudaMemcpyHostToDevice, stream));
 
 
     // build d_csr_row_index, d_csr_col_index
diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H
index 7a98a685d..c22fd5c4f 100644
--- a/src_gpu/dfMatrixOpBase.H
+++ b/src_gpu/dfMatrixOpBase.H
@@ -1,5 +1,6 @@
 #pragma once
-// #define TIME_GPU
+#include <cuda_runtime.h>
+#define TIME_GPU
 
 // tools
 void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output);
@@ -12,9 +13,8 @@ void field_multiply_scalar(cudaStream_t stream,
 void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volume, const double *fvc_output, double *source);
 
 void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surface,
-        const int* boundary_cell_face, const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index,
-        const double *lower, const double *upper, const double *diag, const double *source,
-        const double *internal_coeffs, const double *boundary_coeffs,
+        const int* boundary_cell_face, const int *ldu_to_csr_index, const int *diag_to_csr_index,
+        const double *ldu, const double *source, const double *internal_coeffs, const double *boundary_coeffs,
         double *A, double *b);
 
 void update_boundary_coeffs_vector(cudaStream_t stream, int num_boundary_surfaces, int num_patches,
@@ -86,3 +86,14 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces,
 // others
 void scale_dev2T_tensor(cudaStream_t stream, int num_cells, const double *vf1, double *vf2,
         int num_boundary_surfaces, const double *boundary_vf1, double *boundary_vf2);
+
+void fvMtx_A(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, 
+        const int *boundary_cell_face, const double *internal_coeffs, const double *volume, const double *diag, 
+        double *A_pEqn, double sign = 1.);
+
+void fvMtx_H(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, 
+        const int *lowerAddr, const int *upperAddr, const double *volume,
+        int num_patches, const int *patch_size, const int *patch_type,
+        const int *boundary_cell_face, const double *internal_coffs, const double *boundary_coeffs, 
+        const double *lower, const double *upper, const double *source, const double *psi, 
+        double *H_pEqn, double *H_pEqn_perm, double sign = 1.);
diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu
index 54a9b6893..08ff4aefc 100644
--- a/src_gpu/dfMatrixOpBase.cu
+++ b/src_gpu/dfMatrixOpBase.cu
@@ -789,22 +789,6 @@ __global__ void fvc_div_cell_tensor_boundary(int num_cells, int num_boundary_fac
     // }
 }
 
-__global__ void constructVecDiag(int num_cells, const double *diag, double *diag_vec, 
-        const double *source, double *b)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
-        return;
-    
-    diag_vec[num_cells * 0 + index] = diag[index];
-    diag_vec[num_cells * 1 + index] = diag[index];
-    diag_vec[num_cells * 2 + index] = diag[index];
-
-    b[num_cells * 0 + index] = source[num_cells * 0 + index];
-    b[num_cells * 1 + index] = source[num_cells * 1 + index];
-    b[num_cells * 2 + index] = source[num_cells * 2 + index];
-}
-
 __global__ void addBoundaryDiagSrc(int num_cells, int num_surfaces, int num_boundary_surfaces, const int *face2Cells, 
         const double *internal_coeffs, const double *boundary_coeffs, const int *diagCSRIndex, 
         double *A_csr, double *b)
@@ -834,47 +818,128 @@ __global__ void addBoundaryDiagSrc(int num_cells, int num_surfaces, int num_boun
     atomicAdd(&b[num_cells * 2 + cellIndex], boundaryCoeffz);
 }
 
-__global__ void ldu_to_csr_offDiag(int num_cells, int num_surfaces,
-        const int *lowCSRIndex, const int *uppCSRIndex,
-        const double *lower, const double *upper,
-        double *A_csr)
+__global__ void ldu_to_csr_kernel(int nNz, const int *ldu_to_csr_index, 
+        const double *ldu, double *A_csr)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_surfaces)
+    if (index >= nNz)
         return;
+    
+    int lduIndex = ldu_to_csr_index[index];
+    double csrVal = ldu[lduIndex];
+    A_csr[nNz * 0 + index] = csrVal;
+    A_csr[nNz * 1 + index] = csrVal;
+    A_csr[nNz * 2 + index] = csrVal;
+}
 
-    int uppIndex = uppCSRIndex[index];
-    int lowIndex = lowCSRIndex[index];
-    double upp = upper[index];
-    double low = lower[index];
-    A_csr[(num_cells + 2 * num_surfaces) * 0 + uppIndex] = upper[index];
-    A_csr[(num_cells + 2 * num_surfaces) * 1 + uppIndex] = upper[index];
-    A_csr[(num_cells + 2 * num_surfaces) * 2 + uppIndex] = upper[index];
-
-    A_csr[(num_cells + 2 * num_surfaces) * 0 + lowIndex] = lower[index];
-    A_csr[(num_cells + 2 * num_surfaces) * 1 + lowIndex] = lower[index];
-    A_csr[(num_cells + 2 * num_surfaces) * 2 + lowIndex] = lower[index];
+__global__ void addAveInternaltoDiag(int num_cells, int num_boundary_surfaces, const int *face2Cells, 
+        const double *internal_coeffs, double *A_pEqn)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_boundary_surfaces)
+        return;
+
+    int cellIndex = face2Cells[index];
+
+    double internal_x = internal_coeffs[num_boundary_surfaces * 0 + index];
+    double internal_y = internal_coeffs[num_boundary_surfaces * 1 + index];
+    double internal_z = internal_coeffs[num_boundary_surfaces * 2 + index];
+
+    double ave_internal = (internal_x + internal_y + internal_z) / 3;
+
+    atomicAdd(&A_pEqn[cellIndex], ave_internal);
 }
 
-__global__ void ldu_to_csr_Diag(int num_cells, int num_surfaces, 
-        const int *diagCSRIndex, const double *diag, const double *source, 
-        double *A_csr, double *b)
+__global__ void addBoundaryDiag(int num_cells, int num_boundary_surfaces, const int *face2Cells, 
+        const double *internal_coeffs, const double *psi, double *H_pEqn)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
+    if (index >= num_boundary_surfaces)
         return;
 
-    int diagIndex = diagCSRIndex[index];
-    double diagVal = diag[index];
-    A_csr[(num_cells + 2 * num_surfaces) * 0 + diagIndex] = diagVal;
-    A_csr[(num_cells + 2 * num_surfaces) * 1 + diagIndex] = diagVal;
-    A_csr[(num_cells + 2 * num_surfaces) * 2 + diagIndex] = diagVal;
+    // addBoundaryDiag(boundaryDiagCmpt, cmpt); // add internal coeffs
+    // boundaryDiagCmpt.negate();
+    double internal_x = internal_coeffs[num_boundary_surfaces * 0 + index];
+    double internal_y = internal_coeffs[num_boundary_surfaces * 1 + index];
+    double internal_z = internal_coeffs[num_boundary_surfaces * 2 + index];
+
+    // addCmptAvBoundaryDiag(boundaryDiagCmpt);
+    double ave_internal = (internal_x + internal_y + internal_z) / 3;
+
+    int cellIndex = face2Cells[index];
 
-    b[num_cells * 0 + index] = source[num_cells * 0 + index];
-    b[num_cells * 1 + index] = source[num_cells * 1 + index];
-    b[num_cells * 2 + index] = source[num_cells * 2 + index];
+    // do not permute H anymore
+    atomicAdd(&H_pEqn[num_cells * 0 + cellIndex], (-internal_x + ave_internal) * psi[num_cells * 0 + cellIndex]);
+    atomicAdd(&H_pEqn[num_cells * 1 + cellIndex], (-internal_y + ave_internal) * psi[num_cells * 1 + cellIndex]);
+    atomicAdd(&H_pEqn[num_cells * 2 + cellIndex], (-internal_z + ave_internal) * psi[num_cells * 2 + cellIndex]);
 }
 
+__global__ void lduMatrix_H(int num_cells, int num_surfaces,
+        const int *lower_index, const int *upper_index, const double *lower, const double *upper,
+        const double *psi, double *H_pEqn)
+{
+    /*
+    for (label face=0; face<nFaces; face++)
+    {
+        HpsiPtr[uPtr[face]] -= lowerPtr[face]*psiPtr[lPtr[face]];
+        HpsiPtr[lPtr[face]] -= upperPtr[face]*psiPtr[uPtr[face]];
+    }*/
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_surfaces)
+        return;
+    
+    int l = lower_index[index];
+    int u = upper_index[index];
+
+    atomicAdd(&H_pEqn[num_cells * 0 + u], -lower[index] * psi[num_cells * 0 + l]);
+    atomicAdd(&H_pEqn[num_cells * 1 + u], -lower[index] * psi[num_cells * 1 + l]);
+    atomicAdd(&H_pEqn[num_cells * 2 + u], -lower[index] * psi[num_cells * 2 + l]);
+    atomicAdd(&H_pEqn[num_cells * 0 + l], -upper[index] * psi[num_cells * 0 + u]);
+    atomicAdd(&H_pEqn[num_cells * 1 + l], -upper[index] * psi[num_cells * 1 + u]);
+    atomicAdd(&H_pEqn[num_cells * 2 + l], -upper[index] * psi[num_cells * 2 + u]);
+}
+
+__global__ void addBoundarySrc_unCoupled(int num_cells, int num, int offset, 
+        int num_boundary_surfaces, const int *face2Cells, const double *boundary_coeffs, double *H_pEqn)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num)
+        return;
+
+    int start_index = offset + index;
+
+    // addBoundaryDiag(boundaryDiagCmpt, cmpt); // add internal coeffs
+    // boundaryDiagCmpt.negate();
+    double boundary_x = boundary_coeffs[num_boundary_surfaces * 0 + start_index];
+    double boundary_y = boundary_coeffs[num_boundary_surfaces * 1 + start_index];
+    double boundary_z = boundary_coeffs[num_boundary_surfaces * 2 + start_index];
+
+
+    int cellIndex = face2Cells[start_index];
+
+    // do not permute H anymore
+    atomicAdd(&H_pEqn[num_cells * 0 + cellIndex], boundary_x);
+    atomicAdd(&H_pEqn[num_cells * 1 + cellIndex], boundary_y);
+    atomicAdd(&H_pEqn[num_cells * 2 + cellIndex], boundary_z);
+}
+
+__global__ void divideVol_permute_vec(int num_cells, const double *volume, double *H_pEqn, double *H_pEqn_perm)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+    
+    // divide volume
+    double vol = volume[index];
+    double H_pEqn_x = H_pEqn[num_cells * 0 + index] / vol;
+    double H_pEqn_y = H_pEqn[num_cells * 1 + index] / vol;
+    double H_pEqn_z = H_pEqn[num_cells * 2 + index] / vol;
+
+    // permute
+    H_pEqn_perm[index * 3 + 0] = H_pEqn_x;
+    H_pEqn_perm[index * 3 + 1] = H_pEqn_y;
+    H_pEqn_perm[index * 3 + 2] = H_pEqn_z;
+}
 
 void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output)
 {
@@ -912,26 +977,26 @@ void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volu
 }
 
 void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surface,
-        const int* boundary_cell_face, const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index,
-        const double *lower, const double *upper, const double *diag, const double *source,
-        const double *internal_coeffs, const double *boundary_coeffs,
+        const int* boundary_cell_face, const int *ldu_to_csr_index, const int *diag_to_csr_index,
+        const double *ldu, const double *source, const double *internal_coeffs, const double *boundary_coeffs,
         double *A, double *b)
 {
+    TICK_INIT_EVENT;
     // construct diag
+    int nNz = num_cells + 2 * num_surfaces;
     size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    ldu_to_csr_Diag<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_surfaces, 
-            diag_to_csr_index, diag, source, A, b);
+    size_t blocks_per_grid = (nNz + threads_per_block - 1) / threads_per_block;
+    TICK_START_EVENT;
+    checkCudaErrors(cudaMemcpyAsync(b, source, num_cells * 3 * sizeof(double), cudaMemcpyDeviceToDevice, stream));
+    ldu_to_csr_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(nNz, ldu_to_csr_index, ldu, A);
+    TICK_END_EVENT(ldu_to_csr_kernel);
 
     // add coeff to source and diagnal
     blocks_per_grid = (num_boundary_surface + threads_per_block - 1) / threads_per_block;
+    TICK_START_EVENT;
     addBoundaryDiagSrc<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_surfaces, num_boundary_surface, 
             boundary_cell_face, internal_coeffs, boundary_coeffs, diag_to_csr_index, A, b);
-    
-    // convert offdiag
-    blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block;
-    ldu_to_csr_offDiag<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_surfaces, 
-            lower_to_csr_index, upper_to_csr_index, lower, upper, A);
+    TICK_END_EVENT(addBoundaryDiagSrc);
 }
 
 void update_boundary_coeffs_vector(cudaStream_t stream, int num_boundary_surfaces, int num_patches,
@@ -1287,3 +1352,69 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces,
     // blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     // divide_cell_volume_vec<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, volume, output, sign);
 }
+
+void fvMtx_A(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces,
+        const int *boundary_cell_face, const double *internal_coeffs, const double *volume, const double *diag, 
+        double *A_pEqn, double sign)
+{
+    checkCudaErrors(cudaMemcpyAsync(A_pEqn, diag, num_cells * sizeof(double), cudaMemcpyDeviceToDevice, stream));
+    TICK_INIT_EVENT;
+
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block;
+    TICK_START_EVENT;
+    addAveInternaltoDiag<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_surfaces, boundary_cell_face, 
+            internal_coeffs, A_pEqn);
+    TICK_END_EVENT(addAveInternaltoDiag);
+    
+    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    TICK_START_EVENT;
+    divide_cell_volume_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, volume, A_pEqn, sign);
+    TICK_END_EVENT(divide_cell_volume_scalar);
+}
+
+void fvMtx_H(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, 
+        const int *lowerAddr, const int *upperAddr, const double *volume,
+        int num_patches, const int *patch_size, const int *patch_type,
+        const int *boundary_cell_face, const double *internal_coffs, const double *boundary_coeffs, 
+        const double *lower, const double *upper, const double *source, const double *psi, 
+        double *H_pEqn, double *H_pEqn_perm, double sign)
+{
+    TICK_INIT_EVENT;
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block;
+
+    TICK_START_EVENT;
+    checkCudaErrors(cudaMemcpyAsync(H_pEqn, source, num_cells * 3 * sizeof(double), cudaMemcpyDeviceToDevice, stream));
+    addBoundaryDiag<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_surfaces, boundary_cell_face, 
+            internal_coffs, psi, H_pEqn);
+    TICK_END_EVENT(addBoundaryDiag);
+    
+    blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block;
+    TICK_START_EVENT;
+    lduMatrix_H<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_surfaces, lowerAddr, upperAddr, 
+            lower, upper, psi, H_pEqn);
+    TICK_END_EVENT(lduMatrix_H);
+
+    int offset = 0;
+    for (int i = 0; i < num_patches; i++) {
+        threads_per_block = 64;
+        blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
+        // TODO: just non-coupled patch type now
+        if (patch_type[i] == boundaryConditions::zeroGradient
+                || patch_type[i] == boundaryConditions::fixedValue) {
+            TICK_START_EVENT;
+            addBoundarySrc_unCoupled<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, patch_size[i], offset, 
+                    num_boundary_surfaces, boundary_cell_face, boundary_coeffs, H_pEqn);
+            TICK_END_EVENT(addBoundarySrc_unCoupled);
+        } else if (0) {
+            // xxx
+            fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n");
+        }
+        offset += patch_size[i];
+    }
+    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    TICK_START_EVENT;
+    divideVol_permute_vec<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, volume, H_pEqn, H_pEqn_perm);
+    TICK_END_EVENT(divideVol_permute_vec);
+}
diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H
index 49edc1b7a..43ee3402b 100644
--- a/src_gpu/dfUEqn.H
+++ b/src_gpu/dfUEqn.H
@@ -37,6 +37,8 @@ private:
 	double *d_nu_eff = nullptr;
 	// computed on CPU, used on GPU, need memcpyh2d - host
 	double *h_nu_eff = nullptr;
+    double *h_A_pEqn = nullptr;
+    double *h_H_pEqn = nullptr;
 	// intermediate fields
 	double *d_grad_u = nullptr;
 	double *d_rho_nueff = nullptr;
@@ -58,6 +60,7 @@ private:
 	double *d_gradient_boundary_coeffs= nullptr;
 
 	// non-constant fields - ldu
+    double *d_ldu = nullptr;
 	double *d_lower = nullptr;
 	double *d_upper = nullptr;
 	double *d_diag = nullptr;
@@ -69,6 +72,9 @@ private:
 	// non-constant fields - csr
 	double *d_A = nullptr;
 	double *d_b = nullptr; // TODO: needless
+    double *d_A_pEqn = nullptr;
+    double *d_H_pEqn = nullptr;
+    double *d_H_pEqn_perm = nullptr;
 
     // field pointer map
     std::unordered_map<std::string, double*> fieldPointerMap;
@@ -107,6 +113,10 @@ public:
 	void process();
 	void postProcess(double *h_u);
 
+    void A(double *Psi);
+    void H(double *Psi);
+    void correctPsi(double *Psi);
+
     void solve();
     void compareResult(const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs, 
     // const double *tmpVal, 
diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu
index 837a7587a..2dc569915 100644
--- a/src_gpu/dfUEqn.cu
+++ b/src_gpu/dfUEqn.cu
@@ -17,6 +17,8 @@ void dfUEqn::createNonConstantFieldsInternal() {
   checkCudaErrors(cudaMalloc((void**)&d_nu_eff, dataBase_.cell_value_bytes));
   // computed on CPU, used on GPU, need memcpyh2d
   checkCudaErrors(cudaMallocHost((void**)&h_nu_eff , dataBase_.cell_value_bytes));
+  checkCudaErrors(cudaMallocHost((void**)&h_A_pEqn , dataBase_.cell_value_bytes));
+  checkCudaErrors(cudaMallocHost((void**)&h_H_pEqn , dataBase_.cell_value_vec_bytes));
   // intermediate fields
   checkCudaErrors(cudaMalloc((void**)&d_grad_u, dataBase_.cell_value_tsr_bytes));
   checkCudaErrors(cudaMalloc((void**)&d_rho_nueff, dataBase_.cell_value_bytes));
@@ -46,14 +48,18 @@ void dfUEqn::createNonConstantFieldsBoundary() {
 }
 
 void dfUEqn::createNonConstantLduAndCsrFields() {
-  checkCudaErrors(cudaMalloc((void**)&d_lower, dataBase_.surface_value_bytes));
-  checkCudaErrors(cudaMalloc((void**)&d_upper, dataBase_.surface_value_bytes));
-  checkCudaErrors(cudaMalloc((void**)&d_diag, dataBase_.cell_value_bytes));
+  checkCudaErrors(cudaMalloc((void**)&d_ldu, dataBase_.csr_value_bytes));
+  d_lower = d_ldu;
+  d_diag = d_ldu + dataBase_.num_surfaces;
+  d_upper = d_ldu + dataBase_.num_cells + dataBase_.num_surfaces;
   checkCudaErrors(cudaMalloc((void**)&d_source, dataBase_.cell_value_vec_bytes));
   checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes));
   checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes));
   checkCudaErrors(cudaMalloc((void**)&d_A, dataBase_.csr_value_vec_bytes));
   checkCudaErrors(cudaMalloc((void**)&d_b, dataBase_.cell_value_vec_bytes));
+  checkCudaErrors(cudaMalloc((void**)&d_A_pEqn, dataBase_.cell_value_bytes));
+  checkCudaErrors(cudaMalloc((void**)&d_H_pEqn, dataBase_.cell_value_vec_bytes));
+  checkCudaErrors(cudaMalloc((void**)&d_H_pEqn_perm, dataBase_.cell_value_vec_bytes));
 }
 
 void dfUEqn::initNonConstantFieldsBoundary() {
@@ -79,14 +85,14 @@ void dfUEqn::preProcess(const double *h_u, const double *h_boundary_u, const dou
   checkCudaErrors(cudaMemcpyAsync(d_boundary_nu_eff, h_boundary_nu_eff, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
   checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_rho, h_boundary_rho, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
 
-  checkCudaErrors(cudaMemsetAsync(d_lower, 0, dataBase_.surface_value_bytes, dataBase_.stream));
-  checkCudaErrors(cudaMemsetAsync(d_upper, 0, dataBase_.surface_value_bytes, dataBase_.stream));
-  checkCudaErrors(cudaMemsetAsync(d_diag, 0, dataBase_.cell_value_bytes, dataBase_.stream));
+  checkCudaErrors(cudaMemsetAsync(d_ldu, 0, dataBase_.csr_value_bytes, dataBase_.stream)); // d_ldu contains d_lower, d_diag, and d_upper
   checkCudaErrors(cudaMemsetAsync(d_source, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream));
   checkCudaErrors(cudaMemsetAsync(d_internal_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream));
   checkCudaErrors(cudaMemsetAsync(d_boundary_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream));
   checkCudaErrors(cudaMemsetAsync(d_A, 0, dataBase_.csr_value_vec_bytes, dataBase_.stream));
   checkCudaErrors(cudaMemsetAsync(d_b, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream));
+  checkCudaErrors(cudaMemsetAsync(d_A_pEqn, 0, dataBase_.cell_value_bytes, dataBase_.stream));
+  checkCudaErrors(cudaMemsetAsync(d_H_pEqn, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream));
 }
 
 void dfUEqn::process() {
@@ -143,6 +149,9 @@ void dfUEqn::process() {
                 dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_source,
                 dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(),
                 dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume, -1.);
+        ldu_to_csr(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces,
+            dataBase_.d_boundary_face_cell, dataBase_.d_ldu_to_csr_index, dataBase_.d_diag_to_csr_index, 
+            d_ldu, d_source, d_internal_coeffs, d_boundary_coeffs, d_A, d_b);
 
 #ifndef TIME_GPU
         checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph));
@@ -168,10 +177,6 @@ void dfUEqn::sync()
 }
 
 void dfUEqn::solve() {
-    ldu_to_csr(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces,
-            dataBase_.d_boundary_face_cell,
-            dataBase_.d_lower_to_csr_index, dataBase_.d_upper_to_csr_index, dataBase_.d_diag_to_csr_index,
-            d_lower, d_upper, d_diag, d_source, d_internal_coeffs, d_boundary_coeffs, d_A, d_b);
 
     int nNz = dataBase_.num_cells + dataBase_.num_surfaces * 2; // matrix entries
     sync();
@@ -200,7 +205,7 @@ void dfUEqn::solve() {
     num_iteration++;
 }
 
-void dfUEqn::postProcess(double *h_u) { // TODO: Here may be a bug
+void dfUEqn::postProcess(double *h_u) {
     permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, d_permute, dataBase_.d_u);
     checkCudaErrors(cudaMemcpyAsync(h_u, dataBase_.d_u, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream));
     checkCudaErrors(cudaStreamSynchronize(dataBase_.stream));
@@ -212,6 +217,33 @@ void dfUEqn::postProcess(double *h_u) { // TODO: Here may be a bug
             d_gradient_internal_coeffs, d_gradient_boundary_coeffs);
 }
 
+void dfUEqn::A(double *Psi) {
+    fvMtx_A(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, 
+            dataBase_.d_boundary_face_cell, d_internal_coeffs, dataBase_.d_volume, d_diag, d_A_pEqn);
+    checkCudaErrors(cudaMemcpyAsync(h_A_pEqn, d_A_pEqn, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream));
+    sync();
+    // TODO: correct Boundary conditions
+    memcpy(Psi, h_A_pEqn, dataBase_.cell_value_bytes);
+}
+
+void dfUEqn::H(double *Psi) {
+    fvMtx_H(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, 
+            dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_volume,
+            dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(),
+            dataBase_.d_boundary_face_cell, d_internal_coeffs, d_boundary_coeffs,
+            d_lower, d_upper, d_source, d_permute, d_H_pEqn, d_H_pEqn_perm);
+    checkCudaErrors(cudaMemcpyAsync(h_H_pEqn, d_H_pEqn_perm, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream));
+    sync();
+    // TODO: correct Boundary conditions
+    memcpy(Psi, h_H_pEqn, dataBase_.cell_value_vec_bytes);
+}
+
+void dfUEqn::correctPsi(double *Psi) {
+    checkCudaErrors(cudaMemcpy(dataBase_.d_u, Psi, dataBase_.cell_value_vec_bytes, cudaMemcpyHostToDevice));
+    permute_vector_h2d(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, d_permute);
+}
+
+
 double* dfUEqn::getFieldPointer(const char* fieldAlias, location loc, position pos) {
     char mergedName[256];
     if (pos == position::internal) {