From 3f5e136e0d601809d586a925d9532bf88a5e8457 Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Tue, 1 Aug 2023 22:32:34 +0800 Subject: [PATCH 01/25] dfMatrixDataBase --- GPUTest/Make/files | 4 + GPUTest/Make/options | 46 ++ GPUTest/correctPhi.H | 12 + GPUTest/createFields.H | 176 ++++++ GPUTest/createGPUSolver.H | 71 +++ GPUTest/setRDeltaT.H | 85 +++ GPUTest/setRootCase2.H | 5 + GPUTest/unittest.C | 100 ++++ src_gpu/CMakeLists.txt | 7 +- src_gpu/dfMatrixDataBase.H | 700 +++++------------------- src_gpu/dfMatrixDataBase.cu | 261 +++++++-- {src_gpu => src_gpu_orig}/AmgXSolver.H | 0 {src_gpu => src_gpu_orig}/AmgXSolver.cu | 0 src_gpu_orig/CMakeLists.txt | 39 ++ {src_gpu => src_gpu_orig}/GPUMesh.H | 0 {src_gpu => src_gpu_orig}/GPUfield.H | 0 {src_gpu => src_gpu_orig}/GPUfield.cpp | 0 {src_gpu => src_gpu_orig}/dfEEqn.H | 0 {src_gpu => src_gpu_orig}/dfEEqn.cu | 0 src_gpu_orig/dfMatrixDataBase.H | 641 ++++++++++++++++++++++ src_gpu_orig/dfMatrixDataBase.cu | 48 ++ {src_gpu => src_gpu_orig}/dfRhoEqn.H | 0 {src_gpu => src_gpu_orig}/dfRhoEqn.cu | 0 {src_gpu => src_gpu_orig}/dfUEqn.H | 0 {src_gpu => src_gpu_orig}/dfUEqn.cu | 0 {src_gpu => src_gpu_orig}/dfYEqn.H | 0 {src_gpu => src_gpu_orig}/dfYEqn.cu | 0 27 files changed, 1575 insertions(+), 620 deletions(-) create mode 100644 GPUTest/Make/files create mode 100644 GPUTest/Make/options create mode 100644 GPUTest/correctPhi.H create mode 100644 GPUTest/createFields.H create mode 100644 GPUTest/createGPUSolver.H create mode 100644 GPUTest/setRDeltaT.H create mode 100644 GPUTest/setRootCase2.H create mode 100644 GPUTest/unittest.C rename {src_gpu => src_gpu_orig}/AmgXSolver.H (100%) rename {src_gpu => src_gpu_orig}/AmgXSolver.cu (100%) create mode 100644 src_gpu_orig/CMakeLists.txt rename {src_gpu => src_gpu_orig}/GPUMesh.H (100%) rename {src_gpu => src_gpu_orig}/GPUfield.H (100%) rename {src_gpu => src_gpu_orig}/GPUfield.cpp (100%) rename {src_gpu => src_gpu_orig}/dfEEqn.H (100%) rename {src_gpu => src_gpu_orig}/dfEEqn.cu (100%) 
create mode 100644 src_gpu_orig/dfMatrixDataBase.H create mode 100644 src_gpu_orig/dfMatrixDataBase.cu rename {src_gpu => src_gpu_orig}/dfRhoEqn.H (100%) rename {src_gpu => src_gpu_orig}/dfRhoEqn.cu (100%) rename {src_gpu => src_gpu_orig}/dfUEqn.H (100%) rename {src_gpu => src_gpu_orig}/dfUEqn.cu (100%) rename {src_gpu => src_gpu_orig}/dfYEqn.H (100%) rename {src_gpu => src_gpu_orig}/dfYEqn.cu (100%) diff --git a/GPUTest/Make/files b/GPUTest/Make/files new file mode 100644 index 000000000..d78085ff8 --- /dev/null +++ b/GPUTest/Make/files @@ -0,0 +1,4 @@ +unittest.C + +EXE = $(DF_APPBIN)/unitTest + diff --git a/GPUTest/Make/options b/GPUTest/Make/options new file mode 100644 index 000000000..637eb0e9b --- /dev/null +++ b/GPUTest/Make/options @@ -0,0 +1,46 @@ +-include $(GENERAL_RULES)/mplibType + +EXE_INC = -std=c++14 \ + -g \ + -fopenmp \ + -Wno-unused-variable \ + -Wno-unused-but-set-variable \ + -Wno-old-style-cast \ + $(PFLAGS) $(PINC) \ + $(if $(LIBTORCH_ROOT),-DUSE_LIBTORCH,) \ + $(if $(PYTHON_INC_DIR),-DUSE_PYTORCH,) \ + -I$(LIB_SRC)/transportModels/compressible/lnInclude \ + -I$(LIB_SRC)/thermophysicalModels/basic/lnInclude \ + -I$(LIB_SRC)/TurbulenceModels/turbulenceModels/lnInclude \ + -I$(LIB_SRC)/TurbulenceModels/compressible/lnInclude \ + -I$(LIB_SRC)/finiteVolume/cfdTools \ + -I$(LIB_SRC)/finiteVolume/lnInclude \ + -I$(LIB_SRC)/meshTools/lnInclude \ + -I$(LIB_SRC)/sampling/lnInclude \ + -I$(LIB_SRC)/dynamicFvMesh/lnInclude \ + -I$(LIB_SRC)/Pstream/mpi \ + -I$(DF_SRC)/dfCanteraMixture/lnInclude \ + -I$(DF_SRC)/dfChemistryModel/lnInclude \ + -I$(DF_SRC)/dfCombustionModels/lnInclude \ + -I$(CANTERA_ROOT)/include \ + -I$(DF_ROOT)/src_gpu \ + -I/usr/local/cuda-11.6/include \ + -I$(AMGX_DIR)/include + +EXE_LIBS = \ + -lcompressibleTransportModels \ + -lturbulenceModels \ + -lfiniteVolume \ + -lmeshTools \ + -lsampling \ + -L$(DF_LIBBIN) \ + -ldfFluidThermophysicalModels \ + -ldfCompressibleTurbulenceModels \ + -ldfCanteraMixture \ + -ldfChemistryModel \ + 
-ldfCombustionModels \ + $(CANTERA_ROOT)/lib/libcantera.so \ + /usr/local/cuda-11.6/lib64/libcudart.so \ + $(AMGX_DIR)/build/libamgxsh.so \ + $(DF_ROOT)/src_gpu/build/libdfMatrix.so + diff --git a/GPUTest/correctPhi.H b/GPUTest/correctPhi.H new file mode 100644 index 000000000..3cd82d29e --- /dev/null +++ b/GPUTest/correctPhi.H @@ -0,0 +1,12 @@ +CorrectPhi +( + U, + phi, + p, + rho, + psi, + dimensionedScalar("rAUf", dimTime, 1), + divrhoU(), + pimple, + true +); diff --git a/GPUTest/createFields.H b/GPUTest/createFields.H new file mode 100644 index 000000000..9e750c334 --- /dev/null +++ b/GPUTest/createFields.H @@ -0,0 +1,176 @@ +#include "createRDeltaT.H" + +Info<< "Reading thermophysical properties\n" << endl; + +// fluidThermo* pThermo = new hePsiThermo(mesh, word::null); +fluidThermo* pThermo = new heRhoThermo(mesh, word::null); +fluidThermo& thermo = *pThermo; +// thermo.validate(args.executable(), "ha"); + +const volScalarField& psi = thermo.psi(); +volScalarField& p = thermo.p(); +volScalarField& T = thermo.T(); +volScalarField rho +( + IOobject + ( + "rho", + runTime.timeName(), + mesh, + IOobject::READ_IF_PRESENT, + IOobject::AUTO_WRITE + ), + thermo.rho() +); + + +Info<< "Reading field U\n" << endl; +volVectorField U +( + IOobject + ( + "U", + runTime.timeName(), + mesh, + IOobject::MUST_READ, + IOobject::AUTO_WRITE + ), + mesh +); + +#include "compressibleCreatePhi.H" + +pressureControl pressureControl(p, rho, pimple.dict(), false); + +mesh.setFluxRequired(p.name()); + +Info<< "Creating turbulence model\n" << endl; +autoPtr turbulence +( + compressible::turbulenceModel::New + ( + rho, + U, + phi, + thermo + ) +); + +Info<< "Creating field dpdt\n" << endl; +volScalarField dpdt +( + IOobject + ( + "dpdt", + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedScalar("dpdt",p.dimensions()/dimTime, 0) +); + + +Info<< "Creating reaction model\n" << endl; +autoPtr> combustion +( + CombustionModel::New(thermo, 
turbulence()) +); +Info<< "end Creating reaction model\n" << endl; + + +const word combModelName(mesh.objectRegistry::lookupObject("combustionProperties").lookup("combustionModel")); +Info << "Combustion Model Name is confirmed as "<< combModelName << endl; + +const word turbName(mesh.objectRegistry::lookupObject("turbulenceProperties").lookup("simulationType")); + +dfChemistryModel* chemistry = combustion->chemistry(); +PtrList& Y = chemistry->Y(); +const word inertSpecie(chemistry->lookup("inertSpecie")); +const label inertIndex(chemistry->species()[inertSpecie]); +chemistry->setEnergyName("ha"); +chemistry->updateEnergy(); + + +chemistry->correctThermo(); +Info<< "At initial time, min/max(T) = " << min(T).value() << ", " << max(T).value() << endl; + +//for dpdt + +Info<< "Creating field kinetic energy K\n" << endl; +volScalarField K("K", 0.5*magSqr(U)); + +multivariateSurfaceInterpolationScheme::fieldTable fields; + +if(combModelName!="flareFGM") +{ +forAll(Y, i) +{ + fields.add(Y[i]); +} +fields.add(thermo.he()); +} + + +const scalar Sct = chemistry->lookupOrDefault("Sct", 1.); +volScalarField diffAlphaD +( + IOobject + ( + "diffAlphaD", + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedScalar(dimEnergy/dimTime/dimVolume, 0) +); +volVectorField hDiffCorrFlux +( + IOobject + ( + "hDiffCorrFlux", + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedVector(dimensionSet(1,0,-3,0,0,0,0), Zero) +); +volVectorField sumYDiffError +( + IOobject + ( + "sumYDiffError", + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedVector("sumYDiffError", dimDynamicViscosity/dimLength, Zero) +); + +IOdictionary CanteraTorchProperties +( + IOobject + ( + "CanteraTorchProperties", + runTime.constant(), + mesh, + IOobject::MUST_READ, + IOobject::NO_WRITE + ) +); +const Switch splitting = CanteraTorchProperties.lookupOrDefault("splittingStrategy", 
false); +#ifdef USE_PYTORCH + const Switch log_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("log", false); + const Switch torch_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("torch", false); +#endif +#ifdef USE_LIBTORCH + const Switch log_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("log", false); + const Switch torch_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("torch", false); +#endif diff --git a/GPUTest/createGPUSolver.H b/GPUTest/createGPUSolver.H new file mode 100644 index 000000000..9a6c289ab --- /dev/null +++ b/GPUTest/createGPUSolver.H @@ -0,0 +1,71 @@ +dfMatrixDataBase dfDataBase; + +void createGPUBase(fvMesh& mesh, PtrList& Y) { + // obtain variables from fvMesh + const labelUList& owner = mesh.owner(); + const labelUList& neighbour = mesh.neighbour(); + int num_cells = mesh.nCells(); + int num_surfaces = neighbour.size(); + + + // prepare num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_sizes, num_species, rdelta_t + // - obtain boundary size info from mesh + int patchSize = 0, num_patches = 0, num_boundary_surfaces = 0; + std::vector patch_sizes; + forAll(mesh.boundary(), patchi) { + labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); + patchSize = sub_boundary.size(); + + patch_sizes.push_back(patchSize); + num_boundary_surfaces += patchSize; + num_patches ++; + } + dfDataBase.setConstantValues(num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_sizes, Y.size(), 1e-6); // TODO: get deltaT fomr time API + + // prepare owner, neighbor + dfDataBase.setConstantIndexes(&owner[0], &neighbour[0]); + + // prepare internal and boundary of sf, mag_sf, weights, delta_coeffs, volume + // - obtain boundary field info from mesh + double *boundary_sf = new double[3 * num_boundary_surfaces]; + double *boundary_mag_sf = new double[num_boundary_surfaces]; + double *boundary_delta_coeffs = new double[num_boundary_surfaces]; + int offset = 0; 
+ forAll(mesh.boundary(), patchi) { + const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; + const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; + const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; + + patchSize = pMagSf.size(); + + memcpy(boundary_sf + 3*offset, &pSf[0][0], 3*patchSize*sizeof(double)); + memcpy(boundary_mag_sf + offset, &pMagSf[0], patchSize*sizeof(double)); + memcpy(boundary_delta_coeffs + offset, &pDeltaCoeffs[0], patchSize*sizeof(double)); + offset += patchSize; + } + + dfDataBase.createConstantFieldsInternal(); + dfDataBase.createConstantFieldsBoundary(); + dfDataBase.initConstantFieldsInternal(&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.surfaceInterpolation::weights()[0], &mesh.nonOrthDeltaCoeffs()[0], &mesh.V()[0]); + dfDataBase.initConstantFieldsBoundary(boundary_sf, boundary_mag_sf, boundary_delta_coeffs); + + // prepare internal and boundary of xxx + // - obtain init_Y + double *h_Y = new double[Y.size() * num_cells]; + double *boundary_Y = new double[Y.size() * num_boundary_surfaces]; + forAll(Y, speciesI) { + volScalarField& Yi = Y[speciesI]; + memcpy(h_Y + speciesI * num_cells, &Yi[0], num_cells * sizeof(double)); + offset = 0; + forAll(Yi.boundaryField(), patchi) { + const scalarField& patchYi = Yi.boundaryField()[patchi]; + patchSize = patchYi.size(); + memcpy(boundary_Y + speciesI * num_boundary_surfaces + offset, &patchYi[0], patchSize*sizeof(double)); + offset += patchSize; + } + } + dfDataBase.createNonConstantFieldsInternal(); + dfDataBase.createNonConstantFieldsBoundary(); + dfDataBase.initNonConstantFieldsInternal(h_Y); + dfDataBase.initNonConstantFieldsBoundary(boundary_Y); +}; \ No newline at end of file diff --git a/GPUTest/setRDeltaT.H b/GPUTest/setRDeltaT.H new file mode 100644 index 000000000..074d05e3d --- /dev/null +++ b/GPUTest/setRDeltaT.H @@ -0,0 +1,85 @@ +{ + volScalarField& rDeltaT = trDeltaT.ref(); + + const dictionary& pimpleDict = pimple.dict(); + + scalar 
maxCo + ( + pimpleDict.lookupOrDefault("maxCo", 0.8) + ); + + scalar rDeltaTSmoothingCoeff + ( + pimpleDict.lookupOrDefault("rDeltaTSmoothingCoeff", 0.02) + ); + + scalar rDeltaTDampingCoeff + ( + pimpleDict.lookupOrDefault("rDeltaTDampingCoeff", 1.0) + ); + + scalar maxDeltaT + ( + pimpleDict.lookupOrDefault("maxDeltaT", great) + ); + + volScalarField rDeltaT0("rDeltaT0", rDeltaT); + + // Set the reciprocal time-step from the local Courant number + rDeltaT.ref() = max + ( + 1/dimensionedScalar(dimTime, maxDeltaT), + fvc::surfaceSum(mag(phi))()() + /((2*maxCo)*mesh.V()*rho()) + ); + + if (pimple.transonic()) + { + surfaceScalarField phid + ( + "phid", + fvc::interpolate(psi)*fvc::flux(U) + ); + + rDeltaT.ref() = max + ( + rDeltaT(), + fvc::surfaceSum(mag(phid))()() + /((2*maxCo)*mesh.V()*psi()) + ); + } + + // Update tho boundary values of the reciprocal time-step + rDeltaT.correctBoundaryConditions(); + + Info<< "Flow time scale min/max = " + << gMin(1/rDeltaT.primitiveField()) + << ", " << gMax(1/rDeltaT.primitiveField()) << endl; + + if (rDeltaTSmoothingCoeff < 1.0) + { + fvc::smooth(rDeltaT, rDeltaTSmoothingCoeff); + } + + Info<< "Smoothed flow time scale min/max = " + << gMin(1/rDeltaT.primitiveField()) + << ", " << gMax(1/rDeltaT.primitiveField()) << endl; + + // Limit rate of change of time scale + // - reduce as much as required + // - only increase at a fraction of old time scale + if + ( + rDeltaTDampingCoeff < 1.0 + && runTime.timeIndex() > runTime.startTimeIndex() + 1 + ) + { + rDeltaT = + rDeltaT0 + *max(rDeltaT/rDeltaT0, scalar(1) - rDeltaTDampingCoeff); + + Info<< "Damped flow time scale min/max = " + << gMin(1/rDeltaT.primitiveField()) + << ", " << gMax(1/rDeltaT.primitiveField()) << endl; + } +} diff --git a/GPUTest/setRootCase2.H b/GPUTest/setRootCase2.H new file mode 100644 index 000000000..45d966e63 --- /dev/null +++ b/GPUTest/setRootCase2.H @@ -0,0 +1,5 @@ +Foam::argList args(argc,argv,true,true,/*initialise=*/false); +if 
(!args.checkRootCase()) +{ + Foam::FatalError.exit(); +} \ No newline at end of file diff --git a/GPUTest/unittest.C b/GPUTest/unittest.C new file mode 100644 index 000000000..2e3d55ce5 --- /dev/null +++ b/GPUTest/unittest.C @@ -0,0 +1,100 @@ +/*---------------------------------------------------------------------------*\ + ========= | + \\ / F ield | OpenFOAM: The Open Source CFD Toolbox + \\ / O peration | Website: https://openfoam.org + \\ / A nd | Copyright (C) 2011-2019 OpenFOAM Foundation + \\/ M anipulation | +------------------------------------------------------------------------------- +License + This file is part of OpenFOAM. + + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . 
+ +Application + unittest + +Description + GPU unittest + +\*---------------------------------------------------------------------------*/ + +#include "dfChemistryModel.H" +#include "CanteraMixture.H" +// #include "hePsiThermo.H" +#include "heRhoThermo.H" + +#include "fvCFD.H" +#include "fluidThermo.H" +#include "turbulentFluidThermoModel.H" +#include "pimpleControl.H" +#include "pressureControl.H" +#include "localEulerDdtScheme.H" +#include "fvcSmooth.H" +#include "PstreamGlobals.H" +#include "basicThermo.H" +#include "CombustionModel.H" + +#include "dfMatrixDataBase.H" +#include +#include +#include "upwind.H" +#include "createGPUSolver.H" + +int main(int argc, char *argv[]) +{ +#ifdef USE_PYTORCH + pybind11::scoped_interpreter guard{};//start python interpreter +#endif + #include "postProcess.H" + + // #include "setRootCaseLists.H" + #include "listOptions.H" + #include "setRootCase2.H" + #include "listOutput.H" + + #include "createTime.H" + #include "createMesh.H" + #include "createDyMControls.H" + #include "initContinuityErrs.H" + #include "createFields.H" + #include "createRhoUfIfPresent.H" + + turbulence->validate(); + + if (!LTS) + { + #include "compressibleCourantNo.H" + #include "setInitialDeltaT.H" + } + + // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + { + #include "readDyMControls.H" + + if (LTS) + { + #include "setRDeltaT.H" + } + else + { + #include "compressibleCourantNo.H" + #include "setDeltaT.H" + } + + createGPUBase(mesh, Y); + } + return 0; +} + + diff --git a/src_gpu/CMakeLists.txt b/src_gpu/CMakeLists.txt index 6e4a7efef..015a1d11b 100644 --- a/src_gpu/CMakeLists.txt +++ b/src_gpu/CMakeLists.txt @@ -12,6 +12,8 @@ find_package(MPI REQUIRED) find_package(CUDAToolkit REQUIRED) find_library(LIBAMGXSH amgxsh PATHS $ENV{AMGX_DIR}/build) +add_compile_options(-arch=sm_70) + include_directories( ${MPI_INCLUDE_PATH} ${CUDA_INCLUDE_DIRS} @@ -20,11 +22,6 @@ include_directories( add_library(${PROJECT_NAME} SHARED - dfUEqn.cu - 
dfRhoEqn.cu - dfYEqn.cu - dfEEqn.cu - AmgXSolver.cu dfMatrixDataBase.cu) target_link_libraries(${PROJECT_NAME} diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 8efb4bf62..c2e1446ec 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -52,590 +52,138 @@ void constructBoundarySelector(std::vector& patchTypeSelector, const std::s struct dfMatrixDataBase { - // - cuda resource + // cuda resource cudaStream_t stream; - - // - number of cell size - int num_cells; - // - number of face size - int num_surfaces; - // - number of offdiagnal entry size (2*num_surfaces) - int num_faces; - // - number of boundary cells - int num_boundary_cells; - // - number of boundary faces - int num_boundary_faces; - - int num_species; - - // - mesh variables - // - csr_row_index - int *h_A_csr_row_index=nullptr, *d_A_csr_row_index=nullptr; - // - csr_col_index - int *h_A_csr_col_index=nullptr, *d_A_csr_col_index=nullptr; - // - csr_diag_index - int *h_A_csr_diag_index=nullptr, *d_A_csr_diag_index=nullptr; - - // - the pre-permutated and post-permutated interpolation weight list - std::vector h_weight_vec_init, h_weight_vec; - // - the pre-permutated and post-permutated flux (phi) list - std::vector h_phi_vec_init, h_phi_vec; - // - the pre-permutated and post-permutated cell face vector list - std::vector h_face_vector_vec_init, h_face_vector_vec; - std::vector h_face_vec_init, h_face_vec; - std::vector h_deltaCoeffs_vec_init, h_deltaCoeffs_vec; - // - the host pointer to rho_new, rho_old, velocity_old, pressure and volume list - double *h_rho_new = nullptr, *h_rho_old = nullptr, *h_velocity_old = nullptr, - *h_pressure = nullptr; - const double *h_volume = nullptr; - // - the host pointer to the pre-permutated and post-permutated interpolation weight list - double *h_weight_init = nullptr, *h_weight = nullptr; - // - the host pointer to the pre-permutated and post-permutated flux (phi) list - double *h_phi_init = nullptr, *h_phi = nullptr; - 
// - the host pointer to the pre-permutated and post-permutated cell face vector list - double *h_face_vector_init = nullptr, *h_face_vector = nullptr; - double *h_face_init = nullptr, *h_face = nullptr; - double *h_deltaCoeffs_init = nullptr, *h_deltaCoeffs = nullptr; - // - the device pointer to rho_new, rho_old, velocity_old, pressure and volume list - double *d_rho_new = nullptr, *d_rho_old = nullptr, *d_velocity_old = nullptr, - *d_pressure = nullptr, *d_volume = nullptr; - // - the device pointer to Y(vector Yi) - //std::vector d_Y; - double *d_Y = nullptr; - // - the device pointer to the pre-permutated and post-permutated interpolation weight list - double *d_weight_init = nullptr, *d_weight = nullptr; - double *d_weight_upwind = nullptr; - // - the device pointer to the pre-permutated and post-permutated flux (phi) list - double *d_phi_init = nullptr, *d_phi = nullptr; - // - the device pointer to the pre-permutated and post-permutated cell face vector list - double *d_face_vector_init = nullptr, *d_face_vector = nullptr; - double *d_face_init = nullptr, *d_face = nullptr; - double *d_deltaCoeffs_init = nullptr, *d_deltaCoeffs = nullptr; - std::vector d_rhoD_vector; - - double *d_hDiffCorrFlux = nullptr; - double *d_diffAlphaD = nullptr; - double *d_rhoD = nullptr; - double *d_alpha = nullptr; - - double rdelta_t = 1/1e-6; - - /** - * @brief boundary related variables - */ - int *h_boundary_cell_offset = nullptr, *d_boundary_cell_offset=nullptr; - int *h_boundary_cell_id = nullptr, *d_boundary_cell_id = nullptr; - double *h_internal_coeffs = nullptr, *h_boundary_coeffs = nullptr, - *h_boundary_pressure = nullptr, *h_boundary_face_vector = nullptr, - *h_boundary_face = nullptr, *d_boundary_face = nullptr, - *h_boundary_deltaCoeffs = nullptr, *d_boundary_deltaCoeffs = nullptr, - *d_internal_coeffs = nullptr, *d_boundary_coeffs = nullptr, - *d_internal_coeffs_init = nullptr, *d_boundary_coeffs_init = nullptr, - *d_laplac_internal_coeffs = nullptr, 
*d_laplac_boundary_coeffs = nullptr, - *d_laplac_internal_coeffs_init = nullptr, *d_laplac_boundary_coeffs_init = nullptr, - *d_boundary_pressure = nullptr, *d_boundary_face_vector = nullptr, - *d_boundary_pressure_init = nullptr, - *d_boundary_phi = nullptr, *d_boundary_phi_init = nullptr, - *d_boundary_velocity = nullptr, *d_boundary_velocity_init = nullptr, - *d_boundary_nuEff = nullptr, *d_boundary_nuEff_init = nullptr, - *d_boundary_rho = nullptr, *d_boundary_rho_init = nullptr; - std::vector d_boundary_Y_vector; - std::vector d_boundary_Y_init_vector; - std::vector d_internal_coeffs_Y_vector; - std::vector d_boundary_coeffs_Y_vector; - std::vector d_laplac_internal_coeffs_Y_vector; - std::vector d_laplac_boundary_coeffs_Y_vector; - double *d_internal_coeffs_Y = nullptr; - double *d_boundary_coeffs_Y = nullptr; - double *d_laplac_internal_coeffs_Y = nullptr; - double *d_laplac_boundary_coeffs_Y = nullptr; - std::vector d_boundary_rhoD_vector; - double *d_boundary_mut_sct = nullptr; - double *d_boundary_rhoD = nullptr; - double *d_boundary_alpha = nullptr; - - double *d_boundary_hDiffCorrFlux = nullptr; - int *d_boundary_UpatchType = nullptr; - int *d_boundary_YpatchType = nullptr; - - std::vector boundPermutationList; - std::vector ueqn_internalCoeffs, ueqn_boundaryCoeffs; - std::vector boundary_face_vector; - std::vector boundary_pressure; - std::vector boundary_face; - std::vector boundary_deltaCoeffs; - std::vector> patch_type_init; - std::vector> patch_type; - - // - the device pointer to the permutated index list - std::vector permedIndex; - int *d_permedIndex=nullptr; - int *d_bouPermedIndex = nullptr; - - - // bytesize - // - bytes of diagnal entries - size_t cell_bytes; - // - bytes of diagnal entries (vector) - size_t cell_vec_bytes; - // - bytes of diagnal index - size_t cell_index_bytes; - // - bytes of diagnal index - size_t face_bytes; - size_t face_vec_bytes; - size_t face_index_bytes; - - size_t boundary_cell_bytes; - size_t 
boundary_cell_vec_bytes; - size_t boundary_cell_index_bytes; - - size_t boundary_face_bytes; - size_t boundary_face_vec_bytes; - size_t boundary_face_index_bytes; - - // A_csr has one more element in each row: itself - size_t csr_row_index_bytes; - size_t csr_col_index_bytes; - size_t csr_value_bytes; - size_t csr_value_vec_bytes; - - // extra matrix information - double *d_turbSrc_A = nullptr, *d_turbSrc_b = nullptr, *d_turbSrc_A_init = nullptr; - std::vector h_turbSrc_init_mtx_vec, h_turbSrc_init_1mtx; - std::vector h_turbSrc_init_src_vec, h_turbSrc_src_vec; - std::vector tmpPermutatedList; - int * d_tmpPermutatedList = nullptr; - - // double *h_A_csr = nullptr, *h_b = nullptr, *h_psi = nullptr; - // double *d_A_csr = nullptr, *d_b = nullptr, *d_psi = nullptr; - - int num_iteration; - - double time_monitor_CPU; - double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test; - - double* d_grad = nullptr; - double* d_grad_boundary = nullptr, *d_grad_boundary_init = nullptr; - double* d_nuEff = nullptr; + // maybe one graph for one eqn before using self-developed solver + // and should be located in each eqn. 
+ cudaGraph_t graph; + cudaGraphExec_t graph_instance; + bool graph_created=false; + + // constant values -- basic + int num_cells = 0; + int num_surfaces = 0; + int num_boundary_surfaces = 0; + int num_patches = 0; + int num_species = 0; + std::vector patch_sizes; + double rdelta_t = 0; + + // constant values -- ldu bytesize + size_t cell_value_bytes = 0; + size_t cell_value_vec_bytes = 0; + size_t cell_value_tsr_bytes = 0; + size_t cell_index_bytes = 0; + size_t surface_value_bytes = 0; + size_t surface_index_bytes = 0; + size_t surface_value_vec_bytes = 0; + size_t boundary_surface_value_bytes = 0; + size_t boundary_surface_value_vec_bytes = 0; + size_t boundary_surface_value_tsr_bytes = 0; + size_t boundary_surface_index_bytes = 0; + + // constant values -- csr bytesize + size_t csr_row_index_bytes = 0; + size_t csr_col_index_bytes = 0; + size_t csr_value_bytes = 0; + size_t csr_value_vec_bytes = 0; + + // constant indexes + int *d_owner = nullptr; + int *d_neighbor = nullptr; + int *d_lower_to_csr_index = nullptr; + int *d_diag_to_csr_index= nullptr; + int *d_upper_to_csr_index= nullptr; + int *d_csr_row_index= nullptr; + int *d_csr_col_index= nullptr; + + // constant fields - internal + double *d_sf = nullptr; + double *d_mag_sf = nullptr; + double *d_weight = nullptr; + double *d_delta_coeffs = nullptr; + double *d_volume = nullptr; + + // constant fields - boundary + double *d_boundary_sf = nullptr; + double *d_boundary_mag_sf = nullptr; + double *d_boundary_weight = nullptr; + double *d_boundary_delta_coeffs = nullptr; + + // non-constant fields - internal + // TODO: further estimate + // fields solved by eqns - new + double *d_rho = nullptr; + double *d_u = nullptr; + double *d_y = nullptr; + double *d_he = nullptr; + double *d_p = nullptr; + // fields solved by eqns - old + // TODO: not all fields need to store oldTime + double *d_rho_old = nullptr; + double *d_u_old = nullptr; + double *d_y_old = nullptr; + double *d_he_old = nullptr; + double *d_p_old 
= nullptr; + // other shared fields between eqns + double *d_phi = nullptr; + // computed on GPU, used on CPU, need memcpyd2h - host + double *h_rho = nullptr; + double *h_u= nullptr; + double *h_y= nullptr; + double *h_he= nullptr; + // computed on CPU, used on GPU, need memcpyh2d - host + double *h_p= nullptr; + double *h_phi= nullptr; + + // non-constant fields - boundary + // TODO: further estimate + // fields solved by eqns - new + double *d_boundary_rho = nullptr; + double *d_boundary_u = nullptr; + double *d_boundary_y = nullptr; + double *d_boundary_he = nullptr; + double *d_boundary_p = nullptr; + // fields solved by eqns - old + double *d_boundary_rho_old = nullptr; + double *d_boundary_u_old = nullptr; + double *d_boundary_y_old = nullptr; + double *d_boundary_he_old = nullptr; + double *d_boundary_p_old = nullptr; + // other shared fields between eqns + double *d_boundary_phi = nullptr; + // computed on GPU, used on CPU, need memcpyd2h - host + double *h_boundary_rho = nullptr; + double *h_boundary_u= nullptr; + double *h_boundary_y= nullptr; + double *h_boundary_he= nullptr; + // computed on CPU, used on GPU, need memcpyh2d - host + double *h_boundary_p= nullptr; + double *h_boundary_phi= nullptr; // constructor dfMatrixDataBase(); - dfMatrixDataBase(int num_surfaces, int num_cells, int num_boundary_faces, int num_species, int & num_boundary_cells_output, - const int *neighbour, const int *owner, const double* volume, const double* weight, const double* face_vector, const double* face, - const double* deltaCoeffs, std::vector boundary_face_vector_init, std::vector boundary_face_init, - std::vector boundary_deltaCoeffs_init, std::vector boundary_cell_id_init, std::vector> patch_type_init) - : num_cells(num_cells), num_faces(num_surfaces*2), num_surfaces(num_surfaces), num_species(num_species), num_iteration(0), - num_boundary_faces(num_boundary_faces), h_volume(volume), patch_type_init(patch_type_init) - { - // create cuda stream - 
checkCudaErrors(cudaStreamCreate(&stream)); - - // allocate field pointer in pin memory - cudaMallocHost(&h_phi_init, num_faces * sizeof(double)); - cudaMallocHost(&h_rho_old, num_cells * sizeof(double)); - - h_weight_vec_init.resize(num_faces); - h_weight_vec.resize(num_faces); - h_face_vector_vec_init.resize(num_faces*3); - h_face_vector_vec.resize(num_faces*3); - h_face_vec_init.resize(num_faces); - h_face_vec.resize(num_faces); - h_deltaCoeffs_vec_init.resize(num_faces); - h_deltaCoeffs_vec.resize(num_faces); - h_turbSrc_init_mtx_vec.resize(num_faces + num_cells); - h_turbSrc_init_1mtx.resize(num_faces + num_cells); - h_turbSrc_init_src_vec.resize(3*num_cells); - h_turbSrc_src_vec.resize(3*num_cells); - - // byte sizes - cell_bytes = num_cells * sizeof(double); - cell_vec_bytes = num_cells * 3 * sizeof(double); - cell_index_bytes = num_cells * sizeof(int); - - face_bytes = num_faces * sizeof(double); - face_vec_bytes = num_faces * 3 * sizeof(double); - face_index_bytes = num_faces * sizeof(int); - - // A_csr has one more element in each row: itself - csr_row_index_bytes = (num_cells + 1) * sizeof(int); - csr_col_index_bytes = (num_cells + num_faces) * sizeof(int); - csr_value_bytes = (num_cells + num_faces) * sizeof(double); - csr_value_vec_bytes = (num_cells + num_faces) * 3 * sizeof(double); - - /************************construct mesh variables****************************/ - /** - * 1. 
h_csr_row_index & h_csr_diag_index - */ - std::vector h_mtxEntry_perRow_vec(num_cells); - std::vector h_csr_diag_index_vec(num_cells); - std::vector h_csr_row_index_vec(num_cells + 1, 0); - - for (int faceI = 0; faceI < num_surfaces; faceI++) - { - h_csr_diag_index_vec[neighbour[faceI]]++; - h_mtxEntry_perRow_vec[neighbour[faceI]]++; - h_mtxEntry_perRow_vec[owner[faceI]]++; - } - - // - consider diagnal element in each row - std::transform(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_mtxEntry_perRow_vec.begin(), [](int n) - {return n + 1;}); - // - construct h_csr_row_index & h_csr_diag_index - std::partial_sum(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_csr_row_index_vec.begin()+1); - // - assign h_csr_row_index & h_csr_diag_index - h_A_csr_row_index = h_csr_row_index_vec.data(); - h_A_csr_diag_index = h_csr_diag_index_vec.data(); - - /** - * 2. h_csr_col_index - */ - std::vector rowIndex(num_faces + num_cells), colIndex(num_faces + num_cells), diagIndex(num_cells); - std::iota(diagIndex.begin(), diagIndex.end(), 0); - - // initialize the RowIndex (rowIndex of lower + upper + diagnal) - std::copy(neighbour, neighbour + num_surfaces, rowIndex.begin()); - std::copy(owner, owner + num_surfaces, rowIndex.begin() + num_surfaces); - std::copy(diagIndex.begin(), diagIndex.end(), rowIndex.begin() + num_faces); - // initialize the ColIndex (colIndex of lower + upper + diagnal) - std::copy(owner, owner + num_surfaces, colIndex.begin()); - std::copy(neighbour, neighbour + num_surfaces, colIndex.begin() + num_surfaces); - std::copy(diagIndex.begin(), diagIndex.end(), colIndex.begin() + num_faces); - - // - construct hashTable for sorting - std::multimap rowColPair; - for (int i = 0; i < 2*num_surfaces+num_cells; i++) - { - rowColPair.insert(std::make_pair(rowIndex[i], colIndex[i])); - } - // - sort - std::vector> globalPerm(rowColPair.begin(), rowColPair.end()); - std::sort(globalPerm.begin(), globalPerm.end(), [] - (const std::pair& pair1, 
const std::pair& pair2){ - if (pair1.first != pair2.first) { - return pair1.first < pair2.first; - } else { - return pair1.second < pair2.second; - } - }); - - std::vector h_csr_col_index_vec; - std::transform(globalPerm.begin(), globalPerm.end(), std::back_inserter(h_csr_col_index_vec), [] - (const std::pair& pair) { - return pair.second; - }); - h_A_csr_col_index = h_csr_col_index_vec.data(); - - // construct a tmp permutated List for add fvMatrix - std::vector tmp_permutation(2*num_surfaces + num_cells); - std::vector tmp_rowIndex(2*num_surfaces + num_cells); - std::iota(tmp_permutation.begin(), tmp_permutation.end(), 0); - std::copy(neighbour, neighbour + num_surfaces, tmp_rowIndex.begin()); - std::copy(diagIndex.begin(), diagIndex.end(), tmp_rowIndex.begin() + num_surfaces); - std::copy(owner, owner + num_surfaces, tmp_rowIndex.begin() + num_surfaces + num_cells); - std::multimap tmpPair; - for (int i = 0; i < 2*num_surfaces+num_cells; i++) - { - tmpPair.insert(std::make_pair(tmp_rowIndex[i], tmp_permutation[i])); - } - std::vector> tmpPerm(tmpPair.begin(), tmpPair.end()); - std::sort(tmpPerm.begin(), tmpPerm.end(), [] - (const std::pair& pair1, const std::pair& pair2){ - if (pair1.first != pair2.first) { - return pair1.first < pair2.first; - } else { - return pair1.second < pair2.second; - } - }); - std::transform(tmpPerm.begin(), tmpPerm.end(), std::back_inserter(tmpPermutatedList), [] - (const std::pair& pair) { - return pair.second; - }); - - /** - * 3. 
boundary imformations - */ - // get boundPermutation and offset lists - std::vector boundPermutationListInit(num_boundary_faces); - std::vector boundOffsetList; - std::iota(boundPermutationListInit.begin(), boundPermutationListInit.end(), 0); - - // - construct hashTable for sorting - std::multimap boundPermutation; - for (int i = 0; i < num_boundary_faces; i++) - { - boundPermutation.insert(std::make_pair(boundary_cell_id_init[i], boundPermutationListInit[i])); - } - - // - sort - std::vector> boundPermPair(boundPermutation.begin(), boundPermutation.end()); - std::sort(boundPermPair.begin(), boundPermPair.end(), [] - (const std::pair& pair1, const std::pair& pair2){ - if (pair1.first != pair2.first) { - return pair1.first < pair2.first; - } else { - return pair1.second < pair2.second; - } - }); - - // - construct boundPermedIndex and boundary_cell_id - std::vector boundary_cell_id; - boundPermutationList.clear(); - std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundary_cell_id), [] - (const std::pair& pair) { - return pair.first; - }); - std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundPermutationList), [] - (const std::pair& pair) { - return pair.second; - }); - - // construct boundary_cell_offset - std::map countMap; - std::vector boundaryCellcount; - for (const auto& cellIndex : boundary_cell_id) - ++ countMap[cellIndex]; - for (const auto& [cellIndex, count] : countMap) - boundaryCellcount.push_back(count); - - num_boundary_cells = boundaryCellcount.size(); - num_boundary_cells_output = num_boundary_cells; - - std::vector boundary_cell_offset(boundaryCellcount.size() + 1, 0); - std::partial_sum(boundaryCellcount.begin(), boundaryCellcount.end(), boundary_cell_offset.begin()+1); - - // assign h_boundary_cell_offset & h_boundary_cell_id - h_boundary_cell_offset = boundary_cell_offset.data(); - h_boundary_cell_id = boundary_cell_id.data(); - - // - boundary_cell_bytes = num_boundary_cells * 
sizeof(double); - boundary_cell_vec_bytes = num_boundary_cells * 3 * sizeof(double); - boundary_cell_index_bytes = num_boundary_cells * sizeof(int); - - boundary_face_bytes = num_boundary_faces * sizeof(double); - boundary_face_vec_bytes = num_boundary_faces * 3 * sizeof(double); - boundary_face_index_bytes = num_boundary_faces * sizeof(int); - - ueqn_internalCoeffs.resize(3*num_boundary_faces); - ueqn_boundaryCoeffs.resize(3*num_boundary_faces); - - boundary_face_vector.resize(3*num_boundary_faces); - boundary_pressure.resize(num_boundary_faces); - boundary_face.resize(num_boundary_faces); - boundary_deltaCoeffs.resize(num_boundary_faces); - - patch_type.resize(2); - patch_type[0].resize(num_boundary_faces); - patch_type[1].resize(num_boundary_faces); - - /** - * 4. permutation list for field variables - */ - std::vector offdiagRowIndex(2*num_surfaces), permIndex(2*num_surfaces); - // - initialize the offdiagRowIndex (rowIndex of lower + rowIndex of upper) - std::copy(neighbour, neighbour + num_surfaces, offdiagRowIndex.begin()); - std::copy(owner, owner + num_surfaces, offdiagRowIndex.begin() + num_surfaces); - - // - initialize the permIndex (0, 1, ..., 2*num_surfaces) - std::iota(permIndex.begin(), permIndex.end(), 0); - - // - construct hashTable for sorting - std::multimap permutation; - for (int i = 0; i < 2*num_surfaces; i++) - { - permutation.insert(std::make_pair(offdiagRowIndex[i], permIndex[i])); - } - // - sort - std::vector> permPair(permutation.begin(), permutation.end()); - std::sort(permPair.begin(), permPair.end(), [] - (const std::pair& pair1, const std::pair& pair2){ - if (pair1.first != pair2.first) { - return pair1.first < pair2.first; - } else { - return pair1.second < pair2.second; - } - }); - // - form permedIndex list - std::transform(permPair.begin(), permPair.end(), std::back_inserter(permedIndex), [] - (const std::pair& pair) { - return pair.second; - }); - - // copy and permutate cell variables - std::copy(weight, weight + 
num_surfaces, h_weight_vec_init.begin()); - std::copy(weight, weight + num_surfaces, h_weight_vec_init.begin() + num_surfaces); - std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin()); - std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin() + 3*num_surfaces); - std::copy(face, face + num_surfaces, h_face_vec_init.begin()); - std::copy(face, face + num_surfaces, h_face_vec_init.begin() + num_surfaces); - std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin()); - std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin() + num_surfaces); - for (int i = 0; i < num_faces; i++) - { - h_weight_vec[i] = h_weight_vec_init[permedIndex[i]]; - h_face_vec[i] = h_face_vec_init[permedIndex[i]]; - h_deltaCoeffs_vec[i] = h_deltaCoeffs_vec_init[permedIndex[i]]; - h_face_vector_vec[i*3] = h_face_vector_vec_init[3*permedIndex[i]]; - h_face_vector_vec[i*3+1] = h_face_vector_vec_init[3*permedIndex[i]+1]; - h_face_vector_vec[i*3+2] = h_face_vector_vec_init[3*permedIndex[i]+2]; - } - h_weight = h_weight_vec.data(); - h_face_vector = h_face_vector_vec.data(); - h_face = h_face_vec.data(); - h_deltaCoeffs = h_deltaCoeffs_vec.data(); - - for (int i = 0; i < num_boundary_faces; i++) - { - boundary_face_vector[3*i] = boundary_face_vector_init[3*boundPermutationList[i]]; - boundary_face_vector[3*i+1] = boundary_face_vector_init[3*boundPermutationList[i]+1]; - boundary_face_vector[3*i+2] = boundary_face_vector_init[3*boundPermutationList[i]+2]; - boundary_face[i] = boundary_face_init[boundPermutationList[i]]; - boundary_deltaCoeffs[i] = boundary_deltaCoeffs_init[boundPermutationList[i]]; - patch_type[0][i] = patch_type_init[0][boundPermutationList[i]]; - patch_type[1][i] = patch_type_init[1][boundPermutationList[i]]; - } - h_boundary_face_vector = boundary_face_vector.data(); - h_boundary_face = boundary_face.data(); - h_boundary_deltaCoeffs = boundary_deltaCoeffs.data(); - - 
/************************allocate memory on device****************************/ - int total_bytes = 0; - - checkCudaErrors(cudaMalloc((void**)&d_A_csr_row_index, csr_row_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_A_csr_col_index, csr_col_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_A_csr_diag_index, cell_index_bytes)); - total_bytes += (csr_row_index_bytes + csr_col_index_bytes + cell_index_bytes); - - //d_Y.resize(num_species); - d_rhoD_vector.resize(num_species); - d_boundary_Y_vector.resize(num_species); - d_boundary_Y_init_vector.resize(num_species); - d_internal_coeffs_Y_vector.resize(num_species); - d_boundary_coeffs_Y_vector.resize(num_species); - d_laplac_internal_coeffs_Y_vector.resize(num_species); - d_laplac_boundary_coeffs_Y_vector.resize(num_species); - d_boundary_rhoD_vector.resize(num_species); - - for (size_t i = 0; i < num_species; ++i){ - //checkCudaErrors(cudaMalloc((void**)&d_Y[i], cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_rhoD_vector[i], cell_bytes)); - } - checkCudaErrors(cudaMalloc((void**)&d_Y, cell_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_rho_new, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_volume, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_pressure, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_velocity_old, cell_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_weight, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_weight_upwind, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_face, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_deltaCoeffs, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_phi, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_phi_init, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_face_vector, face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_nuEff, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_rhoD, cell_bytes * 
num_species)); - checkCudaErrors(cudaMalloc((void**)&d_alpha, cell_bytes)); - total_bytes += (cell_bytes * (5 + 2*num_species) + face_bytes * 6 + cell_vec_bytes + face_vec_bytes); - - checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int))); - checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_id, boundary_face_index_bytes)); - total_bytes += (boundary_face_index_bytes + (num_boundary_cells+1) * sizeof(int)); - - checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure_init, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_phi_init, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_phi, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity_init, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_face_vector, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_face, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_deltaCoeffs, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_init, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_init, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_init, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_init, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff, 
boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff_init, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_rho, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_init, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_mut_sct, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_UpatchType, boundary_face_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_YpatchType, boundary_face_index_bytes)); - for (size_t i = 0; i < num_species; ++i){ - checkCudaErrors(cudaMalloc((void**)&d_boundary_Y_vector[i], boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_Y_init_vector[i], boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_Y_vector[i], boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_Y_vector[i], boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_Y_vector[i], boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_Y_vector[i], boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_rhoD_vector[i], boundary_face_bytes)); - } - checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_Y, boundary_face_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_Y, boundary_face_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_Y, boundary_face_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_Y, boundary_face_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_rhoD, boundary_face_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_alpha, boundary_face_bytes)); - - total_bytes += (boundary_face_bytes*10 + boundary_face_vec_bytes * 11); - - // checkCudaErrors(cudaMalloc((void**)&d_A_csr, csr_value_vec_bytes)); - // checkCudaErrors(cudaMalloc((void**)&d_b, 
cell_vec_bytes)); - // checkCudaErrors(cudaMalloc((void**)&d_psi, cell_vec_bytes)); - total_bytes += (boundary_face_bytes + boundary_face_vec_bytes * 3); - - checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A, csr_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A_init, csr_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_turbSrc_b, cell_vec_bytes)); - total_bytes += (2*csr_value_bytes + cell_vec_bytes); - - checkCudaErrors(cudaMalloc((void**)&d_permedIndex, face_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_bouPermedIndex, boundary_face_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_tmpPermutatedList, csr_col_index_bytes)); - total_bytes += (face_index_bytes + boundary_face_index_bytes + csr_col_index_bytes); - - checkCudaErrors(cudaMalloc((void**)&d_grad, num_cells * 9 * sizeof(double))); - checkCudaErrors(cudaMalloc((void**)&d_grad_boundary, boundary_face_bytes * 9)); - checkCudaErrors(cudaMalloc((void**)&d_grad_boundary_init, boundary_cell_bytes * 9)); - total_bytes += (num_cells * 9 * sizeof(double) + boundary_face_bytes * 9 + boundary_cell_bytes * 9); // FIXME: rename - - checkCudaErrors(cudaMalloc((void**)&d_hDiffCorrFlux, 3 * cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_diffAlphaD, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_hDiffCorrFlux, 3 * boundary_face_bytes)); - fprintf(stderr, "Total bytes malloc on GPU: %.2fMB\n", total_bytes * 1.0 / 1024 / 1024); + // deconstructor + ~dfMatrixDataBase(); - checkCudaErrors(cudaMemcpyAsync(d_A_csr_row_index, h_A_csr_row_index, csr_row_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_A_csr_col_index, h_A_csr_col_index, csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_A_csr_diag_index, h_A_csr_diag_index, cell_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_offset, h_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int), 
cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_id, h_boundary_cell_id, boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + // member function + void setConstantValues(int num_cells, int num_surfaces, int num_boundary_surfaces, + int num_patches, std::vector patch_sizes, + int num_species, double rdelta_t); + void setConstantIndexes(const int *owner, const int *neighbor); - checkCudaErrors(cudaMemcpyAsync(d_volume, h_volume, cell_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_weight, h_weight, face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_face, h_face, face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_deltaCoeffs, h_deltaCoeffs, face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_face_vector, h_face_vector, face_vec_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_face_vector, h_boundary_face_vector, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_face, h_boundary_face, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_deltaCoeffs, h_boundary_deltaCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_UpatchType, patch_type[0].data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_YpatchType, patch_type[1].data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + void createConstantFieldsInternal(); + void createConstantFieldsBoundary(); + void initConstantFieldsInternal(const double *sf, const double *mag_sf, + const double *weight, const double *delta_coeffs, const double *volume); + void initConstantFieldsBoundary(const double *boundary_sf, const double *boundary_mag_sf, + const double *boundary_delta_coeffs); - 
checkCudaErrors(cudaMemcpyAsync(d_permedIndex, permedIndex.data(), face_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_bouPermedIndex, boundPermutationList.data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_tmpPermutatedList, tmpPermutatedList.data(), csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); - }; + void createNonConstantFieldsInternal(); + void createNonConstantFieldsBoundary(); + void initNonConstantFieldsInternal(const double *y); + void initNonConstantFieldsBoundary(const double *boundary_y); - ~dfMatrixDataBase(){ - std::cout << "Destructor called." << std::endl; - // TODO: free pointers - - }; }; diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index d4f5a7ab0..4ecbc25c8 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -1,48 +1,231 @@ #include "dfMatrixDataBase.H" +dfMatrixDataBase::dfMatrixDataBase() { + checkCudaErrors(cudaStreamCreate(&stream)); +} -void constructBoundarySelector(std::vector& patchTypeSelector, const std::string& patchTypeStr, - const int patchSize) -{ - boundaryConditions patchCondition; - std::vector tmpSelector; - static std::map BCMap = { - {"zeroGradient", zeroGradient}, - {"fixedValue", fixedValue}, - {"empty", empty}, - {"coupled", coupled} - }; - auto iter = BCMap.find(patchTypeStr); - if (iter != BCMap.end()) { - patchCondition = iter->second; - } else { - throw std::runtime_error("Unknown boundary condition: " + patchTypeStr); +dfMatrixDataBase::~dfMatrixDataBase() { + // destroy cuda resources + checkCudaErrors(cudaStreamDestroy(stream)); + if (graph_created) { + checkCudaErrors(cudaGraphExecDestroy(graph_instance)); + checkCudaErrors(cudaGraphDestroy(graph)); } - // zeroGradient labeled as 0, fixedValue labeled as 1, coupled labeled as 2 - switch (patchCondition){ - case zeroGradient: - { - tmpSelector.resize(patchSize, 0); - patchTypeSelector.insert(patchTypeSelector.end(), 
tmpSelector.begin(), tmpSelector.end()); - break; - } - case fixedValue: - { - tmpSelector.resize(patchSize, 1); - patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); - break; + // TODO: free pointers +} + +void dfMatrixDataBase::setConstantValues(int num_cells, int num_surfaces, int num_boundary_surfaces, + int num_patches, std::vector patch_sizes, + int num_species, double rdelta_t) { + // constant values -- basic + this->num_cells = num_cells; + this->num_surfaces = num_surfaces; + this->num_boundary_surfaces = num_boundary_surfaces; + this->num_patches = num_patches; + this->patch_sizes = patch_sizes; + this->num_species = num_species; + this->rdelta_t = rdelta_t; + + // constant values -- ldu bytesize + cell_value_bytes = num_cells * sizeof(double); + cell_value_vec_bytes = num_cells * 3 * sizeof(double); + cell_value_tsr_bytes = num_cells * 9 * sizeof(double); + cell_index_bytes = num_cells * sizeof(int); + surface_value_bytes = num_surfaces * sizeof(double); + surface_index_bytes = num_surfaces * sizeof(int); + surface_value_vec_bytes = num_surfaces * 3 * sizeof(double); + boundary_surface_value_bytes = num_boundary_surfaces * sizeof(double); + boundary_surface_value_vec_bytes = num_boundary_surfaces * 3 * sizeof(double); + boundary_surface_value_tsr_bytes = num_boundary_surfaces * 9 * sizeof(double); + boundary_surface_index_bytes = num_boundary_surfaces * sizeof(int); + + // constant values -- csr bytesize + csr_row_index_bytes = (num_cells + 1) * sizeof(int); + csr_col_index_bytes = (num_cells + num_surfaces * 2) * sizeof(int); + csr_value_bytes = (num_cells + num_surfaces * 2) * sizeof(double); + csr_value_vec_bytes = (num_cells + num_surfaces * 2) * 3 * sizeof(double); +} + +void dfMatrixDataBase::setConstantIndexes(const int *owner, const int *neighbor) { + // build d_owner, d_neighbor + checkCudaErrors(cudaMalloc((void**)&d_owner, surface_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_neighbor, 
surface_index_bytes)); + checkCudaErrors(cudaMemcpyAsync(d_owner, owner, surface_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_neighbor, neighbor, surface_index_bytes, cudaMemcpyHostToDevice, stream)); + + + // build d_lower_to_csr_index, d_diag_to_csr_index, d_upper_to_csr_index + std::vector upperNum(num_cells, 0); + std::vector lowerNum(num_cells, 0); + std::vector lowerPermListInit(num_surfaces); + + int *upperOffset = (int*)calloc(num_cells + 1, sizeof(int)); + int *lowerOffset = (int*)calloc(num_cells + 1, sizeof(int)); + + for(int faceI = 0; faceI < num_surfaces; ++faceI){ + upperNum[owner[faceI]] ++; + lowerNum[neighbor[faceI]] ++; + } + std::partial_sum(upperNum.begin(), upperNum.end(), + upperOffset+1); + std::partial_sum(lowerNum.begin(), lowerNum.end(), + lowerOffset+1); + + std::iota(lowerPermListInit.begin(), lowerPermListInit.end(), 0); + + std::multimap permutation; + for (int i = 0; i < num_surfaces; ++i){ + permutation.insert(std::make_pair(neighbor[i], lowerPermListInit[i])); + } + std::vector> permPair(permutation.begin(), permutation.end()); + std::sort(permPair.begin(), permPair.end(), [] + (const std::pair& pair1, const std::pair& pair2){ + if (pair1.first != pair2.first) { + return pair1.first < pair2.first; + } else { + return pair1.second < pair2.second; } - case empty: - { - tmpSelector.resize(patchSize, 2); - patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); - break; + }); + + std::vector lowerPermList; + std::transform(permPair.begin(), permPair.end(), std::back_inserter(lowerPermList), [] + (const std::pair& pair) { + return pair.second; + }); + + std::vector lowCSRIndex, uppCSRIndex, diagCSRIndex, CSRRowIndex, CSRColIndex; + int uppIndexInCSR = 0, uppIndexInLdu = 0, lowIndexInCSR = 0, lowIndexInLdu = 0, lowNumInLdu = 0; + CSRRowIndex.push_back(0); + CSRColIndex.resize(2 * num_surfaces + num_cells); + lowCSRIndex.resize(num_surfaces); + for (int i = 0; i < 
num_cells; ++i) { + int numUppPerRow = upperOffset[i + 1] - upperOffset[i]; + int numLowPerRow = lowerOffset[i + 1] - lowerOffset[i]; + int numNZBefore = upperOffset[i] + lowerOffset[i] + i; // add diag + // csr row index + CSRRowIndex.push_back(numNZBefore); + // upper + for (int j = 0; j < numUppPerRow; ++j) { + uppIndexInCSR = numNZBefore + numLowPerRow + 1 + j; // 1 means diag + uppCSRIndex.push_back(uppIndexInCSR); + CSRColIndex[uppIndexInCSR] = neighbor[uppIndexInLdu]; // fill upper entry in CSRColIndex + uppIndexInLdu ++; } - case coupled: - { - tmpSelector.resize(patchSize, 3); - patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); - break; + // lower + for (int j = 0; j < numLowPerRow; ++j) { + lowIndexInCSR = numNZBefore + j; + lowIndexInLdu = lowerPermList[lowNumInLdu]; + lowCSRIndex[lowIndexInLdu] = lowIndexInCSR; + CSRColIndex[lowIndexInCSR] = owner[lowIndexInLdu]; // fill lower entry in CSRColIndex + lowNumInLdu ++; } + // diag + int diagIndexInCSR = numNZBefore + numLowPerRow; + diagCSRIndex.push_back(diagIndexInCSR); + CSRColIndex[diagIndexInCSR] = i; // fill diag entry in CSRColIndex } + + checkCudaErrors(cudaMalloc((void**)&d_lower_to_csr_index, surface_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_diag_to_csr_index, cell_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_upper_to_csr_index, surface_index_bytes)); + checkCudaErrors(cudaMemcpyAsync(d_lower_to_csr_index, lowCSRIndex.data(), surface_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_diag_to_csr_index, diagCSRIndex.data(), cell_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_upper_to_csr_index, uppCSRIndex.data(), surface_index_bytes, cudaMemcpyHostToDevice, stream)); + + + // build d_csr_row_index, d_csr_col_index + checkCudaErrors(cudaMalloc((void**)&d_csr_row_index, csr_row_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_csr_col_index, csr_col_index_bytes)); 
+ checkCudaErrors(cudaMemcpyAsync(d_csr_row_index, CSRRowIndex.data(), csr_row_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_csr_col_index, CSRColIndex.data(), csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); +} + +void dfMatrixDataBase::createConstantFieldsInternal() { + checkCudaErrors(cudaMalloc((void**)&d_sf, surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_mag_sf, surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_weight, surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_delta_coeffs, surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_volume, cell_value_bytes)); +} + +void dfMatrixDataBase::createConstantFieldsBoundary() { + checkCudaErrors(cudaMalloc((void**)&d_boundary_sf, boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_mag_sf, boundary_surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_delta_coeffs, boundary_surface_value_bytes)); +} + +void dfMatrixDataBase::initConstantFieldsInternal(const double *sf, const double *mag_sf, + const double *weight, const double *delta_coeffs, const double *volume) { + checkCudaErrors(cudaMemcpyAsync(d_sf, sf, surface_value_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_mag_sf, mag_sf, surface_value_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_weight, weight, surface_value_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_delta_coeffs, delta_coeffs, surface_value_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_volume, volume, cell_value_bytes, cudaMemcpyHostToDevice, stream)); +} + +void dfMatrixDataBase::initConstantFieldsBoundary(const double *boundary_sf, const double *boundary_mag_sf, + const double *boundary_delta_coeffs) { + checkCudaErrors(cudaMemcpyAsync(d_boundary_sf, boundary_sf, boundary_surface_value_vec_bytes, cudaMemcpyHostToDevice, 
stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_mag_sf, boundary_mag_sf, boundary_surface_value_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_delta_coeffs, boundary_delta_coeffs, boundary_surface_value_bytes, cudaMemcpyHostToDevice, stream)); +} + +void dfMatrixDataBase::createNonConstantFieldsInternal() { + checkCudaErrors(cudaMalloc((void**)&d_rho, cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_u, cell_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_y, cell_value_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_he, cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_p, cell_value_bytes)); + + checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_u_old, cell_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_y_old, cell_value_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_he_old, cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_p_old, cell_value_bytes)); + + checkCudaErrors(cudaMalloc((void**)&d_phi, surface_value_bytes)); + + // computed on GPU, used on CPU, need memcpyd2h + checkCudaErrors(cudaMallocHost((void**)&h_rho, cell_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_u, cell_value_vec_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_y, cell_value_bytes * num_species)); + checkCudaErrors(cudaMallocHost((void**)&h_he, cell_value_bytes)); + + // computed on CPU, used on GPU, need memcpyh2d + checkCudaErrors(cudaMallocHost((void**)&h_p, cell_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_phi, surface_value_bytes)); +} + +void dfMatrixDataBase::createNonConstantFieldsBoundary() { + checkCudaErrors(cudaMalloc((void**)&d_boundary_rho, boundary_surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_u, boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_y, boundary_surface_value_bytes * num_species)); + 
checkCudaErrors(cudaMalloc((void**)&d_boundary_he, boundary_surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_p, boundary_surface_value_bytes)); + + checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_old, boundary_surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_u_old, boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_y_old, boundary_surface_value_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_he_old, boundary_surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_p_old, boundary_surface_value_bytes)); + + checkCudaErrors(cudaMalloc((void**)&d_boundary_phi, boundary_surface_value_bytes)); + + // computed on GPU, used on CPU, need memcpyd2h + checkCudaErrors(cudaMallocHost((void**)&h_boundary_rho, boundary_surface_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_boundary_u, boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_boundary_y, boundary_surface_value_bytes * num_species)); + checkCudaErrors(cudaMallocHost((void**)&h_boundary_he, boundary_surface_value_bytes)); + + // computed on CPU, used on GPU, need memcpyh2d + checkCudaErrors(cudaMallocHost((void**)&h_boundary_p, boundary_surface_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_boundary_phi, boundary_surface_value_bytes)); +} + +void dfMatrixDataBase::initNonConstantFieldsInternal(const double *y) { + checkCudaErrors(cudaMemcpyAsync(d_y, y, cell_value_bytes * num_species, cudaMemcpyHostToDevice, stream)); +} + +void dfMatrixDataBase::initNonConstantFieldsBoundary(const double *boundary_y) { + checkCudaErrors(cudaMemcpyAsync(d_boundary_y, boundary_y, boundary_surface_value_bytes* num_species, cudaMemcpyHostToDevice, stream)); } diff --git a/src_gpu/AmgXSolver.H b/src_gpu_orig/AmgXSolver.H similarity index 100% rename from src_gpu/AmgXSolver.H rename to src_gpu_orig/AmgXSolver.H diff --git a/src_gpu/AmgXSolver.cu 
b/src_gpu_orig/AmgXSolver.cu similarity index 100% rename from src_gpu/AmgXSolver.cu rename to src_gpu_orig/AmgXSolver.cu diff --git a/src_gpu_orig/CMakeLists.txt b/src_gpu_orig/CMakeLists.txt new file mode 100644 index 000000000..6e4a7efef --- /dev/null +++ b/src_gpu_orig/CMakeLists.txt @@ -0,0 +1,39 @@ +# +# dfMatrix CMake configuration +# +cmake_minimum_required(VERSION 3.5) + +project(dfMatrix LANGUAGES CXX CUDA) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +find_package(CUDA REQUIRED) +find_package(MPI REQUIRED) +find_package(CUDAToolkit REQUIRED) +find_library(LIBAMGXSH amgxsh PATHS $ENV{AMGX_DIR}/build) + +include_directories( + ${MPI_INCLUDE_PATH} + ${CUDA_INCLUDE_DIRS} + $ENV{AMGX_DIR}/include +) + +add_library(${PROJECT_NAME} + SHARED + dfUEqn.cu + dfRhoEqn.cu + dfYEqn.cu + dfEEqn.cu + AmgXSolver.cu + dfMatrixDataBase.cu) + +target_link_libraries(${PROJECT_NAME} + ${MPI_LIBRARIES} + ${CUDA_LIBRARIES} + ${LIBAMGXSH} +) +target_compile_options(dfMatrix PUBLIC -g) +option(DFMATRIX_ENABLE_DETAILED_DEBUG "Enable detailed debug build" OFF) +if (DFMATRIX_ENABLE_DETAILED_DEBUG) + target_compile_definitions(${PROJECT_NAME} PRIVATE DEBUG) +endif() diff --git a/src_gpu/GPUMesh.H b/src_gpu_orig/GPUMesh.H similarity index 100% rename from src_gpu/GPUMesh.H rename to src_gpu_orig/GPUMesh.H diff --git a/src_gpu/GPUfield.H b/src_gpu_orig/GPUfield.H similarity index 100% rename from src_gpu/GPUfield.H rename to src_gpu_orig/GPUfield.H diff --git a/src_gpu/GPUfield.cpp b/src_gpu_orig/GPUfield.cpp similarity index 100% rename from src_gpu/GPUfield.cpp rename to src_gpu_orig/GPUfield.cpp diff --git a/src_gpu/dfEEqn.H b/src_gpu_orig/dfEEqn.H similarity index 100% rename from src_gpu/dfEEqn.H rename to src_gpu_orig/dfEEqn.H diff --git a/src_gpu/dfEEqn.cu b/src_gpu_orig/dfEEqn.cu similarity index 100% rename from src_gpu/dfEEqn.cu rename to src_gpu_orig/dfEEqn.cu diff --git a/src_gpu_orig/dfMatrixDataBase.H b/src_gpu_orig/dfMatrixDataBase.H new file mode 
100644 index 000000000..8efb4bf62 --- /dev/null +++ b/src_gpu_orig/dfMatrixDataBase.H @@ -0,0 +1,641 @@ +#pragma once + +#include +#include +#include "cuda_profiler_api.h" +#include +#include "nvtx3/nvToolsExt.h" +#include +#include +#include +#include +#include +#include +#include + + +static const char *_cudaGetErrorEnum(cudaError_t error) { + return cudaGetErrorName(error); +} + +template +void check(T result, char const *const func, const char *const file, + int const line) { + if (result) { + fprintf(stderr, "cuda error at %s:%d code=%d(%s) \"%s\" \n", file, line, + static_cast(result), _cudaGetErrorEnum(result), func); + exit(EXIT_FAILURE); + } +} + +#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) + +inline void checkVectorEqual(int count, const double* basevec, double* vec, double max_relative_error) { + for (size_t i = 0; i < count; ++i) + { + double abs_diff = fabs(basevec[i] - vec[i]); + double rel_diff = fabs(basevec[i] - vec[i]) / fabs(basevec[i]); + // if (abs_diff > 1e-12 && rel_diff > max_relative_error && !std::isinf(rel_diff)) + if (abs_diff > 1e-15 && rel_diff > max_relative_error) + fprintf(stderr, "mismatch index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff); + } +} + +enum boundaryConditions{ + zeroGradient, + fixedValue, + coupled, + empty +}; + +void constructBoundarySelector(std::vector& patchTypeSelector, const std::string& patchTypeStr, const int patchSize); + +struct dfMatrixDataBase +{ + // - cuda resource + cudaStream_t stream; + + // - number of cell size + int num_cells; + // - number of face size + int num_surfaces; + // - number of offdiagnal entry size (2*num_surfaces) + int num_faces; + // - number of boundary cells + int num_boundary_cells; + // - number of boundary faces + int num_boundary_faces; + + int num_species; + + // - mesh variables + // - csr_row_index + int *h_A_csr_row_index=nullptr, *d_A_csr_row_index=nullptr; + // - csr_col_index + int 
*h_A_csr_col_index=nullptr, *d_A_csr_col_index=nullptr; + // - csr_diag_index + int *h_A_csr_diag_index=nullptr, *d_A_csr_diag_index=nullptr; + + // - the pre-permutated and post-permutated interpolation weight list + std::vector h_weight_vec_init, h_weight_vec; + // - the pre-permutated and post-permutated flux (phi) list + std::vector h_phi_vec_init, h_phi_vec; + // - the pre-permutated and post-permutated cell face vector list + std::vector h_face_vector_vec_init, h_face_vector_vec; + std::vector h_face_vec_init, h_face_vec; + std::vector h_deltaCoeffs_vec_init, h_deltaCoeffs_vec; + // - the host pointer to rho_new, rho_old, velocity_old, pressure and volume list + double *h_rho_new = nullptr, *h_rho_old = nullptr, *h_velocity_old = nullptr, + *h_pressure = nullptr; + const double *h_volume = nullptr; + // - the host pointer to the pre-permutated and post-permutated interpolation weight list + double *h_weight_init = nullptr, *h_weight = nullptr; + // - the host pointer to the pre-permutated and post-permutated flux (phi) list + double *h_phi_init = nullptr, *h_phi = nullptr; + // - the host pointer to the pre-permutated and post-permutated cell face vector list + double *h_face_vector_init = nullptr, *h_face_vector = nullptr; + double *h_face_init = nullptr, *h_face = nullptr; + double *h_deltaCoeffs_init = nullptr, *h_deltaCoeffs = nullptr; + // - the device pointer to rho_new, rho_old, velocity_old, pressure and volume list + double *d_rho_new = nullptr, *d_rho_old = nullptr, *d_velocity_old = nullptr, + *d_pressure = nullptr, *d_volume = nullptr; + // - the device pointer to Y(vector Yi) + //std::vector d_Y; + double *d_Y = nullptr; + // - the device pointer to the pre-permutated and post-permutated interpolation weight list + double *d_weight_init = nullptr, *d_weight = nullptr; + double *d_weight_upwind = nullptr; + // - the device pointer to the pre-permutated and post-permutated flux (phi) list + double *d_phi_init = nullptr, *d_phi = nullptr; + // - the 
device pointer to the pre-permutated and post-permutated cell face vector list + double *d_face_vector_init = nullptr, *d_face_vector = nullptr; + double *d_face_init = nullptr, *d_face = nullptr; + double *d_deltaCoeffs_init = nullptr, *d_deltaCoeffs = nullptr; + std::vector d_rhoD_vector; + + double *d_hDiffCorrFlux = nullptr; + double *d_diffAlphaD = nullptr; + double *d_rhoD = nullptr; + double *d_alpha = nullptr; + + double rdelta_t = 1/1e-6; + + /** + * @brief boundary related variables + */ + int *h_boundary_cell_offset = nullptr, *d_boundary_cell_offset=nullptr; + int *h_boundary_cell_id = nullptr, *d_boundary_cell_id = nullptr; + double *h_internal_coeffs = nullptr, *h_boundary_coeffs = nullptr, + *h_boundary_pressure = nullptr, *h_boundary_face_vector = nullptr, + *h_boundary_face = nullptr, *d_boundary_face = nullptr, + *h_boundary_deltaCoeffs = nullptr, *d_boundary_deltaCoeffs = nullptr, + *d_internal_coeffs = nullptr, *d_boundary_coeffs = nullptr, + *d_internal_coeffs_init = nullptr, *d_boundary_coeffs_init = nullptr, + *d_laplac_internal_coeffs = nullptr, *d_laplac_boundary_coeffs = nullptr, + *d_laplac_internal_coeffs_init = nullptr, *d_laplac_boundary_coeffs_init = nullptr, + *d_boundary_pressure = nullptr, *d_boundary_face_vector = nullptr, + *d_boundary_pressure_init = nullptr, + *d_boundary_phi = nullptr, *d_boundary_phi_init = nullptr, + *d_boundary_velocity = nullptr, *d_boundary_velocity_init = nullptr, + *d_boundary_nuEff = nullptr, *d_boundary_nuEff_init = nullptr, + *d_boundary_rho = nullptr, *d_boundary_rho_init = nullptr; + std::vector d_boundary_Y_vector; + std::vector d_boundary_Y_init_vector; + std::vector d_internal_coeffs_Y_vector; + std::vector d_boundary_coeffs_Y_vector; + std::vector d_laplac_internal_coeffs_Y_vector; + std::vector d_laplac_boundary_coeffs_Y_vector; + double *d_internal_coeffs_Y = nullptr; + double *d_boundary_coeffs_Y = nullptr; + double *d_laplac_internal_coeffs_Y = nullptr; + double *d_laplac_boundary_coeffs_Y 
= nullptr; + std::vector d_boundary_rhoD_vector; + double *d_boundary_mut_sct = nullptr; + double *d_boundary_rhoD = nullptr; + double *d_boundary_alpha = nullptr; + + double *d_boundary_hDiffCorrFlux = nullptr; + int *d_boundary_UpatchType = nullptr; + int *d_boundary_YpatchType = nullptr; + + std::vector boundPermutationList; + std::vector ueqn_internalCoeffs, ueqn_boundaryCoeffs; + std::vector boundary_face_vector; + std::vector boundary_pressure; + std::vector boundary_face; + std::vector boundary_deltaCoeffs; + std::vector> patch_type_init; + std::vector> patch_type; + + // - the device pointer to the permutated index list + std::vector permedIndex; + int *d_permedIndex=nullptr; + int *d_bouPermedIndex = nullptr; + + + // bytesize + // - bytes of diagnal entries + size_t cell_bytes; + // - bytes of diagnal entries (vector) + size_t cell_vec_bytes; + // - bytes of diagnal index + size_t cell_index_bytes; + // - bytes of diagnal index + size_t face_bytes; + size_t face_vec_bytes; + size_t face_index_bytes; + + size_t boundary_cell_bytes; + size_t boundary_cell_vec_bytes; + size_t boundary_cell_index_bytes; + + size_t boundary_face_bytes; + size_t boundary_face_vec_bytes; + size_t boundary_face_index_bytes; + + // A_csr has one more element in each row: itself + size_t csr_row_index_bytes; + size_t csr_col_index_bytes; + size_t csr_value_bytes; + size_t csr_value_vec_bytes; + + // extra matrix information + double *d_turbSrc_A = nullptr, *d_turbSrc_b = nullptr, *d_turbSrc_A_init = nullptr; + std::vector h_turbSrc_init_mtx_vec, h_turbSrc_init_1mtx; + std::vector h_turbSrc_init_src_vec, h_turbSrc_src_vec; + std::vector tmpPermutatedList; + int * d_tmpPermutatedList = nullptr; + + // double *h_A_csr = nullptr, *h_b = nullptr, *h_psi = nullptr; + // double *d_A_csr = nullptr, *d_b = nullptr, *d_psi = nullptr; + + int num_iteration; + + double time_monitor_CPU; + double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test; + + double* d_grad 
= nullptr; + double* d_grad_boundary = nullptr, *d_grad_boundary_init = nullptr; + double* d_nuEff = nullptr; + + // constructor + dfMatrixDataBase(); + dfMatrixDataBase(int num_surfaces, int num_cells, int num_boundary_faces, int num_species, int & num_boundary_cells_output, + const int *neighbour, const int *owner, const double* volume, const double* weight, const double* face_vector, const double* face, + const double* deltaCoeffs, std::vector boundary_face_vector_init, std::vector boundary_face_init, + std::vector boundary_deltaCoeffs_init, std::vector boundary_cell_id_init, std::vector> patch_type_init) + : num_cells(num_cells), num_faces(num_surfaces*2), num_surfaces(num_surfaces), num_species(num_species), num_iteration(0), + num_boundary_faces(num_boundary_faces), h_volume(volume), patch_type_init(patch_type_init) + { + // create cuda stream + checkCudaErrors(cudaStreamCreate(&stream)); + + // allocate field pointer in pin memory + cudaMallocHost(&h_phi_init, num_faces * sizeof(double)); + cudaMallocHost(&h_rho_old, num_cells * sizeof(double)); + + h_weight_vec_init.resize(num_faces); + h_weight_vec.resize(num_faces); + h_face_vector_vec_init.resize(num_faces*3); + h_face_vector_vec.resize(num_faces*3); + h_face_vec_init.resize(num_faces); + h_face_vec.resize(num_faces); + h_deltaCoeffs_vec_init.resize(num_faces); + h_deltaCoeffs_vec.resize(num_faces); + h_turbSrc_init_mtx_vec.resize(num_faces + num_cells); + h_turbSrc_init_1mtx.resize(num_faces + num_cells); + h_turbSrc_init_src_vec.resize(3*num_cells); + h_turbSrc_src_vec.resize(3*num_cells); + + // byte sizes + cell_bytes = num_cells * sizeof(double); + cell_vec_bytes = num_cells * 3 * sizeof(double); + cell_index_bytes = num_cells * sizeof(int); + + face_bytes = num_faces * sizeof(double); + face_vec_bytes = num_faces * 3 * sizeof(double); + face_index_bytes = num_faces * sizeof(int); + + // A_csr has one more element in each row: itself + csr_row_index_bytes = (num_cells + 1) * sizeof(int); + 
csr_col_index_bytes = (num_cells + num_faces) * sizeof(int); + csr_value_bytes = (num_cells + num_faces) * sizeof(double); + csr_value_vec_bytes = (num_cells + num_faces) * 3 * sizeof(double); + + /************************construct mesh variables****************************/ + /** + * 1. h_csr_row_index & h_csr_diag_index + */ + std::vector h_mtxEntry_perRow_vec(num_cells); + std::vector h_csr_diag_index_vec(num_cells); + std::vector h_csr_row_index_vec(num_cells + 1, 0); + + for (int faceI = 0; faceI < num_surfaces; faceI++) + { + h_csr_diag_index_vec[neighbour[faceI]]++; + h_mtxEntry_perRow_vec[neighbour[faceI]]++; + h_mtxEntry_perRow_vec[owner[faceI]]++; + } + + // - consider diagnal element in each row + std::transform(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_mtxEntry_perRow_vec.begin(), [](int n) + {return n + 1;}); + // - construct h_csr_row_index & h_csr_diag_index + std::partial_sum(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_csr_row_index_vec.begin()+1); + // - assign h_csr_row_index & h_csr_diag_index + h_A_csr_row_index = h_csr_row_index_vec.data(); + h_A_csr_diag_index = h_csr_diag_index_vec.data(); + + /** + * 2. 
h_csr_col_index + */ + std::vector rowIndex(num_faces + num_cells), colIndex(num_faces + num_cells), diagIndex(num_cells); + std::iota(diagIndex.begin(), diagIndex.end(), 0); + + // initialize the RowIndex (rowIndex of lower + upper + diagnal) + std::copy(neighbour, neighbour + num_surfaces, rowIndex.begin()); + std::copy(owner, owner + num_surfaces, rowIndex.begin() + num_surfaces); + std::copy(diagIndex.begin(), diagIndex.end(), rowIndex.begin() + num_faces); + // initialize the ColIndex (colIndex of lower + upper + diagnal) + std::copy(owner, owner + num_surfaces, colIndex.begin()); + std::copy(neighbour, neighbour + num_surfaces, colIndex.begin() + num_surfaces); + std::copy(diagIndex.begin(), diagIndex.end(), colIndex.begin() + num_faces); + + // - construct hashTable for sorting + std::multimap rowColPair; + for (int i = 0; i < 2*num_surfaces+num_cells; i++) + { + rowColPair.insert(std::make_pair(rowIndex[i], colIndex[i])); + } + // - sort + std::vector> globalPerm(rowColPair.begin(), rowColPair.end()); + std::sort(globalPerm.begin(), globalPerm.end(), [] + (const std::pair& pair1, const std::pair& pair2){ + if (pair1.first != pair2.first) { + return pair1.first < pair2.first; + } else { + return pair1.second < pair2.second; + } + }); + + std::vector h_csr_col_index_vec; + std::transform(globalPerm.begin(), globalPerm.end(), std::back_inserter(h_csr_col_index_vec), [] + (const std::pair& pair) { + return pair.second; + }); + h_A_csr_col_index = h_csr_col_index_vec.data(); + + // construct a tmp permutated List for add fvMatrix + std::vector tmp_permutation(2*num_surfaces + num_cells); + std::vector tmp_rowIndex(2*num_surfaces + num_cells); + std::iota(tmp_permutation.begin(), tmp_permutation.end(), 0); + std::copy(neighbour, neighbour + num_surfaces, tmp_rowIndex.begin()); + std::copy(diagIndex.begin(), diagIndex.end(), tmp_rowIndex.begin() + num_surfaces); + std::copy(owner, owner + num_surfaces, tmp_rowIndex.begin() + num_surfaces + num_cells); + 
std::multimap tmpPair; + for (int i = 0; i < 2*num_surfaces+num_cells; i++) + { + tmpPair.insert(std::make_pair(tmp_rowIndex[i], tmp_permutation[i])); + } + std::vector> tmpPerm(tmpPair.begin(), tmpPair.end()); + std::sort(tmpPerm.begin(), tmpPerm.end(), [] + (const std::pair& pair1, const std::pair& pair2){ + if (pair1.first != pair2.first) { + return pair1.first < pair2.first; + } else { + return pair1.second < pair2.second; + } + }); + std::transform(tmpPerm.begin(), tmpPerm.end(), std::back_inserter(tmpPermutatedList), [] + (const std::pair& pair) { + return pair.second; + }); + + /** + * 3. boundary imformations + */ + // get boundPermutation and offset lists + std::vector boundPermutationListInit(num_boundary_faces); + std::vector boundOffsetList; + std::iota(boundPermutationListInit.begin(), boundPermutationListInit.end(), 0); + + // - construct hashTable for sorting + std::multimap boundPermutation; + for (int i = 0; i < num_boundary_faces; i++) + { + boundPermutation.insert(std::make_pair(boundary_cell_id_init[i], boundPermutationListInit[i])); + } + + // - sort + std::vector> boundPermPair(boundPermutation.begin(), boundPermutation.end()); + std::sort(boundPermPair.begin(), boundPermPair.end(), [] + (const std::pair& pair1, const std::pair& pair2){ + if (pair1.first != pair2.first) { + return pair1.first < pair2.first; + } else { + return pair1.second < pair2.second; + } + }); + + // - construct boundPermedIndex and boundary_cell_id + std::vector boundary_cell_id; + boundPermutationList.clear(); + std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundary_cell_id), [] + (const std::pair& pair) { + return pair.first; + }); + std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundPermutationList), [] + (const std::pair& pair) { + return pair.second; + }); + + // construct boundary_cell_offset + std::map countMap; + std::vector boundaryCellcount; + for (const auto& cellIndex : boundary_cell_id) + ++ 
countMap[cellIndex]; + for (const auto& [cellIndex, count] : countMap) + boundaryCellcount.push_back(count); + + num_boundary_cells = boundaryCellcount.size(); + num_boundary_cells_output = num_boundary_cells; + + std::vector boundary_cell_offset(boundaryCellcount.size() + 1, 0); + std::partial_sum(boundaryCellcount.begin(), boundaryCellcount.end(), boundary_cell_offset.begin()+1); + + // assign h_boundary_cell_offset & h_boundary_cell_id + h_boundary_cell_offset = boundary_cell_offset.data(); + h_boundary_cell_id = boundary_cell_id.data(); + + // + boundary_cell_bytes = num_boundary_cells * sizeof(double); + boundary_cell_vec_bytes = num_boundary_cells * 3 * sizeof(double); + boundary_cell_index_bytes = num_boundary_cells * sizeof(int); + + boundary_face_bytes = num_boundary_faces * sizeof(double); + boundary_face_vec_bytes = num_boundary_faces * 3 * sizeof(double); + boundary_face_index_bytes = num_boundary_faces * sizeof(int); + + ueqn_internalCoeffs.resize(3*num_boundary_faces); + ueqn_boundaryCoeffs.resize(3*num_boundary_faces); + + boundary_face_vector.resize(3*num_boundary_faces); + boundary_pressure.resize(num_boundary_faces); + boundary_face.resize(num_boundary_faces); + boundary_deltaCoeffs.resize(num_boundary_faces); + + patch_type.resize(2); + patch_type[0].resize(num_boundary_faces); + patch_type[1].resize(num_boundary_faces); + + /** + * 4. 
permutation list for field variables + */ + std::vector offdiagRowIndex(2*num_surfaces), permIndex(2*num_surfaces); + // - initialize the offdiagRowIndex (rowIndex of lower + rowIndex of upper) + std::copy(neighbour, neighbour + num_surfaces, offdiagRowIndex.begin()); + std::copy(owner, owner + num_surfaces, offdiagRowIndex.begin() + num_surfaces); + + // - initialize the permIndex (0, 1, ..., 2*num_surfaces) + std::iota(permIndex.begin(), permIndex.end(), 0); + + // - construct hashTable for sorting + std::multimap permutation; + for (int i = 0; i < 2*num_surfaces; i++) + { + permutation.insert(std::make_pair(offdiagRowIndex[i], permIndex[i])); + } + // - sort + std::vector> permPair(permutation.begin(), permutation.end()); + std::sort(permPair.begin(), permPair.end(), [] + (const std::pair& pair1, const std::pair& pair2){ + if (pair1.first != pair2.first) { + return pair1.first < pair2.first; + } else { + return pair1.second < pair2.second; + } + }); + // - form permedIndex list + std::transform(permPair.begin(), permPair.end(), std::back_inserter(permedIndex), [] + (const std::pair& pair) { + return pair.second; + }); + + // copy and permutate cell variables + std::copy(weight, weight + num_surfaces, h_weight_vec_init.begin()); + std::copy(weight, weight + num_surfaces, h_weight_vec_init.begin() + num_surfaces); + std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin()); + std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin() + 3*num_surfaces); + std::copy(face, face + num_surfaces, h_face_vec_init.begin()); + std::copy(face, face + num_surfaces, h_face_vec_init.begin() + num_surfaces); + std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin()); + std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin() + num_surfaces); + for (int i = 0; i < num_faces; i++) + { + h_weight_vec[i] = h_weight_vec_init[permedIndex[i]]; + h_face_vec[i] = 
h_face_vec_init[permedIndex[i]]; + h_deltaCoeffs_vec[i] = h_deltaCoeffs_vec_init[permedIndex[i]]; + h_face_vector_vec[i*3] = h_face_vector_vec_init[3*permedIndex[i]]; + h_face_vector_vec[i*3+1] = h_face_vector_vec_init[3*permedIndex[i]+1]; + h_face_vector_vec[i*3+2] = h_face_vector_vec_init[3*permedIndex[i]+2]; + } + h_weight = h_weight_vec.data(); + h_face_vector = h_face_vector_vec.data(); + h_face = h_face_vec.data(); + h_deltaCoeffs = h_deltaCoeffs_vec.data(); + + for (int i = 0; i < num_boundary_faces; i++) + { + boundary_face_vector[3*i] = boundary_face_vector_init[3*boundPermutationList[i]]; + boundary_face_vector[3*i+1] = boundary_face_vector_init[3*boundPermutationList[i]+1]; + boundary_face_vector[3*i+2] = boundary_face_vector_init[3*boundPermutationList[i]+2]; + boundary_face[i] = boundary_face_init[boundPermutationList[i]]; + boundary_deltaCoeffs[i] = boundary_deltaCoeffs_init[boundPermutationList[i]]; + patch_type[0][i] = patch_type_init[0][boundPermutationList[i]]; + patch_type[1][i] = patch_type_init[1][boundPermutationList[i]]; + } + h_boundary_face_vector = boundary_face_vector.data(); + h_boundary_face = boundary_face.data(); + h_boundary_deltaCoeffs = boundary_deltaCoeffs.data(); + + /************************allocate memory on device****************************/ + int total_bytes = 0; + + checkCudaErrors(cudaMalloc((void**)&d_A_csr_row_index, csr_row_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_A_csr_col_index, csr_col_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_A_csr_diag_index, cell_index_bytes)); + total_bytes += (csr_row_index_bytes + csr_col_index_bytes + cell_index_bytes); + + //d_Y.resize(num_species); + d_rhoD_vector.resize(num_species); + d_boundary_Y_vector.resize(num_species); + d_boundary_Y_init_vector.resize(num_species); + d_internal_coeffs_Y_vector.resize(num_species); + d_boundary_coeffs_Y_vector.resize(num_species); + d_laplac_internal_coeffs_Y_vector.resize(num_species); + 
d_laplac_boundary_coeffs_Y_vector.resize(num_species); + d_boundary_rhoD_vector.resize(num_species); + + for (size_t i = 0; i < num_species; ++i){ + //checkCudaErrors(cudaMalloc((void**)&d_Y[i], cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_rhoD_vector[i], cell_bytes)); + } + checkCudaErrors(cudaMalloc((void**)&d_Y, cell_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_rho_new, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_volume, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_pressure, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_velocity_old, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_weight, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_weight_upwind, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_face, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_deltaCoeffs, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_phi, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_phi_init, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_face_vector, face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_nuEff, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_rhoD, cell_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_alpha, cell_bytes)); + total_bytes += (cell_bytes * (5 + 2*num_species) + face_bytes * 6 + cell_vec_bytes + face_vec_bytes); + + checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int))); + checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_id, boundary_face_index_bytes)); + total_bytes += (boundary_face_index_bytes + (num_boundary_cells+1) * sizeof(int)); + + checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure_init, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_phi_init, boundary_face_bytes)); + 
checkCudaErrors(cudaMalloc((void**)&d_boundary_phi, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity_init, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_face_vector, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_face, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_deltaCoeffs, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_init, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_init, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_init, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_init, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff_init, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_rho, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_init, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_mut_sct, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_UpatchType, boundary_face_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_YpatchType, boundary_face_index_bytes)); + for (size_t i = 0; i < num_species; ++i){ + checkCudaErrors(cudaMalloc((void**)&d_boundary_Y_vector[i], boundary_face_bytes)); + 
checkCudaErrors(cudaMalloc((void**)&d_boundary_Y_init_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_Y_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_Y_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_Y_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_Y_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_rhoD_vector[i], boundary_face_bytes)); + } + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_Y, boundary_face_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_Y, boundary_face_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_Y, boundary_face_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_Y, boundary_face_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_rhoD, boundary_face_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_alpha, boundary_face_bytes)); + + total_bytes += (boundary_face_bytes*10 + boundary_face_vec_bytes * 11); + + // checkCudaErrors(cudaMalloc((void**)&d_A_csr, csr_value_vec_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_b, cell_vec_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_psi, cell_vec_bytes)); + total_bytes += (boundary_face_bytes + boundary_face_vec_bytes * 3); + + checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A, csr_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A_init, csr_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_turbSrc_b, cell_vec_bytes)); + total_bytes += (2*csr_value_bytes + cell_vec_bytes); + + checkCudaErrors(cudaMalloc((void**)&d_permedIndex, face_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_bouPermedIndex, boundary_face_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_tmpPermutatedList, csr_col_index_bytes)); + 
total_bytes += (face_index_bytes + boundary_face_index_bytes + csr_col_index_bytes); + + checkCudaErrors(cudaMalloc((void**)&d_grad, num_cells * 9 * sizeof(double))); + checkCudaErrors(cudaMalloc((void**)&d_grad_boundary, boundary_face_bytes * 9)); + checkCudaErrors(cudaMalloc((void**)&d_grad_boundary_init, boundary_cell_bytes * 9)); + total_bytes += (num_cells * 9 * sizeof(double) + boundary_face_bytes * 9 + boundary_cell_bytes * 9); // FIXME: rename + + checkCudaErrors(cudaMalloc((void**)&d_hDiffCorrFlux, 3 * cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_diffAlphaD, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_hDiffCorrFlux, 3 * boundary_face_bytes)); + + fprintf(stderr, "Total bytes malloc on GPU: %.2fMB\n", total_bytes * 1.0 / 1024 / 1024); + + checkCudaErrors(cudaMemcpyAsync(d_A_csr_row_index, h_A_csr_row_index, csr_row_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_A_csr_col_index, h_A_csr_col_index, csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_A_csr_diag_index, h_A_csr_diag_index, cell_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_offset, h_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int), cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_id, h_boundary_cell_id, boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + + checkCudaErrors(cudaMemcpyAsync(d_volume, h_volume, cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_weight, h_weight, face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_face, h_face, face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_deltaCoeffs, h_deltaCoeffs, face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_face_vector, h_face_vector, face_vec_bytes, cudaMemcpyHostToDevice, stream)); + 
checkCudaErrors(cudaMemcpyAsync(d_boundary_face_vector, h_boundary_face_vector, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_face, h_boundary_face, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_deltaCoeffs, h_boundary_deltaCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_UpatchType, patch_type[0].data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_YpatchType, patch_type[1].data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + + checkCudaErrors(cudaMemcpyAsync(d_permedIndex, permedIndex.data(), face_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_bouPermedIndex, boundPermutationList.data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_tmpPermutatedList, tmpPermutatedList.data(), csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); + }; + + ~dfMatrixDataBase(){ + std::cout << "Destructor called." 
<< std::endl; + // TODO: free pointers + + }; +}; + diff --git a/src_gpu_orig/dfMatrixDataBase.cu b/src_gpu_orig/dfMatrixDataBase.cu new file mode 100644 index 000000000..d4f5a7ab0 --- /dev/null +++ b/src_gpu_orig/dfMatrixDataBase.cu @@ -0,0 +1,48 @@ +#include "dfMatrixDataBase.H" + + +void constructBoundarySelector(std::vector& patchTypeSelector, const std::string& patchTypeStr, + const int patchSize) +{ + boundaryConditions patchCondition; + std::vector tmpSelector; + static std::map BCMap = { + {"zeroGradient", zeroGradient}, + {"fixedValue", fixedValue}, + {"empty", empty}, + {"coupled", coupled} + }; + auto iter = BCMap.find(patchTypeStr); + if (iter != BCMap.end()) { + patchCondition = iter->second; + } else { + throw std::runtime_error("Unknown boundary condition: " + patchTypeStr); + } + // zeroGradient labeled as 0, fixedValue labeled as 1, coupled labeled as 2 + switch (patchCondition){ + case zeroGradient: + { + tmpSelector.resize(patchSize, 0); + patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); + break; + } + case fixedValue: + { + tmpSelector.resize(patchSize, 1); + patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); + break; + } + case empty: + { + tmpSelector.resize(patchSize, 2); + patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); + break; + } + case coupled: + { + tmpSelector.resize(patchSize, 3); + patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); + break; + } + } +} diff --git a/src_gpu/dfRhoEqn.H b/src_gpu_orig/dfRhoEqn.H similarity index 100% rename from src_gpu/dfRhoEqn.H rename to src_gpu_orig/dfRhoEqn.H diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu_orig/dfRhoEqn.cu similarity index 100% rename from src_gpu/dfRhoEqn.cu rename to src_gpu_orig/dfRhoEqn.cu diff --git a/src_gpu/dfUEqn.H b/src_gpu_orig/dfUEqn.H similarity index 100% rename from src_gpu/dfUEqn.H rename to 
src_gpu_orig/dfUEqn.H diff --git a/src_gpu/dfUEqn.cu b/src_gpu_orig/dfUEqn.cu similarity index 100% rename from src_gpu/dfUEqn.cu rename to src_gpu_orig/dfUEqn.cu diff --git a/src_gpu/dfYEqn.H b/src_gpu_orig/dfYEqn.H similarity index 100% rename from src_gpu/dfYEqn.H rename to src_gpu_orig/dfYEqn.H diff --git a/src_gpu/dfYEqn.cu b/src_gpu_orig/dfYEqn.cu similarity index 100% rename from src_gpu/dfYEqn.cu rename to src_gpu_orig/dfYEqn.cu From a59a190d2b7fc92d65874d054f20cb3bf410bd7e Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Thu, 3 Aug 2023 00:32:54 +0800 Subject: [PATCH 02/25] add unittest of fvm::div(phi, U) --- GPUTest/GPUTestBase.H | 141 ++++++++++++++++++++++++++++++++++++ GPUTest/createGPUSolver.H | 60 +++++++-------- GPUTest/unittest.C | 9 ++- src_gpu/CMakeLists.txt | 5 +- src_gpu/dfMatrixDataBase.H | 5 +- src_gpu/dfMatrixDataBase.cu | 45 +++++++++++- src_gpu/dfMatrixOpBase.H | 30 ++++++++ src_gpu/dfMatrixOpBase.cu | 140 +++++++++++++++++++++++++++++++++++ 8 files changed, 395 insertions(+), 40 deletions(-) create mode 100644 GPUTest/GPUTestBase.H create mode 100644 src_gpu/dfMatrixOpBase.H create mode 100644 src_gpu/dfMatrixOpBase.cu diff --git a/GPUTest/GPUTestBase.H b/GPUTest/GPUTestBase.H new file mode 100644 index 000000000..e1ffb0bd5 --- /dev/null +++ b/GPUTest/GPUTestBase.H @@ -0,0 +1,141 @@ + +enum initType{ + original, + randomInit +}; + +// unittest of fvm::div(phi, U) +void test_fvm_div_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfaceScalarField& phi, volVectorField& U, initType type) { + int offset = 0; + + // deal with init type + if (type == initType::randomInit) { + // random init phi to (-0.5, 0.5) + // internal + double *phi_internal_ptr = &phi[0]; + std::vector init_phi_internal; + init_phi_internal.resize(dfDataBase.num_surfaces); + for (int i = 0; i < dfDataBase.num_surfaces; i++) { + init_phi_internal[i] = (rand() % 10000 - 5000) / 10000.0; + } + memcpy(phi_internal_ptr, init_phi_internal.data(), 
dfDataBase.surface_value_bytes); + // boundary + offset = 0; + forAll(U.boundaryField(), patchi) + { + fvsPatchScalarField& patchPhi = phi.boundaryFieldRef()[patchi]; + int patchsize = patchPhi.size(); + double *phi_boundary_ptr = &patchPhi[0]; + std::vector init_phi_boundary; + init_phi_boundary.resize(patchsize); + for (int i = 0; i < patchsize; i++) { + init_phi_boundary[i] = (rand() % 10000 - 5000) / 10000.0; + } + memcpy(phi_boundary_ptr, init_phi_boundary.data(), patchsize * sizeof(double)); + offset += patchsize; + } + // TODO: random init weight to (0, 1) + // failed, weight is const. + } + + // run CPU + fvVectorMatrix df_U = fvm::div(phi, U); + + // run GPU + // run GPU - preProcess + // prepare phi + memcpy(dfDataBase.h_phi, &phi[0], dfDataBase.surface_value_bytes); + offset = 0; + forAll(U.boundaryField(), patchi) + { + const fvsPatchScalarField& patchPhi = phi.boundaryField()[patchi]; + int patchsize = patchPhi.size(); + memcpy(dfDataBase.h_boundary_phi + offset, &patchPhi[0], patchsize * sizeof(double)); + offset += patchsize; + } + checkCudaErrors(cudaMemcpyAsync(dfDataBase.d_phi, dfDataBase.h_phi, dfDataBase.surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(dfDataBase.d_boundary_phi, dfDataBase.h_boundary_phi, dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + // prepare patch_type + std::vector patch_type_U; + patch_type_U.resize(dfDataBase.num_patches); + forAll(U.boundaryField(), patchi) + { + constructBoundarySelectorPerPatch(&(patch_type_U[patchi]), U.boundaryField()[patchi].type()); + } + // prepare boundary coeffs + // TODO: updating boundary coeffs should be complemented later + double *d_value_internal_coeffs_U = nullptr; + double *d_value_boundary_coeffs_U = nullptr; + double *d_gradient_internal_coeffs_U = nullptr; + double *d_gradient_boundary_coeffs_U = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_value_internal_coeffs_U, 
dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_value_boundary_coeffs_U, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_gradient_internal_coeffs_U, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_gradient_boundary_coeffs_U, dfDataBase.boundary_surface_value_vec_bytes)); + update_boundary_coeffs_vector(dfDataBase.stream, dfDataBase.num_patches, + dfDataBase.patch_size.data(), patch_type_U.data(), + d_value_internal_coeffs_U, d_value_boundary_coeffs_U, + d_gradient_internal_coeffs_U, d_gradient_boundary_coeffs_U); + // prepare ldu + double *d_lower = nullptr; + double *d_upper = nullptr; + double *d_diag = nullptr; + double *d_internal_coeffs = nullptr; + double *d_boundary_coeffs = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_lower, dfDataBase.surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_upper, dfDataBase.surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_diag, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + // run GPU - Process + fvm_div_scalar(dfDataBase.stream, dfDataBase.num_surfaces, dfDataBase.d_owner, dfDataBase.d_neighbor, + dfDataBase.d_phi, dfDataBase.d_weight, + d_lower, d_upper, d_diag, // end for internal + dfDataBase.num_patches, dfDataBase.patch_size.data(), patch_type_U.data(), + dfDataBase.d_boundary_phi, d_value_internal_coeffs_U, d_value_boundary_coeffs_U, + d_internal_coeffs, d_boundary_coeffs); + // run GPU - postProcess + std::vector h_lower; + h_lower.resize(dfDataBase.num_surfaces); + std::vector h_upper; + h_upper.resize(dfDataBase.num_surfaces); + std::vector h_diag; + h_diag.resize(dfDataBase.num_cells); + std::vector h_internal_coeffs; + h_internal_coeffs.resize(dfDataBase.num_boundary_surfaces * 3); 
+ std::vector h_boundary_coeffs; + h_boundary_coeffs.resize(dfDataBase.num_boundary_surfaces * 3); + checkCudaErrors(cudaMemcpy(h_lower.data(), d_lower, dfDataBase.surface_value_bytes, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(h_upper.data(), d_upper, dfDataBase.surface_value_bytes, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(h_diag.data(), d_diag, dfDataBase.cell_value_bytes, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(h_internal_coeffs.data(), d_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(h_boundary_coeffs.data(), d_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaFree(d_lower)); + checkCudaErrors(cudaFree(d_upper)); + checkCudaErrors(cudaFree(d_diag)); + checkCudaErrors(cudaFree(d_internal_coeffs)); + checkCudaErrors(cudaFree(d_boundary_coeffs)); + checkCudaErrors(cudaFree(d_value_internal_coeffs_U)); + checkCudaErrors(cudaFree(d_value_boundary_coeffs_U)); + checkCudaErrors(cudaFree(d_gradient_internal_coeffs_U)); + checkCudaErrors(cudaFree(d_gradient_boundary_coeffs_U)); + + // compare CPU and GPU results + checkVectorEqual(dfDataBase.num_surfaces, &df_U.lower()[0], h_lower.data(), 1e-14); + checkVectorEqual(dfDataBase.num_surfaces, &df_U.upper()[0], h_upper.data(), 1e-14); + checkVectorEqual(dfDataBase.num_cells, &df_U.diag()[0], h_diag.data(), 1e-14); + std::vector cpu_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); + std::vector cpu_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); + offset = 0; + forAll(U.boundaryField(), patchi) + { + int patchSize = U.boundaryField()[patchi].size(); + const double* internal_coeff_ptr = &df_U.internalCoeffs()[patchi][0][0]; + const double* boundary_coeff_ptr = &df_U.boundaryCoeffs()[patchi][0][0]; + memcpy(cpu_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchSize * 3 * sizeof(double)); + memcpy(cpu_boundary_coeffs.data() + offset * 
3, boundary_coeff_ptr, patchSize * 3 * sizeof(double)); + offset += patchSize; + } + checkVectorEqual(dfDataBase.num_boundary_surfaces * 3, cpu_internal_coeffs.data(), h_internal_coeffs.data(), 1e-14); + checkVectorEqual(dfDataBase.num_boundary_surfaces * 3, cpu_boundary_coeffs.data(), h_boundary_coeffs.data(), 1e-14); +} diff --git a/GPUTest/createGPUSolver.H b/GPUTest/createGPUSolver.H index 9a6c289ab..0f1e20eca 100644 --- a/GPUTest/createGPUSolver.H +++ b/GPUTest/createGPUSolver.H @@ -1,32 +1,29 @@ + dfMatrixDataBase dfDataBase; void createGPUBase(fvMesh& mesh, PtrList& Y) { - // obtain variables from fvMesh + // prepare constant values: num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, num_species, rdelta_t const labelUList& owner = mesh.owner(); const labelUList& neighbour = mesh.neighbour(); int num_cells = mesh.nCells(); int num_surfaces = neighbour.size(); - - - // prepare num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_sizes, num_species, rdelta_t - // - obtain boundary size info from mesh - int patchSize = 0, num_patches = 0, num_boundary_surfaces = 0; - std::vector patch_sizes; + int num_boundary_surfaces = 0; + int num_patches = 0; + std::vector patch_size; forAll(mesh.boundary(), patchi) { labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); - patchSize = sub_boundary.size(); - - patch_sizes.push_back(patchSize); - num_boundary_surfaces += patchSize; - num_patches ++; + int patchsize = sub_boundary.size(); + patch_size.push_back(patchsize); + num_boundary_surfaces += patchsize; + num_patches++; } - dfDataBase.setConstantValues(num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_sizes, Y.size(), 1e-6); // TODO: get deltaT fomr time API + // TODO: get deltaT fomr time API + dfDataBase.setConstantValues(num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, Y.size(), 1e-6); - // prepare owner, neighbor + // prepare constant indexes: owner, neighbor 
dfDataBase.setConstantIndexes(&owner[0], &neighbour[0]); // prepare internal and boundary of sf, mag_sf, weights, delta_coeffs, volume - // - obtain boundary field info from mesh double *boundary_sf = new double[3 * num_boundary_surfaces]; double *boundary_mag_sf = new double[num_boundary_surfaces]; double *boundary_delta_coeffs = new double[num_boundary_surfaces]; @@ -36,12 +33,12 @@ void createGPUBase(fvMesh& mesh, PtrList& Y) { const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; - patchSize = pMagSf.size(); + int patchsize = pMagSf.size(); - memcpy(boundary_sf + 3*offset, &pSf[0][0], 3*patchSize*sizeof(double)); - memcpy(boundary_mag_sf + offset, &pMagSf[0], patchSize*sizeof(double)); - memcpy(boundary_delta_coeffs + offset, &pDeltaCoeffs[0], patchSize*sizeof(double)); - offset += patchSize; + memcpy(boundary_sf + 3*offset, &pSf[0][0], 3*patchsize*sizeof(double)); + memcpy(boundary_mag_sf + offset, &pMagSf[0], patchsize*sizeof(double)); + memcpy(boundary_delta_coeffs + offset, &pDeltaCoeffs[0], patchsize*sizeof(double)); + offset += patchsize; } dfDataBase.createConstantFieldsInternal(); @@ -49,23 +46,20 @@ void createGPUBase(fvMesh& mesh, PtrList& Y) { dfDataBase.initConstantFieldsInternal(&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.surfaceInterpolation::weights()[0], &mesh.nonOrthDeltaCoeffs()[0], &mesh.V()[0]); dfDataBase.initConstantFieldsBoundary(boundary_sf, boundary_mag_sf, boundary_delta_coeffs); - // prepare internal and boundary of xxx - // - obtain init_Y - double *h_Y = new double[Y.size() * num_cells]; - double *boundary_Y = new double[Y.size() * num_boundary_surfaces]; + // prepare internal and boundary of Y + dfDataBase.createNonConstantFieldsInternal(); + dfDataBase.createNonConstantFieldsBoundary(); forAll(Y, speciesI) { volScalarField& Yi = Y[speciesI]; - memcpy(h_Y + speciesI * num_cells, &Yi[0], num_cells * sizeof(double)); + memcpy(dfDataBase.h_y 
+ speciesI * num_cells, &Yi[0], num_cells * sizeof(double)); offset = 0; forAll(Yi.boundaryField(), patchi) { const scalarField& patchYi = Yi.boundaryField()[patchi]; - patchSize = patchYi.size(); - memcpy(boundary_Y + speciesI * num_boundary_surfaces + offset, &patchYi[0], patchSize*sizeof(double)); - offset += patchSize; + int patchsize = patchYi.size(); + memcpy(dfDataBase.h_boundary_y + speciesI * num_boundary_surfaces + offset, &patchYi[0], patchsize*sizeof(double)); + offset += patchsize; } } - dfDataBase.createNonConstantFieldsInternal(); - dfDataBase.createNonConstantFieldsBoundary(); - dfDataBase.initNonConstantFieldsInternal(h_Y); - dfDataBase.initNonConstantFieldsBoundary(boundary_Y); -}; \ No newline at end of file + dfDataBase.initNonConstantFieldsInternal(dfDataBase.h_y); + dfDataBase.initNonConstantFieldsBoundary(dfDataBase.h_boundary_y); +}; diff --git a/GPUTest/unittest.C b/GPUTest/unittest.C index 2e3d55ce5..b57a8efd6 100644 --- a/GPUTest/unittest.C +++ b/GPUTest/unittest.C @@ -45,11 +45,14 @@ Description #include "basicThermo.H" #include "CombustionModel.H" -#include "dfMatrixDataBase.H" #include #include #include "upwind.H" + +#include "dfMatrixDataBase.H" +#include "dfMatrixOpBase.H" #include "createGPUSolver.H" +#include "GPUTestBase.H" int main(int argc, char *argv[]) { @@ -93,6 +96,10 @@ int main(int argc, char *argv[]) } createGPUBase(mesh, Y); + + // unittest of fvm::div(phi, U) + test_fvm_div_scalar(dfDataBase, mesh, phi, U, initType::original); + test_fvm_div_scalar(dfDataBase, mesh, phi, U, initType::randomInit); } return 0; } diff --git a/src_gpu/CMakeLists.txt b/src_gpu/CMakeLists.txt index 015a1d11b..d82c86df5 100644 --- a/src_gpu/CMakeLists.txt +++ b/src_gpu/CMakeLists.txt @@ -21,8 +21,9 @@ include_directories( ) add_library(${PROJECT_NAME} - SHARED - dfMatrixDataBase.cu) + SHARED + dfMatrixDataBase.cu + dfMatrixOpBase.cu) target_link_libraries(${PROJECT_NAME} ${MPI_LIBRARIES} diff --git a/src_gpu/dfMatrixDataBase.H 
b/src_gpu/dfMatrixDataBase.H index c2e1446ec..efcb78190 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -48,6 +48,7 @@ enum boundaryConditions{ empty }; +void constructBoundarySelectorPerPatch(int *patchTypeSelector, const std::string& patchTypeStr); void constructBoundarySelector(std::vector& patchTypeSelector, const std::string& patchTypeStr, const int patchSize); struct dfMatrixDataBase @@ -66,7 +67,7 @@ struct dfMatrixDataBase int num_boundary_surfaces = 0; int num_patches = 0; int num_species = 0; - std::vector patch_sizes; + std::vector patch_size; double rdelta_t = 0; // constant values -- ldu bytesize @@ -169,7 +170,7 @@ struct dfMatrixDataBase // member function void setConstantValues(int num_cells, int num_surfaces, int num_boundary_surfaces, - int num_patches, std::vector patch_sizes, + int num_patches, std::vector patch_size, int num_species, double rdelta_t); void setConstantIndexes(const int *owner, const int *neighbor); diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index 4ecbc25c8..4bcbe88a4 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -1,5 +1,46 @@ #include "dfMatrixDataBase.H" +void constructBoundarySelectorPerPatch(int *patchTypeSelector, const std::string& patchTypeStr) +{ + boundaryConditions patchCondition; + std::vector tmpSelector; + static std::map BCMap = { + {"zeroGradient", zeroGradient}, + {"fixedValue", fixedValue}, + {"empty", empty}, + {"coupled", coupled} + }; + auto iter = BCMap.find(patchTypeStr); + if (iter != BCMap.end()) { + patchCondition = iter->second; + } else { + throw std::runtime_error("Unknown boundary condition: " + patchTypeStr); + } + // zeroGradient labeled as 0, fixedValue labeled as 1, coupled labeled as 2 + switch (patchCondition){ + case zeroGradient: + { + *patchTypeSelector = 0; + break; + } + case fixedValue: + { + *patchTypeSelector = 1; + break; + } + case empty: + { + *patchTypeSelector = 2; + break; + } + case coupled: + 
{ + *patchTypeSelector = 3; + break; + } + } +} + dfMatrixDataBase::dfMatrixDataBase() { checkCudaErrors(cudaStreamCreate(&stream)); } @@ -15,14 +56,14 @@ dfMatrixDataBase::~dfMatrixDataBase() { } void dfMatrixDataBase::setConstantValues(int num_cells, int num_surfaces, int num_boundary_surfaces, - int num_patches, std::vector patch_sizes, + int num_patches, std::vector patch_size, int num_species, double rdelta_t) { // constant values -- basic this->num_cells = num_cells; this->num_surfaces = num_surfaces; this->num_boundary_surfaces = num_boundary_surfaces; this->num_patches = num_patches; - this->patch_sizes = patch_sizes; + this->patch_size = patch_size; this->num_species = num_species; this->rdelta_t = rdelta_t; diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H new file mode 100644 index 000000000..47692e239 --- /dev/null +++ b/src_gpu/dfMatrixOpBase.H @@ -0,0 +1,30 @@ +#pragma once + +void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, + const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, + const double *lower, const double *upper, const double *diag, const double *source, + const double *internal_coeffs, const double *boundary_coeffs, + double *A, double *b); + +void update_boundary_coeffs_vector(cudaStream_t stream, int num_patches, + const int *patch_size, const int *patch_type, + double *value_internal_coeffs, double *value_boundary_coeffs, + double *gradient_internal_coeffs, double *gradient_boundary_coeffs); + +// void fvm_ddt(); + +void fvm_div_scalar(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, + const double *phi, const double *weight, + double *lower, double *upper, double *diag, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, + double *internal_coeffs, double *boundary_coeffs); + +// void 
fvm_laplacian(); +// +// void fvc_ddt(); +// +// void fvc_grad_surface(); +// +// void fvc_div_cell(); + diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu new file mode 100644 index 000000000..6c533e05e --- /dev/null +++ b/src_gpu/dfMatrixOpBase.cu @@ -0,0 +1,140 @@ +#include "dfMatrixOpBase.H" +#include "dfMatrixDataBase.H" + +#include +#include "cuda_profiler_api.h" + +__global__ void update_boundary_coeffs_zeroGradient_vector(int num, int offset, + double *value_internal_coeffs, double *value_boundary_coeffs, + double *gradient_internal_coeffs, double *gradient_boundary_coeffs) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num) + return; + + int start_index = offset + index; + // valueInternalCoeffs = 1 + // valueBoundaryCoeffs = 0 + // gradientInternalCoeffs = 0 + // gradientBoundaryCoeffs = 0 + value_internal_coeffs[start_index * 3 + 0] = 1; + value_internal_coeffs[start_index * 3 + 1] = 1; + value_internal_coeffs[start_index * 3 + 2] = 1; + value_boundary_coeffs[start_index * 3 + 0] = 0; + value_boundary_coeffs[start_index * 3 + 1] = 0; + value_boundary_coeffs[start_index * 3 + 2] = 0; + gradient_internal_coeffs[start_index * 3 + 0] = 0; + gradient_internal_coeffs[start_index * 3 + 1] = 0; + gradient_internal_coeffs[start_index * 3 + 2] = 0; + gradient_boundary_coeffs[start_index * 3 + 0] = 0; + gradient_boundary_coeffs[start_index * 3 + 1] = 0; + gradient_boundary_coeffs[start_index * 3 + 2] = 0; +} + +__global__ void fvm_div_scalar_internal(int num_surfaces, + const int *lower_index, const int *upper_index, + const double *phi, const double *weight, + double *lower, double *upper, double *diag) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_surfaces) + return; + + double w = weight[index]; + double f = phi[index]; + + lower[index] += (-w) * f; + upper[index] += (1 - w) * f; + + int l = lower_index[index]; + int u = upper_index[index]; + atomicAdd(&(diag[l]), w * f); + 
atomicAdd(&(diag[u]), (w - 1) * f); +} + +__global__ void fvm_div_scalar_boundary(int num, int offset, + const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, + double *internal_coeffs, double *boundary_coeffs) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num) + return; + + int start_index = offset + index; + double boundary_f = boundary_phi[start_index]; + internal_coeffs[start_index * 3 + 0] = boundary_f * value_internal_coeffs[start_index * 3 + 0]; + internal_coeffs[start_index * 3 + 1] = boundary_f * value_internal_coeffs[start_index * 3 + 1]; + internal_coeffs[start_index * 3 + 2] = boundary_f * value_internal_coeffs[start_index * 3 + 2]; + boundary_coeffs[start_index * 3 + 0] = boundary_f * value_boundary_coeffs[start_index * 3 + 0]; + boundary_coeffs[start_index * 3 + 1] = boundary_f * value_boundary_coeffs[start_index * 3 + 1]; + boundary_coeffs[start_index * 3 + 2] = boundary_f * value_boundary_coeffs[start_index * 3 + 2]; +} + +void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, + const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, + const double *lower, const double *upper, const double *diag, const double *source, + const double *internal_coeffs, const double *boundary_coeffs, + double *A, double *b) +{ + +} + +void update_boundary_coeffs_vector(cudaStream_t stream, int num_patches, + const int *patch_size, const int *patch_type, + double *value_internal_coeffs, double *value_boundary_coeffs, + double *gradient_internal_coeffs, double *gradient_boundary_coeffs) +{ + size_t threads_per_block = 1024; + size_t blocks_per_grid = 1; + + int offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 256; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just basic patch type now + // TODO: just vector version now + if (patch_type[i] == boundaryConditions::zeroGradient) { 
+ update_boundary_coeffs_zeroGradient_vector<<>>(patch_size[i], offset, + value_internal_coeffs, value_boundary_coeffs, gradient_internal_coeffs, gradient_boundary_coeffs); + } else if (patch_type[i] == boundaryConditions::fixedValue) { + // xxx + } else if (0) { + // xxx + } + offset += patch_size[i]; + } +} + +void fvm_div_scalar(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, + const double *phi, const double *weight, + double *lower, double *upper, double *diag, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, + double *internal_coeffs, double *boundary_coeffs) +{ + size_t threads_per_block = 1024; + size_t blocks_per_grid = 1; + + blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + fvm_div_scalar_internal<<>>(num_surfaces, + lowerAddr, upperAddr, + phi, weight, lower, upper, diag); + + int offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 256; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just basic patch type now + if (patch_type[i] == boundaryConditions::zeroGradient + || patch_type[i] == boundaryConditions::fixedValue) { + // TODO: just vector version now + fvm_div_scalar_boundary<<>>(patch_size[i], offset, + boundary_phi, value_internal_coeffs, value_boundary_coeffs, + internal_coeffs, boundary_coeffs); + } else if (0) { + // xxx + } + offset += patch_size[i]; + } +} + From 1b25756d75b6c9719b2396dc317de8da4dcc2411 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Thu, 3 Aug 2023 18:30:26 +0800 Subject: [PATCH 03/25] simplify unittest --- GPUTest/GPUTestBase.H | 294 ++++++++++++++++++++++-------------- src_gpu/dfMatrixDataBase.H | 20 ++- src_gpu/dfMatrixDataBase.cu | 74 +++++++-- 3 files changed, 261 insertions(+), 127 deletions(-) diff --git a/GPUTest/GPUTestBase.H 
b/GPUTest/GPUTestBase.H index e1ffb0bd5..bce191a9e 100644 --- a/GPUTest/GPUTestBase.H +++ b/GPUTest/GPUTestBase.H @@ -4,138 +4,204 @@ enum initType{ randomInit }; -// unittest of fvm::div(phi, U) -void test_fvm_div_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfaceScalarField& phi, volVectorField& U, initType type) { - int offset = 0; +struct testGPUDataBase { + // some fvm ops don't use d_source; + // some fvm ops don't use d_internal_coeffs and d_boundary_coeffs; + // all the fvc ops only use d_source + double *d_lower = nullptr; + double *d_upper = nullptr; + double *d_diag = nullptr; + double *d_source = nullptr; + double *d_internal_coeffs = nullptr; + double *d_boundary_coeffs = nullptr; - // deal with init type - if (type == initType::randomInit) { - // random init phi to (-0.5, 0.5) + double *d_value_internal_coeffs = nullptr; + double *d_value_boundary_coeffs = nullptr; + double *d_gradient_internal_coeffs = nullptr; + double *d_gradient_boundary_coeffs = nullptr; + + std::vector patch_type; + + // constructor + testGPUDataBase() {} + + // deconstructor + ~testGPUDataBase() { + if (d_lower) checkCudaErrors(cudaFree(d_lower)); + if (d_upper) checkCudaErrors(cudaFree(d_upper)); + if (d_diag) checkCudaErrors(cudaFree(d_diag)); + if (d_source) checkCudaErrors(cudaFree(d_source)); + if (d_internal_coeffs) checkCudaErrors(cudaFree(d_internal_coeffs)); + if (d_boundary_coeffs) checkCudaErrors(cudaFree(d_boundary_coeffs)); + + if (d_value_internal_coeffs) checkCudaErrors(cudaFree(d_value_internal_coeffs)); + if (d_value_boundary_coeffs) checkCudaErrors(cudaFree(d_value_boundary_coeffs)); + if (d_gradient_internal_coeffs) checkCudaErrors(cudaFree(d_gradient_internal_coeffs)); + if (d_gradient_boundary_coeffs) checkCudaErrors(cudaFree(d_gradient_boundary_coeffs)); + } +}; + +void randomInitSurfaceScalar(surfaceScalarField& field) { + // random init field value to (-0.5, 0.5) // internal - double *phi_internal_ptr = &phi[0]; - std::vector 
init_phi_internal; - init_phi_internal.resize(dfDataBase.num_surfaces); + double *field_internal_ptr = &field[0]; + std::vector init_field_internal; + init_field_internal.resize(dfDataBase.num_surfaces); for (int i = 0; i < dfDataBase.num_surfaces; i++) { - init_phi_internal[i] = (rand() % 10000 - 5000) / 10000.0; + init_field_internal[i] = (rand() % 10000 - 5000) / 10000.0; } - memcpy(phi_internal_ptr, init_phi_internal.data(), dfDataBase.surface_value_bytes); + memcpy(field_internal_ptr, init_field_internal.data(), dfDataBase.surface_value_bytes); // boundary - offset = 0; - forAll(U.boundaryField(), patchi) + int offset = 0; + forAll(field.boundaryField(), patchi) { - fvsPatchScalarField& patchPhi = phi.boundaryFieldRef()[patchi]; - int patchsize = patchPhi.size(); - double *phi_boundary_ptr = &patchPhi[0]; - std::vector init_phi_boundary; - init_phi_boundary.resize(patchsize); + fvsPatchScalarField& patchField = field.boundaryFieldRef()[patchi]; + int patchsize = patchField.size(); + double *field_boundary_ptr = &patchField[0]; + std::vector init_field_boundary; + init_field_boundary.resize(patchsize); for (int i = 0; i < patchsize; i++) { - init_phi_boundary[i] = (rand() % 10000 - 5000) / 10000.0; + init_field_boundary[i] = (rand() % 10000 - 5000) / 10000.0; } - memcpy(phi_boundary_ptr, init_phi_boundary.data(), patchsize * sizeof(double)); + memcpy(field_boundary_ptr, init_field_boundary.data(), patchsize * sizeof(double)); offset += patchsize; } - // TODO: random init weight to (0, 1) - // failed, weight is const. 
- } - - // run CPU - fvVectorMatrix df_U = fvm::div(phi, U); +} - // run GPU - // run GPU - preProcess - // prepare phi - memcpy(dfDataBase.h_phi, &phi[0], dfDataBase.surface_value_bytes); - offset = 0; - forAll(U.boundaryField(), patchi) +void uploadSurfaceScalar(dfMatrixDataBase& dfDataBase, const surfaceScalarField& field, const char* fieldAlias) { + double *h_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::internal); + double *h_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::boundary); + double *d_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::internal); + double *d_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::boundary); + // internal + memcpy(h_internal_field, &field[0], dfDataBase.surface_value_bytes); + // boundary + int offset = 0; + forAll(field.boundaryField(), patchi) { - const fvsPatchScalarField& patchPhi = phi.boundaryField()[patchi]; - int patchsize = patchPhi.size(); - memcpy(dfDataBase.h_boundary_phi + offset, &patchPhi[0], patchsize * sizeof(double)); + const fvsPatchScalarField& patchField = field.boundaryField()[patchi]; + int patchsize = patchField.size(); + memcpy(h_boundary_field + offset, &patchField[0], patchsize * sizeof(double)); offset += patchsize; } - checkCudaErrors(cudaMemcpyAsync(dfDataBase.d_phi, dfDataBase.h_phi, dfDataBase.surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); - checkCudaErrors(cudaMemcpyAsync(dfDataBase.d_boundary_phi, dfDataBase.h_boundary_phi, dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); - // prepare patch_type - std::vector patch_type_U; - patch_type_U.resize(dfDataBase.num_patches); - forAll(U.boundaryField(), patchi) + // transfer + checkCudaErrors(cudaMemcpyAsync(d_internal_field, h_internal_field, dfDataBase.surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + 
checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); +} + +void buildTestGPUDataBaseVector(const dfMatrixDataBase& dfDataBase, testGPUDataBase& testData, const volVectorField& field) { + // ldu + checkCudaErrors(cudaMalloc((void**)&testData.d_lower, dfDataBase.surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_upper, dfDataBase.surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_diag, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + // boundary coeffs + checkCudaErrors(cudaMalloc((void**)&testData.d_value_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_value_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + // patch type + testData.patch_type.resize(dfDataBase.num_patches); + forAll(field.boundaryField(), patchi) { - constructBoundarySelectorPerPatch(&(patch_type_U[patchi]), U.boundaryField()[patchi].type()); + constructBoundarySelectorPerPatch(&(testData.patch_type[patchi]), field.boundaryField()[patchi].type()); } +} + +void updateBoundaryCoeffsVector(const dfMatrixDataBase& dfDataBase, testGPUDataBase& testData) { + update_boundary_coeffs_vector(dfDataBase.stream, dfDataBase.num_patches, + dfDataBase.patch_size.data(), testData.patch_type.data(), + testData.d_value_internal_coeffs, testData.d_value_boundary_coeffs, + testData.d_gradient_internal_coeffs, testData.d_gradient_boundary_coeffs); +} + 
+void compareResultVector(const dfMatrixDataBase& dfDataBase, const testGPUDataBase& testData, fvVectorMatrix& dfMatrix, bool printFlag) { + if (testData.d_lower) { + std::vector h_lower; + h_lower.resize(dfDataBase.num_surfaces); + checkCudaErrors(cudaMemcpy(h_lower.data(), testData.d_lower, dfDataBase.surface_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_surfaces, &dfMatrix.lower()[0], h_lower.data(), 1e-14, printFlag); + } + if (testData.d_upper) { + std::vector h_upper; + h_upper.resize(dfDataBase.num_surfaces); + checkCudaErrors(cudaMemcpy(h_upper.data(), testData.d_upper, dfDataBase.surface_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_surfaces, &dfMatrix.upper()[0], h_upper.data(), 1e-14, printFlag); + } + if (testData.d_diag) { + std::vector h_diag; + h_diag.resize(dfDataBase.num_cells); + checkCudaErrors(cudaMemcpy(h_diag.data(), testData.d_diag, dfDataBase.cell_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells, &dfMatrix.diag()[0], h_diag.data(), 1e-14, printFlag); + } + if (testData.d_source) { + std::vector h_source; + h_source.resize(dfDataBase.num_cells * 3); + checkCudaErrors(cudaMemcpy(h_source.data(), testData.d_source, dfDataBase.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells, &dfMatrix.source()[0][0], h_source.data(), 1e-14, printFlag); + } + if (testData.d_internal_coeffs) { + std::vector h_internal_coeffs; + h_internal_coeffs.resize(dfDataBase.num_boundary_surfaces * 3); + checkCudaErrors(cudaMemcpy(h_internal_coeffs.data(), testData.d_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); + std::vector cpu_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); + int offset = 0; + for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) + { + int patchsize = dfDataBase.patch_size[patchi]; + const double* internal_coeff_ptr = &dfMatrix.internalCoeffs()[patchi][0][0]; + 
memcpy(cpu_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double)); + offset += patchsize; + } + checkVectorEqual(dfDataBase.num_boundary_surfaces * 3, cpu_internal_coeffs.data(), h_internal_coeffs.data(), 1e-14, printFlag); + } + if (testData.d_boundary_coeffs) { + std::vector h_boundary_coeffs; + h_boundary_coeffs.resize(dfDataBase.num_boundary_surfaces * 3); + checkCudaErrors(cudaMemcpy(h_boundary_coeffs.data(), testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); + std::vector cpu_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); + int offset = 0; + for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) + { + int patchsize = dfDataBase.patch_size[patchi]; + const double* boundary_coeff_ptr = &dfMatrix.boundaryCoeffs()[patchi][0][0]; + memcpy(cpu_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double)); + offset += patchsize; + } + checkVectorEqual(dfDataBase.num_boundary_surfaces * 3, cpu_boundary_coeffs.data(), h_boundary_coeffs.data(), 1e-14, printFlag); + } +} + +// unittest of fvm::div(phi, U) +void test_fvm_div_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfaceScalarField& phi, volVectorField& U, initType type) { + if (type == initType::randomInit) { + randomInitSurfaceScalar(phi); + // TODO: random init weight failed, weight is const. 
+ } + + // run CPU + fvVectorMatrix dfMatrix = fvm::div(phi, U); + + // prepare for run GPU + // prepare phi field + uploadSurfaceScalar(dfDataBase, phi, "phi"); + // prepare testData + testGPUDataBase testData; + buildTestGPUDataBaseVector(dfDataBase, testData, U); // prepare boundary coeffs // TODO: updating boundary coeffs should be complemented later - double *d_value_internal_coeffs_U = nullptr; - double *d_value_boundary_coeffs_U = nullptr; - double *d_gradient_internal_coeffs_U = nullptr; - double *d_gradient_boundary_coeffs_U = nullptr; - checkCudaErrors(cudaMalloc((void**)&d_value_internal_coeffs_U, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_value_boundary_coeffs_U, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_gradient_internal_coeffs_U, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_gradient_boundary_coeffs_U, dfDataBase.boundary_surface_value_vec_bytes)); - update_boundary_coeffs_vector(dfDataBase.stream, dfDataBase.num_patches, - dfDataBase.patch_size.data(), patch_type_U.data(), - d_value_internal_coeffs_U, d_value_boundary_coeffs_U, - d_gradient_internal_coeffs_U, d_gradient_boundary_coeffs_U); - // prepare ldu - double *d_lower = nullptr; - double *d_upper = nullptr; - double *d_diag = nullptr; - double *d_internal_coeffs = nullptr; - double *d_boundary_coeffs = nullptr; - checkCudaErrors(cudaMalloc((void**)&d_lower, dfDataBase.surface_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_upper, dfDataBase.surface_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_diag, dfDataBase.cell_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - // run GPU - Process + updateBoundaryCoeffsVector(dfDataBase, testData); + + // run GPU 
fvm_div_scalar(dfDataBase.stream, dfDataBase.num_surfaces, dfDataBase.d_owner, dfDataBase.d_neighbor, dfDataBase.d_phi, dfDataBase.d_weight, - d_lower, d_upper, d_diag, // end for internal - dfDataBase.num_patches, dfDataBase.patch_size.data(), patch_type_U.data(), - dfDataBase.d_boundary_phi, d_value_internal_coeffs_U, d_value_boundary_coeffs_U, - d_internal_coeffs, d_boundary_coeffs); - // run GPU - postProcess - std::vector h_lower; - h_lower.resize(dfDataBase.num_surfaces); - std::vector h_upper; - h_upper.resize(dfDataBase.num_surfaces); - std::vector h_diag; - h_diag.resize(dfDataBase.num_cells); - std::vector h_internal_coeffs; - h_internal_coeffs.resize(dfDataBase.num_boundary_surfaces * 3); - std::vector h_boundary_coeffs; - h_boundary_coeffs.resize(dfDataBase.num_boundary_surfaces * 3); - checkCudaErrors(cudaMemcpy(h_lower.data(), d_lower, dfDataBase.surface_value_bytes, cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(h_upper.data(), d_upper, dfDataBase.surface_value_bytes, cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(h_diag.data(), d_diag, dfDataBase.cell_value_bytes, cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(h_internal_coeffs.data(), d_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(h_boundary_coeffs.data(), d_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaFree(d_lower)); - checkCudaErrors(cudaFree(d_upper)); - checkCudaErrors(cudaFree(d_diag)); - checkCudaErrors(cudaFree(d_internal_coeffs)); - checkCudaErrors(cudaFree(d_boundary_coeffs)); - checkCudaErrors(cudaFree(d_value_internal_coeffs_U)); - checkCudaErrors(cudaFree(d_value_boundary_coeffs_U)); - checkCudaErrors(cudaFree(d_gradient_internal_coeffs_U)); - checkCudaErrors(cudaFree(d_gradient_boundary_coeffs_U)); - - // compare CPU and GPU results - checkVectorEqual(dfDataBase.num_surfaces, &df_U.lower()[0], h_lower.data(), 1e-14); - 
checkVectorEqual(dfDataBase.num_surfaces, &df_U.upper()[0], h_upper.data(), 1e-14); - checkVectorEqual(dfDataBase.num_cells, &df_U.diag()[0], h_diag.data(), 1e-14); - std::vector cpu_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); - std::vector cpu_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); - offset = 0; - forAll(U.boundaryField(), patchi) - { - int patchSize = U.boundaryField()[patchi].size(); - const double* internal_coeff_ptr = &df_U.internalCoeffs()[patchi][0][0]; - const double* boundary_coeff_ptr = &df_U.boundaryCoeffs()[patchi][0][0]; - memcpy(cpu_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchSize * 3 * sizeof(double)); - memcpy(cpu_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchSize * 3 * sizeof(double)); - offset += patchSize; - } - checkVectorEqual(dfDataBase.num_boundary_surfaces * 3, cpu_internal_coeffs.data(), h_internal_coeffs.data(), 1e-14); - checkVectorEqual(dfDataBase.num_boundary_surfaces * 3, cpu_boundary_coeffs.data(), h_boundary_coeffs.data(), 1e-14); + testData.d_lower, testData.d_upper, testData.d_diag, // end for internal + dfDataBase.num_patches, dfDataBase.patch_size.data(), testData.patch_type.data(), + dfDataBase.d_boundary_phi, testData.d_value_internal_coeffs, testData.d_value_boundary_coeffs, + testData.d_internal_coeffs, testData.d_boundary_coeffs); + + // compare result + bool printFlag = false; + compareResultVector(dfDataBase, testData, dfMatrix, printFlag); } diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index efcb78190..4d8a7d29d 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -12,7 +12,7 @@ #include #include #include - +#include static const char *_cudaGetErrorEnum(cudaError_t error) { return cudaGetErrorName(error); @@ -30,17 +30,29 @@ void check(T result, char const *const func, const char *const file, #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) -inline void checkVectorEqual(int count, const double* 
basevec, double* vec, double max_relative_error) { +inline void checkVectorEqual(int count, const double* basevec, double* vec, double max_relative_error, bool print = false) { for (size_t i = 0; i < count; ++i) { double abs_diff = fabs(basevec[i] - vec[i]); double rel_diff = fabs(basevec[i] - vec[i]) / fabs(basevec[i]); + if (print) + fprintf(stderr, "index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff); // if (abs_diff > 1e-12 && rel_diff > max_relative_error && !std::isinf(rel_diff)) if (abs_diff > 1e-15 && rel_diff > max_relative_error) fprintf(stderr, "mismatch index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff); } } +enum location { + cpu, + gpu +}; + +enum position { + internal, + boundary +}; + enum boundaryConditions{ zeroGradient, fixedValue, @@ -162,6 +174,8 @@ struct dfMatrixDataBase double *h_boundary_p= nullptr; double *h_boundary_phi= nullptr; + std::unordered_map fieldPointerMap; + // constructor dfMatrixDataBase(); @@ -186,5 +200,7 @@ struct dfMatrixDataBase void initNonConstantFieldsInternal(const double *y); void initNonConstantFieldsBoundary(const double *boundary_y); + // getter + double* getFieldPointer(const char* fieldAlias, location loc, position pos); }; diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index 4bcbe88a4..2ef707bbc 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -187,12 +187,20 @@ void dfMatrixDataBase::createConstantFieldsInternal() { checkCudaErrors(cudaMalloc((void**)&d_weight, surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_delta_coeffs, surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_volume, cell_value_bytes)); + fieldPointerMap["d_sf"] = d_sf; + fieldPointerMap["d_mag_sf"] = d_mag_sf; + fieldPointerMap["d_weight"] = d_weight; + fieldPointerMap["d_delta_coeffs"] = d_delta_coeffs; + fieldPointerMap["d_volume"] = d_volume; } void 
dfMatrixDataBase::createConstantFieldsBoundary() { checkCudaErrors(cudaMalloc((void**)&d_boundary_sf, boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_mag_sf, boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_delta_coeffs, boundary_surface_value_bytes)); + fieldPointerMap["d_boundary_sf"] = d_boundary_sf; + fieldPointerMap["d_boundary_mag_sf"] = d_boundary_mag_sf; + fieldPointerMap["d_boundary_delta_coeffs"] = d_boundary_delta_coeffs; } void dfMatrixDataBase::initConstantFieldsInternal(const double *sf, const double *mag_sf, @@ -217,24 +225,36 @@ void dfMatrixDataBase::createNonConstantFieldsInternal() { checkCudaErrors(cudaMalloc((void**)&d_y, cell_value_bytes * num_species)); checkCudaErrors(cudaMalloc((void**)&d_he, cell_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_p, cell_value_bytes)); + fieldPointerMap["d_rho"] = d_rho; + fieldPointerMap["d_u"] = d_u; + fieldPointerMap["d_y"] = d_y; + fieldPointerMap["d_he"] = d_he; + fieldPointerMap["d_p"] = d_p; - checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_u_old, cell_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_y_old, cell_value_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_he_old, cell_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_p_old, cell_value_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_value_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_u_old, cell_value_vec_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_y_old, cell_value_bytes * num_species)); + // checkCudaErrors(cudaMalloc((void**)&d_he_old, cell_value_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_p_old, cell_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_phi, surface_value_bytes)); + fieldPointerMap["d_phi"] = d_phi; // computed on GPU, used on CPU, need memcpyd2h checkCudaErrors(cudaMallocHost((void**)&h_rho, cell_value_bytes)); 
checkCudaErrors(cudaMallocHost((void**)&h_u, cell_value_vec_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_y, cell_value_bytes * num_species)); checkCudaErrors(cudaMallocHost((void**)&h_he, cell_value_bytes)); + fieldPointerMap["h_rho"] = h_rho; + fieldPointerMap["h_u"] = h_u; + fieldPointerMap["h_y"] = h_y; + fieldPointerMap["h_he"] = h_he; // computed on CPU, used on GPU, need memcpyh2d checkCudaErrors(cudaMallocHost((void**)&h_p, cell_value_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_phi, surface_value_bytes)); + fieldPointerMap["h_p"] = h_p; + fieldPointerMap["h_phi"] = h_phi; } void dfMatrixDataBase::createNonConstantFieldsBoundary() { @@ -243,24 +263,36 @@ void dfMatrixDataBase::createNonConstantFieldsBoundary() { checkCudaErrors(cudaMalloc((void**)&d_boundary_y, boundary_surface_value_bytes * num_species)); checkCudaErrors(cudaMalloc((void**)&d_boundary_he, boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_p, boundary_surface_value_bytes)); - - checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_old, boundary_surface_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_u_old, boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_y_old, boundary_surface_value_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_he_old, boundary_surface_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_p_old, boundary_surface_value_bytes)); + fieldPointerMap["d_boundary_rho"] = d_boundary_rho; + fieldPointerMap["d_boundary_u"] = d_boundary_u; + fieldPointerMap["d_boundary_y"] = d_boundary_y; + fieldPointerMap["d_boundary_he"] = d_boundary_he; + fieldPointerMap["d_boundary_p"] = d_boundary_p; + + // checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_old, boundary_surface_value_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_boundary_u_old, boundary_surface_value_vec_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_boundary_y_old, boundary_surface_value_bytes 
* num_species)); + // checkCudaErrors(cudaMalloc((void**)&d_boundary_he_old, boundary_surface_value_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_boundary_p_old, boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_phi, boundary_surface_value_bytes)); + fieldPointerMap["d_boundary_phi"] = d_boundary_phi; // computed on GPU, used on CPU, need memcpyd2h checkCudaErrors(cudaMallocHost((void**)&h_boundary_rho, boundary_surface_value_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_boundary_u, boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_boundary_y, boundary_surface_value_bytes * num_species)); checkCudaErrors(cudaMallocHost((void**)&h_boundary_he, boundary_surface_value_bytes)); + fieldPointerMap["h_boundary_rho"] = h_boundary_rho; + fieldPointerMap["h_boundary_u"] = h_boundary_u; + fieldPointerMap["h_boundary_y"] = h_boundary_y; + fieldPointerMap["h_boundary_he"] = h_boundary_he; // computed on CPU, used on GPU, need memcpyh2d checkCudaErrors(cudaMallocHost((void**)&h_boundary_p, boundary_surface_value_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_boundary_phi, boundary_surface_value_bytes)); + fieldPointerMap["h_boundary_p"] = h_boundary_p; + fieldPointerMap["h_boundary_phi"] = h_boundary_phi; } void dfMatrixDataBase::initNonConstantFieldsInternal(const double *y) { @@ -270,3 +302,23 @@ void dfMatrixDataBase::initNonConstantFieldsInternal(const double *y) { void dfMatrixDataBase::initNonConstantFieldsBoundary(const double *boundary_y) { checkCudaErrors(cudaMemcpyAsync(d_boundary_y, boundary_y, boundary_surface_value_bytes* num_species, cudaMemcpyHostToDevice, stream)); } + +double* dfMatrixDataBase::getFieldPointer(const char* fieldAlias, location loc, position pos) { + char mergedName[256]; + if (pos == position::internal) { + sprintf(mergedName, "%s_%s", (loc == location::cpu) ? 
"h" : "d", fieldAlias); + } else if (pos == position::boundary) { + sprintf(mergedName, "%s_boundary_%s", (loc == location::cpu) ? "h" : "d", fieldAlias); + } + + double *pointer = nullptr; + if (fieldPointerMap.find(std::string(mergedName)) != fieldPointerMap.end()) { + pointer = fieldPointerMap[std::string(mergedName)]; + } + if (pointer == nullptr) { + fprintf(stderr, "Warning! getFieldPointer of %s returns nullptr!\n", mergedName); + } + //fprintf(stderr, "fieldAlias: %s, mergedName: %s, pointer: %p\n", fieldAlias, mergedName, pointer); + + return pointer; +} From cbd7b49ef5ffa6d80c2b99722ddbb399fab26fd5 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Fri, 4 Aug 2023 00:25:14 +0800 Subject: [PATCH 04/25] add initial version of GPU new UEqn --- applications/solvers/dfLowMachFoam/Make/files | 2 +- .../solvers/dfLowMachFoam/createGPUSolver.H | 93 ++++++ applications/solvers/dfLowMachFoam/new_UEqn.H | 47 +++ .../solvers/dfLowMachFoam/new_dfLowMachFoam.C | 108 ++++++ src_gpu/AmgXSolver.H | 310 ++++++++++++++++++ src_gpu/AmgXSolver.cu | 296 +++++++++++++++++ src_gpu/CMakeLists.txt | 4 +- src_gpu/dfMatrixOpBase.H | 3 + src_gpu/dfMatrixOpBase.cu | 37 +++ src_gpu/dfUEqn.H | 99 ++++++ src_gpu/dfUEqn.cu | 201 ++++++++++++ 11 files changed, 1198 insertions(+), 2 deletions(-) create mode 100644 applications/solvers/dfLowMachFoam/createGPUSolver.H create mode 100644 applications/solvers/dfLowMachFoam/new_UEqn.H create mode 100644 applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C create mode 100644 src_gpu/AmgXSolver.H create mode 100644 src_gpu/AmgXSolver.cu create mode 100644 src_gpu/dfUEqn.H create mode 100644 src_gpu/dfUEqn.cu diff --git a/applications/solvers/dfLowMachFoam/Make/files b/applications/solvers/dfLowMachFoam/Make/files index 9b7e89945..4eff5915e 100644 --- a/applications/solvers/dfLowMachFoam/Make/files +++ b/applications/solvers/dfLowMachFoam/Make/files @@ -1,3 +1,3 @@ -dfLowMachFoam.C +new_dfLowMachFoam.C EXE = $(DF_APPBIN)/dfLowMachFoam diff --git 
a/applications/solvers/dfLowMachFoam/createGPUSolver.H b/applications/solvers/dfLowMachFoam/createGPUSolver.H new file mode 100644 index 000000000..5d16f7b80 --- /dev/null +++ b/applications/solvers/dfLowMachFoam/createGPUSolver.H @@ -0,0 +1,93 @@ +dfMatrixDataBase dfDataBase; +//dfRhoEqn rhoEqn_GPU; +dfUEqn UEqn_GPU(dfDataBase); +//dfYEqn YEqn_GPU; +//dfEEqn EEqn_GPU; + +void createGPUBase(fvMesh& mesh, PtrList& Y) { + // prepare constant values: num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, num_species, rdelta_t + const labelUList& owner = mesh.owner(); + const labelUList& neighbour = mesh.neighbour(); + int num_cells = mesh.nCells(); + int num_surfaces = neighbour.size(); + int num_boundary_surfaces = 0; + int num_patches = 0; + std::vector patch_size; + forAll(mesh.boundary(), patchi) { + labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); + int patchsize = sub_boundary.size(); + patch_size.push_back(patchsize); + num_boundary_surfaces += patchsize; + num_patches++; + } + // TODO: get deltaT fomr time API + dfDataBase.setConstantValues(num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, Y.size(), 1e-6); + + // prepare constant indexes: owner, neighbor + dfDataBase.setConstantIndexes(&owner[0], &neighbour[0]); + + // prepare internal and boundary of sf, mag_sf, weights, delta_coeffs, volume + double *boundary_sf = new double[3 * num_boundary_surfaces]; + double *boundary_mag_sf = new double[num_boundary_surfaces]; + double *boundary_delta_coeffs = new double[num_boundary_surfaces]; + int offset = 0; + forAll(mesh.boundary(), patchi) { + const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; + const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; + const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; + + int patchsize = pMagSf.size(); + + memcpy(boundary_sf + 3*offset, &pSf[0][0], 3*patchsize*sizeof(double)); + memcpy(boundary_mag_sf + offset, &pMagSf[0], 
patchsize*sizeof(double)); + memcpy(boundary_delta_coeffs + offset, &pDeltaCoeffs[0], patchsize*sizeof(double)); + offset += patchsize; + } + + dfDataBase.createConstantFieldsInternal(); + dfDataBase.createConstantFieldsBoundary(); + dfDataBase.initConstantFieldsInternal(&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.surfaceInterpolation::weights()[0], &mesh.nonOrthDeltaCoeffs()[0], &mesh.V()[0]); + dfDataBase.initConstantFieldsBoundary(boundary_sf, boundary_mag_sf, boundary_delta_coeffs); + + // prepare internal and boundary of Y + dfDataBase.createNonConstantFieldsInternal(); + dfDataBase.createNonConstantFieldsBoundary(); + forAll(Y, speciesI) { + volScalarField& Yi = Y[speciesI]; + memcpy(dfDataBase.h_y + speciesI * num_cells, &Yi[0], num_cells * sizeof(double)); + offset = 0; + forAll(Yi.boundaryField(), patchi) { + const scalarField& patchYi = Yi.boundaryField()[patchi]; + int patchsize = patchYi.size(); + memcpy(dfDataBase.h_boundary_y + speciesI * num_boundary_surfaces + offset, &patchYi[0], patchsize*sizeof(double)); + offset += patchsize; + } + } + dfDataBase.initNonConstantFieldsInternal(dfDataBase.h_y); + dfDataBase.initNonConstantFieldsBoundary(dfDataBase.h_boundary_y); +} + +void createGPUUEqn(const IOdictionary& CanteraTorchProperties, const volVectorField& U) { + // prepare mode_string and setting_path + string mode_string = "dDDI"; + string settingPath; + settingPath = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string("")); + UEqn_GPU.setConstantValues(mode_string, settingPath); + + // prepare patch_type + std::vector patch_type; + patch_type.resize(dfDataBase.num_patches); + forAll(U.boundaryField(), patchi) + { + constructBoundarySelectorPerPatch(&(patch_type[patchi]), U.boundaryField()[patchi].type()); + } + UEqn_GPU.setConstantFields(patch_type); + + // prepare internal and boundary of xxx + UEqn_GPU.createNonConstantFieldsInternal(); + UEqn_GPU.createNonConstantFieldsBoundary(); + 
UEqn_GPU.createNonConstantLduAndCsrFields(); + // UEqn_GPU has no internal non-constant fields to be init + // UEqn_GPU.initNonConstantFieldsInternal(); + UEqn_GPU.initNonConstantFieldsBoundary(); +} diff --git a/applications/solvers/dfLowMachFoam/new_UEqn.H b/applications/solvers/dfLowMachFoam/new_UEqn.H new file mode 100644 index 000000000..c38735375 --- /dev/null +++ b/applications/solvers/dfLowMachFoam/new_UEqn.H @@ -0,0 +1,47 @@ +#ifdef GPUSolver_ +// run CPU +tmp tUEqn +( + fvm::div(phi, U) +); +fvVectorMatrix& UEqn = tUEqn.ref(); + +// run GPU +// preProcess +// skip preparing u, p, nu_eff, and rho.boundary used by UEqn_GPU.preProcess() +// TODO: temp, now we only transfer phi(instead of rhoEqn) used by fvm::div(phi, U) +double *h_u = dfDataBase.getFieldPointer("u", location::cpu, position::internal); +double *h_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::internal); +double *h_boundary_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::boundary); +memcpy(h_phi, &phi[0], dfDataBase.surface_value_bytes); +int offset = 0; +forAll(phi.boundaryField(), patchi) +{ + const fvsPatchScalarField& patchPhi = phi.boundaryField()[patchi]; + int patchsize = patchPhi.size(); + memcpy(h_boundary_phi + offset, &patchPhi[0], patchsize * sizeof(double)); + offset += patchsize; +} +UEqn_GPU.preProcessForRhoEqn(h_phi, h_boundary_phi); +// process +UEqn_GPU.process(); +// postProcess +UEqn_GPU.postProcess(h_u); +// checkResult +// TODO: temp, now we compare ldu, finally we compare csr +std::vector h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); +std::vector h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); +offset = 0; +for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) +{ + int patchsize = dfDataBase.patch_size[patchi]; + const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0]; + const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0]; + memcpy(h_internal_coeffs.data() + offset * 3, 
internal_coeff_ptr, patchsize * 3 * sizeof(double)); + memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double)); + offset += patchsize; +} +bool printFlag = false; +UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], + h_internal_coeffs.data(), h_boundary_coeffs.data(), printFlag); +#endif diff --git a/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C new file mode 100644 index 000000000..530a9f7ec --- /dev/null +++ b/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C @@ -0,0 +1,108 @@ +/*---------------------------------------------------------------------------*\ + ========= | + \\ / F ield | OpenFOAM: The Open Source CFD Toolbox + \\ / O peration | Website: https://openfoam.org + \\ / A nd | Copyright (C) 2011-2019 OpenFOAM Foundation + \\/ M anipulation | +------------------------------------------------------------------------------- +License + This file is part of OpenFOAM. + + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . 
+ +Application + unittest + +Description + GPU unittest + +\*---------------------------------------------------------------------------*/ + +#include "dfChemistryModel.H" +#include "CanteraMixture.H" +// #include "hePsiThermo.H" +#include "heRhoThermo.H" + +#include "fvCFD.H" +#include "fluidThermo.H" +#include "turbulentFluidThermoModel.H" +#include "pimpleControl.H" +#include "pressureControl.H" +#include "localEulerDdtScheme.H" +#include "fvcSmooth.H" +#include "PstreamGlobals.H" +#include "basicThermo.H" +#include "CombustionModel.H" + +#include +#include +#include "upwind.H" + +#include "dfMatrixDataBase.H" +#include "dfMatrixOpBase.H" +#include "dfUEqn.H" +#include "createGPUSolver.H" + +int main(int argc, char *argv[]) +{ +#ifdef USE_PYTORCH + pybind11::scoped_interpreter guard{};//start python interpreter +#endif + #include "postProcess.H" + + // #include "setRootCaseLists.H" + #include "listOptions.H" + #include "setRootCase2.H" + #include "listOutput.H" + + #include "createTime.H" + #include "createMesh.H" + #include "createDyMControls.H" + #include "initContinuityErrs.H" + #include "createFields.H" + #include "createRhoUfIfPresent.H" + + turbulence->validate(); + + if (!LTS) + { + #include "compressibleCourantNo.H" + #include "setInitialDeltaT.H" + } + + // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + { + #include "readDyMControls.H" + + if (LTS) + { + #include "setRDeltaT.H" + } + else + { + #include "compressibleCourantNo.H" + #include "setDeltaT.H" + } + + createGPUBase(mesh, Y); + createGPUUEqn(CanteraTorchProperties, U); + + // foreach(timestep) { + #include "new_UEqn.H" + // } + } + return 0; +} + + diff --git a/src_gpu/AmgXSolver.H b/src_gpu/AmgXSolver.H new file mode 100644 index 000000000..190808934 --- /dev/null +++ b/src_gpu/AmgXSolver.H @@ -0,0 +1,310 @@ +/** + * \file AmgXSolver.hpp + * \brief Definition of class AmgXSolver. 
+ * \author Pi-Yueh Chuang (pychuang@gwu.edu) + * \author Matt Martineau (mmartineau@nvidia.com) + * \date 2015-09-01 + * \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba. + * \copyright Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * This project is released under MIT License. + */ + + +#ifndef __AMGX_SOLVER_H__ +#define __AMGX_SOLVER_H__ + +// CUDA +#include + +// STL +# include +# include +# include + +// AmgX +# include + +// PETSc +// # include + + +/** \brief A macro to check the returned CUDA error code. + * + * \param call [in] Function call to CUDA API. + */ +# define CHECK(call) \ +do \ +{ \ + const cudaError_t error_code = call; \ + if (error_code != cudaSuccess) \ + { \ + printf("CUDA Error:\n"); \ + printf(" File: %s\n", __FILE__); \ + printf(" Line: %d\n", __LINE__); \ + printf(" Error code: %d\n", error_code); \ + printf(" Error text: %s\n", \ + cudaGetErrorString(error_code)); \ + exit(1); \ + } \ +} while (0) + + + + + + +/** \brief A wrapper class for coupling PETSc and AmgX. + * + * This class is a wrapper of AmgX library for PETSc. PETSc users only need to + * pass a PETSc matrix and vectors into an AmgXSolver instance to solve their + * linear systems. The class is designed specifically for the situation where + * the number of MPI processes is more than the number of GPU devices. + * + * Eaxmple usage: + * \code + * int main(int argc, char **argv) + * { + * // initialize matrix A, RHS, etc using PETSc + * ... + * + * // create an instance of the solver wrapper + * AmgXSolver solver; + * // initialize the instance with communicator, executation mode, and config file + * solver.initialize(comm, mode, file); + * // set matrix A. Currently it only accept PETSc AIJ matrix + * solver.setA(A); + * // solve. x and rhs are PETSc vectors. 
unkns will be the final result in the end + * solver.solve(unks, rhs); + * // get number of iterations + * int iters; + * solver.getIters(iters); + * // get residual at the last iteration + * double res; + * solver.getResidual(iters, res); + * // finalization + * solver.finalize(); + * + * // other codes + * .... + * + * return 0; + * } + * \endcode + */ +class AmgXSolver +{ + public: + + /** \brief Default constructor. */ + AmgXSolver() = default; + + /** \brief Construct a AmgXSolver instance. + * + * \param comm [in] MPI communicator. + * \param modeStr [in] A string; target mode of AmgX (e.g., dDDI). + * \param cfgFile [in] A string; the path to AmgX configuration file. + */ + AmgXSolver + ( + const std::string &modeStr, + const std::string &cfgFile + ); + + /** \brief Destructor. */ + ~AmgXSolver(); + + /** \brief Initialize a AmgXSolver instance. + * + * \param comm [in] MPI communicator. + * \param modeStr [in] A string; target mode of AmgX (e.g., dDDI). + * \param cfgFile [in] A string; the path to AmgX configuration file. + * + */ + void initialize + ( + const std::string &modeStr, + const std::string &cfgFile + ); + + + /** \brief Finalize this instance. + * + * This function destroys AmgX data. When there are more than one + * AmgXSolver instances, the last one destroyed is also in charge of + * destroying the shared resource object and finalizing AmgX. + * + */ + void finalize(); + + /** \brief Set up the matrix used by AmgX. + * + * This function sets up the AmgX matrix from the provided CSR data + * structures and partition data. + * + * \param nGlobalRows [in] The number of global rows. + * \param nLocalRows [in] The number of local rows on this rank. + * \param nLocalNz [in] The total number of non zero entries locally. + * \param rowOffsets [in] The local CSR matrix row offsets. + * \param colIndicesGlobal [in] The global CSR matrix column indices. + * \param values [in] The local CSR matrix values. + * id of the owning rank for each row. 
+ * + */ + void setOperator + ( + const int nRows, + const int nNz, + const int *rowIndex, + const int *colIndex, + const double *value + ); + + /** \brief Re-sets up an existing AmgX matrix. + * + * Replaces the matrix coefficients with the provided values and performs + * a resetup for the AmgX matrix. + * + * \param nLocalRows [in] The number of local rows on this rank. + * \param nLocalNz [in] The total number of non zero entries locally. + * \param values [in] The local CSR matrix values. + * + */ + void updateOperator + ( + const int nRows, + const int nNz, + const double *value + ); + + /** \brief Solve the linear system. + * + * \p p vector will be used as an initial guess and will be updated to the + * solution by the end of solving. + * + * For cases that use more MPI processes than the number of GPUs, this + * function will do data gathering before solving and data scattering + * after the solving. + * + * \param nLocalRows [in] The number of rows owned by this rank. + * \param pscalar [in, out] The unknown array. + * \param bscalar [in] The RHS array. + * \param matrix [in,out] The AmgX CSR matrix, A. + * + */ + void solve + ( + int nRows, + double* psi, + const double* rhs + ); + + /** \brief Solve the linear system. + * + * \p p vector will be used as an initial guess and will be updated to the + * solution by the end of solving. + * + * For cases that use more MPI processes than the number of GPUs, this + * function will do data gathering before solving and data scattering + * after the solving. + * + * \param nLocalRows [in] The number of rows owned by this rank. + * \param p [in, out] The unknown vector. + * \param b [in] The RHS vector. + * \param matrix [in,out] The AmgX CSR matrix, A. + * + */ + // void solve + // ( + // int nLocalRows, + // Vec& p, + // Vec& b, + // AmgXCSRMatrix& matrix + // ); + + + /** \brief Get the number of iterations of the last solving. + * + * \param iter [out] Number of iterations. 
+ * + */ + void getIters + ( + int &iter + ); + + /** \brief Get the residual at a specific iteration during the last solving. + * + * \param iter [in] Target iteration. + * \param res [out] Returned residual. + * + */ + void getResidual + ( + const int &iter, + double &res + ); + + + private: + + /** \brief Current count of AmgXSolver instances. + * + * This static variable is used to count the number of instances. The + * fisrt instance is responsable for initializing AmgX library and the + * resource instance. + */ + static int count; + + /** \brief A flag indicating if this instance has been initialized. */ + bool isInitialised = false; + + /** \brief A parameter used by AmgX. */ + int ring; + + /** \brief AmgX solver mode. */ + AMGX_Mode mode; + + /** \brief AmgX config object. */ + AMGX_config_handle cfg = nullptr; + + /** \brief AmgX matrix object. */ + AMGX_matrix_handle AmgXA = nullptr; + + /** \brief AmgX vector object representing unknowns. */ + AMGX_vector_handle AmgXP = nullptr; + + /** \brief AmgX vector object representing RHS. */ + AMGX_vector_handle AmgXRHS = nullptr; + + /** \brief AmgX solver object. */ + AMGX_solver_handle solver = nullptr; + + /** \brief AmgX resource object. + * + * Due to the design of AmgX library, using more than one resource + * instance may cause some problems. So we make the resource instance + * as a static member to keep only one instance. + */ + static AMGX_resources_handle rsrc; + + /** \brief Set AmgX solver mode based on the user-provided string. + * + * Available modes are: dDDI, dDFI, dFFI, hDDI, hDFI, hFFI. + * + * \param modeStr [in] a std::string. + */ + void setMode(const std::string &modeStr); + + /** \brief Perform necessary initialization of AmgX. + * + * This function initializes AmgX for current instance. Based on + * \ref AmgXSolver::count "count", only the instance initialized first + * is in charge of initializing AmgX and the resource instance. 
+ * + * \param cfgFile [in] Path to AmgX solver configuration file. + */ + void initAmgX(const std::string &cfgFile); +}; + +#endif + diff --git a/src_gpu/AmgXSolver.cu b/src_gpu/AmgXSolver.cu new file mode 100644 index 000000000..b0076e5c3 --- /dev/null +++ b/src_gpu/AmgXSolver.cu @@ -0,0 +1,296 @@ +/** + * \file AmgXSolver.cpp + * \brief Definition of member functions of the class AmgXSolver. + * \author Pi-Yueh Chuang (pychuang@gwu.edu) + * \author Matt Martineau (mmartineau@nvidia.com) + * \date 2015-09-01 + * \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba. + * \copyright Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * This project is released under MIT License. + */ + +// AmgXWrapper +#include "AmgXSolver.H" +#include +#include + +// initialize AmgXSolver::count to 0 +int AmgXSolver::count = 0; + +// initialize AmgXSolver::rsrc to nullptr; +AMGX_resources_handle AmgXSolver::rsrc = nullptr; + + +/* \implements AmgXSolver::AmgXSolver */ +AmgXSolver::AmgXSolver(const std::string &modeStr, const std::string &cfgFile) +{ + initialize(modeStr, cfgFile); +} + + +/* \implements AmgXSolver::~AmgXSolver */ +AmgXSolver::~AmgXSolver() +{ + if (isInitialised) finalize(); +} + + +/* \implements AmgXSolver::initialize */ +void AmgXSolver::initialize(const std::string &modeStr, const std::string &cfgFile) +{ + + // if this instance has already been initialized, skip + if (isInitialised) { + fprintf(stderr, + "This AmgXSolver instance has been initialized on this process.\n"); + exit(0); + } + + // increase the number of AmgXSolver instances + count += 1; + + // get the mode of AmgX solver + setMode(modeStr); + + // initialize AmgX + initAmgX(cfgFile); + + // a bool indicating if this instance is initialized + isInitialised = true; + + return; +} + +/* \implements AmgXSolver::setMode */ +void AmgXSolver::setMode(const std::string &modeStr) +{ + if (modeStr == "dDDI") + mode = AMGX_mode_dDDI; + else if (modeStr == "dDFI") + mode = 
AMGX_mode_dDFI; + else if (modeStr == "dFFI") + mode = AMGX_mode_dFFI; + else if (modeStr[0] == 'h') { + printf("CPU mode, %s, is not supported in this wrapper!", + modeStr.c_str()); + exit(0); + } + else { + printf("%s is not an available mode! Available modes are: " + "dDDI, dDFI, dFFI.\n", modeStr.c_str()); + exit(0); + } +} + + +/* \implements AmgXSolver::initAmgX */ + void AmgXSolver::initAmgX(const std::string &cfgFile) +{ + // only the first instance (AmgX solver) is in charge of initializing AmgX + if (count == 1) + { + // initialize AmgX + AMGX_SAFE_CALL(AMGX_initialize()); + + // intialize AmgX plugings + AMGX_SAFE_CALL(AMGX_initialize_plugins()); + + // let AmgX to handle errors returned + AMGX_SAFE_CALL(AMGX_install_signal_handler()); + } + + // create an AmgX configure object + AMGX_SAFE_CALL(AMGX_config_create_from_file(&cfg, cfgFile.c_str())); + + // let AmgX handle returned error codes internally + AMGX_SAFE_CALL(AMGX_config_add_parameters(&cfg, "exception_handling=1")); + + // create an AmgX resource object, only the first instance is in charge + if (count == 1) AMGX_resources_create_simple(&rsrc, cfg); + + // create AmgX vector object for unknowns and RHS + AMGX_vector_create(&AmgXP, rsrc, mode); + AMGX_vector_create(&AmgXRHS, rsrc, mode); + + // create AmgX matrix object for unknowns and RHS + AMGX_matrix_create(&AmgXA, rsrc, mode); + + // create an AmgX solver object + AMGX_solver_create(&solver, rsrc, mode, cfg); + + // obtain the default number of rings based on current configuration + AMGX_config_get_default_number_of_rings(cfg, &ring); +} + +/* \implements AmgXSolver::finalize */ +void AmgXSolver::finalize() +{ + // skip if this instance has not been initialised + if (!isInitialised) + { + fprintf(stderr, + "This AmgXWrapper has not been initialised. 
" + "Please initialise it before finalization.\n"); + exit(0); + } + + // destroy solver instance + AMGX_solver_destroy(solver); + + // destroy matrix instance + AMGX_matrix_destroy(AmgXA); + + // destroy RHS and unknown vectors + AMGX_vector_destroy(AmgXP); + AMGX_vector_destroy(AmgXRHS); + + // only the last instance need to destroy resource and finalizing AmgX + if (count == 1) + { + AMGX_resources_destroy(rsrc); + AMGX_SAFE_CALL(AMGX_config_destroy(cfg)); + + AMGX_SAFE_CALL(AMGX_finalize_plugins()); + AMGX_SAFE_CALL(AMGX_finalize()); + } + else + { + AMGX_config_destroy(cfg); + } + + // decrease the number of instances + count -= 1; + + // change status + isInitialised = false; +} + +/* \implements AmgXSolver::setOperator */ +void AmgXSolver::setOperator +( + const int nRows, + const int nNz, + const int *rowIndex, + const int *colIndex, + const double *value +) +{ + + // Check the matrix size is not larger than tolerated by AmgX + if(nRows > std::numeric_limits::max()) + { + fprintf(stderr, + "AmgX does not support a global number of rows greater than " + "what can be stored in 32 bits (nGlobalRows = %d).\n", + nRows); + exit(0); + } + + if (nNz > std::numeric_limits::max()) + { + fprintf(stderr, + "AmgX does not support non-zeros per (consolidated) rank greater than" + "what can be stored in 32 bits (nLocalNz = %d).\n", + nNz); + exit(0); + } + + // upload matrix A to AmgX + AMGX_matrix_upload_all( + AmgXA, nRows, nNz, 1, 1, rowIndex, colIndex, value, nullptr); + + // bind the matrix A to the solver + AMGX_solver_setup(solver, AmgXA); + + // connect (bind) vectors to the matrix + AMGX_vector_bind(AmgXP, AmgXA); + AMGX_vector_bind(AmgXRHS, AmgXA); +} + + +/* \implements AmgXSolver::updateOperator */ +void AmgXSolver::updateOperator +( + const int nRows, + const int nNz, + const double *value +) +{ + + // Replace the coefficients for the CSR matrix A within AmgX + AMGX_matrix_replace_coefficients(AmgXA, nRows, nNz, value, nullptr); + + // Re-setup the solver (a 
reduced overhead setup that accounts for consistent matrix structure) + AMGX_solver_resetup(solver, AmgXA); +} + +/* \implements AmgXSolver::solve */ +// void AmgXSolver::solve( +// int nLocalRows, Vec& p, Vec& b, AmgXCSRMatrix& matrix) +// { +// double* pscalar; +// double* bscalar; + +// // get pointers to the raw data of local vectors +// VecGetArray(p, &pscalar); +// VecGetArray(b, &bscalar); + +// solve(nLocalRows, pscalar, bscalar, matrix); + +// VecRestoreArray(p, &pscalar); +// VecRestoreArray(b, &bscalar); +// } + + +/* \implements AmgXSolver::solve */ +void AmgXSolver::solve( + int nRows, double* psi, const double* rhs) +{ + // Upload potentially consolidated vectors to AmgX + AMGX_vector_upload(AmgXP, nRows, 1, psi); + AMGX_vector_upload(AmgXRHS, nRows, 1, rhs); + + // Solve + AMGX_solver_solve(solver, AmgXRHS, AmgXP); + + // Get the status of the solver + AMGX_SOLVE_STATUS status; + AMGX_solver_get_status(solver, &status); + + // Check whether the solver successfully solved the problem + if (status != AMGX_SOLVE_SUCCESS) + { + fprintf(stderr, "AmgX solver failed to solve the system! 
" + "The error code is %d.\n", + status); + } + + // Download data from device + AMGX_vector_download(AmgXP, psi); + + // get norm and iteration number + double irnorm = 0., rnorm = 0.; + int nIters = 0; + getResidual(0, irnorm); + getIters(nIters); + getResidual(nIters, rnorm); + printf("Initial residual = %.10lf, Final residual = %.5e, No Iterations %d\n", irnorm, rnorm, nIters); + +} + + +/* \implements AmgXSolver::getIters */ +void AmgXSolver::getIters(int &iter) +{ + // only processes using AmgX will try to get # of iterations + AMGX_solver_get_iterations_number(solver, &iter); +} + + +/* \implements AmgXSolver::getResidual */ +void AmgXSolver::getResidual(const int &iter, double &res) +{ + // only processes using AmgX will try to get residual + AMGX_solver_get_iteration_residual(solver, iter, 0, &res); +} + diff --git a/src_gpu/CMakeLists.txt b/src_gpu/CMakeLists.txt index d82c86df5..ed9070476 100644 --- a/src_gpu/CMakeLists.txt +++ b/src_gpu/CMakeLists.txt @@ -22,8 +22,10 @@ include_directories( add_library(${PROJECT_NAME} SHARED + AmgXSolver.cu dfMatrixDataBase.cu - dfMatrixOpBase.cu) + dfMatrixOpBase.cu + dfUEqn.cu) target_link_libraries(${PROJECT_NAME} ${MPI_LIBRARIES} diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index 47692e239..3e533a281 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -1,5 +1,8 @@ #pragma once +void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output); +void permute_vector_h2d(cudaStream_t stream, int num_cells, const double *input, double *output); + void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, const double *lower, const double *upper, const double *diag, const double *source, diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index 6c533e05e..99801737e 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu 
@@ -4,6 +4,28 @@ #include #include "cuda_profiler_api.h" +__global__ void permute_vector_d2h_kernel(int num_cells, const double *input, double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + output[index * 3 + 0] = input[num_cells * 0 + index]; + output[index * 3 + 1] = input[num_cells * 1 + index]; + output[index * 3 + 2] = input[num_cells * 2 + index]; +} + +__global__ void permute_vector_h2d_kernel(int num_cells, const double *input, double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + output[num_cells * 0 + index] = input[index * 3 + 0]; + output[num_cells * 1 + index] = input[index * 3 + 1]; + output[num_cells * 2 + index] = input[index * 3 + 2]; +} + __global__ void update_boundary_coeffs_zeroGradient_vector(int num, int offset, double *value_internal_coeffs, double *value_boundary_coeffs, double *gradient_internal_coeffs, double *gradient_boundary_coeffs) @@ -45,6 +67,7 @@ __global__ void fvm_div_scalar_internal(int num_surfaces, lower[index] += (-w) * f; upper[index] += (1 - w) * f; + // if (index == 0) printf("index = 0, lower: %.16lf, upper:%.16lf\n", lower[index], upper[index]); int l = lower_index[index]; int u = upper_index[index]; @@ -70,6 +93,20 @@ __global__ void fvm_div_scalar_boundary(int num, int offset, boundary_coeffs[start_index * 3 + 2] = boundary_f * value_boundary_coeffs[start_index * 3 + 2]; } +void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output) +{ + size_t threads_per_block = 256; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + permute_vector_d2h_kernel<<>>(num_cells, input, output); +} + +void permute_vector_h2d(cudaStream_t stream, int num_cells, const double *input, double *output) +{ + size_t threads_per_block = 256; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + 
permute_vector_h2d_kernel<<>>(num_cells, input, output); +} + void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, const double *lower, const double *upper, const double *diag, const double *source, diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H new file mode 100644 index 000000000..0ee570b9d --- /dev/null +++ b/src_gpu/dfUEqn.H @@ -0,0 +1,99 @@ +#pragma once + +#include "AmgXSolver.H" +#include +#include "dfMatrixDataBase.H" +#include "dfMatrixOpBase.H" + +class dfUEqn +{ +private: + dfMatrixDataBase &dataBase_; + + // constant values -- basic + std::string mode_string; + std::string setting_path; + + // constant values -- amgx solvers + AmgXSolver *UxSolver = nullptr; + AmgXSolver *UySolver = nullptr; + AmgXSolver *UzSolver = nullptr; + int num_iteration = 0; + + // constant fields - internal + // 无 + + // constant fields - boundary + std::vector patch_type; + + // non-constant fields - internal + // thermophysical fields + double *d_nu_eff = nullptr; + // computed on CPU, used on GPU, need memcpyh2d - host + double *h_nu_eff = nullptr; + // intermediate fields + double *d_grad_u = nullptr; + double *d_rho_nueff = nullptr; + double *d_permute = nullptr; + + // non-constant fields - boundary + // thermophysical fields + double *d_boundary_nu_eff = nullptr; + // computed on CPU, used on GPU, need memcpyh2d - host + double *h_boundary_nu_eff = nullptr; + // intermediate fields + double *d_boundary_grad_u = nullptr; + double *d_boundary_rho_nueff = nullptr; + // boundary coeff fields + double *d_value_internal_coeffs = nullptr; + double *d_value_boundary_coeffs= nullptr; + double *d_gradient_internal_coeffs= nullptr; + double *d_gradient_boundary_coeffs= nullptr; + + // non-constant fields - ldu + double *d_lower = nullptr; + double *d_upper = nullptr; + double *d_diag = nullptr; + double *d_source = nullptr; + double *d_internal_coeffs = nullptr; + double 
*d_boundary_coeffs = nullptr; + + // non-constant fields - csr + double *d_A = nullptr; + double *d_b = nullptr; + + // field pointer map + std::unordered_map fieldPointerMap; + +public: + // 构造函数 + dfUEqn(dfMatrixDataBase &dataBase) + : dataBase_(dataBase) {} + + // 析构函数 + ~dfUEqn(){} + + // 成员函数 + + // getter函数 + double* getFieldPointer(const char* fieldAlias, location loc, position pos); + + // 初始化构建 + void setConstantValues(const std::string &mode_string, const std::string &setting_path); + void setConstantFields(const std::vector patch_type); + void createNonConstantFieldsInternal(); + void createNonConstantFieldsBoundary(); + void createNonConstantLduAndCsrFields(); + // dfUEqn has no internal non-constant fields to be init + //void initNonConstantFieldsInternal(xxx); + void initNonConstantFieldsBoundary(); + + // 方程运行 + void preProcess(const double *h_u, const double *h_boundary_u, const double *h_p, const double *h_boundary_p, const double *h_nu_eff, const double *h_boundary_nu_eff, const double *h_boundary_rho); + void preProcessForRhoEqn(const double *h_phi, const double *h_boundary_phi); + void process(); + void postProcess(double *h_u); + + void solve(); + void compareResult(const double *lower, const double *upper, const double *diag, const double *internal_coeffs, const double *boundary_coeffs, bool printFlag); +}; diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu new file mode 100644 index 000000000..e6c52ec7f --- /dev/null +++ b/src_gpu/dfUEqn.cu @@ -0,0 +1,201 @@ +#include "dfUEqn.H" + +void dfUEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path) { + this->mode_string = mode_string; + this->setting_path = setting_path; + UxSolver = new AmgXSolver(mode_string, setting_path); + UySolver = new AmgXSolver(mode_string, setting_path); + UzSolver = new AmgXSolver(mode_string, setting_path); +} + +void dfUEqn::setConstantFields(const std::vector patch_type) { + this->patch_type = patch_type; +} + +void 
dfUEqn::createNonConstantFieldsInternal() { + // thermophysical fields + checkCudaErrors(cudaMalloc((void**)&d_nu_eff, dataBase_.cell_value_bytes)); + // computed on CPU, used on GPU, need memcpyh2d + checkCudaErrors(cudaMallocHost((void**)&h_nu_eff , dataBase_.cell_value_bytes)); + // intermediate fields + checkCudaErrors(cudaMalloc((void**)&d_grad_u, dataBase_.cell_value_tsr_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_rho_nueff, dataBase_.cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_permute, dataBase_.cell_value_vec_bytes)); + + // getter for h_nu_eff + fieldPointerMap["h_nu_eff"] = h_nu_eff; +} + +void dfUEqn::createNonConstantFieldsBoundary() { + // thermophysical fields + checkCudaErrors(cudaMalloc((void**)&d_boundary_nu_eff, dataBase_.boundary_surface_value_bytes)); + // computed on CPU, used on GPU, need memcpyh2d + checkCudaErrors(cudaMallocHost((void**)&h_boundary_nu_eff, dataBase_.boundary_surface_value_bytes)); + // intermediate fields + checkCudaErrors(cudaMalloc((void**)&d_boundary_grad_u, dataBase_.boundary_surface_value_tsr_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_nueff, dataBase_.boundary_surface_value_bytes)); + // boundary coeff fields + checkCudaErrors(cudaMalloc((void**)&d_value_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_value_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes)); + + // getter for h_boundary_nu_eff + fieldPointerMap["h_boundary_nu_eff"] = h_boundary_nu_eff; +} + +void dfUEqn::createNonConstantLduAndCsrFields() { + checkCudaErrors(cudaMalloc((void**)&d_lower, dataBase_.surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_upper, dataBase_.surface_value_bytes)); + 
checkCudaErrors(cudaMalloc((void**)&d_diag, dataBase_.cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_source, dataBase_.cell_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_A, dataBase_.csr_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_b, dataBase_.cell_value_vec_bytes)); +} + +void dfUEqn::initNonConstantFieldsBoundary() { + update_boundary_coeffs_vector(dataBase_.stream, dataBase_.num_patches, + dataBase_.patch_size.data(), patch_type.data(), + d_value_internal_coeffs, d_value_boundary_coeffs, + d_gradient_internal_coeffs, d_gradient_boundary_coeffs); +} + +void dfUEqn::preProcess(const double *h_u, const double *h_boundary_u, const double *h_p, const double *h_boundary_p, + const double *h_nu_eff, const double *h_boundary_nu_eff, const double *h_boundary_rho) { + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_u, h_u, dataBase_.cell_value_vec_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_u, h_boundary_u, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_p, h_p, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_p, h_boundary_p, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(d_nu_eff, h_nu_eff, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_nu_eff, h_boundary_nu_eff, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_rho, h_boundary_rho, dataBase_.boundary_surface_value_bytes, 
cudaMemcpyHostToDevice, dataBase_.stream)); + + checkCudaErrors(cudaMemsetAsync(d_lower, 0, dataBase_.surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_upper, 0, dataBase_.surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_diag, 0, dataBase_.cell_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_source, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_internal_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_boundary_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_A, 0, dataBase_.csr_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_b, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); +} + +void dfUEqn::preProcessForRhoEqn(const double *h_phi, const double *h_boundary_phi) { + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi, h_phi, dataBase_.surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_phi, h_boundary_phi, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + + checkCudaErrors(cudaMemsetAsync(d_lower, 0, dataBase_.surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_upper, 0, dataBase_.surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_diag, 0, dataBase_.cell_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_source, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_internal_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_boundary_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_A, 0, dataBase_.csr_value_vec_bytes, dataBase_.stream)); + 
checkCudaErrors(cudaMemsetAsync(d_b, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); + +} + +void dfUEqn::process() { + // run each fvc or fvm function + fvm_div_scalar(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_phi, dataBase_.d_weight, + d_lower, d_upper, d_diag, // end for internal + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_phi, d_value_internal_coeffs, d_value_boundary_coeffs, + d_internal_coeffs, d_boundary_coeffs); + //ldu_to_csr(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, + // dataBase_.d_lower_to_csr_index, dataBase_.d_upper_to_csr_index, dataBase_.d_diag_to_csr_index, + // d_lower, d_upper, d_diag, d_source, d_internal_coeffs, d_boundary_coeffs, d_A, d_b); + //solve(); +} + +void dfUEqn::solve() { + checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); + + int nNz = dataBase_.num_cells + dataBase_.num_surfaces * 2; // matrix entries + if (num_iteration == 0) // first interation + { + printf("Initializing AmgX Linear Solver\n"); + UxSolver->setOperator(dataBase_.num_cells, nNz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A); + UySolver->setOperator(dataBase_.num_cells, nNz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A + nNz); + UzSolver->setOperator(dataBase_.num_cells, nNz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A + 2 * nNz); + } + else + { + UxSolver->updateOperator(dataBase_.num_cells, nNz, d_A); + UySolver->updateOperator(dataBase_.num_cells, nNz, d_A + nNz); + UzSolver->updateOperator(dataBase_.num_cells, nNz, d_A + 2 * nNz); + } + UxSolver->solve(dataBase_.num_cells, dataBase_.d_u, d_b); + UySolver->solve(dataBase_.num_cells, dataBase_.d_u + dataBase_.num_cells, d_b + dataBase_.num_cells); + UzSolver->solve(dataBase_.num_cells, dataBase_.d_u + 2 * dataBase_.num_cells, d_b + 2 * dataBase_.num_cells); + num_iteration++; +} + +void dfUEqn::postProcess(double *h_u) { + 
permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, d_permute); + checkCudaErrors(cudaMemcpyAsync(h_u, d_permute, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); + + // some boundary conditions may also need vf.boundary, deltaCoeffs.boundary, and weight.boundary + update_boundary_coeffs_vector(dataBase_.stream, dataBase_.num_patches, + dataBase_.patch_size.data(), patch_type.data(), + d_value_internal_coeffs, d_value_boundary_coeffs, + d_gradient_internal_coeffs, d_gradient_boundary_coeffs); +} + +double* dfUEqn::getFieldPointer(const char* fieldAlias, location loc, position pos) { + char mergedName[256]; + if (pos == position::internal) { + sprintf(mergedName, "%s_%s", (loc == location::cpu) ? "h" : "d", fieldAlias); + } else if (pos == position::boundary) { + sprintf(mergedName, "%s_boundary_%s", (loc == location::cpu) ? "h" : "d", fieldAlias); + } + + double *pointer = nullptr; + if (fieldPointerMap.find(std::string(mergedName)) != fieldPointerMap.end()) { + pointer = fieldPointerMap[std::string(mergedName)]; + } + if (pointer == nullptr) { + fprintf(stderr, "Warning! 
getFieldPointer of %s returns nullptr!\n", mergedName); + } + //fprintf(stderr, "fieldAlias: %s, mergedName: %s, pointer: %p\n", fieldAlias, mergedName, pointer); + + return pointer; +} + +void dfUEqn::compareResult(const double *lower, const double *upper, const double *diag, const double *internal_coeffs, const double *boundary_coeffs, bool printFlag) +{ + std::vector h_lower; + h_lower.resize(dataBase_.num_surfaces); + checkCudaErrors(cudaMemcpy(h_lower.data(), d_lower, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dataBase_.num_surfaces, lower, h_lower.data(), 1e-14, printFlag); + + std::vector h_upper; + h_upper.resize(dataBase_.num_surfaces); + checkCudaErrors(cudaMemcpy(h_upper.data(), d_upper, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dataBase_.num_surfaces, upper, h_upper.data(), 1e-14, printFlag); + + std::vector h_diag; + h_diag.resize(dataBase_.num_cells); + checkCudaErrors(cudaMemcpy(h_diag.data(), d_diag, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dataBase_.num_cells, diag, h_diag.data(), 1e-14, printFlag); + + //std::vector h_source; + //h_source.resize(dataBase_.num_cells * 3); + //checkCudaErrors(cudaMemcpy(h_source.data(), d_source, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); + //checkVectorEqual(dataBase_.num_cells, source, h_source.data(), 1e-14, printFlag); + + std::vector h_internal_coeffs; + h_internal_coeffs.resize(dataBase_.num_boundary_surfaces * 3); + checkCudaErrors(cudaMemcpy(h_internal_coeffs.data(), d_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dataBase_.num_boundary_surfaces * 3, internal_coeffs, h_internal_coeffs.data(), 1e-14, printFlag); + + std::vector h_boundary_coeffs; + h_boundary_coeffs.resize(dataBase_.num_boundary_surfaces * 3); + checkCudaErrors(cudaMemcpy(h_boundary_coeffs.data(), d_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes, 
cudaMemcpyDeviceToHost)); + checkVectorEqual(dataBase_.num_boundary_surfaces * 3, boundary_coeffs, h_boundary_coeffs.data(), 1e-14, printFlag); +} + From be72eb01f3024483fc3cdf20e2d0a767317014ba Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Fri, 4 Aug 2023 21:28:32 +0800 Subject: [PATCH 05/25] small fix of fvm_div_boundary --- src_gpu/dfMatrixOpBase.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index 99801737e..4a3c25088 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -85,12 +85,12 @@ __global__ void fvm_div_scalar_boundary(int num, int offset, int start_index = offset + index; double boundary_f = boundary_phi[start_index]; - internal_coeffs[start_index * 3 + 0] = boundary_f * value_internal_coeffs[start_index * 3 + 0]; - internal_coeffs[start_index * 3 + 1] = boundary_f * value_internal_coeffs[start_index * 3 + 1]; - internal_coeffs[start_index * 3 + 2] = boundary_f * value_internal_coeffs[start_index * 3 + 2]; - boundary_coeffs[start_index * 3 + 0] = boundary_f * value_boundary_coeffs[start_index * 3 + 0]; - boundary_coeffs[start_index * 3 + 1] = boundary_f * value_boundary_coeffs[start_index * 3 + 1]; - boundary_coeffs[start_index * 3 + 2] = boundary_f * value_boundary_coeffs[start_index * 3 + 2]; + internal_coeffs[start_index * 3 + 0] += boundary_f * value_internal_coeffs[start_index * 3 + 0]; + internal_coeffs[start_index * 3 + 1] += boundary_f * value_internal_coeffs[start_index * 3 + 1]; + internal_coeffs[start_index * 3 + 2] += boundary_f * value_internal_coeffs[start_index * 3 + 2]; + boundary_coeffs[start_index * 3 + 0] += boundary_f * value_boundary_coeffs[start_index * 3 + 0]; + boundary_coeffs[start_index * 3 + 1] += boundary_f * value_boundary_coeffs[start_index * 3 + 1]; + boundary_coeffs[start_index * 3 + 2] += boundary_f * value_boundary_coeffs[start_index * 3 + 2]; } void permute_vector_d2h(cudaStream_t stream, int num_cells, 
const double *input, double *output) From 7be119075b30fa4ad8843d598e5f178808a6721c Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Fri, 4 Aug 2023 21:33:32 +0800 Subject: [PATCH 06/25] modify fvm_div_scalar to fvm_div_vector --- GPUTest/GPUTestBase.H | 4 ++-- GPUTest/unittest.C | 4 ++-- src_gpu/dfMatrixOpBase.H | 2 +- src_gpu/dfMatrixOpBase.cu | 10 +++++----- src_gpu/dfUEqn.cu | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/GPUTest/GPUTestBase.H b/GPUTest/GPUTestBase.H index bce191a9e..9de15dd90 100644 --- a/GPUTest/GPUTestBase.H +++ b/GPUTest/GPUTestBase.H @@ -174,7 +174,7 @@ void compareResultVector(const dfMatrixDataBase& dfDataBase, const testGPUDataBa } // unittest of fvm::div(phi, U) -void test_fvm_div_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfaceScalarField& phi, volVectorField& U, initType type) { +void test_fvm_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfaceScalarField& phi, volVectorField& U, initType type) { if (type == initType::randomInit) { randomInitSurfaceScalar(phi); // TODO: random init weight failed, weight is const. 
@@ -194,7 +194,7 @@ void test_fvm_div_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfa updateBoundaryCoeffsVector(dfDataBase, testData); // run GPU - fvm_div_scalar(dfDataBase.stream, dfDataBase.num_surfaces, dfDataBase.d_owner, dfDataBase.d_neighbor, + fvm_div_vector(dfDataBase.stream, dfDataBase.num_surfaces, dfDataBase.d_owner, dfDataBase.d_neighbor, dfDataBase.d_phi, dfDataBase.d_weight, testData.d_lower, testData.d_upper, testData.d_diag, // end for internal dfDataBase.num_patches, dfDataBase.patch_size.data(), testData.patch_type.data(), diff --git a/GPUTest/unittest.C b/GPUTest/unittest.C index b57a8efd6..f9826eb35 100644 --- a/GPUTest/unittest.C +++ b/GPUTest/unittest.C @@ -98,8 +98,8 @@ int main(int argc, char *argv[]) createGPUBase(mesh, Y); // unittest of fvm::div(phi, U) - test_fvm_div_scalar(dfDataBase, mesh, phi, U, initType::original); - test_fvm_div_scalar(dfDataBase, mesh, phi, U, initType::randomInit); + test_fvm_div_vector(dfDataBase, mesh, phi, U, initType::original); + test_fvm_div_vector(dfDataBase, mesh, phi, U, initType::randomInit); } return 0; } diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index 3e533a281..cfe953d4e 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -16,7 +16,7 @@ void update_boundary_coeffs_vector(cudaStream_t stream, int num_patches, // void fvm_ddt(); -void fvm_div_scalar(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, +void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *phi, const double *weight, double *lower, double *upper, double *diag, // end for internal int num_patches, const int *patch_size, const int *patch_type, diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index 4a3c25088..8fce760fd 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -53,7 +53,7 @@ __global__ void update_boundary_coeffs_zeroGradient_vector(int num, 
int offset, gradient_boundary_coeffs[start_index * 3 + 2] = 0; } -__global__ void fvm_div_scalar_internal(int num_surfaces, +__global__ void fvm_div_vector_internal(int num_surfaces, const int *lower_index, const int *upper_index, const double *phi, const double *weight, double *lower, double *upper, double *diag) @@ -75,7 +75,7 @@ __global__ void fvm_div_scalar_internal(int num_surfaces, atomicAdd(&(diag[u]), (w - 1) * f); } -__global__ void fvm_div_scalar_boundary(int num, int offset, +__global__ void fvm_div_vector_boundary(int num, int offset, const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs) { @@ -142,7 +142,7 @@ void update_boundary_coeffs_vector(cudaStream_t stream, int num_patches, } } -void fvm_div_scalar(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, +void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *phi, const double *weight, double *lower, double *upper, double *diag, // end for internal @@ -153,7 +153,7 @@ void fvm_div_scalar(cudaStream_t stream, int num_surfaces, const int *lowerAddr, size_t blocks_per_grid = 1; blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - fvm_div_scalar_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_surfaces, + fvm_div_vector_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_surfaces, lowerAddr, upperAddr, phi, weight, lower, upper, diag); @@ -165,7 +165,7 @@ void fvm_div_scalar(cudaStream_t stream, int num_surfaces, const int *lowerAddr, if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { // TODO: just vector version now - fvm_div_scalar_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(patch_size[i], offset, + fvm_div_vector_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(patch_size[i], offset, boundary_phi, value_internal_coeffs, value_boundary_coeffs, internal_coeffs, boundary_coeffs); } else if (0) { diff 
--git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index e6c52ec7f..fbbf9e71d 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -99,7 +99,7 @@ void dfUEqn::preProcessForRhoEqn(const double *h_phi, const double *h_boundary_p void dfUEqn::process() { // run each fvc or fvm function - fvm_div_scalar(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, + fvm_div_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_phi, dataBase_.d_weight, d_lower, d_upper, d_diag, // end for internal dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), From 196f22a1aaeb23729704ba8f1fc918557e86734d Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Fri, 4 Aug 2023 15:41:07 +0000 Subject: [PATCH 07/25] implement fvm::ddt(rho, U) and add unittest for it --- GPUTest/GPUTestBase.H | 165 ++++++++++++++++-- GPUTest/createGPUSolver.H | 3 +- GPUTest/unittest.C | 3 + .../solvers/dfLowMachFoam/createGPUSolver.H | 3 +- src_gpu/dfMatrixDataBase.H | 16 +- src_gpu/dfMatrixDataBase.cu | 6 +- src_gpu/dfMatrixOpBase.H | 17 +- src_gpu/dfMatrixOpBase.cu | 25 +++ 8 files changed, 207 insertions(+), 31 deletions(-) diff --git a/GPUTest/GPUTestBase.H b/GPUTest/GPUTestBase.H index 9de15dd90..d46d3e95b 100644 --- a/GPUTest/GPUTestBase.H +++ b/GPUTest/GPUTestBase.H @@ -55,7 +55,7 @@ void randomInitSurfaceScalar(surfaceScalarField& field) { int offset = 0; forAll(field.boundaryField(), patchi) { - fvsPatchScalarField& patchField = field.boundaryFieldRef()[patchi]; + auto& patchField = field.boundaryFieldRef()[patchi]; int patchsize = patchField.size(); double *field_boundary_ptr = &patchField[0]; std::vector init_field_boundary; @@ -68,6 +68,97 @@ void randomInitSurfaceScalar(surfaceScalarField& field) { } } +void randomInitVolScalar(volScalarField& field) { + // random init field value to (-0.5, 0.5) + // internal + double *field_internal_ptr = &field[0]; + std::vector init_field_internal; + 
init_field_internal.resize(dfDataBase.num_cells); + for (int i = 0; i < dfDataBase.num_cells; i++) { + init_field_internal[i] = (rand() % 10000 - 5000) / 10000.0; + } + memcpy(field_internal_ptr, init_field_internal.data(), dfDataBase.cell_value_bytes); + // boundary + int offset = 0; + forAll(field.boundaryField(), patchi) + { + auto& patchField = field.boundaryFieldRef()[patchi]; + int patchsize = patchField.size(); + double *field_boundary_ptr = &patchField[0]; + std::vector init_field_boundary; + init_field_boundary.resize(patchsize); + for (int i = 0; i < patchsize; i++) { + init_field_boundary[i] = (rand() % 10000 - 5000) / 10000.0; + } + memcpy(field_boundary_ptr, init_field_boundary.data(), patchsize * sizeof(double)); + offset += patchsize; + } +} + +// rho_old need special treatment +void uploadRhoOld(dfMatrixDataBase& dfDataBase, const volScalarField& field) { + double *h_internal_field = dfDataBase.getFieldPointer("rho", location::cpu, position::internal); + double *h_boundary_field = dfDataBase.getFieldPointer("rho", location::cpu, position::boundary); + double *d_internal_field = dfDataBase.getFieldPointer("rho_old", location::gpu, position::internal); + double *d_boundary_field = dfDataBase.getFieldPointer("rho_old", location::gpu, position::boundary); + // internal + memcpy(h_internal_field, &field[0], dfDataBase.cell_value_bytes); + // boundary + int offset = 0; + forAll(field.boundaryField(), patchi) + { + const auto& patchField = field.boundaryField()[patchi]; + int patchsize = patchField.size(); + memcpy(h_boundary_field + offset, &patchField[0], patchsize * sizeof(double)); + offset += patchsize; + } + // transfer + checkCudaErrors(cudaMemcpyAsync(d_internal_field, h_internal_field, dfDataBase.cell_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); +} + +void 
uploadVolScalar(dfMatrixDataBase& dfDataBase, const volScalarField& field, const char* fieldAlias) { + double *h_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::internal); + double *h_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::boundary); + double *d_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::internal); + double *d_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::boundary); + // internal + memcpy(h_internal_field, &field[0], dfDataBase.cell_value_bytes); + // boundary + int offset = 0; + forAll(field.boundaryField(), patchi) + { + const auto& patchField = field.boundaryField()[patchi]; + int patchsize = patchField.size(); + memcpy(h_boundary_field + offset, &patchField[0], patchsize * sizeof(double)); + offset += patchsize; + } + // transfer + checkCudaErrors(cudaMemcpyAsync(d_internal_field, h_internal_field, dfDataBase.cell_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); +} + +void uploadVolVector(dfMatrixDataBase& dfDataBase, const volVectorField& field, const char* fieldAlias) { + double *h_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::internal); + double *h_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::boundary); + double *d_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::internal); + double *d_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::boundary); + // internal + memcpy(h_internal_field, &field[0], dfDataBase.cell_value_vec_bytes); + // boundary + int offset = 0; + forAll(field.boundaryField(), patchi) + { + const auto& patchField = field.boundaryField()[patchi]; + int patchsize = patchField.size(); + 
memcpy(h_boundary_field + offset * 3, &patchField[0], patchsize * 3 * sizeof(double)); + offset += patchsize; + } + // transfer + checkCudaErrors(cudaMemcpyAsync(d_internal_field, h_internal_field, dfDataBase.cell_value_vec_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_vec_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); +} + void uploadSurfaceScalar(dfMatrixDataBase& dfDataBase, const surfaceScalarField& field, const char* fieldAlias) { double *h_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::internal); double *h_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::boundary); @@ -79,7 +170,7 @@ void uploadSurfaceScalar(dfMatrixDataBase& dfDataBase, const surfaceScalarField& int offset = 0; forAll(field.boundaryField(), patchi) { - const fvsPatchScalarField& patchField = field.boundaryField()[patchi]; + const auto& patchField = field.boundaryField()[patchi]; int patchsize = patchField.size(); memcpy(h_boundary_field + offset, &patchField[0], patchsize * sizeof(double)); offset += patchsize; @@ -89,18 +180,31 @@ void uploadSurfaceScalar(dfMatrixDataBase& dfDataBase, const surfaceScalarField& checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); } -void buildTestGPUDataBaseVector(const dfMatrixDataBase& dfDataBase, testGPUDataBase& testData, const volVectorField& field) { +void buildTestGPUDataBaseVector(const dfMatrixDataBase& dfDataBase, testGPUDataBase& testData, const volVectorField& field, + bool lowerFlag, bool upperFlag, bool diagFlag, bool sourceFlag, bool internalCoeffsFlag, bool boundaryCoeffsFlag, + bool valueInternalCoeffsFlag, bool valueBoundaryCoeffsFlag, bool gradientInternalCoeffsFlag, bool gradientBoundaryCoeffsFlag) { // ldu - checkCudaErrors(cudaMalloc((void**)&testData.d_lower, 
dfDataBase.surface_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&testData.d_upper, dfDataBase.surface_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&testData.d_diag, dfDataBase.cell_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&testData.d_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + if (lowerFlag) + checkCudaErrors(cudaMalloc((void**)&testData.d_lower, dfDataBase.surface_value_bytes)); + if (upperFlag) + checkCudaErrors(cudaMalloc((void**)&testData.d_upper, dfDataBase.surface_value_bytes)); + if (diagFlag) + checkCudaErrors(cudaMalloc((void**)&testData.d_diag, dfDataBase.cell_value_bytes)); + if (sourceFlag) + checkCudaErrors(cudaMalloc((void**)&testData.d_source, dfDataBase.cell_value_vec_bytes)); + if (internalCoeffsFlag) + checkCudaErrors(cudaMalloc((void**)&testData.d_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + if (boundaryCoeffsFlag) + checkCudaErrors(cudaMalloc((void**)&testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); // boundary coeffs - checkCudaErrors(cudaMalloc((void**)&testData.d_value_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&testData.d_value_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + if (valueInternalCoeffsFlag) + checkCudaErrors(cudaMalloc((void**)&testData.d_value_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + if (valueBoundaryCoeffsFlag) + checkCudaErrors(cudaMalloc((void**)&testData.d_value_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + if (gradientInternalCoeffsFlag) + 
checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + if (gradientBoundaryCoeffsFlag) + checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); // patch type testData.patch_type.resize(dfDataBase.num_patches); forAll(field.boundaryField(), patchi) @@ -139,7 +243,7 @@ void compareResultVector(const dfMatrixDataBase& dfDataBase, const testGPUDataBa std::vector h_source; h_source.resize(dfDataBase.num_cells * 3); checkCudaErrors(cudaMemcpy(h_source.data(), testData.d_source, dfDataBase.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); - checkVectorEqual(dfDataBase.num_cells, &dfMatrix.source()[0][0], h_source.data(), 1e-14, printFlag); + checkVectorEqual(dfDataBase.num_cells * 3, &dfMatrix.source()[0][0], h_source.data(), 1e-14, printFlag); } if (testData.d_internal_coeffs) { std::vector h_internal_coeffs; @@ -173,6 +277,37 @@ void compareResultVector(const dfMatrixDataBase& dfDataBase, const testGPUDataBa } } +// unittest of fvm::ddt(rho, U) +void test_fvm_ddt_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& rho, volVectorField& U, initType type) { + + if (type == initType::randomInit) { + // random init rho and rho.old + randomInitVolScalar(rho); + rho.oldTime(); + } + + // run CPU + fvVectorMatrix dfMatrix = fvm::ddt(rho, U); + + // prepare for run GPU + // prepare rho, rho.old, U + uploadVolScalar(dfDataBase, rho, "rho"); + uploadRhoOld(dfDataBase, rho.oldTime()); + uploadVolVector(dfDataBase, U.oldTime(), "u"); + // prepare testData + testGPUDataBase testData; + // only use diag and source + buildTestGPUDataBaseVector(dfDataBase, testData, U, false, false, true, true, false, false, false, false, false, false); + // run GPU + fvm_ddt_vector(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.rdelta_t, + dfDataBase.d_rho, dfDataBase.d_rho_old, dfDataBase.d_u, dfDataBase.d_volume, + testData.d_diag, 
testData.d_source); + + // compare result + bool printFlag = false; + compareResultVector(dfDataBase, testData, dfMatrix, printFlag); +} + // unittest of fvm::div(phi, U) void test_fvm_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfaceScalarField& phi, volVectorField& U, initType type) { if (type == initType::randomInit) { @@ -188,7 +323,9 @@ void test_fvm_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfa uploadSurfaceScalar(dfDataBase, phi, "phi"); // prepare testData testGPUDataBase testData; - buildTestGPUDataBaseVector(dfDataBase, testData, U); + // not use source + // gradient_internal_coeffs, gradient_boundary_coeffs are not needed actually, but updateBoundaryCoeffsVector will access them + buildTestGPUDataBaseVector(dfDataBase, testData, U, true, true, true, false, true, true, true, true, true, true); // prepare boundary coeffs // TODO: updating boundary coeffs should be complemented later updateBoundaryCoeffsVector(dfDataBase, testData); diff --git a/GPUTest/createGPUSolver.H b/GPUTest/createGPUSolver.H index 0f1e20eca..3dd593337 100644 --- a/GPUTest/createGPUSolver.H +++ b/GPUTest/createGPUSolver.H @@ -18,7 +18,8 @@ void createGPUBase(fvMesh& mesh, PtrList& Y) { num_patches++; } // TODO: get deltaT fomr time API - dfDataBase.setConstantValues(num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, Y.size(), 1e-6); + double rDeltaT = 1 / 1e-6; + dfDataBase.setConstantValues(num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, Y.size(), rDeltaT); // prepare constant indexes: owner, neighbor dfDataBase.setConstantIndexes(&owner[0], &neighbour[0]); diff --git a/GPUTest/unittest.C b/GPUTest/unittest.C index f9826eb35..dd1f29e53 100644 --- a/GPUTest/unittest.C +++ b/GPUTest/unittest.C @@ -97,6 +97,9 @@ int main(int argc, char *argv[]) createGPUBase(mesh, Y); + // unittest of fvm::ddt(rho, U) + test_fvm_ddt_vector(dfDataBase, mesh, rho, U, initType::original); + 
test_fvm_ddt_vector(dfDataBase, mesh, rho, U, initType::randomInit); // unittest of fvm::div(phi, U) test_fvm_div_vector(dfDataBase, mesh, phi, U, initType::original); test_fvm_div_vector(dfDataBase, mesh, phi, U, initType::randomInit); diff --git a/applications/solvers/dfLowMachFoam/createGPUSolver.H b/applications/solvers/dfLowMachFoam/createGPUSolver.H index 5d16f7b80..d9ce745d7 100644 --- a/applications/solvers/dfLowMachFoam/createGPUSolver.H +++ b/applications/solvers/dfLowMachFoam/createGPUSolver.H @@ -21,7 +21,8 @@ void createGPUBase(fvMesh& mesh, PtrList& Y) { num_patches++; } // TODO: get deltaT fomr time API - dfDataBase.setConstantValues(num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, Y.size(), 1e-6); + double rDeltaT = 1 / 1e-6; + dfDataBase.setConstantValues(num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, Y.size(), rDeltaT); // prepare constant indexes: owner, neighbor dfDataBase.setConstantIndexes(&owner[0], &neighbour[0]); diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 4d8a7d29d..cce7e6adc 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -134,10 +134,10 @@ struct dfMatrixDataBase // fields solved by eqns - old // TODO: not all fields need to store oldTime double *d_rho_old = nullptr; - double *d_u_old = nullptr; - double *d_y_old = nullptr; - double *d_he_old = nullptr; - double *d_p_old = nullptr; + //double *d_u_old = nullptr; + //double *d_y_old = nullptr; + //double *d_he_old = nullptr; + //double *d_p_old = nullptr; // other shared fields between eqns double *d_phi = nullptr; // computed on GPU, used on CPU, need memcpyd2h - host @@ -159,10 +159,10 @@ struct dfMatrixDataBase double *d_boundary_p = nullptr; // fields solved by eqns - old double *d_boundary_rho_old = nullptr; - double *d_boundary_u_old = nullptr; - double *d_boundary_y_old = nullptr; - double *d_boundary_he_old = nullptr; - double *d_boundary_p_old = nullptr; + //double 
*d_boundary_u_old = nullptr; + //double *d_boundary_y_old = nullptr; + //double *d_boundary_he_old = nullptr; + //double *d_boundary_p_old = nullptr; // other shared fields between eqns double *d_boundary_phi = nullptr; // computed on GPU, used on CPU, need memcpyd2h - host diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index 2ef707bbc..341241bf4 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -231,7 +231,8 @@ void dfMatrixDataBase::createNonConstantFieldsInternal() { fieldPointerMap["d_he"] = d_he; fieldPointerMap["d_p"] = d_p; - // checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_value_bytes)); + fieldPointerMap["d_rho_old"] = d_rho_old; // checkCudaErrors(cudaMalloc((void**)&d_u_old, cell_value_vec_bytes)); // checkCudaErrors(cudaMalloc((void**)&d_y_old, cell_value_bytes * num_species)); // checkCudaErrors(cudaMalloc((void**)&d_he_old, cell_value_bytes)); @@ -269,7 +270,8 @@ void dfMatrixDataBase::createNonConstantFieldsBoundary() { fieldPointerMap["d_boundary_he"] = d_boundary_he; fieldPointerMap["d_boundary_p"] = d_boundary_p; - // checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_old, boundary_surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_old, boundary_surface_value_bytes)); + fieldPointerMap["d_boundary_rho_old"] = d_boundary_rho_old; // checkCudaErrors(cudaMalloc((void**)&d_boundary_u_old, boundary_surface_value_vec_bytes)); // checkCudaErrors(cudaMalloc((void**)&d_boundary_y_old, boundary_surface_value_bytes * num_species)); // checkCudaErrors(cudaMalloc((void**)&d_boundary_he_old, boundary_surface_value_bytes)); diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index cfe953d4e..40404944d 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -1,5 +1,6 @@ #pragma once +// tools void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double 
*output); void permute_vector_h2d(cudaStream_t stream, int num_cells, const double *input, double *output); @@ -14,7 +15,11 @@ void update_boundary_coeffs_vector(cudaStream_t stream, int num_patches, double *value_internal_coeffs, double *value_boundary_coeffs, double *gradient_internal_coeffs, double *gradient_boundary_coeffs); -// void fvm_ddt(); +// fvm ops + +void fvm_ddt_vector(cudaStream_t stream, int num_cells, double rDeltaT, + const double *rho, const double *rho_old, const double *vf, const double *volume, + double *diag, double *source); void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *phi, const double *weight, @@ -23,10 +28,12 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs); -// void fvm_laplacian(); -// -// void fvc_ddt(); -// +void fvm_laplacian(); + +// fvc ops + +void fvc_ddt(); + // void fvc_grad_surface(); // // void fvc_div_cell(); diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index 8fce760fd..1728a91dc 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -53,6 +53,21 @@ __global__ void update_boundary_coeffs_zeroGradient_vector(int num, int offset, gradient_boundary_coeffs[start_index * 3 + 2] = 0; } +__global__ void fvm_ddt_vector_kernel(int num_cells, double rDeltaT, + const double *rho, const double *rho_old, const double *vf, const double *volume, + double *diag, double *source) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + diag[index] += rDeltaT * rho[index] * volume[index]; + // TODO: skip moving + source[index * 3 + 0] += rDeltaT * rho_old[index] * vf[index * 3 + 0] * volume[index]; + source[index * 3 + 1] += rDeltaT * rho_old[index] * vf[index * 3 + 1] * volume[index]; + source[index * 3 + 2] += 
rDeltaT * rho_old[index] * vf[index * 3 + 2] * volume[index]; +} + __global__ void fvm_div_vector_internal(int num_surfaces, const int *lower_index, const int *upper_index, const double *phi, const double *weight, @@ -142,6 +157,16 @@ void update_boundary_coeffs_vector(cudaStream_t stream, int num_patches, } } +void fvm_ddt_vector(cudaStream_t stream, int num_cells, double rDeltaT, + const double *rho, const double *rho_old, const double *vf, const double *volume, + double *diag, double *source) +{ + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvm_ddt_vector_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, + rDeltaT, rho, rho_old, vf, volume, diag, source); +} + void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *phi, const double *weight, double *lower, double *upper, double *diag, // end for internal From 46062382354ff8444cc6220af00ba42a39d94097 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Mon, 7 Aug 2023 06:27:09 +0000 Subject: [PATCH 08/25] implement fvm::laplacian(gamma, U) and add unittest for it; fix several old bugs; --- GPUTest/GPUTestBase.H | 120 ++++++++++++++++++++++++++++++++------ GPUTest/unittest.C | 7 ++- src_gpu/dfMatrixOpBase.H | 9 ++- src_gpu/dfMatrixOpBase.cu | 98 ++++++++++++++++++++++++++++--- 4 files changed, 205 insertions(+), 29 deletions(-) diff --git a/GPUTest/GPUTestBase.H b/GPUTest/GPUTestBase.H index d46d3e95b..83c9976f3 100644 --- a/GPUTest/GPUTestBase.H +++ b/GPUTest/GPUTestBase.H @@ -95,8 +95,8 @@ void randomInitVolScalar(volScalarField& field) { } } -// rho_old need special treatment -void uploadRhoOld(dfMatrixDataBase& dfDataBase, const volScalarField& field) { +// rho_old need special treatment: it use h_xxx of rho +void uploadRegisteredRhoOld(dfMatrixDataBase& dfDataBase, const volScalarField& field) { double *h_internal_field = dfDataBase.getFieldPointer("rho", location::cpu, position::internal); double 
*h_boundary_field = dfDataBase.getFieldPointer("rho", location::cpu, position::boundary); double *d_internal_field = dfDataBase.getFieldPointer("rho_old", location::gpu, position::internal); @@ -117,7 +117,7 @@ void uploadRhoOld(dfMatrixDataBase& dfDataBase, const volScalarField& field) { checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); } -void uploadVolScalar(dfMatrixDataBase& dfDataBase, const volScalarField& field, const char* fieldAlias) { +void uploadRegisteredVolScalar(dfMatrixDataBase& dfDataBase, const volScalarField& field, const char* fieldAlias) { double *h_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::internal); double *h_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::boundary); double *d_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::internal); @@ -138,7 +138,7 @@ void uploadVolScalar(dfMatrixDataBase& dfDataBase, const volScalarField& field, checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); } -void uploadVolVector(dfMatrixDataBase& dfDataBase, const volVectorField& field, const char* fieldAlias) { +void uploadRegisteredVolVector(dfMatrixDataBase& dfDataBase, const volVectorField& field, const char* fieldAlias) { double *h_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::internal); double *h_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::boundary); double *d_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::internal); @@ -159,7 +159,7 @@ void uploadVolVector(dfMatrixDataBase& dfDataBase, const volVectorField& field, checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_vec_bytes, cudaMemcpyHostToDevice, 
dfDataBase.stream)); } -void uploadSurfaceScalar(dfMatrixDataBase& dfDataBase, const surfaceScalarField& field, const char* fieldAlias) { +void uploadRegisteredSurfaceScalar(dfMatrixDataBase& dfDataBase, const surfaceScalarField& field, const char* fieldAlias) { double *h_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::internal); double *h_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::boundary); double *d_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::internal); @@ -180,31 +180,67 @@ void uploadSurfaceScalar(dfMatrixDataBase& dfDataBase, const surfaceScalarField& checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); } +void uploadVolScalar(dfMatrixDataBase& dfDataBase, const volScalarField& field, double *d_field, double *d_boundary_field) { + std::vector h_boundary_field; + h_boundary_field.resize(dfDataBase.num_boundary_surfaces); + int offset = 0; + forAll(field.boundaryField(), patchi) + { + const auto& patchField = field.boundaryField()[patchi]; + int patchsize = patchField.size(); + memcpy(h_boundary_field.data() + offset, &patchField[0], patchsize * sizeof(double)); + offset += patchsize; + } + checkCudaErrors(cudaMemcpyAsync(d_field, &field[0], dfDataBase.cell_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field.data(), dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); +} + + void buildTestGPUDataBaseVector(const dfMatrixDataBase& dfDataBase, testGPUDataBase& testData, const volVectorField& field, bool lowerFlag, bool upperFlag, bool diagFlag, bool sourceFlag, bool internalCoeffsFlag, bool boundaryCoeffsFlag, bool valueInternalCoeffsFlag, bool valueBoundaryCoeffsFlag, bool gradientInternalCoeffsFlag, bool gradientBoundaryCoeffsFlag) { // ldu - if 
(lowerFlag) + if (lowerFlag) { checkCudaErrors(cudaMalloc((void**)&testData.d_lower, dfDataBase.surface_value_bytes)); - if (upperFlag) + checkCudaErrors(cudaMemset(testData.d_lower, 0, dfDataBase.surface_value_bytes)); + } + if (upperFlag) { checkCudaErrors(cudaMalloc((void**)&testData.d_upper, dfDataBase.surface_value_bytes)); - if (diagFlag) + checkCudaErrors(cudaMemset(testData.d_upper, 0, dfDataBase.surface_value_bytes)); + } + if (diagFlag) { checkCudaErrors(cudaMalloc((void**)&testData.d_diag, dfDataBase.cell_value_bytes)); - if (sourceFlag) + checkCudaErrors(cudaMemset(testData.d_diag, 0, dfDataBase.cell_value_bytes)); + } + if (sourceFlag) { checkCudaErrors(cudaMalloc((void**)&testData.d_source, dfDataBase.cell_value_vec_bytes)); - if (internalCoeffsFlag) + checkCudaErrors(cudaMemset(testData.d_source, 0, dfDataBase.cell_value_vec_bytes)); + } + if (internalCoeffsFlag) { checkCudaErrors(cudaMalloc((void**)&testData.d_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - if (boundaryCoeffsFlag) + checkCudaErrors(cudaMemset(testData.d_internal_coeffs, 0, dfDataBase.boundary_surface_value_vec_bytes)); + } + if (boundaryCoeffsFlag) { checkCudaErrors(cudaMalloc((void**)&testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMemset(testData.d_boundary_coeffs, 0, dfDataBase.boundary_surface_value_vec_bytes)); + } // boundary coeffs - if (valueInternalCoeffsFlag) + if (valueInternalCoeffsFlag) { checkCudaErrors(cudaMalloc((void**)&testData.d_value_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - if (valueBoundaryCoeffsFlag) + checkCudaErrors(cudaMemset(testData.d_value_internal_coeffs, 0, dfDataBase.boundary_surface_value_vec_bytes)); + } + if (valueBoundaryCoeffsFlag) { checkCudaErrors(cudaMalloc((void**)&testData.d_value_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - if (gradientInternalCoeffsFlag) + checkCudaErrors(cudaMemset(testData.d_value_boundary_coeffs, 0, 
dfDataBase.boundary_surface_value_vec_bytes)); + } + if (gradientInternalCoeffsFlag) { checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - if (gradientBoundaryCoeffsFlag) + checkCudaErrors(cudaMemset(testData.d_gradient_internal_coeffs, 0, dfDataBase.boundary_surface_value_vec_bytes)); + } + if (gradientBoundaryCoeffsFlag) { checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMemset(testData.d_gradient_boundary_coeffs, 0, dfDataBase.boundary_surface_value_vec_bytes)); + } // patch type testData.patch_type.resize(dfDataBase.num_patches); forAll(field.boundaryField(), patchi) @@ -291,9 +327,9 @@ void test_fvm_ddt_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc // prepare for run GPU // prepare rho, rho.old, U - uploadVolScalar(dfDataBase, rho, "rho"); - uploadRhoOld(dfDataBase, rho.oldTime()); - uploadVolVector(dfDataBase, U.oldTime(), "u"); + uploadRegisteredVolScalar(dfDataBase, rho, "rho"); + uploadRegisteredRhoOld(dfDataBase, rho.oldTime()); + uploadRegisteredVolVector(dfDataBase, U.oldTime(), "u"); // prepare testData testGPUDataBase testData; // only use diag and source @@ -320,7 +356,7 @@ void test_fvm_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfa // prepare for run GPU // prepare phi field - uploadSurfaceScalar(dfDataBase, phi, "phi"); + uploadRegisteredSurfaceScalar(dfDataBase, phi, "phi"); // prepare testData testGPUDataBase testData; // not use source @@ -342,3 +378,49 @@ void test_fvm_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfa bool printFlag = false; compareResultVector(dfDataBase, testData, dfMatrix, printFlag); } + +void test_fvm_laplacian_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, + volScalarField& gamma, volVectorField& U, initType type) +{ + if (type == initType::randomInit) { + randomInitVolScalar(gamma); + } + + // run 
CPU + fvVectorMatrix dfMatrix = fvm::laplacian(gamma, U); + + // prepare for run GPU + // prepare gamma on GPU + double *d_gamma = nullptr; + double *d_boundary_gamma = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_gamma, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_gamma, dfDataBase.boundary_surface_value_bytes)); + uploadVolScalar(dfDataBase, gamma, d_gamma, d_boundary_gamma); + // prepare testData + testGPUDataBase testData; + // not use source + // value_internal_coeffs, value_boundary_coeffs are not needed actually, but updateBoundaryCoeffsVector will access them + buildTestGPUDataBaseVector(dfDataBase, testData, U, true, true, true, false, true, true, true, true, true, true); + // prepare boundary coeffs + // TODO: updating boundary coeffs should be complemented later + updateBoundaryCoeffsVector(dfDataBase, testData); + + // run GPU + fvm_laplacian_vector(dfDataBase.stream, dfDataBase.num_surfaces, dfDataBase.num_boundary_surfaces, + dfDataBase.d_owner, dfDataBase.d_neighbor, + dfDataBase.d_weight, dfDataBase.d_mag_sf, dfDataBase.d_delta_coeffs, d_gamma, + testData.d_lower, testData.d_upper, testData.d_diag, // end for internal + dfDataBase.num_patches, dfDataBase.patch_size.data(), testData.patch_type.data(), + dfDataBase.d_boundary_mag_sf, d_boundary_gamma, + testData.d_gradient_internal_coeffs, testData.d_gradient_boundary_coeffs, + testData.d_internal_coeffs, testData.d_boundary_coeffs); + + // compare result + bool printFlag = false; + compareResultVector(dfDataBase, testData, dfMatrix, printFlag); + + // free resources + checkCudaErrors(cudaFree(d_gamma)); + checkCudaErrors(cudaFree(d_boundary_gamma)); +} + diff --git a/GPUTest/unittest.C b/GPUTest/unittest.C index dd1f29e53..78608e46e 100644 --- a/GPUTest/unittest.C +++ b/GPUTest/unittest.C @@ -103,8 +103,13 @@ int main(int argc, char *argv[]) // unittest of fvm::div(phi, U) test_fvm_div_vector(dfDataBase, mesh, phi, U, initType::original); 
test_fvm_div_vector(dfDataBase, mesh, phi, U, initType::randomInit); + // unittest of fvm::laplacian(gamma, U) + const tmp nuEff_tmp(turbulence->nuEff()); + const volScalarField& nuEff = nuEff_tmp(); + volScalarField gamma = rho * nuEff; + test_fvm_laplacian_vector(dfDataBase, mesh, gamma, U, initType::original); + test_fvm_laplacian_vector(dfDataBase, mesh, gamma, U, initType::randomInit); } return 0; } - diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index 40404944d..617a2e787 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -28,7 +28,14 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs); -void fvm_laplacian(); +void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma, + double *lower, double *upper, double *diag, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const double *boundary_mag_sf, const double *boundary_gamma, + const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, + double *internal_coeffs, double *boundary_coeffs); // fvc ops diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index 1728a91dc..5b63cad61 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -80,14 +80,16 @@ __global__ void fvm_div_vector_internal(int num_surfaces, double w = weight[index]; double f = phi[index]; - lower[index] += (-w) * f; - upper[index] += (1 - w) * f; + double lower_value = (-w) * f; + double upper_value = (1 - w) * f; + lower[index] += lower_value; + upper[index] += upper_value; // if (index == 0) printf("index = 0, lower: %.16lf, upper:%.16lf\n", lower[index], 
upper[index]); - int l = lower_index[index]; - int u = upper_index[index]; - atomicAdd(&(diag[l]), w * f); - atomicAdd(&(diag[u]), (w - 1) * f); + int owner = lower_index[index]; + int neighbor = upper_index[index]; + atomicAdd(&(diag[owner]), -lower_value); + atomicAdd(&(diag[neighbor]), -upper_value); } __global__ void fvm_div_vector_boundary(int num, int offset, @@ -108,6 +110,53 @@ __global__ void fvm_div_vector_boundary(int num, int offset, boundary_coeffs[start_index * 3 + 2] += boundary_f * value_boundary_coeffs[start_index * 3 + 2]; } +__global__ void fvm_laplacian_vector_internal(int num_surfaces, + const int *lower_index, const int *upper_index, + const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma, + double *lower, double *upper, double *diag) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_surfaces) + return; + + int owner = lower_index[index]; + int neighbor = upper_index[index]; + + double w = weight[index]; + double upper_face_gamma = w * gamma[owner] + (1 - w) * gamma[neighbor]; + double upper_value = upper_face_gamma * mag_sf[index] * delta_coeffs[index]; + + // laplacian doesn't use the original lower, but use lower = upper + //double lower_face_gamma = w * gamma[neighbor] + (1 - w) * gamma[owner]; + //double lower_value = lower_face_gamma * mag_sf[index] * delta_coeffs[index]; + double lower_value = upper_value; + + lower[index] += lower_value; + upper[index] += upper_value; + + atomicAdd(&(diag[owner]), -lower_value); + atomicAdd(&(diag[neighbor]), -upper_value); +} + +__global__ void fvm_laplacian_vector_boundary(int num, int offset, + const double *boundary_mag_sf, const double *boundary_gamma, + const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, + double *internal_coeffs, double *boundary_coeffs) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num) + return; + + int start_index = offset + index; + double boundary_value = 
boundary_gamma[start_index] * boundary_mag_sf[start_index]; + internal_coeffs[start_index * 3 + 0] += boundary_value * gradient_internal_coeffs[start_index * 3 + 0]; + internal_coeffs[start_index * 3 + 1] += boundary_value * gradient_internal_coeffs[start_index * 3 + 1]; + internal_coeffs[start_index * 3 + 2] += boundary_value * gradient_internal_coeffs[start_index * 3 + 2]; + boundary_coeffs[start_index * 3 + 0] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 0]; + boundary_coeffs[start_index * 3 + 1] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 1]; + boundary_coeffs[start_index * 3 + 2] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 2]; +} + void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output) { size_t threads_per_block = 256; @@ -178,8 +227,7 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, size_t blocks_per_grid = 1; blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - fvm_div_vector_internal<<>>(num_surfaces, - lowerAddr, upperAddr, + fvm_div_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, phi, weight, lower, upper, diag); int offset = 0; @@ -200,3 +248,37 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, } } +void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, // TODO: num_boundary_surfaces may not be in use + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma, + double *lower, double *upper, double *diag, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const double *boundary_mag_sf, const double *boundary_gamma, + const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, + double *internal_coeffs, double *boundary_coeffs) +{ + size_t threads_per_block = 1024; + size_t 
blocks_per_grid = 1; + + blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + fvm_laplacian_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, + weight, mag_sf, delta_coeffs, gamma, lower, upper, diag); + + int offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 256; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just basic patch type now + if (patch_type[i] == boundaryConditions::zeroGradient + || patch_type[i] == boundaryConditions::fixedValue) { + // TODO: just vector version now + fvm_laplacian_vector_boundary<<>>(patch_size[i], offset, + boundary_mag_sf, boundary_gamma, gradient_internal_coeffs, gradient_boundary_coeffs, + internal_coeffs, boundary_coeffs); + } else if (0) { + // xxx + } + offset += patch_size[i]; + } +} + From cc7223d9a8bf13d58a7ec000423958acde27b50f Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Mon, 7 Aug 2023 08:40:42 +0000 Subject: [PATCH 09/25] implement fvc::ddt(rho, K) and add unittest for it; fix several old bugs; --- GPUTest/GPUTestBase.H | 106 +++++++++++++++++++++++++++++++++++++- GPUTest/unittest.C | 7 +++ src_gpu/dfMatrixOpBase.H | 8 +-- src_gpu/dfMatrixOpBase.cu | 29 +++++++++-- 4 files changed, 142 insertions(+), 8 deletions(-) diff --git a/GPUTest/GPUTestBase.H b/GPUTest/GPUTestBase.H index 83c9976f3..b6b7bba62 100644 --- a/GPUTest/GPUTestBase.H +++ b/GPUTest/GPUTestBase.H @@ -195,6 +195,58 @@ void uploadVolScalar(dfMatrixDataBase& dfDataBase, const volScalarField& field, checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field.data(), dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); } +void buildTestGPUDataBaseScalar(const dfMatrixDataBase& dfDataBase, testGPUDataBase& testData, const volScalarField& field, + bool lowerFlag, bool upperFlag, bool diagFlag, bool sourceFlag, bool internalCoeffsFlag, bool boundaryCoeffsFlag, + bool valueInternalCoeffsFlag, bool 
valueBoundaryCoeffsFlag, bool gradientInternalCoeffsFlag, bool gradientBoundaryCoeffsFlag) { + // ldu + if (lowerFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_lower, dfDataBase.surface_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_lower, 0, dfDataBase.surface_value_bytes)); + } + if (upperFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_upper, dfDataBase.surface_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_upper, 0, dfDataBase.surface_value_bytes)); + } + if (diagFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_diag, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_diag, 0, dfDataBase.cell_value_bytes)); + } + if (sourceFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_source, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_source, 0, dfDataBase.cell_value_bytes)); + } + if (internalCoeffsFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_internal_coeffs, dfDataBase.boundary_surface_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_internal_coeffs, 0, dfDataBase.boundary_surface_value_bytes)); + } + if (boundaryCoeffsFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_boundary_coeffs, 0, dfDataBase.boundary_surface_value_bytes)); + } + // boundary coeffs + if (valueInternalCoeffsFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_value_internal_coeffs, dfDataBase.boundary_surface_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_value_internal_coeffs, 0, dfDataBase.boundary_surface_value_bytes)); + } + if (valueBoundaryCoeffsFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_value_boundary_coeffs, dfDataBase.boundary_surface_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_value_boundary_coeffs, 0, dfDataBase.boundary_surface_value_bytes)); + } + if (gradientInternalCoeffsFlag) { + 
checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_internal_coeffs, dfDataBase.boundary_surface_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_gradient_internal_coeffs, 0, dfDataBase.boundary_surface_value_bytes)); + } + if (gradientBoundaryCoeffsFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_boundary_coeffs, dfDataBase.boundary_surface_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_gradient_boundary_coeffs, 0, dfDataBase.boundary_surface_value_bytes)); + } + // patch type + testData.patch_type.resize(dfDataBase.num_patches); + forAll(field.boundaryField(), patchi) + { + constructBoundarySelectorPerPatch(&(testData.patch_type[patchi]), field.boundaryField()[patchi].type()); + } +} void buildTestGPUDataBaseVector(const dfMatrixDataBase& dfDataBase, testGPUDataBase& testData, const volVectorField& field, bool lowerFlag, bool upperFlag, bool diagFlag, bool sourceFlag, bool internalCoeffsFlag, bool boundaryCoeffsFlag, @@ -318,8 +370,8 @@ void test_fvm_ddt_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc if (type == initType::randomInit) { // random init rho and rho.old - randomInitVolScalar(rho); rho.oldTime(); + randomInitVolScalar(rho); } // run CPU @@ -379,6 +431,7 @@ void test_fvm_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfa compareResultVector(dfDataBase, testData, dfMatrix, printFlag); } +// unittest of fvm::laplacian(gamma, vf) void test_fvm_laplacian_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& gamma, volVectorField& U, initType type) { @@ -406,7 +459,7 @@ void test_fvm_laplacian_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, updateBoundaryCoeffsVector(dfDataBase, testData); // run GPU - fvm_laplacian_vector(dfDataBase.stream, dfDataBase.num_surfaces, dfDataBase.num_boundary_surfaces, + fvm_laplacian_vector(dfDataBase.stream, dfDataBase.num_surfaces, dfDataBase.d_owner, dfDataBase.d_neighbor, dfDataBase.d_weight, dfDataBase.d_mag_sf, 
dfDataBase.d_delta_coeffs, d_gamma, testData.d_lower, testData.d_upper, testData.d_diag, // end for internal @@ -424,3 +477,52 @@ void test_fvm_laplacian_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, checkCudaErrors(cudaFree(d_boundary_gamma)); } +// unittest of fvc::ddt(rho, K) +void test_fvc_ddt_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& rho, volScalarField& K, initType type) { + + if (type == initType::randomInit) { + // random init rho and rho.old + rho.oldTime(); + randomInitVolScalar(rho); + K.oldTime(); + randomInitVolScalar(K); + } + + // run CPU + volScalarField fvc_ouput_scalar = fvc::ddt(rho, K); + + // prepare for run GPU + // prepare rho, rho.old on GPU + uploadRegisteredVolScalar(dfDataBase, rho, "rho"); + uploadRegisteredRhoOld(dfDataBase, rho.oldTime()); + // prepare K, K_old on GPU + double *d_K = nullptr; + double *d_K_old = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_K, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_K_old, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMemcpyAsync(d_K, &K[0], dfDataBase.cell_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(d_K_old, &K.oldTime()[0], dfDataBase.cell_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + // there is no need for fvc ops to build testGPUDataBase, just build d_fvc_ouput_scalar directly. + double *d_fvc_ouput_scalar = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_scalar, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_scalar, 0, dfDataBase.cell_value_bytes)); + // run GPU + // fvc_ddt doesn't consider to add fvc_output to source yet, which needs (fvc_output * volume * sign). 
+ fvc_ddt_scalar(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.rdelta_t, + dfDataBase.d_rho, dfDataBase.d_rho_old, d_K, d_K_old, + d_fvc_ouput_scalar); + + // compare result + bool printFlag = false; + std::vector h_fvc_ouput_scalar; + h_fvc_ouput_scalar.resize(dfDataBase.num_cells); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_scalar.data(), d_fvc_ouput_scalar, dfDataBase.cell_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells, &fvc_ouput_scalar[0], h_fvc_ouput_scalar.data(), 1e-12, printFlag); + + // free resources + checkCudaErrors(cudaFree(d_K)); + checkCudaErrors(cudaFree(d_K_old)); + checkCudaErrors(cudaFree(d_fvc_ouput_scalar)); +} + + diff --git a/GPUTest/unittest.C b/GPUTest/unittest.C index 78608e46e..cf58ec093 100644 --- a/GPUTest/unittest.C +++ b/GPUTest/unittest.C @@ -100,15 +100,22 @@ int main(int argc, char *argv[]) // unittest of fvm::ddt(rho, U) test_fvm_ddt_vector(dfDataBase, mesh, rho, U, initType::original); test_fvm_ddt_vector(dfDataBase, mesh, rho, U, initType::randomInit); + // unittest of fvm::div(phi, U) test_fvm_div_vector(dfDataBase, mesh, phi, U, initType::original); test_fvm_div_vector(dfDataBase, mesh, phi, U, initType::randomInit); + // unittest of fvm::laplacian(gamma, U) const tmp nuEff_tmp(turbulence->nuEff()); const volScalarField& nuEff = nuEff_tmp(); volScalarField gamma = rho * nuEff; test_fvm_laplacian_vector(dfDataBase, mesh, gamma, U, initType::original); test_fvm_laplacian_vector(dfDataBase, mesh, gamma, U, initType::randomInit); + + // unittest of fvc::ddt(rho, K) + K = 0.5*magSqr(U); + test_fvc_ddt_scalar(dfDataBase, mesh, rho, K, initType::original); + test_fvc_ddt_scalar(dfDataBase, mesh, rho, K, initType::randomInit); } return 0; } diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index 617a2e787..9de229b14 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -28,7 +28,7 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, 
const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs); -void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, +void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma, double *lower, double *upper, double *diag, // end for internal @@ -38,8 +38,10 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundar double *internal_coeffs, double *boundary_coeffs); // fvc ops - -void fvc_ddt(); +// fvc_ddt doesn't consider to add fvc_output to source yet, which needs (fvc_output * volume * sign). +void fvc_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT, + const double *rho, const double *rho_old, const double *vf, const double *vf_old, + double *output); // void fvc_grad_surface(); // diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index 5b63cad61..c8583cd17 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -157,6 +157,18 @@ __global__ void fvm_laplacian_vector_boundary(int num, int offset, boundary_coeffs[start_index * 3 + 2] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 2]; } +__global__ void fvc_ddt_scalar_kernel(int num_cells, double rDeltaT, + const double *rho, const double *rho_old, const double *vf, const double *vf_old, + double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // TODO: skip moving + output[index] += rDeltaT * (rho[index] * vf[index] - rho_old[index] * vf_old[index]); +} + void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output) { size_t threads_per_block = 256; @@ -197,10 +209,9 @@ void update_boundary_coeffs_vector(cudaStream_t stream, int num_patches, if (patch_type[i] == 
boundaryConditions::zeroGradient) { update_boundary_coeffs_zeroGradient_vector<<>>(patch_size[i], offset, value_internal_coeffs, value_boundary_coeffs, gradient_internal_coeffs, gradient_boundary_coeffs); - } else if (patch_type[i] == boundaryConditions::fixedValue) { - // xxx } else if (0) { // xxx + fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); } offset += patch_size[i]; } @@ -243,12 +254,13 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, internal_coeffs, boundary_coeffs); } else if (0) { // xxx + fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); } offset += patch_size[i]; } } -void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, // TODO: num_boundary_surfaces may not be in use +void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma, double *lower, double *upper, double *diag, // end for internal @@ -277,8 +289,19 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundar internal_coeffs, boundary_coeffs); } else if (0) { // xxx + fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); } offset += patch_size[i]; } } +void fvc_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT, + const double *rho, const double *rho_old, const double *vf, const double *vf_old, + double *output) +{ + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvc_ddt_scalar_kernel<<>>(num_cells, + rDeltaT, rho, rho_old, vf, vf_old, output); +} + From 15e30b43e7586c00b9cbd67d3724a3058ed91b47 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Mon, 7 Aug 2023 11:09:26 +0000 Subject: [PATCH 10/25] workaround to fix a bug of floating-point numerical error for fvc_ddt --- 
GPUTest/GPUTestBase.H | 27 +++++++++++++-------------- GPUTest/unittest.C | 9 +++++++++ src_gpu/dfMatrixDataBase.H | 2 ++ src_gpu/dfMatrixOpBase.cu | 14 +++++++++++++- 4 files changed, 37 insertions(+), 15 deletions(-) diff --git a/GPUTest/GPUTestBase.H b/GPUTest/GPUTestBase.H index b6b7bba62..360abdf2c 100644 --- a/GPUTest/GPUTestBase.H +++ b/GPUTest/GPUTestBase.H @@ -367,11 +367,10 @@ void compareResultVector(const dfMatrixDataBase& dfDataBase, const testGPUDataBa // unittest of fvm::ddt(rho, U) void test_fvm_ddt_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& rho, volVectorField& U, initType type) { - if (type == initType::randomInit) { - // random init rho and rho.old - rho.oldTime(); - randomInitVolScalar(rho); + // random init rho and rho.old + rho.oldTime(); + randomInitVolScalar(rho); } // run CPU @@ -399,8 +398,8 @@ void test_fvm_ddt_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc // unittest of fvm::div(phi, U) void test_fvm_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfaceScalarField& phi, volVectorField& U, initType type) { if (type == initType::randomInit) { - randomInitSurfaceScalar(phi); - // TODO: random init weight failed, weight is const. 
+ phi.oldTime(); + randomInitSurfaceScalar(phi); } // run CPU @@ -436,7 +435,8 @@ void test_fvm_laplacian_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& gamma, volVectorField& U, initType type) { if (type == initType::randomInit) { - randomInitVolScalar(gamma); + gamma.oldTime(); + randomInitVolScalar(gamma); } // run CPU @@ -479,13 +479,12 @@ void test_fvm_laplacian_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, // unittest of fvc::ddt(rho, K) void test_fvc_ddt_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& rho, volScalarField& K, initType type) { - if (type == initType::randomInit) { - // random init rho and rho.old - rho.oldTime(); - randomInitVolScalar(rho); - K.oldTime(); - randomInitVolScalar(K); + // random init rho and rho.old + rho.oldTime(); + randomInitVolScalar(rho); + K.oldTime(); + randomInitVolScalar(K); } // run CPU @@ -517,7 +516,7 @@ void test_fvc_ddt_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc std::vector h_fvc_ouput_scalar; h_fvc_ouput_scalar.resize(dfDataBase.num_cells); checkCudaErrors(cudaMemcpy(h_fvc_ouput_scalar.data(), d_fvc_ouput_scalar, dfDataBase.cell_value_bytes, cudaMemcpyDeviceToHost)); - checkVectorEqual(dfDataBase.num_cells, &fvc_ouput_scalar[0], h_fvc_ouput_scalar.data(), 1e-12, printFlag); + checkVectorEqual(dfDataBase.num_cells, &fvc_ouput_scalar[0], h_fvc_ouput_scalar.data(), 1e-14, printFlag); // free resources checkCudaErrors(cudaFree(d_K)); diff --git a/GPUTest/unittest.C b/GPUTest/unittest.C index cf58ec093..edd5b7856 100644 --- a/GPUTest/unittest.C +++ b/GPUTest/unittest.C @@ -96,26 +96,35 @@ int main(int argc, char *argv[]) } createGPUBase(mesh, Y); + DEBUG_TRACE; // unittest of fvm::ddt(rho, U) test_fvm_ddt_vector(dfDataBase, mesh, rho, U, initType::original); + DEBUG_TRACE; test_fvm_ddt_vector(dfDataBase, mesh, rho, U, initType::randomInit); + DEBUG_TRACE; // unittest of fvm::div(phi, U) test_fvm_div_vector(dfDataBase, mesh, phi, U, 
initType::original); + DEBUG_TRACE; test_fvm_div_vector(dfDataBase, mesh, phi, U, initType::randomInit); + DEBUG_TRACE; // unittest of fvm::laplacian(gamma, U) const tmp nuEff_tmp(turbulence->nuEff()); const volScalarField& nuEff = nuEff_tmp(); volScalarField gamma = rho * nuEff; test_fvm_laplacian_vector(dfDataBase, mesh, gamma, U, initType::original); + DEBUG_TRACE; test_fvm_laplacian_vector(dfDataBase, mesh, gamma, U, initType::randomInit); + DEBUG_TRACE; // unittest of fvc::ddt(rho, K) K = 0.5*magSqr(U); test_fvc_ddt_scalar(dfDataBase, mesh, rho, K, initType::original); + DEBUG_TRACE; test_fvc_ddt_scalar(dfDataBase, mesh, rho, K, initType::randomInit); + DEBUG_TRACE; } return 0; } diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index cce7e6adc..8aee29b45 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -14,6 +14,8 @@ #include #include +#define DEBUG_TRACE fprintf(stderr, "%s %d\n", __FILE__, __LINE__); + static const char *_cudaGetErrorEnum(cudaError_t error) { return cudaGetErrorName(error); } diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index c8583cd17..e8836060f 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -165,8 +165,20 @@ __global__ void fvc_ddt_scalar_kernel(int num_cells, double rDeltaT, if (index >= num_cells) return; + double val_new = rho[index] * vf[index]; + double val_old = rho_old[index] * vf_old[index]; // TODO: skip moving - output[index] += rDeltaT * (rho[index] * vf[index] - rho_old[index] * vf_old[index]); + // TODO: wyr + // for the case of rho = rho_old and vf = vf_old, the floating-point numerical problem will be exposed. 
+ // it expect zero as output, but the gpu result get a sub-normal minimal value for (val_new - val_old), + // which smaller than 1e-16, and then enlarged by rDeltaT (1e6) + // then the comparison of cpu result and gpu result will failed with relative error: inf, + // e.g.: + // cpu data: 0.0000000000000000, gpu data: 0.0000000000298050, relative error: inf + // if I add the print line for intermediate variables of val_new and val_old, the problem disappears. + // It seems that print line will change the compiler behavior, maybe avoiding the fma optimization of compiler. + if (index == -1) printf("index = 0, val_new: %.40lf, val_old: %.40lf\n", val_new, val_old); + output[index] += rDeltaT * (val_new - val_old); } void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output) From a8c68c69a2419f7e9de16a9104f699f37ac93f4d Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Mon, 7 Aug 2023 20:24:16 +0800 Subject: [PATCH 11/25] fix occasional errors of fvm fvm::ddt and fvc::ddt: caused by re-using h_rho between async-uploading rho and async-uploading rho.old --- GPUTest/GPUTestBase.H | 26 ++------------------------ src_gpu/dfMatrixDataBase.H | 2 ++ src_gpu/dfMatrixDataBase.cu | 4 ++++ 3 files changed, 8 insertions(+), 24 deletions(-) diff --git a/GPUTest/GPUTestBase.H b/GPUTest/GPUTestBase.H index 360abdf2c..efd6ba4fd 100644 --- a/GPUTest/GPUTestBase.H +++ b/GPUTest/GPUTestBase.H @@ -95,28 +95,6 @@ void randomInitVolScalar(volScalarField& field) { } } -// rho_old need special treatment: it use h_xxx of rho -void uploadRegisteredRhoOld(dfMatrixDataBase& dfDataBase, const volScalarField& field) { - double *h_internal_field = dfDataBase.getFieldPointer("rho", location::cpu, position::internal); - double *h_boundary_field = dfDataBase.getFieldPointer("rho", location::cpu, position::boundary); - double *d_internal_field = dfDataBase.getFieldPointer("rho_old", location::gpu, position::internal); - double *d_boundary_field = 
dfDataBase.getFieldPointer("rho_old", location::gpu, position::boundary); - // internal - memcpy(h_internal_field, &field[0], dfDataBase.cell_value_bytes); - // boundary - int offset = 0; - forAll(field.boundaryField(), patchi) - { - const auto& patchField = field.boundaryField()[patchi]; - int patchsize = patchField.size(); - memcpy(h_boundary_field + offset, &patchField[0], patchsize * sizeof(double)); - offset += patchsize; - } - // transfer - checkCudaErrors(cudaMemcpyAsync(d_internal_field, h_internal_field, dfDataBase.cell_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); -} - void uploadRegisteredVolScalar(dfMatrixDataBase& dfDataBase, const volScalarField& field, const char* fieldAlias) { double *h_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::internal); double *h_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::boundary); @@ -379,7 +357,7 @@ void test_fvm_ddt_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc // prepare for run GPU // prepare rho, rho.old, U uploadRegisteredVolScalar(dfDataBase, rho, "rho"); - uploadRegisteredRhoOld(dfDataBase, rho.oldTime()); + uploadRegisteredVolScalar(dfDataBase, rho.oldTime(), "rho_old"); uploadRegisteredVolVector(dfDataBase, U.oldTime(), "u"); // prepare testData testGPUDataBase testData; @@ -493,7 +471,7 @@ void test_fvc_ddt_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc // prepare for run GPU // prepare rho, rho.old on GPU uploadRegisteredVolScalar(dfDataBase, rho, "rho"); - uploadRegisteredRhoOld(dfDataBase, rho.oldTime()); + uploadRegisteredVolScalar(dfDataBase, rho.oldTime(), "rho_old"); // prepare K, K_old on GPU double *d_K = nullptr; double *d_K_old = nullptr; diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 8aee29b45..4e0bd4cbe 
100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -144,6 +144,7 @@ struct dfMatrixDataBase double *d_phi = nullptr; // computed on GPU, used on CPU, need memcpyd2h - host double *h_rho = nullptr; + double *h_rho_old = nullptr; double *h_u= nullptr; double *h_y= nullptr; double *h_he= nullptr; @@ -169,6 +170,7 @@ struct dfMatrixDataBase double *d_boundary_phi = nullptr; // computed on GPU, used on CPU, need memcpyd2h - host double *h_boundary_rho = nullptr; + double *h_boundary_rho_old = nullptr; double *h_boundary_u= nullptr; double *h_boundary_y= nullptr; double *h_boundary_he= nullptr; diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index 341241bf4..b426201a2 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -243,10 +243,12 @@ void dfMatrixDataBase::createNonConstantFieldsInternal() { // computed on GPU, used on CPU, need memcpyd2h checkCudaErrors(cudaMallocHost((void**)&h_rho, cell_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_rho_old, cell_value_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_u, cell_value_vec_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_y, cell_value_bytes * num_species)); checkCudaErrors(cudaMallocHost((void**)&h_he, cell_value_bytes)); fieldPointerMap["h_rho"] = h_rho; + fieldPointerMap["h_rho_old"] = h_rho_old; fieldPointerMap["h_u"] = h_u; fieldPointerMap["h_y"] = h_y; fieldPointerMap["h_he"] = h_he; @@ -282,10 +284,12 @@ void dfMatrixDataBase::createNonConstantFieldsBoundary() { // computed on GPU, used on CPU, need memcpyd2h checkCudaErrors(cudaMallocHost((void**)&h_boundary_rho, boundary_surface_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_boundary_rho_old, boundary_surface_value_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_boundary_u, boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_boundary_y, boundary_surface_value_bytes * num_species)); 
checkCudaErrors(cudaMallocHost((void**)&h_boundary_he, boundary_surface_value_bytes)); fieldPointerMap["h_boundary_rho"] = h_boundary_rho; + fieldPointerMap["h_boundary_rho_old"] = h_boundary_rho_old; fieldPointerMap["h_boundary_u"] = h_boundary_u; fieldPointerMap["h_boundary_y"] = h_boundary_y; fieldPointerMap["h_boundary_he"] = h_boundary_he; From 24530375a9d3f09ff518967a7c42237c962aae2e Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Mon, 7 Aug 2023 21:07:07 +0800 Subject: [PATCH 12/25] workaround way two (use volatile) to avoid floating-point numerical errors, which may be caused by fma contraction --- src_gpu/dfMatrixOpBase.cu | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index e8836060f..934441c81 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -164,7 +164,8 @@ __global__ void fvc_ddt_scalar_kernel(int num_cells, double rDeltaT, int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_cells) return; - + /* + // workaround way1 (use printf): double val_new = rho[index] * vf[index]; double val_old = rho_old[index] * vf_old[index]; // TODO: skip moving @@ -179,6 +180,12 @@ __global__ void fvc_ddt_scalar_kernel(int num_cells, double rDeltaT, // It seems that print line will change the compiler behavior, maybe avoiding the fma optimization of compiler. if (index == -1) printf("index = 0, val_new: %.40lf, val_old: %.40lf\n", val_new, val_old); output[index] += rDeltaT * (val_new - val_old); + */ + // workaround way2 (use volatile): + // volatile will change the compiler behavior, maybe avoiding the fma optimization of compiler. 
+ volatile double val_new = rho[index] * vf[index]; + volatile double val_old = rho_old[index] * vf_old[index]; + output[index] += rDeltaT * (val_new - val_old); } void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output) From ae31072113dfe38cd5e32ca5468cba90ddc7faeb Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Mon, 7 Aug 2023 21:14:20 +0800 Subject: [PATCH 13/25] workaround way three (use nvcc option -fmad=false) to avoid floating-point numerical errors, which may be caused by fma contraction --- src_gpu/CMakeLists.txt | 2 +- src_gpu/dfMatrixOpBase.cu | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src_gpu/CMakeLists.txt b/src_gpu/CMakeLists.txt index ed9070476..03a7fe6db 100644 --- a/src_gpu/CMakeLists.txt +++ b/src_gpu/CMakeLists.txt @@ -12,7 +12,7 @@ find_package(MPI REQUIRED) find_package(CUDAToolkit REQUIRED) find_library(LIBAMGXSH amgxsh PATHS $ENV{AMGX_DIR}/build) -add_compile_options(-arch=sm_70) +add_compile_options(-arch=sm_70 -fmad=false) include_directories( ${MPI_INCLUDE_PATH} diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index 934441c81..f55b6895a 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -181,11 +181,15 @@ __global__ void fvc_ddt_scalar_kernel(int num_cells, double rDeltaT, if (index == -1) printf("index = 0, val_new: %.40lf, val_old: %.40lf\n", val_new, val_old); output[index] += rDeltaT * (val_new - val_old); */ + /* // workaround way2 (use volatile): // volatile will change the compiler behavior, maybe avoiding the fma optimization of compiler. 
volatile double val_new = rho[index] * vf[index]; volatile double val_old = rho_old[index] * vf_old[index]; output[index] += rDeltaT * (val_new - val_old); + */ + // workaround way3 (use nvcc option -fmad=false) + output[index] += rDeltaT * (rho[index] * vf[index] - rho_old[index] * vf_old[index]); } void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output) From b16100fdb8c4db1d15ea21afbbee90d1947e48e5 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Mon, 7 Aug 2023 23:55:14 +0800 Subject: [PATCH 14/25] use template to simplify unittest --- GPUTest/GPUTestBase.H | 344 +++++++++++++++++------------------------- 1 file changed, 138 insertions(+), 206 deletions(-) diff --git a/GPUTest/GPUTestBase.H b/GPUTest/GPUTestBase.H index efd6ba4fd..42a64cd51 100644 --- a/GPUTest/GPUTestBase.H +++ b/GPUTest/GPUTestBase.H @@ -41,194 +41,118 @@ struct testGPUDataBase { } }; -void randomInitSurfaceScalar(surfaceScalarField& field) { - // random init field value to (-0.5, 0.5) - // internal - double *field_internal_ptr = &field[0]; - std::vector init_field_internal; - init_field_internal.resize(dfDataBase.num_surfaces); - for (int i = 0; i < dfDataBase.num_surfaces; i++) { - init_field_internal[i] = (rand() % 10000 - 5000) / 10000.0; - } - memcpy(field_internal_ptr, init_field_internal.data(), dfDataBase.surface_value_bytes); - // boundary - int offset = 0; - forAll(field.boundaryField(), patchi) - { - auto& patchField = field.boundaryFieldRef()[patchi]; - int patchsize = patchField.size(); - double *field_boundary_ptr = &patchField[0]; - std::vector init_field_boundary; - init_field_boundary.resize(patchsize); - for (int i = 0; i < patchsize; i++) { - init_field_boundary[i] = (rand() % 10000 - 5000) / 10000.0; - } - memcpy(field_boundary_ptr, init_field_boundary.data(), patchsize * sizeof(double)); - offset += patchsize; - } +template +void getTypeInfo(size_t *stride, size_t *internal_size, size_t *boundary_size) { + if ((typeid(T) != 
typeid(surfaceScalarField)) + && (typeid(T) != typeid(surfaceVectorField)) + && (typeid(T) != typeid(volScalarField)) + && (typeid(T) != typeid(volVectorField))) { + fprintf(stderr, "ERROR! Unsupported field type()!\n"); + exit(EXIT_FAILURE); + } + bool isVol = ((typeid(T) == typeid(volScalarField)) || (typeid(T) == typeid(volVectorField))); + bool isVec = ((typeid(T) == typeid(surfaceVectorField)) || (typeid(T) == typeid(volVectorField))); + *stride = isVec ? 3 : 1; + *internal_size = (isVol ? dfDataBase.num_cells : dfDataBase.num_surfaces) * (*stride); + *boundary_size = dfDataBase.num_boundary_surfaces * (*stride); } -void randomInitVolScalar(volScalarField& field) { - // random init field value to (-0.5, 0.5) - // internal - double *field_internal_ptr = &field[0]; - std::vector init_field_internal; - init_field_internal.resize(dfDataBase.num_cells); - for (int i = 0; i < dfDataBase.num_cells; i++) { - init_field_internal[i] = (rand() % 10000 - 5000) / 10000.0; - } - memcpy(field_internal_ptr, init_field_internal.data(), dfDataBase.cell_value_bytes); - // boundary - int offset = 0; - forAll(field.boundaryField(), patchi) - { - auto& patchField = field.boundaryFieldRef()[patchi]; - int patchsize = patchField.size(); - double *field_boundary_ptr = &patchField[0]; - std::vector init_field_boundary; - init_field_boundary.resize(patchsize); - for (int i = 0; i < patchsize; i++) { - init_field_boundary[i] = (rand() % 10000 - 5000) / 10000.0; - } - memcpy(field_boundary_ptr, init_field_boundary.data(), patchsize * sizeof(double)); - offset += patchsize; - } -} +template +void randomInitField(T& field) { + size_t stride = 0; + size_t internal_size = 0; + size_t boundary_size = 0; + getTypeInfo(&stride, &internal_size, &boundary_size); + size_t internal_value_bytes = internal_size * sizeof(double); -void uploadRegisteredVolScalar(dfMatrixDataBase& dfDataBase, const volScalarField& field, const char* fieldAlias) { - double *h_internal_field = 
dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::internal); - double *h_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::boundary); - double *d_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::internal); - double *d_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::boundary); + // random init field value to (-0.5, 0.5) // internal - memcpy(h_internal_field, &field[0], dfDataBase.cell_value_bytes); - // boundary - int offset = 0; - forAll(field.boundaryField(), patchi) - { - const auto& patchField = field.boundaryField()[patchi]; - int patchsize = patchField.size(); - memcpy(h_boundary_field + offset, &patchField[0], patchsize * sizeof(double)); - offset += patchsize; + double *field_internal_ptr = &field[0]; + std::vector init_field_internal; + init_field_internal.resize(internal_size); + for (size_t i = 0; i < internal_size; i++) { + init_field_internal[i] = (rand() % 10000 - 5000) / 10000.0; } - // transfer - checkCudaErrors(cudaMemcpyAsync(d_internal_field, h_internal_field, dfDataBase.cell_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); -} - -void uploadRegisteredVolVector(dfMatrixDataBase& dfDataBase, const volVectorField& field, const char* fieldAlias) { - double *h_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::internal); - double *h_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::boundary); - double *d_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::internal); - double *d_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::boundary); - // internal - memcpy(h_internal_field, &field[0], dfDataBase.cell_value_vec_bytes); + memcpy(field_internal_ptr, 
init_field_internal.data(), internal_value_bytes); // boundary - int offset = 0; forAll(field.boundaryField(), patchi) { - const auto& patchField = field.boundaryField()[patchi]; - int patchsize = patchField.size(); - memcpy(h_boundary_field + offset * 3, &patchField[0], patchsize * 3 * sizeof(double)); - offset += patchsize; + auto& patchField = field.boundaryFieldRef()[patchi]; + size_t patchsize = patchField.size(); + double *field_boundary_ptr = &patchField[0]; + std::vector init_field_boundary; + init_field_boundary.resize(patchsize * stride); + for (size_t i = 0; i < patchsize * stride; i++) { + init_field_boundary[i] = (rand() % 10000 - 5000) / 10000.0; + } + memcpy(field_boundary_ptr, init_field_boundary.data(), patchsize * stride * sizeof(double)); } - // transfer - checkCudaErrors(cudaMemcpyAsync(d_internal_field, h_internal_field, dfDataBase.cell_value_vec_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_vec_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); } -void uploadRegisteredSurfaceScalar(dfMatrixDataBase& dfDataBase, const surfaceScalarField& field, const char* fieldAlias) { +template +void uploadRegisteredField(dfMatrixDataBase& dfDataBase, const T& field, const char* fieldAlias) { + size_t stride = 0; + size_t internal_size = 0; + size_t boundary_size = 0; + getTypeInfo(&stride, &internal_size, &boundary_size); + size_t internal_value_bytes = internal_size * sizeof(double); + size_t boundary_value_bytes = boundary_size * sizeof(double); + double *h_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::internal); double *h_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::boundary); double *d_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::internal); double *d_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::boundary); 
+ // internal - memcpy(h_internal_field, &field[0], dfDataBase.surface_value_bytes); + memcpy(h_internal_field, &field[0], internal_value_bytes); // boundary int offset = 0; forAll(field.boundaryField(), patchi) { const auto& patchField = field.boundaryField()[patchi]; int patchsize = patchField.size(); - memcpy(h_boundary_field + offset, &patchField[0], patchsize * sizeof(double)); + memcpy(h_boundary_field + offset * stride, &patchField[0], patchsize * stride * sizeof(double)); offset += patchsize; } // transfer - checkCudaErrors(cudaMemcpyAsync(d_internal_field, h_internal_field, dfDataBase.surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(d_internal_field, h_internal_field, internal_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, boundary_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); } -void uploadVolScalar(dfMatrixDataBase& dfDataBase, const volScalarField& field, double *d_field, double *d_boundary_field) { +template +void uploadField(dfMatrixDataBase& dfDataBase, const T& field, double *d_field, double *d_boundary_field) { + size_t stride = 0; + size_t internal_size = 0; + size_t boundary_size = 0; + getTypeInfo(&stride, &internal_size, &boundary_size); + size_t internal_value_bytes = internal_size * sizeof(double); + size_t boundary_value_bytes = boundary_size * sizeof(double); + std::vector h_boundary_field; - h_boundary_field.resize(dfDataBase.num_boundary_surfaces); + h_boundary_field.resize(boundary_size); int offset = 0; forAll(field.boundaryField(), patchi) { const auto& patchField = field.boundaryField()[patchi]; int patchsize = patchField.size(); - memcpy(h_boundary_field.data() + offset, &patchField[0], patchsize * sizeof(double)); + 
memcpy(h_boundary_field.data() + offset * stride, &patchField[0], patchsize * stride * sizeof(double)); offset += patchsize; } - checkCudaErrors(cudaMemcpyAsync(d_field, &field[0], dfDataBase.cell_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field.data(), dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(d_field, &field[0], internal_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field.data(), boundary_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); } -void buildTestGPUDataBaseScalar(const dfMatrixDataBase& dfDataBase, testGPUDataBase& testData, const volScalarField& field, +template +void buildTestGPUDataBase(const dfMatrixDataBase& dfDataBase, testGPUDataBase& testData, const T& field, bool lowerFlag, bool upperFlag, bool diagFlag, bool sourceFlag, bool internalCoeffsFlag, bool boundaryCoeffsFlag, bool valueInternalCoeffsFlag, bool valueBoundaryCoeffsFlag, bool gradientInternalCoeffsFlag, bool gradientBoundaryCoeffsFlag) { - // ldu - if (lowerFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_lower, dfDataBase.surface_value_bytes)); - checkCudaErrors(cudaMemset(testData.d_lower, 0, dfDataBase.surface_value_bytes)); - } - if (upperFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_upper, dfDataBase.surface_value_bytes)); - checkCudaErrors(cudaMemset(testData.d_upper, 0, dfDataBase.surface_value_bytes)); - } - if (diagFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_diag, dfDataBase.cell_value_bytes)); - checkCudaErrors(cudaMemset(testData.d_diag, 0, dfDataBase.cell_value_bytes)); + if ((typeid(T) != typeid(volScalarField)) && (typeid(T) != typeid(volVectorField))) { + fprintf(stderr, "ERROR! 
Unsupported field type()!\n"); + exit(EXIT_FAILURE); } - if (sourceFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_source, dfDataBase.cell_value_bytes)); - checkCudaErrors(cudaMemset(testData.d_source, 0, dfDataBase.cell_value_bytes)); - } - if (internalCoeffsFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_internal_coeffs, dfDataBase.boundary_surface_value_bytes)); - checkCudaErrors(cudaMemset(testData.d_internal_coeffs, 0, dfDataBase.boundary_surface_value_bytes)); - } - if (boundaryCoeffsFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_bytes)); - checkCudaErrors(cudaMemset(testData.d_boundary_coeffs, 0, dfDataBase.boundary_surface_value_bytes)); - } - // boundary coeffs - if (valueInternalCoeffsFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_value_internal_coeffs, dfDataBase.boundary_surface_value_bytes)); - checkCudaErrors(cudaMemset(testData.d_value_internal_coeffs, 0, dfDataBase.boundary_surface_value_bytes)); - } - if (valueBoundaryCoeffsFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_value_boundary_coeffs, dfDataBase.boundary_surface_value_bytes)); - checkCudaErrors(cudaMemset(testData.d_value_boundary_coeffs, 0, dfDataBase.boundary_surface_value_bytes)); - } - if (gradientInternalCoeffsFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_internal_coeffs, dfDataBase.boundary_surface_value_bytes)); - checkCudaErrors(cudaMemset(testData.d_gradient_internal_coeffs, 0, dfDataBase.boundary_surface_value_bytes)); - } - if (gradientBoundaryCoeffsFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_boundary_coeffs, dfDataBase.boundary_surface_value_bytes)); - checkCudaErrors(cudaMemset(testData.d_gradient_boundary_coeffs, 0, dfDataBase.boundary_surface_value_bytes)); - } - // patch type - testData.patch_type.resize(dfDataBase.num_patches); - forAll(field.boundaryField(), patchi) - { - 
constructBoundarySelectorPerPatch(&(testData.patch_type[patchi]), field.boundaryField()[patchi].type()); - } -} + bool isVec = (typeid(T) == typeid(volVectorField)); + size_t stride = isVec ? 3 : 1; -void buildTestGPUDataBaseVector(const dfMatrixDataBase& dfDataBase, testGPUDataBase& testData, const volVectorField& field, - bool lowerFlag, bool upperFlag, bool diagFlag, bool sourceFlag, bool internalCoeffsFlag, bool boundaryCoeffsFlag, - bool valueInternalCoeffsFlag, bool valueBoundaryCoeffsFlag, bool gradientInternalCoeffsFlag, bool gradientBoundaryCoeffsFlag) { // ldu if (lowerFlag) { checkCudaErrors(cudaMalloc((void**)&testData.d_lower, dfDataBase.surface_value_bytes)); @@ -243,33 +167,33 @@ void buildTestGPUDataBaseVector(const dfMatrixDataBase& dfDataBase, testGPUDataB checkCudaErrors(cudaMemset(testData.d_diag, 0, dfDataBase.cell_value_bytes)); } if (sourceFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_source, dfDataBase.cell_value_vec_bytes)); - checkCudaErrors(cudaMemset(testData.d_source, 0, dfDataBase.cell_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_source, dfDataBase.cell_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_source, 0, dfDataBase.cell_value_bytes * stride)); } if (internalCoeffsFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMemset(testData.d_internal_coeffs, 0, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_internal_coeffs, dfDataBase.boundary_surface_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_internal_coeffs, 0, dfDataBase.boundary_surface_value_bytes * stride)); } if (boundaryCoeffsFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMemset(testData.d_boundary_coeffs, 0, dfDataBase.boundary_surface_value_vec_bytes)); + 
checkCudaErrors(cudaMalloc((void**)&testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_boundary_coeffs, 0, dfDataBase.boundary_surface_value_bytes * stride)); } // boundary coeffs if (valueInternalCoeffsFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_value_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMemset(testData.d_value_internal_coeffs, 0, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_value_internal_coeffs, dfDataBase.boundary_surface_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_value_internal_coeffs, 0, dfDataBase.boundary_surface_value_bytes * stride)); } if (valueBoundaryCoeffsFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_value_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMemset(testData.d_value_boundary_coeffs, 0, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_value_boundary_coeffs, dfDataBase.boundary_surface_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_value_boundary_coeffs, 0, dfDataBase.boundary_surface_value_bytes * stride)); } if (gradientInternalCoeffsFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMemset(testData.d_gradient_internal_coeffs, 0, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_internal_coeffs, dfDataBase.boundary_surface_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_gradient_internal_coeffs, 0, dfDataBase.boundary_surface_value_bytes * stride)); } if (gradientBoundaryCoeffsFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - 
checkCudaErrors(cudaMemset(testData.d_gradient_boundary_coeffs, 0, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_boundary_coeffs, dfDataBase.boundary_surface_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_gradient_boundary_coeffs, 0, dfDataBase.boundary_surface_value_bytes * stride)); } // patch type testData.patch_type.resize(dfDataBase.num_patches); @@ -279,14 +203,16 @@ void buildTestGPUDataBaseVector(const dfMatrixDataBase& dfDataBase, testGPUDataB } } -void updateBoundaryCoeffsVector(const dfMatrixDataBase& dfDataBase, testGPUDataBase& testData) { - update_boundary_coeffs_vector(dfDataBase.stream, dfDataBase.num_patches, - dfDataBase.patch_size.data(), testData.patch_type.data(), - testData.d_value_internal_coeffs, testData.d_value_boundary_coeffs, - testData.d_gradient_internal_coeffs, testData.d_gradient_boundary_coeffs); -} - +// TODO: It seems that compareResult of scalar and vector can't be merged void compareResultVector(const dfMatrixDataBase& dfDataBase, const testGPUDataBase& testData, fvVectorMatrix& dfMatrix, bool printFlag) { + //if ((typeid(T) != typeid(fvScalarMatrix)) && (typeid(T) != typeid(fvVectorMatrix))) { + // fprintf(stderr, "ERROR! Unsupported field type()!\n"); + // exit(EXIT_FAILURE); + //} + //bool isVec = (typeid(T) == typeid(fvVectorMatrix)); + //size_t stride = isVec ? 
3 : 1; + + size_t stride = 3; if (testData.d_lower) { std::vector h_lower; h_lower.resize(dfDataBase.num_surfaces); @@ -307,48 +233,51 @@ void compareResultVector(const dfMatrixDataBase& dfDataBase, const testGPUDataBa } if (testData.d_source) { std::vector h_source; - h_source.resize(dfDataBase.num_cells * 3); - checkCudaErrors(cudaMemcpy(h_source.data(), testData.d_source, dfDataBase.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); - checkVectorEqual(dfDataBase.num_cells * 3, &dfMatrix.source()[0][0], h_source.data(), 1e-14, printFlag); + h_source.resize(dfDataBase.num_cells * stride); + checkCudaErrors(cudaMemcpy(h_source.data(), testData.d_source, dfDataBase.cell_value_bytes * stride, cudaMemcpyDeviceToHost)); + //void *source_ptr = isVec ? (&dfMatrix.source()[0][0]) : (&dfMatrix.source()[0]); + double *source_ptr = &dfMatrix.source()[0][0]; + checkVectorEqual(dfDataBase.num_cells * stride, source_ptr, h_source.data(), 1e-14, printFlag); } if (testData.d_internal_coeffs) { std::vector h_internal_coeffs; - h_internal_coeffs.resize(dfDataBase.num_boundary_surfaces * 3); - checkCudaErrors(cudaMemcpy(h_internal_coeffs.data(), testData.d_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); - std::vector cpu_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); + h_internal_coeffs.resize(dfDataBase.num_boundary_surfaces * stride); + checkCudaErrors(cudaMemcpy(h_internal_coeffs.data(), testData.d_internal_coeffs, dfDataBase.boundary_surface_value_bytes * stride, cudaMemcpyDeviceToHost)); + std::vector cpu_internal_coeffs(dfDataBase.num_boundary_surfaces * stride); int offset = 0; for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) { int patchsize = dfDataBase.patch_size[patchi]; - const double* internal_coeff_ptr = &dfMatrix.internalCoeffs()[patchi][0][0]; - memcpy(cpu_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double)); + //const void* internal_coeff_ptr = isVec ? 
(&dfMatrix.internalCoeffs()[patchi][0][0]) : (&dfMatrix.internalCoeffs()[patchi][0]); + const void* internal_coeff_ptr = &dfMatrix.internalCoeffs()[patchi][0][0]; + memcpy(cpu_internal_coeffs.data() + offset * stride, internal_coeff_ptr, patchsize * stride * sizeof(double)); offset += patchsize; } - checkVectorEqual(dfDataBase.num_boundary_surfaces * 3, cpu_internal_coeffs.data(), h_internal_coeffs.data(), 1e-14, printFlag); + checkVectorEqual(dfDataBase.num_boundary_surfaces * stride, cpu_internal_coeffs.data(), h_internal_coeffs.data(), 1e-14, printFlag); } if (testData.d_boundary_coeffs) { std::vector h_boundary_coeffs; - h_boundary_coeffs.resize(dfDataBase.num_boundary_surfaces * 3); - checkCudaErrors(cudaMemcpy(h_boundary_coeffs.data(), testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); - std::vector cpu_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); + h_boundary_coeffs.resize(dfDataBase.num_boundary_surfaces * stride); + checkCudaErrors(cudaMemcpy(h_boundary_coeffs.data(), testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_bytes * stride, cudaMemcpyDeviceToHost)); + std::vector cpu_boundary_coeffs(dfDataBase.num_boundary_surfaces * stride); int offset = 0; for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) { int patchsize = dfDataBase.patch_size[patchi]; - const double* boundary_coeff_ptr = &dfMatrix.boundaryCoeffs()[patchi][0][0]; - memcpy(cpu_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double)); + //const void* boundary_coeff_ptr = isVec ? 
(&dfMatrix.boundaryCoeffs()[patchi][0][0]) : (&dfMatrix.boundaryCoeffs()[patchi][0]); + const void* boundary_coeff_ptr = &dfMatrix.boundaryCoeffs()[patchi][0][0]; + memcpy(cpu_boundary_coeffs.data() + offset * stride, boundary_coeff_ptr, patchsize * stride * sizeof(double)); offset += patchsize; } - checkVectorEqual(dfDataBase.num_boundary_surfaces * 3, cpu_boundary_coeffs.data(), h_boundary_coeffs.data(), 1e-14, printFlag); + checkVectorEqual(dfDataBase.num_boundary_surfaces * stride, cpu_boundary_coeffs.data(), h_boundary_coeffs.data(), 1e-14, printFlag); } } // unittest of fvm::ddt(rho, U) void test_fvm_ddt_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& rho, volVectorField& U, initType type) { if (type == initType::randomInit) { - // random init rho and rho.old rho.oldTime(); - randomInitVolScalar(rho); + randomInitField(rho); } // run CPU @@ -356,13 +285,13 @@ void test_fvm_ddt_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc // prepare for run GPU // prepare rho, rho.old, U - uploadRegisteredVolScalar(dfDataBase, rho, "rho"); - uploadRegisteredVolScalar(dfDataBase, rho.oldTime(), "rho_old"); - uploadRegisteredVolVector(dfDataBase, U.oldTime(), "u"); + uploadRegisteredField(dfDataBase, rho, "rho"); + uploadRegisteredField(dfDataBase, rho.oldTime(), "rho_old"); + uploadRegisteredField(dfDataBase, U.oldTime(), "u"); // prepare testData testGPUDataBase testData; // only use diag and source - buildTestGPUDataBaseVector(dfDataBase, testData, U, false, false, true, true, false, false, false, false, false, false); + buildTestGPUDataBase(dfDataBase, testData, U, false, false, true, true, false, false, false, false, false, false); // run GPU fvm_ddt_vector(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.rdelta_t, dfDataBase.d_rho, dfDataBase.d_rho_old, dfDataBase.d_u, dfDataBase.d_volume, @@ -377,7 +306,7 @@ void test_fvm_ddt_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc void 
test_fvm_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfaceScalarField& phi, volVectorField& U, initType type) { if (type == initType::randomInit) { phi.oldTime(); - randomInitSurfaceScalar(phi); + randomInitField(phi); } // run CPU @@ -385,15 +314,18 @@ void test_fvm_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfa // prepare for run GPU // prepare phi field - uploadRegisteredSurfaceScalar(dfDataBase, phi, "phi"); + uploadRegisteredField(dfDataBase, phi, "phi"); // prepare testData testGPUDataBase testData; // not use source - // gradient_internal_coeffs, gradient_boundary_coeffs are not needed actually, but updateBoundaryCoeffsVector will access them - buildTestGPUDataBaseVector(dfDataBase, testData, U, true, true, true, false, true, true, true, true, true, true); + // gradient_internal_coeffs, gradient_boundary_coeffs are not needed actually, but update_boundary_coeffs_vector will access them + buildTestGPUDataBase(dfDataBase, testData, U, true, true, true, false, true, true, true, true, true, true); // prepare boundary coeffs // TODO: updating boundary coeffs should be complemented later - updateBoundaryCoeffsVector(dfDataBase, testData); + update_boundary_coeffs_vector(dfDataBase.stream, dfDataBase.num_patches, + dfDataBase.patch_size.data(), testData.patch_type.data(), + testData.d_value_internal_coeffs, testData.d_value_boundary_coeffs, + testData.d_gradient_internal_coeffs, testData.d_gradient_boundary_coeffs); // run GPU fvm_div_vector(dfDataBase.stream, dfDataBase.num_surfaces, dfDataBase.d_owner, dfDataBase.d_neighbor, @@ -414,7 +346,7 @@ void test_fvm_laplacian_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, { if (type == initType::randomInit) { gamma.oldTime(); - randomInitVolScalar(gamma); + randomInitField(gamma); } // run CPU @@ -426,15 +358,18 @@ void test_fvm_laplacian_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, double *d_boundary_gamma = nullptr; 
checkCudaErrors(cudaMalloc((void**)&d_gamma, dfDataBase.cell_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_gamma, dfDataBase.boundary_surface_value_bytes)); - uploadVolScalar(dfDataBase, gamma, d_gamma, d_boundary_gamma); + uploadField(dfDataBase, gamma, d_gamma, d_boundary_gamma); // prepare testData testGPUDataBase testData; // not use source - // value_internal_coeffs, value_boundary_coeffs are not needed actually, but updateBoundaryCoeffsVector will access them - buildTestGPUDataBaseVector(dfDataBase, testData, U, true, true, true, false, true, true, true, true, true, true); + // value_internal_coeffs, value_boundary_coeffs are not needed actually, but update_boundary_coeffs_vector will access them + buildTestGPUDataBase(dfDataBase, testData, U, true, true, true, false, true, true, true, true, true, true); // prepare boundary coeffs // TODO: updating boundary coeffs should be complemented later - updateBoundaryCoeffsVector(dfDataBase, testData); + update_boundary_coeffs_vector(dfDataBase.stream, dfDataBase.num_patches, + dfDataBase.patch_size.data(), testData.patch_type.data(), + testData.d_value_internal_coeffs, testData.d_value_boundary_coeffs, + testData.d_gradient_internal_coeffs, testData.d_gradient_boundary_coeffs); // run GPU fvm_laplacian_vector(dfDataBase.stream, dfDataBase.num_surfaces, @@ -458,11 +393,10 @@ void test_fvm_laplacian_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, // unittest of fvc::ddt(rho, K) void test_fvc_ddt_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& rho, volScalarField& K, initType type) { if (type == initType::randomInit) { - // random init rho and rho.old rho.oldTime(); - randomInitVolScalar(rho); + randomInitField(rho); K.oldTime(); - randomInitVolScalar(K); + randomInitField(K); } // run CPU @@ -470,8 +404,8 @@ void test_fvc_ddt_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc // prepare for run GPU // prepare rho, rho.old on GPU - 
uploadRegisteredVolScalar(dfDataBase, rho, "rho"); - uploadRegisteredVolScalar(dfDataBase, rho.oldTime(), "rho_old"); + uploadRegisteredField(dfDataBase, rho, "rho"); + uploadRegisteredField(dfDataBase, rho.oldTime(), "rho_old"); // prepare K, K_old on GPU double *d_K = nullptr; double *d_K_old = nullptr; @@ -501,5 +435,3 @@ void test_fvc_ddt_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc checkCudaErrors(cudaFree(d_K_old)); checkCudaErrors(cudaFree(d_fvc_ouput_scalar)); } - - From 8b3805fb699040f1e9ee89502c1e02389bbb1311 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Tue, 8 Aug 2023 00:05:29 +0800 Subject: [PATCH 15/25] modify getTypeInfo to support tensor type --- GPUTest/GPUTestBase.H | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/GPUTest/GPUTestBase.H b/GPUTest/GPUTestBase.H index 42a64cd51..e35588006 100644 --- a/GPUTest/GPUTestBase.H +++ b/GPUTest/GPUTestBase.H @@ -43,18 +43,33 @@ struct testGPUDataBase { template void getTypeInfo(size_t *stride, size_t *internal_size, size_t *boundary_size) { - if ((typeid(T) != typeid(surfaceScalarField)) - && (typeid(T) != typeid(surfaceVectorField)) - && (typeid(T) != typeid(volScalarField)) - && (typeid(T) != typeid(volVectorField))) { + size_t s = 1; + bool isVol = false; + if (typeid(T) == typeid(surfaceScalarField)) { + s = 1; + isVol = false; + } else if (typeid(T) == typeid(surfaceVectorField)) { + s = 3; + isVol = false; + } else if (typeid(T) == typeid(surfaceTensorField)) { + s = 9; + isVol = false; + } else if (typeid(T) == typeid(volScalarField)) { + s = 1; + isVol = true; + } else if (typeid(T) == typeid(volVectorField)) { + s = 3; + isVol = true; + } else if (typeid(T) == typeid(volTensorField)) { + s = 9; + isVol = true; + } else { fprintf(stderr, "ERROR! 
Unsupported field type()!\n"); exit(EXIT_FAILURE); } - bool isVol = ((typeid(T) == typeid(volScalarField)) || (typeid(T) == typeid(volVectorField))); - bool isVec = ((typeid(T) == typeid(surfaceVectorField)) || (typeid(T) == typeid(volVectorField))); - *stride = isVec ? 3 : 1; - *internal_size = (isVol ? dfDataBase.num_cells : dfDataBase.num_surfaces) * (*stride); - *boundary_size = dfDataBase.num_boundary_surfaces * (*stride); + *stride = s; + *internal_size = (isVol ? dfDataBase.num_cells : dfDataBase.num_surfaces) * s; + *boundary_size = dfDataBase.num_boundary_surfaces * s; } template From 776aa69b519825144cfb9a2ebfcbf39fe4697e8d Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Thu, 10 Aug 2023 00:26:32 +0800 Subject: [PATCH 16/25] first commit for debugging --- GPUTest/GPUTestBase.H | 69 ++++++ GPUTest/Make/options | 2 + GPUTest/createGPUSolver.H | 5 +- GPUTest/unittest.C | 13 + GPUTestRef/GenFvMatrix.H | 248 ++++++++++++++++++++ GPUTestRef/Make/files | 4 + GPUTestRef/Make/options | 31 +++ GPUTestRef/gaussConvectionScheme.C | 305 ++++++++++++++++++++++++ GPUTestRef/gaussGrad.C | 332 ++++++++++++++++++++++++++ src_gpu/dfMatrixDataBase.H | 3 +- src_gpu/dfMatrixDataBase.cu | 4 +- src_gpu/dfMatrixOpBase.H | 12 + src_gpu/dfMatrixOpBase.cu | 365 +++++++++++++++++++++++++++++ 13 files changed, 1390 insertions(+), 3 deletions(-) create mode 100644 GPUTestRef/GenFvMatrix.H create mode 100644 GPUTestRef/Make/files create mode 100644 GPUTestRef/Make/options create mode 100644 GPUTestRef/gaussConvectionScheme.C create mode 100644 GPUTestRef/gaussGrad.C diff --git a/GPUTest/GPUTestBase.H b/GPUTest/GPUTestBase.H index e35588006..38676528e 100644 --- a/GPUTest/GPUTestBase.H +++ b/GPUTest/GPUTestBase.H @@ -450,3 +450,72 @@ void test_fvc_ddt_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc checkCudaErrors(cudaFree(d_K_old)); checkCudaErrors(cudaFree(d_fvc_ouput_scalar)); } + +// unittest of fvc::grad(U) +void test_fvc_grad_vector(dfMatrixDataBase& dfDataBase, 
Foam::fvMesh& mesh, volVectorField& U, initType type) { + // if (type == initType::randomInit) { + // U.oldTime(); + // randomInitField(U); + // } + + // run CPU + volTensorField fvc_ouput_tensor = fvc::grad(U); + // volTensorField fvc_ouput_tensor = gaussGradSchemeGrad(U); + + // prepare for run GPU + // prepare U on GPU + uploadRegisteredField(dfDataBase, U, "u"); + + double *d_fvc_ouput_tensor = nullptr, *d_fvc_ouput_boundary_tensor = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_tensor, dfDataBase.cell_value_tsr_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_boundary_tensor, dfDataBase.boundary_surface_value_tsr_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_tensor, 0, dfDataBase.cell_value_tsr_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_boundary_tensor, 0, dfDataBase.boundary_surface_value_tsr_bytes)); + + // only need patch_type + testGPUDataBase testData; + buildTestGPUDataBase(dfDataBase, testData, U, false, false, false, false, false, false, false, false, false, false); + + fvc_grad_vector(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.num_surfaces, + dfDataBase.d_owner, dfDataBase.d_neighbor, + dfDataBase.d_weight, dfDataBase.d_sf, dfDataBase.d_u, d_fvc_ouput_tensor, + dfDataBase.num_patches, dfDataBase.patch_size.data(), testData.patch_type.data(), + dfDataBase.d_boundary_face_cell, dfDataBase.d_boundary_u, dfDataBase.d_boundary_sf, + dfDataBase.d_volume, dfDataBase.d_boundary_mag_sf, d_fvc_ouput_boundary_tensor, dfDataBase.d_boundary_delta_coeffs); + + // compare result + bool printFlag = false; + std::vector h_fvc_ouput_tensor(dfDataBase.num_cells * 9); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_tensor.data(), d_fvc_ouput_tensor, dfDataBase.cell_value_tsr_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells * 9, &fvc_ouput_tensor[0][0], h_fvc_ouput_tensor.data(), 1e-14, printFlag); +} + +// unittest of fvc::div(phi) +void test_fvc_div_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, 
surfaceScalarField& phi, initType type) { + if (type == initType::randomInit) { + phi.oldTime(); + randomInitField(phi); + } + + // run CPU + volScalarField fvc_ouput_scalar = fvc::div(phi); + // volScalarField fvc_ouput_scalar = gaussConvectionSchemeFvcDiv(phi); + + // prepare for run GPU + // prepare phi on GPU + uploadRegisteredField(dfDataBase, phi, "phi"); + + double *d_fvc_ouput_scalar = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_scalar, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_scalar, 0, dfDataBase.cell_value_bytes)); + + fvc_div_surface_scalar(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.num_surfaces, dfDataBase.num_boundary_surfaces, + dfDataBase.d_owner, dfDataBase.d_neighbor, + dfDataBase.d_phi, dfDataBase.d_boundary_face_cell, dfDataBase.d_boundary_phi, dfDataBase.d_volume, d_fvc_ouput_scalar); + + // compare result + bool printFlag = false; + std::vector h_fvc_ouput_scalar(dfDataBase.num_cells); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_scalar.data(), d_fvc_ouput_scalar, dfDataBase.cell_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells, &fvc_ouput_scalar[0], h_fvc_ouput_scalar.data(), 1e-14, printFlag); +} diff --git a/GPUTest/Make/options b/GPUTest/Make/options index 637eb0e9b..197663050 100644 --- a/GPUTest/Make/options +++ b/GPUTest/Make/options @@ -24,6 +24,7 @@ EXE_INC = -std=c++14 \ -I$(DF_SRC)/dfCombustionModels/lnInclude \ -I$(CANTERA_ROOT)/include \ -I$(DF_ROOT)/src_gpu \ + -I$(DF_ROOT)/GPUTestRef/lnInclude \ -I/usr/local/cuda-11.6/include \ -I$(AMGX_DIR)/include @@ -39,6 +40,7 @@ EXE_LIBS = \ -ldfCanteraMixture \ -ldfChemistryModel \ -ldfCombustionModels \ + -ldfGenMatrix \ $(CANTERA_ROOT)/lib/libcantera.so \ /usr/local/cuda-11.6/lib64/libcudart.so \ $(AMGX_DIR)/build/libamgxsh.so \ diff --git a/GPUTest/createGPUSolver.H b/GPUTest/createGPUSolver.H index 3dd593337..478b15ed1 100644 --- a/GPUTest/createGPUSolver.H +++ b/GPUTest/createGPUSolver.H @@ -28,24 +28,27 
@@ void createGPUBase(fvMesh& mesh, PtrList& Y) { double *boundary_sf = new double[3 * num_boundary_surfaces]; double *boundary_mag_sf = new double[num_boundary_surfaces]; double *boundary_delta_coeffs = new double[num_boundary_surfaces]; + int *boundary_face_cell = new int[num_boundary_surfaces]; int offset = 0; forAll(mesh.boundary(), patchi) { const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; + const labelUList& pFaceCells = mesh.boundary()[patchi].faceCells(); int patchsize = pMagSf.size(); memcpy(boundary_sf + 3*offset, &pSf[0][0], 3*patchsize*sizeof(double)); memcpy(boundary_mag_sf + offset, &pMagSf[0], patchsize*sizeof(double)); memcpy(boundary_delta_coeffs + offset, &pDeltaCoeffs[0], patchsize*sizeof(double)); + memcpy(boundary_face_cell + offset, &pFaceCells[0], patchsize * sizeof(int)); offset += patchsize; } dfDataBase.createConstantFieldsInternal(); dfDataBase.createConstantFieldsBoundary(); dfDataBase.initConstantFieldsInternal(&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.surfaceInterpolation::weights()[0], &mesh.nonOrthDeltaCoeffs()[0], &mesh.V()[0]); - dfDataBase.initConstantFieldsBoundary(boundary_sf, boundary_mag_sf, boundary_delta_coeffs); + dfDataBase.initConstantFieldsBoundary(boundary_sf, boundary_mag_sf, boundary_delta_coeffs, boundary_face_cell); // prepare internal and boundary of Y dfDataBase.createNonConstantFieldsInternal(); diff --git a/GPUTest/unittest.C b/GPUTest/unittest.C index edd5b7856..ccbaefa71 100644 --- a/GPUTest/unittest.C +++ b/GPUTest/unittest.C @@ -49,6 +49,9 @@ Description #include #include "upwind.H" +// debug +#include "GenFvMatrix.H" + #include "dfMatrixDataBase.H" #include "dfMatrixOpBase.H" #include "createGPUSolver.H" @@ -125,6 +128,16 @@ int main(int argc, char *argv[]) DEBUG_TRACE; test_fvc_ddt_scalar(dfDataBase, mesh, rho, K, initType::randomInit); 
DEBUG_TRACE; + + // unittest of fvc::grad(U) + test_fvc_grad_vector(dfDataBase, mesh, U, initType::original); + DEBUG_TRACE; + + // unittest of fvc::div(phi) + test_fvc_div_scalar(dfDataBase, mesh, phi, initType::original); + DEBUG_TRACE; + test_fvc_div_scalar(dfDataBase, mesh, phi, initType::randomInit); + DEBUG_TRACE; } return 0; } diff --git a/GPUTestRef/GenFvMatrix.H b/GPUTestRef/GenFvMatrix.H new file mode 100644 index 000000000..d328fe504 --- /dev/null +++ b/GPUTestRef/GenFvMatrix.H @@ -0,0 +1,248 @@ +#pragma once + +#include "tmp.H" +#include "dimensionedType.H" +#include "volFieldsFwd.H" +#include "surfaceFieldsFwd.H" +#include "typeInfo.H" +#include "runTimeSelectionTables.H" +#include "fvMatrices.H" +#include "fvMesh.H" +#include "turbulentFluidThermoModel.H" +#include "CombustionModel.H" +#include +#include +#include "PstreamGlobals.H" + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +namespace Foam +{ + + +// namespace fv +// { + +// fvm::ddt +template +tmp> +EulerDdtSchemeFvmDdt +( + const volScalarField& rho, + const GeometricField& vf +); + +template +tmp> +EulerDdtSchemeFvmDdt +( + const GeometricField& vf +); + +// fvc::ddt +template +tmp> +EulerDdtSchemeFvcDdt +( + const volScalarField& rho, + const GeometricField& vf +); + +template +tmp> +EulerDdtSchemeFvcDdt +( + const GeometricField& vf +); + +// fvc::ddtCorr +tmp +EulerDdtSchemeFvcDdtCorr +( + const volScalarField& rho, + const volVectorField& U, + const surfaceScalarField& phi, + const autoPtr& Uf +); + +tmp +EulerDdtSchemeFvcDdtPhiCoeff +( + const volVectorField& U, + const surfaceScalarField& phi, + const surfaceScalarField& phiCorr, + const volScalarField& rho +); + +template +Foam::tmp> +UEqn_H +( + fvMatrix& UEqn +); + +tmp +rAUConstructor +( + fvMatrix& UEqn +); + +tmp +rhorAUfConstructor +( + const volScalarField& rhorAU, + const surfaceScalarField& linear_weights +); + +tmp +phiHbyAConstructor +( + const volScalarField& rho, + const volVectorField& 
HbyA, + const surfaceScalarField& rhorAUf, + const surfaceScalarField& tddtCorr, + const surfaceScalarField& linear_weights +); + + +// fvm::div +template +tmp> +gaussConvectionSchemeFvmDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +); + +template +tmp> +gaussConvectionSchemeFvmDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf, + const word& name +); + +// fvc::div +template +tmp> +gaussConvectionSchemeFvcDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +); + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const GeometricField& ssf +); + +// fvc::grad +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradSchemeGrad +( + const GeometricField& vsf +); + + +template +tmp> +gaussLaplacianSchemeFvmLaplacianUncorrected +( + const surfaceScalarField& gammaMagSf, + const surfaceScalarField& deltaCoeffs, + const GeometricField& vf +); + +// fvm::laplacian +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gammaScalarVol, + const GeometricField& vf +); + +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gamma, + const GeometricField& vf +); + +// turbulence->divDevRhoReff(U) +tmp +turbulenceModelLinearViscousStressDivDevRhoReff +( + volVectorField& U, + compressible::turbulenceModel& turbulence +); + +tmp +GenMatrix_U( + const volScalarField& rho, + volVectorField& U, + const surfaceScalarField& phi, + const volScalarField& p, + compressible::turbulenceModel& turbulence +); + +tmp +GenMatrix_Y( + const volScalarField& rho, + volScalarField& Yi, + const surfaceScalarField& phi, + const surfaceScalarField& phiUc, + const volScalarField& rhoD, + const volScalarField& mut, + const Switch splitting, + const scalar Sct, + CombustionModel& combustion, + fv::convectionScheme& mvConvection +); + +tmp +GenMatrix_E( + const volScalarField& rho, + volScalarField& he, + const surfaceScalarField& phi, + 
const volScalarField& K, + const volScalarField& dpdt, + const volScalarField& alphaEff, + const volScalarField& diffAlphaD, + const volVectorField& hDiffCorrFlux, + const surfaceScalarField& linear_weights +); + +tmp +GenMatrix_p( + const volScalarField& rho, + volScalarField& p, + const surfaceScalarField& phiHbyA, + const surfaceScalarField& rhorAUf, + const volScalarField& phi +); + + +void check_fvmatrix_equal(fvScalarMatrix& a,fvScalarMatrix& b); +void check_fvmatrix_equal(fvVectorMatrix& a,fvVectorMatrix& b); + +void check_field_equal(Field& a, Field& b); + + +} // End namespace Foam + + +// ************************************************************************* // \ No newline at end of file diff --git a/GPUTestRef/Make/files b/GPUTestRef/Make/files new file mode 100644 index 000000000..1137c3eed --- /dev/null +++ b/GPUTestRef/Make/files @@ -0,0 +1,4 @@ +gaussGrad.C +gaussConvectionScheme.C + +LIB = $(DF_LIBBIN)/libdfGenMatrix \ No newline at end of file diff --git a/GPUTestRef/Make/options b/GPUTestRef/Make/options new file mode 100644 index 000000000..0523a67e8 --- /dev/null +++ b/GPUTestRef/Make/options @@ -0,0 +1,31 @@ +-include $(GENERAL_RULES)/mplibType + +EXE_INC = \ + -g \ + $(PFLAGS) $(PINC) \ + -I$(LIB_SRC)/transportModels/compressible/lnInclude \ + -I$(LIB_SRC)/thermophysicalModels/basic/lnInclude \ + -I$(LIB_SRC)/TurbulenceModels/turbulenceModels/lnInclude \ + -I$(LIB_SRC)/TurbulenceModels/compressible/lnInclude \ + -I$(LIB_SRC)/finiteVolume/cfdTools \ + -I$(LIB_SRC)/finiteVolume/lnInclude \ + -I$(LIB_SRC)/meshTools/lnInclude \ + -I$(LIB_SRC)/sampling/lnInclude \ + -I$(LIB_SRC)/dynamicFvMesh/lnInclude \ + -I$(LIB_SRC)/Pstream/mpi \ + -I$(DF_SRC)/dfCanteraMixture/lnInclude \ + -I$(DF_SRC)/dfChemistryModel/lnInclude \ + -I$(DF_SRC)/dfCombustionModels/lnInclude \ + -I$(LIB_SRC)/parallel/decompose/decompositionMethods/lnInclude \ + -I$(LIB_SRC)/meshTools/lnInclude \ + -I$(LIB_SRC)/fileFormats/lnInclude \ + -I$(LIB_SRC)/triSurface/lnInclude \ + 
-I$(LIB_SRC)/surfMesh/lnInclude \ + -I$(LIB_SRC)/dynamicMesh/lnInclude \ + -I$(LIB_SRC)/finiteVolume/lnInclude \ + -I$(CANTERA_ROOT)/include + +EXE_LIBS = \ + -lOpenFOAM \ + -ltriSurface \ + -lmeshTools \ No newline at end of file diff --git a/GPUTestRef/gaussConvectionScheme.C b/GPUTestRef/gaussConvectionScheme.C new file mode 100644 index 000000000..83d11bda9 --- /dev/null +++ b/GPUTestRef/gaussConvectionScheme.C @@ -0,0 +1,305 @@ +/*---------------------------------------------------------------------------*\ + ========= | + \\ / F ield | OpenFOAM: The Open Source CFD Toolbox + \\ / O peration | Website: https://openfoam.org + \\ / A nd | Copyright (C) 2011-2018 OpenFOAM Foundation + \\/ M anipulation | +------------------------------------------------------------------------------- +License + This file is part of OpenFOAM. + + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . 
+ +\*---------------------------------------------------------------------------*/ + +#include "GenFvMatrix.H" +#include "fvcSurfaceIntegrate.H" +#include "fvMatrices.H" +#include "gaussConvectionScheme.H" + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +namespace Foam +{ + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +template +tmp> +gaussConvectionSchemeFvmDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf, + const word& name +) +{ + const fvMesh& mesh = vf.mesh(); + + tmp> cs = fv::convectionScheme::New(mesh,faceFlux,mesh.divScheme(name)); + fv::gaussConvectionScheme& gcs = dynamic_cast&>(cs.ref()); + + tmp tweights = gcs.interpScheme().weights(vf); + const surfaceScalarField& weights = tweights(); + + tmp> tfvm + ( + new fvMatrix + ( + vf, + faceFlux.dimensions()*vf.dimensions() + ) + ); + fvMatrix& fvm = tfvm.ref(); + fvm.lower() = -weights.primitiveField()*faceFlux.primitiveField(); + fvm.upper() = fvm.lower() + faceFlux.primitiveField(); + fvm.negSumDiag(); + forAll(vf.boundaryField(), patchi) + { + const fvPatchField& psf = vf.boundaryField()[patchi]; + const fvsPatchScalarField& patchFlux = faceFlux.boundaryField()[patchi]; + const fvsPatchScalarField& pw = weights.boundaryField()[patchi]; + + fvm.internalCoeffs()[patchi] = patchFlux*psf.valueInternalCoeffs(pw); + fvm.boundaryCoeffs()[patchi] = -patchFlux*psf.valueBoundaryCoeffs(pw); + } + if (gcs.interpScheme().corrected()) + { + fvm += fvc::surfaceIntegrate(faceFlux*gcs.interpScheme().correction(vf)); + } + return tfvm; +} + +template +tmp> +gaussConvectionSchemeFvmDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +) +{ + word name("div("+faceFlux.name()+','+vf.name()+')'); + return gaussConvectionSchemeFvmDiv(faceFlux,vf,name); +} + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const 
surfaceScalarField& faceFlux, + const GeometricField& vf +) +{ + word name("div("+faceFlux.name()+','+vf.name()+')'); + return gaussConvectionSchemeFvcDiv(faceFlux, vf, name); +} + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf, + const word& name +) +{ + Info << "gaussConvectionSchemeFvcDiv start" << endl; + + const fvMesh& mesh = vf.mesh(); + + Istream& divIntScheme = mesh.divScheme(name); + word divScheme(divIntScheme); + + tmp> tinterpScheme_ = + surfaceInterpolationScheme::New(mesh, faceFlux, divIntScheme); + + // tmp> tinterpScheme_ = + // tmp> + // ( + // new linear(mesh) + // ); + + + // surfaceInterpolationScheme interpScheme_ = tinterpScheme_.ref(); + + tmp> tConvection + ( + fvc::surfaceIntegrate(gaussConvectionSchemeFlux(faceFlux, vf, tinterpScheme_)) + ); + + tConvection.ref().rename + ( + "convection(" + faceFlux.name() + ',' + vf.name() + ')' + ); + + return tConvection; +} + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const GeometricField& ssf +) +{ + return tmp> + ( + new GeometricField + ( + "div("+ssf.name()+')', + fvcSurfaceIntegrate(ssf) + ) + ); +} + +template +tmp> +fvcSurfaceIntegrate +( + const GeometricField& ssf +) +{ + const fvMesh& mesh = ssf.mesh(); + + tmp> tvf + ( + new GeometricField + ( + IOobject + ( + "surfaceIntegrate("+ssf.name()+')', + ssf.instance(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensioned + ( + "0", + ssf.dimensions()/dimVol, + Zero + ), + extrapolatedCalculatedFvPatchField::typeName + ) + ); + GeometricField& vf = tvf.ref(); + + fvcSurfaceIntegrate(vf.primitiveFieldRef(), ssf); + vf.correctBoundaryConditions(); + + return tvf; +} + +template +void fvcSurfaceIntegrate +( + Field& ivf, + const GeometricField& ssf +) +{ + const fvMesh& mesh = ssf.mesh(); + + const labelUList& owner = mesh.owner(); + const labelUList& neighbour = mesh.neighbour(); + + const Field& issf = ssf; + + forAll(owner, facei) + { + 
ivf[owner[facei]] += issf[facei]; + ivf[neighbour[facei]] -= issf[facei]; + } + Info << "ivfcpu[473]before bou = " << ivf[473] << endl; + + forAll(mesh.boundary(), patchi) + { + const labelUList& pFaceCells = + mesh.boundary()[patchi].faceCells(); + + const fvsPatchField& pssf = ssf.boundaryField()[patchi]; + + forAll(mesh.boundary()[patchi], facei) + { + ivf[pFaceCells[facei]] += pssf[facei]; + if (pFaceCells[facei] == 473) + { + Info << "pssfcpu[473] += " << pssf[facei] << endl; + } + + } + } + + Info << "ivfcpu[473] = " << ivf[473] << endl; + + ivf /= mesh.Vsc(); + + printf("vol cpu = %.15e\n", mesh.Vsc()()[473]); +} + +template +tmp> +gaussConvectionSchemeFlux +( + const surfaceScalarField& faceFlux, + const GeometricField& vf, + tmp> tinterpScheme +) +{ + Info << vf.name() <> +gaussConvectionSchemeFvmDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +); + +template +tmp> +gaussConvectionSchemeFvmDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +); + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +); + + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const GeometricField& ssf +); + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +} // End namespace Foam + +// ************************************************************************* // \ No newline at end of file diff --git a/GPUTestRef/gaussGrad.C b/GPUTestRef/gaussGrad.C new file mode 100644 index 000000000..401eab38b --- /dev/null +++ b/GPUTestRef/gaussGrad.C @@ -0,0 +1,332 @@ +/*---------------------------------------------------------------------------*\ + ========= | + \\ / F ield | OpenFOAM: The Open Source CFD Toolbox + \\ / O peration | Website: https://openfoam.org + \\ / A nd | Copyright (C) 2011-2018 OpenFOAM Foundation + \\/ M anipulation | +------------------------------------------------------------------------------- +License + This file is part of 
OpenFOAM. + + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . + +\*---------------------------------------------------------------------------*/ + +#include "gaussGrad.H" +#include "extrapolatedCalculatedFvPatchField.H" + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +namespace Foam +{ + + + +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradSchemeGrad +( + const GeometricField& vsf +) +{ + return gaussGradSchemeGrad(vsf, "grad(" + vsf.name() + ')'); +} + +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradSchemeGrad +( + const GeometricField& vsf, + const word& name +) +{ + const fvMesh& mesh = vsf.mesh(); + + typedef typename outerProduct::type GradType; + typedef GeometricField GradFieldType; + + if (!mesh.changing() && mesh.cache(name)) + { + if (!mesh.objectRegistry::template foundObject(name)) + { + solution::cachePrintMessage("Calculating and caching", name, vsf); + tmp tgGrad = gaussGradCalcGrad(vsf, name); + regIOobject::store(tgGrad.ptr()); + } + + solution::cachePrintMessage("Retrieving", name, vsf); + GradFieldType& gGrad = + mesh.objectRegistry::template lookupObjectRef + ( + name + ); + + if (gGrad.upToDate(vsf)) + { + return gGrad; + } + else + { + solution::cachePrintMessage("Deleting", name, vsf); + gGrad.release(); + delete &gGrad; + + 
solution::cachePrintMessage("Recalculating", name, vsf); + tmp tgGrad = gaussGradCalcGrad(vsf, name); + + solution::cachePrintMessage("Storing", name, vsf); + regIOobject::store(tgGrad.ptr()); + GradFieldType& gGrad = + mesh.objectRegistry::template lookupObjectRef + ( + name + ); + + return gGrad; + } + } + else + { + if (mesh.objectRegistry::template foundObject(name)) + { + GradFieldType& gGrad = + mesh.objectRegistry::template lookupObjectRef + ( + name + ); + + if (gGrad.ownedByRegistry()) + { + solution::cachePrintMessage("Deleting", name, vsf); + gGrad.release(); + delete &gGrad; + } + } + + solution::cachePrintMessage("Calculating", name, vsf); + return gaussGradCalcGrad(vsf, name); + } +} + +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradCalcGrad +( + const GeometricField& vsf, + const word& name +) +{ + const fvMesh& mesh = vsf.mesh(); + + tmp> tinterpScheme_ = + tmp> + ( + new linear(mesh) + ); + + typedef typename outerProduct::type GradType; + + tmp> tinterpolate = tinterpScheme_().interpolate(vsf); + + tmp> tgGrad + ( + gaussGradGradf(tinterpolate.ref(), name) + ); + GeometricField& gGrad = tgGrad.ref(); + + gaussGradCorrectBoundaryConditions(vsf, gGrad); + + return tgGrad; +} + +template +void gaussGradCorrectBoundaryConditions +( + const GeometricField& vsf, + GeometricField + < + typename outerProduct::type, fvPatchField, volMesh + >& gGrad +) +{ + typename GeometricField + < + typename outerProduct::type, fvPatchField, volMesh + >::Boundary& gGradbf = gGrad.boundaryFieldRef(); + + forAll(vsf.boundaryField(), patchi) + { + if (!vsf.boundaryField()[patchi].coupled()) + { + const vectorField n + ( + vsf.mesh().Sf().boundaryField()[patchi] + / vsf.mesh().magSf().boundaryField()[patchi] + ); + + gGradbf[patchi] += n * + ( + vsf.boundaryField()[patchi].snGrad() + - (n & gGradbf[patchi]) + ); + } + } +} + +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, 
+ volMesh + > +> +gaussGradGradf +( + const GeometricField& ssf, + const word& name +) +{ + typedef typename outerProduct::type GradType; + + const fvMesh& mesh = ssf.mesh(); + + tmp> tgGrad + ( + new GeometricField + ( + IOobject + ( + name, + ssf.instance(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensioned + ( + "0", + ssf.dimensions()/dimLength, + Zero + ), + extrapolatedCalculatedFvPatchField::typeName + ) + ); + GeometricField& gGrad = tgGrad.ref(); + + const labelUList& owner = mesh.owner(); + const labelUList& neighbour = mesh.neighbour(); + const vectorField& Sf = mesh.Sf(); + + Field& igGrad = gGrad; + const Field& issf = ssf; + + forAll(owner, facei) + { + GradType Sfssf = Sf[facei]*issf[facei]; + + igGrad[owner[facei]] += Sfssf; + igGrad[neighbour[facei]] -= Sfssf; + } + + forAll(mesh.boundary(), patchi) + { + const labelUList& pFaceCells = + mesh.boundary()[patchi].faceCells(); + + const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; + + const fvsPatchField& pssf = ssf.boundaryField()[patchi]; + + forAll(mesh.boundary()[patchi], facei) + { + igGrad[pFaceCells[facei]] += pSf[facei]*pssf[facei]; + if (pFaceCells[facei] == 0) + { + // Info << "CPU add = " << pSf[facei]*pssf[facei] << endl; + // Info << "surface CPU = " << pSf[facei] << endl; + // Info << "field CPU = " << pssf[facei] << endl; + } + } + } + + igGrad /= mesh.V(); + + gGrad.correctBoundaryConditions(); + + return tgGrad; +} + + +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradSchemeGrad +( + const GeometricField& vsf +); + + +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradSchemeGrad +( + const GeometricField& vsf +); + +} // End namespace Foam + +// ************************************************************************* // \ No newline at end of file diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 
4e0bd4cbe..7eb8b9ec2 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -124,6 +124,7 @@ struct dfMatrixDataBase double *d_boundary_mag_sf = nullptr; double *d_boundary_weight = nullptr; double *d_boundary_delta_coeffs = nullptr; + int *d_boundary_face_cell = nullptr; // non-constant fields - internal // TODO: further estimate @@ -197,7 +198,7 @@ struct dfMatrixDataBase void initConstantFieldsInternal(const double *sf, const double *mag_sf, const double *weight, const double *delta_coeffs, const double *volume); void initConstantFieldsBoundary(const double *boundary_sf, const double *boundary_mag_sf, - const double *boundary_delta_coeffs); + const double *boundary_delta_coeffs, const int *boundary_face_cell); void createNonConstantFieldsInternal(); void createNonConstantFieldsBoundary(); diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index b426201a2..cb6a44d5f 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -198,6 +198,7 @@ void dfMatrixDataBase::createConstantFieldsBoundary() { checkCudaErrors(cudaMalloc((void**)&d_boundary_sf, boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_mag_sf, boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_delta_coeffs, boundary_surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_face_cell, boundary_surface_index_bytes)); fieldPointerMap["d_boundary_sf"] = d_boundary_sf; fieldPointerMap["d_boundary_mag_sf"] = d_boundary_mag_sf; fieldPointerMap["d_boundary_delta_coeffs"] = d_boundary_delta_coeffs; @@ -213,10 +214,11 @@ void dfMatrixDataBase::initConstantFieldsInternal(const double *sf, const double } void dfMatrixDataBase::initConstantFieldsBoundary(const double *boundary_sf, const double *boundary_mag_sf, - const double *boundary_delta_coeffs) { + const double *boundary_delta_coeffs, const int *boundary_face_cell) { checkCudaErrors(cudaMemcpyAsync(d_boundary_sf, boundary_sf, 
boundary_surface_value_vec_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_boundary_mag_sf, boundary_mag_sf, boundary_surface_value_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_boundary_delta_coeffs, boundary_delta_coeffs, boundary_surface_value_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_face_cell, boundary_face_cell, boundary_surface_index_bytes, cudaMemcpyHostToDevice, stream)); } void dfMatrixDataBase::createNonConstantFieldsInternal() { diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index 9de229b14..b4015f0a6 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -43,6 +43,18 @@ void fvc_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *vf_old, double *output); + +void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *Sf, const double *vf, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, + const double *volume, const double *boundary_mag_Sf, double *boundary_output, + const double *boundary_deltaCoeffs); + +void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, + const int *lowerAddr, const int *upperAddr, const double *ssf, const int *boundary_cell_face, + const double *boundary_ssf, const double *volume, double *output); // void fvc_grad_surface(); // // void fvc_div_cell(); diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index f55b6895a..e397b4232 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -192,6 +192,295 @@ __global__ void fvc_ddt_scalar_kernel(int num_cells, double rDeltaT, output[index] += rDeltaT 
// ---------------------------------------------------------------------------
// Device kernels and host wrappers for the explicit (fvc) operators:
//   - fvc::grad of a cell-centred vector field (Gauss theorem, linear
//     face interpolation, with uncoupled boundary-condition correction)
//   - fvc::div of a surface scalar field (surface integration)
// Storage conventions (assumed throughout this file):
//   vectors : field[cell * 3 + component]
//   tensors : field[cell * 9 + row * 3 + col]   (row-major per cell)
// ---------------------------------------------------------------------------

// Internal-face contribution of the Gauss gradient.
// For each face: interpolate vf to the face, ssf = w*vf_own + (1-w)*vf_nei,
// then accumulate the outer product Sf (x) ssf into the owner cell and
// subtract it from the neighbour cell.  atomicAdd is required because many
// faces share the same cell.
__global__ void fvc_grad_vector_internal(int num_surfaces,
        const int *lower_index, const int *upper_index, const double *face_vector,
        const double *weight, const double *field_vector,
        double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_surfaces)
        return;

    double w = weight[index];
    int owner = lower_index[index];
    int neighbor = upper_index[index];

    // linear face interpolation: ssf = w*(vf_own - vf_nei) + vf_nei
    double ssf[3];
    #pragma unroll
    for (int c = 0; c < 3; ++c) {
        double vo = field_vector[owner * 3 + c];
        double vn = field_vector[neighbor * 3 + c];
        ssf[c] = w * (vo - vn) + vn;
    }

    #pragma unroll
    for (int r = 0; r < 3; ++r) {
        double sf = face_vector[index * 3 + r];
        #pragma unroll
        for (int c = 0; c < 3; ++c) {
            double grad = sf * ssf[c];
            atomicAdd(&(output[owner * 9 + r * 3 + c]), grad);
            atomicAdd(&(output[neighbor * 9 + r * 3 + c]), -grad);
        }
    }
}

// Boundary-face contribution of the Gauss gradient for uncoupled patches:
// the face value of vf is the patch value itself, so accumulate
// Sf_b (x) vf_b into the adjacent cell only.
// TODO: this function is implemented for uncoupled boundary conditions,
// so it should use a more specific function name.
__global__ void fvc_grad_vector_boundary(int num, int offset, const int *face2Cells,
        const double *boundary_face_vector, const double *boundary_field_vector, double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    int start_index = offset + index;   // global boundary-face index
    int cellIndex = face2Cells[start_index];

    #pragma unroll
    for (int r = 0; r < 3; ++r) {
        double sf = boundary_face_vector[start_index * 3 + r];
        #pragma unroll
        for (int c = 0; c < 3; ++c) {
            atomicAdd(&(output[cellIndex * 9 + r * 3 + c]),
                      sf * boundary_field_vector[start_index * 3 + c]);
        }
    }
}

// Divide an accumulated cell tensor field by the cell volumes
// (final step of the Gauss surface integration).
__global__ void divide_cell_volume_tsr(int num_cells, const double* volume, double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    double rvol = 1.0 / volume[index];
    #pragma unroll
    for (int i = 0; i < 9; ++i)
        output[index * 9 + i] *= rvol;
}

// Divide an accumulated cell scalar field by the cell volumes.
__global__ void divide_cell_volume_scalar(int num_cells, const double* volume, double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    output[index] = output[index] / volume[index];
}

// Evaluate the boundary gradient for a zeroGradient patch:
// replace the surface-normal component of the interior gradient so that
// n . grad(vf) = 0 on the face, keeping the tangential part.
// NOTE: `vf` is kept in the signature for interface symmetry with the
// fixedValue variant but is not needed when sn_grad == 0.
__global__ void fvc_grad_vector_correctBC_zeroGradient(int num, int offset, const int *face2Cells,
        const double *internal_grad, const double *vf, const double *boundary_sf,
        const double *boundary_mag_sf, double *boundary_grad)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    int start_index = offset + index;
    int cellIndex = face2Cells[start_index];

    double g[9];
    #pragma unroll
    for (int i = 0; i < 9; ++i)
        g[i] = internal_grad[cellIndex * 9 + i];

    // fix: the face geometry must be indexed by the boundary-face index,
    // not by the adjacent cell index (matches the fixedValue kernel).
    double n[3];
    #pragma unroll
    for (int r = 0; r < 3; ++r)
        n[r] = boundary_sf[start_index * 3 + r] / boundary_mag_sf[start_index];

    #pragma unroll
    for (int c = 0; c < 3; ++c) {
        // sn_grad = 0 for zeroGradient
        double corr = -(n[0] * g[0 * 3 + c] + n[1] * g[1 * 3 + c] + n[2] * g[2 * 3 + c]);
        #pragma unroll
        for (int r = 0; r < 3; ++r)
            boundary_grad[start_index * 9 + r * 3 + c] = g[r * 3 + c] + n[r] * corr;
    }
}

// Evaluate the boundary gradient for a fixedValue patch:
// replace the surface-normal component of the interior gradient with the
// one-sided surface-normal gradient deltaCoeffs * (vf_b - vf_cell).
__global__ void fvc_grad_vector_correctBC_fixedValue(int num, int offset, const int *face2Cells,
        const double *internal_grad, const double *vf, const double *boundary_sf,
        const double *boundary_mag_sf, double *boundary_grad,
        const double *boundary_deltaCoeffs, const double *boundary_vf)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    int start_index = offset + index;
    int cellIndex = face2Cells[start_index];

    double g[9];
    #pragma unroll
    for (int i = 0; i < 9; ++i)
        g[i] = internal_grad[cellIndex * 9 + i];

    double n[3];
    #pragma unroll
    for (int r = 0; r < 3; ++r)
        n[r] = boundary_sf[start_index * 3 + r] / boundary_mag_sf[start_index];

    #pragma unroll
    for (int c = 0; c < 3; ++c) {
        // sn_grad: one-sided gradient implied by the fixedValue BC
        double sn_grad = boundary_deltaCoeffs[start_index]
                * (boundary_vf[start_index * 3 + c] - vf[cellIndex * 3 + c]);
        double corr = sn_grad
                - (n[0] * g[0 * 3 + c] + n[1] * g[1 * 3 + c] + n[2] * g[2 * 3 + c]);
        #pragma unroll
        for (int r = 0; r < 3; ++r)
            boundary_grad[start_index * 9 + r * 3 + c] = g[r * 3 + c] + n[r] * corr;
    }
}

// Internal-face contribution of fvc::div of a surface scalar field:
// add the face flux to the owner cell, subtract it from the neighbour.
__global__ void fvc_div_surface_scalar_internal(int num_surfaces,
        const int *lower_index, const int *upper_index, const double *ssf,
        double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_surfaces)
        return;

    double issf = ssf[index];
    atomicAdd(&(output[lower_index[index]]), issf);   // owner
    atomicAdd(&(output[upper_index[index]]), -issf);  // neighbour
}

// Boundary-face contribution of fvc::div of a surface scalar field.
__global__ void fvc_div_surface_scalar_boundary(int num_boundary_face, const int *face2Cells,
        const double *boundary_ssf, double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_boundary_face)
        return;

    atomicAdd(&(output[face2Cells[index]]), boundary_ssf[index]);
}

// Host wrapper: Gauss gradient of a cell-centred vector field.
// Accumulates internal and boundary face contributions into `output`,
// divides by the cell volumes, then evaluates the patch gradients into
// `boundary_output` according to each patch's boundary condition.
void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces,
        const int *lowerAddr, const int *upperAddr,
        const double *weight, const double *Sf, const double *vf, double *output, // end for internal
        int num_patches, const int *patch_size, const int *patch_type,
        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf,
        const double *volume, const double *boundary_mag_Sf, double *boundary_output,
        const double *boundary_deltaCoeffs)
{
    size_t threads_per_block = 1024;
    size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block;
    fvc_grad_vector_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(
            num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output);

    // finish constructing the grad field, except for dividing by cell volume
    int offset = 0;
    for (int i = 0; i < num_patches; i++) {
        threads_per_block = 256;
        blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
        // TODO: just basic patch types for now
        if (patch_type[i] == boundaryConditions::zeroGradient
                || patch_type[i] == boundaryConditions::fixedValue) {
            // TODO: just the vector version for now
            fvc_grad_vector_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(
                    patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output);
        } else {
            // fix: was a dead `else if (0)` branch, so unsupported boundary
            // conditions were silently ignored
            fprintf(stderr, "boundaryConditions other than zeroGradient/fixedValue are not supported yet!\n");
        }
        offset += patch_size[i];
    }

    // divide by cell volume
    threads_per_block = 1024;
    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
    divide_cell_volume_tsr<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, volume, output);

    // correct boundary conditions
    offset = 0;
    for (int i = 0; i < num_patches; i++) {
        threads_per_block = 256;
        blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
        // fix: the correctBC kernels read the *internal* field at the adjacent
        // cell, so they must receive `vf`, not `boundary_vf`
        if (patch_type[i] == boundaryConditions::zeroGradient) {
            fvc_grad_vector_correctBC_zeroGradient<<<blocks_per_grid, threads_per_block, 0, stream>>>(
                    patch_size[i], offset, boundary_cell_face,
                    output, vf, boundary_Sf, boundary_mag_Sf, boundary_output);
        } else if (patch_type[i] == boundaryConditions::fixedValue) {
            fvc_grad_vector_correctBC_fixedValue<<<blocks_per_grid, threads_per_block, 0, stream>>>(
                    patch_size[i], offset, boundary_cell_face,
                    output, vf, boundary_Sf, boundary_mag_Sf, boundary_output,
                    boundary_deltaCoeffs, boundary_vf);
        } else {
            fprintf(stderr, "boundaryConditions other than zeroGradient/fixedValue are not supported yet!\n");
        }
        offset += patch_size[i];
    }
}

// Host wrapper: fvc::div of a surface scalar field.
// Surface-integrates the internal and boundary fluxes into `output`,
// then divides by the cell volumes.
void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces,
        const int *lowerAddr, const int *upperAddr, const double *ssf, const int *boundary_cell_face,
        const double *boundary_ssf, const double *volume, double *output)
{
    size_t threads_per_block = 1024;
    size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block;
    fvc_div_surface_scalar_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(
            num_surfaces, lowerAddr, upperAddr, ssf, output);

    blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block;
    fvc_div_surface_scalar_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(
            num_boundary_surfaces, boundary_cell_face, boundary_ssf, output);

    // divide by cell volume
    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
    divide_cell_volume_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, volume, output);
}
*stride, size_t *internal_size, size_t *boundary_size) { *boundary_size = dfDataBase.num_boundary_surfaces * s; } + +template +void getFieldPtr(std::queue& fieldPtrQue, T& field){ + fieldPtrQue.push(&field[0]); + forAll(field.boundaryField(), patchi){ + auto& patchField = field.boundaryFieldRef()[patchi]; + fieldPtrQue.push(&patchField[0]); + } +}; + +// template +// void getFieldPtr(std::vector& fieldPtrQue, T& field){ +// fieldPtrQue.push_back(&field[0]); +// forAll(field.boundaryField(), patchi){ +// auto& patchField = field.boundaryFieldRef()[patchi]; +// fieldPtrQue.push_back(&patchField[0]); +// Info << "patchi " << patchi << endl; +// } +// }; + + template void randomInitField(T& field) { size_t stride = 0; size_t internal_size = 0; size_t boundary_size = 0; getTypeInfo(&stride, &internal_size, &boundary_size); - size_t internal_value_bytes = internal_size * sizeof(double); + size_t internal_value_bytes = internal_size * sizeof(double) * stride; + std::queue fieldPtrQue; + // std::vector fieldPtrQue; + getFieldPtr(fieldPtrQue, field); // random init field value to (-0.5, 0.5) // internal - double *field_internal_ptr = &field[0]; + double *&field_internal_ptr = fieldPtrQue.front(); fieldPtrQue.pop(); + // double *field_internal_ptr = fieldPtrQue[0]; std::vector init_field_internal; - init_field_internal.resize(internal_size); - for (size_t i = 0; i < internal_size; i++) { + init_field_internal.resize(internal_size * stride); + for (size_t i = 0; i < internal_size * stride; i++) { init_field_internal[i] = (rand() % 10000 - 5000) / 10000.0; } memcpy(field_internal_ptr, init_field_internal.data(), internal_value_bytes); // boundary + int ptrIndex = 1; forAll(field.boundaryField(), patchi) { auto& patchField = field.boundaryFieldRef()[patchi]; size_t patchsize = patchField.size(); - double *field_boundary_ptr = &patchField[0]; + double *&field_boundary_ptr = fieldPtrQue.front(); fieldPtrQue.pop(); + // double *field_boundary_ptr = fieldPtrQue[ptrIndex]; + // 
ptrIndex ++; std::vector init_field_boundary; init_field_boundary.resize(patchsize * stride); for (size_t i = 0; i < patchsize * stride; i++) { @@ -296,7 +324,8 @@ void test_fvm_ddt_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc } // run CPU - fvVectorMatrix dfMatrix = fvm::ddt(rho, U); + // fvVectorMatrix dfMatrix = fvm::ddt(rho, U); + fvVectorMatrix dfMatrix = EulerDdtSchemeFvmDdt(rho, U); // prepare for run GPU // prepare rho, rho.old, U @@ -325,7 +354,8 @@ void test_fvm_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfa } // run CPU - fvVectorMatrix dfMatrix = fvm::div(phi, U); + // fvVectorMatrix dfMatrix = fvm::div(phi, U); + fvVectorMatrix dfMatrix = gaussConvectionSchemeFvmDiv(phi, U); // prepare for run GPU // prepare phi field @@ -365,7 +395,8 @@ void test_fvm_laplacian_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, } // run CPU - fvVectorMatrix dfMatrix = fvm::laplacian(gamma, U); + // fvVectorMatrix dfMatrix = fvm::laplacian(gamma, U); + fvVectorMatrix dfMatrix = gaussLaplacianSchemeFvmLaplacian(gamma, U); // prepare for run GPU // prepare gamma on GPU @@ -415,7 +446,8 @@ void test_fvc_ddt_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc } // run CPU - volScalarField fvc_ouput_scalar = fvc::ddt(rho, K); + // volScalarField fvc_ouput_scalar = fvc::ddt(rho, K); + volScalarField fvc_ouput_scalar = EulerDdtSchemeFvcDdt(rho, K); // prepare for run GPU // prepare rho, rho.old on GPU @@ -453,10 +485,10 @@ void test_fvc_ddt_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc // unittest of fvc::grad(U) void test_fvc_grad_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volVectorField& U, initType type) { - // if (type == initType::randomInit) { - // U.oldTime(); - // randomInitField(U); - // } + if (type == initType::randomInit) { + U.oldTime(); + randomInitField(U); + } // run CPU volTensorField fvc_ouput_tensor = fvc::grad(U); @@ -519,3 +551,60 @@ void 
test_fvc_div_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfa checkCudaErrors(cudaMemcpy(h_fvc_ouput_scalar.data(), d_fvc_ouput_scalar, dfDataBase.cell_value_bytes, cudaMemcpyDeviceToHost)); checkVectorEqual(dfDataBase.num_cells, &fvc_ouput_scalar[0], h_fvc_ouput_scalar.data(), 1e-14, printFlag); } + +// unittest of fvc::div(U) +void test_fvc_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volVectorField& U, initType type) { + if (type == initType::randomInit) { + U.oldTime(); + randomInitField(U); + } + + // run CPU + // volScalarField fvc_ouput_scalar = fvc::div(U); + volScalarField fvc_ouput_scalar = gaussDivFvcdiv(U); + + // prepare for run GPU + // prepare phi on GPU + uploadRegisteredField(dfDataBase, U, "u"); + + double *d_fvc_ouput_scalar = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_scalar, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_scalar, 0, dfDataBase.cell_value_bytes)); + + // only need patch_type + testGPUDataBase testData; + buildTestGPUDataBase(dfDataBase, testData, U, false, false, false, false, false, false, false, false, false, false); + + fvc_div_cell_vector(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.num_surfaces, + dfDataBase.d_owner, dfDataBase.d_neighbor, + dfDataBase.d_weight, dfDataBase.d_sf, dfDataBase.d_u, d_fvc_ouput_scalar, + dfDataBase.num_patches, dfDataBase.patch_size.data(), testData.patch_type.data(), + dfDataBase.d_boundary_face_cell, dfDataBase.d_boundary_u, dfDataBase.d_boundary_sf, + dfDataBase.d_volume); + + // compare result + bool printFlag = false; + std::vector h_fvc_ouput_scalar(dfDataBase.num_cells); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_scalar.data(), d_fvc_ouput_scalar, dfDataBase.cell_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells, &fvc_ouput_scalar[0], h_fvc_ouput_scalar.data(), 1e-14, printFlag); +} + + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // +template <> +void 
getFieldPtr(std::queue& fieldPtrQue, volVectorField& field) { + fieldPtrQue.push(&field[0][0]); + forAll(field.boundaryField(), patchi){ + auto& patchField = field.boundaryFieldRef()[patchi]; + fieldPtrQue.push(&patchField[0][0]); + } +}; + +template <> +void getFieldPtr(std::queue& fieldPtrQue, volTensorField& field) { + fieldPtrQue.push(&field[0][0]); + forAll(field.boundaryField(), patchi){ + auto& patchField = field.boundaryFieldRef()[patchi]; + fieldPtrQue.push(&patchField[0][0]); + } +}; \ No newline at end of file diff --git a/GPUTest/unittest.C b/GPUTest/unittest.C index ccbaefa71..0577b3d2e 100644 --- a/GPUTest/unittest.C +++ b/GPUTest/unittest.C @@ -51,6 +51,8 @@ Description // debug #include "GenFvMatrix.H" +#include +#include #include "dfMatrixDataBase.H" #include "dfMatrixOpBase.H" @@ -132,12 +134,20 @@ int main(int argc, char *argv[]) // unittest of fvc::grad(U) test_fvc_grad_vector(dfDataBase, mesh, U, initType::original); DEBUG_TRACE; + test_fvc_grad_vector(dfDataBase, mesh, U, initType::randomInit); + DEBUG_TRACE; // unittest of fvc::div(phi) test_fvc_div_scalar(dfDataBase, mesh, phi, initType::original); DEBUG_TRACE; test_fvc_div_scalar(dfDataBase, mesh, phi, initType::randomInit); DEBUG_TRACE; + + // unittest of fvc::div(U) + test_fvc_div_vector(dfDataBase, mesh, U, initType::original); + DEBUG_TRACE; + test_fvc_div_vector(dfDataBase, mesh, U, initType::randomInit); + DEBUG_TRACE; } return 0; } diff --git a/GPUTestRef/EulerDdtScheme.C b/GPUTestRef/EulerDdtScheme.C new file mode 100644 index 000000000..0875e0033 --- /dev/null +++ b/GPUTestRef/EulerDdtScheme.C @@ -0,0 +1,322 @@ +/*---------------------------------------------------------------------------*\ + ========= | + \\ / F ield | OpenFOAM: The Open Source CFD Toolbox + \\ / O peration | Website: https://openfoam.org + \\ / A nd | Copyright (C) 2011-2018 OpenFOAM Foundation + \\/ M anipulation | +------------------------------------------------------------------------------- +License + This 
file is part of OpenFOAM. + + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . + +\*---------------------------------------------------------------------------*/ + +#include "GenFvMatrix.H" + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +namespace Foam +{ + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +// namespace fv +// { + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +template +tmp> +EulerDdtSchemeFvmDdt +( + const volScalarField& rho, + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + + tmp> tfvm + ( + new fvMatrix + ( + vf, + rho.dimensions()*vf.dimensions()*dimVol/dimTime + ) + ); + fvMatrix& fvm = tfvm.ref(); + + scalar rDeltaT = 1.0/mesh.time().deltaTValue(); + + fvm.diag() = rDeltaT*rho.primitiveField()*mesh.Vsc(); + + if (mesh.moving()) + { + fvm.source() = rDeltaT + *rho.oldTime().primitiveField() + *vf.oldTime().primitiveField()*mesh.Vsc0(); + } + else + { + fvm.source() = rDeltaT + *rho.oldTime().primitiveField() + *vf.oldTime().primitiveField()*mesh.Vsc(); + } + return tfvm; +} + +template +tmp> +EulerDdtSchemeFvcDdt +( + const volScalarField& rho, + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + + dimensionedScalar rDeltaT = 1.0/mesh.time().deltaT(); + + IOobject ddtIOobject + ( + "ddt("+rho.name()+','+vf.name()+')', + mesh.time().timeName(), + mesh + ); + + if 
(mesh.moving()) + { + return tmp> + ( + new GeometricField + ( + ddtIOobject, + rDeltaT* + ( + rho()*vf() + - rho.oldTime()() + *vf.oldTime()()*mesh.Vsc0()/mesh.Vsc() + ), + rDeltaT.value()* + ( + rho.boundaryField()*vf.boundaryField() + - rho.oldTime().boundaryField() + *vf.oldTime().boundaryField() + ) + ) + ); + } + else + { + return tmp> + ( + new GeometricField + ( + ddtIOobject, + rDeltaT*(rho*vf - rho.oldTime()*vf.oldTime()) + ) + ); + } +} + + +tmp +EulerDdtSchemeFvcDdtCorr +( + const volScalarField& rho, + const volVectorField& U, + const surfaceScalarField& phi, + const autoPtr& Uf +) +{ + Info << "EulerDdtSchemeFvcDdtCorr start" << endl; + + const fvMesh& mesh = U.mesh(); + + dimensionedScalar rDeltaT = 1.0/mesh.time().deltaT(); + + GeometricField rhoU0 + ( + rho.oldTime() * U.oldTime() + ); + + surfaceScalarField phiCorr + ( + phi.oldTime() - fvc::dotInterpolate(mesh.Sf(), rhoU0) + ); + + return tmp + ( + new surfaceScalarField + ( + IOobject + ( + "ddtCorr(" + + rho.name() + ',' + U.name() + ',' + phi.name() + ')', + mesh.time().timeName(), + mesh + ), + EulerDdtSchemeFvcDdtPhiCoeff + ( + rhoU0, + phi.oldTime(), + phiCorr, + rho.oldTime() + )*rDeltaT*phiCorr + ) + ); + +} + +tmp +EulerDdtSchemeFvcDdtPhiCoeff +( + const volVectorField& U, + const surfaceScalarField& phi, + const surfaceScalarField& phiCorr, + const volScalarField& rho +) +{ + const fvMesh& mesh = U.mesh(); + tmp tddtCouplingCoeff = scalar(1) - min(mag(phiCorr)/(mag(phi) + dimensionedScalar("small", phi.dimensions(), SMALL)),scalar(1)); + + surfaceScalarField& ddtCouplingCoeff = tddtCouplingCoeff.ref(); + + surfaceScalarField::Boundary& ccbf = ddtCouplingCoeff.boundaryFieldRef(); + + forAll(U.boundaryField(), patchi) + { + if + ( U.boundaryField()[patchi].fixesValue() + || isA(mesh.boundary()[patchi]) + ) + { + ccbf[patchi] = 0.0; + } + } + + return tddtCouplingCoeff; +} + +template +tmp> +EulerDdtSchemeFvmDdt +( + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + + tmp> 
tfvm + ( + new fvMatrix + ( + vf, + vf.dimensions()*dimVol/dimTime + ) + ); + + fvMatrix& fvm = tfvm.ref(); + + scalar rDeltaT = 1.0/mesh.time().deltaTValue(); + + fvm.diag() = rDeltaT*mesh.Vsc(); + + if (mesh.moving()) + { + fvm.source() = rDeltaT*vf.oldTime().primitiveField()*mesh.Vsc0(); + } + else + { + fvm.source() = rDeltaT*vf.oldTime().primitiveField()*mesh.Vsc(); + } + + return tfvm; +} + +template +tmp> +EulerDdtSchemeFvcDdt +( + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + + dimensionedScalar rDeltaT = 1.0/mesh.time().deltaT(); + + IOobject ddtIOobject + ( + "ddt("+vf.name()+')', + mesh.time().timeName(), + mesh + ); + + return tmp> + ( + new GeometricField + ( + ddtIOobject, + rDeltaT*(vf - vf.oldTime()) + ) + ); +} + +template +tmp> +EulerDdtSchemeFvmDdt +( + const volScalarField& rho, + const GeometricField& vf +); + +template +tmp> +EulerDdtSchemeFvmDdt +( + const volScalarField& rho, + const GeometricField& vf +); + +template +tmp> +EulerDdtSchemeFvcDdt +( + const volScalarField& rho, + const GeometricField& vf +); + +template +tmp> +EulerDdtSchemeFvmDdt +( + const GeometricField& vf +); + +template +tmp> +EulerDdtSchemeFvcDdt +( + const GeometricField& vf +); + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +// } // End namespace fv + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +} // End namespace Foam + +// ************************************************************************* // \ No newline at end of file diff --git a/GPUTestRef/GenFvMatrix.H b/GPUTestRef/GenFvMatrix.H index d328fe504..d76fa94d9 100644 --- a/GPUTestRef/GenFvMatrix.H +++ b/GPUTestRef/GenFvMatrix.H @@ -139,6 +139,19 @@ gaussConvectionSchemeFvcDiv const GeometricField& ssf ); +template +tmp +< + GeometricField + < + typename innerProduct::type, fvPatchField, volMesh + > +> +gaussDivFvcdiv +( + const GeometricField& vf +); + // fvc::grad template tmp diff --git a/GPUTestRef/Make/files 
b/GPUTestRef/Make/files index 1137c3eed..314f1f495 100644 --- a/GPUTestRef/Make/files +++ b/GPUTestRef/Make/files @@ -1,4 +1,6 @@ gaussGrad.C gaussConvectionScheme.C +gaussLaplacianScheme.C +EulerDdtScheme.C LIB = $(DF_LIBBIN)/libdfGenMatrix \ No newline at end of file diff --git a/GPUTestRef/gaussConvectionScheme.C b/GPUTestRef/gaussConvectionScheme.C index 83d11bda9..b8157d2d1 100644 --- a/GPUTestRef/gaussConvectionScheme.C +++ b/GPUTestRef/gaussConvectionScheme.C @@ -164,6 +164,42 @@ gaussConvectionSchemeFvcDiv ); } +template +tmp +< + GeometricField + < + typename innerProduct::type, fvPatchField, volMesh + > +> +gaussDivFvcdiv +( + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + Istream& divIntScheme = mesh.divScheme("div("+vf.name()+')'); + word divScheme(divIntScheme); + + tmp> tinterpScheme_ = + surfaceInterpolationScheme::New(mesh, divIntScheme); + + tmp + < + GeometricField + ::type, fvPatchField, volMesh> + > tDiv + ( + fvcSurfaceIntegrate + ( + (tinterpScheme_().dotInterpolate(mesh.Sf(), vf))() + ) + ); + + + return tDiv; +} + template tmp> fvcSurfaceIntegrate @@ -222,7 +258,6 @@ void fvcSurfaceIntegrate ivf[owner[facei]] += issf[facei]; ivf[neighbour[facei]] -= issf[facei]; } - Info << "ivfcpu[473]before bou = " << ivf[473] << endl; forAll(mesh.boundary(), patchi) { @@ -234,19 +269,10 @@ void fvcSurfaceIntegrate forAll(mesh.boundary()[patchi], facei) { ivf[pFaceCells[facei]] += pssf[facei]; - if (pFaceCells[facei] == 473) - { - Info << "pssfcpu[473] += " << pssf[facei] << endl; - } - } } - Info << "ivfcpu[473] = " << ivf[473] << endl; - ivf /= mesh.Vsc(); - - printf("vol cpu = %.15e\n", mesh.Vsc()()[473]); } template @@ -298,6 +324,26 @@ gaussConvectionSchemeFvcDiv const GeometricField& ssf ); +template +tmp> +gaussConvectionSchemeFvcDiv +( + const GeometricField& ssf +); + +template +tmp +< + GeometricField + < + typename innerProduct::type, fvPatchField, volMesh + > +> +gaussDivFvcdiv +( + const GeometricField& vf +); + // * * * * 
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // } // End namespace Foam diff --git a/GPUTestRef/gaussLaplacianScheme.C b/GPUTestRef/gaussLaplacianScheme.C new file mode 100644 index 000000000..ed321ceda --- /dev/null +++ b/GPUTestRef/gaussLaplacianScheme.C @@ -0,0 +1,273 @@ +/*---------------------------------------------------------------------------* + ========= | + / F ield | OpenFOAM: The Open Source CFD Toolbox + / O peration | Website: https://openfoam.org + / A nd | Copyright (C) 2011-2018 OpenFOAM Foundation +/ M anipulation | +------------------------------------------------------------------------------- +License + This file is part of OpenFOAM. + + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . 
+ +*---------------------------------------------------------------------------*/ + +#include "gaussLaplacianScheme.H" +#include "surfaceInterpolate.H" +#include "fvcDiv.H" +#include "fvcGrad.H" +#include "fvMatrices.H" +#include "snGradScheme.H" +#include "linear.H" +#include "orthogonalSnGrad.H" + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +namespace Foam +{ + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +template +tmp> +gaussLaplacianSchemeFvmLaplacianUncorrected +( + const surfaceScalarField& gammaMagSf, + const surfaceScalarField& deltaCoeffs, + const GeometricField& vf +) +{ + tmp> tfvm + ( + new fvMatrix + ( + vf, + deltaCoeffs.dimensions()*gammaMagSf.dimensions()*vf.dimensions() + ) + ); + fvMatrix& fvm = tfvm.ref(); + + fvm.upper() = deltaCoeffs.primitiveField()*gammaMagSf.primitiveField(); + fvm.negSumDiag(); + + forAll(vf.boundaryField(), patchi) + { + const fvPatchField& pvf = vf.boundaryField()[patchi]; + const fvsPatchScalarField& pGamma = gammaMagSf.boundaryField()[patchi]; + const fvsPatchScalarField& pDeltaCoeffs = + deltaCoeffs.boundaryField()[patchi]; + + if (pvf.coupled()) + { + fvm.internalCoeffs()[patchi] = + pGamma*pvf.gradientInternalCoeffs(pDeltaCoeffs); + fvm.boundaryCoeffs()[patchi] = + -pGamma*pvf.gradientBoundaryCoeffs(pDeltaCoeffs); + } + else + { + fvm.internalCoeffs()[patchi] = pGamma*pvf.gradientInternalCoeffs(); + fvm.boundaryCoeffs()[patchi] = -pGamma*pvf.gradientBoundaryCoeffs(); + } + } + + return tfvm; +} + + +template +tmp> +gaussLaplacianSchemeGammaSnGradCorr +( + const surfaceVectorField& SfGammaCorr, + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + + tmp> tgammaSnGradCorr + ( + new GeometricField + ( + IOobject + ( + "gammaSnGradCorr("+vf.name()+')', + vf.instance(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + 
SfGammaCorr.dimensions() + *vf.dimensions()*mesh.deltaCoeffs().dimensions() + ) + ); + + for (direction cmpt = 0; cmpt < pTraits::nComponents; cmpt++) + { + tgammaSnGradCorr.ref().replace + ( + cmpt, + fvc::dotInterpolate(SfGammaCorr, fvc::grad(vf.component(cmpt))) + ); + } + + return tgammaSnGradCorr; +} + +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gammaScalarVol, + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + tmp> tinterpGammaScheme_(new linear(mesh)); + tmp> tsnGradScheme_(new fv::orthogonalSnGrad(mesh)); + + tmp> tgamma = tinterpGammaScheme_().interpolate(gammaScalarVol); + const GeometricField& gamma = tgamma.ref(); + + GeometricField gammaMagSf + ( + gamma*mesh.magSf() + ); + + tmp> tfvm = gaussLaplacianSchemeFvmLaplacianUncorrected + ( + gammaMagSf, + tsnGradScheme_().deltaCoeffs(vf), + vf + ); + fvMatrix& fvm = tfvm.ref(); + + if (tsnGradScheme_().corrected()) + { + if (mesh.fluxRequired(vf.name())) + { + fvm.faceFluxCorrectionPtr() = new + GeometricField + ( + gammaMagSf*tsnGradScheme_().correction(vf) + ); + + fvm.source() -= + mesh.V()* + fvc::div + ( + *fvm.faceFluxCorrectionPtr() + )().primitiveField(); + } + else + { + fvm.source() -= + mesh.V()* + fvc::div + ( + gammaMagSf*tsnGradScheme_().correction(vf) + )().primitiveField(); + } + } + return tfvm; +} + +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gamma, + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + tmp> tsnGradScheme_(new fv::orthogonalSnGrad(mesh)); + + GeometricField gammaMagSf + ( + gamma*mesh.magSf() + ); + + tmp> tfvm = gaussLaplacianSchemeFvmLaplacianUncorrected + ( + gammaMagSf, + tsnGradScheme_().deltaCoeffs(vf), + vf + ); + fvMatrix& fvm = tfvm.ref(); + + if (tsnGradScheme_().corrected()) + { + if (mesh.fluxRequired(vf.name())) + { + fvm.faceFluxCorrectionPtr() = new + GeometricField + ( + gammaMagSf*tsnGradScheme_().correction(vf) + ); + + fvm.source() -= + mesh.V()* + fvc::div 
+ ( + *fvm.faceFluxCorrectionPtr() + )().primitiveField(); + } + else + { + fvm.source() -= + mesh.V()* + fvc::div + ( + gammaMagSf*tsnGradScheme_().correction(vf) + )().primitiveField(); + } + } + return tfvm; +} + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gammaScalarVol, + const GeometricField& vf +); + +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gammaScalarVol, + const GeometricField& vf +); + +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gamma, + const GeometricField& vf +); + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +} // End namespace Foam + +// ************************************************************************* // \ No newline at end of file diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index b4015f0a6..35ac78c82 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -55,7 +55,11 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, const int *lowerAddr, const int *upperAddr, const double *ssf, const int *boundary_cell_face, const double *boundary_ssf, const double *volume, double *output); -// void fvc_grad_surface(); -// -// void fvc_div_cell(); + +void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *Sf, const double *vf, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, + const double *volume); diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index e397b4232..39ad1f6a3 100644 --- 
a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -244,13 +244,6 @@ __global__ void fvc_grad_vector_internal(int num_surfaces, atomicAdd(&(output[neighbor * 9 + 6]), -grad_zx); atomicAdd(&(output[neighbor * 9 + 7]), -grad_zy); atomicAdd(&(output[neighbor * 9 + 8]), -grad_zz); - - // if (owner == 0) - // { - // printf("tensor[0] = (%.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e)\n", output[owner * 9 + 0], - // output[owner * 9 + 1], output[owner * 9 + 2], output[owner * 9 + 3], output[owner * 9 + 4], output[owner * 9 + 5], - // output[owner * 9 + 6], output[owner * 9 + 7], output[owner * 9 + 8]); - // } } // update boundary of interpolation field @@ -276,12 +269,6 @@ __global__ void fvc_grad_vector_boundary(int num, int offset, const int *face2Ce int cellIndex = face2Cells[start_index]; - // if (cellIndex == 0) - // { - // printf("surface vector = (%.5e, %.5e, %.5e)\n field vector = (%.5e, %.5e, %.5e)\n", bouSfx, bouSfy, bouSfz, - // boussfx, boussfy, boussfz); - // } - double grad_xx = bouSfx * boussfx; double grad_xy = bouSfx * boussfy; double grad_xz = bouSfx * boussfz; @@ -329,11 +316,6 @@ __global__ void divide_cell_volume_scalar(int num_cells, const double* volume, d double vol = volume[index]; - if (index == 473) - { - printf("vol gpu = %.15e\n", vol); - } - output[index] = output[index] / vol; } @@ -451,11 +433,6 @@ __global__ void fvc_div_surface_scalar_internal(int num_surfaces, // neighbor atomicAdd(&(output[neighbor]), -issf); - - if (index == 0) - { - printf("output[3511]before = %.5e\n", output[473]); - } } __global__ void fvc_div_surface_scalar_boundary(int num_boundary_face, const int *face2Cells, @@ -468,17 +445,61 @@ __global__ void fvc_div_surface_scalar_boundary(int num_boundary_face, const int int cellIndex = face2Cells[index]; atomicAdd(&(output[cellIndex]), boundary_ssf[index]); +} - // if (index == 0) - // { - // printf("output[3511] = %.5e\n", output[3511]); - // } +__global__ void fvc_div_cell_vector_internal(int 
num_surfaces, + const int *lower_index, const int *upper_index, + const double *field_vector, const double *weight, const double *face_vector, + double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_surfaces) + return; + + double w = weight[index]; + double Sfx = face_vector[index * 3 + 0]; + double Sfy = face_vector[index * 3 + 1]; + double Sfz = face_vector[index * 3 + 2]; + + int owner = lower_index[index]; + int neighbor = upper_index[index]; + + double ssfx = (w * (field_vector[owner * 3 + 0] - field_vector[neighbor * 3 + 0]) + field_vector[neighbor * 3 + 0]); + double ssfy = (w * (field_vector[owner * 3 + 1] - field_vector[neighbor * 3 + 1]) + field_vector[neighbor * 3 + 1]); + double ssfz = (w * (field_vector[owner * 3 + 2] - field_vector[neighbor * 3 + 2]) + field_vector[neighbor * 3 + 2]); + + double div = Sfx * ssfx + Sfy * ssfy + Sfz * ssfz; + + // owner + atomicAdd(&(output[owner]), div); + + // neighbour + atomicAdd(&(output[neighbor]), -div); +} + +__global__ void fvc_div_cell_vector_boundary(int num, int offset, const int *face2Cells, + const double *boundary_face_vector, const double *boundary_field_vector, double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num) + return; + + int start_index = offset + index; + + double bouSfx = boundary_face_vector[start_index * 3 + 0]; + double bouSfy = boundary_face_vector[start_index * 3 + 1]; + double bouSfz = boundary_face_vector[start_index * 3 + 2]; + + double boussfx = boundary_field_vector[start_index * 3 + 0]; + double boussfy = boundary_field_vector[start_index * 3 + 1]; + double boussfz = boundary_field_vector[start_index * 3 + 2]; + + int cellIndex = face2Cells[start_index]; + + double bouDiv = bouSfx * boussfx + bouSfy * boussfy + bouSfz * boussfz; + + atomicAdd(&(output[cellIndex]), bouDiv); - if (cellIndex == 473) - { - printf("output[473] = %.5e\n", output[473]); - printf("boundary_ssf[473] = %.5e\n", boundary_ssf[index]); - } 
} void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output) @@ -693,3 +714,37 @@ void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces divide_cell_volume_scalar<<>>(num_cells, volume, output); } +void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *Sf, const double *vf, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, + const double *volume) +{ + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + fvc_div_cell_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output); + + int offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 256; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just basic patch type now + if (patch_type[i] == boundaryConditions::zeroGradient + || patch_type[i] == boundaryConditions::fixedValue) { + // TODO: just vector version now + fvc_div_cell_vector_boundary<<>>(patch_size[i], offset, boundary_cell_face, + boundary_Sf, boundary_vf, output); + } else if (0) { + // xxx + fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); + } + offset += patch_size[i]; + } + + // divide cell volume + threads_per_block = 1024; + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + divide_cell_volume_scalar<<>>(num_cells, volume, output); +} + From 7e29a106203cd90cd8192b1fca04c26e14dad7e5 Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Sat, 12 Aug 2023 23:09:50 +0800 Subject: [PATCH 18/25] add the comparison with the original method --- GPUTest/GPUTestBase.H | 40 +- GPUTest/GPUTestRefBase.H | 63 +++ GPUTest/Make/options | 4 +- 
GPUTest/createGPUSolver.H | 45 ++ GPUTest/unittest.C | 19 +- src_gpu/dfMatrixOpBase.H | 6 + src_gpu/dfMatrixOpBase.cu | 196 +++++++- src_gpu_orig/CMakeLists.txt | 15 +- ...atrixDataBase.H => dfMatrixDataBaseOrig.H} | 44 +- ...rixDataBase.cu => dfMatrixDataBaseOrig.cu} | 2 +- src_gpu_orig/dfMatrixOpBaseOrig.H | 9 + src_gpu_orig/dfMatrixOpBaseOrig.cu | 460 ++++++++++++++++++ 12 files changed, 847 insertions(+), 56 deletions(-) create mode 100644 GPUTest/GPUTestRefBase.H rename src_gpu_orig/{dfMatrixDataBase.H => dfMatrixDataBaseOrig.H} (95%) rename src_gpu_orig/{dfMatrixDataBase.cu => dfMatrixDataBaseOrig.cu} (97%) create mode 100644 src_gpu_orig/dfMatrixOpBaseOrig.H create mode 100644 src_gpu_orig/dfMatrixOpBaseOrig.cu diff --git a/GPUTest/GPUTestBase.H b/GPUTest/GPUTestBase.H index b5f5b944b..2520485a7 100644 --- a/GPUTest/GPUTestBase.H +++ b/GPUTest/GPUTestBase.H @@ -491,8 +491,8 @@ void test_fvc_grad_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volV } // run CPU - volTensorField fvc_ouput_tensor = fvc::grad(U); - // volTensorField fvc_ouput_tensor = gaussGradSchemeGrad(U); + // volTensorField fvc_ouput_tensor = fvc::grad(U); + volTensorField fvc_ouput_tensor = gaussGradSchemeGrad(U); // prepare for run GPU // prepare U on GPU @@ -589,6 +589,42 @@ void test_fvc_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volVe checkVectorEqual(dfDataBase.num_cells, &fvc_ouput_scalar[0], h_fvc_ouput_scalar.data(), 1e-14, printFlag); } +// unittest of fvc::grad(p) +void test_fvc_grad_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& p, initType type) { + if (type == initType::randomInit) { + p.oldTime(); + randomInitField(p); + } + + // run CPU + // volVectorField fvc_ouput_vector = fvc::grad(p); + volVectorField fvc_ouput_vector = gaussGradSchemeGrad(p); + + // prepare for run GPU + // prepare p on GPU + uploadRegisteredField(dfDataBase, p, "p"); + + double *d_fvc_ouput_vector = nullptr; + 
checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_vector, dfDataBase.cell_value_vec_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_vector, 0, dfDataBase.cell_value_vec_bytes)); + + // only need patch_type + testGPUDataBase testData; + buildTestGPUDataBase(dfDataBase, testData, p, false, false, false, false, false, false, false, false, false, false); + + fvc_grad_cell_scalar(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.num_surfaces, + dfDataBase.d_owner, dfDataBase.d_neighbor, + dfDataBase.d_weight, dfDataBase.d_sf, dfDataBase.d_p, d_fvc_ouput_vector, + dfDataBase.num_patches, dfDataBase.patch_size.data(), testData.patch_type.data(), + dfDataBase.d_boundary_face_cell, dfDataBase.d_boundary_p, dfDataBase.d_boundary_sf, dfDataBase.d_volume); + + // compare result + bool printFlag = false; + std::vector h_fvc_ouput_vector(dfDataBase.num_cells * 3); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_vector.data(), d_fvc_ouput_vector, dfDataBase.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells * 3, &fvc_ouput_vector[0][0], h_fvc_ouput_vector.data(), 1e-14, printFlag); +} + // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // template <> diff --git a/GPUTest/GPUTestRefBase.H b/GPUTest/GPUTestRefBase.H new file mode 100644 index 000000000..754219e64 --- /dev/null +++ b/GPUTest/GPUTestRefBase.H @@ -0,0 +1,63 @@ + +// unittest of fvc::grad(U) +void test_fvc_grad_vector_orig(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volVectorField& U, initType type, + dfMatrixDataBaseOrig* dfDataBaseOrig) +{ + if (type == initType::randomInit) { + U.oldTime(); + randomInitField(U); + } + + // run CPU + // volTensorField fvc_ouput_tensor = fvc::grad(U); + volTensorField fvc_ouput_tensor = gaussGradSchemeGrad(U); + + // prepare for run GPU + // prepare U on GPU + uploadRegisteredField(dfDataBase, U, "u"); + + double *d_fvc_ouput_tensor = nullptr, *d_fvc_ouput_boundary_tensor = nullptr, *d_fvc_ouput_boundary_tensor_init = 
nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_tensor, dfDataBase.cell_value_tsr_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_boundary_tensor, dfDataBase.boundary_surface_value_tsr_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_boundary_tensor_init, dfDataBase.boundary_surface_value_tsr_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_tensor, 0, dfDataBase.cell_value_tsr_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_boundary_tensor, 0, dfDataBase.boundary_surface_value_tsr_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_boundary_tensor_init, 0, dfDataBase.boundary_surface_value_tsr_bytes)); + + fvc_grad_vector_orig(dfDataBase.stream, dfDataBaseOrig, dfDataBase, d_fvc_ouput_tensor, d_fvc_ouput_boundary_tensor_init, d_fvc_ouput_boundary_tensor); + + // compare result + bool printFlag = false; + std::vector h_fvc_ouput_tensor(dfDataBase.num_cells * 9); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_tensor.data(), d_fvc_ouput_tensor, dfDataBase.cell_value_tsr_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells * 9, &fvc_ouput_tensor[0][0], h_fvc_ouput_tensor.data(), 1e-14, printFlag); +} + +void test_fvc_grad_scalar_orig(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& p, initType type, + dfMatrixDataBaseOrig* dfDataBaseOrig) +{ + if (type == initType::randomInit) { + p.oldTime(); + randomInitField(p); + } + + // run CPU + // volVectorField fvc_ouput_vector = fvc::grad(p); + volVectorField fvc_ouput_vector = gaussGradSchemeGrad(p); + + // prepare for run GPU + // prepare p on GPU + uploadRegisteredField(dfDataBase, p, "p"); + + double *d_fvc_ouput_vector = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_vector, dfDataBase.cell_value_vec_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_vector, 0, dfDataBase.cell_value_vec_bytes)); + + fvc_grad_scalar_orig(dfDataBase.stream, dfDataBaseOrig, dfDataBase, d_fvc_ouput_vector); + + // compare result + bool printFlag = false; + std::vector 
h_fvc_ouput_vector(dfDataBase.num_cells * 3); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_vector.data(), d_fvc_ouput_vector, dfDataBase.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells * 3, &fvc_ouput_vector[0][0], h_fvc_ouput_vector.data(), 1e-14, printFlag); +} \ No newline at end of file diff --git a/GPUTest/Make/options b/GPUTest/Make/options index 197663050..e8e07b6a5 100644 --- a/GPUTest/Make/options +++ b/GPUTest/Make/options @@ -24,6 +24,7 @@ EXE_INC = -std=c++14 \ -I$(DF_SRC)/dfCombustionModels/lnInclude \ -I$(CANTERA_ROOT)/include \ -I$(DF_ROOT)/src_gpu \ + -I$(DF_ROOT)/src_gpu_orig \ -I$(DF_ROOT)/GPUTestRef/lnInclude \ -I/usr/local/cuda-11.6/include \ -I$(AMGX_DIR)/include @@ -44,5 +45,6 @@ EXE_LIBS = \ $(CANTERA_ROOT)/lib/libcantera.so \ /usr/local/cuda-11.6/lib64/libcudart.so \ $(AMGX_DIR)/build/libamgxsh.so \ - $(DF_ROOT)/src_gpu/build/libdfMatrix.so + $(DF_ROOT)/src_gpu/build/libdfMatrix.so \ + $(DF_ROOT)/src_gpu_orig/build/libdfMatrixOrig.so diff --git a/GPUTest/createGPUSolver.H b/GPUTest/createGPUSolver.H index 478b15ed1..516386473 100644 --- a/GPUTest/createGPUSolver.H +++ b/GPUTest/createGPUSolver.H @@ -67,3 +67,48 @@ void createGPUBase(fvMesh& mesh, PtrList& Y) { dfDataBase.initNonConstantFieldsInternal(dfDataBase.h_y); dfDataBase.initNonConstantFieldsBoundary(dfDataBase.h_boundary_y); }; + + +dfMatrixDataBaseOrig* createGPUBaseOrig(fvMesh& mesh, PtrList& Y, volVectorField& U) { + const labelUList& owner = mesh.owner(); + const labelUList& neighbour = mesh.neighbour(); + int num_cells = mesh.nCells(); + int num_surfaces = neighbour.size(); + + std::vector boundaryCellIndex; + std::vector boundary_face_vector_init; + std::vector boundary_face_init; + std::vector boundary_deltaCoeffs_init; + std::vector> patchTypes; + std::vector patchTypeU, patchTypeY; + int num_boundary_faces = 0; + int patchSize; + forAll(mesh.boundary(), patchi) + { + labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); + 
patchSize = sub_boundary.size(); + const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; + const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; + const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; + + boundaryCellIndex.insert(boundaryCellIndex.end(), &sub_boundary[0], &sub_boundary[0]+patchSize); + boundary_face_vector_init.insert(boundary_face_vector_init.end(), &pSf[0][0], &pSf[0][0]+3*patchSize); + boundary_face_init.insert(boundary_face_init.end(), &pMagSf[0], &pMagSf[0]+patchSize); + boundary_deltaCoeffs_init.insert(boundary_deltaCoeffs_init.end(), &pDeltaCoeffs[0], &pDeltaCoeffs[0]+patchSize); + num_boundary_faces += patchSize; + + constructBoundarySelector(patchTypeU, U.boundaryField()[patchi].type(), patchSize); + constructBoundarySelector(patchTypeY, Y[0].boundaryField()[patchi].type(), patchSize); + } + patchTypes.emplace_back(patchTypeU); + patchTypes.emplace_back(patchTypeY); + + int num_boundary_cells; + + dfMatrixDataBaseOrig* dfDataBase = new dfMatrixDataBaseOrig(num_surfaces, num_cells, num_boundary_faces, Y.size(), num_boundary_cells, + &neighbour[0], &owner[0], &mesh.V()[0], &mesh.surfaceInterpolation::weights()[0], + &mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.nonOrthDeltaCoeffs()[0], boundary_face_vector_init, + boundary_face_init, boundary_deltaCoeffs_init, boundaryCellIndex, patchTypes); + + return dfDataBase; +} \ No newline at end of file diff --git a/GPUTest/unittest.C b/GPUTest/unittest.C index 0577b3d2e..80eafef9d 100644 --- a/GPUTest/unittest.C +++ b/GPUTest/unittest.C @@ -55,9 +55,12 @@ Description #include #include "dfMatrixDataBase.H" +#include "dfMatrixDataBaseOrig.H" #include "dfMatrixOpBase.H" +#include "dfMatrixOpBaseOrig.H" #include "createGPUSolver.H" #include "GPUTestBase.H" +#include "GPUTestRefBase.H" int main(int argc, char *argv[]) { @@ -102,6 +105,8 @@ int main(int argc, char *argv[]) createGPUBase(mesh, Y); DEBUG_TRACE; + dfMatrixDataBaseOrig* dfDataBaseOrig = 
createGPUBaseOrig(mesh, Y, U); + DEBUG_TRACE; // unittest of fvm::ddt(rho, U) test_fvm_ddt_vector(dfDataBase, mesh, rho, U, initType::original); @@ -134,7 +139,9 @@ int main(int argc, char *argv[]) // unittest of fvc::grad(U) test_fvc_grad_vector(dfDataBase, mesh, U, initType::original); DEBUG_TRACE; - test_fvc_grad_vector(dfDataBase, mesh, U, initType::randomInit); + // test_fvc_grad_vector(dfDataBase, mesh, U, initType::randomInit); + // DEBUG_TRACE; + test_fvc_grad_vector_orig(dfDataBase, mesh, U, initType::original, dfDataBaseOrig); DEBUG_TRACE; // unittest of fvc::div(phi) @@ -146,8 +153,16 @@ int main(int argc, char *argv[]) // unittest of fvc::div(U) test_fvc_div_vector(dfDataBase, mesh, U, initType::original); DEBUG_TRACE; - test_fvc_div_vector(dfDataBase, mesh, U, initType::randomInit); + // test_fvc_div_vector(dfDataBase, mesh, U, initType::randomInit); + // DEBUG_TRACE; + + // unittest of fvc::grad(p) + test_fvc_grad_scalar(dfDataBase, mesh, p, initType::original); + DEBUG_TRACE; + test_fvc_grad_scalar(dfDataBase, mesh, p, initType::randomInit); DEBUG_TRACE; + test_fvc_grad_scalar_orig(dfDataBase, mesh, p, initType::original, dfDataBaseOrig); + DEBUG_TRACE } return 0; } diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index 35ac78c82..109f20c3f 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -63,3 +63,9 @@ void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume); +void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *Sf, const double *vf, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume); + diff --git 
a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index 39ad1f6a3..7a76db89c 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -290,6 +290,81 @@ __global__ void fvc_grad_vector_boundary(int num, int offset, const int *face2Ce atomicAdd(&(output[cellIndex * 9 + 8]), grad_zz); } +__global__ void fvc_grad_scalar_internal(int num_cells, int num_surfaces, + const int *lower_index, const int *upper_index, const double *face_vector, + const double *weight, const double *vf, double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_surfaces) + return; + + double w = weight[index]; + double Sfx = face_vector[index * 3 + 0]; + double Sfy = face_vector[index * 3 + 1]; + double Sfz = face_vector[index * 3 + 2]; + + int owner = lower_index[index]; + int neighbor = upper_index[index]; + + double ssf = (w * (vf[owner] - vf[neighbor]) + vf[neighbor]); + + double grad_x = Sfx * ssf; + double grad_y = Sfy * ssf; + double grad_z = Sfz * ssf; + + // // owner + // atomicAdd(&(output[num_cells * 0 + owner]), grad_x); + // atomicAdd(&(output[num_cells * 1 + owner]), grad_y); + // atomicAdd(&(output[num_cells * 2 + owner]), grad_z); + + // // neighbour + // atomicAdd(&(output[num_cells * 0 + neighbor]), -grad_x); + // atomicAdd(&(output[num_cells * 1 + neighbor]), -grad_y); + // atomicAdd(&(output[num_cells * 2 + neighbor]), -grad_z); + + // owner + atomicAdd(&(output[owner * 3 + 0]), grad_x); + atomicAdd(&(output[owner * 3 + 1]), grad_y); + atomicAdd(&(output[owner * 3 + 2]), grad_z); + + // neighbour + atomicAdd(&(output[neighbor * 3 + 0]), -grad_x); + atomicAdd(&(output[neighbor * 3 + 1]), -grad_y); + atomicAdd(&(output[neighbor * 3 + 2]), -grad_z); + +} + +__global__ void fvc_grad_scalar_boundary(int num_cells, int num, int offset, const int *face2Cells, + const double *boundary_face_vector, const double *boundary_vf, double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num) + return; + + 
int start_index = offset + index; + + double bouvf = boundary_vf[start_index]; + double bouSfx = boundary_face_vector[start_index * 3 + 0]; + double bouSfy = boundary_face_vector[start_index * 3 + 1]; + double bouSfz = boundary_face_vector[start_index * 3 + 2]; + + int cellIndex = face2Cells[start_index]; + + double grad_x = bouSfx * bouvf; + double grad_y = bouSfy * bouvf; + double grad_z = bouSfz * bouvf; + + atomicAdd(&(output[cellIndex * 3 + 0]), grad_x); + atomicAdd(&(output[cellIndex * 3 + 1]), grad_y); + atomicAdd(&(output[cellIndex * 3 + 2]), grad_z); + + // if (cellIndex == 5) + // { + // printf("Sfx = %.10e, ssf = %.10e\n", bouSfx, bouvf); + // printf("gradx = %.10e, output = %.10e\n\n", grad_x, output[5]); + // } +} + __global__ void divide_cell_volume_tsr(int num_cells, const double* volume, double *output) { int index = blockDim.x * blockIdx.x + threadIdx.x; @@ -308,6 +383,19 @@ __global__ void divide_cell_volume_tsr(int num_cells, const double* volume, doub output[index * 9 + 8] = output[index * 9 + 8] / vol; } +__global__ void divide_cell_volume_vec(int num_cells, const double* volume, double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + double vol = volume[index]; + + output[index * 3 + 0] = output[index * 3 + 0] / vol; + output[index * 3 + 1] = output[index * 3 + 1] / vol; + output[index * 3 + 2] = output[index * 3 + 2] / vol; +} + __global__ void divide_cell_volume_scalar(int num_cells, const double* volume, double *output) { int index = blockDim.x * blockIdx.x + threadIdx.x; @@ -646,11 +734,28 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, const double *volume, const double *boundary_mag_Sf, double *boundary_output, const double *boundary_deltaCoeffs) { + float time_elapsed = 0; + cudaEvent_t start, stop; + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + + // checkCudaErrors(cudaStreamSynchronize(stream)); + 
checkCudaErrors(cudaEventRecord(start, 0)); + size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; fvc_grad_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output); - + + // checkCudaErrors(cudaStreamSynchronize(stream)); + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("\nfvc_grad_vector_new internal 执行时间:%f(ms)\n", time_elapsed); + + // checkCudaErrors(cudaStreamSynchronize(stream)); + checkCudaErrors(cudaEventRecord(start, 0)); int offset = 0; // finish conctruct grad field except dividing cell volume for (int i = 0; i < num_patches; i++) { @@ -668,14 +773,33 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, } offset += patch_size[i]; } + // checkCudaErrors(cudaStreamSynchronize(stream)); + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("fvc_grad_vector_new boundary1 执行时间:%f(ms)\n", time_elapsed); // divide cell volume - threads_per_block = 1024; + // checkCudaErrors(cudaStreamSynchronize(stream)); + checkCudaErrors(cudaEventRecord(start, 0)); + + threads_per_block = 512; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; divide_cell_volume_tsr<<>>(num_cells, volume, output); - offset = 0; + // checkCudaErrors(cudaStreamSynchronize(stream)); + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("fvc_grad_vector_new divide_cell 执行时间:%f(ms)\n", time_elapsed); + // correct boundary conditions + // 
checkCudaErrors(cudaStreamSynchronize(stream)); + checkCudaErrors(cudaEventRecord(start, 0)); + + offset = 0; for (int i = 0; i < num_patches; i++) { threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; @@ -693,6 +817,13 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, } offset += patch_size[i]; } + // checkCudaErrors(cudaStreamSynchronize(stream)); + + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("fvc_grad_vector_new boundary2 执行时间:%f(ms)\n", time_elapsed); } void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, @@ -748,3 +879,62 @@ void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, divide_cell_volume_scalar<<>>(num_cells, volume, output); } +void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *Sf, const double *vf, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume) +{ + float time_elapsed = 0; + cudaEvent_t start, stop; + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + checkCudaErrors(cudaEventRecord(start, 0)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + fvc_grad_scalar_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, + Sf, weight, vf, output); + + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, 
start, stop)); + printf("\nfvc_grad_scalar_new internal 执行时间:%f(ms)\n", time_elapsed); + + checkCudaErrors(cudaEventRecord(start, 0)); + + int offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 256; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just non-coupled patch type now + if (patch_type[i] == boundaryConditions::zeroGradient + || patch_type[i] == boundaryConditions::fixedValue) { + fvc_grad_scalar_boundary<<>>(num_cells, patch_size[i], offset, boundary_cell_face, + boundary_Sf, boundary_vf, output); + } else if (0) { + // xxx + fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); + } + offset += patch_size[i]; + } + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("fvc_grad_scalar_new boundary 执行时间:%f(ms)\n", time_elapsed); + + checkCudaErrors(cudaEventRecord(start, 0)); + + // divide cell volume + threads_per_block = 1024; + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + divide_cell_volume_vec<<>>(num_cells, volume, output); + + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("fvc_grad_scalar_new divide_cell_vector 执行时间:%f(ms)\n", time_elapsed); +} diff --git a/src_gpu_orig/CMakeLists.txt b/src_gpu_orig/CMakeLists.txt index 6e4a7efef..3a6d59825 100644 --- a/src_gpu_orig/CMakeLists.txt +++ b/src_gpu_orig/CMakeLists.txt @@ -3,7 +3,7 @@ # cmake_minimum_required(VERSION 3.5) -project(dfMatrix LANGUAGES CXX CUDA) +project(dfMatrixOrig LANGUAGES CXX CUDA) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -12,27 +12,26 @@ find_package(MPI REQUIRED) find_package(CUDAToolkit REQUIRED) 
find_library(LIBAMGXSH amgxsh PATHS $ENV{AMGX_DIR}/build) +add_compile_options(-arch=sm_70 -fmad=false) + include_directories( ${MPI_INCLUDE_PATH} ${CUDA_INCLUDE_DIRS} $ENV{AMGX_DIR}/include + $ENV{DF_ROOT}/src_gpu ) add_library(${PROJECT_NAME} SHARED - dfUEqn.cu - dfRhoEqn.cu - dfYEqn.cu - dfEEqn.cu - AmgXSolver.cu - dfMatrixDataBase.cu) + dfMatrixDataBaseOrig.cu + dfMatrixOpBaseOrig.cu) target_link_libraries(${PROJECT_NAME} ${MPI_LIBRARIES} ${CUDA_LIBRARIES} ${LIBAMGXSH} ) -target_compile_options(dfMatrix PUBLIC -g) +target_compile_options(dfMatrixOrig PUBLIC -g) option(DFMATRIX_ENABLE_DETAILED_DEBUG "Enable detailed debug build" OFF) if (DFMATRIX_ENABLE_DETAILED_DEBUG) target_compile_definitions(${PROJECT_NAME} PRIVATE DEBUG) diff --git a/src_gpu_orig/dfMatrixDataBase.H b/src_gpu_orig/dfMatrixDataBaseOrig.H similarity index 95% rename from src_gpu_orig/dfMatrixDataBase.H rename to src_gpu_orig/dfMatrixDataBaseOrig.H index 8efb4bf62..e4a06d861 100644 --- a/src_gpu_orig/dfMatrixDataBase.H +++ b/src_gpu_orig/dfMatrixDataBaseOrig.H @@ -12,45 +12,12 @@ #include #include #include +#include "dfMatrixDataBase.H" -static const char *_cudaGetErrorEnum(cudaError_t error) { - return cudaGetErrorName(error); -} - -template -void check(T result, char const *const func, const char *const file, - int const line) { - if (result) { - fprintf(stderr, "cuda error at %s:%d code=%d(%s) \"%s\" \n", file, line, - static_cast(result), _cudaGetErrorEnum(result), func); - exit(EXIT_FAILURE); - } -} - -#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) - -inline void checkVectorEqual(int count, const double* basevec, double* vec, double max_relative_error) { - for (size_t i = 0; i < count; ++i) - { - double abs_diff = fabs(basevec[i] - vec[i]); - double rel_diff = fabs(basevec[i] - vec[i]) / fabs(basevec[i]); - // if (abs_diff > 1e-12 && rel_diff > max_relative_error && !std::isinf(rel_diff)) - if (abs_diff > 1e-15 && rel_diff > max_relative_error) - fprintf(stderr, 
"mismatch index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff); - } -} - -enum boundaryConditions{ - zeroGradient, - fixedValue, - coupled, - empty -}; - void constructBoundarySelector(std::vector& patchTypeSelector, const std::string& patchTypeStr, const int patchSize); -struct dfMatrixDataBase +struct dfMatrixDataBaseOrig { // - cuda resource cudaStream_t stream; @@ -219,8 +186,8 @@ struct dfMatrixDataBase double* d_nuEff = nullptr; // constructor - dfMatrixDataBase(); - dfMatrixDataBase(int num_surfaces, int num_cells, int num_boundary_faces, int num_species, int & num_boundary_cells_output, + dfMatrixDataBaseOrig(); + dfMatrixDataBaseOrig(int num_surfaces, int num_cells, int num_boundary_faces, int num_species, int & num_boundary_cells_output, const int *neighbour, const int *owner, const double* volume, const double* weight, const double* face_vector, const double* face, const double* deltaCoeffs, std::vector boundary_face_vector_init, std::vector boundary_face_init, std::vector boundary_deltaCoeffs_init, std::vector boundary_cell_id_init, std::vector> patch_type_init) @@ -632,10 +599,9 @@ struct dfMatrixDataBase checkCudaErrors(cudaMemcpyAsync(d_tmpPermutatedList, tmpPermutatedList.data(), csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); }; - ~dfMatrixDataBase(){ + ~dfMatrixDataBaseOrig(){ std::cout << "Destructor called." 
<< std::endl; // TODO: free pointers - }; }; diff --git a/src_gpu_orig/dfMatrixDataBase.cu b/src_gpu_orig/dfMatrixDataBaseOrig.cu similarity index 97% rename from src_gpu_orig/dfMatrixDataBase.cu rename to src_gpu_orig/dfMatrixDataBaseOrig.cu index d4f5a7ab0..7eb0ba593 100644 --- a/src_gpu_orig/dfMatrixDataBase.cu +++ b/src_gpu_orig/dfMatrixDataBaseOrig.cu @@ -1,4 +1,4 @@ -#include "dfMatrixDataBase.H" +#include "dfMatrixDataBaseOrig.H" void constructBoundarySelector(std::vector& patchTypeSelector, const std::string& patchTypeStr, diff --git a/src_gpu_orig/dfMatrixOpBaseOrig.H b/src_gpu_orig/dfMatrixOpBaseOrig.H new file mode 100644 index 000000000..0f61b558b --- /dev/null +++ b/src_gpu_orig/dfMatrixOpBaseOrig.H @@ -0,0 +1,9 @@ +#pragma once + +#include "dfMatrixDataBaseOrig.H" +#include "dfMatrixDataBase.H" + +void fvc_grad_vector_orig(cudaStream_t stream, dfMatrixDataBaseOrig* dataBaseOrig, dfMatrixDataBase& dataBase, double *d_grad, + double *d_grad_boundary_init, double *d_grad_boundary); + +void fvc_grad_scalar_orig(cudaStream_t stream, dfMatrixDataBaseOrig* dataBaseOrig, dfMatrixDataBase& dataBase, double *d_grad); \ No newline at end of file diff --git a/src_gpu_orig/dfMatrixOpBaseOrig.cu b/src_gpu_orig/dfMatrixOpBaseOrig.cu new file mode 100644 index 000000000..95737ab12 --- /dev/null +++ b/src_gpu_orig/dfMatrixOpBaseOrig.cu @@ -0,0 +1,460 @@ +#include "dfMatrixOpBaseOrig.H" + + +__global__ void fvc_grad_vector_internal(int num_cells, + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *sf, const double *vf, const double *tlambdas, const double *volume, + double *grad) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int row_elements = csr_row_index[index + 1] - row_index; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double 
own_vf_x = vf[index * 3 + 0]; + double own_vf_y = vf[index * 3 + 1]; + double own_vf_z = vf[index * 3 + 2]; + double grad_xx = 0; + double grad_xy = 0; + double grad_xz = 0; + double grad_yx = 0; + double grad_yy = 0; + double grad_yz = 0; + double grad_zx = 0; + double grad_zy = 0; + double grad_zz = 0; + // lower + for (int i = 0; i < diag_index; i++) + { + int neighbor_index = neighbor_offset + i; + int neighbor_cell_id = csr_col_index[row_index + i]; + double w = tlambdas[neighbor_index]; + double sf_x = sf[neighbor_index * 3 + 0]; + double sf_y = sf[neighbor_index * 3 + 1]; + double sf_z = sf[neighbor_index * 3 + 2]; + double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0]; + double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1]; + double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2]; + double face_x = (1 - w) * own_vf_x + w * neighbor_vf_x; + double face_y = (1 - w) * own_vf_y + w * neighbor_vf_y; + double face_z = (1 - w) * own_vf_z + w * neighbor_vf_z; + grad_xx -= sf_x * face_x; + grad_xy -= sf_x * face_y; + grad_xz -= sf_x * face_z; + grad_yx -= sf_y * face_x; + grad_yy -= sf_y * face_y; + grad_yz -= sf_y * face_z; + grad_zx -= sf_z * face_x; + grad_zy -= sf_z * face_y; + grad_zz -= sf_z * face_z; + } + // upper + for (int i = diag_index + 1; i < row_elements; i++) + { + int neighbor_index = neighbor_offset + i - 1; + int neighbor_cell_id = csr_col_index[row_index + i]; + double w = tlambdas[neighbor_index]; + double sf_x = sf[neighbor_index * 3 + 0]; + double sf_y = sf[neighbor_index * 3 + 1]; + double sf_z = sf[neighbor_index * 3 + 2]; + double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0]; + double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1]; + double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2]; + double face_x = w * own_vf_x + (1 - w) * neighbor_vf_x; + double face_y = w * own_vf_y + (1 - w) * neighbor_vf_y; + double face_z = w * own_vf_z + (1 - w) * neighbor_vf_z; + grad_xx += sf_x * face_x; + grad_xy += sf_x * face_y; + grad_xz += sf_x * face_z; + 
grad_yx += sf_y * face_x; + grad_yy += sf_y * face_y; + grad_yz += sf_y * face_z; + grad_zx += sf_z * face_x; + grad_zy += sf_z * face_y; + grad_zz += sf_z * face_z; + // if (index == 0) + // { + // printf("grad_xx = %.20lf\n", grad_xx); + // // printf("sf_x = %.20lf\n", sf_x); + // // printf("face_x = %.20lf\n", face_x); + // } + } + double vol = volume[index]; + grad[index * 9 + 0] = grad_xx / vol; + grad[index * 9 + 1] = grad_xy / vol; + grad[index * 9 + 2] = grad_xz / vol; + grad[index * 9 + 3] = grad_yx / vol; + grad[index * 9 + 4] = grad_yy / vol; + grad[index * 9 + 5] = grad_yz / vol; + grad[index * 9 + 6] = grad_zx / vol; + grad[index * 9 + 7] = grad_zy / vol; + grad[index * 9 + 8] = grad_zz / vol; + + + // if (index == 2257) + // { + // printf("grad[2257] = (%.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e)\n", grad[index * 9 + 0], grad[index * 9 + 1], grad[index * 9 + 2], + // grad[index * 9 + 3], grad[index * 9 + 4], grad[index * 9 + 5], grad[index * 9 + 6], grad[index * 9 + 7], grad[index * 9 + 8]); + // } +} + +__global__ void fvc_grad_vector_boundary(int num_cells, int num_boundary_cells, const int *bouPermedIndex, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_sf, const double *boundary_vf, const double *volume, + double *grad, double *grad_boundary_init) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + double grad_xx = 0; + double grad_xy = 0; + double grad_xz = 0; + double grad_yx = 0; + double grad_yy = 0; + double grad_yz = 0; + double grad_zx = 0; + double grad_zy = 0; + double grad_zz = 0; + for (int i = cell_offset; i < next_cell_offset; i++) + { + int p = bouPermedIndex[i]; + double sf_x = boundary_sf[i * 3 + 0]; + double sf_y = boundary_sf[i * 3 + 1]; + double sf_z = boundary_sf[i * 3 
+ 2]; + double vf_x = boundary_vf[p * 3 + 0]; + double vf_y = boundary_vf[p * 3 + 1]; + double vf_z = boundary_vf[p * 3 + 2]; + grad_xx += sf_x * vf_x; + grad_xy += sf_x * vf_y; + grad_xz += sf_x * vf_z; + grad_yx += sf_y * vf_x; + grad_yy += sf_y * vf_y; + grad_yz += sf_y * vf_z; + grad_zx += sf_z * vf_x; + grad_zy += sf_z * vf_y; + grad_zz += sf_z * vf_z; + } + + double vol = volume[cell_index]; + + grad[cell_index * 9 + 0] += grad_xx / vol; + grad[cell_index * 9 + 1] += grad_xy / vol; + grad[cell_index * 9 + 2] += grad_xz / vol; + grad[cell_index * 9 + 3] += grad_yx / vol; + grad[cell_index * 9 + 4] += grad_yy / vol; + grad[cell_index * 9 + 5] += grad_yz / vol; + grad[cell_index * 9 + 6] += grad_zx / vol; + grad[cell_index * 9 + 7] += grad_zy / vol; + grad[cell_index * 9 + 8] += grad_zz / vol; + + grad_boundary_init[index * 9 + 0] = grad[cell_index * 9 + 0]; + grad_boundary_init[index * 9 + 1] = grad[cell_index * 9 + 1]; + grad_boundary_init[index * 9 + 2] = grad[cell_index * 9 + 2]; + grad_boundary_init[index * 9 + 3] = grad[cell_index * 9 + 3]; + grad_boundary_init[index * 9 + 4] = grad[cell_index * 9 + 4]; + grad_boundary_init[index * 9 + 5] = grad[cell_index * 9 + 5]; + grad_boundary_init[index * 9 + 6] = grad[cell_index * 9 + 6]; + grad_boundary_init[index * 9 + 7] = grad[cell_index * 9 + 7]; + grad_boundary_init[index * 9 + 8] = grad[cell_index * 9 + 8]; + + // if (index == 0) + // { + // printf("grad[0] = (%.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e)\n", grad[index * 9 + 0], grad[index * 9 + 1], grad[index * 9 + 2], + // grad[index * 9 + 3], grad[index * 9 + 4], grad[index * 9 + 5], grad[index * 9 + 6], grad[index * 9 + 7], grad[index * 9 + 8]); + // } +} + +__global__ void correct_boundary_conditions(int num_boundary_cells, const int *bouPermedIndex, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_sf, const double *mag_sf, + double *boundary_grad_init, double *boundary_grad, const double 
*boundary_deltaCoeffs, + const double *internal_velocity, const double *boundary_velocity, const int *U_patch_type) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + // initialize boundary_grad + double grad_xx = boundary_grad_init[index * 9 + 0]; + double grad_xy = boundary_grad_init[index * 9 + 1]; + double grad_xz = boundary_grad_init[index * 9 + 2]; + double grad_yx = boundary_grad_init[index * 9 + 3]; + double grad_yy = boundary_grad_init[index * 9 + 4]; + double grad_yz = boundary_grad_init[index * 9 + 5]; + double grad_zx = boundary_grad_init[index * 9 + 6]; + double grad_zy = boundary_grad_init[index * 9 + 7]; + double grad_zz = boundary_grad_init[index * 9 + 8]; + + double internal_U_x = internal_velocity[cell_index * 3 + 0]; + double internal_U_y = internal_velocity[cell_index * 3 + 1]; + double internal_U_z = internal_velocity[cell_index * 3 + 2]; + + for (int i = cell_offset; i < next_cell_offset; i++) + { + // OpenFoam code + // const vectorField n + // ( + // vsf.mesh().Sf().boundaryField()[patchi] + // / vsf.mesh().magSf().boundaryField()[patchi] + // ); + // gGradbf[patchi] += n * + // ( + // vsf.boundaryField()[patchi].snGrad() + // - (n & gGradbf[patchi]) + // ); + // template // fixedValue + // Foam::tmp> Foam::fvPatchField::snGrad() const + // { + // return patch_.deltaCoeffs()*(*this - patchInternalField()); + // } + + double n_x = boundary_sf[i * 3 + 0] / mag_sf[i]; + double n_y = boundary_sf[i * 3 + 1] / mag_sf[i]; + double n_z = boundary_sf[i * 3 + 2] / mag_sf[i]; + + int p = bouPermedIndex[i]; + + double sn_grad_x, sn_grad_y, sn_grad_z; + int patchIndex = U_patch_type[i]; + if (patchIndex == 0) { // zeroGradient + sn_grad_x = 0; + sn_grad_y = 0; + sn_grad_z = 0; + } else if (patchIndex == 1) { // fixedValue + sn_grad_x = 
boundary_deltaCoeffs[i] * (boundary_velocity[p * 3 + 0] - internal_velocity[cell_index * 3 + 0]); + sn_grad_y = boundary_deltaCoeffs[i] * (boundary_velocity[p * 3 + 1] - internal_velocity[cell_index * 3 + 1]); + sn_grad_z = boundary_deltaCoeffs[i] * (boundary_velocity[p * 3 + 2] - internal_velocity[cell_index * 3 + 2]); + // if (index == 1) + // { + // printf("cell_index = %d\n", cell_index); + // printf("boundary_velocity = %e\n", boundary_velocity[i * 3 + 1]); + // printf("internal_velocity = %e\n", internal_velocity[cell_index * 3 + 0]); + // } + + } + // TODO: implement other BCs + double grad_correction_x = sn_grad_x - (n_x * grad_xx + n_y * grad_yx + n_z * grad_zx); + double grad_correction_y = sn_grad_y - (n_x * grad_xy + n_y * grad_yy + n_z * grad_zy); + double grad_correction_z = sn_grad_z - (n_x * grad_xz + n_y * grad_yz + n_z * grad_zz); + boundary_grad[i * 9 + 0] = grad_xx + n_x * grad_correction_x; + boundary_grad[i * 9 + 1] = grad_xy + n_x * grad_correction_y; + boundary_grad[i * 9 + 2] = grad_xz + n_x * grad_correction_z; + boundary_grad[i * 9 + 3] = grad_yx + n_y * grad_correction_x; + boundary_grad[i * 9 + 4] = grad_yy + n_y * grad_correction_y; + boundary_grad[i * 9 + 5] = grad_yz + n_y * grad_correction_z; + boundary_grad[i * 9 + 6] = grad_zx + n_z * grad_correction_x; + boundary_grad[i * 9 + 7] = grad_zy + n_z * grad_correction_y; + boundary_grad[i * 9 + 8] = grad_zz + n_z * grad_correction_z; + + } +} + +__global__ void fvc_grad_scalar_internal(int num_cells, + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *face_vector, const double *weight, const double *pressure, const double *volume, + const double *b_input, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int next_row_index = csr_row_index[index + 1]; + int diag_index = 
csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double own_cell_p = pressure[index]; + double grad_bx = 0; + double grad_by = 0; + double grad_bz = 0; + double grad_bx_low = 0; + double grad_bx_upp = 0; + double grad_by_low = 0; + double grad_by_upp = 0; + double grad_bz_low = 0; + double grad_bz_upp = 0; + for (int i = row_index; i < next_row_index; i++) + { + int inner_index = i - row_index; + // lower + if (inner_index < diag_index) + { + int neighbor_index = neighbor_offset + inner_index; + double w = weight[neighbor_index]; + double sfx = face_vector[neighbor_index * 3 + 0]; + double sfy = face_vector[neighbor_index * 3 + 1]; + double sfz = face_vector[neighbor_index * 3 + 2]; + int neighbor_cell_id = csr_col_index[row_index + inner_index]; + double neighbor_cell_p = pressure[neighbor_cell_id]; + double face_p = (1 - w) * own_cell_p + w * neighbor_cell_p; + grad_bx_low -= face_p * sfx; + grad_by_low -= face_p * sfy; + grad_bz_low -= face_p * sfz; + } + // upper + if (inner_index > diag_index) + { + int neighbor_index = neighbor_offset + inner_index - 1; + double w = weight[neighbor_index]; + double sfx = face_vector[neighbor_index * 3 + 0]; + double sfy = face_vector[neighbor_index * 3 + 1]; + double sfz = face_vector[neighbor_index * 3 + 2]; + int neighbor_cell_id = csr_col_index[row_index + inner_index]; + double neighbor_cell_p = pressure[neighbor_cell_id]; + double face_p = w * own_cell_p + (1 - w) * neighbor_cell_p; + grad_bx_upp += face_p * sfx; + grad_by_upp += face_p * sfy; + grad_bz_upp += face_p * sfz; + } + } + double vol = volume[index]; + b_output[index * 3 + 0] = b_input[index * 3 + 0] + (grad_bx_low + grad_bx_upp) / vol; + b_output[index * 3 + 1] = b_input[index * 3 + 1] + (grad_by_low + grad_by_upp) / vol; + b_output[index * 3 + 2] = b_input[index * 3 + 2] + (grad_bz_low + grad_bz_upp) / vol; + // b_output[index * 3 + 0] = b_input[index * 3 + 0] + grad_bx_low + grad_bx_upp; + // b_output[index * 3 + 1] = 
b_input[index * 3 + 1] + grad_by_low + grad_by_upp; + // b_output[index * 3 + 2] = b_input[index * 3 + 2] + grad_bz_low + grad_bz_upp; + +} + +__global__ void fvc_grad_scalar_boundary(int num_cells, int num_boundary_cells, const int *bouPermedIndex, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_face_vector, const double *boundary_pressure, const double *volume, + const double *b_input, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + // compute boundary gradient + double grad_bx = 0; + double grad_by = 0; + double grad_bz = 0; + for (int i = cell_offset; i < next_cell_offset; i++) + { + int p = bouPermedIndex[i]; + double sfx = boundary_face_vector[i * 3 + 0]; + double sfy = boundary_face_vector[i * 3 + 1]; + double sfz = boundary_face_vector[i * 3 + 2]; + double face_p = boundary_pressure[p]; + grad_bx += face_p * sfx; + grad_by += face_p * sfy; + grad_bz += face_p * sfz; + } + + //// correct the boundary gradient + // double nx = boundary_face_vector[face_index * 3 + 0] / magSf[face_index]; + // double ny = boundary_face_vector[face_index * 3 + 1] / magSf[face_index]; + // double nz = boundary_face_vector[face_index * 3 + 2] / magSf[face_index]; + // double sn_grad = 0; + // double grad_correction = sn_grad * volume[cell_index] - (nx * grad_bx + ny * grad_by + nz * grad_bz); + // grad_bx += nx * grad_correction; + // grad_by += ny * grad_correction; + // grad_bz += nz * grad_correction; + + double vol = volume[cell_index]; + b_output[cell_index * 3 + 0] = b_input[cell_index * 3 + 0] + grad_bx / vol; + b_output[cell_index * 3 + 1] = b_input[cell_index * 3 + 1] + grad_by / vol; + b_output[cell_index * 3 + 2] = b_input[cell_index * 3 + 2] + grad_bz / vol; +} + + +void 
fvc_grad_vector_orig(cudaStream_t stream, dfMatrixDataBaseOrig* dataBaseOrig, dfMatrixDataBase& dataBase, double *d_grad, + double *d_grad_boundary_init, double *d_grad_boundary) +{ + float time_elapsed = 0; + cudaEvent_t start, stop; + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + checkCudaErrors(cudaEventRecord(start, 0)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (dataBase.num_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_vector_internal<<>>(dataBase.num_cells, + dataBaseOrig->d_A_csr_row_index, dataBaseOrig->d_A_csr_col_index, dataBaseOrig->d_A_csr_diag_index, + dataBaseOrig->d_face_vector, dataBase.d_u, dataBaseOrig->d_weight, dataBaseOrig->d_volume, d_grad); + + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("\nfvc_grad_vector_orig internal 执行时间:%f(ms)\n", time_elapsed); + + + checkCudaErrors(cudaEventRecord(start, 0)); + blocks_per_grid = (dataBaseOrig->num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_vector_boundary<<>>(dataBase.num_cells, dataBaseOrig->num_boundary_cells, dataBaseOrig->d_bouPermedIndex, + dataBaseOrig->d_boundary_cell_offset, dataBaseOrig->d_boundary_cell_id, dataBaseOrig->d_boundary_face_vector, + dataBase.d_boundary_u, dataBase.d_volume, d_grad, d_grad_boundary_init); + + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("fvc_grad_vector_orig boundary1 执行时间:%f(ms)\n", time_elapsed); + + + checkCudaErrors(cudaEventRecord(start, 0)); + correct_boundary_conditions<<>>(dataBaseOrig->num_boundary_cells, dataBaseOrig->d_bouPermedIndex, + dataBaseOrig->d_boundary_cell_offset, dataBaseOrig->d_boundary_cell_id, 
dataBaseOrig->d_boundary_face_vector, + dataBaseOrig->d_boundary_face, d_grad_boundary_init, d_grad_boundary, dataBaseOrig->d_boundary_deltaCoeffs, + dataBase.d_u, dataBase.d_boundary_u, dataBaseOrig->d_boundary_UpatchType); + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("fvc_grad_vector_orig boundary2 执行时间:%f(ms)\n", time_elapsed); +} + +void fvc_grad_scalar_orig(cudaStream_t stream, dfMatrixDataBaseOrig* dataBaseOrig, dfMatrixDataBase& dataBase, double *d_grad) +{ + float time_elapsed = 0; + cudaEvent_t start, stop; + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + checkCudaErrors(cudaEventRecord(start, 0)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (dataBase.num_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_scalar_internal<<>>(dataBase.num_cells, + dataBaseOrig->d_A_csr_row_index, dataBaseOrig->d_A_csr_col_index, dataBaseOrig->d_A_csr_diag_index, + dataBaseOrig->d_face_vector, dataBaseOrig->d_weight, dataBase.d_p, dataBaseOrig->d_volume, d_grad, d_grad); + + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("\nfvc_grad_scalar_orig internal 执行时间:%f(ms)\n", time_elapsed); + + checkCudaErrors(cudaEventRecord(start, 0)); + + blocks_per_grid = (dataBaseOrig->num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_scalar_boundary<<>>(dataBase.num_cells, dataBaseOrig->num_boundary_cells, dataBaseOrig->d_bouPermedIndex, + dataBaseOrig->d_boundary_cell_offset, dataBaseOrig->d_boundary_cell_id, + dataBaseOrig->d_boundary_face_vector, dataBase.d_boundary_p, dataBaseOrig->d_volume, d_grad, d_grad); + + checkCudaErrors(cudaEventRecord(stop, 0)); + 
checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("fvc_grad_scalar_orig boundary 执行时间:%f(ms)\n", time_elapsed); +} \ No newline at end of file From db5a689cce42bbc436817f0f64cad671385c5f10 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Tue, 15 Aug 2023 17:59:50 +0800 Subject: [PATCH 19/25] run pass basic ueqn_gpu --- .../solvers/dfLowMachFoam/createGPUSolver.H | 5 +- applications/solvers/dfLowMachFoam/new_UEqn.H | 54 +++- .../solvers/dfLowMachFoam/new_dfLowMachFoam.C | 2 + src_gpu/dfMatrixDataBase.H | 6 +- src_gpu/dfMatrixDataBase.cu | 6 + src_gpu/dfMatrixOpBase.H | 16 ++ src_gpu/dfMatrixOpBase.cu | 266 +++++++++++++----- src_gpu/dfUEqn.H | 5 +- src_gpu/dfUEqn.cu | 98 ++++--- 9 files changed, 344 insertions(+), 114 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/createGPUSolver.H b/applications/solvers/dfLowMachFoam/createGPUSolver.H index d9ce745d7..94fff1125 100644 --- a/applications/solvers/dfLowMachFoam/createGPUSolver.H +++ b/applications/solvers/dfLowMachFoam/createGPUSolver.H @@ -31,24 +31,27 @@ void createGPUBase(fvMesh& mesh, PtrList& Y) { double *boundary_sf = new double[3 * num_boundary_surfaces]; double *boundary_mag_sf = new double[num_boundary_surfaces]; double *boundary_delta_coeffs = new double[num_boundary_surfaces]; + int *boundary_face_cell = new int[num_boundary_surfaces]; int offset = 0; forAll(mesh.boundary(), patchi) { const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; + const labelUList& pFaceCells = mesh.boundary()[patchi].faceCells(); int patchsize = pMagSf.size(); memcpy(boundary_sf + 3*offset, &pSf[0][0], 3*patchsize*sizeof(double)); memcpy(boundary_mag_sf + offset, &pMagSf[0], patchsize*sizeof(double)); memcpy(boundary_delta_coeffs + offset, 
&pDeltaCoeffs[0], patchsize*sizeof(double)); + memcpy(boundary_face_cell + offset, &pFaceCells[0], patchsize * sizeof(int)); offset += patchsize; } dfDataBase.createConstantFieldsInternal(); dfDataBase.createConstantFieldsBoundary(); dfDataBase.initConstantFieldsInternal(&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.surfaceInterpolation::weights()[0], &mesh.nonOrthDeltaCoeffs()[0], &mesh.V()[0]); - dfDataBase.initConstantFieldsBoundary(boundary_sf, boundary_mag_sf, boundary_delta_coeffs); + dfDataBase.initConstantFieldsBoundary(boundary_sf, boundary_mag_sf, boundary_delta_coeffs, boundary_face_cell); // prepare internal and boundary of Y dfDataBase.createNonConstantFieldsInternal(); diff --git a/applications/solvers/dfLowMachFoam/new_UEqn.H b/applications/solvers/dfLowMachFoam/new_UEqn.H index c38735375..3d84f3631 100644 --- a/applications/solvers/dfLowMachFoam/new_UEqn.H +++ b/applications/solvers/dfLowMachFoam/new_UEqn.H @@ -1,18 +1,22 @@ #ifdef GPUSolver_ -// run CPU +const tmp nuEff_tmp(turbulence->nuEff()); +const volScalarField& nuEff = nuEff_tmp(); + +// run CPU, for temp tmp tUEqn ( - fvm::div(phi, U) + fvm::ddt(rho, U) + fvm::div(phi, U) == -fvc::grad(p) + //turbulence->divDevRhoReff(U) ); fvVectorMatrix& UEqn = tUEqn.ref(); // run GPU // preProcess -// skip preparing u, p, nu_eff, and rho.boundary used by UEqn_GPU.preProcess() -// TODO: temp, now we only transfer phi(instead of rhoEqn) used by fvm::div(phi, U) -double *h_u = dfDataBase.getFieldPointer("u", location::cpu, position::internal); +// TODO: preProcessForRhoEqn for temp, now we only transfer phi(instead of rhoEqn) used by fvm::div(phi, U) +double *h_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::internal); double *h_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::internal); double *h_boundary_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::boundary); +memcpy(h_rho, &rho[0], dfDataBase.cell_value_bytes); memcpy(h_phi, &phi[0], 
dfDataBase.surface_value_bytes); int offset = 0; forAll(phi.boundaryField(), patchi) @@ -22,13 +26,46 @@ forAll(phi.boundaryField(), patchi) memcpy(h_boundary_phi + offset, &patchPhi[0], patchsize * sizeof(double)); offset += patchsize; } -UEqn_GPU.preProcessForRhoEqn(h_phi, h_boundary_phi); +UEqn_GPU.preProcessForRhoEqn(h_rho, h_phi, h_boundary_phi); +DEBUG_TRACE; +// preparing u, p, nu_eff, and rho.boundary used by UEqn_GPU.preProcess() +double *h_u = dfDataBase.getFieldPointer("u", location::cpu, position::internal); +double *h_boundary_u = dfDataBase.getFieldPointer("u", location::cpu, position::boundary); +double *h_p = dfDataBase.getFieldPointer("p", location::cpu, position::internal); +double *h_boundary_p = dfDataBase.getFieldPointer("p", location::cpu, position::boundary); +double *h_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::internal); +double *h_boundary_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::boundary); +double *h_boundary_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::boundary); +memcpy(h_u, &U.oldTime()[0][0], dfDataBase.cell_value_vec_bytes); +memcpy(h_p, &p[0], dfDataBase.cell_value_bytes); +memcpy(h_nu_eff, &nuEff[0], dfDataBase.cell_value_bytes); +offset = 0; +forAll(U.boundaryField(), patchi) +{ + const fvPatchVectorField& patchU = U.boundaryField()[patchi]; + const fvPatchScalarField& patchP = p.boundaryField()[patchi]; + const fvPatchScalarField& patchNuEff = nuEff.boundaryField()[patchi]; + const fvPatchScalarField& patchRho = rho.boundaryField()[patchi]; + int patchsize = patchU.size(); + memcpy(h_boundary_u + offset * 3, &patchU[0][0], patchsize * 3 * sizeof(double)); + memcpy(h_boundary_p + offset, &patchP[0], patchsize * sizeof(double)); + memcpy(h_boundary_nu_eff + offset, &patchNuEff[0], patchsize * sizeof(double)); + memcpy(h_boundary_rho + offset, &patchRho[0], patchsize * sizeof(double)); + offset += patchsize; +} +UEqn_GPU.preProcess(h_u, h_boundary_u, h_p, 
h_boundary_p, h_nu_eff, h_boundary_nu_eff, h_boundary_rho); +DEBUG_TRACE; + // process UEqn_GPU.process(); +DEBUG_TRACE; + // postProcess UEqn_GPU.postProcess(h_u); +DEBUG_TRACE; + // checkResult -// TODO: temp, now we compare ldu, finally we compare csr +// TODO: for temp, now we compare ldu, finally we compare csr std::vector h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); std::vector h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); offset = 0; @@ -42,6 +79,7 @@ for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) offset += patchsize; } bool printFlag = false; -UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], +UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0], h_internal_coeffs.data(), h_boundary_coeffs.data(), printFlag); +DEBUG_TRACE; #endif diff --git a/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C index 530a9f7ec..0deffb40f 100644 --- a/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C +++ b/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C @@ -99,7 +99,9 @@ int main(int argc, char *argv[]) createGPUUEqn(CanteraTorchProperties, U); // foreach(timestep) { + dfDataBase.preTimeStep(&rho.oldTime()[0]); #include "new_UEqn.H" + dfDataBase.postTimeStep(); // } } return 0; diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 7eb8b9ec2..69d20d7af 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -41,7 +41,7 @@ inline void checkVectorEqual(int count, const double* basevec, double* vec, doub fprintf(stderr, "index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff); // if (abs_diff > 1e-12 && rel_diff > max_relative_error && !std::isinf(rel_diff)) if (abs_diff > 1e-15 && rel_diff > max_relative_error) - fprintf(stderr, "mismatch index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", 
i, basevec[i], vec[i], rel_diff); + fprintf(stderr, "mismatch index %d, cpu data: %.30lf, gpu data: %.30lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff); } } @@ -63,7 +63,6 @@ enum boundaryConditions{ }; void constructBoundarySelectorPerPatch(int *patchTypeSelector, const std::string& patchTypeStr); -void constructBoundarySelector(std::vector& patchTypeSelector, const std::string& patchTypeStr, const int patchSize); struct dfMatrixDataBase { @@ -205,6 +204,9 @@ struct dfMatrixDataBase void initNonConstantFieldsInternal(const double *y); void initNonConstantFieldsBoundary(const double *boundary_y); + void preTimeStep(const double *rho_old); + void postTimeStep(); + // getter double* getFieldPointer(const char* fieldAlias, location loc, position pos); }; diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index cb6a44d5f..64b35f956 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -311,6 +311,12 @@ void dfMatrixDataBase::initNonConstantFieldsBoundary(const double *boundary_y) { checkCudaErrors(cudaMemcpyAsync(d_boundary_y, boundary_y, boundary_surface_value_bytes* num_species, cudaMemcpyHostToDevice, stream)); } +void dfMatrixDataBase::preTimeStep(const double *rho_old) { + checkCudaErrors(cudaMemcpyAsync(d_rho_old, rho_old, cell_value_bytes, cudaMemcpyHostToDevice, stream)); +} + +void dfMatrixDataBase::postTimeStep() {} + double* dfMatrixDataBase::getFieldPointer(const char* fieldAlias, location loc, position pos) { char mergedName[256]; if (pos == position::internal) { diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index 109f20c3f..a415a8a1b 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -4,6 +4,12 @@ void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output); void permute_vector_h2d(cudaStream_t stream, int num_cells, const double *input, double *output); +void field_multiply_scalar(cudaStream_t stream, + int num_cells, 
const double *input1, const double *input2, double *output, + int num_boundary_surfaces, const double *boundary_input1, const double *boundary_input2, double *boundary_output); + +void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volume, const double *fvc_output, double *source); + void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, const double *lower, const double *upper, const double *diag, const double *source, @@ -57,6 +63,13 @@ void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces const double *boundary_ssf, const double *volume, double *output); void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *Sf, const double *vf, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, + const double *volume); + +void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *Sf, const double *vf, double *output, // end for internal int num_patches, const int *patch_size, const int *patch_type, @@ -69,3 +82,6 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_patches, const int *patch_size, const int *patch_type, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume); +// others +void scale_dev2T_tensor(cudaStream_t stream, int num_cells, const double *vf1, double *vf2, + int num_boundary_surfaces, const double *boundary_vf1, double *boundary_vf2); diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index 7a76db89c..e4b2a25e9 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ 
b/src_gpu/dfMatrixOpBase.cu @@ -26,6 +26,30 @@ __global__ void permute_vector_h2d_kernel(int num_cells, const double *input, do output[num_cells * 2 + index] = input[index * 3 + 2]; } +__global__ void field_multiply_scalar_kernel(int num_cells, int num_boundary_surfaces, + const double *input1, const double *input2, double *output, + const double *boundary_input1, const double *boundary_input2, double *boundary_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index < num_cells) { + output[index] = input1[index] * input2[index]; + } + if (index < num_boundary_surfaces) { + boundary_output[index] = boundary_input1[index] * boundary_input2[index]; + } +} + +__global__ void fvc_to_source_vector_kernel(int num_cells, const double *volume, const double *fvc_output, double *source) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + source[index * 3 + 0] += fvc_output[index * 3 + 0] * volume[index]; + source[index * 3 + 1] += fvc_output[index * 3 + 1] * volume[index]; + source[index * 3 + 2] += fvc_output[index * 3 + 2] * volume[index]; +} + __global__ void update_boundary_coeffs_zeroGradient_vector(int num, int offset, double *value_internal_coeffs, double *value_boundary_coeffs, double *gradient_internal_coeffs, double *gradient_boundary_coeffs) @@ -53,6 +77,34 @@ __global__ void update_boundary_coeffs_zeroGradient_vector(int num, int offset, gradient_boundary_coeffs[start_index * 3 + 2] = 0; } +__global__ void scale_dev2t_tensor_kernel(int num, const double *vf1, double *vf2) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num) + return; + + double scale = vf1[index]; + double val_xx = vf2[index * 9 + 0]; + double val_xy = vf2[index * 9 + 1]; + double val_xz = vf2[index * 9 + 2]; + double val_yx = vf2[index * 9 + 3]; + double val_yy = vf2[index * 9 + 4]; + double val_yz = vf2[index * 9 + 5]; + double val_zx = vf2[index * 9 + 6]; + double val_zy = vf2[index * 9 + 7]; + double val_zz 
= vf2[index * 9 + 8]; + double trace_coeff = (2. / 3.) * (val_xx + val_yy + val_zz); + vf2[index * 9 + 0] = scale * (val_xx - trace_coeff); + vf2[index * 9 + 1] = scale * val_yx; + vf2[index * 9 + 2] = scale * val_zx; + vf2[index * 9 + 3] = scale * val_xy; + vf2[index * 9 + 4] = scale * (val_yy - trace_coeff); + vf2[index * 9 + 5] = scale * val_zy; + vf2[index * 9 + 6] = scale * val_xz; + vf2[index * 9 + 7] = scale * val_yz; + vf2[index * 9 + 8] = scale * (val_zz - trace_coeff); +} + __global__ void fvm_ddt_vector_kernel(int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *volume, double *diag, double *source) @@ -590,6 +642,79 @@ __global__ void fvc_div_cell_vector_boundary(int num, int offset, const int *fac } +__global__ void fvc_div_cell_tensor_internal(int num_surfaces, + const int *lower_index, const int *upper_index, + const double *vf, const double *weight, const double *face_vector, + double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_surfaces) + return; + + double w = weight[index]; + double Sfx = face_vector[index * 3 + 0]; + double Sfy = face_vector[index * 3 + 1]; + double Sfz = face_vector[index * 3 + 2]; + int owner = lower_index[index]; + int neighbor = upper_index[index]; + + double ssf_xx = (w * (vf[owner * 9 + 0] - vf[neighbor * 9 + 0]) + vf[neighbor * 9 + 0]); + double ssf_xy = (w * (vf[owner * 9 + 1] - vf[neighbor * 9 + 1]) + vf[neighbor * 9 + 1]); + double ssf_xz = (w * (vf[owner * 9 + 2] - vf[neighbor * 9 + 2]) + vf[neighbor * 9 + 2]); + double ssf_yx = (w * (vf[owner * 9 + 3] - vf[neighbor * 9 + 3]) + vf[neighbor * 9 + 3]); + double ssf_yy = (w * (vf[owner * 9 + 4] - vf[neighbor * 9 + 4]) + vf[neighbor * 9 + 4]); + double ssf_yz = (w * (vf[owner * 9 + 5] - vf[neighbor * 9 + 5]) + vf[neighbor * 9 + 5]); + double ssf_zx = (w * (vf[owner * 9 + 6] - vf[neighbor * 9 + 6]) + vf[neighbor * 9 + 6]); + double ssf_zy = (w * (vf[owner * 9 + 7] - vf[neighbor * 9 
+ 7]) + vf[neighbor * 9 + 7]); + double ssf_zz = (w * (vf[owner * 9 + 8] - vf[neighbor * 9 + 8]) + vf[neighbor * 9 + 8]); + double div_x = Sfx * ssf_xx + Sfy * ssf_xy + Sfz * ssf_xz; + double div_y = Sfx * ssf_yx + Sfy * ssf_yy + Sfz * ssf_yz; + double div_z = Sfx * ssf_zx + Sfy * ssf_zy + Sfz * ssf_zz; + + // owner + atomicAdd(&(output[owner * 3 + 0]), div_x); + atomicAdd(&(output[owner * 3 + 1]), div_y); + atomicAdd(&(output[owner * 3 + 2]), div_z); + + // neighbour + atomicAdd(&(output[neighbor * 3 + 0]), -div_x); + atomicAdd(&(output[neighbor * 3 + 1]), -div_y); + atomicAdd(&(output[neighbor * 3 + 2]), -div_z); +} + +__global__ void fvc_div_cell_tensor_boundary(int num, int offset, const int *face2Cells, + const double *boundary_face_vector, const double *boundary_vf, double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num) + return; + + int start_index = offset + index; + + double bouSfx = boundary_face_vector[start_index * 3 + 0]; + double bouSfy = boundary_face_vector[start_index * 3 + 1]; + double bouSfz = boundary_face_vector[start_index * 3 + 2]; + + double boussf_xx = boundary_vf[start_index * 9 + 0]; + double boussf_xy = boundary_vf[start_index * 9 + 1]; + double boussf_xz = boundary_vf[start_index * 9 + 2]; + double boussf_yx = boundary_vf[start_index * 9 + 3]; + double boussf_yy = boundary_vf[start_index * 9 + 4]; + double boussf_yz = boundary_vf[start_index * 9 + 5]; + double boussf_zx = boundary_vf[start_index * 9 + 6]; + double boussf_zy = boundary_vf[start_index * 9 + 7]; + double boussf_zz = boundary_vf[start_index * 9 + 8]; + int cellIndex = face2Cells[start_index]; + + double bouDiv_x = bouSfx * boussf_xx + bouSfy * boussf_xy + bouSfz * boussf_xz; + double bouDiv_y = bouSfx * boussf_yx + bouSfy * boussf_yy + bouSfz * boussf_yz; + double bouDiv_z = bouSfx * boussf_zx + bouSfy * boussf_zy + bouSfz * boussf_zz; + + atomicAdd(&(output[cellIndex * 3 + 0]), bouDiv_x); + atomicAdd(&(output[cellIndex * 3 + 1]), 
bouDiv_y); + atomicAdd(&(output[cellIndex * 3 + 2]), bouDiv_z); +} + void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output) { size_t threads_per_block = 256; @@ -604,6 +729,24 @@ void permute_vector_h2d(cudaStream_t stream, int num_cells, const double *input, permute_vector_h2d_kernel<<>>(num_cells, input, output); } +void field_multiply_scalar(cudaStream_t stream, + int num_cells, const double *input1, const double *input2, double *output, + int num_boundary_surfaces, const double *boundary_input1, const double *boundary_input2, double *boundary_output) +{ + size_t threads_per_block = 256; + size_t blocks_per_grid = (std::max(num_cells, num_boundary_surfaces) + threads_per_block - 1) / threads_per_block; + field_multiply_scalar_kernel<<>>(num_cells, num_boundary_surfaces, + input1, input2, output, boundary_input1, boundary_input2, boundary_output); +} + +void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volume, const double *fvc_output, double *source) +{ + size_t threads_per_block = 256; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvc_to_source_vector_kernel<<>>(num_cells, + volume, fvc_output, source); +} + void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, const double *lower, const double *upper, const double *diag, const double *source, @@ -720,6 +863,8 @@ void fvc_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *vf_old, double *output) { + checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * sizeof(double), stream)); + size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvc_ddt_scalar_kernel<<>>(num_cells, @@ -734,28 +879,12 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int 
num_surfaces, const double *volume, const double *boundary_mag_Sf, double *boundary_output, const double *boundary_deltaCoeffs) { - float time_elapsed = 0; - cudaEvent_t start, stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - - // checkCudaErrors(cudaStreamSynchronize(stream)); - checkCudaErrors(cudaEventRecord(start, 0)); - + checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 9 * sizeof(double), stream)); size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; fvc_grad_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output); - // checkCudaErrors(cudaStreamSynchronize(stream)); - checkCudaErrors(cudaEventRecord(stop, 0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); - printf("\nfvc_grad_vector_new internal 执行时间:%f(ms)\n", time_elapsed); - - // checkCudaErrors(cudaStreamSynchronize(stream)); - checkCudaErrors(cudaEventRecord(start, 0)); int offset = 0; // finish conctruct grad field except dividing cell volume for (int i = 0; i < num_patches; i++) { @@ -773,32 +902,13 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, } offset += patch_size[i]; } - // checkCudaErrors(cudaStreamSynchronize(stream)); - checkCudaErrors(cudaEventRecord(stop, 0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); - printf("fvc_grad_vector_new boundary1 执行时间:%f(ms)\n", time_elapsed); // divide cell volume - // checkCudaErrors(cudaStreamSynchronize(stream)); - checkCudaErrors(cudaEventRecord(start, 0)); - threads_per_block = 512; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; divide_cell_volume_tsr<<>>(num_cells, volume, output); - // 
checkCudaErrors(cudaStreamSynchronize(stream)); - checkCudaErrors(cudaEventRecord(stop, 0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); - printf("fvc_grad_vector_new divide_cell 执行时间:%f(ms)\n", time_elapsed); - // correct boundary conditions - // checkCudaErrors(cudaStreamSynchronize(stream)); - checkCudaErrors(cudaEventRecord(start, 0)); - offset = 0; for (int i = 0; i < num_patches; i++) { threads_per_block = 256; @@ -817,19 +927,25 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, } offset += patch_size[i]; } - // checkCudaErrors(cudaStreamSynchronize(stream)); +} - checkCudaErrors(cudaEventRecord(stop, 0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); - printf("fvc_grad_vector_new boundary2 执行时间:%f(ms)\n", time_elapsed); +void scale_dev2T_tensor(cudaStream_t stream, int num_cells, const double *vf1, double *vf2, + int num_boundary_surfaces, const double *boundary_vf1, double *boundary_vf2) +{ + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + scale_dev2t_tensor_kernel<<>>(num_cells, vf1, vf2); + + blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; + scale_dev2t_tensor_kernel<<>>(num_boundary_surfaces, boundary_vf1, boundary_vf2); } void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, const int *lowerAddr, const int *upperAddr, const double *ssf, const int *boundary_cell_face, const double *boundary_ssf, const double *volume, double *output) { + checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * sizeof(double), stream)); + size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / 
threads_per_block; fvc_div_surface_scalar_internal<<>>(num_surfaces, lowerAddr, upperAddr, ssf, output); @@ -852,6 +968,8 @@ void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume) { + checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * sizeof(double), stream)); + size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; fvc_div_cell_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output); @@ -879,31 +997,54 @@ void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, divide_cell_volume_scalar<<>>(num_cells, volume, output); } +void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *Sf, const double *vf, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, + const double *volume) +{ + checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 3 * sizeof(double), stream)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + fvc_div_cell_tensor_internal<<>>(num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output); + + int offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 256; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just basic patch type now + if (patch_type[i] == boundaryConditions::zeroGradient + || patch_type[i] == boundaryConditions::fixedValue) { + // TODO: just vector version now + fvc_div_cell_tensor_boundary<<>>(patch_size[i], offset, boundary_cell_face, + boundary_Sf, boundary_vf, output); + } else if (0) { + // xxx + fprintf(stderr, 
"boundaryConditions other than zeroGradient are not support yet!\n"); + } + offset += patch_size[i]; + } + + // divide cell volume + threads_per_block = 1024; + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + divide_cell_volume_vec<<>>(num_cells, volume, output); +} + void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *Sf, const double *vf, double *output, // end for internal int num_patches, const int *patch_size, const int *patch_type, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume) { - float time_elapsed = 0; - cudaEvent_t start, stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - checkCudaErrors(cudaEventRecord(start, 0)); - + checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 3 * sizeof(double), stream)); size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; fvc_grad_scalar_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output); - checkCudaErrors(cudaEventRecord(stop, 0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); - printf("\nfvc_grad_scalar_new internal 执行时间:%f(ms)\n", time_elapsed); - - checkCudaErrors(cudaEventRecord(start, 0)); - int offset = 0; for (int i = 0; i < num_patches; i++) { threads_per_block = 256; @@ -919,22 +1060,9 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, } offset += patch_size[i]; } - checkCudaErrors(cudaEventRecord(stop, 0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); - printf("fvc_grad_scalar_new boundary 执行时间:%f(ms)\n", 
time_elapsed); - - checkCudaErrors(cudaEventRecord(start, 0)); // divide cell volume threads_per_block = 1024; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; divide_cell_volume_vec<<>>(num_cells, volume, output); - - checkCudaErrors(cudaEventRecord(stop, 0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); - printf("fvc_grad_scalar_new divide_cell_vector 执行时间:%f(ms)\n", time_elapsed); } diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index 0ee570b9d..880b9c347 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -35,6 +35,7 @@ private: double *d_grad_u = nullptr; double *d_rho_nueff = nullptr; double *d_permute = nullptr; + double *d_fvc_output = nullptr; // non-constant fields - boundary // thermophysical fields @@ -89,11 +90,11 @@ public: void initNonConstantFieldsBoundary(); // 方程运行 + void preProcessForRhoEqn(const double *h_rho, const double *h_phi, const double *h_boundary_phi); void preProcess(const double *h_u, const double *h_boundary_u, const double *h_p, const double *h_boundary_p, const double *h_nu_eff, const double *h_boundary_nu_eff, const double *h_boundary_rho); - void preProcessForRhoEqn(const double *h_phi, const double *h_boundary_phi); void process(); void postProcess(double *h_u); void solve(); - void compareResult(const double *lower, const double *upper, const double *diag, const double *internal_coeffs, const double *boundary_coeffs, bool printFlag); + void compareResult(const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs, bool printFlag); }; diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index fbbf9e71d..1e8065721 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -20,6 +20,7 @@ void dfUEqn::createNonConstantFieldsInternal() { // intermediate fields 
checkCudaErrors(cudaMalloc((void**)&d_grad_u, dataBase_.cell_value_tsr_bytes)); checkCudaErrors(cudaMalloc((void**)&d_rho_nueff, dataBase_.cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_fvc_output, dataBase_.cell_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_permute, dataBase_.cell_value_vec_bytes)); // getter for h_nu_eff @@ -62,6 +63,12 @@ void dfUEqn::initNonConstantFieldsBoundary() { d_gradient_internal_coeffs, d_gradient_boundary_coeffs); } +void dfUEqn::preProcessForRhoEqn(const double *h_rho, const double *h_phi, const double *h_boundary_phi) { + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_rho, h_rho, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi, h_phi, dataBase_.surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_phi, h_boundary_phi, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); +} + void dfUEqn::preProcess(const double *h_u, const double *h_boundary_u, const double *h_p, const double *h_boundary_p, const double *h_nu_eff, const double *h_boundary_nu_eff, const double *h_boundary_rho) { checkCudaErrors(cudaMemcpyAsync(dataBase_.d_u, h_u, dataBase_.cell_value_vec_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); @@ -82,37 +89,57 @@ void dfUEqn::preProcess(const double *h_u, const double *h_boundary_u, const dou checkCudaErrors(cudaMemsetAsync(d_b, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); } -void dfUEqn::preProcessForRhoEqn(const double *h_phi, const double *h_boundary_phi) { - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi, h_phi, dataBase_.surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_phi, h_boundary_phi, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); - - checkCudaErrors(cudaMemsetAsync(d_lower, 0, dataBase_.surface_value_bytes, 
dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_upper, 0, dataBase_.surface_value_bytes, dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_diag, 0, dataBase_.cell_value_bytes, dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_source, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_internal_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_boundary_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_A, 0, dataBase_.csr_value_vec_bytes, dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_b, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); - -} - void dfUEqn::process() { - // run each fvc or fvm function - fvm_div_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, - dataBase_.d_phi, dataBase_.d_weight, - d_lower, d_upper, d_diag, // end for internal - dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), - dataBase_.d_boundary_phi, d_value_internal_coeffs, d_value_boundary_coeffs, - d_internal_coeffs, d_boundary_coeffs); - //ldu_to_csr(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, - // dataBase_.d_lower_to_csr_index, dataBase_.d_upper_to_csr_index, dataBase_.d_diag_to_csr_index, - // d_lower, d_upper, d_diag, d_source, d_internal_coeffs, d_boundary_coeffs, d_A, d_b); - //solve(); + fvm_ddt_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, + dataBase_.d_rho, dataBase_.d_rho_old, dataBase_.d_u, dataBase_.d_volume, + d_diag, d_source); + fvm_div_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_phi, dataBase_.d_weight, + d_lower, d_upper, d_diag, // end for internal + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_phi, d_value_internal_coeffs, d_value_boundary_coeffs, + d_internal_coeffs, 
d_boundary_coeffs); + //field_multiply_scalar(dataBase_.stream, + // dataBase_.num_cells, dataBase_.d_rho, d_nu_eff, d_rho_nueff, // end for internal + // dataBase_.num_boundary_surfaces, dataBase_.d_boundary_rho, d_boundary_nu_eff, d_boundary_rho_nueff); + //fvm_laplacian_vector(dataBase_.stream, dataBase_.num_surfaces, + // dataBase_.d_owner, dataBase_.d_neighbor, + // dataBase_.d_weight, dataBase_.d_mag_sf, dataBase_.d_delta_coeffs, d_rho_nueff, + // d_lower, d_upper, d_diag, // end for internal + // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + // dataBase_.d_boundary_mag_sf, d_boundary_rho_nueff, + // d_gradient_internal_coeffs, d_gradient_boundary_coeffs, + // d_internal_coeffs, d_boundary_coeffs); + //fvc_grad_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, + // dataBase_.d_owner, dataBase_.d_neighbor, + // dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_u, d_grad_u, + // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + // dataBase_.d_boundary_face_cell, dataBase_.d_boundary_u, dataBase_.d_boundary_sf, + // dataBase_.d_volume, dataBase_.d_boundary_mag_sf, d_boundary_grad_u, dataBase_.d_boundary_delta_coeffs); + //scale_dev2T_tensor(dataBase_.stream, dataBase_.num_cells, d_rho_nueff, d_grad_u, // end for internal + // dataBase_.num_boundary_surfaces, d_boundary_rho_nueff, d_boundary_grad_u); + //fvc_div_cell_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, + // dataBase_.d_owner, dataBase_.d_neighbor, + // dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_fvc_output, // end for internal + // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + // dataBase_.d_boundary_face_cell, d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume); + //fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, + // dataBase_.d_volume, d_fvc_output, d_source); + fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, + 
dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_fvc_output, + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume); + fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, + dataBase_.d_volume, d_fvc_output, d_source); + //solve(); } void dfUEqn::solve() { - checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); + //ldu_to_csr(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, + // dataBase_.d_lower_to_csr_index, dataBase_.d_upper_to_csr_index, dataBase_.d_diag_to_csr_index, + // d_lower, d_upper, d_diag, d_source, d_internal_coeffs, d_boundary_coeffs, d_A, d_b); + ////checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); int nNz = dataBase_.num_cells + dataBase_.num_surfaces * 2; // matrix entries if (num_iteration == 0) // first interation @@ -166,36 +193,43 @@ double* dfUEqn::getFieldPointer(const char* fieldAlias, location loc, position p return pointer; } -void dfUEqn::compareResult(const double *lower, const double *upper, const double *diag, const double *internal_coeffs, const double *boundary_coeffs, bool printFlag) +void dfUEqn::compareResult(const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs, bool printFlag) { + DEBUG_TRACE; std::vector h_lower; h_lower.resize(dataBase_.num_surfaces); checkCudaErrors(cudaMemcpy(h_lower.data(), d_lower, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost)); checkVectorEqual(dataBase_.num_surfaces, lower, h_lower.data(), 1e-14, printFlag); + DEBUG_TRACE; std::vector h_upper; h_upper.resize(dataBase_.num_surfaces); checkCudaErrors(cudaMemcpy(h_upper.data(), d_upper, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost)); checkVectorEqual(dataBase_.num_surfaces, upper, h_upper.data(), 1e-14, printFlag); + DEBUG_TRACE; std::vector h_diag; 
h_diag.resize(dataBase_.num_cells); checkCudaErrors(cudaMemcpy(h_diag.data(), d_diag, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost)); checkVectorEqual(dataBase_.num_cells, diag, h_diag.data(), 1e-14, printFlag); + DEBUG_TRACE; - //std::vector h_source; - //h_source.resize(dataBase_.num_cells * 3); - //checkCudaErrors(cudaMemcpy(h_source.data(), d_source, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); - //checkVectorEqual(dataBase_.num_cells, source, h_source.data(), 1e-14, printFlag); + std::vector h_source; + h_source.resize(dataBase_.num_cells * 3); + checkCudaErrors(cudaMemcpy(h_source.data(), d_source, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dataBase_.num_cells, source, h_source.data(), 1e-14, printFlag); + DEBUG_TRACE; std::vector h_internal_coeffs; h_internal_coeffs.resize(dataBase_.num_boundary_surfaces * 3); checkCudaErrors(cudaMemcpy(h_internal_coeffs.data(), d_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); checkVectorEqual(dataBase_.num_boundary_surfaces * 3, internal_coeffs, h_internal_coeffs.data(), 1e-14, printFlag); + DEBUG_TRACE; std::vector h_boundary_coeffs; h_boundary_coeffs.resize(dataBase_.num_boundary_surfaces * 3); checkCudaErrors(cudaMemcpy(h_boundary_coeffs.data(), d_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); checkVectorEqual(dataBase_.num_boundary_surfaces * 3, boundary_coeffs, h_boundary_coeffs.data(), 1e-14, printFlag); + DEBUG_TRACE; } From 04d5512a627a3d5bf1e6b258070bf4b67187c223 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Tue, 15 Aug 2023 19:04:48 +0800 Subject: [PATCH 20/25] fvc/fvm ops support sign --- applications/solvers/dfLowMachFoam/new_UEqn.H | 3 +- src_gpu/dfMatrixOpBase.H | 19 +- src_gpu/dfMatrixOpBase.cu | 169 +++++++++--------- src_gpu/dfUEqn.cu | 6 +- 4 files changed, 100 insertions(+), 97 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/new_UEqn.H 
b/applications/solvers/dfLowMachFoam/new_UEqn.H index 3d84f3631..1b5487139 100644 --- a/applications/solvers/dfLowMachFoam/new_UEqn.H +++ b/applications/solvers/dfLowMachFoam/new_UEqn.H @@ -5,7 +5,8 @@ const volScalarField& nuEff = nuEff_tmp(); // run CPU, for temp tmp tUEqn ( - fvm::ddt(rho, U) + fvm::div(phi, U) == -fvc::grad(p) + -fvm::ddt(rho, U) - fvm::div(phi, U) == fvc::grad(p) + //fvm::ddt(rho, U) + fvm::div(phi, U) == -fvc::grad(p) //turbulence->divDevRhoReff(U) ); fvVectorMatrix& UEqn = tUEqn.ref(); diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index a415a8a1b..ae303cedb 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -25,14 +25,14 @@ void update_boundary_coeffs_vector(cudaStream_t stream, int num_patches, void fvm_ddt_vector(cudaStream_t stream, int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *volume, - double *diag, double *source); + double *diag, double *source, double sign = 1.); void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *phi, const double *weight, double *lower, double *upper, double *diag, // end for internal int num_patches, const int *patch_size, const int *patch_type, const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, - double *internal_coeffs, double *boundary_coeffs); + double *internal_coeffs, double *boundary_coeffs, double sign = 1.); void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, @@ -41,14 +41,13 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_patches, const int *patch_size, const int *patch_type, const double *boundary_mag_sf, const double *boundary_gamma, const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, - double *internal_coeffs, double *boundary_coeffs); + double *internal_coeffs, double 
*boundary_coeffs, double sign = 1.); // fvc ops // fvc_ddt doesn't consider to add fvc_output to source yet, which needs (fvc_output * volume * sign). void fvc_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *vf_old, - double *output); - + double *output, double sign = 1.); void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, const int *lowerAddr, const int *upperAddr, @@ -56,31 +55,31 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, int num_patches, const int *patch_size, const int *patch_type, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, const double *boundary_mag_Sf, double *boundary_output, - const double *boundary_deltaCoeffs); + const double *boundary_deltaCoeffs, double sign = 1.); void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, const int *lowerAddr, const int *upperAddr, const double *ssf, const int *boundary_cell_face, - const double *boundary_ssf, const double *volume, double *output); + const double *boundary_ssf, const double *volume, double *output, double sign = 1.); void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *Sf, const double *vf, double *output, // end for internal int num_patches, const int *patch_size, const int *patch_type, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, - const double *volume); + const double *volume, double sign = 1.); void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *Sf, const double *vf, double *output, // end for internal int num_patches, const int *patch_size, const int *patch_type, const int 
*boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, - const double *volume); + const double *volume, double sign = 1.); void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *Sf, const double *vf, double *output, // end for internal int num_patches, const int *patch_size, const int *patch_type, - const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume); + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign = 1.); // others void scale_dev2T_tensor(cudaStream_t stream, int num_cells, const double *vf1, double *vf2, diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index e4b2a25e9..fd90ce480 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -107,23 +107,23 @@ __global__ void scale_dev2t_tensor_kernel(int num, const double *vf1, double *vf __global__ void fvm_ddt_vector_kernel(int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *volume, - double *diag, double *source) + double *diag, double *source, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_cells) return; - diag[index] += rDeltaT * rho[index] * volume[index]; + diag[index] += rDeltaT * rho[index] * volume[index] * sign; // TODO: skip moving - source[index * 3 + 0] += rDeltaT * rho_old[index] * vf[index * 3 + 0] * volume[index]; - source[index * 3 + 1] += rDeltaT * rho_old[index] * vf[index * 3 + 1] * volume[index]; - source[index * 3 + 2] += rDeltaT * rho_old[index] * vf[index * 3 + 2] * volume[index]; + source[index * 3 + 0] += rDeltaT * rho_old[index] * vf[index * 3 + 0] * volume[index] * sign; + source[index * 3 + 1] += rDeltaT * rho_old[index] * vf[index * 3 + 1] * volume[index] * sign; + source[index * 3 + 2] += rDeltaT * rho_old[index] * 
vf[index * 3 + 2] * volume[index] * sign; } __global__ void fvm_div_vector_internal(int num_surfaces, const int *lower_index, const int *upper_index, const double *phi, const double *weight, - double *lower, double *upper, double *diag) + double *lower, double *upper, double *diag, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_surfaces) @@ -132,8 +132,8 @@ __global__ void fvm_div_vector_internal(int num_surfaces, double w = weight[index]; double f = phi[index]; - double lower_value = (-w) * f; - double upper_value = (1 - w) * f; + double lower_value = (-w) * f * sign; + double upper_value = (1 - w) * f * sign; lower[index] += lower_value; upper[index] += upper_value; // if (index == 0) printf("index = 0, lower: %.16lf, upper:%.16lf\n", lower[index], upper[index]); @@ -146,7 +146,7 @@ __global__ void fvm_div_vector_internal(int num_surfaces, __global__ void fvm_div_vector_boundary(int num, int offset, const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, - double *internal_coeffs, double *boundary_coeffs) + double *internal_coeffs, double *boundary_coeffs, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num) @@ -154,18 +154,18 @@ __global__ void fvm_div_vector_boundary(int num, int offset, int start_index = offset + index; double boundary_f = boundary_phi[start_index]; - internal_coeffs[start_index * 3 + 0] += boundary_f * value_internal_coeffs[start_index * 3 + 0]; - internal_coeffs[start_index * 3 + 1] += boundary_f * value_internal_coeffs[start_index * 3 + 1]; - internal_coeffs[start_index * 3 + 2] += boundary_f * value_internal_coeffs[start_index * 3 + 2]; - boundary_coeffs[start_index * 3 + 0] += boundary_f * value_boundary_coeffs[start_index * 3 + 0]; - boundary_coeffs[start_index * 3 + 1] += boundary_f * value_boundary_coeffs[start_index * 3 + 1]; - boundary_coeffs[start_index * 3 + 2] += boundary_f * value_boundary_coeffs[start_index * 3 + 2]; 
+ internal_coeffs[start_index * 3 + 0] += boundary_f * value_internal_coeffs[start_index * 3 + 0] * sign; + internal_coeffs[start_index * 3 + 1] += boundary_f * value_internal_coeffs[start_index * 3 + 1] * sign; + internal_coeffs[start_index * 3 + 2] += boundary_f * value_internal_coeffs[start_index * 3 + 2] * sign; + boundary_coeffs[start_index * 3 + 0] += boundary_f * value_boundary_coeffs[start_index * 3 + 0] * sign; + boundary_coeffs[start_index * 3 + 1] += boundary_f * value_boundary_coeffs[start_index * 3 + 1] * sign; + boundary_coeffs[start_index * 3 + 2] += boundary_f * value_boundary_coeffs[start_index * 3 + 2] * sign; } __global__ void fvm_laplacian_vector_internal(int num_surfaces, const int *lower_index, const int *upper_index, const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma, - double *lower, double *upper, double *diag) + double *lower, double *upper, double *diag, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_surfaces) @@ -183,6 +183,9 @@ __global__ void fvm_laplacian_vector_internal(int num_surfaces, //double lower_value = lower_face_gamma * mag_sf[index] * delta_coeffs[index]; double lower_value = upper_value; + lower_value = lower_value * sign; + upper_value = upper_value * sign; + lower[index] += lower_value; upper[index] += upper_value; @@ -193,7 +196,7 @@ __global__ void fvm_laplacian_vector_internal(int num_surfaces, __global__ void fvm_laplacian_vector_boundary(int num, int offset, const double *boundary_mag_sf, const double *boundary_gamma, const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, - double *internal_coeffs, double *boundary_coeffs) + double *internal_coeffs, double *boundary_coeffs, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num) @@ -201,17 +204,17 @@ __global__ void fvm_laplacian_vector_boundary(int num, int offset, int start_index = offset + index; double boundary_value = 
boundary_gamma[start_index] * boundary_mag_sf[start_index]; - internal_coeffs[start_index * 3 + 0] += boundary_value * gradient_internal_coeffs[start_index * 3 + 0]; - internal_coeffs[start_index * 3 + 1] += boundary_value * gradient_internal_coeffs[start_index * 3 + 1]; - internal_coeffs[start_index * 3 + 2] += boundary_value * gradient_internal_coeffs[start_index * 3 + 2]; - boundary_coeffs[start_index * 3 + 0] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 0]; - boundary_coeffs[start_index * 3 + 1] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 1]; - boundary_coeffs[start_index * 3 + 2] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 2]; + internal_coeffs[start_index * 3 + 0] += boundary_value * gradient_internal_coeffs[start_index * 3 + 0] * sign; + internal_coeffs[start_index * 3 + 1] += boundary_value * gradient_internal_coeffs[start_index * 3 + 1] * sign; + internal_coeffs[start_index * 3 + 2] += boundary_value * gradient_internal_coeffs[start_index * 3 + 2] * sign; + boundary_coeffs[start_index * 3 + 0] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 0] * sign; + boundary_coeffs[start_index * 3 + 1] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 1] * sign; + boundary_coeffs[start_index * 3 + 2] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 2] * sign; } __global__ void fvc_ddt_scalar_kernel(int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *vf_old, - double *output) + double *output, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_cells) @@ -241,7 +244,7 @@ __global__ void fvc_ddt_scalar_kernel(int num_cells, double rDeltaT, output[index] += rDeltaT * (val_new - val_old); */ // workaround way3 (use nvcc option -fmad=false) - output[index] += rDeltaT * (rho[index] * vf[index] - rho_old[index] * vf_old[index]); + output[index] += rDeltaT * (rho[index] * vf[index] - rho_old[index] 
* vf_old[index]) * sign; } __global__ void fvc_grad_vector_internal(int num_surfaces, @@ -417,25 +420,25 @@ __global__ void fvc_grad_scalar_boundary(int num_cells, int num, int offset, con // } } -__global__ void divide_cell_volume_tsr(int num_cells, const double* volume, double *output) +__global__ void divide_cell_volume_tsr(int num_cells, const double* volume, double *output, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_cells) return; double vol = volume[index]; - output[index * 9 + 0] = output[index * 9 + 0] / vol; - output[index * 9 + 1] = output[index * 9 + 1] / vol; - output[index * 9 + 2] = output[index * 9 + 2] / vol; - output[index * 9 + 3] = output[index * 9 + 3] / vol; - output[index * 9 + 4] = output[index * 9 + 4] / vol; - output[index * 9 + 5] = output[index * 9 + 5] / vol; - output[index * 9 + 6] = output[index * 9 + 6] / vol; - output[index * 9 + 7] = output[index * 9 + 7] / vol; - output[index * 9 + 8] = output[index * 9 + 8] / vol; + output[index * 9 + 0] = output[index * 9 + 0] / vol * sign; + output[index * 9 + 1] = output[index * 9 + 1] / vol * sign; + output[index * 9 + 2] = output[index * 9 + 2] / vol * sign; + output[index * 9 + 3] = output[index * 9 + 3] / vol * sign; + output[index * 9 + 4] = output[index * 9 + 4] / vol * sign; + output[index * 9 + 5] = output[index * 9 + 5] / vol * sign; + output[index * 9 + 6] = output[index * 9 + 6] / vol * sign; + output[index * 9 + 7] = output[index * 9 + 7] / vol * sign; + output[index * 9 + 8] = output[index * 9 + 8] / vol * sign; } -__global__ void divide_cell_volume_vec(int num_cells, const double* volume, double *output) +__global__ void divide_cell_volume_vec(int num_cells, const double* volume, double *output, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_cells) @@ -443,12 +446,12 @@ __global__ void divide_cell_volume_vec(int num_cells, const double* volume, doub double vol = volume[index]; - output[index * 3 + 0] = 
output[index * 3 + 0] / vol; - output[index * 3 + 1] = output[index * 3 + 1] / vol; - output[index * 3 + 2] = output[index * 3 + 2] / vol; + output[index * 3 + 0] = output[index * 3 + 0] / vol * sign; + output[index * 3 + 1] = output[index * 3 + 1] / vol * sign; + output[index * 3 + 2] = output[index * 3 + 2] / vol * sign; } -__global__ void divide_cell_volume_scalar(int num_cells, const double* volume, double *output) +__global__ void divide_cell_volume_scalar(int num_cells, const double* volume, double *output, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_cells) @@ -456,12 +459,12 @@ __global__ void divide_cell_volume_scalar(int num_cells, const double* volume, d double vol = volume[index]; - output[index] = output[index] / vol; + output[index] = output[index] / vol * sign; } __global__ void fvc_grad_vector_correctBC_zeroGradient(int num, int offset, const int *face2Cells, const double *internal_grad, const double *vf, const double *boundary_sf, - const double *boundary_mag_sf, double *boundary_grad) + const double *boundary_mag_sf, double *boundary_grad, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num) @@ -493,21 +496,21 @@ __global__ void fvc_grad_vector_correctBC_zeroGradient(int num, int offset, cons double grad_correction_y = - (n_x * grad_xy + n_y * grad_yy + n_z * grad_zy); double grad_correction_z = - (n_x * grad_xz + n_y * grad_yz + n_z * grad_zz); - boundary_grad[start_index * 9 + 0] = grad_xx + n_x * grad_correction_x; - boundary_grad[start_index * 9 + 1] = grad_xy + n_x * grad_correction_y; - boundary_grad[start_index * 9 + 2] = grad_xz + n_x * grad_correction_z; - boundary_grad[start_index * 9 + 3] = grad_yx + n_y * grad_correction_x; - boundary_grad[start_index * 9 + 4] = grad_yy + n_y * grad_correction_y; - boundary_grad[start_index * 9 + 5] = grad_yz + n_y * grad_correction_z; - boundary_grad[start_index * 9 + 6] = grad_zx + n_z * grad_correction_x; - 
boundary_grad[start_index * 9 + 7] = grad_zy + n_z * grad_correction_y; - boundary_grad[start_index * 9 + 8] = grad_zz + n_z * grad_correction_z; + boundary_grad[start_index * 9 + 0] = (grad_xx + n_x * grad_correction_x) * sign; + boundary_grad[start_index * 9 + 1] = (grad_xy + n_x * grad_correction_y) * sign; + boundary_grad[start_index * 9 + 2] = (grad_xz + n_x * grad_correction_z) * sign; + boundary_grad[start_index * 9 + 3] = (grad_yx + n_y * grad_correction_x) * sign; + boundary_grad[start_index * 9 + 4] = (grad_yy + n_y * grad_correction_y) * sign; + boundary_grad[start_index * 9 + 5] = (grad_yz + n_y * grad_correction_z) * sign; + boundary_grad[start_index * 9 + 6] = (grad_zx + n_z * grad_correction_x) * sign; + boundary_grad[start_index * 9 + 7] = (grad_zy + n_z * grad_correction_y) * sign; + boundary_grad[start_index * 9 + 8] = (grad_zz + n_z * grad_correction_z) * sign; } __global__ void fvc_grad_vector_correctBC_fixedValue(int num, int offset, const int *face2Cells, const double *internal_grad, const double *vf, const double *boundary_sf, const double *boundary_mag_sf, double *boundary_grad, - const double *boundary_deltaCoeffs, const double *boundary_vf) + const double *boundary_deltaCoeffs, const double *boundary_vf, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num) @@ -544,15 +547,15 @@ __global__ void fvc_grad_vector_correctBC_fixedValue(int num, int offset, const double grad_correction_y = sn_grad_y - (n_x * grad_xy + n_y * grad_yy + n_z * grad_zy); double grad_correction_z = sn_grad_z - (n_x * grad_xz + n_y * grad_yz + n_z * grad_zz); - boundary_grad[start_index * 9 + 0] = grad_xx + n_x * grad_correction_x; - boundary_grad[start_index * 9 + 1] = grad_xy + n_x * grad_correction_y; - boundary_grad[start_index * 9 + 2] = grad_xz + n_x * grad_correction_z; - boundary_grad[start_index * 9 + 3] = grad_yx + n_y * grad_correction_x; - boundary_grad[start_index * 9 + 4] = grad_yy + n_y * grad_correction_y; - 
boundary_grad[start_index * 9 + 5] = grad_yz + n_y * grad_correction_z; - boundary_grad[start_index * 9 + 6] = grad_zx + n_z * grad_correction_x; - boundary_grad[start_index * 9 + 7] = grad_zy + n_z * grad_correction_y; - boundary_grad[start_index * 9 + 8] = grad_zz + n_z * grad_correction_z; + boundary_grad[start_index * 9 + 0] = (grad_xx + n_x * grad_correction_x) * sign; + boundary_grad[start_index * 9 + 1] = (grad_xy + n_x * grad_correction_y) * sign; + boundary_grad[start_index * 9 + 2] = (grad_xz + n_x * grad_correction_z) * sign; + boundary_grad[start_index * 9 + 3] = (grad_yx + n_y * grad_correction_x) * sign; + boundary_grad[start_index * 9 + 4] = (grad_yy + n_y * grad_correction_y) * sign; + boundary_grad[start_index * 9 + 5] = (grad_yz + n_y * grad_correction_z) * sign; + boundary_grad[start_index * 9 + 6] = (grad_zx + n_z * grad_correction_x) * sign; + boundary_grad[start_index * 9 + 7] = (grad_zy + n_z * grad_correction_y) * sign; + boundary_grad[start_index * 9 + 8] = (grad_zz + n_z * grad_correction_z) * sign; } __global__ void fvc_div_surface_scalar_internal(int num_surfaces, @@ -783,12 +786,12 @@ void update_boundary_coeffs_vector(cudaStream_t stream, int num_patches, void fvm_ddt_vector(cudaStream_t stream, int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *volume, - double *diag, double *source) + double *diag, double *source, double sign) { size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvm_ddt_vector_kernel<<>>(num_cells, - rDeltaT, rho, rho_old, vf, volume, diag, source); + rDeltaT, rho, rho_old, vf, volume, diag, source, sign); } void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, @@ -796,14 +799,14 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, double *lower, double *upper, double *diag, // end for internal int num_patches, const int 
*patch_size, const int *patch_type, const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, - double *internal_coeffs, double *boundary_coeffs) + double *internal_coeffs, double *boundary_coeffs, double sign) { size_t threads_per_block = 1024; size_t blocks_per_grid = 1; blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; fvm_div_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, - phi, weight, lower, upper, diag); + phi, weight, lower, upper, diag, sign); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -815,7 +818,7 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, // TODO: just vector version now fvm_div_vector_boundary<<>>(patch_size[i], offset, boundary_phi, value_internal_coeffs, value_boundary_coeffs, - internal_coeffs, boundary_coeffs); + internal_coeffs, boundary_coeffs, sign); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); @@ -831,14 +834,14 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_patches, const int *patch_size, const int *patch_type, const double *boundary_mag_sf, const double *boundary_gamma, const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, - double *internal_coeffs, double *boundary_coeffs) + double *internal_coeffs, double *boundary_coeffs, double sign) { size_t threads_per_block = 1024; size_t blocks_per_grid = 1; blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; fvm_laplacian_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, - weight, mag_sf, delta_coeffs, gamma, lower, upper, diag); + weight, mag_sf, delta_coeffs, gamma, lower, upper, diag, sign); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -850,7 +853,7 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, // TODO: just vector version now fvm_laplacian_vector_boundary<<>>(patch_size[i], 
offset, boundary_mag_sf, boundary_gamma, gradient_internal_coeffs, gradient_boundary_coeffs, - internal_coeffs, boundary_coeffs); + internal_coeffs, boundary_coeffs, sign); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); @@ -861,14 +864,14 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, void fvc_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *vf_old, - double *output) + double *output, double sign) { checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * sizeof(double), stream)); size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvc_ddt_scalar_kernel<<>>(num_cells, - rDeltaT, rho, rho_old, vf, vf_old, output); + rDeltaT, rho, rho_old, vf, vf_old, output, sign); } void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, @@ -877,7 +880,7 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, int num_patches, const int *patch_size, const int *patch_type, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, const double *boundary_mag_Sf, double *boundary_output, - const double *boundary_deltaCoeffs) + const double *boundary_deltaCoeffs, double sign) { checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 9 * sizeof(double), stream)); size_t threads_per_block = 1024; @@ -906,7 +909,7 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, // divide cell volume threads_per_block = 512; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - divide_cell_volume_tsr<<>>(num_cells, volume, output); + divide_cell_volume_tsr<<>>(num_cells, volume, output, sign); // correct boundary conditions offset = 0; @@ -917,10 +920,10 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, if 
(patch_type[i] == boundaryConditions::zeroGradient) { // TODO: just vector version now fvc_grad_vector_correctBC_zeroGradient<<>>(patch_size[i], offset, boundary_cell_face, - output, boundary_vf, boundary_Sf, boundary_mag_Sf, boundary_output); + output, boundary_vf, boundary_Sf, boundary_mag_Sf, boundary_output, sign); } else if (patch_type[i] == boundaryConditions::fixedValue) { fvc_grad_vector_correctBC_fixedValue<<>>(patch_size[i], offset, boundary_cell_face, - output, boundary_vf, boundary_Sf, boundary_mag_Sf, boundary_output, boundary_deltaCoeffs, boundary_vf); + output, boundary_vf, boundary_Sf, boundary_mag_Sf, boundary_output, boundary_deltaCoeffs, boundary_vf, sign); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); @@ -942,7 +945,7 @@ void scale_dev2T_tensor(cudaStream_t stream, int num_cells, const double *vf1, d void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, const int *lowerAddr, const int *upperAddr, const double *ssf, const int *boundary_cell_face, - const double *boundary_ssf, const double *volume, double *output) + const double *boundary_ssf, const double *volume, double *output, double sign) { checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * sizeof(double), stream)); @@ -958,7 +961,7 @@ void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces // divide cell volume threads_per_block = 1024; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - divide_cell_volume_scalar<<>>(num_cells, volume, output); + divide_cell_volume_scalar<<>>(num_cells, volume, output, sign); } void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, @@ -966,7 +969,7 @@ void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, const double *weight, const double *Sf, const double *vf, double *output, // end for internal int num_patches, const int *patch_size, const 
int *patch_type, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, - const double *volume) + const double *volume, double sign) { checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * sizeof(double), stream)); @@ -994,7 +997,7 @@ void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, // divide cell volume threads_per_block = 1024; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - divide_cell_volume_scalar<<>>(num_cells, volume, output); + divide_cell_volume_scalar<<>>(num_cells, volume, output, sign); } void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, @@ -1002,7 +1005,7 @@ void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, const double *weight, const double *Sf, const double *vf, double *output, // end for internal int num_patches, const int *patch_size, const int *patch_type, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, - const double *volume) + const double *volume, double sign) { checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 3 * sizeof(double), stream)); @@ -1030,14 +1033,14 @@ void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, // divide cell volume threads_per_block = 1024; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - divide_cell_volume_vec<<>>(num_cells, volume, output); + divide_cell_volume_vec<<>>(num_cells, volume, output, sign); } void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *Sf, const double *vf, double *output, // end for internal int num_patches, const int *patch_size, const int *patch_type, - const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume) + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const 
double *volume, double sign) { checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 3 * sizeof(double), stream)); size_t threads_per_block = 1024; @@ -1064,5 +1067,5 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, // divide cell volume threads_per_block = 1024; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - divide_cell_volume_vec<<>>(num_cells, volume, output); + divide_cell_volume_vec<<>>(num_cells, volume, output, sign); } diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 1e8065721..f20808ec8 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -92,13 +92,13 @@ void dfUEqn::preProcess(const double *h_u, const double *h_boundary_u, const dou void dfUEqn::process() { fvm_ddt_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, dataBase_.d_rho, dataBase_.d_rho_old, dataBase_.d_u, dataBase_.d_volume, - d_diag, d_source); + d_diag, d_source, -1.); fvm_div_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_phi, dataBase_.d_weight, d_lower, d_upper, d_diag, // end for internal dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_phi, d_value_internal_coeffs, d_value_boundary_coeffs, - d_internal_coeffs, d_boundary_coeffs); + d_internal_coeffs, d_boundary_coeffs, -1.); //field_multiply_scalar(dataBase_.stream, // dataBase_.num_cells, dataBase_.d_rho, d_nu_eff, d_rho_nueff, // end for internal // dataBase_.num_boundary_surfaces, dataBase_.d_boundary_rho, d_boundary_nu_eff, d_boundary_rho_nueff); @@ -129,7 +129,7 @@ void dfUEqn::process() { dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_fvc_output, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), - dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume); + dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, 
dataBase_.d_boundary_sf, dataBase_.d_volume, -1.); fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.d_volume, d_fvc_output, d_source); //solve(); From cbb74cd69e44cd1017a62e4e0a67ec33627f0a57 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Tue, 15 Aug 2023 19:49:38 +0800 Subject: [PATCH 21/25] use cuda graph in ueqn --- .../solvers/dfLowMachFoam/new_dfLowMachFoam.C | 10 +- src_gpu/dfMatrixDataBase.H | 5 - src_gpu/dfMatrixDataBase.cu | 4 - src_gpu/dfUEqn.H | 13 ++- src_gpu/dfUEqn.cu | 110 +++++++++++------- 5 files changed, 84 insertions(+), 58 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C index 0deffb40f..a8368f5af 100644 --- a/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C +++ b/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C @@ -98,11 +98,11 @@ int main(int argc, char *argv[]) createGPUBase(mesh, Y); createGPUUEqn(CanteraTorchProperties, U); - // foreach(timestep) { - dfDataBase.preTimeStep(&rho.oldTime()[0]); - #include "new_UEqn.H" - dfDataBase.postTimeStep(); - // } + for (int timestep = 0; timestep < 10; timestep++) { + dfDataBase.preTimeStep(&rho.oldTime()[0]); + #include "new_UEqn.H" + dfDataBase.postTimeStep(); + } } return 0; } diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 69d20d7af..cac7264a8 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -68,11 +68,6 @@ struct dfMatrixDataBase { // cuda resource cudaStream_t stream; - // maybe one graph for one eqn before using self-developed solver - // and should be located in each eqn. 
- cudaGraph_t graph; - cudaGraphExec_t graph_instance; - bool graph_created=false; // constant values -- basic int num_cells = 0; diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index 64b35f956..8c2c26faf 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -48,10 +48,6 @@ dfMatrixDataBase::dfMatrixDataBase() { dfMatrixDataBase::~dfMatrixDataBase() { // destroy cuda resources checkCudaErrors(cudaStreamDestroy(stream)); - if (graph_created) { - checkCudaErrors(cudaGraphExecDestroy(graph_instance)); - checkCudaErrors(cudaGraphDestroy(graph)); - } // TODO: free pointers } diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index 880b9c347..4d1c25697 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -10,6 +10,12 @@ class dfUEqn private: dfMatrixDataBase &dataBase_; + // cuda resource + // one graph for one eqn before using self-developed solver + cudaGraph_t graph; + cudaGraphExec_t graph_instance; + bool graph_created=false; + // constant values -- basic std::string mode_string; std::string setting_path; @@ -72,7 +78,12 @@ public: : dataBase_(dataBase) {} // 析构函数 - ~dfUEqn(){} + ~dfUEqn(){ + if (graph_created) { + checkCudaErrors(cudaGraphExecDestroy(graph_instance)); + checkCudaErrors(cudaGraphDestroy(graph)); + } + } // 成员函数 diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index f20808ec8..470550cd0 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -90,48 +90,73 @@ void dfUEqn::preProcess(const double *h_u, const double *h_boundary_u, const dou } void dfUEqn::process() { - fvm_ddt_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, - dataBase_.d_rho, dataBase_.d_rho_old, dataBase_.d_u, dataBase_.d_volume, - d_diag, d_source, -1.); - fvm_div_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, - dataBase_.d_phi, dataBase_.d_weight, - d_lower, d_upper, d_diag, // end for internal - dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), - 
dataBase_.d_boundary_phi, d_value_internal_coeffs, d_value_boundary_coeffs, - d_internal_coeffs, d_boundary_coeffs, -1.); - //field_multiply_scalar(dataBase_.stream, - // dataBase_.num_cells, dataBase_.d_rho, d_nu_eff, d_rho_nueff, // end for internal - // dataBase_.num_boundary_surfaces, dataBase_.d_boundary_rho, d_boundary_nu_eff, d_boundary_rho_nueff); - //fvm_laplacian_vector(dataBase_.stream, dataBase_.num_surfaces, - // dataBase_.d_owner, dataBase_.d_neighbor, - // dataBase_.d_weight, dataBase_.d_mag_sf, dataBase_.d_delta_coeffs, d_rho_nueff, - // d_lower, d_upper, d_diag, // end for internal - // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), - // dataBase_.d_boundary_mag_sf, d_boundary_rho_nueff, - // d_gradient_internal_coeffs, d_gradient_boundary_coeffs, - // d_internal_coeffs, d_boundary_coeffs); - //fvc_grad_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, - // dataBase_.d_owner, dataBase_.d_neighbor, - // dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_u, d_grad_u, - // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), - // dataBase_.d_boundary_face_cell, dataBase_.d_boundary_u, dataBase_.d_boundary_sf, - // dataBase_.d_volume, dataBase_.d_boundary_mag_sf, d_boundary_grad_u, dataBase_.d_boundary_delta_coeffs); - //scale_dev2T_tensor(dataBase_.stream, dataBase_.num_cells, d_rho_nueff, d_grad_u, // end for internal - // dataBase_.num_boundary_surfaces, d_boundary_rho_nueff, d_boundary_grad_u); - //fvc_div_cell_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, - // dataBase_.d_owner, dataBase_.d_neighbor, - // dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_fvc_output, // end for internal - // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), - // dataBase_.d_boundary_face_cell, d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume); - //fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, - // dataBase_.d_volume, d_fvc_output, 
d_source); - fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, - dataBase_.d_owner, dataBase_.d_neighbor, - dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_fvc_output, - dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), - dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume, -1.); - fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, - dataBase_.d_volume, d_fvc_output, d_source); + //使用event计算时间 + float time_elapsed=0; + cudaEvent_t start,stop; + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + checkCudaErrors(cudaEventRecord(start,0)); + + if(!graph_created) { + DEBUG_TRACE; + checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); + + fvm_ddt_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, + dataBase_.d_rho, dataBase_.d_rho_old, dataBase_.d_u, dataBase_.d_volume, + d_diag, d_source, -1.); + fvm_div_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_phi, dataBase_.d_weight, + d_lower, d_upper, d_diag, // end for internal + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_phi, d_value_internal_coeffs, d_value_boundary_coeffs, + d_internal_coeffs, d_boundary_coeffs, -1.); + //field_multiply_scalar(dataBase_.stream, + // dataBase_.num_cells, dataBase_.d_rho, d_nu_eff, d_rho_nueff, // end for internal + // dataBase_.num_boundary_surfaces, dataBase_.d_boundary_rho, d_boundary_nu_eff, d_boundary_rho_nueff); + //fvm_laplacian_vector(dataBase_.stream, dataBase_.num_surfaces, + // dataBase_.d_owner, dataBase_.d_neighbor, + // dataBase_.d_weight, dataBase_.d_mag_sf, dataBase_.d_delta_coeffs, d_rho_nueff, + // d_lower, d_upper, d_diag, // end for internal + // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + // dataBase_.d_boundary_mag_sf, 
d_boundary_rho_nueff, + // d_gradient_internal_coeffs, d_gradient_boundary_coeffs, + // d_internal_coeffs, d_boundary_coeffs); + //fvc_grad_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, + // dataBase_.d_owner, dataBase_.d_neighbor, + // dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_u, d_grad_u, + // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + // dataBase_.d_boundary_face_cell, dataBase_.d_boundary_u, dataBase_.d_boundary_sf, + // dataBase_.d_volume, dataBase_.d_boundary_mag_sf, d_boundary_grad_u, dataBase_.d_boundary_delta_coeffs); + //scale_dev2T_tensor(dataBase_.stream, dataBase_.num_cells, d_rho_nueff, d_grad_u, // end for internal + // dataBase_.num_boundary_surfaces, d_boundary_rho_nueff, d_boundary_grad_u); + //fvc_div_cell_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, + // dataBase_.d_owner, dataBase_.d_neighbor, + // dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_fvc_output, // end for internal + // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + // dataBase_.d_boundary_face_cell, d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume); + //fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, + // dataBase_.d_volume, d_fvc_output, d_source); + fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, + dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_fvc_output, + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume, -1.); + fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, + dataBase_.d_volume, d_fvc_output, d_source); + + checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); + checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0)); + graph_created = true; + } + DEBUG_TRACE; + 
checkCudaErrors(cudaGraphLaunch(graph_instance, dataBase_.stream)); + + checkCudaErrors(cudaEventRecord(stop,0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); + fprintf(stderr, "ueqn process time:%f(ms)\n",time_elapsed); + //solve(); } @@ -139,7 +164,6 @@ void dfUEqn::solve() { //ldu_to_csr(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, // dataBase_.d_lower_to_csr_index, dataBase_.d_upper_to_csr_index, dataBase_.d_diag_to_csr_index, // d_lower, d_upper, d_diag, d_source, d_internal_coeffs, d_boundary_coeffs, d_A, d_b); - ////checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); int nNz = dataBase_.num_cells + dataBase_.num_surfaces * 2; // matrix entries if (num_iteration == 0) // first interation From 50eee68f0c5f338e5f5b8a35caa042bc0924bf76 Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Wed, 16 Aug 2023 18:22:40 +0800 Subject: [PATCH 22/25] fix bugs in turbulence term --- .../solvers/dfLowMachFoam/Make/options | 4 +- applications/solvers/dfLowMachFoam/new_UEqn.H | 16 +++-- .../solvers/dfLowMachFoam/new_dfLowMachFoam.C | 7 +- src_gpu/dfMatrixOpBase.cu | 44 +++++++++--- src_gpu/dfUEqn.H | 4 +- src_gpu/dfUEqn.cu | 69 +++++++++++-------- 6 files changed, 97 insertions(+), 47 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/Make/options b/applications/solvers/dfLowMachFoam/Make/options index e2a57bd00..e1959ada3 100644 --- a/applications/solvers/dfLowMachFoam/Make/options +++ b/applications/solvers/dfLowMachFoam/Make/options @@ -29,7 +29,8 @@ EXE_INC = -std=c++14 \ $(PYTHON_INC_DIR) \ $(if $(AMGX_DIR), -I$(DF_ROOT)/src_gpu,) \ $(if $(AMGX_DIR), -I/usr/local/cuda-11.6/include,) \ - $(if $(AMGX_DIR), -I$(AMGX_DIR)/include,) + $(if $(AMGX_DIR), -I$(AMGX_DIR)/include,) \ + -I$(DF_ROOT)/GPUTestRef/lnInclude \ EXE_LIBS = \ -lcompressibleTransportModels \ @@ -43,6 +44,7 @@ EXE_LIBS = \ -ldfCanteraMixture \ -ldfChemistryModel \ 
-ldfCombustionModels \ + -ldfGenMatrix \ $(CANTERA_ROOT)/lib/libcantera.so \ $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libtorch.so,) \ $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libc10.so,) \ diff --git a/applications/solvers/dfLowMachFoam/new_UEqn.H b/applications/solvers/dfLowMachFoam/new_UEqn.H index 1b5487139..231c38c29 100644 --- a/applications/solvers/dfLowMachFoam/new_UEqn.H +++ b/applications/solvers/dfLowMachFoam/new_UEqn.H @@ -5,10 +5,16 @@ const volScalarField& nuEff = nuEff_tmp(); // run CPU, for temp tmp tUEqn ( - -fvm::ddt(rho, U) - fvm::div(phi, U) == fvc::grad(p) - //fvm::ddt(rho, U) + fvm::div(phi, U) == -fvc::grad(p) - //turbulence->divDevRhoReff(U) + fvm::ddt(rho, U) + fvm::div(phi, U) + + turbulence->divDevRhoReff(U) + == -fvc::grad(p) ); +// tmp tUEqn_ref // test turbulence->divDevRhoReff(U) +// ( +// - fvc::div((turbulence->rho()*turbulence->nuEff())*dev2(Foam::T(fvc::grad(U)))) +// - fvm::laplacian(turbulence->rho()*turbulence->nuEff(), U) +// ); + fvVectorMatrix& UEqn = tUEqn.ref(); // run GPU @@ -81,6 +87,8 @@ for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) } bool printFlag = false; UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0], - h_internal_coeffs.data(), h_boundary_coeffs.data(), printFlag); + h_internal_coeffs.data(), h_boundary_coeffs.data(), + // &DivTensor[0][0], + printFlag); DEBUG_TRACE; #endif diff --git a/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C index a8368f5af..7d867687f 100644 --- a/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C +++ b/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C @@ -51,9 +51,12 @@ Description #include "dfMatrixDataBase.H" #include "dfMatrixOpBase.H" +#include "GenFvMatrix.H" #include "dfUEqn.H" #include "createGPUSolver.H" +#define GPUSolver_ + int main(int argc, char *argv[]) { #ifdef USE_PYTORCH @@ -98,11 +101,11 @@ int main(int argc, char *argv[]) 
createGPUBase(mesh, Y); createGPUUEqn(CanteraTorchProperties, U); - for (int timestep = 0; timestep < 10; timestep++) { + // for (int timestep = 0; timestep < 10; timestep++) { dfDataBase.preTimeStep(&rho.oldTime()[0]); #include "new_UEqn.H" dfDataBase.postTimeStep(); - } + // } } return 0; } diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index fd90ce480..f8d71cca7 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -103,6 +103,13 @@ __global__ void scale_dev2t_tensor_kernel(int num, const double *vf1, double *vf vf2[index * 9 + 6] = scale * val_xz; vf2[index * 9 + 7] = scale * val_yz; vf2[index * 9 + 8] = scale * (val_zz - trace_coeff); + + // if (index == 0) + // { + // printf("bou_grad_U = (%.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e)", vf2[0], vf2[1], vf2[2], + // vf2[3], vf2[4], vf2[5], vf2[6], vf2[7], vf2[8]); + // } + } __global__ void fvm_ddt_vector_kernel(int num_cells, double rDeltaT, @@ -488,9 +495,9 @@ __global__ void fvc_grad_vector_correctBC_zeroGradient(int num, int offset, cons double vfy = vf[cellIndex * 3 + 1]; double vfz = vf[cellIndex * 3 + 2]; - double n_x = boundary_sf[cellIndex * 3 + 0] / boundary_mag_sf[cellIndex]; - double n_y = boundary_sf[cellIndex * 3 + 1] / boundary_mag_sf[cellIndex]; - double n_z = boundary_sf[cellIndex * 3 + 2] / boundary_mag_sf[cellIndex]; + double n_x = boundary_sf[start_index * 3 + 0] / boundary_mag_sf[start_index]; + double n_y = boundary_sf[start_index * 3 + 1] / boundary_mag_sf[start_index]; + double n_z = boundary_sf[start_index * 3 + 2] / boundary_mag_sf[start_index]; double grad_correction_x = - (n_x * grad_xx + n_y * grad_yx + n_z * grad_zx); // sn_grad_x = 0 double grad_correction_y = - (n_x * grad_xy + n_y * grad_yy + n_z * grad_zy); @@ -670,9 +677,9 @@ __global__ void fvc_div_cell_tensor_internal(int num_surfaces, double ssf_zx = (w * (vf[owner * 9 + 6] - vf[neighbor * 9 + 6]) + vf[neighbor * 9 + 6]); double ssf_zy = (w * (vf[owner * 9 + 7] - vf[neighbor 
* 9 + 7]) + vf[neighbor * 9 + 7]); double ssf_zz = (w * (vf[owner * 9 + 8] - vf[neighbor * 9 + 8]) + vf[neighbor * 9 + 8]); - double div_x = Sfx * ssf_xx + Sfy * ssf_xy + Sfz * ssf_xz; - double div_y = Sfx * ssf_yx + Sfy * ssf_yy + Sfz * ssf_yz; - double div_z = Sfx * ssf_zx + Sfy * ssf_zy + Sfz * ssf_zz; + double div_x = Sfx * ssf_xx + Sfy * ssf_yx + Sfz * ssf_zx; + double div_y = Sfx * ssf_xy + Sfy * ssf_yy + Sfz * ssf_zy; + double div_z = Sfx * ssf_xz + Sfy * ssf_yz + Sfz * ssf_zz; // owner atomicAdd(&(output[owner * 3 + 0]), div_x); @@ -683,6 +690,8 @@ __global__ void fvc_div_cell_tensor_internal(int num_surfaces, atomicAdd(&(output[neighbor * 3 + 0]), -div_x); atomicAdd(&(output[neighbor * 3 + 1]), -div_y); atomicAdd(&(output[neighbor * 3 + 2]), -div_z); + + } __global__ void fvc_div_cell_tensor_boundary(int num, int offset, const int *face2Cells, @@ -709,13 +718,30 @@ __global__ void fvc_div_cell_tensor_boundary(int num, int offset, const int *fac double boussf_zz = boundary_vf[start_index * 9 + 8]; int cellIndex = face2Cells[start_index]; - double bouDiv_x = bouSfx * boussf_xx + bouSfy * boussf_xy + bouSfz * boussf_xz; - double bouDiv_y = bouSfx * boussf_yx + bouSfy * boussf_yy + bouSfz * boussf_yz; - double bouDiv_z = bouSfx * boussf_zx + bouSfy * boussf_zy + bouSfz * boussf_zz; + double bouDiv_x = bouSfx * boussf_xx + bouSfy * boussf_yx + bouSfz * boussf_zx; + double bouDiv_y = bouSfx * boussf_xy + bouSfy * boussf_yy + bouSfz * boussf_zy; + double bouDiv_z = bouSfx * boussf_xz + bouSfy * boussf_yz + bouSfz * boussf_zz; atomicAdd(&(output[cellIndex * 3 + 0]), bouDiv_x); atomicAdd(&(output[cellIndex * 3 + 1]), bouDiv_y); atomicAdd(&(output[cellIndex * 3 + 2]), bouDiv_z); + + // if (cellIndex == 0) + // { + // // printf("gpu output[0] = %.5e, %.5e, %.5e\n", output[0], output[1], output[2]); + // // printf("gpu output[0] += %.5e, %.5e, %.5e\n", bouDiv_x, bouDiv_y, bouDiv_z); + // printf("gpu bouvf[0] = (%.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e)\n", 
+ // boussf_xx, boussf_xy, boussf_xz, boussf_yx, boussf_yy, boussf_yz, boussf_zx, boussf_zy, boussf_zz); + // printf("gpu bouSf[0] = (%.5e, %.5e, %.5e)\n", bouSfx, bouSfy, bouSfz); + // printf("gpu boufinal[0] = (%.5e, %.5e, %.5e)\n", bouDiv_x, bouDiv_y, bouDiv_z); + // printf("bouIndex = %d\n\n", start_index); + // } + + // if (index == 0) + // { + // printf("bou_grad_U = (%.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e)", vf2[0], vf2[1], vf2[2], + // vf2[3], vf2[4], vf2[5], vf2[6], vf2[7], vf2[8]); + // } } void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output) diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index 4d1c25697..7b28f082c 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -107,5 +107,7 @@ public: void postProcess(double *h_u); void solve(); - void compareResult(const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs, bool printFlag); + void compareResult(const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs, + // const double *tmpVal, + bool printFlag); }; diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 470550cd0..90efcf670 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -103,39 +103,39 @@ void dfUEqn::process() { fvm_ddt_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, dataBase_.d_rho, dataBase_.d_rho_old, dataBase_.d_u, dataBase_.d_volume, - d_diag, d_source, -1.); + d_diag, d_source, 1.); fvm_div_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_phi, dataBase_.d_weight, d_lower, d_upper, d_diag, // end for internal dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_phi, d_value_internal_coeffs, d_value_boundary_coeffs, - d_internal_coeffs, d_boundary_coeffs, -1.); - 
//field_multiply_scalar(dataBase_.stream, - // dataBase_.num_cells, dataBase_.d_rho, d_nu_eff, d_rho_nueff, // end for internal - // dataBase_.num_boundary_surfaces, dataBase_.d_boundary_rho, d_boundary_nu_eff, d_boundary_rho_nueff); - //fvm_laplacian_vector(dataBase_.stream, dataBase_.num_surfaces, - // dataBase_.d_owner, dataBase_.d_neighbor, - // dataBase_.d_weight, dataBase_.d_mag_sf, dataBase_.d_delta_coeffs, d_rho_nueff, - // d_lower, d_upper, d_diag, // end for internal - // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), - // dataBase_.d_boundary_mag_sf, d_boundary_rho_nueff, - // d_gradient_internal_coeffs, d_gradient_boundary_coeffs, - // d_internal_coeffs, d_boundary_coeffs); - //fvc_grad_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, - // dataBase_.d_owner, dataBase_.d_neighbor, - // dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_u, d_grad_u, - // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), - // dataBase_.d_boundary_face_cell, dataBase_.d_boundary_u, dataBase_.d_boundary_sf, - // dataBase_.d_volume, dataBase_.d_boundary_mag_sf, d_boundary_grad_u, dataBase_.d_boundary_delta_coeffs); - //scale_dev2T_tensor(dataBase_.stream, dataBase_.num_cells, d_rho_nueff, d_grad_u, // end for internal - // dataBase_.num_boundary_surfaces, d_boundary_rho_nueff, d_boundary_grad_u); - //fvc_div_cell_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, - // dataBase_.d_owner, dataBase_.d_neighbor, - // dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_fvc_output, // end for internal - // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), - // dataBase_.d_boundary_face_cell, d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume); - //fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, - // dataBase_.d_volume, d_fvc_output, d_source); + d_internal_coeffs, d_boundary_coeffs, 1.); + field_multiply_scalar(dataBase_.stream, + dataBase_.num_cells, 
dataBase_.d_rho, d_nu_eff, d_rho_nueff, // end for internal + dataBase_.num_boundary_surfaces, dataBase_.d_boundary_rho, d_boundary_nu_eff, d_boundary_rho_nueff); + fvm_laplacian_vector(dataBase_.stream, dataBase_.num_surfaces, + dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_weight, dataBase_.d_mag_sf, dataBase_.d_delta_coeffs, d_rho_nueff, + d_lower, d_upper, d_diag, // end for internal + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_mag_sf, d_boundary_rho_nueff, + d_gradient_internal_coeffs, d_gradient_boundary_coeffs, + d_internal_coeffs, d_boundary_coeffs, -1); + fvc_grad_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, + dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_u, d_grad_u, + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_face_cell, dataBase_.d_boundary_u, dataBase_.d_boundary_sf, + dataBase_.d_volume, dataBase_.d_boundary_mag_sf, d_boundary_grad_u, dataBase_.d_boundary_delta_coeffs); + scale_dev2T_tensor(dataBase_.stream, dataBase_.num_cells, d_rho_nueff, d_grad_u, // end for internal + dataBase_.num_boundary_surfaces, d_boundary_rho_nueff, d_boundary_grad_u); + fvc_div_cell_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, + dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_fvc_output, // end for internal + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_face_cell, d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume); + fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, + dataBase_.d_volume, d_fvc_output, d_source); fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_fvc_output, @@ -217,7 +217,10 @@ double* dfUEqn::getFieldPointer(const 
char* fieldAlias, location loc, position p return pointer; } -void dfUEqn::compareResult(const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs, bool printFlag) +void dfUEqn::compareResult(const double *lower, const double *upper, const double *diag, + const double *source, const double *internal_coeffs, const double *boundary_coeffs, + // const double *tmpVal, + bool printFlag) { DEBUG_TRACE; std::vector h_lower; @@ -241,7 +244,7 @@ void dfUEqn::compareResult(const double *lower, const double *upper, const doubl std::vector h_source; h_source.resize(dataBase_.num_cells * 3); checkCudaErrors(cudaMemcpy(h_source.data(), d_source, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); - checkVectorEqual(dataBase_.num_cells, source, h_source.data(), 1e-14, printFlag); + checkVectorEqual(dataBase_.num_cells * 3, source, h_source.data(), 1e-14, printFlag); DEBUG_TRACE; std::vector h_internal_coeffs; @@ -255,5 +258,11 @@ void dfUEqn::compareResult(const double *lower, const double *upper, const doubl checkCudaErrors(cudaMemcpy(h_boundary_coeffs.data(), d_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); checkVectorEqual(dataBase_.num_boundary_surfaces * 3, boundary_coeffs, h_boundary_coeffs.data(), 1e-14, printFlag); DEBUG_TRACE; + + // std::vector h_tmpVal; + // h_tmpVal.resize(dataBase_.num_cells * 3); + // checkCudaErrors(cudaMemcpy(h_tmpVal.data(), d_fvc_output, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); + // checkVectorEqual(dataBase_.num_cells * 3, tmpVal, h_tmpVal.data(), 1e-14, printFlag); + // DEBUG_TRACE; } From f69147863645ccc2eca9b52e5bf8eae836f04e10 Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Thu, 17 Aug 2023 21:52:39 +0800 Subject: [PATCH 23/25] primiry opt & add time monitor --- applications/solvers/dfLowMachFoam/new_UEqn.H | 22 ++ src_gpu/dfMatrixOpBase.H | 20 +- src_gpu/dfMatrixOpBase.cu | 343 
++++++++++-------- src_gpu/dfUEqn.H | 2 + src_gpu/dfUEqn.cu | 54 ++- 5 files changed, 273 insertions(+), 168 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/new_UEqn.H b/applications/solvers/dfLowMachFoam/new_UEqn.H index 231c38c29..9d94d27b6 100644 --- a/applications/solvers/dfLowMachFoam/new_UEqn.H +++ b/applications/solvers/dfLowMachFoam/new_UEqn.H @@ -20,6 +20,7 @@ fvVectorMatrix& UEqn = tUEqn.ref(); // run GPU // preProcess // TODO: preProcessForRhoEqn for temp, now we only transfer phi(instead of rhoEqn) used by fvm::div(phi, U) +UEqn_GPU.sync(); double *h_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::internal); double *h_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::internal); double *h_boundary_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::boundary); @@ -35,6 +36,7 @@ forAll(phi.boundaryField(), patchi) } UEqn_GPU.preProcessForRhoEqn(h_rho, h_phi, h_boundary_phi); DEBUG_TRACE; +clock_t start = std::clock(); // preparing u, p, nu_eff, and rho.boundary used by UEqn_GPU.preProcess() double *h_u = dfDataBase.getFieldPointer("u", location::cpu, position::internal); double *h_boundary_u = dfDataBase.getFieldPointer("u", location::cpu, position::boundary); @@ -43,9 +45,17 @@ double *h_boundary_p = dfDataBase.getFieldPointer("p", location::cpu, position:: double *h_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::internal); double *h_boundary_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::boundary); double *h_boundary_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::boundary); +double end = std::clock(); +Info << "get pointer" << double(end - start) / double(CLOCKS_PER_SEC) << endl; + +start = std::clock(); memcpy(h_u, &U.oldTime()[0][0], dfDataBase.cell_value_vec_bytes); memcpy(h_p, &p[0], dfDataBase.cell_value_bytes); memcpy(h_nu_eff, &nuEff[0], dfDataBase.cell_value_bytes); +end = std::clock(); +Info << "copy to pinned memory" << 
double(end - start) / double(CLOCKS_PER_SEC) << endl; + +start = std::clock(); offset = 0; forAll(U.boundaryField(), patchi) { @@ -60,12 +70,24 @@ forAll(U.boundaryField(), patchi) memcpy(h_boundary_rho + offset, &patchRho[0], patchsize * sizeof(double)); offset += patchsize; } +end = std::clock(); +Info << "CPU prepare boundary time" << double(end - start) / double(CLOCKS_PER_SEC) << endl; + +start = std::clock(); UEqn_GPU.preProcess(h_u, h_boundary_u, h_p, h_boundary_p, h_nu_eff, h_boundary_nu_eff, h_boundary_rho); DEBUG_TRACE; +UEqn_GPU.sync(); +end = std::clock(); +Info << "GPU preProcess time" << double(end - start) / double(CLOCKS_PER_SEC) << endl; // process +start = std::clock(); UEqn_GPU.process(); +end = std::clock(); DEBUG_TRACE; +UEqn_GPU.sync(); +// end = std::clock(); +Info << "GPU process time" << double(end - start) / double(CLOCKS_PER_SEC) << endl; // postProcess UEqn_GPU.postProcess(h_u); diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index ae303cedb..f64220186 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -1,5 +1,21 @@ #pragma once +#define TICK_INIT \ + float time_elapsed_kernel=0;\ + cudaEvent_t start_kernel, stop_kernel;\ + checkCudaErrors(cudaEventCreate(&start_kernel));\ + checkCudaErrors(cudaEventCreate(&stop_kernel)); + +#define TICK_START \ + checkCudaErrors(cudaEventRecord(start_kernel,0)); + +#define TICK_END(prefix) \ + checkCudaErrors(cudaEventRecord(stop_kernel,0));\ + checkCudaErrors(cudaEventSynchronize(start_kernel));\ + checkCudaErrors(cudaEventSynchronize(stop_kernel));\ + checkCudaErrors(cudaEventElapsedTime(&time_elapsed_kernel,start_kernel,stop_kernel));\ + printf("try %s 执行时间:%lf(ms)\n", #prefix, time_elapsed_kernel); + // tools void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output); void permute_vector_h2d(cudaStream_t stream, int num_cells, const double *input, double *output); @@ -49,7 +65,7 @@ void fvc_ddt_scalar(cudaStream_t stream, 
int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *vf_old, double *output, double sign = 1.); -void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, +void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *Sf, const double *vf, double *output, // end for internal int num_patches, const int *patch_size, const int *patch_type, @@ -68,7 +84,7 @@ void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign = 1.); -void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, +void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *Sf, const double *vf, double *output, // end for internal int num_patches, const int *patch_size, const int *patch_type, diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index f8d71cca7..d4f6ea7f8 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -45,9 +45,12 @@ __global__ void fvc_to_source_vector_kernel(int num_cells, const double *volume, if (index >= num_cells) return; - source[index * 3 + 0] += fvc_output[index * 3 + 0] * volume[index]; - source[index * 3 + 1] += fvc_output[index * 3 + 1] * volume[index]; - source[index * 3 + 2] += fvc_output[index * 3 + 2] * volume[index]; + // source[index * 3 + 0] += fvc_output[index * 3 + 0] * volume[index]; + // source[index * 3 + 1] += fvc_output[index * 3 + 1] * volume[index]; + // source[index * 3 + 2] += fvc_output[index * 3 + 2] * volume[index]; + source[index * 3 + 0] += fvc_output[index * 3 + 0]; + source[index * 3 + 1] += fvc_output[index * 3 + 1]; + source[index * 3 + 2] += 
fvc_output[index * 3 + 2]; } __global__ void update_boundary_coeffs_zeroGradient_vector(int num, int offset, @@ -84,25 +87,25 @@ __global__ void scale_dev2t_tensor_kernel(int num, const double *vf1, double *vf return; double scale = vf1[index]; - double val_xx = vf2[index * 9 + 0]; - double val_xy = vf2[index * 9 + 1]; - double val_xz = vf2[index * 9 + 2]; - double val_yx = vf2[index * 9 + 3]; - double val_yy = vf2[index * 9 + 4]; - double val_yz = vf2[index * 9 + 5]; - double val_zx = vf2[index * 9 + 6]; - double val_zy = vf2[index * 9 + 7]; - double val_zz = vf2[index * 9 + 8]; + double val_xx = vf2[num * 0 + index]; + double val_xy = vf2[num * 1 + index]; + double val_xz = vf2[num * 2 + index]; + double val_yx = vf2[num * 3 + index]; + double val_yy = vf2[num * 4 + index]; + double val_yz = vf2[num * 5 + index]; + double val_zx = vf2[num * 6 + index]; + double val_zy = vf2[num * 7 + index]; + double val_zz = vf2[num * 8 + index]; double trace_coeff = (2. / 3.) * (val_xx + val_yy + val_zz); - vf2[index * 9 + 0] = scale * (val_xx - trace_coeff); - vf2[index * 9 + 1] = scale * val_yx; - vf2[index * 9 + 2] = scale * val_zx; - vf2[index * 9 + 3] = scale * val_xy; - vf2[index * 9 + 4] = scale * (val_yy - trace_coeff); - vf2[index * 9 + 5] = scale * val_zy; - vf2[index * 9 + 6] = scale * val_xz; - vf2[index * 9 + 7] = scale * val_yz; - vf2[index * 9 + 8] = scale * (val_zz - trace_coeff); + vf2[num * 0 + index] = scale * (val_xx - trace_coeff); + vf2[num * 1 + index] = scale * val_yx; + vf2[num * 2 + index] = scale * val_zx; + vf2[num * 3 + index] = scale * val_xy; + vf2[num * 4 + index] = scale * (val_yy - trace_coeff); + vf2[num * 5 + index] = scale * val_zy; + vf2[num * 6 + index] = scale * val_xz; + vf2[num * 7 + index] = scale * val_yz; + vf2[num * 8 + index] = scale * (val_zz - trace_coeff); // if (index == 0) // { @@ -120,11 +123,14 @@ __global__ void fvm_ddt_vector_kernel(int num_cells, double rDeltaT, if (index >= num_cells) return; - diag[index] += rDeltaT * 
rho[index] * volume[index] * sign; + double vol = volume[index]; + double rho_old_kernel = rho_old[index]; + + diag[index] += rDeltaT * rho[index] * vol * sign; // TODO: skip moving - source[index * 3 + 0] += rDeltaT * rho_old[index] * vf[index * 3 + 0] * volume[index] * sign; - source[index * 3 + 1] += rDeltaT * rho_old[index] * vf[index * 3 + 1] * volume[index] * sign; - source[index * 3 + 2] += rDeltaT * rho_old[index] * vf[index * 3 + 2] * volume[index] * sign; + source[index * 3 + 0] += rDeltaT * rho_old_kernel * vf[index * 3 + 0] * vol * sign; + source[index * 3 + 1] += rDeltaT * rho_old_kernel * vf[index * 3 + 1] * vol * sign; + source[index * 3 + 2] += rDeltaT * rho_old_kernel * vf[index * 3 + 2] * vol * sign; } __global__ void fvm_div_vector_internal(int num_surfaces, @@ -254,7 +260,7 @@ __global__ void fvc_ddt_scalar_kernel(int num_cells, double rDeltaT, output[index] += rDeltaT * (rho[index] * vf[index] - rho_old[index] * vf_old[index]) * sign; } -__global__ void fvc_grad_vector_internal(int num_surfaces, +__global__ void fvc_grad_vector_internal(int num_cells, int num_surfaces, const int *lower_index, const int *upper_index, const double *face_vector, const double *weight, const double *field_vector, double *output) @@ -286,34 +292,35 @@ __global__ void fvc_grad_vector_internal(int num_surfaces, double grad_zz = Sfz * ssfz; // owner - atomicAdd(&(output[owner * 9 + 0]), grad_xx); - atomicAdd(&(output[owner * 9 + 1]), grad_xy); - atomicAdd(&(output[owner * 9 + 2]), grad_xz); - atomicAdd(&(output[owner * 9 + 3]), grad_yx); - atomicAdd(&(output[owner * 9 + 4]), grad_yy); - atomicAdd(&(output[owner * 9 + 5]), grad_yz); - atomicAdd(&(output[owner * 9 + 6]), grad_zx); - atomicAdd(&(output[owner * 9 + 7]), grad_zy); - atomicAdd(&(output[owner * 9 + 8]), grad_zz); + atomicAdd(&(output[num_cells * 0 + owner]), grad_xx); + atomicAdd(&(output[num_cells * 1 + owner]), grad_xy); + atomicAdd(&(output[num_cells * 2 + owner]), grad_xz); + atomicAdd(&(output[num_cells * 
3 + owner]), grad_yx); + atomicAdd(&(output[num_cells * 4 + owner]), grad_yy); + atomicAdd(&(output[num_cells * 5 + owner]), grad_yz); + atomicAdd(&(output[num_cells * 6 + owner]), grad_zx); + atomicAdd(&(output[num_cells * 7 + owner]), grad_zy); + atomicAdd(&(output[num_cells * 8 + owner]), grad_zz); // neighbour - atomicAdd(&(output[neighbor * 9 + 0]), -grad_xx); - atomicAdd(&(output[neighbor * 9 + 1]), -grad_xy); - atomicAdd(&(output[neighbor * 9 + 2]), -grad_xz); - atomicAdd(&(output[neighbor * 9 + 3]), -grad_yx); - atomicAdd(&(output[neighbor * 9 + 4]), -grad_yy); - atomicAdd(&(output[neighbor * 9 + 5]), -grad_yz); - atomicAdd(&(output[neighbor * 9 + 6]), -grad_zx); - atomicAdd(&(output[neighbor * 9 + 7]), -grad_zy); - atomicAdd(&(output[neighbor * 9 + 8]), -grad_zz); + atomicAdd(&(output[num_cells * 0 + neighbor]), -grad_xx); + atomicAdd(&(output[num_cells * 1 + neighbor]), -grad_xy); + atomicAdd(&(output[num_cells * 2 + neighbor]), -grad_xz); + atomicAdd(&(output[num_cells * 3 + neighbor]), -grad_yx); + atomicAdd(&(output[num_cells * 4 + neighbor]), -grad_yy); + atomicAdd(&(output[num_cells * 5 + neighbor]), -grad_yz); + atomicAdd(&(output[num_cells * 6 + neighbor]), -grad_zx); + atomicAdd(&(output[num_cells * 7 + neighbor]), -grad_zy); + atomicAdd(&(output[num_cells * 8 + neighbor]), -grad_zz); } // update boundary of interpolation field // calculate the grad field // TODO: this function is implemented for uncoupled boundary conditions // so it should use the more specific func name -__global__ void fvc_grad_vector_boundary(int num, int offset, const int *face2Cells, - const double *boundary_face_vector, const double *boundary_field_vector, double *output) +__global__ void fvc_grad_vector_boundary(int num_cells, int num, + int offset, const int *face2Cells, const double *boundary_face_vector, + const double *boundary_field_vector, double *output) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num) @@ -341,20 +348,20 @@ __global__ void 
fvc_grad_vector_boundary(int num, int offset, const int *face2Ce double grad_zy = bouSfz * boussfy; double grad_zz = bouSfz * boussfz; - atomicAdd(&(output[cellIndex * 9 + 0]), grad_xx); - atomicAdd(&(output[cellIndex * 9 + 1]), grad_xy); - atomicAdd(&(output[cellIndex * 9 + 2]), grad_xz); - atomicAdd(&(output[cellIndex * 9 + 3]), grad_yx); - atomicAdd(&(output[cellIndex * 9 + 4]), grad_yy); - atomicAdd(&(output[cellIndex * 9 + 5]), grad_yz); - atomicAdd(&(output[cellIndex * 9 + 6]), grad_zx); - atomicAdd(&(output[cellIndex * 9 + 7]), grad_zy); - atomicAdd(&(output[cellIndex * 9 + 8]), grad_zz); + atomicAdd(&(output[num_cells * 0 + cellIndex]), grad_xx); + atomicAdd(&(output[num_cells * 1 + cellIndex]), grad_xy); + atomicAdd(&(output[num_cells * 2 + cellIndex]), grad_xz); + atomicAdd(&(output[num_cells * 3 + cellIndex]), grad_yx); + atomicAdd(&(output[num_cells * 4 + cellIndex]), grad_yy); + atomicAdd(&(output[num_cells * 5 + cellIndex]), grad_yz); + atomicAdd(&(output[num_cells * 6 + cellIndex]), grad_zx); + atomicAdd(&(output[num_cells * 7 + cellIndex]), grad_zy); + atomicAdd(&(output[num_cells * 8 + cellIndex]), grad_zz); } __global__ void fvc_grad_scalar_internal(int num_cells, int num_surfaces, const int *lower_index, const int *upper_index, const double *face_vector, - const double *weight, const double *vf, double *output) + const double *weight, const double *vf, double *output, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_surfaces) @@ -370,9 +377,9 @@ __global__ void fvc_grad_scalar_internal(int num_cells, int num_surfaces, double ssf = (w * (vf[owner] - vf[neighbor]) + vf[neighbor]); - double grad_x = Sfx * ssf; - double grad_y = Sfy * ssf; - double grad_z = Sfz * ssf; + double grad_x = Sfx * ssf * sign; + double grad_y = Sfy * ssf * sign; + double grad_z = Sfz * ssf * sign; // // owner // atomicAdd(&(output[num_cells * 0 + owner]), grad_x); @@ -397,7 +404,7 @@ __global__ void fvc_grad_scalar_internal(int num_cells, 
int num_surfaces, } __global__ void fvc_grad_scalar_boundary(int num_cells, int num, int offset, const int *face2Cells, - const double *boundary_face_vector, const double *boundary_vf, double *output) + const double *boundary_face_vector, const double *boundary_vf, double *output, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num) @@ -416,9 +423,9 @@ __global__ void fvc_grad_scalar_boundary(int num_cells, int num, int offset, con double grad_y = bouSfy * bouvf; double grad_z = bouSfz * bouvf; - atomicAdd(&(output[cellIndex * 3 + 0]), grad_x); - atomicAdd(&(output[cellIndex * 3 + 1]), grad_y); - atomicAdd(&(output[cellIndex * 3 + 2]), grad_z); + atomicAdd(&(output[cellIndex * 3 + 0]), grad_x * sign); + atomicAdd(&(output[cellIndex * 3 + 1]), grad_y * sign); + atomicAdd(&(output[cellIndex * 3 + 2]), grad_z * sign); // if (cellIndex == 5) // { @@ -434,15 +441,15 @@ __global__ void divide_cell_volume_tsr(int num_cells, const double* volume, doub return; double vol = volume[index]; - output[index * 9 + 0] = output[index * 9 + 0] / vol * sign; - output[index * 9 + 1] = output[index * 9 + 1] / vol * sign; - output[index * 9 + 2] = output[index * 9 + 2] / vol * sign; - output[index * 9 + 3] = output[index * 9 + 3] / vol * sign; - output[index * 9 + 4] = output[index * 9 + 4] / vol * sign; - output[index * 9 + 5] = output[index * 9 + 5] / vol * sign; - output[index * 9 + 6] = output[index * 9 + 6] / vol * sign; - output[index * 9 + 7] = output[index * 9 + 7] / vol * sign; - output[index * 9 + 8] = output[index * 9 + 8] / vol * sign; + output[num_cells * 0 + index] = output[num_cells * 0 + index] / vol * sign; + output[num_cells * 1 + index] = output[num_cells * 1 + index] / vol * sign; + output[num_cells * 2 + index] = output[num_cells * 2 + index] / vol * sign; + output[num_cells * 3 + index] = output[num_cells * 3 + index] / vol * sign; + output[num_cells * 4 + index] = output[num_cells * 4 + index] / vol * sign; + output[num_cells * 5 + 
index] = output[num_cells * 5 + index] / vol * sign; + output[num_cells * 6 + index] = output[num_cells * 6 + index] / vol * sign; + output[num_cells * 7 + index] = output[num_cells * 7 + index] / vol * sign; + output[num_cells * 8 + index] = output[num_cells * 8 + index] / vol * sign; } __global__ void divide_cell_volume_vec(int num_cells, const double* volume, double *output, double sign) @@ -469,7 +476,8 @@ __global__ void divide_cell_volume_scalar(int num_cells, const double* volume, d output[index] = output[index] / vol * sign; } -__global__ void fvc_grad_vector_correctBC_zeroGradient(int num, int offset, const int *face2Cells, +__global__ void fvc_grad_vector_correctBC_zeroGradient(int num_cells, int num_boundary_surfaces, + int num, int offset, const int *face2Cells, const double *internal_grad, const double *vf, const double *boundary_sf, const double *boundary_mag_sf, double *boundary_grad, double sign) { @@ -481,15 +489,15 @@ __global__ void fvc_grad_vector_correctBC_zeroGradient(int num, int offset, cons int cellIndex = face2Cells[start_index]; - double grad_xx = internal_grad[cellIndex * 9 + 0]; - double grad_xy = internal_grad[cellIndex * 9 + 1]; - double grad_xz = internal_grad[cellIndex * 9 + 2]; - double grad_yx = internal_grad[cellIndex * 9 + 3]; - double grad_yy = internal_grad[cellIndex * 9 + 4]; - double grad_yz = internal_grad[cellIndex * 9 + 5]; - double grad_zx = internal_grad[cellIndex * 9 + 6]; - double grad_zy = internal_grad[cellIndex * 9 + 7]; - double grad_zz = internal_grad[cellIndex * 9 + 8]; + double grad_xx = internal_grad[num_cells * 0 + cellIndex]; + double grad_xy = internal_grad[num_cells * 1 + cellIndex]; + double grad_xz = internal_grad[num_cells * 2 + cellIndex]; + double grad_yx = internal_grad[num_cells * 3 + cellIndex]; + double grad_yy = internal_grad[num_cells * 4 + cellIndex]; + double grad_yz = internal_grad[num_cells * 5 + cellIndex]; + double grad_zx = internal_grad[num_cells * 6 + cellIndex]; + double grad_zy = 
internal_grad[num_cells * 7 + cellIndex]; + double grad_zz = internal_grad[num_cells * 8 + cellIndex]; double vfx = vf[cellIndex * 3 + 0]; double vfy = vf[cellIndex * 3 + 1]; @@ -503,15 +511,15 @@ __global__ void fvc_grad_vector_correctBC_zeroGradient(int num, int offset, cons double grad_correction_y = - (n_x * grad_xy + n_y * grad_yy + n_z * grad_zy); double grad_correction_z = - (n_x * grad_xz + n_y * grad_yz + n_z * grad_zz); - boundary_grad[start_index * 9 + 0] = (grad_xx + n_x * grad_correction_x) * sign; - boundary_grad[start_index * 9 + 1] = (grad_xy + n_x * grad_correction_y) * sign; - boundary_grad[start_index * 9 + 2] = (grad_xz + n_x * grad_correction_z) * sign; - boundary_grad[start_index * 9 + 3] = (grad_yx + n_y * grad_correction_x) * sign; - boundary_grad[start_index * 9 + 4] = (grad_yy + n_y * grad_correction_y) * sign; - boundary_grad[start_index * 9 + 5] = (grad_yz + n_y * grad_correction_z) * sign; - boundary_grad[start_index * 9 + 6] = (grad_zx + n_z * grad_correction_x) * sign; - boundary_grad[start_index * 9 + 7] = (grad_zy + n_z * grad_correction_y) * sign; - boundary_grad[start_index * 9 + 8] = (grad_zz + n_z * grad_correction_z) * sign; + boundary_grad[num_boundary_surfaces * 0 + start_index] = (grad_xx + n_x * grad_correction_x) * sign; + boundary_grad[num_boundary_surfaces * 1 + start_index] = (grad_xy + n_x * grad_correction_y) * sign; + boundary_grad[num_boundary_surfaces * 2 + start_index] = (grad_xz + n_x * grad_correction_z) * sign; + boundary_grad[num_boundary_surfaces * 3 + start_index] = (grad_yx + n_y * grad_correction_x) * sign; + boundary_grad[num_boundary_surfaces * 4 + start_index] = (grad_yy + n_y * grad_correction_y) * sign; + boundary_grad[num_boundary_surfaces * 5 + start_index] = (grad_yz + n_y * grad_correction_z) * sign; + boundary_grad[num_boundary_surfaces * 6 + start_index] = (grad_zx + n_z * grad_correction_x) * sign; + boundary_grad[num_boundary_surfaces * 7 + start_index] = (grad_zy + n_z * grad_correction_y) * 
sign; + boundary_grad[num_boundary_surfaces * 8 + start_index] = (grad_zz + n_z * grad_correction_z) * sign; } __global__ void fvc_grad_vector_correctBC_fixedValue(int num, int offset, const int *face2Cells, @@ -652,10 +660,10 @@ __global__ void fvc_div_cell_vector_boundary(int num, int offset, const int *fac } -__global__ void fvc_div_cell_tensor_internal(int num_surfaces, +__global__ void fvc_div_cell_tensor_internal(int num_cells, int num_surfaces, const int *lower_index, const int *upper_index, const double *vf, const double *weight, const double *face_vector, - double *output) + double *output, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_surfaces) @@ -668,19 +676,19 @@ __global__ void fvc_div_cell_tensor_internal(int num_surfaces, int owner = lower_index[index]; int neighbor = upper_index[index]; - double ssf_xx = (w * (vf[owner * 9 + 0] - vf[neighbor * 9 + 0]) + vf[neighbor * 9 + 0]); - double ssf_xy = (w * (vf[owner * 9 + 1] - vf[neighbor * 9 + 1]) + vf[neighbor * 9 + 1]); - double ssf_xz = (w * (vf[owner * 9 + 2] - vf[neighbor * 9 + 2]) + vf[neighbor * 9 + 2]); - double ssf_yx = (w * (vf[owner * 9 + 3] - vf[neighbor * 9 + 3]) + vf[neighbor * 9 + 3]); - double ssf_yy = (w * (vf[owner * 9 + 4] - vf[neighbor * 9 + 4]) + vf[neighbor * 9 + 4]); - double ssf_yz = (w * (vf[owner * 9 + 5] - vf[neighbor * 9 + 5]) + vf[neighbor * 9 + 5]); - double ssf_zx = (w * (vf[owner * 9 + 6] - vf[neighbor * 9 + 6]) + vf[neighbor * 9 + 6]); - double ssf_zy = (w * (vf[owner * 9 + 7] - vf[neighbor * 9 + 7]) + vf[neighbor * 9 + 7]); - double ssf_zz = (w * (vf[owner * 9 + 8] - vf[neighbor * 9 + 8]) + vf[neighbor * 9 + 8]); - double div_x = Sfx * ssf_xx + Sfy * ssf_yx + Sfz * ssf_zx; - double div_y = Sfx * ssf_xy + Sfy * ssf_yy + Sfz * ssf_zy; - double div_z = Sfx * ssf_xz + Sfy * ssf_yz + Sfz * ssf_zz; - + double ssf_xx = (w * (vf[num_cells * 0 + owner] - vf[num_cells * 0 + neighbor]) + vf[num_cells * 0 + neighbor]); + double ssf_xy = (w * 
(vf[num_cells * 1 + owner] - vf[num_cells * 1 + neighbor]) + vf[num_cells * 1 + neighbor]); + double ssf_xz = (w * (vf[num_cells * 2 + owner] - vf[num_cells * 2 + neighbor]) + vf[num_cells * 2 + neighbor]); + double ssf_yx = (w * (vf[num_cells * 3 + owner] - vf[num_cells * 3 + neighbor]) + vf[num_cells * 3 + neighbor]); + double ssf_yy = (w * (vf[num_cells * 4 + owner] - vf[num_cells * 4 + neighbor]) + vf[num_cells * 4 + neighbor]); + double ssf_yz = (w * (vf[num_cells * 5 + owner] - vf[num_cells * 5 + neighbor]) + vf[num_cells * 5 + neighbor]); + double ssf_zx = (w * (vf[num_cells * 6 + owner] - vf[num_cells * 6 + neighbor]) + vf[num_cells * 6 + neighbor]); + double ssf_zy = (w * (vf[num_cells * 7 + owner] - vf[num_cells * 7 + neighbor]) + vf[num_cells * 7 + neighbor]); + double ssf_zz = (w * (vf[num_cells * 8 + owner] - vf[num_cells * 8 + neighbor]) + vf[num_cells * 8 + neighbor]); + double div_x = (Sfx * ssf_xx + Sfy * ssf_yx + Sfz * ssf_zx) * sign; + double div_y = (Sfx * ssf_xy + Sfy * ssf_yy + Sfz * ssf_zy) * sign; + double div_z = (Sfx * ssf_xz + Sfy * ssf_yz + Sfz * ssf_zz) * sign; + // owner atomicAdd(&(output[owner * 3 + 0]), div_x); atomicAdd(&(output[owner * 3 + 1]), div_y); @@ -694,8 +702,8 @@ __global__ void fvc_div_cell_tensor_internal(int num_surfaces, } -__global__ void fvc_div_cell_tensor_boundary(int num, int offset, const int *face2Cells, - const double *boundary_face_vector, const double *boundary_vf, double *output) +__global__ void fvc_div_cell_tensor_boundary(int num_boundary_faces, int num, int offset, const int *face2Cells, + const double *boundary_face_vector, const double *boundary_vf, double *output, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num) @@ -707,20 +715,20 @@ __global__ void fvc_div_cell_tensor_boundary(int num, int offset, const int *fac double bouSfy = boundary_face_vector[start_index * 3 + 1]; double bouSfz = boundary_face_vector[start_index * 3 + 2]; - double boussf_xx = 
boundary_vf[start_index * 9 + 0]; - double boussf_xy = boundary_vf[start_index * 9 + 1]; - double boussf_xz = boundary_vf[start_index * 9 + 2]; - double boussf_yx = boundary_vf[start_index * 9 + 3]; - double boussf_yy = boundary_vf[start_index * 9 + 4]; - double boussf_yz = boundary_vf[start_index * 9 + 5]; - double boussf_zx = boundary_vf[start_index * 9 + 6]; - double boussf_zy = boundary_vf[start_index * 9 + 7]; - double boussf_zz = boundary_vf[start_index * 9 + 8]; + double boussf_xx = boundary_vf[num_boundary_faces * 0 + start_index]; + double boussf_xy = boundary_vf[num_boundary_faces * 1 + start_index]; + double boussf_xz = boundary_vf[num_boundary_faces * 2 + start_index]; + double boussf_yx = boundary_vf[num_boundary_faces * 3 + start_index]; + double boussf_yy = boundary_vf[num_boundary_faces * 4 + start_index]; + double boussf_yz = boundary_vf[num_boundary_faces * 5 + start_index]; + double boussf_zx = boundary_vf[num_boundary_faces * 6 + start_index]; + double boussf_zy = boundary_vf[num_boundary_faces * 7 + start_index]; + double boussf_zz = boundary_vf[num_boundary_faces * 8 + start_index]; int cellIndex = face2Cells[start_index]; - double bouDiv_x = bouSfx * boussf_xx + bouSfy * boussf_yx + bouSfz * boussf_zx; - double bouDiv_y = bouSfx * boussf_xy + bouSfy * boussf_yy + bouSfz * boussf_zy; - double bouDiv_z = bouSfx * boussf_xz + bouSfy * boussf_yz + bouSfz * boussf_zz; + double bouDiv_x = (bouSfx * boussf_xx + bouSfy * boussf_yx + bouSfz * boussf_zx) * sign; + double bouDiv_y = (bouSfx * boussf_xy + bouSfy * boussf_yy + bouSfz * boussf_zy) * sign; + double bouDiv_z = (bouSfx * boussf_xz + bouSfy * boussf_yz + bouSfz * boussf_zz) * sign; atomicAdd(&(output[cellIndex * 3 + 0]), bouDiv_x); atomicAdd(&(output[cellIndex * 3 + 1]), bouDiv_y); @@ -762,10 +770,13 @@ void field_multiply_scalar(cudaStream_t stream, int num_cells, const double *input1, const double *input2, double *output, int num_boundary_surfaces, const double *boundary_input1, const double 
*boundary_input2, double *boundary_output) { + TICK_INIT; size_t threads_per_block = 256; size_t blocks_per_grid = (std::max(num_cells, num_boundary_surfaces) + threads_per_block - 1) / threads_per_block; + TICK_START; field_multiply_scalar_kernel<<>>(num_cells, num_boundary_surfaces, input1, input2, output, boundary_input1, boundary_input2, boundary_output); + TICK_END(field_multiply_scalar_kernel); } void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volume, const double *fvc_output, double *source) @@ -814,10 +825,14 @@ void fvm_ddt_vector(cudaStream_t stream, int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *volume, double *diag, double *source, double sign) { + printf("#############kernel profile#############\n"); + TICK_INIT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + TICK_START; fvm_ddt_vector_kernel<<>>(num_cells, rDeltaT, rho, rho_old, vf, volume, diag, source, sign); + TICK_END(fvm_ddt_vector_kernel); } void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, @@ -827,24 +842,29 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) { + TICK_INIT; size_t threads_per_block = 1024; size_t blocks_per_grid = 1; blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + TICK_START; fvm_div_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, phi, weight, lower, upper, diag, sign); + TICK_END(fvm_div_vector_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { - threads_per_block = 256; + threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now if 
(patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { // TODO: just vector version now + TICK_START; fvm_div_vector_boundary<<>>(patch_size[i], offset, boundary_phi, value_internal_coeffs, value_boundary_coeffs, internal_coeffs, boundary_coeffs, sign); + TICK_END(fvm_div_vector_boundary); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); @@ -862,24 +882,28 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) { + TICK_INIT; size_t threads_per_block = 1024; size_t blocks_per_grid = 1; blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + TICK_START; fvm_laplacian_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, weight, mag_sf, delta_coeffs, gamma, lower, upper, diag, sign); - + TICK_END(fvm_laplacian_vector_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { - threads_per_block = 256; + threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { // TODO: just vector version now + TICK_START; fvm_laplacian_vector_boundary<<>>(patch_size[i], offset, boundary_mag_sf, boundary_gamma, gradient_internal_coeffs, gradient_boundary_coeffs, internal_coeffs, boundary_coeffs, sign); + TICK_END(fvm_laplacian_vector_boundary); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); @@ -900,7 +924,7 @@ void fvc_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT, rDeltaT, rho, rho_old, vf, vf_old, output, sign); } -void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, +void fvc_grad_vector(cudaStream_t 
stream, int num_cells, int num_surfaces, int num_boundary_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *Sf, const double *vf, double *output, // end for internal int num_patches, const int *patch_size, const int *patch_type, @@ -909,22 +933,28 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, const double *boundary_deltaCoeffs, double sign) { checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 9 * sizeof(double), stream)); + TICK_INIT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - fvc_grad_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, + TICK_START; + fvc_grad_vector_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output); + TICK_END(fvc_grad_vector_internal); int offset = 0; // finish conctruct grad field except dividing cell volume for (int i = 0; i < num_patches; i++) { - threads_per_block = 256; + threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { // TODO: just vector version now - fvc_grad_vector_boundary<<>>(patch_size[i], offset, boundary_cell_face, + TICK_START; + fvc_grad_vector_boundary<<>>(num_cells, + patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output); + TICK_END(fvc_grad_vector_boundary); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); @@ -935,19 +965,25 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, // divide cell volume threads_per_block = 512; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + TICK_START; divide_cell_volume_tsr<<>>(num_cells, volume, output, sign); + TICK_END(divide_cell_volume_tsr); // correct boundary conditions 
offset = 0; for (int i = 0; i < num_patches; i++) { - threads_per_block = 256; + threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now if (patch_type[i] == boundaryConditions::zeroGradient) { // TODO: just vector version now - fvc_grad_vector_correctBC_zeroGradient<<>>(patch_size[i], offset, boundary_cell_face, + TICK_START; + fvc_grad_vector_correctBC_zeroGradient<<>>(num_cells, num_boundary_surfaces, + patch_size[i], offset, boundary_cell_face, output, boundary_vf, boundary_Sf, boundary_mag_Sf, boundary_output, sign); + TICK_END(fvc_grad_vector_correctBC_zeroGradient); } else if (patch_type[i] == boundaryConditions::fixedValue) { + // TODO: implement fixedValue version fvc_grad_vector_correctBC_fixedValue<<>>(patch_size[i], offset, boundary_cell_face, output, boundary_vf, boundary_Sf, boundary_mag_Sf, boundary_output, boundary_deltaCoeffs, boundary_vf, sign); } else if (0) { @@ -961,9 +997,12 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, void scale_dev2T_tensor(cudaStream_t stream, int num_cells, const double *vf1, double *vf2, int num_boundary_surfaces, const double *boundary_vf1, double *boundary_vf2) { + TICK_INIT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + TICK_START; scale_dev2t_tensor_kernel<<>>(num_cells, vf1, vf2); + TICK_END(scale_dev2t_tensor_kernel); blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; scale_dev2t_tensor_kernel<<>>(num_boundary_surfaces, boundary_vf1, boundary_vf2); @@ -1026,29 +1065,33 @@ void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, divide_cell_volume_scalar<<>>(num_cells, volume, output, sign); } -void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, +void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, 
const int *lowerAddr, const int *upperAddr, const double *weight, const double *Sf, const double *vf, double *output, // end for internal int num_patches, const int *patch_size, const int *patch_type, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign) { - checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 3 * sizeof(double), stream)); - + // checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 3 * sizeof(double), stream)); + TICK_INIT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - fvc_div_cell_tensor_internal<<>>(num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output); + TICK_START; + fvc_div_cell_tensor_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output, sign); + TICK_END(fvc_div_cell_tensor_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { - threads_per_block = 256; + threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { // TODO: just vector version now - fvc_div_cell_tensor_boundary<<>>(patch_size[i], offset, boundary_cell_face, - boundary_Sf, boundary_vf, output); + TICK_START; + fvc_div_cell_tensor_boundary<<>>(num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, + boundary_Sf, boundary_vf, output, sign); + TICK_END(fvc_div_cell_tensor_boundary); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); @@ -1056,10 +1099,10 @@ void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, offset += patch_size[i]; } - // divide cell volume - threads_per_block = 1024; - blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - divide_cell_volume_vec<<>>(num_cells, volume, 
output, sign); + // // divide cell volume + // threads_per_block = 1024; + // blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + // divide_cell_volume_vec<<>>(num_cells, volume, output, sign); } void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, @@ -1068,21 +1111,25 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_patches, const int *patch_size, const int *patch_type, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign) { - checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 3 * sizeof(double), stream)); + TICK_INIT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + TICK_START; fvc_grad_scalar_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, - Sf, weight, vf, output); + Sf, weight, vf, output, sign); + TICK_END(fvc_grad_scalar_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { - threads_per_block = 256; + threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just non-coupled patch type now if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { + TICK_START; fvc_grad_scalar_boundary<<>>(num_cells, patch_size[i], offset, boundary_cell_face, - boundary_Sf, boundary_vf, output); + boundary_Sf, boundary_vf, output, sign); + TICK_END(fvc_grad_scalar_internal); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); @@ -1090,8 +1137,8 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, offset += patch_size[i]; } - // divide cell volume - threads_per_block = 1024; - blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - divide_cell_volume_vec<<>>(num_cells, volume, output, sign); + // // divide cell 
volume + // threads_per_block = 1024; + // blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + // divide_cell_volume_vec<<>>(num_cells, volume, output, sign); } diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index 7b28f082c..80cdc7144 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -110,4 +110,6 @@ public: void compareResult(const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs, // const double *tmpVal, bool printFlag); + + void sync(); }; diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 90efcf670..73b7516c5 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -97,9 +97,9 @@ void dfUEqn::process() { checkCudaErrors(cudaEventCreate(&stop)); checkCudaErrors(cudaEventRecord(start,0)); - if(!graph_created) { - DEBUG_TRACE; - checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); + // if(!graph_created) { + // DEBUG_TRACE; + // checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); fvm_ddt_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, dataBase_.d_rho, dataBase_.d_rho_old, dataBase_.d_u, dataBase_.d_volume, @@ -110,6 +110,7 @@ void dfUEqn::process() { dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_phi, d_value_internal_coeffs, d_value_boundary_coeffs, d_internal_coeffs, d_boundary_coeffs, 1.); + //TODO: merge bellow six kernels field_multiply_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.d_rho, d_nu_eff, d_rho_nueff, // end for internal dataBase_.num_boundary_surfaces, dataBase_.d_boundary_rho, d_boundary_nu_eff, d_boundary_rho_nueff); @@ -121,7 +122,7 @@ void dfUEqn::process() { dataBase_.d_boundary_mag_sf, d_boundary_rho_nueff, d_gradient_internal_coeffs, d_gradient_boundary_coeffs, d_internal_coeffs, d_boundary_coeffs, -1); - fvc_grad_vector(dataBase_.stream, dataBase_.num_cells, 
dataBase_.num_surfaces, + fvc_grad_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_u, d_grad_u, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), @@ -129,37 +130,43 @@ void dfUEqn::process() { dataBase_.d_volume, dataBase_.d_boundary_mag_sf, d_boundary_grad_u, dataBase_.d_boundary_delta_coeffs); scale_dev2T_tensor(dataBase_.stream, dataBase_.num_cells, d_rho_nueff, d_grad_u, // end for internal dataBase_.num_boundary_surfaces, d_boundary_rho_nueff, d_boundary_grad_u); - fvc_div_cell_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, + fvc_div_cell_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, - dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_fvc_output, // end for internal + dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_source, // end for internal dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_face_cell, d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume); - fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, - dataBase_.d_volume, d_fvc_output, d_source); + // fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, + // dataBase_.d_volume, d_fvc_output, d_source); + // TODO: merge bellow two kernel fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, - dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_fvc_output, + dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_source, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume, -1.); - fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, - dataBase_.d_volume, d_fvc_output, 
d_source); + // fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, + // dataBase_.d_volume, d_fvc_output, d_source); - checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); - checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0)); - graph_created = true; - } - DEBUG_TRACE; - checkCudaErrors(cudaGraphLaunch(graph_instance, dataBase_.stream)); + // checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); + // checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0)); + // graph_created = true; + // } + // DEBUG_TRACE; + // checkCudaErrors(cudaGraphLaunch(graph_instance, dataBase_.stream)); checkCudaErrors(cudaEventRecord(stop,0)); checkCudaErrors(cudaEventSynchronize(start)); checkCudaErrors(cudaEventSynchronize(stop)); checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - fprintf(stderr, "ueqn process time:%f(ms)\n",time_elapsed); + fprintf(stderr, "ueqn process time:%f(ms)\n",time_elapsed); //solve(); } +void dfUEqn::sync() +{ + checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); +} + void dfUEqn::solve() { //ldu_to_csr(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, // dataBase_.d_lower_to_csr_index, dataBase_.d_upper_to_csr_index, dataBase_.d_diag_to_csr_index, @@ -226,24 +233,35 @@ void dfUEqn::compareResult(const double *lower, const double *upper, const doubl std::vector h_lower; h_lower.resize(dataBase_.num_surfaces); checkCudaErrors(cudaMemcpy(h_lower.data(), d_lower, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost)); + fprintf(stderr, "check h_lower"); checkVectorEqual(dataBase_.num_surfaces, lower, h_lower.data(), 1e-14, printFlag); DEBUG_TRACE; std::vector h_upper; h_upper.resize(dataBase_.num_surfaces); checkCudaErrors(cudaMemcpy(h_upper.data(), d_upper, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost)); + fprintf(stderr, "check h_upper"); checkVectorEqual(dataBase_.num_surfaces, upper, h_upper.data(), 1e-14, printFlag); DEBUG_TRACE; 
std::vector h_diag; h_diag.resize(dataBase_.num_cells); checkCudaErrors(cudaMemcpy(h_diag.data(), d_diag, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost)); + fprintf(stderr, "check h_diag"); checkVectorEqual(dataBase_.num_cells, diag, h_diag.data(), 1e-14, printFlag); DEBUG_TRACE; std::vector h_source; + // , h_source_ref; h_source.resize(dataBase_.num_cells * 3); + // h_source_ref.resize(dataBase_.num_cells * 3); + // for (int i = 0; i < dataBase_.num_cells; i++) { + // h_source_ref[0 * dataBase_.num_cells + i] = source[i * 3 + 0]; + // h_source_ref[1 * dataBase_.num_cells + i] = source[i * 3 + 1]; + // h_source_ref[2 * dataBase_.num_cells + i] = source[i * 3 + 2]; + // } checkCudaErrors(cudaMemcpy(h_source.data(), d_source, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); + fprintf(stderr, "check h_source"); checkVectorEqual(dataBase_.num_cells * 3, source, h_source.data(), 1e-14, printFlag); DEBUG_TRACE; From 7a7accf67d6d7e264697c91e201cc01fa7610f90 Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Mon, 21 Aug 2023 21:12:17 +0800 Subject: [PATCH 24/25] add solve part, fix some bugs --- applications/solvers/dfLowMachFoam/new_UEqn.H | 2 +- .../solvers/dfLowMachFoam_new/CMakeLists.txt | 126 +++++ applications/solvers/dfLowMachFoam_new/EEqn.H | 141 ++++++ .../solvers/dfLowMachFoam_new/Make/files | 3 + .../solvers/dfLowMachFoam_new/Make/options | 58 +++ applications/solvers/dfLowMachFoam_new/UEqn.H | 247 ++++++++++ applications/solvers/dfLowMachFoam_new/YEqn.H | 207 ++++++++ .../solvers/dfLowMachFoam_new/YEqn_RR.H | 61 +++ .../solvers/dfLowMachFoam_new/correctPhi.H | 12 + .../solvers/dfLowMachFoam_new/createFields.H | 176 +++++++ .../dfLowMachFoam_new/createGPUSolver.H | 97 ++++ .../dfLowMachFoam_new/createdfSolver.H | 65 +++ .../solvers/dfLowMachFoam_new/dfLowMachFoam.C | 447 ++++++++++++++++++ applications/solvers/dfLowMachFoam_new/pEqn.H | 203 ++++++++ .../solvers/dfLowMachFoam_new/pcEqn.H | 130 +++++ .../solvers/dfLowMachFoam_new/rhoEqn.H | 86 ++++ 
.../solvers/dfLowMachFoam_new/setRDeltaT.H | 85 ++++ .../solvers/dfLowMachFoam_new/setRootCase2.H | 5 + src_gpu/dfMatrixDataBase.cu | 2 +- src_gpu/dfMatrixOpBase.H | 30 +- src_gpu/dfMatrixOpBase.cu | 406 ++++++++++------ src_gpu/dfUEqn.H | 5 +- src_gpu/dfUEqn.cu | 98 ++-- 23 files changed, 2495 insertions(+), 197 deletions(-) create mode 100644 applications/solvers/dfLowMachFoam_new/CMakeLists.txt create mode 100644 applications/solvers/dfLowMachFoam_new/EEqn.H create mode 100644 applications/solvers/dfLowMachFoam_new/Make/files create mode 100644 applications/solvers/dfLowMachFoam_new/Make/options create mode 100644 applications/solvers/dfLowMachFoam_new/UEqn.H create mode 100644 applications/solvers/dfLowMachFoam_new/YEqn.H create mode 100644 applications/solvers/dfLowMachFoam_new/YEqn_RR.H create mode 100644 applications/solvers/dfLowMachFoam_new/correctPhi.H create mode 100644 applications/solvers/dfLowMachFoam_new/createFields.H create mode 100644 applications/solvers/dfLowMachFoam_new/createGPUSolver.H create mode 100644 applications/solvers/dfLowMachFoam_new/createdfSolver.H create mode 100644 applications/solvers/dfLowMachFoam_new/dfLowMachFoam.C create mode 100644 applications/solvers/dfLowMachFoam_new/pEqn.H create mode 100644 applications/solvers/dfLowMachFoam_new/pcEqn.H create mode 100644 applications/solvers/dfLowMachFoam_new/rhoEqn.H create mode 100644 applications/solvers/dfLowMachFoam_new/setRDeltaT.H create mode 100644 applications/solvers/dfLowMachFoam_new/setRootCase2.H diff --git a/applications/solvers/dfLowMachFoam/new_UEqn.H b/applications/solvers/dfLowMachFoam/new_UEqn.H index 9d94d27b6..41b804a4b 100644 --- a/applications/solvers/dfLowMachFoam/new_UEqn.H +++ b/applications/solvers/dfLowMachFoam/new_UEqn.H @@ -107,7 +107,7 @@ for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double)); offset += patchsize; } -bool printFlag = false; +bool 
printFlag = true; UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0], h_internal_coeffs.data(), h_boundary_coeffs.data(), // &DivTensor[0][0], diff --git a/applications/solvers/dfLowMachFoam_new/CMakeLists.txt b/applications/solvers/dfLowMachFoam_new/CMakeLists.txt new file mode 100644 index 000000000..645289a64 --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/CMakeLists.txt @@ -0,0 +1,126 @@ +cmake_minimum_required(VERSION 3.5) +project(dfLowMachFoam LANGUAGES CXX) +FIND_PACKAGE(MPI REQUIRED) +FIND_PACKAGE(OpenMP REQUIRED) +FIND_PACKAGE(CUDA REQUIRED) + +# Check valid thirdParty +if(DEFINED ENV{WM_PROJECT_DIR}) + MESSAGE(STATUS "OpenFOAM: " $ENV{WM_PROJECT_DIR}) +else() + message(FATAL_ERROR "OpenFOAM is not sourced") +endif(DEFINED ENV{WM_PROJECT_DIR}) + +if(DEFINED ENV{CANTERA_ROOT}) + MESSAGE(STATUS "libcantera: " $ENV{CANTERA_ROOT}) + SET(CANTERA_ROOT $ENV{CANTERA_ROOT}) +else() + message(FATAL_ERROR "libcantera directory is not specified") +endif(DEFINED ENV{CANTERA_ROOT}) + +# define variables +SET(OpenFOAM_LIB_DIR $ENV{FOAM_LIBBIN}) +SET(OpenFOAM_SRC $ENV{FOAM_SRC}) + +SET(DF_ROOT $ENV{DF_ROOT}) +SET(DF_SRC $ENV{DF_SRC}) +SET(SRC_ORIG $ENV{SRC_ORIG}) + +# set compilation options +SET(CMAKE_EXE_LINKER_FLAGS "-fuse-ld=bfd -Xlinker --add-needed -Xlinker --no-as-needed") +SET (CMAKE_C_FLAGS ${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}) +SET (CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}) + +SET(CMAKE_C_COMPILER g++) +SET(PATH_LIB_OPENMPI "openmpi-system") # Foundation version +SET(EXE_COMPILE_OPTION "-std=c++11 -m64 -Dlinux64 -DWM_ARCH_OPTION=64 +-DWM_DP -DWM_LABEL_SIZE=32 -Wall -Wextra -Wold-style-cast -Wnon-virtual-dtor +-Wno-unused-parameter -Wno-invalid-offsetof -Wno-attributes -O3 +-DNoRepository -ftemplate-depth-100 -std=c++14 +-Wno-unused-variable -Wno-unused-but-set-variable -Wno-old-style-cast -DOMPI_SKIP_MPICXX +-pthread -fPIC") +add_definitions("${EXE_COMPILE_OPTION}") + +# add header files 
+FUNCTION(R_SEARCH search_path return_list) + FILE(GLOB_RECURSE new_list ${search_path}/*.H) + SET(dir_list "") + FOREACH(file_path ${new_list}) + GET_FILENAME_COMPONENT(dir_path ${file_path} PATH) + SET(dir_list ${dir_list} ${dir_path}) + ENDFOREACH() + LIST(REMOVE_DUPLICATES dir_list) + SET(${return_list} ${dir_list} PARENT_SCOPE) +ENDFUNCTION(R_SEARCH) + +R_SEARCH(${DF_SRC}/dfCombustionModels dfcombustion_inc) +R_SEARCH(${DF_SRC}/dfCanteraMixture dfcantera_inc) +R_SEARCH(${DF_SRC}/lagrangian/intermediate dflagrangianinter_inc) +R_SEARCH(${DF_SRC}/lagrangian/spray dflagrangianspray_inc) +R_SEARCH(${DF_SRC}/lagrangian/turbulence dflagrangianturb_inc) +R_SEARCH(${DF_SRC}/dfChemistryModel dfchemistry_inc) +R_SEARCH(${DF_SRC}/thermophysicalModels/thermophysicalProperties dfthermophysicalprop_inc) +R_SEARCH(${DF_SRC}/thermophysicalModels/thermophysicalProperties dfthermophysicalprop_inc) +R_SEARCH(${DF_SRC}/thermophysicalModels/basic dfthermophysicalbasic_inc) +R_SEARCH(${DF_SRC}/thermophysicalModels/SLGThermo dfthermophysicalslg_inc) +R_SEARCH(${DF_SRC}/TurbulenceModels dfturbulence_inc) +R_SEARCH(${DF_SRC}/dynamicMesh dfnewdynamic_inc) +R_SEARCH(${DF_SRC}/dynamicFvMesh dffvdynamic_inc) + +include_directories( + ${OpenFOAM_SRC}/finiteVolume/lnInclude + ${OpenFOAM_SRC}/OSspecific/POSIX/lnInclude + ${OpenFOAM_SRC}/OpenFOAM/lnInclude + ${OpenFOAM_SRC}/transportModels/compressible/lnInclude + ${OpenFOAM_SRC}/thermophysicalModels/basic/lnInclude + ${OpenFOAM_SRC}/TurbulenceModels/turbulenceModels/lnInclude + ${OpenFOAM_SRC}/TurbulenceModels/compressible/lnInclude + ${OpenFOAM_SRC}/finiteVolume/cfdTools + ${OpenFOAM_SRC}/finiteVolume/lnInclude + ${OpenFOAM_SRC}/meshTools/lnInclude + ${OpenFOAM_SRC}/sampling/lnInclude + ${OpenFOAM_SRC}/dynamicFvMesh/lnInclude + ${OpenFOAM_SRC}/Pstream/mpi + ${dfcantera_inc} + ${dfchemistry_inc} + ${dfcombustion_inc} + ${CANTERA_ROOT}/include + ${MPI_INCLUDE_PATH} + ${PROJECT_SOURCE_DIR} + ${CUDA_INCLUDE_DIRS} + /home/runze/AmgX/AMGX/include 
+ /home/runze/deepflame-dev/src_gpu +) + +# add execution +add_executable(${PROJECT_NAME} ${PROJECT_SOURCE_DIR}/dfLowMachFoam.C) + +target_link_libraries(${PROJECT_NAME} + $ENV{FOAM_LIBBIN}/libfiniteVolume.so libmeshTools.so libcompressibleTransportModels.so + libturbulenceModels.so libsampling.so libOpenFOAM.so + ${CANTERA_ROOT}/lib/libcantera_shared.so.2 + ${DF_ROOT}/lib/libdfChemistryModel.so + ${DF_ROOT}/lib/libdfCanteraMixture.so + ${DF_ROOT}/lib/libdfFluidThermophysicalModels.so + ${DF_ROOT}/lib/libdfCombustionModels.so + $ENV{FOAM_LIBBIN}/openmpi-system/libPstream.so + ${MPI_LIBRARIES} + ${CUDA_LIBRARIES} + /home/runze/AmgX/AMGX/build/libamgxsh.so + /home/runze/deepflame-dev/src_gpu/build/libdfMatrix.so +) + +if(DEFINED ENV{PYTHON_INC_DIR}) + add_definitions(-DUSE_PYTORCH) + find_package (Python REQUIRED COMPONENTS Interpreter Development) + find_package(pybind11) + include_directories( + ${Python_INCLUDE_DIRS} + ${pybind11_INCLUDE_DIR}/pybind11 + ) + target_link_libraries(${PROJECT_NAME} ${Python_LIBRARIES}) +endif() + +# install +set(CMAKE_INSTALL_PREFIX ${DF_ROOT}) +install(TARGETS ${PROJECT_NAME} DESTINATION bin) diff --git a/applications/solvers/dfLowMachFoam_new/EEqn.H b/applications/solvers/dfLowMachFoam_new/EEqn.H new file mode 100644 index 000000000..896baaa06 --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/EEqn.H @@ -0,0 +1,141 @@ +{ + volScalarField& he = thermo.he(); +#ifdef GPUSolver_ + start1 = std::clock(); + UEqn_GPU.updatePsi(&U[0][0]); + UEqn_GPU.correctBoundaryConditions(); + U.correctBoundaryConditions(); + K = 0.5*magSqr(U); + end1 = std::clock(); + time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); + + // prepare data on CPU + start1 = std::clock(); + start2 = std::clock(); + // const tmp alphaEff_tmp(thermo.alpha()); + // const volScalarField& alphaEff = alphaEff_tmp(); + double *alphaEff = nullptr; // tmp + end2 = 
std::clock(); + int eeqn_offset = 0; + int patchNum = 0; + + forAll(he.boundaryField(), patchi) + { + patchNum++; + const fvsPatchScalarField& pw = mesh.surfaceInterpolation::weights().boundaryField()[patchi]; + int patchSize = pw.size(); + + // construct gradient manually + const fvPatchScalarField& hew = he.boundaryField()[patchi]; + const basicThermo& bThermo = basicThermo::lookupThermo(hew); + const scalarField& ppw = bThermo.p().boundaryField()[patchi]; + fvPatchScalarField& Tw = + const_cast(bThermo.T().boundaryField()[patchi]); + scalarField& Tw_v = Tw; + + Tw.evaluate(); + const scalarField& patchDeltaCoeff = mesh.boundary()[patchi].deltaCoeffs(); + const scalarField heInternal = bThermo.he(ppw, Tw, patchi)(); + const scalarField heBoundary = bThermo.he(ppw, Tw, mesh.boundary()[patchi].faceCells())(); + const scalarField patchGradMau = patchDeltaCoeff * (heInternal - heBoundary); + + const scalarField& patchK = K.boundaryField()[patchi]; + // const scalarField& patchAlphaEff = alphaEff.boundaryField()[patchi]; // not H2Dcopy when use UnityLewis + // const scalarField& patchGrad = he.boundaryField()[patchi].gradientBoundaryCoeffs(); // gradient_ + + // const DimensionedField& patchHa_ = he.boundaryField()[patchi]; + // const gradientEnergyFvPatchScalarField patchHa(mesh.boundary()[patchi], patchHa_); + // const scalarField& patchGrad = patchHa.gradient(); // gradient_ + memcpy(boundary_K + eeqn_offset, &patchK[0], patchSize*sizeof(double)); + // memcpy(boundary_alphaEff + eeqn_offset, &patchAlphaEff[0], patchSize*sizeof(double)); + memcpy(boundary_gradient + eeqn_offset, &patchGradMau[0], patchSize*sizeof(double)); + + eeqn_offset += patchSize; + } + end1 = std::clock(); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); + fprintf(stderr, 
"time_monitor_EEqn_mtxAssembly_CPU_prepare: %lf, build alphaEff time: %lf, patchNum: %d\n", + time_monitor_EEqn_mtxAssembly_CPU_prepare, + double(end2 - start2) / double(CLOCKS_PER_SEC), patchNum); + + // prepare data on GPU + start1 = std::clock(); + he.oldTime(); + K.oldTime(); + EEqn_GPU.prepare_data(&he.oldTime()[0], &K[0], &K.oldTime()[0], alphaEff, + &dpdt[0], boundary_K, boundary_alphaEff, boundary_gradient); + EEqn_GPU.sync(); + end1 = std::clock(); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly_GPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); + + start1 = std::clock(); + EEqn_GPU.initializeTimeStep(); + EEqn_GPU.fvm_ddt(); + EEqn_GPU.fvm_div(); + EEqn_GPU.fvm_laplacian(); + EEqn_GPU.fvc_ddt(); + EEqn_GPU.fvc_div_phi_scalar(); + EEqn_GPU.fvc_div_vector(); + EEqn_GPU.add_to_source(); + EEqn_GPU.sync(); + end1 = std::clock(); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC); + + // check value of mtxAssembly, no time monitor + // EEqn_GPU.checkValue(true); + + start1 = std::clock(); + EEqn_GPU.solve(); + end1 = std::clock(); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); + + start1 = std::clock(); + EEqn_GPU.updatePsi(&he[0]); + he.correctBoundaryConditions(); + he.write(); + end1 = std::clock(); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); +#else + start1 = std::clock(); + fvScalarMatrix EEqn + ( + + fvm::ddt(rho, he) + mvConvection->fvmDiv(phi, he) + + fvc::ddt(rho, K) + fvc::div(phi, K) + - dpdt + == + ( + 
turbName == "laminar" + ? + ( + fvm::laplacian(turbulence->alpha(), he) + - diffAlphaD + + fvc::div(hDiffCorrFlux) + ) + : + ( + fvm::laplacian(turbulence->alphaEff(), he) + ) + ) + ); + end1 = std::clock(); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + + EEqn.relax(); + start1 = std::clock(); + EEqn.solve("ha"); + end1 = std::clock(); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); +#endif +} diff --git a/applications/solvers/dfLowMachFoam_new/Make/files b/applications/solvers/dfLowMachFoam_new/Make/files new file mode 100644 index 000000000..92df9b4e3 --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/Make/files @@ -0,0 +1,3 @@ +dfLowMachFoam.C + +EXE = $(DF_APPBIN)/dfLowMachFoam_new diff --git a/applications/solvers/dfLowMachFoam_new/Make/options b/applications/solvers/dfLowMachFoam_new/Make/options new file mode 100644 index 000000000..bda93210e --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/Make/options @@ -0,0 +1,58 @@ +-include $(GENERAL_RULES)/mplibType + +EXE_INC = -std=c++14 \ + -g \ + -fopenmp \ + -Wno-unused-variable \ + -Wno-unused-but-set-variable \ + -Wno-old-style-cast \ + $(PFLAGS) $(PINC) \ + $(if $(LIBTORCH_ROOT),-DUSE_LIBTORCH,) \ + $(if $(PYTHON_INC_DIR),-DUSE_PYTORCH,) \ + -I$(LIB_SRC)/transportModels/compressible/lnInclude \ + -I$(LIB_SRC)/thermophysicalModels/basic/lnInclude \ + -I$(LIB_SRC)/TurbulenceModels/turbulenceModels/lnInclude \ + -I$(LIB_SRC)/TurbulenceModels/compressible/lnInclude \ + -I$(LIB_SRC)/finiteVolume/cfdTools \ + -I$(LIB_SRC)/finiteVolume/lnInclude \ + -I$(LIB_SRC)/meshTools/lnInclude \ + -I$(LIB_SRC)/sampling/lnInclude \ + -I$(LIB_SRC)/dynamicFvMesh/lnInclude \ + -I$(LIB_SRC)/Pstream/mpi \ + -I$(DF_SRC)/dfCanteraMixture/lnInclude \ + -I$(DF_SRC)/dfChemistryModel/lnInclude \ + 
-I$(DF_SRC)/dfCombustionModels/lnInclude \ + -I$(CANTERA_ROOT)/include \ + $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include,) \ + $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include/torch/csrc/api/include,) \ + $(PYTHON_INC_DIR) \ + $(if $(AMGX_DIR), -I$(DF_ROOT)/src_gpu,) \ + $(if $(AMGX_DIR), -I/usr/local/cuda-11.6/include,) \ + $(if $(AMGX_DIR), -I$(AMGX_DIR)/include,) \ + -I$(DF_ROOT)/GPUTestRef/lnInclude \ + +EXE_LIBS = \ + -lcompressibleTransportModels \ + -lturbulenceModels \ + -lfiniteVolume \ + -lmeshTools \ + -lsampling \ + -L$(DF_LIBBIN) \ + -ldfFluidThermophysicalModels \ + -ldfCompressibleTurbulenceModels \ + -ldfCanteraMixture \ + -ldfChemistryModel \ + -ldfCombustionModels \ + -ldfGenMatrix \ + $(CANTERA_ROOT)/lib/libcantera.so \ + $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libtorch.so,) \ + $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libc10.so,) \ + $(if $(LIBTORCH_ROOT),-rdynamic,) \ + $(if $(LIBTORCH_ROOT),-lpthread,) \ + $(if $(LIBTORCH_ROOT),$(DF_SRC)/dfChemistryModel/DNNInferencer/build/libDNNInferencer.so,) \ + $(if $(PYTHON_LIB_DIR),-L$(PYTHON_LIB_DIR),) \ + $(if $(PYTHON_LIB_DIR),-lpython3.8,) \ + $(if $(AMGX_DIR), /usr/local/cuda-11.6/lib64/libcudart.so,) \ + $(if $(AMGX_DIR), $(DF_ROOT)/src_gpu/build/libdfMatrix.so,) \ + $(if $(AMGX_DIR), $(AMGX_DIR)/build/libamgxsh.so,) + diff --git a/applications/solvers/dfLowMachFoam_new/UEqn.H b/applications/solvers/dfLowMachFoam_new/UEqn.H new file mode 100644 index 000000000..38934abdb --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/UEqn.H @@ -0,0 +1,247 @@ +// Solve the Momentum equation +#ifdef GPUSolver_ + start1 = std::clock(); + int offset = 0; + const tmp nuEff_tmp(turbulence->nuEff()); + const volScalarField& nuEff = nuEff_tmp(); + forAll(U.boundaryField(), patchi) + { + const scalarField& patchP = p.boundaryField()[patchi]; + const vectorField& patchU = U.boundaryField()[patchi]; + const scalarField& patchRho = rho.boundaryField()[patchi]; + const scalarField& patchNuEff = 
nuEff.boundaryField()[patchi]; + + int patchSize = patchP.size(); + + // boundary pressure + memcpy(boundary_pressure_init+offset, &patchP[0], patchSize*sizeof(double)); + // boundary velocity + memcpy(boundary_velocity_init+3*offset, &patchU[0][0], 3*patchSize*sizeof(double)); + // boundary nuEff + memcpy(boundary_nuEff_init+offset, &patchNuEff[0], patchSize*sizeof(double)); + // boundary rho + memcpy(boundary_rho_init+offset, &patchRho[0], patchSize*sizeof(double)); + offset += patchSize; + } + end1 = std::clock(); + time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); + + start1 = std::clock(); + UEqn_GPU.initializeTimeStep(); + U.oldTime(); + UEqn_GPU.fvm_ddt(&U.oldTime()[0][0]); + UEqn_GPU.fvm_div(boundary_pressure_init, boundary_velocity_init, boundary_nuEff_init, boundary_rho_init); + UEqn_GPU.fvc_grad(&p[0]); + UEqn_GPU.fvc_grad_vector(); + UEqn_GPU.dev2T(); + UEqn_GPU.fvc_div_tensor(&nuEff[0]); + UEqn_GPU.fvm_laplacian(); + UEqn_GPU.sync(); + end1 = std::clock(); + time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC); + + // start2 = std::clock(); + // fvVectorMatrix turb_source + // ( + // turbulence->divDevRhoReff(U) + // ); + // end2 = std::clock(); + // time_monitor_CPU += double(end2 - start2) / double(CLOCKS_PER_SEC); + + // UEqn_GPU.add_fvMatrix(&turb_source.lower()[0], &turb_source.diag()[0], &turb_source.upper()[0], &turb_source.source()[0][0]); + // end1 = std::clock(); + // time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + // time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + + // check value + // U.oldTime(); + // 
tmp tUEqn + // ( + // fvm::ddt(rho, U) + // + + // fvm::div(phi, U) + // + + // turbulence->divDevRhoReff(U) + // == -fvc::grad(p) + // ); + // fvVectorMatrix& UEqn = tUEqn.ref(); + // printf("b_cpu = %e\n", UEqn.source()[1][1]); + // forAll(U.boundaryField(), patchi){ + // labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); + // forAll(sub_boundary, i){ + // if (sub_boundary[i] == 1){ + // printf("b_cpu_bou = %e\n", UEqn.boundaryCoeffs()[patchi][i][1]); + // printf("patchi = %d, i = %d\n", patchi, i); + // } + // } + // } + // if (pimple.momentumPredictor()) + // { + // solve(UEqn); + // Info << "U_CPU\n" << U << endl; + // K = 0.5*magSqr(U); + // } + // UEqn_GPU.checkValue(true); +#elif defined GPUSolverNew_ + const tmp nuEff_tmp(turbulence->nuEff()); + const volScalarField& nuEff = nuEff_tmp(); + + // run CPU, for temp + tmp tUEqn + ( + fvm::ddt(rho, U) + + + fvm::div(phi, U) + + + turbulence->divDevRhoReff(U) + == -fvc::grad(p) + ); + fvVectorMatrix& UEqn = tUEqn.ref(); + + // run GPU + // preProcess + // TODO: preProcessForRhoEqn for temp, now we only transfer phi(instead of rhoEqn) used by fvm::div(phi, U) + UEqn_GPU.sync(); + double *h_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::internal); + double *h_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::internal); + double *h_boundary_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::boundary); + memcpy(h_rho, &rho[0], dfDataBase.cell_value_bytes); + memcpy(h_phi, &phi[0], dfDataBase.surface_value_bytes); + int offset = 0; + forAll(phi.boundaryField(), patchi) + { + const fvsPatchScalarField& patchPhi = phi.boundaryField()[patchi]; + int patchsize = patchPhi.size(); + memcpy(h_boundary_phi + offset, &patchPhi[0], patchsize * sizeof(double)); + offset += patchsize; + } + UEqn_GPU.preProcessForRhoEqn(h_rho, h_phi, h_boundary_phi); + DEBUG_TRACE; + + TICK_START; + // preparing u, p, nu_eff, and rho.boundary used by UEqn_GPU.preProcess() + double *h_u = 
dfDataBase.getFieldPointer("u", location::cpu, position::internal); + double *h_boundary_u = dfDataBase.getFieldPointer("u", location::cpu, position::boundary); + double *h_p = dfDataBase.getFieldPointer("p", location::cpu, position::internal); + double *h_boundary_p = dfDataBase.getFieldPointer("p", location::cpu, position::boundary); + double *h_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::internal); + double *h_boundary_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::boundary); + double *h_boundary_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::boundary); + TICK_STOP(get pointer); + + TICK_START; + U.oldTime(); + memcpy(h_u, &U.oldTime()[0][0], dfDataBase.cell_value_vec_bytes); + memcpy(h_p, &p[0], dfDataBase.cell_value_bytes); + memcpy(h_nu_eff, &nuEff[0], dfDataBase.cell_value_bytes); + TICK_STOP(copy to pinned memory); + + TICK_START; + offset = 0; + forAll(U.boundaryField(), patchi) + { + const fvPatchVectorField& patchU = U.boundaryField()[patchi]; + const fvPatchScalarField& patchP = p.boundaryField()[patchi]; + const fvPatchScalarField& patchNuEff = nuEff.boundaryField()[patchi]; + const fvPatchScalarField& patchRho = rho.boundaryField()[patchi]; + int patchsize = patchU.size(); + memcpy(h_boundary_u + offset * 3, &patchU[0][0], patchsize * 3 * sizeof(double)); + memcpy(h_boundary_p + offset, &patchP[0], patchsize * sizeof(double)); + memcpy(h_boundary_nu_eff + offset, &patchNuEff[0], patchsize * sizeof(double)); + memcpy(h_boundary_rho + offset, &patchRho[0], patchsize * sizeof(double)); + offset += patchsize; + } + TICK_STOP(CPU prepare boundary time); + + TICK_START; + UEqn_GPU.preProcess(h_u, h_boundary_u, h_p, h_boundary_p, h_nu_eff, h_boundary_nu_eff, h_boundary_rho); + DEBUG_TRACE; + UEqn_GPU.sync(); + TICK_STOP(GPU preProcess time); + + // process + TICK_START; + UEqn_GPU.process(); + DEBUG_TRACE; + UEqn_GPU.sync(); + TICK_STOP(GPU process time); + + TICK_START; + UEqn_GPU.solve(); + 
TICK_STOP(GPU solve time); + + // postProcess + TICK_START; + UEqn_GPU.postProcess(h_u); + U.correctBoundaryConditions(); + DEBUG_TRACE; + TICK_STOP(post process time); + + // checkResult + // TODO: for temp, now we compare ldu, finally we compare csr + std::vector h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); + std::vector h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); + offset = 0; + for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) + { + int patchsize = dfDataBase.patch_size[patchi]; + const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0]; + const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0]; + memcpy(h_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double)); + memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double)); + offset += patchsize; + } + bool printFlag = false; + UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0], + h_internal_coeffs.data(), h_boundary_coeffs.data(), + // &DivTensor[0][0], + printFlag); + DEBUG_TRACE; +#else + start1 = std::clock(); + tmp tUEqn + ( + fvm::ddt(rho, U) + fvm::div(phi, U) + + turbulence->divDevRhoReff(U) + == -fvc::grad(p) + ); + fvVectorMatrix& UEqn = tUEqn.ref(); + + end1 = std::clock(); + time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + + UEqn.relax(); + start1 = std::clock(); + if (pimple.momentumPredictor()) + { + solve(UEqn); + + K = 0.5*magSqr(U); + } + end1 = std::clock(); + time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); +#endif + +// start1 = std::clock(); +// // // std::thread t(&dfMatrix::solve, &UEqn_GPU); +// UEqn_GPU.solve(); +// end1 = std::clock(); +// time_monitor_UEqn += double(end1 - start1) / 
double(CLOCKS_PER_SEC); +// time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); + +// start1 = std::clock(); +// // // t.join(); +// // UEqn_GPU.updatePsi(&U[0][0]); +// K = 0.5*magSqr(U); +// end1 = std::clock(); +// time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); +// time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); +// time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC); +// // Info << "U_amgx = " << U << endl; + diff --git a/applications/solvers/dfLowMachFoam_new/YEqn.H b/applications/solvers/dfLowMachFoam_new/YEqn.H new file mode 100644 index 000000000..76570b24d --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/YEqn.H @@ -0,0 +1,207 @@ +hDiffCorrFlux = Zero; +diffAlphaD = Zero; +sumYDiffError = Zero; + +tmp> mvConvection +( + fv::convectionScheme::New + ( + mesh, + fields, + phi, + mesh.divScheme("div(phi,Yi_h)") + ) +); +#ifdef GPUSolver_ + start1 = std::clock(); + UEqn_GPU.solve(); + end1 = std::clock(); + time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); + + start1 = std::clock(); + std::vector Y_old(Y.size()), boundary_Y(Y.size()), boundary_hai(Y.size()), boundary_rhoD(Y.size()); + std::vector hai(Y.size()), rhoD(Y.size()); + for (size_t i = 0; i < Y.size(); ++i) + { + volScalarField& Yi = Y[i]; + Yi.oldTime(); + Y_old[i] = &Yi.oldTime()[0]; + if (updateBoundaryFields) + { + cudaMallocHost(&boundary_Y[i], num_boundary_faces*sizeof(double)); + } + const volScalarField& haii = chemistry->hai(i); + const volScalarField& rhoDi = chemistry->rhoD(i); + // hai[i] = &haii[0]; + rhoD[i] = &rhoDi[0]; + // cudaMallocHost(&boundary_hai[i], num_boundary_faces*sizeof(double)); + cudaMallocHost(&boundary_rhoD[i], num_boundary_faces*sizeof(double)); + int offset = 0; + forAll(Yi.boundaryField(), patchi) + { + const scalarField& patchYi = Yi.boundaryField()[patchi]; + // const 
scalarField& patchHaii = haii.boundaryField()[patchi]; + const scalarField& patchRhoDi = rhoDi.boundaryField()[patchi]; + int patchSize = patchYi.size(); + + if (updateBoundaryFields) + { + memcpy(boundary_Y[i] + offset, &patchYi[0], patchSize*sizeof(double)); + } + // memcpy(boundary_hai[i] + offset, &patchHaii[0], patchSize*sizeof(double)); + memcpy(boundary_rhoD[i] + offset, &patchRhoDi[0], patchSize*sizeof(double)); + offset += patchSize; + } + // if (i == 5) + // { + // Info << "rhoD_CPU" << rhoDi << endl; + // } + + } + // Info << "rhoD from nuEff\n" << nuEff * rho / 0.7 << endl; + updateBoundaryFields = false; + volScalarField mut_sct = turbulence->mut().ref()/Sct; + double *boundary_mutsct = nullptr; + cudaMallocHost(&boundary_mutsct, num_boundary_faces*sizeof(double)); + int offset = 0; + forAll(p.boundaryField(), patchi) + { + const scalarField& patchMut_sct = mut_sct.boundaryField()[patchi]; + int patchSize = patchMut_sct.size(); + memcpy(boundary_mutsct + offset, &patchMut_sct[0], patchSize*sizeof(double)); + offset += patchSize; + + // debug + // const fvsPatchScalarField& pw = mesh.surfaceInterpolation::weights().boundaryField()[patchi]; + // Field valueInternalCoeffs = Y[5].boundaryField()[patchi].valueInternalCoeffs(pw); + // Field valueBoundaryCoeffs = Y[5].boundaryField()[patchi].valueBoundaryCoeffs(pw); + // Field gradientInternalCoeffs = Y[5].boundaryField()[patchi].gradientInternalCoeffs(); + // Field gradientBoundaryCoeffs = Y[5].boundaryField()[patchi].gradientBoundaryCoeffs(); + // Info << "valueInternalCoeffs\n" << valueInternalCoeffs << endl; + // Info << "valueBoundaryCoeffs\n" << valueBoundaryCoeffs << endl; + // Info << "gradientInternalCoeffs\n" << gradientInternalCoeffs << endl; + // Info << "gradientBoundaryCoeffs\n" << gradientBoundaryCoeffs << endl; + } + end1 = std::clock(); + time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + 
time_monitor_YEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); + //fprintf(stderr, "time_monitor_YEqn_mtxAssembly_CPU_prepare: %lf\n", time_monitor_YEqn_mtxAssembly_CPU_prepare); + + start1 = std::clock(); + YEqn_GPU.initializeTimeStep(); + YEqn_GPU.upwindWeight(); + YEqn_GPU.fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(Y_old, boundary_Y, + hai, boundary_hai, rhoD, boundary_rhoD, &mut_sct[0], boundary_mutsct, &thermo.alpha()[0]); + YEqn_GPU.fvm_ddt(); + YEqn_GPU.fvm_div_phi(); + YEqn_GPU.fvm_div_phiUc(); + YEqn_GPU.sync(); + // YEqn_GPU.checkValue(true, "of_output_H2.txt"); + end1 = std::clock(); + time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC); + //fprintf(stderr, "time_monitor_YEqn_mtxAssembly_GPU_run: %lf\n", time_monitor_YEqn_mtxAssembly_GPU_run); + + start1 = std::clock(); + YEqn_GPU.solve(); + end1 = std::clock(); + time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); +#else + start1 = std::clock(); + forAll(Y, i) + { + sumYDiffError += chemistry->rhoD(i)*fvc::grad(Y[i]); + } + // Info << "sumYDiffError\n" << sumYDiffError << endl; + const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf(); + start1 = std::clock(); + time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); +#endif + +//MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); +label flag_mpi_init; +MPI_Initialized(&flag_mpi_init); +if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); + +{ + if (!splitting) + { + std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); + combustion->correct(); + //label flag_mpi_init; + 
//MPI_Initialized(&flag_mpi_init); + if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); + std::chrono::steady_clock::time_point stop = std::chrono::steady_clock::now(); + std::chrono::duration processingTime = std::chrono::duration_cast>(stop - start); + time_monitor_chem += processingTime.count(); + } + +#ifdef GPUSolver_ + start1 = std::clock(); + forAll(Y, i) + { + volScalarField& Yi = Y[i]; + YEqn_GPU.updatePsi(&Yi[0], i); + Yi.correctBoundaryConditions(); + } + YEqn_GPU.correctBoundaryConditions(); + end1 = std::clock(); + time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); +#else + start2 = std::clock(); + volScalarField Yt(0.0*Y[0]); + int speciesIndex = 0; + forAll(Y, i) + { + volScalarField& Yi = Y[i]; + hDiffCorrFlux += chemistry->hai(i)*(chemistry->rhoD(i)*fvc::grad(Yi) - Yi*sumYDiffError); + diffAlphaD += fvc::laplacian(thermo.alpha()*chemistry->hai(i), Yi); + if (i != inertIndex) + { + start1 = std::clock(); + tmp DEff = chemistry->rhoD(i) + turbulence->mut()/Sct; + + fvScalarMatrix YiEqn + ( + fvm::ddt(rho, Yi) + + + ( + turbName == "laminar" + ? (mvConvection->fvmDiv(phi, Yi) + mvConvection->fvmDiv(phiUc, Yi)) + : mvConvection->fvmDiv(phi, Yi) + ) + == + ( + splitting + ? 
fvm::laplacian(DEff(), Yi) + : (fvm::laplacian(DEff(), Yi) + combustion->R(Yi)) + ) + ); + + end1 = std::clock(); + time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + YiEqn.relax(); + + start1 = std::clock(); + YiEqn.solve("Yi"); + end1 = std::clock(); + time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); + + Yi.max(0.0); + Yt += Yi; + ++speciesIndex; + } + } + + Y[inertIndex] = scalar(1) - Yt; + Y[inertIndex].max(0.0); + end2 = std::clock(); + time_monitor_YEqn += double(end2 - start2) / double(CLOCKS_PER_SEC); +#endif +} diff --git a/applications/solvers/dfLowMachFoam_new/YEqn_RR.H b/applications/solvers/dfLowMachFoam_new/YEqn_RR.H new file mode 100644 index 000000000..f5752e95e --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/YEqn_RR.H @@ -0,0 +1,61 @@ +if (!(timeIndex % 2)) +{ + volScalarField Yt(0.0*Y[0]); + + scalar dtSave = runTime.deltaT().value(); + runTime.setDeltaT(dtSave * 2); + + start = std::clock(); + combustion->correct(); + + label flag_mpi_init; + MPI_Initialized(&flag_mpi_init); + if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); + end = std::clock(); + time_monitor_chem += double(end - start) / double(CLOCKS_PER_SEC); + + forAll(Y, i) + { + volScalarField& Yi = Y[i]; + + if (i != inertIndex) + { + volScalarField& Yi = Y[i]; + fvScalarMatrix YiEqn + ( + fvm::ddt(rho, Yi) + == + combustion->R(Yi) + ); + + YiEqn.relax(); + + YiEqn.solve("Yi"); + + Yi.max(0.0); + Yt += Yi; + } + } + Y[inertIndex] = scalar(1) - Yt; + Y[inertIndex].max(0.0); + + forAll (Y, i) + { + volScalarField& tYi = Y[i].oldTime(); + + forAll(tYi, celli) + { + tYi[celli] = Y[i][celli]; + } + volScalarField::Boundary& Bf = tYi.boundaryFieldRef(); + forAll(Bf, patchi) + { + forAll(Bf[patchi], facei) + { + Bf[patchi][facei] = Y[i].boundaryField()[patchi][facei]; + } + } + } + + runTime.setDeltaT(dtSave); +} \ No newline at end of file diff --git a/applications/solvers/dfLowMachFoam_new/correctPhi.H 
b/applications/solvers/dfLowMachFoam_new/correctPhi.H new file mode 100644 index 000000000..3cd82d29e --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/correctPhi.H @@ -0,0 +1,12 @@ +CorrectPhi +( + U, + phi, + p, + rho, + psi, + dimensionedScalar("rAUf", dimTime, 1), + divrhoU(), + pimple, + true +); diff --git a/applications/solvers/dfLowMachFoam_new/createFields.H b/applications/solvers/dfLowMachFoam_new/createFields.H new file mode 100644 index 000000000..9e750c334 --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/createFields.H @@ -0,0 +1,176 @@ +#include "createRDeltaT.H" + +Info<< "Reading thermophysical properties\n" << endl; + +// fluidThermo* pThermo = new hePsiThermo(mesh, word::null); +fluidThermo* pThermo = new heRhoThermo(mesh, word::null); +fluidThermo& thermo = *pThermo; +// thermo.validate(args.executable(), "ha"); + +const volScalarField& psi = thermo.psi(); +volScalarField& p = thermo.p(); +volScalarField& T = thermo.T(); +volScalarField rho +( + IOobject + ( + "rho", + runTime.timeName(), + mesh, + IOobject::READ_IF_PRESENT, + IOobject::AUTO_WRITE + ), + thermo.rho() +); + + +Info<< "Reading field U\n" << endl; +volVectorField U +( + IOobject + ( + "U", + runTime.timeName(), + mesh, + IOobject::MUST_READ, + IOobject::AUTO_WRITE + ), + mesh +); + +#include "compressibleCreatePhi.H" + +pressureControl pressureControl(p, rho, pimple.dict(), false); + +mesh.setFluxRequired(p.name()); + +Info<< "Creating turbulence model\n" << endl; +autoPtr turbulence +( + compressible::turbulenceModel::New + ( + rho, + U, + phi, + thermo + ) +); + +Info<< "Creating field dpdt\n" << endl; +volScalarField dpdt +( + IOobject + ( + "dpdt", + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedScalar("dpdt",p.dimensions()/dimTime, 0) +); + + +Info<< "Creating reaction model\n" << endl; +autoPtr> combustion +( + CombustionModel::New(thermo, turbulence()) +); +Info<< "end Creating reaction model\n" << endl; + + 
+const word combModelName(mesh.objectRegistry::lookupObject("combustionProperties").lookup("combustionModel")); +Info << "Combustion Model Name is confirmed as "<< combModelName << endl; + +const word turbName(mesh.objectRegistry::lookupObject("turbulenceProperties").lookup("simulationType")); + +dfChemistryModel* chemistry = combustion->chemistry(); +PtrList& Y = chemistry->Y(); +const word inertSpecie(chemistry->lookup("inertSpecie")); +const label inertIndex(chemistry->species()[inertSpecie]); +chemistry->setEnergyName("ha"); +chemistry->updateEnergy(); + + +chemistry->correctThermo(); +Info<< "At initial time, min/max(T) = " << min(T).value() << ", " << max(T).value() << endl; + +//for dpdt + +Info<< "Creating field kinetic energy K\n" << endl; +volScalarField K("K", 0.5*magSqr(U)); + +multivariateSurfaceInterpolationScheme::fieldTable fields; + +if(combModelName!="flareFGM") +{ +forAll(Y, i) +{ + fields.add(Y[i]); +} +fields.add(thermo.he()); +} + + +const scalar Sct = chemistry->lookupOrDefault("Sct", 1.); +volScalarField diffAlphaD +( + IOobject + ( + "diffAlphaD", + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedScalar(dimEnergy/dimTime/dimVolume, 0) +); +volVectorField hDiffCorrFlux +( + IOobject + ( + "hDiffCorrFlux", + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedVector(dimensionSet(1,0,-3,0,0,0,0), Zero) +); +volVectorField sumYDiffError +( + IOobject + ( + "sumYDiffError", + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedVector("sumYDiffError", dimDynamicViscosity/dimLength, Zero) +); + +IOdictionary CanteraTorchProperties +( + IOobject + ( + "CanteraTorchProperties", + runTime.constant(), + mesh, + IOobject::MUST_READ, + IOobject::NO_WRITE + ) +); +const Switch splitting = CanteraTorchProperties.lookupOrDefault("splittingStrategy", false); +#ifdef USE_PYTORCH + const Switch log_ = 
CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("log", false); + const Switch torch_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("torch", false); +#endif +#ifdef USE_LIBTORCH + const Switch log_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("log", false); + const Switch torch_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("torch", false); +#endif diff --git a/applications/solvers/dfLowMachFoam_new/createGPUSolver.H b/applications/solvers/dfLowMachFoam_new/createGPUSolver.H new file mode 100644 index 000000000..94fff1125 --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/createGPUSolver.H @@ -0,0 +1,97 @@ +dfMatrixDataBase dfDataBase; +//dfRhoEqn rhoEqn_GPU; +dfUEqn UEqn_GPU(dfDataBase); +//dfYEqn YEqn_GPU; +//dfEEqn EEqn_GPU; + +void createGPUBase(fvMesh& mesh, PtrList& Y) { + // prepare constant values: num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, num_species, rdelta_t + const labelUList& owner = mesh.owner(); + const labelUList& neighbour = mesh.neighbour(); + int num_cells = mesh.nCells(); + int num_surfaces = neighbour.size(); + int num_boundary_surfaces = 0; + int num_patches = 0; + std::vector patch_size; + forAll(mesh.boundary(), patchi) { + labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); + int patchsize = sub_boundary.size(); + patch_size.push_back(patchsize); + num_boundary_surfaces += patchsize; + num_patches++; + } + // TODO: get deltaT fomr time API + double rDeltaT = 1 / 1e-6; + dfDataBase.setConstantValues(num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, Y.size(), rDeltaT); + + // prepare constant indexes: owner, neighbor + dfDataBase.setConstantIndexes(&owner[0], &neighbour[0]); + + // prepare internal and boundary of sf, mag_sf, weights, delta_coeffs, volume + double *boundary_sf = new double[3 * num_boundary_surfaces]; + double *boundary_mag_sf = new double[num_boundary_surfaces]; + double 
*boundary_delta_coeffs = new double[num_boundary_surfaces]; + int *boundary_face_cell = new int[num_boundary_surfaces]; + int offset = 0; + forAll(mesh.boundary(), patchi) { + const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; + const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; + const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; + const labelUList& pFaceCells = mesh.boundary()[patchi].faceCells(); + + int patchsize = pMagSf.size(); + + memcpy(boundary_sf + 3*offset, &pSf[0][0], 3*patchsize*sizeof(double)); + memcpy(boundary_mag_sf + offset, &pMagSf[0], patchsize*sizeof(double)); + memcpy(boundary_delta_coeffs + offset, &pDeltaCoeffs[0], patchsize*sizeof(double)); + memcpy(boundary_face_cell + offset, &pFaceCells[0], patchsize * sizeof(int)); + offset += patchsize; + } + + dfDataBase.createConstantFieldsInternal(); + dfDataBase.createConstantFieldsBoundary(); + dfDataBase.initConstantFieldsInternal(&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.surfaceInterpolation::weights()[0], &mesh.nonOrthDeltaCoeffs()[0], &mesh.V()[0]); + dfDataBase.initConstantFieldsBoundary(boundary_sf, boundary_mag_sf, boundary_delta_coeffs, boundary_face_cell); + + // prepare internal and boundary of Y + dfDataBase.createNonConstantFieldsInternal(); + dfDataBase.createNonConstantFieldsBoundary(); + forAll(Y, speciesI) { + volScalarField& Yi = Y[speciesI]; + memcpy(dfDataBase.h_y + speciesI * num_cells, &Yi[0], num_cells * sizeof(double)); + offset = 0; + forAll(Yi.boundaryField(), patchi) { + const scalarField& patchYi = Yi.boundaryField()[patchi]; + int patchsize = patchYi.size(); + memcpy(dfDataBase.h_boundary_y + speciesI * num_boundary_surfaces + offset, &patchYi[0], patchsize*sizeof(double)); + offset += patchsize; + } + } + dfDataBase.initNonConstantFieldsInternal(dfDataBase.h_y); + dfDataBase.initNonConstantFieldsBoundary(dfDataBase.h_boundary_y); +} + +void createGPUUEqn(const IOdictionary& CanteraTorchProperties, const 
volVectorField& U) { + // prepare mode_string and setting_path + string mode_string = "dDDI"; + string settingPath; + settingPath = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string("")); + UEqn_GPU.setConstantValues(mode_string, settingPath); + + // prepare patch_type + std::vector patch_type; + patch_type.resize(dfDataBase.num_patches); + forAll(U.boundaryField(), patchi) + { + constructBoundarySelectorPerPatch(&(patch_type[patchi]), U.boundaryField()[patchi].type()); + } + UEqn_GPU.setConstantFields(patch_type); + + // prepare internal and boundary of xxx + UEqn_GPU.createNonConstantFieldsInternal(); + UEqn_GPU.createNonConstantFieldsBoundary(); + UEqn_GPU.createNonConstantLduAndCsrFields(); + // UEqn_GPU has no internal non-constant fields to be init + // UEqn_GPU.initNonConstantFieldsInternal(); + UEqn_GPU.initNonConstantFieldsBoundary(); +} diff --git a/applications/solvers/dfLowMachFoam_new/createdfSolver.H b/applications/solvers/dfLowMachFoam_new/createdfSolver.H new file mode 100644 index 000000000..3c5593833 --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/createdfSolver.H @@ -0,0 +1,65 @@ +const labelUList& owner = mesh.owner(); +const labelUList& neighbour = mesh.neighbour(); +int num_cells = mesh.nCells(); +int num_surfaces = neighbour.size(); + +std::vector boundaryCellIndex; +std::vector boundary_face_vector_init; +std::vector boundary_face_init; +std::vector boundary_deltaCoeffs_init; +std::vector> patchTypes; +std::vector patchTypeU, patchTypeY; +int num_boundary_faces = 0; +int patchSize; +forAll(mesh.boundary(), patchi) +{ + labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); + patchSize = sub_boundary.size(); + const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; + const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; + const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; + + boundaryCellIndex.insert(boundaryCellIndex.end(), 
&sub_boundary[0], &sub_boundary[0]+patchSize); + boundary_face_vector_init.insert(boundary_face_vector_init.end(), &pSf[0][0], &pSf[0][0]+3*patchSize); + boundary_face_init.insert(boundary_face_init.end(), &pMagSf[0], &pMagSf[0]+patchSize); + boundary_deltaCoeffs_init.insert(boundary_deltaCoeffs_init.end(), &pDeltaCoeffs[0], &pDeltaCoeffs[0]+patchSize); + num_boundary_faces += patchSize; + + constructBoundarySelector(patchTypeU, U.boundaryField()[patchi].type(), patchSize); + constructBoundarySelector(patchTypeY, Y[0].boundaryField()[patchi].type(), patchSize); +} +patchTypes.emplace_back(patchTypeU); +patchTypes.emplace_back(patchTypeY); + +int num_boundary_cells; + +string settingPath; +settingPath = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string("")); + +#ifdef GPUSolver_ + dfMatrixDataBase dfDataBase(num_surfaces, num_cells, num_boundary_faces, Y.size(), num_boundary_cells, &neighbour[0], &owner[0], &mesh.V()[0], &mesh.surfaceInterpolation::weights()[0], + &mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.nonOrthDeltaCoeffs()[0], boundary_face_vector_init, boundary_face_init, boundary_deltaCoeffs_init, boundaryCellIndex, patchTypes); + dfRhoEqn rhoEqn_GPU(dfDataBase); + dfUEqn UEqn_GPU(dfDataBase, "dDDI", settingPath); + dfYEqn YEqn_GPU(dfDataBase, "dDDI", settingPath, inertIndex); + dfEEqn EEqn_GPU(dfDataBase, "dDDI", settingPath); + + double *ueqn_internalCoeffs_init, *ueqn_boundaryCoeffs_init, *boundary_pressure_init, *boundary_velocity_init, + *boundary_nuEff_init, *boundary_rho_init, *ueqn_laplac_internalCoeffs_init, *ueqn_laplac_boundaryCoeffs_init, *boundary_phi_init; + cudaMallocHost(&ueqn_internalCoeffs_init, 3*num_boundary_faces*sizeof(double)); + cudaMallocHost(&ueqn_boundaryCoeffs_init, 3*num_boundary_faces*sizeof(double)); + cudaMallocHost(&ueqn_laplac_internalCoeffs_init, 3*num_boundary_faces*sizeof(double)); + cudaMallocHost(&ueqn_laplac_boundaryCoeffs_init, 3*num_boundary_faces*sizeof(double)); + 
cudaMallocHost(&boundary_velocity_init, 3*num_boundary_faces*sizeof(double)); + cudaMallocHost(&boundary_pressure_init, num_boundary_faces*sizeof(double)); + cudaMallocHost(&boundary_nuEff_init, num_boundary_faces*sizeof(double)); + cudaMallocHost(&boundary_rho_init, num_boundary_faces*sizeof(double)); + cudaMallocHost(&boundary_phi_init, num_boundary_faces*sizeof(double)); + + double *boundary_alphaEff, *boundary_K, *boundary_gradient; + cudaMallocHost(&boundary_K, num_boundary_faces*sizeof(double)); + cudaMallocHost(&boundary_alphaEff, num_boundary_faces*sizeof(double)); + cudaMallocHost(&boundary_gradient, num_boundary_faces * sizeof(double)); + + bool updateBoundaryFields = true; // make sure that the boundary fields do H2D copy at 1st timestep +#endif diff --git a/applications/solvers/dfLowMachFoam_new/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam_new/dfLowMachFoam.C new file mode 100644 index 000000000..f5b6ec90d --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/dfLowMachFoam.C @@ -0,0 +1,447 @@ +/*---------------------------------------------------------------------------*\ + ========= | + \\ / F ield | OpenFOAM: The Open Source CFD Toolbox + \\ / O peration | Website: https://openfoam.org + \\ / A nd | Copyright (C) 2011-2019 OpenFOAM Foundation + \\/ M anipulation | +------------------------------------------------------------------------------- +License + This file is part of OpenFOAM. + + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. 
+ + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . + +Application + rhoPimpleFoam + +Description + Transient solver for turbulent flow of compressible fluids for HVAC and + similar applications, with optional mesh motion and mesh topology changes. + + Uses the flexible PIMPLE (PISO-SIMPLE) solution for time-resolved and + pseudo-transient simulations. + +\*---------------------------------------------------------------------------*/ + +#include "dfChemistryModel.H" +#include "CanteraMixture.H" +// #include "hePsiThermo.H" +#include "heRhoThermo.H" + +#ifdef USE_PYTORCH +#include +#include +#include //used to convert +#endif + +#ifdef USE_LIBTORCH +#include +#include "DNNInferencer.H" +#endif + +#include "fvCFD.H" +#include "fluidThermo.H" +#include "turbulentFluidThermoModel.H" +#include "pimpleControl.H" +#include "pressureControl.H" +#include "localEulerDdtScheme.H" +#include "fvcSmooth.H" +#include "PstreamGlobals.H" +#include "basicThermo.H" +#include "CombustionModel.H" + +#define GPUSolverNew_ +#define TIME + +#ifdef GPUSolverNew_ +#include "dfUEqn.H" +// #include "dfYEqn.H" +// #include "dfRhoEqn.H" +// #include "dfEEqn.H" +#include "dfMatrixDataBase.H" +#include "dfMatrixOpBase.H" +#include +#include + +#include "createGPUSolver.H" + +#include "upwind.H" +#include "GenFvMatrix.H" +#endif + +#ifdef TIME + #define TICK_START \ + start_new = std::clock(); + #define TICK_STOP(prefix) \ + stop_new = std::clock(); \ + Foam::Info << #prefix << " time = " << double(stop_new - start_new) / double(CLOCKS_PER_SEC) << " s" << Foam::endl; +#else + #define TICK_START + #define TICK_STOP(prefix) +#endif + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +int main(int argc, char *argv[]) +{ +#ifdef USE_PYTORCH + pybind11::scoped_interpreter guard{};//start python interpreter +#endif + #include "postProcess.H" + + // unsigned int flags = 0; + // 
checkCudaErrors(cudaGetDeviceFlags(&flags)); + // flags |= cudaDeviceScheduleYield; + // checkCudaErrors(cudaSetDeviceFlags(flags)); + + // #include "setRootCaseLists.H" + #include "listOptions.H" + #include "setRootCase2.H" + #include "listOutput.H" + + #include "createTime.H" + #include "createMesh.H" + #include "createDyMControls.H" + #include "initContinuityErrs.H" + #include "createFields.H" + #include "createRhoUfIfPresent.H" + + double time_monitor_init = 0; + + double time_monitor_other = 0; + double time_monitor_rho = 0; + double time_monitor_U = 0; + double time_monitor_Y = 0; + double time_monitor_E = 0; + double time_monitor_p = 0; + double time_monitor_chemistry_correctThermo = 0; + double time_monitor_turbulence_correct = 0; + double time_monitor_chem = 0; // combustion correct + + double time_monitor_rhoEqn = 0; + double time_monitor_rhoEqn_mtxAssembly = 0; + double time_monitor_rhoEqn_mtxAssembly_CPU_prepare = 0; + double time_monitor_rhoEqn_mtxAssembly_GPU_run = 0; + double time_monitor_rhoEqn_solve = 0; + double time_monitor_rhoEqn_correctBC = 0; + + double time_monitor_UEqn = 0; + double time_monitor_UEqn_mtxAssembly = 0; + double time_monitor_UEqn_mtxAssembly_CPU_prepare = 0; + double time_monitor_UEqn_mtxAssembly_GPU_run = 0; + double time_monitor_UEqn_solve = 0; + double time_monitor_UEqn_correctBC = 0; + double time_monitor_UEqn_H = 0; + double time_monitor_UEqn_H_GPU_run = 0; + double time_monitor_UEqn_H_correctBC = 0; + double time_monitor_UEqn_A = 0; + double time_monitor_UEqn_A_GPU_run = 0; + double time_monitor_UEqn_A_correctBC = 0; + + double time_monitor_YEqn = 0; + double time_monitor_YEqn_mtxAssembly = 0; + double time_monitor_YEqn_mtxAssembly_CPU_prepare = 0; + double time_monitor_YEqn_mtxAssembly_GPU_run = 0; + double time_monitor_YEqn_solve = 0; + double time_monitor_YEqn_correctBC = 0; + + double time_monitor_EEqn = 0; + double time_monitor_EEqn_mtxAssembly = 0; + double time_monitor_EEqn_mtxAssembly_CPU_prepare = 0; + double 
time_monitor_EEqn_mtxAssembly_GPU_prepare = 0; + double time_monitor_EEqn_mtxAssembly_GPU_run = 0; + double time_monitor_EEqn_solve = 0; + double time_monitor_EEqn_correctBC = 0; + + double time_monitor_pEqn = 0; + double time_monitor_pEqn_solve = 0; + + label timeIndex = 0; + clock_t start, end, start1, end1, start2, end2; + clock_t start_new, stop_new; + double time_new = 0; + + turbulence->validate(); + + if (!LTS) + { + #include "compressibleCourantNo.H" + #include "setInitialDeltaT.H" + } + + start1 = std::clock(); + createGPUBase(mesh, Y); + createGPUUEqn(CanteraTorchProperties, U); + + end1 = std::clock(); + time_monitor_init += double(end1 - start1) / double(CLOCKS_PER_SEC); + + // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + + Info<< "\nStarting time loop\n" << endl; + + while (runTime.run()) + { + timeIndex ++; + + #include "readDyMControls.H" + + if (LTS) + { + #include "setRDeltaT.H" + } + else + { + #include "compressibleCourantNo.H" + #include "setDeltaT.H" + } + + runTime++; + + Info<< "Time = " << runTime.timeName() << nl << endl; + dfDataBase.preTimeStep(&rho.oldTime()[0]); + clock_t loop_start = std::clock(); + // --- Pressure-velocity PIMPLE corrector loop + while (pimple.loop()) + { + start = std::clock(); + if (splitting) + { + #include "YEqn_RR.H" + } + if (pimple.firstPimpleIter() || moveMeshOuterCorrectors) + { + // Store momentum to set rhoUf for introduced faces. 
+ autoPtr rhoU; + if (rhoUf.valid()) + { + rhoU = new volVectorField("rhoU", rho*U); + } + } + end = std::clock(); + time_monitor_other += double(end - start) / double(CLOCKS_PER_SEC); + + start = std::clock(); + if (pimple.firstPimpleIter() && !pimple.simpleRho()) + { + #include "rhoEqn.H" + } + end = std::clock(); + time_monitor_rho += double(end - start) / double(CLOCKS_PER_SEC); + + start = std::clock(); + #include "UEqn.H" + end = std::clock(); + time_monitor_U += double(end - start) / double(CLOCKS_PER_SEC); + + if(combModelName!="ESF" && combModelName!="flareFGM" && combModelName!="DeePFGM") + { + start = std::clock(); + #include "YEqn.H" + end = std::clock(); + time_monitor_Y += double(end - start) / double(CLOCKS_PER_SEC); + + start = std::clock(); + #include "EEqn.H" + end = std::clock(); + time_monitor_E += double(end - start) / double(CLOCKS_PER_SEC); + + start = std::clock(); + chemistry->correctThermo(); + end = std::clock(); + time_monitor_chemistry_correctThermo += double(end - start) / double(CLOCKS_PER_SEC); + } + else + { + combustion->correct(); + } + + Info<< "min/max(T) = " << min(T).value() << ", " << max(T).value() << endl; + + // --- Pressure corrector loop + + start = std::clock(); + while (pimple.correct()) + { + if (pimple.consistent()) + { + // #include "pcEqn.H" + } + else + { + #include "pEqn.H" + } + } + end = std::clock(); + time_monitor_p += double(end - start) / double(CLOCKS_PER_SEC); + + start = std::clock(); + if (pimple.turbCorr()) + { + turbulence->correct(); + } + end = std::clock(); + time_monitor_turbulence_correct += double(end - start) / double(CLOCKS_PER_SEC); + } + clock_t loop_end = std::clock(); + double loop_time = double(loop_end - loop_start) / double(CLOCKS_PER_SEC); + + rho = thermo.rho(); + + dfDataBase.postTimeStep(); + + runTime.write(); + Info<< "========Time Spent in diffenet parts========"<< endl; + Info<< "loop Time = " << loop_time << " s" << endl; + Info<< "other Time = " << time_monitor_other << " s" 
<< endl; + Info<< "rho Equations = " << time_monitor_rho << " s" << endl; + Info<< "U Equations = " << time_monitor_U << " s" << endl; + Info<< "Y Equations = " << time_monitor_Y - time_monitor_chem << " s" << endl; + Info<< "E Equations = " << time_monitor_E << " s" << endl; + Info<< "p Equations = " << time_monitor_p << " s" << endl; + Info<< "chemistry correctThermo = " << time_monitor_chemistry_correctThermo << " s" << endl; + Info<< "turbulence correct = " << time_monitor_turbulence_correct << " s" << endl; + Info<< "combustion correct(in Y) = " << time_monitor_chem << " s" << endl; + Info<< "percentage of chemistry = " << time_monitor_chem / loop_time * 100 << " %" << endl; + Info<< "percentage of rho/U/Y/E = " << (time_monitor_E + time_monitor_Y + time_monitor_U + time_monitor_rho - time_monitor_chem) / loop_time * 100 << " %" << endl; + + + Info<< "========Time details of each equation======="<< endl; + + Info<< "rhoEqn Time = " << time_monitor_rhoEqn << " s" << endl; + Info<< "rhoEqn assamble = " << time_monitor_rhoEqn_mtxAssembly << " s" << endl; + Info<< "rhoEqn assamble(CPU prepare) = " << time_monitor_rhoEqn_mtxAssembly_CPU_prepare << " s" << endl; + Info<< "rhoEqn assamble(GPU run) = " << time_monitor_rhoEqn_mtxAssembly_GPU_run << " s" << endl; + Info<< "rhoEqn solve = " << time_monitor_rhoEqn_solve << " s" << endl; + Info<< "rhoEqn correct boundary = " << time_monitor_rhoEqn_correctBC << " s" << endl; + + Info<< "UEqn Time = " << time_monitor_UEqn << " s" << endl; + Info<< "UEqn assamble = " << time_monitor_UEqn_mtxAssembly << " s" << endl; + Info<< "UEqn assamble(CPU prepare) = " << time_monitor_UEqn_mtxAssembly_CPU_prepare << " s" << endl; + Info<< "UEqn assamble(GPU run) = " << time_monitor_UEqn_mtxAssembly_GPU_run << " s" << endl; + Info<< "UEqn solve = " << time_monitor_UEqn_solve << " s" << endl; + Info<< "UEqn correct boundary = " << time_monitor_UEqn_correctBC << " s" << endl; + Info<< "UEqn H = " << time_monitor_UEqn_H << " s" << endl; + 
Info<< "UEqn H(GPU run) = " << time_monitor_UEqn_H_GPU_run << " s" << endl; + Info<< "UEqn H(correct boundary) = " << time_monitor_UEqn_H_correctBC << " s" << endl; + Info<< "UEqn A = " << time_monitor_UEqn_A << " s" << endl; + Info<< "UEqn A(GPU run) = " << time_monitor_UEqn_A_GPU_run << " s" << endl; + Info<< "UEqn A(correct boundary) = " << time_monitor_UEqn_A_correctBC << " s" << endl; + + Info<< "YEqn Time = " << time_monitor_YEqn << " s" << endl; + Info<< "YEqn assamble = " << time_monitor_YEqn_mtxAssembly << " s" << endl; + Info<< "YEqn assamble(CPU prepare) = " << time_monitor_YEqn_mtxAssembly_CPU_prepare << " s" << endl; + Info<< "YEqn assamble(GPU run) = " << time_monitor_YEqn_mtxAssembly_GPU_run << " s" << endl; + Info<< "YEqn solve = " << time_monitor_YEqn_solve << " s" << endl; + Info<< "YEqn correct boundary = " << time_monitor_YEqn_correctBC << " s" << endl; + + Info<< "EEqn Time = " << time_monitor_EEqn << " s" << endl; + Info<< "EEqn assamble = " << time_monitor_EEqn_mtxAssembly << " s" << endl; + Info<< "EEqn assamble(CPU prepare) = " << time_monitor_EEqn_mtxAssembly_CPU_prepare << " s" << endl; + Info<< "EEqn assamble(GPU prepare) = " << time_monitor_EEqn_mtxAssembly_GPU_prepare << " s" << endl; + Info<< "EEqn assamble(GPU run) = " << time_monitor_EEqn_mtxAssembly_GPU_run << " s" << endl; + Info<< "EEqn solve = " << time_monitor_EEqn_solve << " s" << endl; + Info<< "EEqn correct boundary = " << time_monitor_EEqn_correctBC << " s" << endl; + + Info<< "pEqn Time = " << time_monitor_pEqn << " s" << endl; + Info<< "pEqn Time solve = " << time_monitor_pEqn_solve << " s" << endl; + + Info<< "============================================"<. + +Global + rhoEqn + +Description + Solve the continuity for density. 
+ +\*---------------------------------------------------------------------------*/ +#ifdef GPUSolver_ +{ + start1 = std::clock(); + rho.oldTime(); + + int offset = 0; + forAll(U.boundaryField(), patchi) + { + const fvsPatchScalarField& patchFlux = phi.boundaryField()[patchi]; + int patchSize = patchFlux.size(); + memcpy(boundary_phi_init+offset, &patchFlux[0], patchSize*sizeof(double)); + offset += patchSize; + } + end1 = std::clock(); + time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_rhoEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_rhoEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); + + start1 = std::clock(); + rhoEqn_GPU.initializeTimeStep(); + rhoEqn_GPU.fvc_div(&phi[0], boundary_phi_init); + rhoEqn_GPU.fvm_ddt(&rho.oldTime()[0]); + rhoEqn_GPU.sync(); + end1 = std::clock(); + time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_rhoEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_rhoEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC); + + start1 = std::clock(); + rhoEqn_GPU.updatePsi(&rho.primitiveFieldRef()[0]); + rho.correctBoundaryConditions(); + end1 = std::clock(); + time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_rhoEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); +} +#else +{ + start1 = std::clock(); + fvScalarMatrix rhoEqn + ( + fvm::ddt(rho) + + fvc::div(phi) + ); + end1 = std::clock(); + time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_rhoEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + + start1 = std::clock(); + rhoEqn.solve(); + end1 = std::clock(); + time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_rhoEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); +} +#endif + +// 
************************************************************************* // diff --git a/applications/solvers/dfLowMachFoam_new/setRDeltaT.H b/applications/solvers/dfLowMachFoam_new/setRDeltaT.H new file mode 100644 index 000000000..074d05e3d --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/setRDeltaT.H @@ -0,0 +1,85 @@ +{ + volScalarField& rDeltaT = trDeltaT.ref(); + + const dictionary& pimpleDict = pimple.dict(); + + scalar maxCo + ( + pimpleDict.lookupOrDefault("maxCo", 0.8) + ); + + scalar rDeltaTSmoothingCoeff + ( + pimpleDict.lookupOrDefault("rDeltaTSmoothingCoeff", 0.02) + ); + + scalar rDeltaTDampingCoeff + ( + pimpleDict.lookupOrDefault("rDeltaTDampingCoeff", 1.0) + ); + + scalar maxDeltaT + ( + pimpleDict.lookupOrDefault("maxDeltaT", great) + ); + + volScalarField rDeltaT0("rDeltaT0", rDeltaT); + + // Set the reciprocal time-step from the local Courant number + rDeltaT.ref() = max + ( + 1/dimensionedScalar(dimTime, maxDeltaT), + fvc::surfaceSum(mag(phi))()() + /((2*maxCo)*mesh.V()*rho()) + ); + + if (pimple.transonic()) + { + surfaceScalarField phid + ( + "phid", + fvc::interpolate(psi)*fvc::flux(U) + ); + + rDeltaT.ref() = max + ( + rDeltaT(), + fvc::surfaceSum(mag(phid))()() + /((2*maxCo)*mesh.V()*psi()) + ); + } + + // Update tho boundary values of the reciprocal time-step + rDeltaT.correctBoundaryConditions(); + + Info<< "Flow time scale min/max = " + << gMin(1/rDeltaT.primitiveField()) + << ", " << gMax(1/rDeltaT.primitiveField()) << endl; + + if (rDeltaTSmoothingCoeff < 1.0) + { + fvc::smooth(rDeltaT, rDeltaTSmoothingCoeff); + } + + Info<< "Smoothed flow time scale min/max = " + << gMin(1/rDeltaT.primitiveField()) + << ", " << gMax(1/rDeltaT.primitiveField()) << endl; + + // Limit rate of change of time scale + // - reduce as much as required + // - only increase at a fraction of old time scale + if + ( + rDeltaTDampingCoeff < 1.0 + && runTime.timeIndex() > runTime.startTimeIndex() + 1 + ) + { + rDeltaT = + rDeltaT0 + 
*max(rDeltaT/rDeltaT0, scalar(1) - rDeltaTDampingCoeff); + + Info<< "Damped flow time scale min/max = " + << gMin(1/rDeltaT.primitiveField()) + << ", " << gMax(1/rDeltaT.primitiveField()) << endl; + } +} diff --git a/applications/solvers/dfLowMachFoam_new/setRootCase2.H b/applications/solvers/dfLowMachFoam_new/setRootCase2.H new file mode 100644 index 000000000..45d966e63 --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/setRootCase2.H @@ -0,0 +1,5 @@ +Foam::argList args(argc,argv,true,true,/*initialise=*/false); +if (!args.checkRootCase()) +{ + Foam::FatalError.exit(); +} \ No newline at end of file diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index 8c2c26faf..4e49faf99 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -132,7 +132,6 @@ void dfMatrixDataBase::setConstantIndexes(const int *owner, const int *neighbor) std::vector lowCSRIndex, uppCSRIndex, diagCSRIndex, CSRRowIndex, CSRColIndex; int uppIndexInCSR = 0, uppIndexInLdu = 0, lowIndexInCSR = 0, lowIndexInLdu = 0, lowNumInLdu = 0; - CSRRowIndex.push_back(0); CSRColIndex.resize(2 * num_surfaces + num_cells); lowCSRIndex.resize(num_surfaces); for (int i = 0; i < num_cells; ++i) { @@ -161,6 +160,7 @@ void dfMatrixDataBase::setConstantIndexes(const int *owner, const int *neighbor) diagCSRIndex.push_back(diagIndexInCSR); CSRColIndex[diagIndexInCSR] = i; // fill diag entry in CSRColIndex } + CSRRowIndex.push_back(2 * num_surfaces + num_cells); checkCudaErrors(cudaMalloc((void**)&d_lower_to_csr_index, surface_index_bytes)); checkCudaErrors(cudaMalloc((void**)&d_diag_to_csr_index, cell_index_bytes)); diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index f64220186..71dd82c38 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -1,20 +1,5 @@ #pragma once - -#define TICK_INIT \ - float time_elapsed_kernel=0;\ - cudaEvent_t start_kernel, stop_kernel;\ - checkCudaErrors(cudaEventCreate(&start_kernel));\ - 
checkCudaErrors(cudaEventCreate(&stop_kernel)); - -#define TICK_START \ - checkCudaErrors(cudaEventRecord(start_kernel,0)); - -#define TICK_END(prefix) \ - checkCudaErrors(cudaEventRecord(stop_kernel,0));\ - checkCudaErrors(cudaEventSynchronize(start_kernel));\ - checkCudaErrors(cudaEventSynchronize(stop_kernel));\ - checkCudaErrors(cudaEventElapsedTime(&time_elapsed_kernel,start_kernel,stop_kernel));\ - printf("try %s 执行时间:%lf(ms)\n", #prefix, time_elapsed_kernel); +// #define TIME_GPU // tools void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output); @@ -26,13 +11,13 @@ void field_multiply_scalar(cudaStream_t stream, void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volume, const double *fvc_output, double *source); -void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, - const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, +void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surface, + const int* boundary_cell_face, const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs, - double *A, double *b); + double *A, double *b, double *diag_vec); -void update_boundary_coeffs_vector(cudaStream_t stream, int num_patches, +void update_boundary_coeffs_vector(cudaStream_t stream, int num_boundary_surfaces, int num_patches, const int *patch_size, const int *patch_type, double *value_internal_coeffs, double *value_boundary_coeffs, double *gradient_internal_coeffs, double *gradient_boundary_coeffs); @@ -43,14 +28,15 @@ void fvm_ddt_vector(cudaStream_t stream, int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *volume, double *diag, double *source, double sign = 1.); -void 
fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, +void fvm_div_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *phi, const double *weight, + double *lower, double *upper, double *diag, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const double *boundary_phi, const double *value_internal_coeffs, + const double *value_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign = 1.); -void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, +void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma, double *lower, double *upper, double *diag, // end for internal diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index d4f6ea7f8..e3616fac3 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -4,6 +4,35 @@ #include #include "cuda_profiler_api.h" +#ifdef TIME_GPU + #define TICK_INIT_EVENT \ + float time_elapsed_kernel=0;\ + cudaEvent_t start_kernel, stop_kernel;\ + checkCudaErrors(cudaEventCreate(&start_kernel));\ + checkCudaErrors(cudaEventCreate(&stop_kernel)); + + #define TICK_START_EVENT \ + checkCudaErrors(cudaEventRecord(start_kernel,0)); + + #define TICK_END_EVENT(prefix) \ + checkCudaErrors(cudaEventRecord(stop_kernel,0));\ + checkCudaErrors(cudaEventSynchronize(start_kernel));\ + checkCudaErrors(cudaEventSynchronize(stop_kernel));\ + checkCudaErrors(cudaEventElapsedTime(&time_elapsed_kernel,start_kernel,stop_kernel));\ + printf("try %s 执行时间:%lf(ms)\n", #prefix, time_elapsed_kernel); +#else + #define TICK_INIT_EVENT + #define TICK_START_EVENT + #define TICK_END_EVENT(prefix) +#endif + +__global__ void warmup(int num_cells) +{ + int index = blockDim.x * blockIdx.x + 
threadIdx.x; + if (index >= num_cells) + return; +} + __global__ void permute_vector_d2h_kernel(int num_cells, const double *input, double *output) { int index = blockDim.x * blockIdx.x + threadIdx.x; @@ -53,7 +82,7 @@ __global__ void fvc_to_source_vector_kernel(int num_cells, const double *volume, source[index * 3 + 2] += fvc_output[index * 3 + 2]; } -__global__ void update_boundary_coeffs_zeroGradient_vector(int num, int offset, +__global__ void update_boundary_coeffs_zeroGradient_vector(int num_boundary_surfaces, int num, int offset, double *value_internal_coeffs, double *value_boundary_coeffs, double *gradient_internal_coeffs, double *gradient_boundary_coeffs) { @@ -66,18 +95,18 @@ __global__ void update_boundary_coeffs_zeroGradient_vector(int num, int offset, // valueBoundaryCoeffs = 0 // gradientInternalCoeffs = 0 // gradientBoundaryCoeffs = 0 - value_internal_coeffs[start_index * 3 + 0] = 1; - value_internal_coeffs[start_index * 3 + 1] = 1; - value_internal_coeffs[start_index * 3 + 2] = 1; - value_boundary_coeffs[start_index * 3 + 0] = 0; - value_boundary_coeffs[start_index * 3 + 1] = 0; - value_boundary_coeffs[start_index * 3 + 2] = 0; - gradient_internal_coeffs[start_index * 3 + 0] = 0; - gradient_internal_coeffs[start_index * 3 + 1] = 0; - gradient_internal_coeffs[start_index * 3 + 2] = 0; - gradient_boundary_coeffs[start_index * 3 + 0] = 0; - gradient_boundary_coeffs[start_index * 3 + 1] = 0; - gradient_boundary_coeffs[start_index * 3 + 2] = 0; + value_internal_coeffs[num_boundary_surfaces * 0 + start_index] = 1; + value_internal_coeffs[num_boundary_surfaces * 1 + start_index] = 1; + value_internal_coeffs[num_boundary_surfaces * 2 + start_index] = 1; + value_boundary_coeffs[num_boundary_surfaces * 0 + start_index] = 0; + value_boundary_coeffs[num_boundary_surfaces * 1 + start_index] = 0; + value_boundary_coeffs[num_boundary_surfaces * 2 + start_index] = 0; + gradient_internal_coeffs[num_boundary_surfaces * 0 + start_index] = 0; + 
gradient_internal_coeffs[num_boundary_surfaces * 1 + start_index] = 0; + gradient_internal_coeffs[num_boundary_surfaces * 2 + start_index] = 0; + gradient_boundary_coeffs[num_boundary_surfaces * 0 + start_index] = 0; + gradient_boundary_coeffs[num_boundary_surfaces * 1 + start_index] = 0; + gradient_boundary_coeffs[num_boundary_surfaces * 2 + start_index] = 0; } __global__ void scale_dev2t_tensor_kernel(int num, const double *vf1, double *vf2) @@ -128,9 +157,9 @@ __global__ void fvm_ddt_vector_kernel(int num_cells, double rDeltaT, diag[index] += rDeltaT * rho[index] * vol * sign; // TODO: skip moving - source[index * 3 + 0] += rDeltaT * rho_old_kernel * vf[index * 3 + 0] * vol * sign; - source[index * 3 + 1] += rDeltaT * rho_old_kernel * vf[index * 3 + 1] * vol * sign; - source[index * 3 + 2] += rDeltaT * rho_old_kernel * vf[index * 3 + 2] * vol * sign; + source[num_cells * 0 + index] += rDeltaT * rho_old_kernel * vf[num_cells * 0 + index] * vol * sign; + source[num_cells * 1 + index] += rDeltaT * rho_old_kernel * vf[num_cells * 1 + index] * vol * sign; + source[num_cells * 2 + index] += rDeltaT * rho_old_kernel * vf[num_cells * 2 + index] * vol * sign; } __global__ void fvm_div_vector_internal(int num_surfaces, @@ -157,7 +186,8 @@ __global__ void fvm_div_vector_internal(int num_surfaces, atomicAdd(&(diag[neighbor]), -upper_value); } -__global__ void fvm_div_vector_boundary(int num, int offset, +// TODO: modify the data structure of internal and boundary coeffs +__global__ void fvm_div_vector_boundary(int num_boundary_surfaces, int num, int offset, const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) { @@ -167,12 +197,12 @@ __global__ void fvm_div_vector_boundary(int num, int offset, int start_index = offset + index; double boundary_f = boundary_phi[start_index]; - internal_coeffs[start_index * 3 + 0] += boundary_f * value_internal_coeffs[start_index * 3 
+ 0] * sign; - internal_coeffs[start_index * 3 + 1] += boundary_f * value_internal_coeffs[start_index * 3 + 1] * sign; - internal_coeffs[start_index * 3 + 2] += boundary_f * value_internal_coeffs[start_index * 3 + 2] * sign; - boundary_coeffs[start_index * 3 + 0] += boundary_f * value_boundary_coeffs[start_index * 3 + 0] * sign; - boundary_coeffs[start_index * 3 + 1] += boundary_f * value_boundary_coeffs[start_index * 3 + 1] * sign; - boundary_coeffs[start_index * 3 + 2] += boundary_f * value_boundary_coeffs[start_index * 3 + 2] * sign; + internal_coeffs[num_boundary_surfaces * 0 + start_index] += boundary_f * value_internal_coeffs[num_boundary_surfaces * 0 + start_index] * sign; + internal_coeffs[num_boundary_surfaces * 1 + start_index] += boundary_f * value_internal_coeffs[num_boundary_surfaces * 1 + start_index] * sign; + internal_coeffs[num_boundary_surfaces * 2 + start_index] += boundary_f * value_internal_coeffs[num_boundary_surfaces * 2 + start_index] * sign; + boundary_coeffs[num_boundary_surfaces * 0 + start_index] += boundary_f * value_boundary_coeffs[num_boundary_surfaces * 0 + start_index] * sign; + boundary_coeffs[num_boundary_surfaces * 1 + start_index] += boundary_f * value_boundary_coeffs[num_boundary_surfaces * 1 + start_index] * sign; + boundary_coeffs[num_boundary_surfaces * 2 + start_index] += boundary_f * value_boundary_coeffs[num_boundary_surfaces * 2 + start_index] * sign; } __global__ void fvm_laplacian_vector_internal(int num_surfaces, @@ -206,7 +236,7 @@ __global__ void fvm_laplacian_vector_internal(int num_surfaces, atomicAdd(&(diag[neighbor]), -upper_value); } -__global__ void fvm_laplacian_vector_boundary(int num, int offset, +__global__ void fvm_laplacian_vector_boundary(int num_boundary_surfaces, int num, int offset, const double *boundary_mag_sf, const double *boundary_gamma, const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) @@ -217,12 
+247,12 @@ __global__ void fvm_laplacian_vector_boundary(int num, int offset, int start_index = offset + index; double boundary_value = boundary_gamma[start_index] * boundary_mag_sf[start_index]; - internal_coeffs[start_index * 3 + 0] += boundary_value * gradient_internal_coeffs[start_index * 3 + 0] * sign; - internal_coeffs[start_index * 3 + 1] += boundary_value * gradient_internal_coeffs[start_index * 3 + 1] * sign; - internal_coeffs[start_index * 3 + 2] += boundary_value * gradient_internal_coeffs[start_index * 3 + 2] * sign; - boundary_coeffs[start_index * 3 + 0] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 0] * sign; - boundary_coeffs[start_index * 3 + 1] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 1] * sign; - boundary_coeffs[start_index * 3 + 2] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 2] * sign; + internal_coeffs[num_boundary_surfaces * 0 + start_index] += boundary_value * gradient_internal_coeffs[num_boundary_surfaces * 0 + start_index] * sign; + internal_coeffs[num_boundary_surfaces * 1 + start_index] += boundary_value * gradient_internal_coeffs[num_boundary_surfaces * 1 + start_index] * sign; + internal_coeffs[num_boundary_surfaces * 2 + start_index] += boundary_value * gradient_internal_coeffs[num_boundary_surfaces * 2 + start_index] * sign; + boundary_coeffs[num_boundary_surfaces * 0 + start_index] += boundary_value * gradient_boundary_coeffs[num_boundary_surfaces * 0 + start_index] * sign; + boundary_coeffs[num_boundary_surfaces * 1 + start_index] += boundary_value * gradient_boundary_coeffs[num_boundary_surfaces * 1 + start_index] * sign; + boundary_coeffs[num_boundary_surfaces * 2 + start_index] += boundary_value * gradient_boundary_coeffs[num_boundary_surfaces * 2 + start_index] * sign; } __global__ void fvc_ddt_scalar_kernel(int num_cells, double rDeltaT, @@ -277,9 +307,9 @@ __global__ void fvc_grad_vector_internal(int num_cells, int num_surfaces, int owner = lower_index[index]; int 
neighbor = upper_index[index]; - double ssfx = (w * (field_vector[owner * 3 + 0] - field_vector[neighbor * 3 + 0]) + field_vector[neighbor * 3 + 0]); - double ssfy = (w * (field_vector[owner * 3 + 1] - field_vector[neighbor * 3 + 1]) + field_vector[neighbor * 3 + 1]); - double ssfz = (w * (field_vector[owner * 3 + 2] - field_vector[neighbor * 3 + 2]) + field_vector[neighbor * 3 + 2]); + double ssfx = (w * (field_vector[num_cells * 0 + owner] - field_vector[num_cells * 0 + neighbor]) + field_vector[num_cells * 0 + neighbor]); + double ssfy = (w * (field_vector[num_cells * 1 + owner] - field_vector[num_cells * 1 + neighbor]) + field_vector[num_cells * 1 + neighbor]); + double ssfz = (w * (field_vector[num_cells * 2 + owner] - field_vector[num_cells * 2 + neighbor]) + field_vector[num_cells * 2 + neighbor]); double grad_xx = Sfx * ssfx; double grad_xy = Sfx * ssfy; @@ -291,26 +321,45 @@ __global__ void fvc_grad_vector_internal(int num_cells, int num_surfaces, double grad_zy = Sfz * ssfy; double grad_zz = Sfz * ssfz; - // owner - atomicAdd(&(output[num_cells * 0 + owner]), grad_xx); - atomicAdd(&(output[num_cells * 1 + owner]), grad_xy); - atomicAdd(&(output[num_cells * 2 + owner]), grad_xz); - atomicAdd(&(output[num_cells * 3 + owner]), grad_yx); - atomicAdd(&(output[num_cells * 4 + owner]), grad_yy); - atomicAdd(&(output[num_cells * 5 + owner]), grad_yz); - atomicAdd(&(output[num_cells * 6 + owner]), grad_zx); - atomicAdd(&(output[num_cells * 7 + owner]), grad_zy); - atomicAdd(&(output[num_cells * 8 + owner]), grad_zz); + // // owner + // atomicAdd(&(output[num_cells * 0 + owner]), grad_xx); + // atomicAdd(&(output[num_cells * 1 + owner]), grad_xy); + // atomicAdd(&(output[num_cells * 2 + owner]), grad_xz); + // atomicAdd(&(output[num_cells * 3 + owner]), grad_yx); + // atomicAdd(&(output[num_cells * 4 + owner]), grad_yy); + // atomicAdd(&(output[num_cells * 5 + owner]), grad_yz); + // atomicAdd(&(output[num_cells * 6 + owner]), grad_zx); + // 
atomicAdd(&(output[num_cells * 7 + owner]), grad_zy); + // atomicAdd(&(output[num_cells * 8 + owner]), grad_zz); - // neighbour + // // neighbour + // atomicAdd(&(output[num_cells * 0 + neighbor]), -grad_xx); + // atomicAdd(&(output[num_cells * 1 + neighbor]), -grad_xy); + // atomicAdd(&(output[num_cells * 2 + neighbor]), -grad_xz); + // atomicAdd(&(output[num_cells * 3 + neighbor]), -grad_yx); + // atomicAdd(&(output[num_cells * 4 + neighbor]), -grad_yy); + // atomicAdd(&(output[num_cells * 5 + neighbor]), -grad_yz); + // atomicAdd(&(output[num_cells * 6 + neighbor]), -grad_zx); + // atomicAdd(&(output[num_cells * 7 + neighbor]), -grad_zy); + // atomicAdd(&(output[num_cells * 8 + neighbor]), -grad_zz); + + atomicAdd(&(output[num_cells * 0 + owner]), grad_xx); atomicAdd(&(output[num_cells * 0 + neighbor]), -grad_xx); + atomicAdd(&(output[num_cells * 1 + owner]), grad_xy); atomicAdd(&(output[num_cells * 1 + neighbor]), -grad_xy); + atomicAdd(&(output[num_cells * 2 + owner]), grad_xz); atomicAdd(&(output[num_cells * 2 + neighbor]), -grad_xz); + atomicAdd(&(output[num_cells * 3 + owner]), grad_yx); atomicAdd(&(output[num_cells * 3 + neighbor]), -grad_yx); + atomicAdd(&(output[num_cells * 4 + owner]), grad_yy); atomicAdd(&(output[num_cells * 4 + neighbor]), -grad_yy); + atomicAdd(&(output[num_cells * 5 + owner]), grad_yz); atomicAdd(&(output[num_cells * 5 + neighbor]), -grad_yz); + atomicAdd(&(output[num_cells * 6 + owner]), grad_zx); atomicAdd(&(output[num_cells * 6 + neighbor]), -grad_zx); + atomicAdd(&(output[num_cells * 7 + owner]), grad_zy); atomicAdd(&(output[num_cells * 7 + neighbor]), -grad_zy); + atomicAdd(&(output[num_cells * 8 + owner]), grad_zz); atomicAdd(&(output[num_cells * 8 + neighbor]), -grad_zz); } @@ -381,25 +430,15 @@ __global__ void fvc_grad_scalar_internal(int num_cells, int num_surfaces, double grad_y = Sfy * ssf * sign; double grad_z = Sfz * ssf * sign; - // // owner - // atomicAdd(&(output[num_cells * 0 + owner]), grad_x); - // 
atomicAdd(&(output[num_cells * 1 + owner]), grad_y); - // atomicAdd(&(output[num_cells * 2 + owner]), grad_z); - - // // neighbour - // atomicAdd(&(output[num_cells * 0 + neighbor]), -grad_x); - // atomicAdd(&(output[num_cells * 1 + neighbor]), -grad_y); - // atomicAdd(&(output[num_cells * 2 + neighbor]), -grad_z); - // owner - atomicAdd(&(output[owner * 3 + 0]), grad_x); - atomicAdd(&(output[owner * 3 + 1]), grad_y); - atomicAdd(&(output[owner * 3 + 2]), grad_z); + atomicAdd(&(output[num_cells * 0 + owner]), grad_x); + atomicAdd(&(output[num_cells * 1 + owner]), grad_y); + atomicAdd(&(output[num_cells * 2 + owner]), grad_z); // neighbour - atomicAdd(&(output[neighbor * 3 + 0]), -grad_x); - atomicAdd(&(output[neighbor * 3 + 1]), -grad_y); - atomicAdd(&(output[neighbor * 3 + 2]), -grad_z); + atomicAdd(&(output[num_cells * 0 + neighbor]), -grad_x); + atomicAdd(&(output[num_cells * 1 + neighbor]), -grad_y); + atomicAdd(&(output[num_cells * 2 + neighbor]), -grad_z); } @@ -423,9 +462,9 @@ __global__ void fvc_grad_scalar_boundary(int num_cells, int num, int offset, con double grad_y = bouSfy * bouvf; double grad_z = bouSfz * bouvf; - atomicAdd(&(output[cellIndex * 3 + 0]), grad_x * sign); - atomicAdd(&(output[cellIndex * 3 + 1]), grad_y * sign); - atomicAdd(&(output[cellIndex * 3 + 2]), grad_z * sign); + atomicAdd(&(output[num_cells * 0 + cellIndex]), grad_x * sign); + atomicAdd(&(output[num_cells * 1 + cellIndex]), grad_y * sign); + atomicAdd(&(output[num_cells * 2 + cellIndex]), grad_z * sign); // if (cellIndex == 5) // { @@ -690,19 +729,17 @@ __global__ void fvc_div_cell_tensor_internal(int num_cells, int num_surfaces, double div_z = (Sfx * ssf_xz + Sfy * ssf_yz + Sfz * ssf_zz) * sign; // owner - atomicAdd(&(output[owner * 3 + 0]), div_x); - atomicAdd(&(output[owner * 3 + 1]), div_y); - atomicAdd(&(output[owner * 3 + 2]), div_z); + atomicAdd(&(output[num_cells * 0 + owner]), div_x); + atomicAdd(&(output[num_cells * 1 + owner]), div_y); + atomicAdd(&(output[num_cells * 
2 + owner]), div_z); // neighbour - atomicAdd(&(output[neighbor * 3 + 0]), -div_x); - atomicAdd(&(output[neighbor * 3 + 1]), -div_y); - atomicAdd(&(output[neighbor * 3 + 2]), -div_z); - - + atomicAdd(&(output[num_cells * 0 + neighbor]), -div_x); + atomicAdd(&(output[num_cells * 1 + neighbor]), -div_y); + atomicAdd(&(output[num_cells * 2 + neighbor]), -div_z); } -__global__ void fvc_div_cell_tensor_boundary(int num_boundary_faces, int num, int offset, const int *face2Cells, +__global__ void fvc_div_cell_tensor_boundary(int num_cells, int num_boundary_faces, int num, int offset, const int *face2Cells, const double *boundary_face_vector, const double *boundary_vf, double *output, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; @@ -730,9 +767,9 @@ __global__ void fvc_div_cell_tensor_boundary(int num_boundary_faces, int num, in double bouDiv_y = (bouSfx * boussf_xy + bouSfy * boussf_yy + bouSfz * boussf_zy) * sign; double bouDiv_z = (bouSfx * boussf_xz + bouSfy * boussf_yz + bouSfz * boussf_zz) * sign; - atomicAdd(&(output[cellIndex * 3 + 0]), bouDiv_x); - atomicAdd(&(output[cellIndex * 3 + 1]), bouDiv_y); - atomicAdd(&(output[cellIndex * 3 + 2]), bouDiv_z); + atomicAdd(&(output[num_cells * 0 + cellIndex]), bouDiv_x); + atomicAdd(&(output[num_cells * 1 + cellIndex]), bouDiv_y); + atomicAdd(&(output[num_cells * 2 + cellIndex]), bouDiv_z); // if (cellIndex == 0) // { @@ -752,6 +789,85 @@ __global__ void fvc_div_cell_tensor_boundary(int num_boundary_faces, int num, in // } } +__global__ void constructVecDiag(int num_cells, const double *diag, double *diag_vec, + const double *source, double *b) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + diag_vec[num_cells * 0 + index] = diag[index]; + diag_vec[num_cells * 1 + index] = diag[index]; + diag_vec[num_cells * 2 + index] = diag[index]; + + b[num_cells * 0 + index] = source[num_cells * 0 + index]; + b[num_cells * 1 + index] = source[num_cells * 1 + index]; + 
b[num_cells * 2 + index] = source[num_cells * 2 + index]; +} + +__global__ void addBoundaryDiagSrc(int num_cells, int num_boundary_surfaces, const int *face2Cells, + const double *internal_coeffs, const double *boundary_coeffs, double *diag_vec, double *b) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_surfaces) + return; + + int cellIndex = face2Cells[index]; + + double internalCoeffx = internal_coeffs[num_boundary_surfaces * 0 + index]; + double internalCoeffy = internal_coeffs[num_boundary_surfaces * 1 + index]; + double internalCoeffz = internal_coeffs[num_boundary_surfaces * 2 + index]; + + double boundaryCoeffx = boundary_coeffs[num_boundary_surfaces * 0 + index]; + double boundaryCoeffy = boundary_coeffs[num_boundary_surfaces * 1 + index]; + double boundaryCoeffz = boundary_coeffs[num_boundary_surfaces * 2 + index]; + + atomicAdd(&diag_vec[num_cells * 0 + cellIndex], internalCoeffx); + atomicAdd(&diag_vec[num_cells * 1 + cellIndex], internalCoeffy); + atomicAdd(&diag_vec[num_cells * 2 + cellIndex], internalCoeffz); + + atomicAdd(&b[num_cells * 0 + cellIndex], boundaryCoeffx); + atomicAdd(&b[num_cells * 1 + cellIndex], boundaryCoeffy); + atomicAdd(&b[num_cells * 2 + cellIndex], boundaryCoeffz); +} + +__global__ void ldu_to_csr_offDiag(int num_cells, int num_surfaces, + const int *lowCSRIndex, const int *uppCSRIndex, + const double *lower, const double *upper, + double *A_csr) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_surfaces) + return; + + int uppIndex = uppCSRIndex[index]; + int lowIndex = lowCSRIndex[index]; + double upp = upper[index]; + double low = lower[index]; + A_csr[(num_cells + 2 * num_surfaces) * 0 + uppIndex] = upper[index]; + A_csr[(num_cells + 2 * num_surfaces) * 1 + uppIndex] = upper[index]; + A_csr[(num_cells + 2 * num_surfaces) * 2 + uppIndex] = upper[index]; + + A_csr[(num_cells + 2 * num_surfaces) * 0 + lowIndex] = lower[index]; + A_csr[(num_cells + 2 * num_surfaces) * 1 + 
lowIndex] = lower[index]; + A_csr[(num_cells + 2 * num_surfaces) * 2 + lowIndex] = lower[index]; +} + +__global__ void ldu_to_csr_Diag(int num_cells, int num_surfaces, + const int *diagCSRIndex, const double *diag_vec, + double *A_csr) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + int diagIndex = diagCSRIndex[index]; + A_csr[(num_cells + 2 * num_surfaces) * 0 + diagIndex] = diag_vec[num_cells * 0 + index]; + A_csr[(num_cells + 2 * num_surfaces) * 1 + diagIndex] = diag_vec[num_cells * 1 + index]; + A_csr[(num_cells + 2 * num_surfaces) * 2 + diagIndex] = diag_vec[num_cells * 2 + index]; +} + + void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output) { size_t threads_per_block = 256; @@ -770,13 +886,13 @@ void field_multiply_scalar(cudaStream_t stream, int num_cells, const double *input1, const double *input2, double *output, int num_boundary_surfaces, const double *boundary_input1, const double *boundary_input2, double *boundary_output) { - TICK_INIT; + TICK_INIT_EVENT; size_t threads_per_block = 256; size_t blocks_per_grid = (std::max(num_cells, num_boundary_surfaces) + threads_per_block - 1) / threads_per_block; - TICK_START; + TICK_START_EVENT; field_multiply_scalar_kernel<<>>(num_cells, num_boundary_surfaces, input1, input2, output, boundary_input1, boundary_input2, boundary_output); - TICK_END(field_multiply_scalar_kernel); + TICK_END_EVENT(field_multiply_scalar_kernel); } void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volume, const double *fvc_output, double *source) @@ -787,16 +903,35 @@ void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volu volume, fvc_output, source); } -void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, - const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, +void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int 
num_boundary_surface, + const int* boundary_cell_face, const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs, - double *A, double *b) + double *A, double *b, double *diag_vec) { + // construct new diag with size of 3*num_cells + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + constructVecDiag<<>>(num_cells, diag, diag_vec, source, b); + + // add coeff to source and diagnal + blocks_per_grid = (num_boundary_surface + threads_per_block - 1) / threads_per_block; + addBoundaryDiagSrc<<>>(num_cells, num_boundary_surface, + boundary_cell_face, internal_coeffs, boundary_coeffs, diag_vec, b); + + // convert offdiag + blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + ldu_to_csr_offDiag<<>>(num_cells, num_surfaces, + lower_to_csr_index, upper_to_csr_index, lower, upper, A); + + // convert diag + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + ldu_to_csr_Diag<<>>(num_cells, num_surfaces, + diag_to_csr_index, diag_vec, A); } -void update_boundary_coeffs_vector(cudaStream_t stream, int num_patches, +void update_boundary_coeffs_vector(cudaStream_t stream, int num_boundary_surfaces, int num_patches, const int *patch_size, const int *patch_type, double *value_internal_coeffs, double *value_boundary_coeffs, double *gradient_internal_coeffs, double *gradient_boundary_coeffs) @@ -811,7 +946,7 @@ void update_boundary_coeffs_vector(cudaStream_t stream, int num_patches, // TODO: just basic patch type now // TODO: just vector version now if (patch_type[i] == boundaryConditions::zeroGradient) { - update_boundary_coeffs_zeroGradient_vector<<>>(patch_size[i], offset, + update_boundary_coeffs_zeroGradient_vector<<>>(num_boundary_surfaces, patch_size[i], offset, value_internal_coeffs, 
value_boundary_coeffs, gradient_internal_coeffs, gradient_boundary_coeffs); } else if (0) { // xxx @@ -825,32 +960,41 @@ void fvm_ddt_vector(cudaStream_t stream, int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *volume, double *diag, double *source, double sign) { +#ifdef TIME_GPU printf("#############kernel profile#############\n"); - TICK_INIT; - size_t threads_per_block = 1024; +#endif + TICK_INIT_EVENT; + size_t threads_per_block = 64; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START; +#ifdef TIME_GPU + printf("warm up ...\n"); + warmup<<>>(num_cells); +#endif + TICK_START_EVENT; fvm_ddt_vector_kernel<<>>(num_cells, rDeltaT, rho, rho_old, vf, volume, diag, source, sign); - TICK_END(fvm_ddt_vector_kernel); + TICK_END_EVENT(fvm_ddt_vector_kernel); } -void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, +void fvm_div_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, + const int *lowerAddr, const int *upperAddr, const double *phi, const double *weight, double *lower, double *upper, double *diag, // end for internal int num_patches, const int *patch_size, const int *patch_type, const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) { - TICK_INIT; - size_t threads_per_block = 1024; - size_t blocks_per_grid = 1; - - blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START; + TICK_INIT_EVENT; + size_t threads_per_block = 256; + size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; +#ifdef TIME_GPU + printf("warm up ...\n"); + warmup<<>>(num_surfaces); +#endif + TICK_START_EVENT; fvm_div_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, phi, weight, lower, upper, diag, sign); - TICK_END(fvm_div_vector_internal); + 
TICK_END_EVENT(fvm_div_vector_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -860,11 +1004,11 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { // TODO: just vector version now - TICK_START; - fvm_div_vector_boundary<<>>(patch_size[i], offset, + TICK_START_EVENT; + fvm_div_vector_boundary<<>>(num_boundary_surfaces, patch_size[i], offset, boundary_phi, value_internal_coeffs, value_boundary_coeffs, internal_coeffs, boundary_coeffs, sign); - TICK_END(fvm_div_vector_boundary); + TICK_END_EVENT(fvm_div_vector_boundary); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); @@ -873,7 +1017,7 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, } } -void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, +void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma, double *lower, double *upper, double *diag, // end for internal @@ -882,15 +1026,13 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) { - TICK_INIT; + TICK_INIT_EVENT; size_t threads_per_block = 1024; - size_t blocks_per_grid = 1; - - blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START; + size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + TICK_START_EVENT; fvm_laplacian_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, weight, mag_sf, delta_coeffs, gamma, lower, upper, diag, sign); - TICK_END(fvm_laplacian_vector_internal); + 
TICK_END_EVENT(fvm_laplacian_vector_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { threads_per_block = 64; @@ -899,11 +1041,11 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { // TODO: just vector version now - TICK_START; - fvm_laplacian_vector_boundary<<>>(patch_size[i], offset, + TICK_START_EVENT; + fvm_laplacian_vector_boundary<<>>(num_boundary_surfaces, patch_size[i], offset, boundary_mag_sf, boundary_gamma, gradient_internal_coeffs, gradient_boundary_coeffs, internal_coeffs, boundary_coeffs, sign); - TICK_END(fvm_laplacian_vector_boundary); + TICK_END_EVENT(fvm_laplacian_vector_boundary); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); @@ -933,13 +1075,13 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, int n const double *boundary_deltaCoeffs, double sign) { checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 9 * sizeof(double), stream)); - TICK_INIT; - size_t threads_per_block = 1024; + TICK_INIT_EVENT; + size_t threads_per_block = 32; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START; + TICK_START_EVENT; fvc_grad_vector_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output); - TICK_END(fvc_grad_vector_internal); + TICK_END_EVENT(fvc_grad_vector_internal); int offset = 0; // finish conctruct grad field except dividing cell volume @@ -950,11 +1092,11 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, int n if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { // TODO: just vector version now - TICK_START; + TICK_START_EVENT; fvc_grad_vector_boundary<<>>(num_cells, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output); - 
TICK_END(fvc_grad_vector_boundary); + TICK_END_EVENT(fvc_grad_vector_boundary); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); @@ -965,9 +1107,9 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, int n // divide cell volume threads_per_block = 512; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START; + TICK_START_EVENT; divide_cell_volume_tsr<<>>(num_cells, volume, output, sign); - TICK_END(divide_cell_volume_tsr); + TICK_END_EVENT(divide_cell_volume_tsr); // correct boundary conditions offset = 0; @@ -977,11 +1119,11 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, int n // TODO: just basic patch type now if (patch_type[i] == boundaryConditions::zeroGradient) { // TODO: just vector version now - TICK_START; + TICK_START_EVENT; fvc_grad_vector_correctBC_zeroGradient<<>>(num_cells, num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, output, boundary_vf, boundary_Sf, boundary_mag_Sf, boundary_output, sign); - TICK_END(fvc_grad_vector_correctBC_zeroGradient); + TICK_END_EVENT(fvc_grad_vector_correctBC_zeroGradient); } else if (patch_type[i] == boundaryConditions::fixedValue) { // TODO: implement fixedValue version fvc_grad_vector_correctBC_fixedValue<<>>(patch_size[i], offset, boundary_cell_face, @@ -997,12 +1139,12 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, int n void scale_dev2T_tensor(cudaStream_t stream, int num_cells, const double *vf1, double *vf2, int num_boundary_surfaces, const double *boundary_vf1, double *boundary_vf2) { - TICK_INIT; + TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START; + TICK_START_EVENT; scale_dev2t_tensor_kernel<<>>(num_cells, vf1, vf2); - TICK_END(scale_dev2t_tensor_kernel); + TICK_END_EVENT(scale_dev2t_tensor_kernel); blocks_per_grid = 
(num_boundary_surfaces + threads_per_block - 1) / threads_per_block; scale_dev2t_tensor_kernel<<>>(num_boundary_surfaces, boundary_vf1, boundary_vf2); @@ -1073,12 +1215,12 @@ void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, i const double *volume, double sign) { // checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 3 * sizeof(double), stream)); - TICK_INIT; + TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START; + TICK_START_EVENT; fvc_div_cell_tensor_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output, sign); - TICK_END(fvc_div_cell_tensor_internal); + TICK_END_EVENT(fvc_div_cell_tensor_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -1088,10 +1230,10 @@ void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, i if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { // TODO: just vector version now - TICK_START; - fvc_div_cell_tensor_boundary<<>>(num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, + TICK_START_EVENT; + fvc_div_cell_tensor_boundary<<>>(num_cells, num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output, sign); - TICK_END(fvc_div_cell_tensor_boundary); + TICK_END_EVENT(fvc_div_cell_tensor_boundary); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); @@ -1111,13 +1253,13 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_patches, const int *patch_size, const int *patch_type, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign) { - TICK_INIT; + TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - 
TICK_START; + TICK_START_EVENT; fvc_grad_scalar_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output, sign); - TICK_END(fvc_grad_scalar_internal); + TICK_END_EVENT(fvc_grad_scalar_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -1126,10 +1268,10 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, // TODO: just non-coupled patch type now if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { - TICK_START; + TICK_START_EVENT; fvc_grad_scalar_boundary<<>>(num_cells, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output, sign); - TICK_END(fvc_grad_scalar_internal); + TICK_END_EVENT(fvc_grad_scalar_boundary); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index 80cdc7144..49edc1b7a 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -41,7 +41,7 @@ private: double *d_grad_u = nullptr; double *d_rho_nueff = nullptr; double *d_permute = nullptr; - double *d_fvc_output = nullptr; + double *d_fvc_output = nullptr; // TODO: no need anymore // non-constant fields - boundary // thermophysical fields @@ -64,10 +64,11 @@ private: double *d_source = nullptr; double *d_internal_coeffs = nullptr; double *d_boundary_coeffs = nullptr; + double *d_diag_vector = nullptr; // non-constant fields - csr double *d_A = nullptr; - double *d_b = nullptr; + double *d_b = nullptr; // TODO: needless // field pointer map std::unordered_map fieldPointerMap; diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 73b7516c5..d30c06131 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -50,6 +50,7 @@ void dfUEqn::createNonConstantLduAndCsrFields() { checkCudaErrors(cudaMalloc((void**)&d_upper, dataBase_.surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_diag, dataBase_.cell_value_bytes)); 
checkCudaErrors(cudaMalloc((void**)&d_source, dataBase_.cell_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_diag_vector, dataBase_.cell_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_A, dataBase_.csr_value_vec_bytes)); @@ -57,7 +58,7 @@ void dfUEqn::createNonConstantLduAndCsrFields() { } void dfUEqn::initNonConstantFieldsBoundary() { - update_boundary_coeffs_vector(dataBase_.stream, dataBase_.num_patches, + update_boundary_coeffs_vector(dataBase_.stream, dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), d_value_internal_coeffs, d_value_boundary_coeffs, d_gradient_internal_coeffs, d_gradient_boundary_coeffs); @@ -87,6 +88,7 @@ void dfUEqn::preProcess(const double *h_u, const double *h_boundary_u, const dou checkCudaErrors(cudaMemsetAsync(d_boundary_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(d_A, 0, dataBase_.csr_value_vec_bytes, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(d_b, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_diag_vector, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); // TODO: maybe a better way } void dfUEqn::process() { @@ -97,24 +99,27 @@ void dfUEqn::process() { checkCudaErrors(cudaEventCreate(&stop)); checkCudaErrors(cudaEventRecord(start,0)); - // if(!graph_created) { - // DEBUG_TRACE; - // checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); +#ifndef TIME_GPU + if(!graph_created) { + DEBUG_TRACE; + checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); +#endif + permute_vector_h2d(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, d_permute); fvm_ddt_vector(dataBase_.stream, 
dataBase_.num_cells, dataBase_.rdelta_t, - dataBase_.d_rho, dataBase_.d_rho_old, dataBase_.d_u, dataBase_.d_volume, + dataBase_.d_rho, dataBase_.d_rho_old, d_permute, dataBase_.d_volume, d_diag, d_source, 1.); - fvm_div_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, + fvm_div_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, + dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_phi, dataBase_.d_weight, d_lower, d_upper, d_diag, // end for internal dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_phi, d_value_internal_coeffs, d_value_boundary_coeffs, d_internal_coeffs, d_boundary_coeffs, 1.); - //TODO: merge bellow six kernels field_multiply_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.d_rho, d_nu_eff, d_rho_nueff, // end for internal dataBase_.num_boundary_surfaces, dataBase_.d_boundary_rho, d_boundary_nu_eff, d_boundary_rho_nueff); - fvm_laplacian_vector(dataBase_.stream, dataBase_.num_surfaces, + fvm_laplacian_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_weight, dataBase_.d_mag_sf, dataBase_.d_delta_coeffs, d_rho_nueff, d_lower, d_upper, d_diag, // end for internal @@ -124,7 +129,7 @@ void dfUEqn::process() { d_internal_coeffs, d_boundary_coeffs, -1); fvc_grad_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, - dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_u, d_grad_u, + dataBase_.d_weight, dataBase_.d_sf, d_permute, d_grad_u, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_face_cell, dataBase_.d_boundary_u, dataBase_.d_boundary_sf, dataBase_.d_volume, dataBase_.d_boundary_mag_sf, d_boundary_grad_u, dataBase_.d_boundary_delta_coeffs); @@ -137,7 +142,6 @@ void dfUEqn::process() { dataBase_.d_boundary_face_cell, 
d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume); // fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, // dataBase_.d_volume, d_fvc_output, d_source); - // TODO: merge bellow two kernel fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_source, @@ -146,12 +150,14 @@ void dfUEqn::process() { // fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, // dataBase_.d_volume, d_fvc_output, d_source); - // checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); - // checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0)); - // graph_created = true; - // } - // DEBUG_TRACE; - // checkCudaErrors(cudaGraphLaunch(graph_instance, dataBase_.stream)); +#ifndef TIME_GPU + checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); + checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0)); + graph_created = true; + } + DEBUG_TRACE; + checkCudaErrors(cudaGraphLaunch(graph_instance, dataBase_.stream)); +#endif checkCudaErrors(cudaEventRecord(stop,0)); checkCudaErrors(cudaEventSynchronize(start)); @@ -168,11 +174,14 @@ void dfUEqn::sync() } void dfUEqn::solve() { - //ldu_to_csr(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, - // dataBase_.d_lower_to_csr_index, dataBase_.d_upper_to_csr_index, dataBase_.d_diag_to_csr_index, - // d_lower, d_upper, d_diag, d_source, d_internal_coeffs, d_boundary_coeffs, d_A, d_b); + ldu_to_csr(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, + dataBase_.d_boundary_face_cell, + dataBase_.d_lower_to_csr_index, dataBase_.d_upper_to_csr_index, dataBase_.d_diag_to_csr_index, + d_lower, d_upper, d_diag, d_source, d_internal_coeffs, d_boundary_coeffs, d_A, d_b, d_diag_vector); int nNz = dataBase_.num_cells + dataBase_.num_surfaces * 2; // matrix entries + sync(); + if (num_iteration == 0) 
// first interation { printf("Initializing AmgX Linear Solver\n"); @@ -186,19 +195,19 @@ void dfUEqn::solve() { UySolver->updateOperator(dataBase_.num_cells, nNz, d_A + nNz); UzSolver->updateOperator(dataBase_.num_cells, nNz, d_A + 2 * nNz); } - UxSolver->solve(dataBase_.num_cells, dataBase_.d_u, d_b); - UySolver->solve(dataBase_.num_cells, dataBase_.d_u + dataBase_.num_cells, d_b + dataBase_.num_cells); - UzSolver->solve(dataBase_.num_cells, dataBase_.d_u + 2 * dataBase_.num_cells, d_b + 2 * dataBase_.num_cells); + UxSolver->solve(dataBase_.num_cells, d_permute, d_b); + UySolver->solve(dataBase_.num_cells, d_permute + dataBase_.num_cells, d_b + dataBase_.num_cells); + UzSolver->solve(dataBase_.num_cells, d_permute + 2 * dataBase_.num_cells, d_b + 2 * dataBase_.num_cells); num_iteration++; } -void dfUEqn::postProcess(double *h_u) { - permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, d_permute); - checkCudaErrors(cudaMemcpyAsync(h_u, d_permute, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); +void dfUEqn::postProcess(double *h_u) { // TODO: Here may be a bug + permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, d_permute, dataBase_.d_u); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_u, d_permute, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); // some boundary conditions may also need vf.boundary, deltaCoeffs.boundary, and weight.boundary - update_boundary_coeffs_vector(dataBase_.stream, dataBase_.num_patches, + update_boundary_coeffs_vector(dataBase_.stream, dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), d_value_internal_coeffs, d_value_boundary_coeffs, d_gradient_internal_coeffs, d_gradient_boundary_coeffs); @@ -251,30 +260,41 @@ void dfUEqn::compareResult(const double *lower, const double *upper, const doubl checkVectorEqual(dataBase_.num_cells, diag, h_diag.data(), 
1e-14, printFlag); DEBUG_TRACE; - std::vector h_source; - // , h_source_ref; + std::vector h_source, h_source_ref; h_source.resize(dataBase_.num_cells * 3); - // h_source_ref.resize(dataBase_.num_cells * 3); - // for (int i = 0; i < dataBase_.num_cells; i++) { - // h_source_ref[0 * dataBase_.num_cells + i] = source[i * 3 + 0]; - // h_source_ref[1 * dataBase_.num_cells + i] = source[i * 3 + 1]; - // h_source_ref[2 * dataBase_.num_cells + i] = source[i * 3 + 2]; - // } + h_source_ref.resize(dataBase_.num_cells * 3); + for (int i = 0; i < dataBase_.num_cells; i++) { + h_source_ref[0 * dataBase_.num_cells + i] = source[i * 3 + 0]; + h_source_ref[1 * dataBase_.num_cells + i] = source[i * 3 + 1]; + h_source_ref[2 * dataBase_.num_cells + i] = source[i * 3 + 2]; + } checkCudaErrors(cudaMemcpy(h_source.data(), d_source, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); fprintf(stderr, "check h_source"); - checkVectorEqual(dataBase_.num_cells * 3, source, h_source.data(), 1e-14, printFlag); + checkVectorEqual(dataBase_.num_cells * 3, h_source_ref.data(), h_source.data(), 1e-14, printFlag); DEBUG_TRACE; - std::vector h_internal_coeffs; + std::vector h_internal_coeffs, h_internal_coeffs_ref; h_internal_coeffs.resize(dataBase_.num_boundary_surfaces * 3); + h_internal_coeffs_ref.resize(dataBase_.num_boundary_surfaces * 3); + for (int i = 0; i < dataBase_.num_boundary_surfaces; i++) { + h_internal_coeffs_ref[0 * dataBase_.num_boundary_surfaces + i] = internal_coeffs[i * 3 + 0]; + h_internal_coeffs_ref[1 * dataBase_.num_boundary_surfaces + i] = internal_coeffs[i * 3 + 1]; + h_internal_coeffs_ref[2 * dataBase_.num_boundary_surfaces + i] = internal_coeffs[i * 3 + 2]; + } checkCudaErrors(cudaMemcpy(h_internal_coeffs.data(), d_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); - checkVectorEqual(dataBase_.num_boundary_surfaces * 3, internal_coeffs, h_internal_coeffs.data(), 1e-14, printFlag); + 
checkVectorEqual(dataBase_.num_boundary_surfaces * 3, h_internal_coeffs_ref.data(), h_internal_coeffs.data(), 1e-14, printFlag); DEBUG_TRACE; - std::vector h_boundary_coeffs; + std::vector h_boundary_coeffs, h_boundary_coeffs_ref; h_boundary_coeffs.resize(dataBase_.num_boundary_surfaces * 3); + h_boundary_coeffs_ref.resize(dataBase_.num_boundary_surfaces * 3); + for (int i = 0; i < dataBase_.num_boundary_surfaces; i++) { + h_boundary_coeffs_ref[0 * dataBase_.num_boundary_surfaces + i] = boundary_coeffs[i * 3 + 0]; + h_boundary_coeffs_ref[1 * dataBase_.num_boundary_surfaces + i] = boundary_coeffs[i * 3 + 1]; + h_boundary_coeffs_ref[2 * dataBase_.num_boundary_surfaces + i] = boundary_coeffs[i * 3 + 2]; + } checkCudaErrors(cudaMemcpy(h_boundary_coeffs.data(), d_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); - checkVectorEqual(dataBase_.num_boundary_surfaces * 3, boundary_coeffs, h_boundary_coeffs.data(), 1e-14, printFlag); + checkVectorEqual(dataBase_.num_boundary_surfaces * 3, h_boundary_coeffs_ref.data(), h_boundary_coeffs.data(), 1e-14, printFlag); DEBUG_TRACE; // std::vector h_tmpVal; From d6844885a067df06f3ddac8c5268f984524d0815 Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Mon, 21 Aug 2023 21:19:21 +0800 Subject: [PATCH 25/25] modify app --- applications/solvers/dfLowMachFoam/Make/files | 2 +- .../solvers/dfLowMachFoam/Make/options | 1 - applications/solvers/dfLowMachFoam/UEqn.H | 115 +++++ .../solvers/dfLowMachFoam/dfLowMachFoam.C | 46 +- .../solvers/dfLowMachFoam_new/CMakeLists.txt | 126 ----- applications/solvers/dfLowMachFoam_new/EEqn.H | 141 ------ .../solvers/dfLowMachFoam_new/Make/files | 3 - .../solvers/dfLowMachFoam_new/Make/options | 58 --- applications/solvers/dfLowMachFoam_new/UEqn.H | 247 ---------- applications/solvers/dfLowMachFoam_new/YEqn.H | 207 -------- .../solvers/dfLowMachFoam_new/YEqn_RR.H | 61 --- .../solvers/dfLowMachFoam_new/correctPhi.H | 12 - .../solvers/dfLowMachFoam_new/createFields.H | 176 
------- .../dfLowMachFoam_new/createGPUSolver.H | 97 ---- .../dfLowMachFoam_new/createdfSolver.H | 65 --- .../solvers/dfLowMachFoam_new/dfLowMachFoam.C | 447 ------------------ applications/solvers/dfLowMachFoam_new/pEqn.H | 203 -------- .../solvers/dfLowMachFoam_new/pcEqn.H | 130 ----- .../solvers/dfLowMachFoam_new/rhoEqn.H | 86 ---- .../solvers/dfLowMachFoam_new/setRDeltaT.H | 85 ---- .../solvers/dfLowMachFoam_new/setRootCase2.H | 5 - 21 files changed, 154 insertions(+), 2159 deletions(-) delete mode 100644 applications/solvers/dfLowMachFoam_new/CMakeLists.txt delete mode 100644 applications/solvers/dfLowMachFoam_new/EEqn.H delete mode 100644 applications/solvers/dfLowMachFoam_new/Make/files delete mode 100644 applications/solvers/dfLowMachFoam_new/Make/options delete mode 100644 applications/solvers/dfLowMachFoam_new/UEqn.H delete mode 100644 applications/solvers/dfLowMachFoam_new/YEqn.H delete mode 100644 applications/solvers/dfLowMachFoam_new/YEqn_RR.H delete mode 100644 applications/solvers/dfLowMachFoam_new/correctPhi.H delete mode 100644 applications/solvers/dfLowMachFoam_new/createFields.H delete mode 100644 applications/solvers/dfLowMachFoam_new/createGPUSolver.H delete mode 100644 applications/solvers/dfLowMachFoam_new/createdfSolver.H delete mode 100644 applications/solvers/dfLowMachFoam_new/dfLowMachFoam.C delete mode 100644 applications/solvers/dfLowMachFoam_new/pEqn.H delete mode 100644 applications/solvers/dfLowMachFoam_new/pcEqn.H delete mode 100644 applications/solvers/dfLowMachFoam_new/rhoEqn.H delete mode 100644 applications/solvers/dfLowMachFoam_new/setRDeltaT.H delete mode 100644 applications/solvers/dfLowMachFoam_new/setRootCase2.H diff --git a/applications/solvers/dfLowMachFoam/Make/files b/applications/solvers/dfLowMachFoam/Make/files index 4eff5915e..9b7e89945 100644 --- a/applications/solvers/dfLowMachFoam/Make/files +++ b/applications/solvers/dfLowMachFoam/Make/files @@ -1,3 +1,3 @@ -new_dfLowMachFoam.C +dfLowMachFoam.C EXE = 
$(DF_APPBIN)/dfLowMachFoam diff --git a/applications/solvers/dfLowMachFoam/Make/options b/applications/solvers/dfLowMachFoam/Make/options index e1959ada3..bda93210e 100644 --- a/applications/solvers/dfLowMachFoam/Make/options +++ b/applications/solvers/dfLowMachFoam/Make/options @@ -9,7 +9,6 @@ EXE_INC = -std=c++14 \ $(PFLAGS) $(PINC) \ $(if $(LIBTORCH_ROOT),-DUSE_LIBTORCH,) \ $(if $(PYTHON_INC_DIR),-DUSE_PYTORCH,) \ - $(if $(AMGX_DIR),-DGPUSolver_,) \ -I$(LIB_SRC)/transportModels/compressible/lnInclude \ -I$(LIB_SRC)/thermophysicalModels/basic/lnInclude \ -I$(LIB_SRC)/TurbulenceModels/turbulenceModels/lnInclude \ diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H index c3ee91068..38934abdb 100644 --- a/applications/solvers/dfLowMachFoam/UEqn.H +++ b/applications/solvers/dfLowMachFoam/UEqn.H @@ -86,6 +86,121 @@ // K = 0.5*magSqr(U); // } // UEqn_GPU.checkValue(true); +#elif defined GPUSolverNew_ + const tmp nuEff_tmp(turbulence->nuEff()); + const volScalarField& nuEff = nuEff_tmp(); + + // run CPU, for temp + tmp tUEqn + ( + fvm::ddt(rho, U) + + + fvm::div(phi, U) + + + turbulence->divDevRhoReff(U) + == -fvc::grad(p) + ); + fvVectorMatrix& UEqn = tUEqn.ref(); + + // run GPU + // preProcess + // TODO: preProcessForRhoEqn for temp, now we only transfer phi(instead of rhoEqn) used by fvm::div(phi, U) + UEqn_GPU.sync(); + double *h_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::internal); + double *h_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::internal); + double *h_boundary_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::boundary); + memcpy(h_rho, &rho[0], dfDataBase.cell_value_bytes); + memcpy(h_phi, &phi[0], dfDataBase.surface_value_bytes); + int offset = 0; + forAll(phi.boundaryField(), patchi) + { + const fvsPatchScalarField& patchPhi = phi.boundaryField()[patchi]; + int patchsize = patchPhi.size(); + memcpy(h_boundary_phi + offset, &patchPhi[0], patchsize * 
sizeof(double)); + offset += patchsize; + } + UEqn_GPU.preProcessForRhoEqn(h_rho, h_phi, h_boundary_phi); + DEBUG_TRACE; + + TICK_START; + // preparing u, p, nu_eff, and rho.boundary used by UEqn_GPU.preProcess() + double *h_u = dfDataBase.getFieldPointer("u", location::cpu, position::internal); + double *h_boundary_u = dfDataBase.getFieldPointer("u", location::cpu, position::boundary); + double *h_p = dfDataBase.getFieldPointer("p", location::cpu, position::internal); + double *h_boundary_p = dfDataBase.getFieldPointer("p", location::cpu, position::boundary); + double *h_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::internal); + double *h_boundary_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::boundary); + double *h_boundary_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::boundary); + TICK_STOP(get pointer); + + TICK_START; + U.oldTime(); + memcpy(h_u, &U.oldTime()[0][0], dfDataBase.cell_value_vec_bytes); + memcpy(h_p, &p[0], dfDataBase.cell_value_bytes); + memcpy(h_nu_eff, &nuEff[0], dfDataBase.cell_value_bytes); + TICK_STOP(copy to pinned memory); + + TICK_START; + offset = 0; + forAll(U.boundaryField(), patchi) + { + const fvPatchVectorField& patchU = U.boundaryField()[patchi]; + const fvPatchScalarField& patchP = p.boundaryField()[patchi]; + const fvPatchScalarField& patchNuEff = nuEff.boundaryField()[patchi]; + const fvPatchScalarField& patchRho = rho.boundaryField()[patchi]; + int patchsize = patchU.size(); + memcpy(h_boundary_u + offset * 3, &patchU[0][0], patchsize * 3 * sizeof(double)); + memcpy(h_boundary_p + offset, &patchP[0], patchsize * sizeof(double)); + memcpy(h_boundary_nu_eff + offset, &patchNuEff[0], patchsize * sizeof(double)); + memcpy(h_boundary_rho + offset, &patchRho[0], patchsize * sizeof(double)); + offset += patchsize; + } + TICK_STOP(CPU prepare boundary time); + + TICK_START; + UEqn_GPU.preProcess(h_u, h_boundary_u, h_p, h_boundary_p, h_nu_eff, h_boundary_nu_eff, 
h_boundary_rho); + DEBUG_TRACE; + UEqn_GPU.sync(); + TICK_STOP(GPU preProcess time); + + // process + TICK_START; + UEqn_GPU.process(); + DEBUG_TRACE; + UEqn_GPU.sync(); + TICK_STOP(GPU process time); + + TICK_START; + UEqn_GPU.solve(); + TICK_STOP(GPU solve time); + + // postProcess + TICK_START; + UEqn_GPU.postProcess(h_u); + U.correctBoundaryConditions(); + DEBUG_TRACE; + TICK_STOP(post process time); + + // checkResult + // TODO: for temp, now we compare ldu, finally we compare csr + std::vector h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); + std::vector h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); + offset = 0; + for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) + { + int patchsize = dfDataBase.patch_size[patchi]; + const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0]; + const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0]; + memcpy(h_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double)); + memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double)); + offset += patchsize; + } + bool printFlag = false; + UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0], + h_internal_coeffs.data(), h_boundary_coeffs.data(), + // &DivTensor[0][0], + printFlag); + DEBUG_TRACE; #else start1 = std::clock(); tmp tUEqn diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C index db6b25b18..6ea4251af 100644 --- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C +++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C @@ -60,14 +60,34 @@ Description #include "basicThermo.H" #include "CombustionModel.H" -#ifdef GPUSolver_ +#define GPUSolverNew_ +#define TIME + +#ifdef GPUSolverNew_ #include "dfUEqn.H" -#include "dfYEqn.H" -#include "dfRhoEqn.H" -#include "dfEEqn.H" +// #include "dfYEqn.H" +// #include "dfRhoEqn.H" +// #include 
"dfEEqn.H" +#include "dfMatrixDataBase.H" +#include "dfMatrixOpBase.H" #include #include + +#include "createGPUSolver.H" + #include "upwind.H" +#include "GenFvMatrix.H" +#endif + +#ifdef TIME + #define TICK_START \ + start_new = std::clock(); + #define TICK_STOP(prefix) \ + stop_new = std::clock(); \ + Foam::Info << #prefix << " time = " << double(stop_new - start_new) / double(CLOCKS_PER_SEC) << " s" << Foam::endl; +#else + #define TICK_START + #define TICK_STOP(prefix) #endif // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // @@ -148,6 +168,8 @@ int main(int argc, char *argv[]) label timeIndex = 0; clock_t start, end, start1, end1, start2, end2; + clock_t start_new, stop_new; + double time_new = 0; turbulence->validate(); @@ -158,9 +180,11 @@ int main(int argc, char *argv[]) } start1 = std::clock(); - #ifdef GPUSolver_ - #include "createdfSolver.H" - #endif +#ifdef GPUSolverNew_ + createGPUBase(mesh, Y); + createGPUUEqn(CanteraTorchProperties, U); +#endif + end1 = std::clock(); time_monitor_init += double(end1 - start1) / double(CLOCKS_PER_SEC); @@ -187,7 +211,9 @@ int main(int argc, char *argv[]) runTime++; Info<< "Time = " << runTime.timeName() << nl << endl; - +#ifdef GPUSolverNew_ + dfDataBase.preTimeStep(&rho.oldTime()[0]); +#endif clock_t loop_start = std::clock(); // --- Pressure-velocity PIMPLE corrector loop while (pimple.loop()) @@ -276,6 +302,10 @@ int main(int argc, char *argv[]) rho = thermo.rho(); +#ifdef GPUSolverNew_ + dfDataBase.postTimeStep(); +#endif + runTime.write(); Info<< "========Time Spent in diffenet parts========"<< endl; Info<< "loop Time = " << loop_time << " s" << endl; diff --git a/applications/solvers/dfLowMachFoam_new/CMakeLists.txt b/applications/solvers/dfLowMachFoam_new/CMakeLists.txt deleted file mode 100644 index 645289a64..000000000 --- a/applications/solvers/dfLowMachFoam_new/CMakeLists.txt +++ /dev/null @@ -1,126 +0,0 @@ -cmake_minimum_required(VERSION 3.5) -project(dfLowMachFoam LANGUAGES CXX) 
-FIND_PACKAGE(MPI REQUIRED) -FIND_PACKAGE(OpenMP REQUIRED) -FIND_PACKAGE(CUDA REQUIRED) - -# Check valid thirdParty -if(DEFINED ENV{WM_PROJECT_DIR}) - MESSAGE(STATUS "OpenFOAM: " $ENV{WM_PROJECT_DIR}) -else() - message(FATAL_ERROR "OpenFOAM is not sourced") -endif(DEFINED ENV{WM_PROJECT_DIR}) - -if(DEFINED ENV{CANTERA_ROOT}) - MESSAGE(STATUS "libcantera: " $ENV{CANTERA_ROOT}) - SET(CANTERA_ROOT $ENV{CANTERA_ROOT}) -else() - message(FATAL_ERROR "libcantera directory is not specified") -endif(DEFINED ENV{CANTERA_ROOT}) - -# define variables -SET(OpenFOAM_LIB_DIR $ENV{FOAM_LIBBIN}) -SET(OpenFOAM_SRC $ENV{FOAM_SRC}) - -SET(DF_ROOT $ENV{DF_ROOT}) -SET(DF_SRC $ENV{DF_SRC}) -SET(SRC_ORIG $ENV{SRC_ORIG}) - -# set compilation options -SET(CMAKE_EXE_LINKER_FLAGS "-fuse-ld=bfd -Xlinker --add-needed -Xlinker --no-as-needed") -SET (CMAKE_C_FLAGS ${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}) -SET (CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}) - -SET(CMAKE_C_COMPILER g++) -SET(PATH_LIB_OPENMPI "openmpi-system") # Foundation version -SET(EXE_COMPILE_OPTION "-std=c++11 -m64 -Dlinux64 -DWM_ARCH_OPTION=64 --DWM_DP -DWM_LABEL_SIZE=32 -Wall -Wextra -Wold-style-cast -Wnon-virtual-dtor --Wno-unused-parameter -Wno-invalid-offsetof -Wno-attributes -O3 --DNoRepository -ftemplate-depth-100 -std=c++14 --Wno-unused-variable -Wno-unused-but-set-variable -Wno-old-style-cast -DOMPI_SKIP_MPICXX --pthread -fPIC") -add_definitions("${EXE_COMPILE_OPTION}") - -# add header files -FUNCTION(R_SEARCH search_path return_list) - FILE(GLOB_RECURSE new_list ${search_path}/*.H) - SET(dir_list "") - FOREACH(file_path ${new_list}) - GET_FILENAME_COMPONENT(dir_path ${file_path} PATH) - SET(dir_list ${dir_list} ${dir_path}) - ENDFOREACH() - LIST(REMOVE_DUPLICATES dir_list) - SET(${return_list} ${dir_list} PARENT_SCOPE) -ENDFUNCTION(R_SEARCH) - -R_SEARCH(${DF_SRC}/dfCombustionModels dfcombustion_inc) -R_SEARCH(${DF_SRC}/dfCanteraMixture dfcantera_inc) -R_SEARCH(${DF_SRC}/lagrangian/intermediate 
dflagrangianinter_inc) -R_SEARCH(${DF_SRC}/lagrangian/spray dflagrangianspray_inc) -R_SEARCH(${DF_SRC}/lagrangian/turbulence dflagrangianturb_inc) -R_SEARCH(${DF_SRC}/dfChemistryModel dfchemistry_inc) -R_SEARCH(${DF_SRC}/thermophysicalModels/thermophysicalProperties dfthermophysicalprop_inc) -R_SEARCH(${DF_SRC}/thermophysicalModels/thermophysicalProperties dfthermophysicalprop_inc) -R_SEARCH(${DF_SRC}/thermophysicalModels/basic dfthermophysicalbasic_inc) -R_SEARCH(${DF_SRC}/thermophysicalModels/SLGThermo dfthermophysicalslg_inc) -R_SEARCH(${DF_SRC}/TurbulenceModels dfturbulence_inc) -R_SEARCH(${DF_SRC}/dynamicMesh dfnewdynamic_inc) -R_SEARCH(${DF_SRC}/dynamicFvMesh dffvdynamic_inc) - -include_directories( - ${OpenFOAM_SRC}/finiteVolume/lnInclude - ${OpenFOAM_SRC}/OSspecific/POSIX/lnInclude - ${OpenFOAM_SRC}/OpenFOAM/lnInclude - ${OpenFOAM_SRC}/transportModels/compressible/lnInclude - ${OpenFOAM_SRC}/thermophysicalModels/basic/lnInclude - ${OpenFOAM_SRC}/TurbulenceModels/turbulenceModels/lnInclude - ${OpenFOAM_SRC}/TurbulenceModels/compressible/lnInclude - ${OpenFOAM_SRC}/finiteVolume/cfdTools - ${OpenFOAM_SRC}/finiteVolume/lnInclude - ${OpenFOAM_SRC}/meshTools/lnInclude - ${OpenFOAM_SRC}/sampling/lnInclude - ${OpenFOAM_SRC}/dynamicFvMesh/lnInclude - ${OpenFOAM_SRC}/Pstream/mpi - ${dfcantera_inc} - ${dfchemistry_inc} - ${dfcombustion_inc} - ${CANTERA_ROOT}/include - ${MPI_INCLUDE_PATH} - ${PROJECT_SOURCE_DIR} - ${CUDA_INCLUDE_DIRS} - /home/runze/AmgX/AMGX/include - /home/runze/deepflame-dev/src_gpu -) - -# add execution -add_executable(${PROJECT_NAME} ${PROJECT_SOURCE_DIR}/dfLowMachFoam.C) - -target_link_libraries(${PROJECT_NAME} - $ENV{FOAM_LIBBIN}/libfiniteVolume.so libmeshTools.so libcompressibleTransportModels.so - libturbulenceModels.so libsampling.so libOpenFOAM.so - ${CANTERA_ROOT}/lib/libcantera_shared.so.2 - ${DF_ROOT}/lib/libdfChemistryModel.so - ${DF_ROOT}/lib/libdfCanteraMixture.so - ${DF_ROOT}/lib/libdfFluidThermophysicalModels.so - 
${DF_ROOT}/lib/libdfCombustionModels.so - $ENV{FOAM_LIBBIN}/openmpi-system/libPstream.so - ${MPI_LIBRARIES} - ${CUDA_LIBRARIES} - /home/runze/AmgX/AMGX/build/libamgxsh.so - /home/runze/deepflame-dev/src_gpu/build/libdfMatrix.so -) - -if(DEFINED ENV{PYTHON_INC_DIR}) - add_definitions(-DUSE_PYTORCH) - find_package (Python REQUIRED COMPONENTS Interpreter Development) - find_package(pybind11) - include_directories( - ${Python_INCLUDE_DIRS} - ${pybind11_INCLUDE_DIR}/pybind11 - ) - target_link_libraries(${PROJECT_NAME} ${Python_LIBRARIES}) -endif() - -# install -set(CMAKE_INSTALL_PREFIX ${DF_ROOT}) -install(TARGETS ${PROJECT_NAME} DESTINATION bin) diff --git a/applications/solvers/dfLowMachFoam_new/EEqn.H b/applications/solvers/dfLowMachFoam_new/EEqn.H deleted file mode 100644 index 896baaa06..000000000 --- a/applications/solvers/dfLowMachFoam_new/EEqn.H +++ /dev/null @@ -1,141 +0,0 @@ -{ - volScalarField& he = thermo.he(); -#ifdef GPUSolver_ - start1 = std::clock(); - UEqn_GPU.updatePsi(&U[0][0]); - UEqn_GPU.correctBoundaryConditions(); - U.correctBoundaryConditions(); - K = 0.5*magSqr(U); - end1 = std::clock(); - time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); - - // prepare data on CPU - start1 = std::clock(); - start2 = std::clock(); - // const tmp alphaEff_tmp(thermo.alpha()); - // const volScalarField& alphaEff = alphaEff_tmp(); - double *alphaEff = nullptr; // tmp - end2 = std::clock(); - int eeqn_offset = 0; - int patchNum = 0; - - forAll(he.boundaryField(), patchi) - { - patchNum++; - const fvsPatchScalarField& pw = mesh.surfaceInterpolation::weights().boundaryField()[patchi]; - int patchSize = pw.size(); - - // construct gradient manually - const fvPatchScalarField& hew = he.boundaryField()[patchi]; - const basicThermo& bThermo = basicThermo::lookupThermo(hew); - const scalarField& ppw = bThermo.p().boundaryField()[patchi]; - fvPatchScalarField& Tw = - 
const_cast(bThermo.T().boundaryField()[patchi]); - scalarField& Tw_v = Tw; - - Tw.evaluate(); - const scalarField& patchDeltaCoeff = mesh.boundary()[patchi].deltaCoeffs(); - const scalarField heInternal = bThermo.he(ppw, Tw, patchi)(); - const scalarField heBoundary = bThermo.he(ppw, Tw, mesh.boundary()[patchi].faceCells())(); - const scalarField patchGradMau = patchDeltaCoeff * (heInternal - heBoundary); - - const scalarField& patchK = K.boundaryField()[patchi]; - // const scalarField& patchAlphaEff = alphaEff.boundaryField()[patchi]; // not H2Dcopy when use UnityLewis - // const scalarField& patchGrad = he.boundaryField()[patchi].gradientBoundaryCoeffs(); // gradient_ - - // const DimensionedField& patchHa_ = he.boundaryField()[patchi]; - // const gradientEnergyFvPatchScalarField patchHa(mesh.boundary()[patchi], patchHa_); - // const scalarField& patchGrad = patchHa.gradient(); // gradient_ - memcpy(boundary_K + eeqn_offset, &patchK[0], patchSize*sizeof(double)); - // memcpy(boundary_alphaEff + eeqn_offset, &patchAlphaEff[0], patchSize*sizeof(double)); - memcpy(boundary_gradient + eeqn_offset, &patchGradMau[0], patchSize*sizeof(double)); - - eeqn_offset += patchSize; - } - end1 = std::clock(); - time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_EEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); - fprintf(stderr, "time_monitor_EEqn_mtxAssembly_CPU_prepare: %lf, build alphaEff time: %lf, patchNum: %d\n", - time_monitor_EEqn_mtxAssembly_CPU_prepare, - double(end2 - start2) / double(CLOCKS_PER_SEC), patchNum); - - // prepare data on GPU - start1 = std::clock(); - he.oldTime(); - K.oldTime(); - EEqn_GPU.prepare_data(&he.oldTime()[0], &K[0], &K.oldTime()[0], alphaEff, - &dpdt[0], boundary_K, boundary_alphaEff, boundary_gradient); - EEqn_GPU.sync(); - end1 = std::clock(); - time_monitor_EEqn += double(end1 - start1) / 
double(CLOCKS_PER_SEC); - time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_EEqn_mtxAssembly_GPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); - - start1 = std::clock(); - EEqn_GPU.initializeTimeStep(); - EEqn_GPU.fvm_ddt(); - EEqn_GPU.fvm_div(); - EEqn_GPU.fvm_laplacian(); - EEqn_GPU.fvc_ddt(); - EEqn_GPU.fvc_div_phi_scalar(); - EEqn_GPU.fvc_div_vector(); - EEqn_GPU.add_to_source(); - EEqn_GPU.sync(); - end1 = std::clock(); - time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_EEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC); - - // check value of mtxAssembly, no time monitor - // EEqn_GPU.checkValue(true); - - start1 = std::clock(); - EEqn_GPU.solve(); - end1 = std::clock(); - time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_EEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); - - start1 = std::clock(); - EEqn_GPU.updatePsi(&he[0]); - he.correctBoundaryConditions(); - he.write(); - end1 = std::clock(); - time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_EEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); -#else - start1 = std::clock(); - fvScalarMatrix EEqn - ( - - fvm::ddt(rho, he) + mvConvection->fvmDiv(phi, he) - + fvc::ddt(rho, K) + fvc::div(phi, K) - - dpdt - == - ( - turbName == "laminar" - ? 
- ( - fvm::laplacian(turbulence->alpha(), he) - - diffAlphaD - + fvc::div(hDiffCorrFlux) - ) - : - ( - fvm::laplacian(turbulence->alphaEff(), he) - ) - ) - ); - end1 = std::clock(); - time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - - EEqn.relax(); - start1 = std::clock(); - EEqn.solve("ha"); - end1 = std::clock(); - time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_EEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); -#endif -} diff --git a/applications/solvers/dfLowMachFoam_new/Make/files b/applications/solvers/dfLowMachFoam_new/Make/files deleted file mode 100644 index 92df9b4e3..000000000 --- a/applications/solvers/dfLowMachFoam_new/Make/files +++ /dev/null @@ -1,3 +0,0 @@ -dfLowMachFoam.C - -EXE = $(DF_APPBIN)/dfLowMachFoam_new diff --git a/applications/solvers/dfLowMachFoam_new/Make/options b/applications/solvers/dfLowMachFoam_new/Make/options deleted file mode 100644 index bda93210e..000000000 --- a/applications/solvers/dfLowMachFoam_new/Make/options +++ /dev/null @@ -1,58 +0,0 @@ --include $(GENERAL_RULES)/mplibType - -EXE_INC = -std=c++14 \ - -g \ - -fopenmp \ - -Wno-unused-variable \ - -Wno-unused-but-set-variable \ - -Wno-old-style-cast \ - $(PFLAGS) $(PINC) \ - $(if $(LIBTORCH_ROOT),-DUSE_LIBTORCH,) \ - $(if $(PYTHON_INC_DIR),-DUSE_PYTORCH,) \ - -I$(LIB_SRC)/transportModels/compressible/lnInclude \ - -I$(LIB_SRC)/thermophysicalModels/basic/lnInclude \ - -I$(LIB_SRC)/TurbulenceModels/turbulenceModels/lnInclude \ - -I$(LIB_SRC)/TurbulenceModels/compressible/lnInclude \ - -I$(LIB_SRC)/finiteVolume/cfdTools \ - -I$(LIB_SRC)/finiteVolume/lnInclude \ - -I$(LIB_SRC)/meshTools/lnInclude \ - -I$(LIB_SRC)/sampling/lnInclude \ - -I$(LIB_SRC)/dynamicFvMesh/lnInclude \ - -I$(LIB_SRC)/Pstream/mpi \ - -I$(DF_SRC)/dfCanteraMixture/lnInclude \ - -I$(DF_SRC)/dfChemistryModel/lnInclude \ - 
-I$(DF_SRC)/dfCombustionModels/lnInclude \ - -I$(CANTERA_ROOT)/include \ - $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include,) \ - $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include/torch/csrc/api/include,) \ - $(PYTHON_INC_DIR) \ - $(if $(AMGX_DIR), -I$(DF_ROOT)/src_gpu,) \ - $(if $(AMGX_DIR), -I/usr/local/cuda-11.6/include,) \ - $(if $(AMGX_DIR), -I$(AMGX_DIR)/include,) \ - -I$(DF_ROOT)/GPUTestRef/lnInclude \ - -EXE_LIBS = \ - -lcompressibleTransportModels \ - -lturbulenceModels \ - -lfiniteVolume \ - -lmeshTools \ - -lsampling \ - -L$(DF_LIBBIN) \ - -ldfFluidThermophysicalModels \ - -ldfCompressibleTurbulenceModels \ - -ldfCanteraMixture \ - -ldfChemistryModel \ - -ldfCombustionModels \ - -ldfGenMatrix \ - $(CANTERA_ROOT)/lib/libcantera.so \ - $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libtorch.so,) \ - $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libc10.so,) \ - $(if $(LIBTORCH_ROOT),-rdynamic,) \ - $(if $(LIBTORCH_ROOT),-lpthread,) \ - $(if $(LIBTORCH_ROOT),$(DF_SRC)/dfChemistryModel/DNNInferencer/build/libDNNInferencer.so,) \ - $(if $(PYTHON_LIB_DIR),-L$(PYTHON_LIB_DIR),) \ - $(if $(PYTHON_LIB_DIR),-lpython3.8,) \ - $(if $(AMGX_DIR), /usr/local/cuda-11.6/lib64/libcudart.so,) \ - $(if $(AMGX_DIR), $(DF_ROOT)/src_gpu/build/libdfMatrix.so,) \ - $(if $(AMGX_DIR), $(AMGX_DIR)/build/libamgxsh.so,) - diff --git a/applications/solvers/dfLowMachFoam_new/UEqn.H b/applications/solvers/dfLowMachFoam_new/UEqn.H deleted file mode 100644 index 38934abdb..000000000 --- a/applications/solvers/dfLowMachFoam_new/UEqn.H +++ /dev/null @@ -1,247 +0,0 @@ -// Solve the Momentum equation -#ifdef GPUSolver_ - start1 = std::clock(); - int offset = 0; - const tmp nuEff_tmp(turbulence->nuEff()); - const volScalarField& nuEff = nuEff_tmp(); - forAll(U.boundaryField(), patchi) - { - const scalarField& patchP = p.boundaryField()[patchi]; - const vectorField& patchU = U.boundaryField()[patchi]; - const scalarField& patchRho = rho.boundaryField()[patchi]; - const scalarField& patchNuEff = 
nuEff.boundaryField()[patchi]; - - int patchSize = patchP.size(); - - // boundary pressure - memcpy(boundary_pressure_init+offset, &patchP[0], patchSize*sizeof(double)); - // boundary velocity - memcpy(boundary_velocity_init+3*offset, &patchU[0][0], 3*patchSize*sizeof(double)); - // boundary nuEff - memcpy(boundary_nuEff_init+offset, &patchNuEff[0], patchSize*sizeof(double)); - // boundary rho - memcpy(boundary_rho_init+offset, &patchRho[0], patchSize*sizeof(double)); - offset += patchSize; - } - end1 = std::clock(); - time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); - - start1 = std::clock(); - UEqn_GPU.initializeTimeStep(); - U.oldTime(); - UEqn_GPU.fvm_ddt(&U.oldTime()[0][0]); - UEqn_GPU.fvm_div(boundary_pressure_init, boundary_velocity_init, boundary_nuEff_init, boundary_rho_init); - UEqn_GPU.fvc_grad(&p[0]); - UEqn_GPU.fvc_grad_vector(); - UEqn_GPU.dev2T(); - UEqn_GPU.fvc_div_tensor(&nuEff[0]); - UEqn_GPU.fvm_laplacian(); - UEqn_GPU.sync(); - end1 = std::clock(); - time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC); - - // start2 = std::clock(); - // fvVectorMatrix turb_source - // ( - // turbulence->divDevRhoReff(U) - // ); - // end2 = std::clock(); - // time_monitor_CPU += double(end2 - start2) / double(CLOCKS_PER_SEC); - - // UEqn_GPU.add_fvMatrix(&turb_source.lower()[0], &turb_source.diag()[0], &turb_source.upper()[0], &turb_source.source()[0][0]); - // end1 = std::clock(); - // time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - // time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - - // check value - // U.oldTime(); - // 
tmp tUEqn - // ( - // fvm::ddt(rho, U) - // + - // fvm::div(phi, U) - // + - // turbulence->divDevRhoReff(U) - // == -fvc::grad(p) - // ); - // fvVectorMatrix& UEqn = tUEqn.ref(); - // printf("b_cpu = %e\n", UEqn.source()[1][1]); - // forAll(U.boundaryField(), patchi){ - // labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); - // forAll(sub_boundary, i){ - // if (sub_boundary[i] == 1){ - // printf("b_cpu_bou = %e\n", UEqn.boundaryCoeffs()[patchi][i][1]); - // printf("patchi = %d, i = %d\n", patchi, i); - // } - // } - // } - // if (pimple.momentumPredictor()) - // { - // solve(UEqn); - // Info << "U_CPU\n" << U << endl; - // K = 0.5*magSqr(U); - // } - // UEqn_GPU.checkValue(true); -#elif defined GPUSolverNew_ - const tmp nuEff_tmp(turbulence->nuEff()); - const volScalarField& nuEff = nuEff_tmp(); - - // run CPU, for temp - tmp tUEqn - ( - fvm::ddt(rho, U) - + - fvm::div(phi, U) - + - turbulence->divDevRhoReff(U) - == -fvc::grad(p) - ); - fvVectorMatrix& UEqn = tUEqn.ref(); - - // run GPU - // preProcess - // TODO: preProcessForRhoEqn for temp, now we only transfer phi(instead of rhoEqn) used by fvm::div(phi, U) - UEqn_GPU.sync(); - double *h_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::internal); - double *h_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::internal); - double *h_boundary_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::boundary); - memcpy(h_rho, &rho[0], dfDataBase.cell_value_bytes); - memcpy(h_phi, &phi[0], dfDataBase.surface_value_bytes); - int offset = 0; - forAll(phi.boundaryField(), patchi) - { - const fvsPatchScalarField& patchPhi = phi.boundaryField()[patchi]; - int patchsize = patchPhi.size(); - memcpy(h_boundary_phi + offset, &patchPhi[0], patchsize * sizeof(double)); - offset += patchsize; - } - UEqn_GPU.preProcessForRhoEqn(h_rho, h_phi, h_boundary_phi); - DEBUG_TRACE; - - TICK_START; - // preparing u, p, nu_eff, and rho.boundary used by UEqn_GPU.preProcess() - double *h_u = 
dfDataBase.getFieldPointer("u", location::cpu, position::internal); - double *h_boundary_u = dfDataBase.getFieldPointer("u", location::cpu, position::boundary); - double *h_p = dfDataBase.getFieldPointer("p", location::cpu, position::internal); - double *h_boundary_p = dfDataBase.getFieldPointer("p", location::cpu, position::boundary); - double *h_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::internal); - double *h_boundary_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::boundary); - double *h_boundary_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::boundary); - TICK_STOP(get pointer); - - TICK_START; - U.oldTime(); - memcpy(h_u, &U.oldTime()[0][0], dfDataBase.cell_value_vec_bytes); - memcpy(h_p, &p[0], dfDataBase.cell_value_bytes); - memcpy(h_nu_eff, &nuEff[0], dfDataBase.cell_value_bytes); - TICK_STOP(copy to pinned memory); - - TICK_START; - offset = 0; - forAll(U.boundaryField(), patchi) - { - const fvPatchVectorField& patchU = U.boundaryField()[patchi]; - const fvPatchScalarField& patchP = p.boundaryField()[patchi]; - const fvPatchScalarField& patchNuEff = nuEff.boundaryField()[patchi]; - const fvPatchScalarField& patchRho = rho.boundaryField()[patchi]; - int patchsize = patchU.size(); - memcpy(h_boundary_u + offset * 3, &patchU[0][0], patchsize * 3 * sizeof(double)); - memcpy(h_boundary_p + offset, &patchP[0], patchsize * sizeof(double)); - memcpy(h_boundary_nu_eff + offset, &patchNuEff[0], patchsize * sizeof(double)); - memcpy(h_boundary_rho + offset, &patchRho[0], patchsize * sizeof(double)); - offset += patchsize; - } - TICK_STOP(CPU prepare boundary time); - - TICK_START; - UEqn_GPU.preProcess(h_u, h_boundary_u, h_p, h_boundary_p, h_nu_eff, h_boundary_nu_eff, h_boundary_rho); - DEBUG_TRACE; - UEqn_GPU.sync(); - TICK_STOP(GPU preProcess time); - - // process - TICK_START; - UEqn_GPU.process(); - DEBUG_TRACE; - UEqn_GPU.sync(); - TICK_STOP(GPU process time); - - TICK_START; - UEqn_GPU.solve(); - 
TICK_STOP(GPU solve time); - - // postProcess - TICK_START; - UEqn_GPU.postProcess(h_u); - U.correctBoundaryConditions(); - DEBUG_TRACE; - TICK_STOP(post process time); - - // checkResult - // TODO: for temp, now we compare ldu, finally we compare csr - std::vector h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); - std::vector h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); - offset = 0; - for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) - { - int patchsize = dfDataBase.patch_size[patchi]; - const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0]; - const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0]; - memcpy(h_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double)); - memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double)); - offset += patchsize; - } - bool printFlag = false; - UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0], - h_internal_coeffs.data(), h_boundary_coeffs.data(), - // &DivTensor[0][0], - printFlag); - DEBUG_TRACE; -#else - start1 = std::clock(); - tmp tUEqn - ( - fvm::ddt(rho, U) + fvm::div(phi, U) - + turbulence->divDevRhoReff(U) - == -fvc::grad(p) - ); - fvVectorMatrix& UEqn = tUEqn.ref(); - - end1 = std::clock(); - time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - - UEqn.relax(); - start1 = std::clock(); - if (pimple.momentumPredictor()) - { - solve(UEqn); - - K = 0.5*magSqr(U); - } - end1 = std::clock(); - time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); -#endif - -// start1 = std::clock(); -// // // std::thread t(&dfMatrix::solve, &UEqn_GPU); -// UEqn_GPU.solve(); -// end1 = std::clock(); -// time_monitor_UEqn += double(end1 - start1) / 
double(CLOCKS_PER_SEC); -// time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); - -// start1 = std::clock(); -// // // t.join(); -// // UEqn_GPU.updatePsi(&U[0][0]); -// K = 0.5*magSqr(U); -// end1 = std::clock(); -// time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); -// time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); -// time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC); -// // Info << "U_amgx = " << U << endl; - diff --git a/applications/solvers/dfLowMachFoam_new/YEqn.H b/applications/solvers/dfLowMachFoam_new/YEqn.H deleted file mode 100644 index 76570b24d..000000000 --- a/applications/solvers/dfLowMachFoam_new/YEqn.H +++ /dev/null @@ -1,207 +0,0 @@ -hDiffCorrFlux = Zero; -diffAlphaD = Zero; -sumYDiffError = Zero; - -tmp> mvConvection -( - fv::convectionScheme::New - ( - mesh, - fields, - phi, - mesh.divScheme("div(phi,Yi_h)") - ) -); -#ifdef GPUSolver_ - start1 = std::clock(); - UEqn_GPU.solve(); - end1 = std::clock(); - time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); - - start1 = std::clock(); - std::vector Y_old(Y.size()), boundary_Y(Y.size()), boundary_hai(Y.size()), boundary_rhoD(Y.size()); - std::vector hai(Y.size()), rhoD(Y.size()); - for (size_t i = 0; i < Y.size(); ++i) - { - volScalarField& Yi = Y[i]; - Yi.oldTime(); - Y_old[i] = &Yi.oldTime()[0]; - if (updateBoundaryFields) - { - cudaMallocHost(&boundary_Y[i], num_boundary_faces*sizeof(double)); - } - const volScalarField& haii = chemistry->hai(i); - const volScalarField& rhoDi = chemistry->rhoD(i); - // hai[i] = &haii[0]; - rhoD[i] = &rhoDi[0]; - // cudaMallocHost(&boundary_hai[i], num_boundary_faces*sizeof(double)); - cudaMallocHost(&boundary_rhoD[i], num_boundary_faces*sizeof(double)); - int offset = 0; - forAll(Yi.boundaryField(), patchi) - { - const scalarField& patchYi = Yi.boundaryField()[patchi]; - // const 
scalarField& patchHaii = haii.boundaryField()[patchi]; - const scalarField& patchRhoDi = rhoDi.boundaryField()[patchi]; - int patchSize = patchYi.size(); - - if (updateBoundaryFields) - { - memcpy(boundary_Y[i] + offset, &patchYi[0], patchSize*sizeof(double)); - } - // memcpy(boundary_hai[i] + offset, &patchHaii[0], patchSize*sizeof(double)); - memcpy(boundary_rhoD[i] + offset, &patchRhoDi[0], patchSize*sizeof(double)); - offset += patchSize; - } - // if (i == 5) - // { - // Info << "rhoD_CPU" << rhoDi << endl; - // } - - } - // Info << "rhoD from nuEff\n" << nuEff * rho / 0.7 << endl; - updateBoundaryFields = false; - volScalarField mut_sct = turbulence->mut().ref()/Sct; - double *boundary_mutsct = nullptr; - cudaMallocHost(&boundary_mutsct, num_boundary_faces*sizeof(double)); - int offset = 0; - forAll(p.boundaryField(), patchi) - { - const scalarField& patchMut_sct = mut_sct.boundaryField()[patchi]; - int patchSize = patchMut_sct.size(); - memcpy(boundary_mutsct + offset, &patchMut_sct[0], patchSize*sizeof(double)); - offset += patchSize; - - // debug - // const fvsPatchScalarField& pw = mesh.surfaceInterpolation::weights().boundaryField()[patchi]; - // Field valueInternalCoeffs = Y[5].boundaryField()[patchi].valueInternalCoeffs(pw); - // Field valueBoundaryCoeffs = Y[5].boundaryField()[patchi].valueBoundaryCoeffs(pw); - // Field gradientInternalCoeffs = Y[5].boundaryField()[patchi].gradientInternalCoeffs(); - // Field gradientBoundaryCoeffs = Y[5].boundaryField()[patchi].gradientBoundaryCoeffs(); - // Info << "valueInternalCoeffs\n" << valueInternalCoeffs << endl; - // Info << "valueBoundaryCoeffs\n" << valueBoundaryCoeffs << endl; - // Info << "gradientInternalCoeffs\n" << gradientInternalCoeffs << endl; - // Info << "gradientBoundaryCoeffs\n" << gradientBoundaryCoeffs << endl; - } - end1 = std::clock(); - time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - 
time_monitor_YEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); - //fprintf(stderr, "time_monitor_YEqn_mtxAssembly_CPU_prepare: %lf\n", time_monitor_YEqn_mtxAssembly_CPU_prepare); - - start1 = std::clock(); - YEqn_GPU.initializeTimeStep(); - YEqn_GPU.upwindWeight(); - YEqn_GPU.fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(Y_old, boundary_Y, - hai, boundary_hai, rhoD, boundary_rhoD, &mut_sct[0], boundary_mutsct, &thermo.alpha()[0]); - YEqn_GPU.fvm_ddt(); - YEqn_GPU.fvm_div_phi(); - YEqn_GPU.fvm_div_phiUc(); - YEqn_GPU.sync(); - // YEqn_GPU.checkValue(true, "of_output_H2.txt"); - end1 = std::clock(); - time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_YEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC); - //fprintf(stderr, "time_monitor_YEqn_mtxAssembly_GPU_run: %lf\n", time_monitor_YEqn_mtxAssembly_GPU_run); - - start1 = std::clock(); - YEqn_GPU.solve(); - end1 = std::clock(); - time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); -#else - start1 = std::clock(); - forAll(Y, i) - { - sumYDiffError += chemistry->rhoD(i)*fvc::grad(Y[i]); - } - // Info << "sumYDiffError\n" << sumYDiffError << endl; - const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf(); - start1 = std::clock(); - time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); -#endif - -//MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); -label flag_mpi_init; -MPI_Initialized(&flag_mpi_init); -if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); - -{ - if (!splitting) - { - std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); - combustion->correct(); - //label flag_mpi_init; - 
//MPI_Initialized(&flag_mpi_init); - if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); - std::chrono::steady_clock::time_point stop = std::chrono::steady_clock::now(); - std::chrono::duration processingTime = std::chrono::duration_cast>(stop - start); - time_monitor_chem += processingTime.count(); - } - -#ifdef GPUSolver_ - start1 = std::clock(); - forAll(Y, i) - { - volScalarField& Yi = Y[i]; - YEqn_GPU.updatePsi(&Yi[0], i); - Yi.correctBoundaryConditions(); - } - YEqn_GPU.correctBoundaryConditions(); - end1 = std::clock(); - time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_YEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); -#else - start2 = std::clock(); - volScalarField Yt(0.0*Y[0]); - int speciesIndex = 0; - forAll(Y, i) - { - volScalarField& Yi = Y[i]; - hDiffCorrFlux += chemistry->hai(i)*(chemistry->rhoD(i)*fvc::grad(Yi) - Yi*sumYDiffError); - diffAlphaD += fvc::laplacian(thermo.alpha()*chemistry->hai(i), Yi); - if (i != inertIndex) - { - start1 = std::clock(); - tmp DEff = chemistry->rhoD(i) + turbulence->mut()/Sct; - - fvScalarMatrix YiEqn - ( - fvm::ddt(rho, Yi) - + - ( - turbName == "laminar" - ? (mvConvection->fvmDiv(phi, Yi) + mvConvection->fvmDiv(phiUc, Yi)) - : mvConvection->fvmDiv(phi, Yi) - ) - == - ( - splitting - ? 
fvm::laplacian(DEff(), Yi) - : (fvm::laplacian(DEff(), Yi) + combustion->R(Yi)) - ) - ); - - end1 = std::clock(); - time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - YiEqn.relax(); - - start1 = std::clock(); - YiEqn.solve("Yi"); - end1 = std::clock(); - time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); - - Yi.max(0.0); - Yt += Yi; - ++speciesIndex; - } - } - - Y[inertIndex] = scalar(1) - Yt; - Y[inertIndex].max(0.0); - end2 = std::clock(); - time_monitor_YEqn += double(end2 - start2) / double(CLOCKS_PER_SEC); -#endif -} diff --git a/applications/solvers/dfLowMachFoam_new/YEqn_RR.H b/applications/solvers/dfLowMachFoam_new/YEqn_RR.H deleted file mode 100644 index f5752e95e..000000000 --- a/applications/solvers/dfLowMachFoam_new/YEqn_RR.H +++ /dev/null @@ -1,61 +0,0 @@ -if (!(timeIndex % 2)) -{ - volScalarField Yt(0.0*Y[0]); - - scalar dtSave = runTime.deltaT().value(); - runTime.setDeltaT(dtSave * 2); - - start = std::clock(); - combustion->correct(); - - label flag_mpi_init; - MPI_Initialized(&flag_mpi_init); - if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); - end = std::clock(); - time_monitor_chem += double(end - start) / double(CLOCKS_PER_SEC); - - forAll(Y, i) - { - volScalarField& Yi = Y[i]; - - if (i != inertIndex) - { - volScalarField& Yi = Y[i]; - fvScalarMatrix YiEqn - ( - fvm::ddt(rho, Yi) - == - combustion->R(Yi) - ); - - YiEqn.relax(); - - YiEqn.solve("Yi"); - - Yi.max(0.0); - Yt += Yi; - } - } - Y[inertIndex] = scalar(1) - Yt; - Y[inertIndex].max(0.0); - - forAll (Y, i) - { - volScalarField& tYi = Y[i].oldTime(); - - forAll(tYi, celli) - { - tYi[celli] = Y[i][celli]; - } - volScalarField::Boundary& Bf = tYi.boundaryFieldRef(); - forAll(Bf, patchi) - { - forAll(Bf[patchi], facei) - { - Bf[patchi][facei] = Y[i].boundaryField()[patchi][facei]; - } - } - } - - runTime.setDeltaT(dtSave); -} \ No newline at end of file diff --git a/applications/solvers/dfLowMachFoam_new/correctPhi.H 
b/applications/solvers/dfLowMachFoam_new/correctPhi.H deleted file mode 100644 index 3cd82d29e..000000000 --- a/applications/solvers/dfLowMachFoam_new/correctPhi.H +++ /dev/null @@ -1,12 +0,0 @@ -CorrectPhi -( - U, - phi, - p, - rho, - psi, - dimensionedScalar("rAUf", dimTime, 1), - divrhoU(), - pimple, - true -); diff --git a/applications/solvers/dfLowMachFoam_new/createFields.H b/applications/solvers/dfLowMachFoam_new/createFields.H deleted file mode 100644 index 9e750c334..000000000 --- a/applications/solvers/dfLowMachFoam_new/createFields.H +++ /dev/null @@ -1,176 +0,0 @@ -#include "createRDeltaT.H" - -Info<< "Reading thermophysical properties\n" << endl; - -// fluidThermo* pThermo = new hePsiThermo(mesh, word::null); -fluidThermo* pThermo = new heRhoThermo(mesh, word::null); -fluidThermo& thermo = *pThermo; -// thermo.validate(args.executable(), "ha"); - -const volScalarField& psi = thermo.psi(); -volScalarField& p = thermo.p(); -volScalarField& T = thermo.T(); -volScalarField rho -( - IOobject - ( - "rho", - runTime.timeName(), - mesh, - IOobject::READ_IF_PRESENT, - IOobject::AUTO_WRITE - ), - thermo.rho() -); - - -Info<< "Reading field U\n" << endl; -volVectorField U -( - IOobject - ( - "U", - runTime.timeName(), - mesh, - IOobject::MUST_READ, - IOobject::AUTO_WRITE - ), - mesh -); - -#include "compressibleCreatePhi.H" - -pressureControl pressureControl(p, rho, pimple.dict(), false); - -mesh.setFluxRequired(p.name()); - -Info<< "Creating turbulence model\n" << endl; -autoPtr turbulence -( - compressible::turbulenceModel::New - ( - rho, - U, - phi, - thermo - ) -); - -Info<< "Creating field dpdt\n" << endl; -volScalarField dpdt -( - IOobject - ( - "dpdt", - runTime.timeName(), - mesh, - IOobject::NO_READ, - IOobject::NO_WRITE - ), - mesh, - dimensionedScalar("dpdt",p.dimensions()/dimTime, 0) -); - - -Info<< "Creating reaction model\n" << endl; -autoPtr> combustion -( - CombustionModel::New(thermo, turbulence()) -); -Info<< "end Creating reaction model\n" << 
endl; - - -const word combModelName(mesh.objectRegistry::lookupObject("combustionProperties").lookup("combustionModel")); -Info << "Combustion Model Name is confirmed as "<< combModelName << endl; - -const word turbName(mesh.objectRegistry::lookupObject("turbulenceProperties").lookup("simulationType")); - -dfChemistryModel* chemistry = combustion->chemistry(); -PtrList& Y = chemistry->Y(); -const word inertSpecie(chemistry->lookup("inertSpecie")); -const label inertIndex(chemistry->species()[inertSpecie]); -chemistry->setEnergyName("ha"); -chemistry->updateEnergy(); - - -chemistry->correctThermo(); -Info<< "At initial time, min/max(T) = " << min(T).value() << ", " << max(T).value() << endl; - -//for dpdt - -Info<< "Creating field kinetic energy K\n" << endl; -volScalarField K("K", 0.5*magSqr(U)); - -multivariateSurfaceInterpolationScheme::fieldTable fields; - -if(combModelName!="flareFGM") -{ -forAll(Y, i) -{ - fields.add(Y[i]); -} -fields.add(thermo.he()); -} - - -const scalar Sct = chemistry->lookupOrDefault("Sct", 1.); -volScalarField diffAlphaD -( - IOobject - ( - "diffAlphaD", - runTime.timeName(), - mesh, - IOobject::NO_READ, - IOobject::NO_WRITE - ), - mesh, - dimensionedScalar(dimEnergy/dimTime/dimVolume, 0) -); -volVectorField hDiffCorrFlux -( - IOobject - ( - "hDiffCorrFlux", - runTime.timeName(), - mesh, - IOobject::NO_READ, - IOobject::NO_WRITE - ), - mesh, - dimensionedVector(dimensionSet(1,0,-3,0,0,0,0), Zero) -); -volVectorField sumYDiffError -( - IOobject - ( - "sumYDiffError", - runTime.timeName(), - mesh, - IOobject::NO_READ, - IOobject::NO_WRITE - ), - mesh, - dimensionedVector("sumYDiffError", dimDynamicViscosity/dimLength, Zero) -); - -IOdictionary CanteraTorchProperties -( - IOobject - ( - "CanteraTorchProperties", - runTime.constant(), - mesh, - IOobject::MUST_READ, - IOobject::NO_WRITE - ) -); -const Switch splitting = CanteraTorchProperties.lookupOrDefault("splittingStrategy", false); -#ifdef USE_PYTORCH - const Switch log_ = 
CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("log", false); - const Switch torch_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("torch", false); -#endif -#ifdef USE_LIBTORCH - const Switch log_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("log", false); - const Switch torch_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("torch", false); -#endif diff --git a/applications/solvers/dfLowMachFoam_new/createGPUSolver.H b/applications/solvers/dfLowMachFoam_new/createGPUSolver.H deleted file mode 100644 index 94fff1125..000000000 --- a/applications/solvers/dfLowMachFoam_new/createGPUSolver.H +++ /dev/null @@ -1,97 +0,0 @@ -dfMatrixDataBase dfDataBase; -//dfRhoEqn rhoEqn_GPU; -dfUEqn UEqn_GPU(dfDataBase); -//dfYEqn YEqn_GPU; -//dfEEqn EEqn_GPU; - -void createGPUBase(fvMesh& mesh, PtrList& Y) { - // prepare constant values: num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, num_species, rdelta_t - const labelUList& owner = mesh.owner(); - const labelUList& neighbour = mesh.neighbour(); - int num_cells = mesh.nCells(); - int num_surfaces = neighbour.size(); - int num_boundary_surfaces = 0; - int num_patches = 0; - std::vector patch_size; - forAll(mesh.boundary(), patchi) { - labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); - int patchsize = sub_boundary.size(); - patch_size.push_back(patchsize); - num_boundary_surfaces += patchsize; - num_patches++; - } - // TODO: get deltaT fomr time API - double rDeltaT = 1 / 1e-6; - dfDataBase.setConstantValues(num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, Y.size(), rDeltaT); - - // prepare constant indexes: owner, neighbor - dfDataBase.setConstantIndexes(&owner[0], &neighbour[0]); - - // prepare internal and boundary of sf, mag_sf, weights, delta_coeffs, volume - double *boundary_sf = new double[3 * num_boundary_surfaces]; - double *boundary_mag_sf = new double[num_boundary_surfaces]; - double 
*boundary_delta_coeffs = new double[num_boundary_surfaces]; - int *boundary_face_cell = new int[num_boundary_surfaces]; - int offset = 0; - forAll(mesh.boundary(), patchi) { - const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; - const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; - const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; - const labelUList& pFaceCells = mesh.boundary()[patchi].faceCells(); - - int patchsize = pMagSf.size(); - - memcpy(boundary_sf + 3*offset, &pSf[0][0], 3*patchsize*sizeof(double)); - memcpy(boundary_mag_sf + offset, &pMagSf[0], patchsize*sizeof(double)); - memcpy(boundary_delta_coeffs + offset, &pDeltaCoeffs[0], patchsize*sizeof(double)); - memcpy(boundary_face_cell + offset, &pFaceCells[0], patchsize * sizeof(int)); - offset += patchsize; - } - - dfDataBase.createConstantFieldsInternal(); - dfDataBase.createConstantFieldsBoundary(); - dfDataBase.initConstantFieldsInternal(&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.surfaceInterpolation::weights()[0], &mesh.nonOrthDeltaCoeffs()[0], &mesh.V()[0]); - dfDataBase.initConstantFieldsBoundary(boundary_sf, boundary_mag_sf, boundary_delta_coeffs, boundary_face_cell); - - // prepare internal and boundary of Y - dfDataBase.createNonConstantFieldsInternal(); - dfDataBase.createNonConstantFieldsBoundary(); - forAll(Y, speciesI) { - volScalarField& Yi = Y[speciesI]; - memcpy(dfDataBase.h_y + speciesI * num_cells, &Yi[0], num_cells * sizeof(double)); - offset = 0; - forAll(Yi.boundaryField(), patchi) { - const scalarField& patchYi = Yi.boundaryField()[patchi]; - int patchsize = patchYi.size(); - memcpy(dfDataBase.h_boundary_y + speciesI * num_boundary_surfaces + offset, &patchYi[0], patchsize*sizeof(double)); - offset += patchsize; - } - } - dfDataBase.initNonConstantFieldsInternal(dfDataBase.h_y); - dfDataBase.initNonConstantFieldsBoundary(dfDataBase.h_boundary_y); -} - -void createGPUUEqn(const IOdictionary& CanteraTorchProperties, const 
volVectorField& U) { - // prepare mode_string and setting_path - string mode_string = "dDDI"; - string settingPath; - settingPath = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string("")); - UEqn_GPU.setConstantValues(mode_string, settingPath); - - // prepare patch_type - std::vector patch_type; - patch_type.resize(dfDataBase.num_patches); - forAll(U.boundaryField(), patchi) - { - constructBoundarySelectorPerPatch(&(patch_type[patchi]), U.boundaryField()[patchi].type()); - } - UEqn_GPU.setConstantFields(patch_type); - - // prepare internal and boundary of xxx - UEqn_GPU.createNonConstantFieldsInternal(); - UEqn_GPU.createNonConstantFieldsBoundary(); - UEqn_GPU.createNonConstantLduAndCsrFields(); - // UEqn_GPU has no internal non-constant fields to be init - // UEqn_GPU.initNonConstantFieldsInternal(); - UEqn_GPU.initNonConstantFieldsBoundary(); -} diff --git a/applications/solvers/dfLowMachFoam_new/createdfSolver.H b/applications/solvers/dfLowMachFoam_new/createdfSolver.H deleted file mode 100644 index 3c5593833..000000000 --- a/applications/solvers/dfLowMachFoam_new/createdfSolver.H +++ /dev/null @@ -1,65 +0,0 @@ -const labelUList& owner = mesh.owner(); -const labelUList& neighbour = mesh.neighbour(); -int num_cells = mesh.nCells(); -int num_surfaces = neighbour.size(); - -std::vector boundaryCellIndex; -std::vector boundary_face_vector_init; -std::vector boundary_face_init; -std::vector boundary_deltaCoeffs_init; -std::vector> patchTypes; -std::vector patchTypeU, patchTypeY; -int num_boundary_faces = 0; -int patchSize; -forAll(mesh.boundary(), patchi) -{ - labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); - patchSize = sub_boundary.size(); - const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; - const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; - const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; - - boundaryCellIndex.insert(boundaryCellIndex.end(), 
&sub_boundary[0], &sub_boundary[0]+patchSize); - boundary_face_vector_init.insert(boundary_face_vector_init.end(), &pSf[0][0], &pSf[0][0]+3*patchSize); - boundary_face_init.insert(boundary_face_init.end(), &pMagSf[0], &pMagSf[0]+patchSize); - boundary_deltaCoeffs_init.insert(boundary_deltaCoeffs_init.end(), &pDeltaCoeffs[0], &pDeltaCoeffs[0]+patchSize); - num_boundary_faces += patchSize; - - constructBoundarySelector(patchTypeU, U.boundaryField()[patchi].type(), patchSize); - constructBoundarySelector(patchTypeY, Y[0].boundaryField()[patchi].type(), patchSize); -} -patchTypes.emplace_back(patchTypeU); -patchTypes.emplace_back(patchTypeY); - -int num_boundary_cells; - -string settingPath; -settingPath = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string("")); - -#ifdef GPUSolver_ - dfMatrixDataBase dfDataBase(num_surfaces, num_cells, num_boundary_faces, Y.size(), num_boundary_cells, &neighbour[0], &owner[0], &mesh.V()[0], &mesh.surfaceInterpolation::weights()[0], - &mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.nonOrthDeltaCoeffs()[0], boundary_face_vector_init, boundary_face_init, boundary_deltaCoeffs_init, boundaryCellIndex, patchTypes); - dfRhoEqn rhoEqn_GPU(dfDataBase); - dfUEqn UEqn_GPU(dfDataBase, "dDDI", settingPath); - dfYEqn YEqn_GPU(dfDataBase, "dDDI", settingPath, inertIndex); - dfEEqn EEqn_GPU(dfDataBase, "dDDI", settingPath); - - double *ueqn_internalCoeffs_init, *ueqn_boundaryCoeffs_init, *boundary_pressure_init, *boundary_velocity_init, - *boundary_nuEff_init, *boundary_rho_init, *ueqn_laplac_internalCoeffs_init, *ueqn_laplac_boundaryCoeffs_init, *boundary_phi_init; - cudaMallocHost(&ueqn_internalCoeffs_init, 3*num_boundary_faces*sizeof(double)); - cudaMallocHost(&ueqn_boundaryCoeffs_init, 3*num_boundary_faces*sizeof(double)); - cudaMallocHost(&ueqn_laplac_internalCoeffs_init, 3*num_boundary_faces*sizeof(double)); - cudaMallocHost(&ueqn_laplac_boundaryCoeffs_init, 3*num_boundary_faces*sizeof(double)); - 
cudaMallocHost(&boundary_velocity_init, 3*num_boundary_faces*sizeof(double)); - cudaMallocHost(&boundary_pressure_init, num_boundary_faces*sizeof(double)); - cudaMallocHost(&boundary_nuEff_init, num_boundary_faces*sizeof(double)); - cudaMallocHost(&boundary_rho_init, num_boundary_faces*sizeof(double)); - cudaMallocHost(&boundary_phi_init, num_boundary_faces*sizeof(double)); - - double *boundary_alphaEff, *boundary_K, *boundary_gradient; - cudaMallocHost(&boundary_K, num_boundary_faces*sizeof(double)); - cudaMallocHost(&boundary_alphaEff, num_boundary_faces*sizeof(double)); - cudaMallocHost(&boundary_gradient, num_boundary_faces * sizeof(double)); - - bool updateBoundaryFields = true; // make sure that the boundary fields do H2D copy at 1st timestep -#endif diff --git a/applications/solvers/dfLowMachFoam_new/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam_new/dfLowMachFoam.C deleted file mode 100644 index f5b6ec90d..000000000 --- a/applications/solvers/dfLowMachFoam_new/dfLowMachFoam.C +++ /dev/null @@ -1,447 +0,0 @@ -/*---------------------------------------------------------------------------*\ - ========= | - \\ / F ield | OpenFOAM: The Open Source CFD Toolbox - \\ / O peration | Website: https://openfoam.org - \\ / A nd | Copyright (C) 2011-2019 OpenFOAM Foundation - \\/ M anipulation | -------------------------------------------------------------------------------- -License - This file is part of OpenFOAM. - - OpenFOAM is free software: you can redistribute it and/or modify it - under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - OpenFOAM is distributed in the hope that it will be useful, but WITHOUT - ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - for more details. 
- - You should have received a copy of the GNU General Public License - along with OpenFOAM. If not, see . - -Application - rhoPimpleFoam - -Description - Transient solver for turbulent flow of compressible fluids for HVAC and - similar applications, with optional mesh motion and mesh topology changes. - - Uses the flexible PIMPLE (PISO-SIMPLE) solution for time-resolved and - pseudo-transient simulations. - -\*---------------------------------------------------------------------------*/ - -#include "dfChemistryModel.H" -#include "CanteraMixture.H" -// #include "hePsiThermo.H" -#include "heRhoThermo.H" - -#ifdef USE_PYTORCH -#include -#include -#include //used to convert -#endif - -#ifdef USE_LIBTORCH -#include -#include "DNNInferencer.H" -#endif - -#include "fvCFD.H" -#include "fluidThermo.H" -#include "turbulentFluidThermoModel.H" -#include "pimpleControl.H" -#include "pressureControl.H" -#include "localEulerDdtScheme.H" -#include "fvcSmooth.H" -#include "PstreamGlobals.H" -#include "basicThermo.H" -#include "CombustionModel.H" - -#define GPUSolverNew_ -#define TIME - -#ifdef GPUSolverNew_ -#include "dfUEqn.H" -// #include "dfYEqn.H" -// #include "dfRhoEqn.H" -// #include "dfEEqn.H" -#include "dfMatrixDataBase.H" -#include "dfMatrixOpBase.H" -#include -#include - -#include "createGPUSolver.H" - -#include "upwind.H" -#include "GenFvMatrix.H" -#endif - -#ifdef TIME - #define TICK_START \ - start_new = std::clock(); - #define TICK_STOP(prefix) \ - stop_new = std::clock(); \ - Foam::Info << #prefix << " time = " << double(stop_new - start_new) / double(CLOCKS_PER_SEC) << " s" << Foam::endl; -#else - #define TICK_START - #define TICK_STOP(prefix) -#endif - -// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // - -int main(int argc, char *argv[]) -{ -#ifdef USE_PYTORCH - pybind11::scoped_interpreter guard{};//start python interpreter -#endif - #include "postProcess.H" - - // unsigned int flags = 0; - // 
checkCudaErrors(cudaGetDeviceFlags(&flags)); - // flags |= cudaDeviceScheduleYield; - // checkCudaErrors(cudaSetDeviceFlags(flags)); - - // #include "setRootCaseLists.H" - #include "listOptions.H" - #include "setRootCase2.H" - #include "listOutput.H" - - #include "createTime.H" - #include "createMesh.H" - #include "createDyMControls.H" - #include "initContinuityErrs.H" - #include "createFields.H" - #include "createRhoUfIfPresent.H" - - double time_monitor_init = 0; - - double time_monitor_other = 0; - double time_monitor_rho = 0; - double time_monitor_U = 0; - double time_monitor_Y = 0; - double time_monitor_E = 0; - double time_monitor_p = 0; - double time_monitor_chemistry_correctThermo = 0; - double time_monitor_turbulence_correct = 0; - double time_monitor_chem = 0; // combustion correct - - double time_monitor_rhoEqn = 0; - double time_monitor_rhoEqn_mtxAssembly = 0; - double time_monitor_rhoEqn_mtxAssembly_CPU_prepare = 0; - double time_monitor_rhoEqn_mtxAssembly_GPU_run = 0; - double time_monitor_rhoEqn_solve = 0; - double time_monitor_rhoEqn_correctBC = 0; - - double time_monitor_UEqn = 0; - double time_monitor_UEqn_mtxAssembly = 0; - double time_monitor_UEqn_mtxAssembly_CPU_prepare = 0; - double time_monitor_UEqn_mtxAssembly_GPU_run = 0; - double time_monitor_UEqn_solve = 0; - double time_monitor_UEqn_correctBC = 0; - double time_monitor_UEqn_H = 0; - double time_monitor_UEqn_H_GPU_run = 0; - double time_monitor_UEqn_H_correctBC = 0; - double time_monitor_UEqn_A = 0; - double time_monitor_UEqn_A_GPU_run = 0; - double time_monitor_UEqn_A_correctBC = 0; - - double time_monitor_YEqn = 0; - double time_monitor_YEqn_mtxAssembly = 0; - double time_monitor_YEqn_mtxAssembly_CPU_prepare = 0; - double time_monitor_YEqn_mtxAssembly_GPU_run = 0; - double time_monitor_YEqn_solve = 0; - double time_monitor_YEqn_correctBC = 0; - - double time_monitor_EEqn = 0; - double time_monitor_EEqn_mtxAssembly = 0; - double time_monitor_EEqn_mtxAssembly_CPU_prepare = 0; - double 
time_monitor_EEqn_mtxAssembly_GPU_prepare = 0; - double time_monitor_EEqn_mtxAssembly_GPU_run = 0; - double time_monitor_EEqn_solve = 0; - double time_monitor_EEqn_correctBC = 0; - - double time_monitor_pEqn = 0; - double time_monitor_pEqn_solve = 0; - - label timeIndex = 0; - clock_t start, end, start1, end1, start2, end2; - clock_t start_new, stop_new; - double time_new = 0; - - turbulence->validate(); - - if (!LTS) - { - #include "compressibleCourantNo.H" - #include "setInitialDeltaT.H" - } - - start1 = std::clock(); - createGPUBase(mesh, Y); - createGPUUEqn(CanteraTorchProperties, U); - - end1 = std::clock(); - time_monitor_init += double(end1 - start1) / double(CLOCKS_PER_SEC); - - // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // - - Info<< "\nStarting time loop\n" << endl; - - while (runTime.run()) - { - timeIndex ++; - - #include "readDyMControls.H" - - if (LTS) - { - #include "setRDeltaT.H" - } - else - { - #include "compressibleCourantNo.H" - #include "setDeltaT.H" - } - - runTime++; - - Info<< "Time = " << runTime.timeName() << nl << endl; - dfDataBase.preTimeStep(&rho.oldTime()[0]); - clock_t loop_start = std::clock(); - // --- Pressure-velocity PIMPLE corrector loop - while (pimple.loop()) - { - start = std::clock(); - if (splitting) - { - #include "YEqn_RR.H" - } - if (pimple.firstPimpleIter() || moveMeshOuterCorrectors) - { - // Store momentum to set rhoUf for introduced faces. 
- autoPtr rhoU; - if (rhoUf.valid()) - { - rhoU = new volVectorField("rhoU", rho*U); - } - } - end = std::clock(); - time_monitor_other += double(end - start) / double(CLOCKS_PER_SEC); - - start = std::clock(); - if (pimple.firstPimpleIter() && !pimple.simpleRho()) - { - #include "rhoEqn.H" - } - end = std::clock(); - time_monitor_rho += double(end - start) / double(CLOCKS_PER_SEC); - - start = std::clock(); - #include "UEqn.H" - end = std::clock(); - time_monitor_U += double(end - start) / double(CLOCKS_PER_SEC); - - if(combModelName!="ESF" && combModelName!="flareFGM" && combModelName!="DeePFGM") - { - start = std::clock(); - #include "YEqn.H" - end = std::clock(); - time_monitor_Y += double(end - start) / double(CLOCKS_PER_SEC); - - start = std::clock(); - #include "EEqn.H" - end = std::clock(); - time_monitor_E += double(end - start) / double(CLOCKS_PER_SEC); - - start = std::clock(); - chemistry->correctThermo(); - end = std::clock(); - time_monitor_chemistry_correctThermo += double(end - start) / double(CLOCKS_PER_SEC); - } - else - { - combustion->correct(); - } - - Info<< "min/max(T) = " << min(T).value() << ", " << max(T).value() << endl; - - // --- Pressure corrector loop - - start = std::clock(); - while (pimple.correct()) - { - if (pimple.consistent()) - { - // #include "pcEqn.H" - } - else - { - #include "pEqn.H" - } - } - end = std::clock(); - time_monitor_p += double(end - start) / double(CLOCKS_PER_SEC); - - start = std::clock(); - if (pimple.turbCorr()) - { - turbulence->correct(); - } - end = std::clock(); - time_monitor_turbulence_correct += double(end - start) / double(CLOCKS_PER_SEC); - } - clock_t loop_end = std::clock(); - double loop_time = double(loop_end - loop_start) / double(CLOCKS_PER_SEC); - - rho = thermo.rho(); - - dfDataBase.postTimeStep(); - - runTime.write(); - Info<< "========Time Spent in diffenet parts========"<< endl; - Info<< "loop Time = " << loop_time << " s" << endl; - Info<< "other Time = " << time_monitor_other << " s" 
<< endl; - Info<< "rho Equations = " << time_monitor_rho << " s" << endl; - Info<< "U Equations = " << time_monitor_U << " s" << endl; - Info<< "Y Equations = " << time_monitor_Y - time_monitor_chem << " s" << endl; - Info<< "E Equations = " << time_monitor_E << " s" << endl; - Info<< "p Equations = " << time_monitor_p << " s" << endl; - Info<< "chemistry correctThermo = " << time_monitor_chemistry_correctThermo << " s" << endl; - Info<< "turbulence correct = " << time_monitor_turbulence_correct << " s" << endl; - Info<< "combustion correct(in Y) = " << time_monitor_chem << " s" << endl; - Info<< "percentage of chemistry = " << time_monitor_chem / loop_time * 100 << " %" << endl; - Info<< "percentage of rho/U/Y/E = " << (time_monitor_E + time_monitor_Y + time_monitor_U + time_monitor_rho - time_monitor_chem) / loop_time * 100 << " %" << endl; - - - Info<< "========Time details of each equation======="<< endl; - - Info<< "rhoEqn Time = " << time_monitor_rhoEqn << " s" << endl; - Info<< "rhoEqn assamble = " << time_monitor_rhoEqn_mtxAssembly << " s" << endl; - Info<< "rhoEqn assamble(CPU prepare) = " << time_monitor_rhoEqn_mtxAssembly_CPU_prepare << " s" << endl; - Info<< "rhoEqn assamble(GPU run) = " << time_monitor_rhoEqn_mtxAssembly_GPU_run << " s" << endl; - Info<< "rhoEqn solve = " << time_monitor_rhoEqn_solve << " s" << endl; - Info<< "rhoEqn correct boundary = " << time_monitor_rhoEqn_correctBC << " s" << endl; - - Info<< "UEqn Time = " << time_monitor_UEqn << " s" << endl; - Info<< "UEqn assamble = " << time_monitor_UEqn_mtxAssembly << " s" << endl; - Info<< "UEqn assamble(CPU prepare) = " << time_monitor_UEqn_mtxAssembly_CPU_prepare << " s" << endl; - Info<< "UEqn assamble(GPU run) = " << time_monitor_UEqn_mtxAssembly_GPU_run << " s" << endl; - Info<< "UEqn solve = " << time_monitor_UEqn_solve << " s" << endl; - Info<< "UEqn correct boundary = " << time_monitor_UEqn_correctBC << " s" << endl; - Info<< "UEqn H = " << time_monitor_UEqn_H << " s" << endl; - 
Info<< "UEqn H(GPU run) = " << time_monitor_UEqn_H_GPU_run << " s" << endl; - Info<< "UEqn H(correct boundary) = " << time_monitor_UEqn_H_correctBC << " s" << endl; - Info<< "UEqn A = " << time_monitor_UEqn_A << " s" << endl; - Info<< "UEqn A(GPU run) = " << time_monitor_UEqn_A_GPU_run << " s" << endl; - Info<< "UEqn A(correct boundary) = " << time_monitor_UEqn_A_correctBC << " s" << endl; - - Info<< "YEqn Time = " << time_monitor_YEqn << " s" << endl; - Info<< "YEqn assamble = " << time_monitor_YEqn_mtxAssembly << " s" << endl; - Info<< "YEqn assamble(CPU prepare) = " << time_monitor_YEqn_mtxAssembly_CPU_prepare << " s" << endl; - Info<< "YEqn assamble(GPU run) = " << time_monitor_YEqn_mtxAssembly_GPU_run << " s" << endl; - Info<< "YEqn solve = " << time_monitor_YEqn_solve << " s" << endl; - Info<< "YEqn correct boundary = " << time_monitor_YEqn_correctBC << " s" << endl; - - Info<< "EEqn Time = " << time_monitor_EEqn << " s" << endl; - Info<< "EEqn assamble = " << time_monitor_EEqn_mtxAssembly << " s" << endl; - Info<< "EEqn assamble(CPU prepare) = " << time_monitor_EEqn_mtxAssembly_CPU_prepare << " s" << endl; - Info<< "EEqn assamble(GPU prepare) = " << time_monitor_EEqn_mtxAssembly_GPU_prepare << " s" << endl; - Info<< "EEqn assamble(GPU run) = " << time_monitor_EEqn_mtxAssembly_GPU_run << " s" << endl; - Info<< "EEqn solve = " << time_monitor_EEqn_solve << " s" << endl; - Info<< "EEqn correct boundary = " << time_monitor_EEqn_correctBC << " s" << endl; - - Info<< "pEqn Time = " << time_monitor_pEqn << " s" << endl; - Info<< "pEqn Time solve = " << time_monitor_pEqn_solve << " s" << endl; - - Info<< "============================================"<. - -Global - rhoEqn - -Description - Solve the continuity for density. 
- -\*---------------------------------------------------------------------------*/ -#ifdef GPUSolver_ -{ - start1 = std::clock(); - rho.oldTime(); - - int offset = 0; - forAll(U.boundaryField(), patchi) - { - const fvsPatchScalarField& patchFlux = phi.boundaryField()[patchi]; - int patchSize = patchFlux.size(); - memcpy(boundary_phi_init+offset, &patchFlux[0], patchSize*sizeof(double)); - offset += patchSize; - } - end1 = std::clock(); - time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_rhoEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_rhoEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); - - start1 = std::clock(); - rhoEqn_GPU.initializeTimeStep(); - rhoEqn_GPU.fvc_div(&phi[0], boundary_phi_init); - rhoEqn_GPU.fvm_ddt(&rho.oldTime()[0]); - rhoEqn_GPU.sync(); - end1 = std::clock(); - time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_rhoEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_rhoEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC); - - start1 = std::clock(); - rhoEqn_GPU.updatePsi(&rho.primitiveFieldRef()[0]); - rho.correctBoundaryConditions(); - end1 = std::clock(); - time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_rhoEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); -} -#else -{ - start1 = std::clock(); - fvScalarMatrix rhoEqn - ( - fvm::ddt(rho) - + fvc::div(phi) - ); - end1 = std::clock(); - time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_rhoEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - - start1 = std::clock(); - rhoEqn.solve(); - end1 = std::clock(); - time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_rhoEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); -} -#endif - -// 
************************************************************************* // diff --git a/applications/solvers/dfLowMachFoam_new/setRDeltaT.H b/applications/solvers/dfLowMachFoam_new/setRDeltaT.H deleted file mode 100644 index 074d05e3d..000000000 --- a/applications/solvers/dfLowMachFoam_new/setRDeltaT.H +++ /dev/null @@ -1,85 +0,0 @@ -{ - volScalarField& rDeltaT = trDeltaT.ref(); - - const dictionary& pimpleDict = pimple.dict(); - - scalar maxCo - ( - pimpleDict.lookupOrDefault("maxCo", 0.8) - ); - - scalar rDeltaTSmoothingCoeff - ( - pimpleDict.lookupOrDefault("rDeltaTSmoothingCoeff", 0.02) - ); - - scalar rDeltaTDampingCoeff - ( - pimpleDict.lookupOrDefault("rDeltaTDampingCoeff", 1.0) - ); - - scalar maxDeltaT - ( - pimpleDict.lookupOrDefault("maxDeltaT", great) - ); - - volScalarField rDeltaT0("rDeltaT0", rDeltaT); - - // Set the reciprocal time-step from the local Courant number - rDeltaT.ref() = max - ( - 1/dimensionedScalar(dimTime, maxDeltaT), - fvc::surfaceSum(mag(phi))()() - /((2*maxCo)*mesh.V()*rho()) - ); - - if (pimple.transonic()) - { - surfaceScalarField phid - ( - "phid", - fvc::interpolate(psi)*fvc::flux(U) - ); - - rDeltaT.ref() = max - ( - rDeltaT(), - fvc::surfaceSum(mag(phid))()() - /((2*maxCo)*mesh.V()*psi()) - ); - } - - // Update tho boundary values of the reciprocal time-step - rDeltaT.correctBoundaryConditions(); - - Info<< "Flow time scale min/max = " - << gMin(1/rDeltaT.primitiveField()) - << ", " << gMax(1/rDeltaT.primitiveField()) << endl; - - if (rDeltaTSmoothingCoeff < 1.0) - { - fvc::smooth(rDeltaT, rDeltaTSmoothingCoeff); - } - - Info<< "Smoothed flow time scale min/max = " - << gMin(1/rDeltaT.primitiveField()) - << ", " << gMax(1/rDeltaT.primitiveField()) << endl; - - // Limit rate of change of time scale - // - reduce as much as required - // - only increase at a fraction of old time scale - if - ( - rDeltaTDampingCoeff < 1.0 - && runTime.timeIndex() > runTime.startTimeIndex() + 1 - ) - { - rDeltaT = - rDeltaT0 - 
*max(rDeltaT/rDeltaT0, scalar(1) - rDeltaTDampingCoeff); - - Info<< "Damped flow time scale min/max = " - << gMin(1/rDeltaT.primitiveField()) - << ", " << gMax(1/rDeltaT.primitiveField()) << endl; - } -} diff --git a/applications/solvers/dfLowMachFoam_new/setRootCase2.H b/applications/solvers/dfLowMachFoam_new/setRootCase2.H deleted file mode 100644 index 45d966e63..000000000 --- a/applications/solvers/dfLowMachFoam_new/setRootCase2.H +++ /dev/null @@ -1,5 +0,0 @@ -Foam::argList args(argc,argv,true,true,/*initialise=*/false); -if (!args.checkRootCase()) -{ - Foam::FatalError.exit(); -} \ No newline at end of file