From f5243bf960895f4404c856d67c0085974db7f77b Mon Sep 17 00:00:00 2001
From: maorz1998 <maorz1998@stu.pku.edu.cn>
Date: Wed, 24 May 2023 21:51:08 +0800
Subject: [PATCH] preliminary refactory

---
 applications/solvers/dfLowMachFoam/EEqn.H     |   36 +-
 applications/solvers/dfLowMachFoam/UEqn.H     |   18 +-
 applications/solvers/dfLowMachFoam/YEqn.H     |   14 +-
 .../solvers/dfLowMachFoam/createdfSolver.H    |    6 +-
 .../solvers/dfLowMachFoam/dfLowMachFoam.C     |    2 +-
 src_gpu/CMakeLists.txt                        |    2 +-
 src_gpu/GPUMesh.H                             |   23 +
 src_gpu/GPUfield.H                            |   57 +
 src_gpu/GPUfield.cpp                          |   18 +
 src_gpu/dfMatrix.H                            |  217 ----
 src_gpu/dfMatrixDataBase.H                    |  550 +++++++++
 src_gpu/dfUEqn.H                              |   53 +
 src_gpu/{dfMatrix.cu => dfUEqn.cu}            |  861 +++----------
 src_gpu/ueqn.cu                               | 1097 -----------------
 14 files changed, 896 insertions(+), 2058 deletions(-)
 create mode 100644 src_gpu/GPUMesh.H
 create mode 100644 src_gpu/GPUfield.H
 create mode 100644 src_gpu/GPUfield.cpp
 delete mode 100644 src_gpu/dfMatrix.H
 create mode 100644 src_gpu/dfMatrixDataBase.H
 create mode 100644 src_gpu/dfUEqn.H
 rename src_gpu/{dfMatrix.cu => dfUEqn.cu} (57%)
 delete mode 100644 src_gpu/ueqn.cu

diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H
index 9da3120c8..32d94d8d1 100644
--- a/applications/solvers/dfLowMachFoam/EEqn.H
+++ b/applications/solvers/dfLowMachFoam/EEqn.H
@@ -1,22 +1,24 @@
 {
     volScalarField& he = thermo.he();
-    // start1 = std::clock();
-    // t.join();
-    // UEqn_GPU.updatePsi(&U[0][0]);
-    // K = 0.5*magSqr(U);
-    // end1 = std::clock();
-    // time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    // time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    // time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    // time_monitor_UinE += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    start1 = std::clock();
-    // // t.join();
-    UEqn_GPU.updatePsi(&U[0][0]);
-    K = 0.5*magSqr(U);
-    end1 = std::clock();
-    time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    #ifdef GPUSolver_
+        // start1 = std::clock();
+        // t.join();
+        // UEqn_GPU.updatePsi(&U[0][0]);
+        // K = 0.5*magSqr(U);
+        // end1 = std::clock();
+        // time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+        // time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+        // time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC);
+        // time_monitor_UinE += double(end1 - start1) / double(CLOCKS_PER_SEC);
+        start1 = std::clock();
+        // // t.join();
+        UEqn_GPU.updatePsi(&U[0][0]);
+        K = 0.5*magSqr(U);
+        end1 = std::clock();
+        time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+        time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+        time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    #endif
 
     fvScalarMatrix EEqn
     (
diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H
index d81f3948c..f7547af48 100644
--- a/applications/solvers/dfLowMachFoam/UEqn.H
+++ b/applications/solvers/dfLowMachFoam/UEqn.H
@@ -15,15 +15,15 @@ end1 = std::clock();
 #endif
 
 #ifdef CPUSolver_
-// UEqn.relax();
-start1 = std::clock();
-if (pimple.momentumPredictor())
-{
-    solve(UEqn);
+    // UEqn.relax();
+    start1 = std::clock();
+    if (pimple.momentumPredictor())
+    {
+        solve(UEqn);
 
-    K = 0.5*magSqr(U);
-}
-end1 = std::clock();
+        K = 0.5*magSqr(U);
+    }
+    end1 = std::clock();
     time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
     time_monitor_UEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 #endif
@@ -101,7 +101,7 @@ end1 = std::clock();
     boundary_pressure_init, boundary_velocity_init, boundary_nuEff_init, boundary_rho_init);
     UEqn_GPU.fvc_grad(&p[0]);
     UEqn_GPU.fvc_grad_vector();
-    UEqn_GPU.divDevRhoReff();
+    UEqn_GPU.dev2T();
     UEqn_GPU.fvc_div_tensor(&nuEff[0]);
     UEqn_GPU.fvm_laplacian();
     end1 = std::clock();
diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H
index b0187baf2..8831b67ac 100644
--- a/applications/solvers/dfLowMachFoam/YEqn.H
+++ b/applications/solvers/dfLowMachFoam/YEqn.H
@@ -26,12 +26,14 @@ forAll(Y, i)
 }
 const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf();
 
-start1 = std::clock();
-// // std::thread t(&dfMatrix::solve, &UEqn_GPU);
-UEqn_GPU.solve();
-end1 = std::clock();
-time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-time_monitor_UEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
+#ifdef GPUSolver_
+    start1 = std::clock();
+    // // std::thread t(&dfMatrix::solve, &UEqn_GPU);
+    UEqn_GPU.solve();
+    end1 = std::clock();
+    time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_UEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
+#endif
 
 //MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM);
 label flag_mpi_init;
diff --git a/applications/solvers/dfLowMachFoam/createdfSolver.H b/applications/solvers/dfLowMachFoam/createdfSolver.H
index e5946776c..79d1f2e78 100644
--- a/applications/solvers/dfLowMachFoam/createdfSolver.H
+++ b/applications/solvers/dfLowMachFoam/createdfSolver.H
@@ -28,8 +28,10 @@ int num_boundary_cells;
 string settingPath;
 settingPath = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string(""));
 
-dfMatrix UEqn_GPU(num_surfaces, num_cells, num_boundary_faces, num_boundary_cells, &neighbour[0], &owner[0], &mesh.V()[0], &mesh.surfaceInterpolation::weights()[0], 
-&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.nonOrthDeltaCoeffs()[0], boundary_face_vector_init, boundary_face_init, boundary_deltaCoeffs_init, boundaryCellIndex, "dDDI", settingPath);
+dfMatrixDataBase dfDataBase(num_surfaces, num_cells, num_boundary_faces, num_boundary_cells, &neighbour[0], &owner[0], &mesh.V()[0], &mesh.surfaceInterpolation::weights()[0], 
+&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.nonOrthDeltaCoeffs()[0], boundary_face_vector_init, boundary_face_init, boundary_deltaCoeffs_init, boundaryCellIndex);
+
+dfUEqn UEqn_GPU(dfDataBase, "dDDI", settingPath);
 
 double *ueqn_internalCoeffs_init, *ueqn_boundaryCoeffs_init, *boundary_pressure_init, *boundary_velocity_init,
     *boundary_nuEff_init, *boundary_rho_init, *ueqn_laplac_internalCoeffs_init, *ueqn_laplac_boundaryCoeffs_init;
diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
index bcd56d242..2a0a43826 100644
--- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
+++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
@@ -59,7 +59,7 @@ Description
 #include "basicThermo.H"
 #include "CombustionModel.H"
 
-#include "dfMatrix.H"
+#include "dfUEqn.H"
 #include <cuda_runtime.h>
 #include <thread>
 
diff --git a/src_gpu/CMakeLists.txt b/src_gpu/CMakeLists.txt
index 5bcf34327..7c48e5bb2 100644
--- a/src_gpu/CMakeLists.txt
+++ b/src_gpu/CMakeLists.txt
@@ -18,7 +18,7 @@ include_directories(
     $ENV{AMGX_DIR}/include
 )
 
-add_library(${PROJECT_NAME} SHARED dfMatrix.cu AmgXSolver.cu)
+add_library(${PROJECT_NAME} SHARED dfUEqn.cu AmgXSolver.cu)
 
 target_link_libraries(${PROJECT_NAME}
     ${MPI_LIBRARIES}
diff --git a/src_gpu/GPUMesh.H b/src_gpu/GPUMesh.H
new file mode 100644
index 000000000..22cc05ed8
--- /dev/null
+++ b/src_gpu/GPUMesh.H
@@ -0,0 +1,23 @@
+#include <iostream>
+
+class GPUMesh
+{
+public:
+    int num_cells;
+    int num_faces;
+    int boundary_cells;
+    //... all variables needed to upload once
+
+public:
+    GPUMesh();
+    ~GPUMesh();
+};
+
+GPUMesh::GPUMesh()
+{
+    // same to the constructor of dfMatrix.C
+}
+
+GPUMesh::~GPUMesh()
+{
+}
diff --git a/src_gpu/GPUfield.H b/src_gpu/GPUfield.H
new file mode 100644
index 000000000..d81afbb01
--- /dev/null
+++ b/src_gpu/GPUfield.H
@@ -0,0 +1,57 @@
+/*
+GPU field
+UEqn
+GPUField
+rho_old, rho_new, vector_old, phi, p, nuEff, 
+p_bou, vector_old_bou, nuEff_bou, rho_new_bou, 
+ueqn_internalCoeffs, ueqn_boundaryCoeffs_init, ueqn_laplac_internalCoeffs_init, ueqn_laplac_boundaryCoeffs_init
+1. initialize a map
+2. implement methods to construct GPU pointer and return GPU pointer
+
+fvm__ddt(UEqn, rho, U)
+{
+    rho.oldtime()
+    rho.
+}
+*/
+#include <iostream>
+#include <unordered_map>
+
+#define TOSTRING(x) #x
+
+struct GPUField {
+    double* cur_internal;
+    double* cur_boundary;
+    double* old_internal;
+    double* old_boundary;
+};
+std::unordered_map<std::string, GPUField> GPUFields;
+
+// initialize: cudaMalloc, conducted at begining
+void initialize(std::string U, std::string p, std::string phi);
+// Q: 
+// 1. consider the different sizes bettween face values and cell values
+// 2. not all variables need all these four terms
+
+// update at the end of this time step
+// move current pointer as oldTime pointer
+void update();
+
+template<class Type, template<class> class PatchField, class GeoMesh>
+double* cur_internal(Foam::GeometryField<Type, PatchField, GeoMesh> var)
+{
+    if (!GPUFields[TOSTRING(var)].cur_internal) {
+        // 1. cudaMemcopy current internal field
+    }
+    return GPUFields[TOSTRING(var)].cur_internal;
+}
+
+template<class Type, template<class> class PatchField, class GeoMesh>
+double* cur_boundary(Foam::GeometryField<Type, PatchField, GeoMesh> var);
+
+template<class Type, template<class> class PatchField, class GeoMesh>
+double* old_internal(Foam::GeometryField<Type, PatchField, GeoMesh> var);
+
+template<class Type, template<class> class PatchField, class GeoMesh>
+double* old_boundary(Foam::GeometryField<Type, PatchField, GeoMesh> var);
+
diff --git a/src_gpu/GPUfield.cpp b/src_gpu/GPUfield.cpp
new file mode 100644
index 000000000..e79dbdf07
--- /dev/null
+++ b/src_gpu/GPUfield.cpp
@@ -0,0 +1,18 @@
+/*
+GPU field
+UEqn
+GPUField
+rho_old, rho_new, vector_old, phi, p, nuEff, 
+p_bou, vector_old_bou, nuEff_bou, rho_new_bou, 
+ueqn_internalCoeffs, ueqn_boundaryCoeffs_init, ueqn_laplac_internalCoeffs_init, ueqn_laplac_boundaryCoeffs_init
+1. initialize a map
+2. implement methods to construct GPU pointer and return GPU pointer
+
+fvm__ddt(UEqn, rho, U)
+{
+    rho.oldtime()
+    rho.
+}
+*/
+#include <stdio.h>
+
diff --git a/src_gpu/dfMatrix.H b/src_gpu/dfMatrix.H
deleted file mode 100644
index 396d5a29d..000000000
--- a/src_gpu/dfMatrix.H
+++ /dev/null
@@ -1,217 +0,0 @@
-#ifndef dfMatrix_H
-#define dfMatrix_H
-
-#include <stdio.h>
-#include <unistd.h>
-#include "cuda_profiler_api.h"
-#include <cuda_runtime.h>
-#include "nvtx3/nvToolsExt.h"
-#include <vector>
-#include <numeric>
-#include <algorithm>
-#include <map>
-#include <iostream>
-#include <ctime>
-
-#include "AmgXSolver.H"
-# include <amgx_c.h>
-
-static const char *_cudaGetErrorEnum(cudaError_t error) {
-  return cudaGetErrorName(error);
-}
-
-template <typename T>
-void check(T result, char const *const func, const char *const file,
-        int const line) {
-  if (result) {
-    fprintf(stderr, "cuda error at %s:%d code=%d(%s) \"%s\" \n", file, line,
-            static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
-    exit(EXIT_FAILURE);
-  }
-}
-
-#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
-
-class dfMatrix
-{
-private:
-    /**
-     * @brief data size
-     */
-    // - number of cell size
-    int num_cells;
-    // - number of face size
-    int num_surfaces;
-    // - number of offdiagnal entry size (2*num_surfaces)
-    int num_faces;
-    // - number of boundary cells
-    int num_boundary_cells;
-    // - number of boundary faces
-    int num_boundary_faces;
-
-    /**
-     * @brief input data (now these values are named according to UEqn)
-     */
-    // - mesh variables
-    // - csr_row_index
-    int *h_A_csr_row_index=nullptr, *d_A_csr_row_index=nullptr;
-    // - csr_col_index
-    int *h_A_csr_col_index=nullptr, *d_A_csr_col_index=nullptr;
-    // - csr_diag_index
-    int *h_A_csr_diag_index=nullptr, *d_A_csr_diag_index=nullptr;
-
-    // - the pre-permutated and post-permutated interpolation weight list
-    std::vector<double> h_weight_vec_init, h_weight_vec;
-    // - the pre-permutated and post-permutated flux (phi) list
-    std::vector<double> h_phi_vec_init, h_phi_vec;
-    // - the pre-permutated and post-permutated cell face vector list
-    std::vector<double> h_face_vector_vec_init, h_face_vector_vec;
-    std::vector<double> h_face_vec_init, h_face_vec;
-    std::vector<double> h_deltaCoeffs_vec_init, h_deltaCoeffs_vec;
-    // - the host pointer to rho_new, rho_old, velocity_old, pressure and volume list
-    double *h_rho_new = nullptr, *h_rho_old = nullptr, *h_velocity_old = nullptr, 
-    *h_pressure = nullptr;
-    const double *h_volume = nullptr;
-    // - the host pointer to the pre-permutated and post-permutated interpolation weight list
-    double *h_weight_init = nullptr, *h_weight = nullptr;
-    // - the host pointer to the pre-permutated and post-permutated flux (phi) list
-    double *h_phi_init = nullptr, *h_phi = nullptr;
-    // - the host pointer to the pre-permutated and post-permutated cell face vector list
-    double *h_face_vector_init = nullptr, *h_face_vector = nullptr;
-    double *h_face_init = nullptr, *h_face = nullptr;
-    double *h_deltaCoeffs_init = nullptr, *h_deltaCoeffs = nullptr;
-    // - the device pointer to rho_new, rho_old, velocity_old, pressure and volume list
-    double *d_rho_new = nullptr, *d_rho_old = nullptr, *d_velocity_old = nullptr, 
-    *d_pressure = nullptr, *d_volume = nullptr;
-    // - the device pointer to the pre-permutated and post-permutated interpolation weight list
-    double *d_weight_init = nullptr, *d_weight = nullptr;
-    // - the device pointer to the pre-permutated and post-permutated flux (phi) list
-    double *d_phi_init = nullptr, *d_phi = nullptr;
-    // - the device pointer to the pre-permutated and post-permutated cell face vector list
-    double *d_face_vector_init = nullptr, *d_face_vector = nullptr;
-    double *d_face_init = nullptr, *d_face = nullptr;
-    double *d_deltaCoeffs_init = nullptr, *d_deltaCoeffs = nullptr;
-
-    double rdelta_t = 1/1e-6;
-
-    /**
-     * @brief boundary related variables
-     */
-    int *h_boundary_cell_offset = nullptr, *d_boundary_cell_offset=nullptr;
-    int *h_boundary_cell_id = nullptr, *d_boundary_cell_id = nullptr;
-    double *h_internal_coeffs = nullptr, *h_boundary_coeffs = nullptr,
-    *h_boundary_pressure = nullptr, *h_boundary_face_vector = nullptr,
-    *h_boundary_face = nullptr, *d_boundary_face = nullptr,
-    *h_boundary_deltaCoeffs = nullptr, *d_boundary_deltaCoeffs = nullptr, 
-    *d_internal_coeffs = nullptr, *d_boundary_coeffs = nullptr,
-    *d_internal_coeffs_init = nullptr, *d_boundary_coeffs_init = nullptr,
-    *d_laplac_internal_coeffs = nullptr, *d_laplac_boundary_coeffs = nullptr,
-    *d_laplac_internal_coeffs_init = nullptr, *d_laplac_boundary_coeffs_init = nullptr,
-    *d_boundary_pressure = nullptr, *d_boundary_face_vector = nullptr,
-    *d_boundary_pressure_init = nullptr,
-    *d_boundary_velocity = nullptr, *d_boundary_velocity_init = nullptr,
-    *d_boundary_nuEff = nullptr, *d_boundary_nuEff_init = nullptr,
-    *d_boundary_rho = nullptr, *d_boundary_rho_init = nullptr;
-
-    std::vector<int> boundPermutationList;
-    std::vector<double> ueqn_internalCoeffs, ueqn_boundaryCoeffs;
-    std::vector<double> boundary_face_vector;
-    std::vector<double> boundary_pressure;
-    std::vector<double> boundary_face;
-    std::vector<double> boundary_deltaCoeffs;
-
-    // - the device pointer to the permutated index list
-    std::vector<int> permedIndex;
-    int *d_permedIndex=nullptr;
-    int *d_bouPermedIndex = nullptr;
-
-
-    // bytesize
-    // - bytes of diagnal entries
-    size_t cell_bytes;
-    // - bytes of diagnal entries (vector)
-    size_t cell_vec_bytes;
-    // - bytes of diagnal index
-    size_t cell_index_bytes;
-     // - bytes of diagnal index
-    size_t face_bytes;
-    size_t face_vec_bytes;
-    size_t face_index_bytes;
-
-    size_t boundary_cell_bytes;
-    size_t boundary_cell_vec_bytes;
-    size_t boundary_cell_index_bytes;
-
-    size_t boundary_face_bytes;
-    size_t boundary_face_vec_bytes;
-    size_t boundary_face_index_bytes;
-
-    // A_csr has one more element in each row: itself
-    size_t csr_row_index_bytes;
-    size_t csr_col_index_bytes;
-    size_t csr_value_bytes;
-    size_t csr_value_vec_bytes;
-
-    // extra matrix information
-    double *d_turbSrc_A = nullptr, *d_turbSrc_b = nullptr, *d_turbSrc_A_init = nullptr;
-    std::vector<double> h_turbSrc_init_mtx_vec, h_turbSrc_init_1mtx;
-    std::vector<double> h_turbSrc_init_src_vec, h_turbSrc_src_vec;
-    std::vector<int> tmpPermutatedList;
-    int * d_tmpPermutatedList = nullptr;
-
-    double *h_A_csr = nullptr, *h_b = nullptr, *h_psi = nullptr;
-    double *d_A_csr = nullptr, *d_b = nullptr, *d_psi = nullptr;
-
-    /**
-     * @brief cuda functions
-     */
-    cudaStream_t stream;
-    /**
-     * @brief AmgX functions
-     */
-    AmgXSolver* UxSolver = nullptr, *UySolver = nullptr, *UzSolver = nullptr;
-    int num_iteration;
-
-    double time_monitor_CPU;
-    double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test;
-
-    double* d_grad = nullptr; 
-    double* d_grad_boundary = nullptr, *d_grad_boundary_init = nullptr;
-    double* d_nuEff = nullptr;
-
-
-
-public:
-    dfMatrix();
-    dfMatrix(int num_surfaces, int num_cells, int num_boundary_faces, int &num_boundary_cells_output,
-    const int *neighbour, const int *owner, const double* volume, const double* weight, const double* face_vector, const double* face, const double* deltaCoeffs,
-    std::vector<double> boundary_face_vector_init, std::vector<double> boundary_face_init, std::vector<double> boundary_deltaCoeffs_init,
-    std::vector<int> boundary_cell_id_init, const std::string &modeStr, const std::string &cfgFile);
-    ~dfMatrix();
-
-    void checkValue(bool print);
-
-    void fvm_ddt(double *rho_old, double *rho_new, double* vector_old);
-
-    void fvm_div(double* phi, double* ueqn_internalCoeffs_init, double* ueqn_boundaryCoeffs_init,
-    double* ueqn_laplac_internalCoeffs_init, double* ueqn_laplac_boundaryCoeffs_init,
-    double* boundary_pressure_init, double* boundary_velocity_init,
-    double* boundary_nuEff_init, double* boundary_rho_init);
-
-    void fvc_grad(double* pressure);
-
-    void fvc_grad_vector();
-
-    void divDevRhoReff();
-
-    void fvc_div_tensor(const double *nuEff);
-
-    void fvm_laplacian();
-
-    void add_fvMatrix(double* turbSrc_low, double* turbSrc_diag, double* turbSrc_upp, double* turbSrc_source);
-
-    void solve();
-    void updatePsi(double* Psi);
-};
-
-#endif
\ No newline at end of file
diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H
new file mode 100644
index 000000000..35f3fa21e
--- /dev/null
+++ b/src_gpu/dfMatrixDataBase.H
@@ -0,0 +1,550 @@
+#pragma once
+
+#include <stdio.h>
+#include <unistd.h>
+#include "cuda_profiler_api.h"
+#include <cuda_runtime.h>
+#include "nvtx3/nvToolsExt.h"
+#include <vector>
+#include <numeric>
+#include <algorithm>
+#include <map>
+#include <iostream>
+#include <ctime>
+
+
+static const char *_cudaGetErrorEnum(cudaError_t error) {
+  return cudaGetErrorName(error);
+}
+
+template <typename T>
+void check(T result, char const *const func, const char *const file,
+        int const line) {
+  if (result) {
+    fprintf(stderr, "cuda error at %s:%d code=%d(%s) \"%s\" \n", file, line,
+            static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
+    exit(EXIT_FAILURE);
+  }
+}
+
+#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+
+void checkVectorEqual(int count, const double* basevec, double* vec, double max_relative_error) {
+    for (size_t i = 0; i < count; ++i)
+    {
+        double abs_diff = fabs(basevec[i] - vec[i]);
+        double rel_diff = fabs(basevec[i] - vec[i]) / fabs(basevec[i]);
+        if (abs_diff > 1e-12 && rel_diff > max_relative_error && !std::isinf(rel_diff))
+            fprintf(stderr, "mismatch index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff);
+    }
+}
+
+
+struct dfMatrixDataBase
+{
+    // - number of cell size
+    int num_cells;
+    // - number of face size
+    int num_surfaces;
+    // - number of offdiagnal entry size (2*num_surfaces)
+    int num_faces;
+    // - number of boundary cells
+    int num_boundary_cells;
+    // - number of boundary faces
+    int num_boundary_faces;
+
+    // - mesh variables
+    // - csr_row_index
+    int *h_A_csr_row_index=nullptr, *d_A_csr_row_index=nullptr;
+    // - csr_col_index
+    int *h_A_csr_col_index=nullptr, *d_A_csr_col_index=nullptr;
+    // - csr_diag_index
+    int *h_A_csr_diag_index=nullptr, *d_A_csr_diag_index=nullptr;
+
+    // - the pre-permutated and post-permutated interpolation weight list
+    std::vector<double> h_weight_vec_init, h_weight_vec;
+    // - the pre-permutated and post-permutated flux (phi) list
+    std::vector<double> h_phi_vec_init, h_phi_vec;
+    // - the pre-permutated and post-permutated cell face vector list
+    std::vector<double> h_face_vector_vec_init, h_face_vector_vec;
+    std::vector<double> h_face_vec_init, h_face_vec;
+    std::vector<double> h_deltaCoeffs_vec_init, h_deltaCoeffs_vec;
+    // - the host pointer to rho_new, rho_old, velocity_old, pressure and volume list
+    double *h_rho_new = nullptr, *h_rho_old = nullptr, *h_velocity_old = nullptr, 
+    *h_pressure = nullptr;
+    const double *h_volume = nullptr;
+    // - the host pointer to the pre-permutated and post-permutated interpolation weight list
+    double *h_weight_init = nullptr, *h_weight = nullptr;
+    // - the host pointer to the pre-permutated and post-permutated flux (phi) list
+    double *h_phi_init = nullptr, *h_phi = nullptr;
+    // - the host pointer to the pre-permutated and post-permutated cell face vector list
+    double *h_face_vector_init = nullptr, *h_face_vector = nullptr;
+    double *h_face_init = nullptr, *h_face = nullptr;
+    double *h_deltaCoeffs_init = nullptr, *h_deltaCoeffs = nullptr;
+    // - the device pointer to rho_new, rho_old, velocity_old, pressure and volume list
+    double *d_rho_new = nullptr, *d_rho_old = nullptr, *d_velocity_old = nullptr, 
+    *d_pressure = nullptr, *d_volume = nullptr;
+    // - the device pointer to the pre-permutated and post-permutated interpolation weight list
+    double *d_weight_init = nullptr, *d_weight = nullptr;
+    // - the device pointer to the pre-permutated and post-permutated flux (phi) list
+    double *d_phi_init = nullptr, *d_phi = nullptr;
+    // - the device pointer to the pre-permutated and post-permutated cell face vector list
+    double *d_face_vector_init = nullptr, *d_face_vector = nullptr;
+    double *d_face_init = nullptr, *d_face = nullptr;
+    double *d_deltaCoeffs_init = nullptr, *d_deltaCoeffs = nullptr;
+
+    double rdelta_t = 1/1e-6;
+
+    /**
+     * @brief boundary related variables
+     */
+    int *h_boundary_cell_offset = nullptr, *d_boundary_cell_offset=nullptr;
+    int *h_boundary_cell_id = nullptr, *d_boundary_cell_id = nullptr;
+    double *h_internal_coeffs = nullptr, *h_boundary_coeffs = nullptr,
+    *h_boundary_pressure = nullptr, *h_boundary_face_vector = nullptr,
+    *h_boundary_face = nullptr, *d_boundary_face = nullptr,
+    *h_boundary_deltaCoeffs = nullptr, *d_boundary_deltaCoeffs = nullptr, 
+    *d_internal_coeffs = nullptr, *d_boundary_coeffs = nullptr,
+    *d_internal_coeffs_init = nullptr, *d_boundary_coeffs_init = nullptr,
+    *d_laplac_internal_coeffs = nullptr, *d_laplac_boundary_coeffs = nullptr,
+    *d_laplac_internal_coeffs_init = nullptr, *d_laplac_boundary_coeffs_init = nullptr,
+    *d_boundary_pressure = nullptr, *d_boundary_face_vector = nullptr,
+    *d_boundary_pressure_init = nullptr,
+    *d_boundary_velocity = nullptr, *d_boundary_velocity_init = nullptr,
+    *d_boundary_nuEff = nullptr, *d_boundary_nuEff_init = nullptr,
+    *d_boundary_rho = nullptr, *d_boundary_rho_init = nullptr;
+
+    std::vector<int> boundPermutationList;
+    std::vector<double> ueqn_internalCoeffs, ueqn_boundaryCoeffs;
+    std::vector<double> boundary_face_vector;
+    std::vector<double> boundary_pressure;
+    std::vector<double> boundary_face;
+    std::vector<double> boundary_deltaCoeffs;
+
+    // - the device pointer to the permutated index list
+    std::vector<int> permedIndex;
+    int *d_permedIndex=nullptr;
+    int *d_bouPermedIndex = nullptr;
+
+
+    // bytesize
+    // - bytes of diagnal entries
+    size_t cell_bytes;
+    // - bytes of diagnal entries (vector)
+    size_t cell_vec_bytes;
+    // - bytes of diagnal index
+    size_t cell_index_bytes;
+     // - bytes of diagnal index
+    size_t face_bytes;
+    size_t face_vec_bytes;
+    size_t face_index_bytes;
+
+    size_t boundary_cell_bytes;
+    size_t boundary_cell_vec_bytes;
+    size_t boundary_cell_index_bytes;
+
+    size_t boundary_face_bytes;
+    size_t boundary_face_vec_bytes;
+    size_t boundary_face_index_bytes;
+
+    // A_csr has one more element in each row: itself
+    size_t csr_row_index_bytes;
+    size_t csr_col_index_bytes;
+    size_t csr_value_bytes;
+    size_t csr_value_vec_bytes;
+
+    // extra matrix information
+    double *d_turbSrc_A = nullptr, *d_turbSrc_b = nullptr, *d_turbSrc_A_init = nullptr;
+    std::vector<double> h_turbSrc_init_mtx_vec, h_turbSrc_init_1mtx;
+    std::vector<double> h_turbSrc_init_src_vec, h_turbSrc_src_vec;
+    std::vector<int> tmpPermutatedList;
+    int * d_tmpPermutatedList = nullptr;
+
+    // double *h_A_csr = nullptr, *h_b = nullptr, *h_psi = nullptr;
+    // double *d_A_csr = nullptr, *d_b = nullptr, *d_psi = nullptr;
+
+    /**
+     * @brief cuda functions
+     */
+    cudaStream_t stream;
+    /**
+     * @brief AmgX functions
+     */
+    AmgXSolver* UxSolver = nullptr, *UySolver = nullptr, *UzSolver = nullptr;
+    int num_iteration;
+
+    double time_monitor_CPU;
+    double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test;
+
+    double* d_grad = nullptr; 
+    double* d_grad_boundary = nullptr, *d_grad_boundary_init = nullptr;
+    double* d_nuEff = nullptr;
+
+    // constructor
+    dfMatrixDataBase();
+    dfMatrixDataBase(int num_surfaces, int num_cells, int num_boundary_faces, int & num_boundary_cells_output,
+        const int *neighbour, const int *owner, const double* volume, const double* weight, const double* face_vector, const double* face, 
+        const double* deltaCoeffs, std::vector<double> boundary_face_vector_init, std::vector<double> boundary_face_init, 
+        std::vector<double> boundary_deltaCoeffs_init, std::vector<int> boundary_cell_id_init)
+    : num_cells(num_cells), num_faces(num_surfaces*2), num_surfaces(num_surfaces), num_iteration(0),
+      num_boundary_faces(num_boundary_faces), h_volume(volume)
+    {
+        // allocate field pointer in pin memory
+        cudaMallocHost(&h_phi_init, num_faces * sizeof(double));
+
+        h_weight_vec_init.resize(num_faces);
+        h_weight_vec.resize(num_faces);
+        h_face_vector_vec_init.resize(num_faces*3);
+        h_face_vector_vec.resize(num_faces*3);
+        h_face_vec_init.resize(num_faces);
+        h_face_vec.resize(num_faces);
+        h_deltaCoeffs_vec_init.resize(num_faces);
+        h_deltaCoeffs_vec.resize(num_faces);
+        h_turbSrc_init_mtx_vec.resize(num_faces + num_cells);
+        h_turbSrc_init_1mtx.resize(num_faces + num_cells);
+        h_turbSrc_init_src_vec.resize(3*num_cells);
+        h_turbSrc_src_vec.resize(3*num_cells);
+
+        // byte sizes
+        cell_bytes = num_cells * sizeof(double);
+        cell_vec_bytes = num_cells * 3 * sizeof(double);
+        cell_index_bytes = num_cells * sizeof(int);
+
+        face_bytes = num_faces * sizeof(double);
+        face_vec_bytes = num_faces * 3 * sizeof(double);
+        face_index_bytes = num_faces * sizeof(int);
+
+        // A_csr has one more element in each row: itself
+        csr_row_index_bytes = (num_cells + 1) * sizeof(int);
+        csr_col_index_bytes = (num_cells + num_faces) * sizeof(int);
+        csr_value_bytes = (num_cells + num_faces) * sizeof(double);
+        csr_value_vec_bytes = (num_cells + num_faces) * 3 * sizeof(double);
+
+        /************************construct mesh variables****************************/
+        /**
+         * 1. h_csr_row_index & h_csr_diag_index
+        */
+        std::vector<int> h_mtxEntry_perRow_vec(num_cells);
+        std::vector<int> h_csr_diag_index_vec(num_cells);
+        std::vector<int> h_csr_row_index_vec(num_cells + 1, 0);
+
+        for (int faceI = 0; faceI < num_surfaces; faceI++)
+        {
+            h_csr_diag_index_vec[neighbour[faceI]]++;
+            h_mtxEntry_perRow_vec[neighbour[faceI]]++;
+            h_mtxEntry_perRow_vec[owner[faceI]]++;
+        }
+
+        // - consider diagnal element in each row
+        std::transform(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_mtxEntry_perRow_vec.begin(), [](int n)
+            {return n + 1;});
+        // - construct h_csr_row_index & h_csr_diag_index
+        std::partial_sum(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_csr_row_index_vec.begin()+1);
+        // - assign h_csr_row_index & h_csr_diag_index
+        h_A_csr_row_index = h_csr_row_index_vec.data();
+        h_A_csr_diag_index = h_csr_diag_index_vec.data();
+
+        /**
+         * 2. h_csr_col_index
+        */
+        std::vector<int> rowIndex(num_faces + num_cells), colIndex(num_faces + num_cells), diagIndex(num_cells);
+        std::iota(diagIndex.begin(), diagIndex.end(), 0);
+
+        // initialize the RowIndex (rowIndex of lower + upper + diagnal)
+        std::copy(neighbour, neighbour + num_surfaces, rowIndex.begin());
+        std::copy(owner, owner + num_surfaces, rowIndex.begin() + num_surfaces);
+        std::copy(diagIndex.begin(), diagIndex.end(), rowIndex.begin() + num_faces);
+        // initialize the ColIndex (colIndex of lower + upper + diagnal)
+        std::copy(owner, owner + num_surfaces, colIndex.begin());
+        std::copy(neighbour, neighbour + num_surfaces, colIndex.begin() + num_surfaces);
+        std::copy(diagIndex.begin(), diagIndex.end(), colIndex.begin() + num_faces);
+
+        // - construct hashTable for sorting
+        std::multimap<int,int> rowColPair;
+        for (int i = 0; i < 2*num_surfaces+num_cells; i++)
+        {
+            rowColPair.insert(std::make_pair(rowIndex[i], colIndex[i]));
+        }
+        // - sort
+        std::vector<std::pair<int, int>> globalPerm(rowColPair.begin(), rowColPair.end());
+        std::sort(globalPerm.begin(), globalPerm.end(), []
+        (const std::pair<int, int>& pair1, const std::pair<int, int>& pair2){
+        if (pair1.first != pair2.first) {
+            return pair1.first < pair2.first;
+        } else {
+            return pair1.second < pair2.second;
+        }
+        });
+
+        std::vector<int> h_csr_col_index_vec;
+        std::transform(globalPerm.begin(), globalPerm.end(), std::back_inserter(h_csr_col_index_vec), []
+            (const std::pair<int, int>& pair) {
+            return pair.second;
+        });
+        h_A_csr_col_index = h_csr_col_index_vec.data();
+        
+        // construct a tmp permutated List for add fvMatrix
+        std::vector<int> tmp_permutation(2*num_surfaces + num_cells);
+        std::vector<int> tmp_rowIndex(2*num_surfaces + num_cells);
+        std::iota(tmp_permutation.begin(), tmp_permutation.end(), 0);
+        std::copy(neighbour, neighbour + num_surfaces, tmp_rowIndex.begin());
+        std::copy(diagIndex.begin(), diagIndex.end(), tmp_rowIndex.begin() + num_surfaces);
+        std::copy(owner, owner + num_surfaces, tmp_rowIndex.begin() + num_surfaces + num_cells);
+        std::multimap<int,int> tmpPair;
+        for (int i = 0; i < 2*num_surfaces+num_cells; i++)
+        {
+            tmpPair.insert(std::make_pair(tmp_rowIndex[i], tmp_permutation[i]));
+        }
+        std::vector<std::pair<int, int>> tmpPerm(tmpPair.begin(), tmpPair.end());
+        std::sort(tmpPerm.begin(), tmpPerm.end(), []
+        (const std::pair<int, int>& pair1, const std::pair<int, int>& pair2){
+        if (pair1.first != pair2.first) {
+            return pair1.first < pair2.first;
+        } else {
+            return pair1.second < pair2.second;
+        }
+        });
+        std::transform(tmpPerm.begin(), tmpPerm.end(), std::back_inserter(tmpPermutatedList), []
+            (const std::pair<int, int>& pair) {
+            return pair.second;
+        });
+
+        /**
+         * 3. boundary imformations
+        */
+        // get boundPermutation and offset lists
+        std::vector<int> boundPermutationListInit(num_boundary_faces);
+        std::vector<int> boundOffsetList;
+        std::iota(boundPermutationListInit.begin(), boundPermutationListInit.end(), 0);
+
+        // - construct hashTable for sorting
+        std::multimap<int,int> boundPermutation;
+        for (int i = 0; i < num_boundary_faces; i++)
+        {
+            boundPermutation.insert(std::make_pair(boundary_cell_id_init[i], boundPermutationListInit[i]));
+        }
+
+        // - sort 
+        std::vector<std::pair<int, int>> boundPermPair(boundPermutation.begin(), boundPermutation.end());
+        std::sort(boundPermPair.begin(), boundPermPair.end(), []
+        (const std::pair<int, int>& pair1, const std::pair<int, int>& pair2){
+            if (pair1.first != pair2.first) {
+                return pair1.first < pair2.first;
+            } else {
+                return pair1.second < pair2.second;
+            }
+        });
+
+        // - construct boundPermedIndex and boundary_cell_id
+        std::vector<int> boundary_cell_id;
+        boundPermutationList.clear();
+        std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundary_cell_id), []
+            (const std::pair<int, int>& pair) {
+            return pair.first;
+        });
+        std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundPermutationList), []
+            (const std::pair<int, int>& pair) {
+            return pair.second;
+        });
+
+        // construct boundary_cell_offset
+        std::map<int, int> countMap;
+        std::vector<int> boundaryCellcount;
+        for (const auto& cellIndex : boundary_cell_id)
+            ++ countMap[cellIndex];
+        for (const auto& [cellIndex, count] : countMap)
+            boundaryCellcount.push_back(count);
+
+        num_boundary_cells = boundaryCellcount.size();
+        num_boundary_cells_output = num_boundary_cells;
+
+        std::vector<int> boundary_cell_offset(boundaryCellcount.size() + 1, 0);
+        std::partial_sum(boundaryCellcount.begin(), boundaryCellcount.end(), boundary_cell_offset.begin()+1);
+        
+        // assign h_boundary_cell_offset & h_boundary_cell_id
+        h_boundary_cell_offset = boundary_cell_offset.data();
+        h_boundary_cell_id = boundary_cell_id.data();
+
+        // 
+        boundary_cell_bytes = num_boundary_cells * sizeof(double);
+        boundary_cell_vec_bytes = num_boundary_cells * 3 * sizeof(double);
+        boundary_cell_index_bytes = num_boundary_cells * sizeof(int);
+
+        boundary_face_bytes = num_boundary_faces * sizeof(double);
+        boundary_face_vec_bytes = num_boundary_faces * 3 * sizeof(double);
+        boundary_face_index_bytes = num_boundary_faces * sizeof(int);
+
+        ueqn_internalCoeffs.resize(3*num_boundary_faces);
+        ueqn_boundaryCoeffs.resize(3*num_boundary_faces);
+
+        boundary_face_vector.resize(3*num_boundary_faces);
+        boundary_pressure.resize(num_boundary_faces);
+        boundary_face.resize(num_boundary_faces);
+        boundary_deltaCoeffs.resize(num_boundary_faces);
+
+        /**
+         * 4. permutation list for field variables
+        */
+        std::vector<int> offdiagRowIndex(2*num_surfaces), permIndex(2*num_surfaces);
+        // - initialize the offdiagRowIndex (rowIndex of lower + rowIndex of upper)
+        std::copy(neighbour, neighbour + num_surfaces, offdiagRowIndex.begin());
+        std::copy(owner, owner + num_surfaces, offdiagRowIndex.begin() + num_surfaces);
+
+        // - initialize the permIndex (0, 1, ..., 2*num_surfaces)
+        std::iota(permIndex.begin(), permIndex.end(), 0);
+
+        // - construct hashTable for sorting
+        std::multimap<int,int> permutation;
+        for (int i = 0; i < 2*num_surfaces; i++)
+        {
+            permutation.insert(std::make_pair(offdiagRowIndex[i], permIndex[i]));
+        }
+        // - sort 
+        std::vector<std::pair<int, int>> permPair(permutation.begin(), permutation.end());
+        std::sort(permPair.begin(), permPair.end(), []
+        (const std::pair<int, int>& pair1, const std::pair<int, int>& pair2){
+            if (pair1.first != pair2.first) {
+                return pair1.first < pair2.first;
+            } else {
+                return pair1.second < pair2.second;
+            }
+        });
+        // - form permedIndex list
+        std::transform(permPair.begin(), permPair.end(), std::back_inserter(permedIndex), []
+            (const std::pair<int, int>& pair) {
+            return pair.second;
+        });
+
+        // copy and permutate cell variables
+        std::copy(weight, weight + num_surfaces, h_weight_vec_init.begin());
+        std::copy(weight, weight + num_surfaces, h_weight_vec_init.begin() + num_surfaces);
+        std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin());
+        std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin() + 3*num_surfaces);
+        std::copy(face, face + num_surfaces, h_face_vec_init.begin());
+        std::copy(face, face + num_surfaces, h_face_vec_init.begin() + num_surfaces);
+        std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin());
+        std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin() + num_surfaces);
+        for (int i = 0; i < num_faces; i++)
+        {
+            h_weight_vec[i] = h_weight_vec_init[permedIndex[i]];
+            h_face_vec[i] = h_face_vec_init[permedIndex[i]];
+            h_deltaCoeffs_vec[i] = h_deltaCoeffs_vec_init[permedIndex[i]];
+            h_face_vector_vec[i*3] = h_face_vector_vec_init[3*permedIndex[i]];
+            h_face_vector_vec[i*3+1] = h_face_vector_vec_init[3*permedIndex[i]+1];
+            h_face_vector_vec[i*3+2] = h_face_vector_vec_init[3*permedIndex[i]+2];
+        }
+        h_weight = h_weight_vec.data();
+        h_face_vector = h_face_vector_vec.data();
+        h_face = h_face_vec.data();
+        h_deltaCoeffs = h_deltaCoeffs_vec.data();
+
+        for (int i = 0; i < num_boundary_faces; i++)
+        {
+            boundary_face_vector[3*i] = boundary_face_vector_init[3*boundPermutationList[i]];
+            boundary_face_vector[3*i+1] = boundary_face_vector_init[3*boundPermutationList[i]+1];
+            boundary_face_vector[3*i+2] = boundary_face_vector_init[3*boundPermutationList[i]+2];
+            boundary_face[i] = boundary_face_init[boundPermutationList[i]];
+            boundary_deltaCoeffs[i] = boundary_deltaCoeffs_init[boundPermutationList[i]];
+        }
+        h_boundary_face_vector = boundary_face_vector.data();
+        h_boundary_face = boundary_face.data();
+        h_boundary_deltaCoeffs = boundary_deltaCoeffs.data();
+
+        /************************allocate memory on device****************************/
+        int total_bytes = 0;
+
+        checkCudaErrors(cudaMalloc((void**)&d_A_csr_row_index, csr_row_index_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_A_csr_col_index, csr_col_index_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_A_csr_diag_index, cell_index_bytes));
+        total_bytes += (csr_row_index_bytes + csr_col_index_bytes + cell_index_bytes);
+
+        checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_rho_new, cell_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_volume, cell_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_pressure, cell_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_velocity_old, cell_vec_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_weight, face_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_face, face_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_deltaCoeffs, face_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_phi, face_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_phi_init, face_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_face_vector, face_vec_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_nuEff, cell_bytes));
+        total_bytes += (cell_bytes * 5 + face_bytes * 5 + cell_vec_bytes + face_vec_bytes);
+
+        checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int)));
+        checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_id, boundary_face_index_bytes));
+        total_bytes += (boundary_face_index_bytes + (num_boundary_cells+1) * sizeof(int));
+
+        checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure_init, boundary_face_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure, boundary_face_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity_init, boundary_face_vec_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity, boundary_face_vec_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_boundary_face_vector, boundary_face_vec_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_boundary_face, boundary_face_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_boundary_deltaCoeffs, boundary_face_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_init, boundary_face_vec_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_init, boundary_face_vec_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_init, boundary_face_vec_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_init, boundary_face_vec_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, boundary_face_vec_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, boundary_face_vec_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs, boundary_face_vec_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs, boundary_face_vec_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff, boundary_face_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff_init, boundary_face_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_boundary_rho, boundary_face_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_init, boundary_face_bytes));
+        total_bytes += (boundary_face_bytes*8 + boundary_face_vec_bytes * 11);
+
+        // checkCudaErrors(cudaMalloc((void**)&d_A_csr, csr_value_vec_bytes));
+        // checkCudaErrors(cudaMalloc((void**)&d_b, cell_vec_bytes));
+        // checkCudaErrors(cudaMalloc((void**)&d_psi, cell_vec_bytes));
+        total_bytes += (boundary_face_bytes + boundary_face_vec_bytes * 3);
+
+        checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A, csr_value_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A_init, csr_value_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_turbSrc_b, cell_vec_bytes));
+        total_bytes += (2*csr_value_bytes + cell_vec_bytes);
+
+        checkCudaErrors(cudaMalloc((void**)&d_permedIndex, face_index_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_bouPermedIndex, boundary_face_index_bytes));
+        checkCudaErrors(cudaMalloc((void**)&d_tmpPermutatedList, csr_col_index_bytes));
+        total_bytes += (face_index_bytes + boundary_face_index_bytes + csr_col_index_bytes);
+
+        checkCudaErrors(cudaMalloc((void**)&d_grad, num_cells * 9 * sizeof(double)));
+        checkCudaErrors(cudaMalloc((void**)&d_grad_boundary, boundary_face_bytes * 9));
+        checkCudaErrors(cudaMalloc((void**)&d_grad_boundary_init, boundary_cell_bytes * 9));
+        total_bytes += (num_cells * 9 * sizeof(double) + boundary_face_bytes * 9 + boundary_cell_bytes * 9); // FIXME: rename
+
+
+        fprintf(stderr, "Total bytes malloc on GPU: %.2fMB\n", total_bytes * 1.0 / 1024 / 1024);
+
+        checkCudaErrors(cudaStreamCreate(&stream));
+
+        checkCudaErrors(cudaMemcpyAsync(d_A_csr_row_index, h_A_csr_row_index, csr_row_index_bytes, cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(d_A_csr_col_index, h_A_csr_col_index, csr_col_index_bytes, cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(d_A_csr_diag_index, h_A_csr_diag_index, cell_index_bytes, cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_offset, h_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int), cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_id, h_boundary_cell_id, boundary_face_index_bytes, cudaMemcpyHostToDevice, stream));
+
+        checkCudaErrors(cudaMemcpyAsync(d_volume, h_volume, cell_bytes, cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(d_weight, h_weight, face_bytes, cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(d_face, h_face, face_bytes, cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(d_deltaCoeffs, h_deltaCoeffs, face_bytes, cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(d_face_vector, h_face_vector, face_vec_bytes, cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(d_boundary_face_vector, h_boundary_face_vector, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(d_boundary_face, h_boundary_face, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(d_boundary_deltaCoeffs, h_boundary_deltaCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+
+        checkCudaErrors(cudaMemcpyAsync(d_permedIndex, permedIndex.data(), face_index_bytes, cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(d_bouPermedIndex, boundPermutationList.data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream));
+        checkCudaErrors(cudaMemcpyAsync(d_tmpPermutatedList, tmpPermutatedList.data(), csr_col_index_bytes, cudaMemcpyHostToDevice, stream));
+    };
+    ~dfMatrixDataBase(){
+        std::cout << "Destructor called." << std::endl;
+        // TODO: free pointers
+        
+    };
+};
+
diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H
new file mode 100644
index 000000000..c7d2c7914
--- /dev/null
+++ b/src_gpu/dfUEqn.H
@@ -0,0 +1,53 @@
+#pragma once
+
+#include "AmgXSolver.H"
+#include <amgx_c.h>
+#include "dfMatrixDataBase.H"
+
+class dfUEqn
+{
+private:
+    dfMatrixDataBase& dataBase_;
+    cudaStream_t stream;
+    AmgXSolver* UxSolver, *UySolver, *UzSolver = nullptr;
+    int num_iteration;
+    double time_monitor_CPU;
+    double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test;
+
+    // common variables
+    int num_cells, cell_bytes, num_faces, num_surfaces, cell_vec_bytes, csr_value_vec_bytes, num_boundary_cells;
+    int *d_A_csr_row_index, *d_A_csr_diag_index, *d_A_csr_col_index; 
+
+    // Matrix variables
+    double *d_A_csr, *d_b, *d_psi = nullptr;
+    double *h_A_csr, *h_b, *h_psi = nullptr;
+
+public:
+    dfUEqn();
+    dfUEqn(dfMatrixDataBase& dataBase, const std::string &modeStr, const std::string &cfgFile);
+    ~dfUEqn();
+
+    void checkValue(bool print);
+
+    void fvm_ddt(double *rho_old, double *rho_new, double* vector_old);
+
+    void fvm_div(double* phi, double* ueqn_internalCoeffs_init, double* ueqn_boundaryCoeffs_init,
+    double* ueqn_laplac_internalCoeffs_init, double* ueqn_laplac_boundaryCoeffs_init,
+    double* boundary_pressure_init, double* boundary_velocity_init,
+    double* boundary_nuEff_init, double* boundary_rho_init);
+
+    void fvc_grad(double* pressure);
+
+    void fvc_grad_vector();
+
+    void dev2T();
+
+    void fvc_div_tensor(const double *nuEff);
+
+    void fvm_laplacian();
+
+    void add_fvMatrix(double* turbSrc_low, double* turbSrc_diag, double* turbSrc_upp, double* turbSrc_source);
+
+    void solve();
+    void updatePsi(double* Psi);
+};
\ No newline at end of file
diff --git a/src_gpu/dfMatrix.cu b/src_gpu/dfUEqn.cu
similarity index 57%
rename from src_gpu/dfMatrix.cu
rename to src_gpu/dfUEqn.cu
index c06b09780..7a2297b4e 100644
--- a/src_gpu/dfMatrix.cu
+++ b/src_gpu/dfUEqn.cu
@@ -1,14 +1,4 @@
-#include "dfMatrix.H"
-
-void checkVectorEqual(int count, const double* basevec, double* vec, double max_relative_error) {
-    for (size_t i = 0; i < count; ++i)
-    {
-        double abs_diff = fabs(basevec[i] - vec[i]);
-        double rel_diff = fabs(basevec[i] - vec[i]) / fabs(basevec[i]);
-        if (abs_diff > 1e-12 && rel_diff > max_relative_error && !std::isinf(rel_diff))
-            fprintf(stderr, "mismatch index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff);
-    }
-}
+#include "dfUEqn.H"
 
 // kernel functions
 __global__ void fvm_ddt_kernel(int num_cells, int num_faces, const double rdelta_t,
@@ -182,9 +172,9 @@ __global__ void fvc_grad_internal_face(int num_cells,
         grad_bz_upp += face_p * sfz;
       }
     }
-    b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + grad_bx_low + grad_bx_upp;
-    b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + grad_by_low + grad_by_upp;
-    b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + grad_bz_low + grad_bz_upp;
+    b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] - grad_bx_low - grad_bx_upp;
+    b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] - grad_by_low - grad_by_upp;
+    b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] - grad_bz_low - grad_bz_upp;
 }
 
 __global__ void fvc_grad_boundary_face(int num_cells, int num_boundary_cells,
@@ -222,9 +212,9 @@ __global__ void fvc_grad_boundary_face(int num_cells, int num_boundary_cells,
     //grad_by += ny * grad_correction; 
     //grad_bz += nz * grad_correction; 
 
-    b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] + grad_bx;
-    b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] + grad_by;
-    b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + grad_bz;
+    b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] - grad_bx;
+    b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] - grad_by;
+    b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] - grad_bz;
 }
 
 __global__ void add_fvMatrix_kernel(int num_cells, int num_faces,
@@ -818,409 +808,47 @@ __global__ void fvm_laplacian_uncorrected_vector_boundary(int num_cells, int num
     b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + boundary_coeffs_z * sign;
 }
 
-// constructor (construct mesh variable)
-dfMatrix::dfMatrix(){}
-dfMatrix::dfMatrix(int num_surfaces, int num_cells, int num_boundary_faces, int & num_boundary_cells_output,
-    const int *neighbour, const int *owner, const double* volume, const double* weight, const double* face_vector, const double* face, const double* deltaCoeffs,
-    std::vector<double> boundary_face_vector_init, std::vector<double> boundary_face_init, std::vector<double> boundary_deltaCoeffs_init,
-    std::vector<int> boundary_cell_id_init, const std::string &modeStr, const std::string &cfgFile)
-: num_cells(num_cells), num_faces(num_surfaces*2), num_surfaces(num_surfaces), num_iteration(0),
-  num_boundary_faces(num_boundary_faces), h_volume(volume), time_monitor_CPU(0.), time_monitor_GPU_kernel(0.), time_monitor_GPU_memcpy(0.),
-  time_monitor_GPU_memcpy_test(0.)
-{   
-    // 
-    unsigned int flags = 0;
-    checkCudaErrors(cudaGetDeviceFlags(&flags));
-    flags |= cudaDeviceScheduleYield;
-    checkCudaErrors(cudaSetDeviceFlags(flags));
-    
-    // initialize vectors
-    // allocate field vector in pin memory
-    cudaMallocHost(&h_phi_init, num_faces * sizeof(double));
-    // h_phi_vec_init.assign(h_phi_init, h_phi_init + num_faces);
-
-    h_weight_vec_init.resize(num_faces);
-    h_weight_vec.resize(num_faces);
-    h_face_vector_vec_init.resize(num_faces*3);
-    h_face_vector_vec.resize(num_faces*3);
-    h_face_vec_init.resize(num_faces);
-    h_face_vec.resize(num_faces);
-    h_deltaCoeffs_vec_init.resize(num_faces);
-    h_deltaCoeffs_vec.resize(num_faces);
-    h_turbSrc_init_mtx_vec.resize(num_faces + num_cells);
-    h_turbSrc_init_1mtx.resize(num_faces + num_cells);
-    h_turbSrc_init_src_vec.resize(3*num_cells);
-    h_turbSrc_src_vec.resize(3*num_cells);
-
-    // byte sizes
-    cell_bytes = num_cells * sizeof(double);
-    cell_vec_bytes = num_cells * 3 * sizeof(double);
-    cell_index_bytes = num_cells * sizeof(int);
-
-    face_bytes = num_faces * sizeof(double);
-    face_vec_bytes = num_faces * 3 * sizeof(double);
-    face_index_bytes = num_faces * sizeof(int);
+// constructor
+dfUEqn::dfUEqn(dfMatrixDataBase& dataBase, const std::string &modeStr, const std::string &cfgFile)
+: dataBase_(dataBase)
+{
+    UxSolver = new AmgXSolver(modeStr, cfgFile);
+    UySolver = new AmgXSolver(modeStr, cfgFile);
+    UzSolver = new AmgXSolver(modeStr, cfgFile);
 
-    // A_csr has one more element in each row: itself
-    csr_row_index_bytes = (num_cells + 1) * sizeof(int);
-    csr_col_index_bytes = (num_cells + num_faces) * sizeof(int);
-    csr_value_bytes = (num_cells + num_faces) * sizeof(double);
-    csr_value_vec_bytes = (num_cells + num_faces) * 3 * sizeof(double);
+    num_cells = dataBase_.num_cells;
+    cell_bytes = dataBase_.cell_bytes;
+    num_faces = dataBase_.num_faces;
+    cell_vec_bytes = dataBase_.cell_vec_bytes;
+    csr_value_vec_bytes = dataBase_.csr_value_vec_bytes;
+    num_boundary_cells = dataBase_.num_boundary_cells;
+    num_surfaces = dataBase_.num_surfaces;
+
+    d_A_csr_row_index = dataBase_.d_A_csr_row_index;
+    d_A_csr_diag_index = dataBase_.d_A_csr_diag_index;
+    d_A_csr_col_index = dataBase_.d_A_csr_col_index;
 
     h_A_csr = new double[(num_cells + num_faces) * 3];
-    // h_b = new double[num_cells * 3];
-    // h_psi = new double[num_cells * 3];
-    cudaMallocHost(&h_b, cell_vec_bytes);
+    h_b = new double[num_cells * 3];
     cudaMallocHost(&h_psi, cell_vec_bytes);
 
-    /************************construct mesh variables****************************/
-    /**
-     * 1. h_csr_row_index & h_csr_diag_index
-    */
-    std::vector<int> h_mtxEntry_perRow_vec(num_cells);
-    std::vector<int> h_csr_diag_index_vec(num_cells);
-    std::vector<int> h_csr_row_index_vec(num_cells + 1, 0);
-
-    for (int faceI = 0; faceI < num_surfaces; faceI++)
-    {
-        h_csr_diag_index_vec[neighbour[faceI]]++;
-        h_mtxEntry_perRow_vec[neighbour[faceI]]++;
-        h_mtxEntry_perRow_vec[owner[faceI]]++;
-    }
-
-    // - consider diagnal element in each row
-    std::transform(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_mtxEntry_perRow_vec.begin(), [](int n)
-        {return n + 1;});
-    // - construct h_csr_row_index & h_csr_diag_index
-    std::partial_sum(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_csr_row_index_vec.begin()+1);
-    // - assign h_csr_row_index & h_csr_diag_index
-    h_A_csr_row_index = h_csr_row_index_vec.data();
-    h_A_csr_diag_index = h_csr_diag_index_vec.data();
-
-    /**
-     * 2. h_csr_col_index
-    */
-    std::vector<int> rowIndex(num_faces + num_cells), colIndex(num_faces + num_cells), diagIndex(num_cells);
-    std::iota(diagIndex.begin(), diagIndex.end(), 0);
-
-    // initialize the RowIndex (rowIndex of lower + upper + diagnal)
-    std::copy(neighbour, neighbour + num_surfaces, rowIndex.begin());
-    std::copy(owner, owner + num_surfaces, rowIndex.begin() + num_surfaces);
-    std::copy(diagIndex.begin(), diagIndex.end(), rowIndex.begin() + num_faces);
-    // initialize the ColIndex (colIndex of lower + upper + diagnal)
-    std::copy(owner, owner + num_surfaces, colIndex.begin());
-    std::copy(neighbour, neighbour + num_surfaces, colIndex.begin() + num_surfaces);
-    std::copy(diagIndex.begin(), diagIndex.end(), colIndex.begin() + num_faces);
-
-    // - construct hashTable for sorting
-    std::multimap<int,int> rowColPair;
-    for (int i = 0; i < 2*num_surfaces+num_cells; i++)
-    {
-        rowColPair.insert(std::make_pair(rowIndex[i], colIndex[i]));
-    }
-    // - sort
-    std::vector<std::pair<int, int>> globalPerm(rowColPair.begin(), rowColPair.end());
-    std::sort(globalPerm.begin(), globalPerm.end(), []
-    (const std::pair<int, int>& pair1, const std::pair<int, int>& pair2){
-    if (pair1.first != pair2.first) {
-        return pair1.first < pair2.first;
-    } else {
-        return pair1.second < pair2.second;
-    }
-    });
-
-    std::vector<int> h_csr_col_index_vec;
-    std::transform(globalPerm.begin(), globalPerm.end(), std::back_inserter(h_csr_col_index_vec), []
-        (const std::pair<int, int>& pair) {
-        return pair.second;
-    });
-    h_A_csr_col_index = h_csr_col_index_vec.data();
-    
-    // construct a tmp permutated List for add fvMatrix
-    std::vector<int> tmp_permutation(2*num_surfaces + num_cells);
-    std::vector<int> tmp_rowIndex(2*num_surfaces + num_cells);
-    std::iota(tmp_permutation.begin(), tmp_permutation.end(), 0);
-    std::copy(neighbour, neighbour + num_surfaces, tmp_rowIndex.begin());
-    std::copy(diagIndex.begin(), diagIndex.end(), tmp_rowIndex.begin() + num_surfaces);
-    std::copy(owner, owner + num_surfaces, tmp_rowIndex.begin() + num_surfaces + num_cells);
-    std::multimap<int,int> tmpPair;
-    for (int i = 0; i < 2*num_surfaces+num_cells; i++)
-    {
-        tmpPair.insert(std::make_pair(tmp_rowIndex[i], tmp_permutation[i]));
-    }
-    std::vector<std::pair<int, int>> tmpPerm(tmpPair.begin(), tmpPair.end());
-    std::sort(tmpPerm.begin(), tmpPerm.end(), []
-    (const std::pair<int, int>& pair1, const std::pair<int, int>& pair2){
-    if (pair1.first != pair2.first) {
-        return pair1.first < pair2.first;
-    } else {
-        return pair1.second < pair2.second;
-    }
-    });
-    std::transform(tmpPerm.begin(), tmpPerm.end(), std::back_inserter(tmpPermutatedList), []
-        (const std::pair<int, int>& pair) {
-        return pair.second;
-    });
-    
-
-    /**
-     * 3. boundary imformations
-    */
-    // get boundPermutation and offset lists
-    std::vector<int> boundPermutationListInit(num_boundary_faces);
-    std::vector<int> boundOffsetList;
-    std::iota(boundPermutationListInit.begin(), boundPermutationListInit.end(), 0);
-
-    // - construct hashTable for sorting
-    std::multimap<int,int> boundPermutation;
-    for (int i = 0; i < num_boundary_faces; i++)
-    {
-        boundPermutation.insert(std::make_pair(boundary_cell_id_init[i], boundPermutationListInit[i]));
-    }
-
-    // - sort 
-    std::vector<std::pair<int, int>> boundPermPair(boundPermutation.begin(), boundPermutation.end());
-    std::sort(boundPermPair.begin(), boundPermPair.end(), []
-    (const std::pair<int, int>& pair1, const std::pair<int, int>& pair2){
-        if (pair1.first != pair2.first) {
-            return pair1.first < pair2.first;
-        } else {
-            return pair1.second < pair2.second;
-        }
-    });
-
-    // - construct boundPermedIndex and boundary_cell_id
-    std::vector<int> boundary_cell_id;
-    boundPermutationList.clear();
-    std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundary_cell_id), []
-        (const std::pair<int, int>& pair) {
-        return pair.first;
-    });
-    std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundPermutationList), []
-        (const std::pair<int, int>& pair) {
-        return pair.second;
-    });
-
-    // construct boundary_cell_offset
-    std::map<int, int> countMap;
-    std::vector<int> boundaryCellcount;
-    for (const auto& cellIndex : boundary_cell_id)
-        ++ countMap[cellIndex];
-    for (const auto& [cellIndex, count] : countMap)
-        boundaryCellcount.push_back(count);
-
-    num_boundary_cells = boundaryCellcount.size();
-    num_boundary_cells_output = num_boundary_cells;
-
-    std::vector<int> boundary_cell_offset(boundaryCellcount.size() + 1, 0);
-    std::partial_sum(boundaryCellcount.begin(), boundaryCellcount.end(), boundary_cell_offset.begin()+1);
-    
-    // assign h_boundary_cell_offset & h_boundary_cell_id
-    h_boundary_cell_offset = boundary_cell_offset.data();
-    h_boundary_cell_id = boundary_cell_id.data();
-
-    // 
-    boundary_cell_bytes = num_boundary_cells * sizeof(double);
-    boundary_cell_vec_bytes = num_boundary_cells * 3 * sizeof(double);
-    boundary_cell_index_bytes = num_boundary_cells * sizeof(int);
-
-    boundary_face_bytes = num_boundary_faces * sizeof(double);
-    boundary_face_vec_bytes = num_boundary_faces * 3 * sizeof(double);
-    boundary_face_index_bytes = num_boundary_faces * sizeof(int);
-
-    ueqn_internalCoeffs.resize(3*num_boundary_faces);
-    ueqn_boundaryCoeffs.resize(3*num_boundary_faces);
-
-    boundary_face_vector.resize(3*num_boundary_faces);
-    boundary_pressure.resize(num_boundary_faces);
-    boundary_face.resize(num_boundary_faces);
-    boundary_deltaCoeffs.resize(num_boundary_faces);
-
-
-    /**
-     * 4. permutation list for field variables
-    */
-    std::vector<int> offdiagRowIndex(2*num_surfaces), permIndex(2*num_surfaces);
-    // - initialize the offdiagRowIndex (rowIndex of lower + rowIndex of upper)
-    std::copy(neighbour, neighbour + num_surfaces, offdiagRowIndex.begin());
-    std::copy(owner, owner + num_surfaces, offdiagRowIndex.begin() + num_surfaces);
-
-    // - initialize the permIndex (0, 1, ..., 2*num_surfaces)
-    std::iota(permIndex.begin(), permIndex.end(), 0);
-
-    // - construct hashTable for sorting
-    std::multimap<int,int> permutation;
-    for (int i = 0; i < 2*num_surfaces; i++)
-    {
-        permutation.insert(std::make_pair(offdiagRowIndex[i], permIndex[i]));
-    }
-    // - sort 
-    std::vector<std::pair<int, int>> permPair(permutation.begin(), permutation.end());
-    std::sort(permPair.begin(), permPair.end(), []
-    (const std::pair<int, int>& pair1, const std::pair<int, int>& pair2){
-        if (pair1.first != pair2.first) {
-            return pair1.first < pair2.first;
-        } else {
-            return pair1.second < pair2.second;
-        }
-    });
-    // - form permedIndex list
-    std::transform(permPair.begin(), permPair.end(), std::back_inserter(permedIndex), []
-        (const std::pair<int, int>& pair) {
-        return pair.second;
-    });
-
-    // copy and permutate cell variables
-    std::copy(weight, weight + num_surfaces, h_weight_vec_init.begin());
-    std::copy(weight, weight + num_surfaces, h_weight_vec_init.begin() + num_surfaces);
-    std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin());
-    std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin() + 3*num_surfaces);
-    std::copy(face, face + num_surfaces, h_face_vec_init.begin());
-    std::copy(face, face + num_surfaces, h_face_vec_init.begin() + num_surfaces);
-    std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin());
-    std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin() + num_surfaces);
-    for (int i = 0; i < num_faces; i++)
-    {
-        h_weight_vec[i] = h_weight_vec_init[permedIndex[i]];
-        h_face_vec[i] = h_face_vec_init[permedIndex[i]];
-        h_deltaCoeffs_vec[i] = h_deltaCoeffs_vec_init[permedIndex[i]];
-        h_face_vector_vec[i*3] = h_face_vector_vec_init[3*permedIndex[i]];
-        h_face_vector_vec[i*3+1] = h_face_vector_vec_init[3*permedIndex[i]+1];
-        h_face_vector_vec[i*3+2] = h_face_vector_vec_init[3*permedIndex[i]+2];
-    }
-    h_weight = h_weight_vec.data();
-    h_face_vector = h_face_vector_vec.data();
-    h_face = h_face_vec.data();
-    h_deltaCoeffs = h_deltaCoeffs_vec.data();
-
-    for (int i = 0; i < num_boundary_faces; i++)
-    {
-        boundary_face_vector[3*i] = boundary_face_vector_init[3*boundPermutationList[i]];
-        boundary_face_vector[3*i+1] = boundary_face_vector_init[3*boundPermutationList[i]+1];
-        boundary_face_vector[3*i+2] = boundary_face_vector_init[3*boundPermutationList[i]+2];
-        boundary_face[i] = boundary_face_init[boundPermutationList[i]];
-        boundary_deltaCoeffs[i] = boundary_deltaCoeffs_init[boundPermutationList[i]];
-    }
-    h_boundary_face_vector = boundary_face_vector.data();
-    h_boundary_face = boundary_face.data();
-    h_boundary_deltaCoeffs = boundary_deltaCoeffs.data();
-
-    /************************allocate memory on device****************************/
-    int total_bytes = 0;
-
-    checkCudaErrors(cudaMalloc((void**)&d_A_csr_row_index, csr_row_index_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_A_csr_col_index, csr_col_index_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_A_csr_diag_index, cell_index_bytes));
-    total_bytes += (csr_row_index_bytes + csr_col_index_bytes + cell_index_bytes);
-
-    checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_rho_new, cell_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_volume, cell_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_pressure, cell_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_velocity_old, cell_vec_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_weight, face_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_face, face_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_deltaCoeffs, face_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_phi, face_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_phi_init, face_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_face_vector, face_vec_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_nuEff, cell_bytes));
-    total_bytes += (cell_bytes * 5 + face_bytes * 5 + cell_vec_bytes + face_vec_bytes);
-
-    checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int)));
-    checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_id, boundary_face_index_bytes));
-    total_bytes += (boundary_face_index_bytes + (num_boundary_cells+1) * sizeof(int));
-
-    checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure_init, boundary_face_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure, boundary_face_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity_init, boundary_face_vec_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity, boundary_face_vec_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_boundary_face_vector, boundary_face_vec_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_boundary_face, boundary_face_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_boundary_deltaCoeffs, boundary_face_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_init, boundary_face_vec_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_init, boundary_face_vec_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_init, boundary_face_vec_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_init, boundary_face_vec_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, boundary_face_vec_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, boundary_face_vec_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs, boundary_face_vec_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs, boundary_face_vec_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff, boundary_face_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff_init, boundary_face_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_boundary_rho, boundary_face_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_init, boundary_face_bytes));
-    total_bytes += (boundary_face_bytes*8 + boundary_face_vec_bytes * 11);
-
     checkCudaErrors(cudaMalloc((void**)&d_A_csr, csr_value_vec_bytes));
     checkCudaErrors(cudaMalloc((void**)&d_b, cell_vec_bytes));
     checkCudaErrors(cudaMalloc((void**)&d_psi, cell_vec_bytes));
-    total_bytes += (boundary_face_bytes + boundary_face_vec_bytes * 3);
-
-    checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A, csr_value_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A_init, csr_value_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_turbSrc_b, cell_vec_bytes));
-    total_bytes += (2*csr_value_bytes + cell_vec_bytes);
-
-    checkCudaErrors(cudaMalloc((void**)&d_permedIndex, face_index_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_bouPermedIndex, boundary_face_index_bytes));
-    checkCudaErrors(cudaMalloc((void**)&d_tmpPermutatedList, csr_col_index_bytes));
-    total_bytes += (face_index_bytes + boundary_face_index_bytes + csr_col_index_bytes);
-
-    checkCudaErrors(cudaMalloc((void**)&d_grad, num_cells * 9 * sizeof(double)));
-    checkCudaErrors(cudaMalloc((void**)&d_grad_boundary, boundary_face_bytes * 9));
-    checkCudaErrors(cudaMalloc((void**)&d_grad_boundary_init, boundary_cell_bytes * 9));
-    total_bytes += (num_cells * 9 * sizeof(double) + boundary_face_bytes * 9 + boundary_cell_bytes * 9); // FIXME: rename
-
-
-    fprintf(stderr, "Total bytes malloc on GPU: %.2fMB\n", total_bytes * 1.0 / 1024 / 1024);
 
     checkCudaErrors(cudaStreamCreate(&stream));
 
     checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, csr_value_vec_bytes, stream));
     checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream));
-
-    checkCudaErrors(cudaMemcpyAsync(d_A_csr_row_index, h_A_csr_row_index, csr_row_index_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_A_csr_col_index, h_A_csr_col_index, csr_col_index_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_A_csr_diag_index, h_A_csr_diag_index, cell_index_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_offset, h_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int), cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_id, h_boundary_cell_id, boundary_face_index_bytes, cudaMemcpyHostToDevice, stream));
-
-    checkCudaErrors(cudaMemcpyAsync(d_volume, h_volume, cell_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_weight, h_weight, face_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_face, h_face, face_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_deltaCoeffs, h_deltaCoeffs, face_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_face_vector, h_face_vector, face_vec_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_boundary_face_vector, h_boundary_face_vector, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_boundary_face, h_boundary_face, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_boundary_deltaCoeffs, h_boundary_deltaCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
-
-    checkCudaErrors(cudaMemcpyAsync(d_permedIndex, permedIndex.data(), face_index_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_bouPermedIndex, boundPermutationList.data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_tmpPermutatedList, tmpPermutatedList.data(), csr_col_index_bytes, cudaMemcpyHostToDevice, stream));
-
-    // construct AmgXSolver
-    UxSolver = new AmgXSolver(modeStr, cfgFile);
-    UySolver = new AmgXSolver(modeStr, cfgFile);
-    UzSolver = new AmgXSolver(modeStr, cfgFile);
 }
 
-dfMatrix::~dfMatrix()
-{
-}
-
-void dfMatrix::fvm_ddt(double *rho_old, double *rho_new, double* vector_old)
-{
-    // copy cell variables directly
-    h_rho_new = rho_new;
-    h_rho_old = rho_old;
-    h_velocity_old = vector_old;
-    
+void dfUEqn::fvm_ddt(double *rho_old, double *rho_new, double* vector_old)
+{  
     // Copy the host input array in host memory to the device input array in device memory
     clock_t start = std::clock();
-    checkCudaErrors(cudaMemcpyAsync(d_rho_old, h_rho_old, cell_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_rho_new, h_rho_new, cell_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_velocity_old, h_velocity_old, cell_vec_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_rho_old, rho_old, cell_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_rho_new, rho_new, cell_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_velocity_old, vector_old, cell_vec_bytes, cudaMemcpyHostToDevice, stream));
     clock_t end = std::clock();
     time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC);
 
@@ -1228,14 +856,14 @@ void dfMatrix::fvm_ddt(double *rho_old, double *rho_new, double* vector_old)
     start = std::clock();
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    fvm_ddt_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces, rdelta_t,
+    fvm_ddt_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces, dataBase_.rdelta_t,
           d_A_csr_row_index, d_A_csr_diag_index,
-          d_rho_old, d_rho_new, d_volume, d_velocity_old, d_A_csr, d_b, d_A_csr, d_b, d_psi);
+          dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, dataBase_.d_velocity_old, d_A_csr, d_b, d_A_csr, d_b, d_psi);
     end = std::clock();
     time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
 }
 
-void dfMatrix::fvm_div(double* phi, double* ueqn_internalCoeffs_init, double* ueqn_boundaryCoeffs_init, 
+void dfUEqn::fvm_div(double* phi, double* ueqn_internalCoeffs_init, double* ueqn_boundaryCoeffs_init, 
     double* ueqn_laplac_internalCoeffs_init, double* ueqn_laplac_boundaryCoeffs_init,
     double* boundary_pressure_init, double* boundary_velocity_init,
     double *boundary_nuEff_init, double *boundary_rho_init)
@@ -1244,70 +872,65 @@ void dfMatrix::fvm_div(double* phi, double* ueqn_internalCoeffs_init, double* ue
     clock_t start = std::clock();
     // std::copy(phi, phi + num_surfaces, h_phi_init);
     // std::copy(phi, phi + num_surfaces, h_phi_init + num_surfaces);
-    memcpy(h_phi_init, phi, num_surfaces*sizeof(double));
+    // memcpy(h_phi_init, phi, num_surfaces*sizeof(double));
     // memcpy(h_phi_init + num_surfaces, phi, num_surfaces*sizeof(double));
     
     clock_t end = std::clock();
     time_monitor_CPU += double(end - start) / double(CLOCKS_PER_SEC);
 
     start = std::clock();
-    checkCudaErrors(cudaMemcpyAsync(d_phi_init, h_phi_init, num_surfaces*sizeof(double), cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_phi_init + num_surfaces, d_phi_init, num_surfaces*sizeof(double), cudaMemcpyDeviceToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi_init, phi, num_surfaces*sizeof(double), cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi_init + num_surfaces, dataBase_.d_phi_init, num_surfaces*sizeof(double), cudaMemcpyDeviceToDevice, stream));
     end = std::clock();
     time_monitor_GPU_memcpy_test += double(end - start) / double(CLOCKS_PER_SEC);
 
     start = std::clock();
     size_t threads_per_block = 1024;
     size_t blocks_per_grid = (num_faces + threads_per_block - 1) / threads_per_block;
-    offdiagPermutation<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_faces, d_permedIndex, d_phi_init, d_phi);
+    offdiagPermutation<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_faces, dataBase_.d_permedIndex, dataBase_.d_phi_init, dataBase_.d_phi);
     end = std::clock();
     time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
 
     // copy and permutate boundary variable
     start = std::clock();
-    checkCudaErrors(cudaMemcpyAsync(d_internal_coeffs_init, ueqn_internalCoeffs_init, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_boundary_coeffs_init, ueqn_boundaryCoeffs_init, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_laplac_internal_coeffs_init, ueqn_laplac_internalCoeffs_init, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_laplac_boundary_coeffs_init, ueqn_laplac_boundaryCoeffs_init, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_boundary_velocity_init, boundary_velocity_init, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_boundary_pressure_init, boundary_pressure_init, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_boundary_nuEff_init, boundary_nuEff_init, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_boundary_rho_init, boundary_rho_init, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_internal_coeffs_init, ueqn_internalCoeffs_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_coeffs_init, ueqn_boundaryCoeffs_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_laplac_internal_coeffs_init, ueqn_laplac_internalCoeffs_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_laplac_boundary_coeffs_init, ueqn_laplac_boundaryCoeffs_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_velocity_init, boundary_velocity_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_pressure_init, boundary_pressure_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_nuEff_init, boundary_nuEff_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_rho_init, boundary_rho_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream));
     end = std::clock();
     time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC);
     
     start = std::clock();
-    blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block;
-    boundaryPermutation<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_faces, d_bouPermedIndex, d_internal_coeffs_init, d_boundary_coeffs_init,
-          d_laplac_internal_coeffs_init, d_laplac_boundary_coeffs_init, d_boundary_pressure_init, d_boundary_velocity_init, d_internal_coeffs, d_boundary_coeffs, 
-          d_laplac_internal_coeffs, d_laplac_boundary_coeffs, d_boundary_pressure, d_boundary_velocity, d_boundary_nuEff_init, d_boundary_nuEff, d_boundary_rho_init, d_boundary_rho);
+    blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block;
+    boundaryPermutation<<<blocks_per_grid, threads_per_block, 0, stream>>>(dataBase_.num_boundary_faces, dataBase_.d_bouPermedIndex, dataBase_.d_internal_coeffs_init, 
+          dataBase_.d_boundary_coeffs_init, dataBase_.d_laplac_internal_coeffs_init, dataBase_.d_laplac_boundary_coeffs_init, dataBase_.d_boundary_pressure_init, 
+          dataBase_.d_boundary_velocity_init, dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs, dataBase_.d_laplac_internal_coeffs, dataBase_.d_laplac_boundary_coeffs, 
+          dataBase_.d_boundary_pressure, dataBase_.d_boundary_velocity, dataBase_.d_boundary_nuEff_init, dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho_init, dataBase_.d_boundary_rho);
 
     // launch cuda kernel
     blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     fvm_div_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces,
           d_A_csr_row_index, d_A_csr_diag_index,
-          d_weight, d_phi, d_A_csr, d_b, d_A_csr, d_b);
+          dataBase_.d_weight, dataBase_.d_phi, d_A_csr, d_b, d_A_csr, d_b);
     blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
     fvm_div_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces, num_boundary_cells,
           d_A_csr_row_index, d_A_csr_diag_index,
-          d_boundary_cell_offset, d_boundary_cell_id,
-          d_internal_coeffs, d_boundary_coeffs, d_A_csr, d_b, d_A_csr, d_b);
+          dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+          dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs, d_A_csr, d_b, d_A_csr, d_b);
     end = std::clock();
     time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
 }
 
-void dfMatrix::fvc_grad(double* pressure)
+void dfUEqn::fvc_grad(double* pressure)
 {
-    // copy cell variables directly
+    // Copy the host input array in host memory to the device input array in device memory
     clock_t start = std::clock();
-    h_pressure = pressure;
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_pressure, pressure, cell_bytes, cudaMemcpyHostToDevice, stream));
     clock_t end = std::clock();
-    time_monitor_CPU += double(end - start) / double(CLOCKS_PER_SEC);
-
-    // Copy the host input array in host memory to the device input array in device memory
-    start = std::clock();
-    checkCudaErrors(cudaMemcpyAsync(d_pressure, h_pressure, cell_bytes, cudaMemcpyHostToDevice, stream));
-    end = std::clock();
     time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC);
         
     // launch cuda kernel
@@ -1316,39 +939,96 @@ void dfMatrix::fvc_grad(double* pressure)
     size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
     fvc_grad_internal_face<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
           d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-          d_face_vector, d_weight, d_pressure, d_b, d_b);
+          dataBase_.d_face_vector, dataBase_.d_weight, dataBase_.d_pressure, d_b, d_b);
     blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
     fvc_grad_boundary_face<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_cells,
-          d_boundary_cell_offset, d_boundary_cell_id,
-          d_boundary_face_vector, d_boundary_pressure, d_b, d_b);
+          dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+          dataBase_.d_boundary_face_vector, dataBase_.d_boundary_pressure, d_b, d_b);
     end = std::clock();
     time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
 }
 
-void dfMatrix::add_fvMatrix(double* turbSrc_low, double* turbSrc_diag, double* turbSrc_upp, double* turbSrc_source)
+void dfUEqn::fvc_grad_vector()
 {
-    // copy and permutate matrix variables
-    std::copy(turbSrc_low, turbSrc_low + num_surfaces, h_turbSrc_init_mtx_vec.begin());
-    std::copy(turbSrc_diag, turbSrc_diag + num_cells, h_turbSrc_init_mtx_vec.begin() + num_surfaces);
-    std::copy(turbSrc_upp, turbSrc_upp + num_surfaces, h_turbSrc_init_mtx_vec.begin() + num_surfaces + num_cells);
-    std::copy(turbSrc_source, turbSrc_source + 3*num_cells, h_turbSrc_init_src_vec.begin());
-
-    // permutate
-    checkCudaErrors(cudaMemcpyAsync(d_turbSrc_A_init, h_turbSrc_init_mtx_vec.data(), csr_value_bytes, cudaMemcpyHostToDevice, stream));
+    clock_t start = std::clock();
+    // launch CUDA kernal
     size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_cells + num_faces + threads_per_block - 1) / threads_per_block;
-    // for (int i = 0; i < num_cells+2*num_surfaces; i++)
-    //     h_turbSrc_init_1mtx[i] = h_turbSrc_init_mtx_vec[tmpPermutatedList[i]];
-    csrPermutation<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells + num_faces, d_tmpPermutatedList, d_turbSrc_A_init, d_turbSrc_A);
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    fvc_grad_vector_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
+          d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
+          dataBase_.d_face_vector, dataBase_.d_velocity_old, dataBase_.d_weight, dataBase_.d_volume, dataBase_.d_grad);
 
-    // Copy the host input array in host memory to the device input array in device memory
-    checkCudaErrors(cudaMemcpyAsync(d_turbSrc_b, h_turbSrc_init_src_vec.data(), cell_vec_bytes, cudaMemcpyHostToDevice, stream));
-    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    add_fvMatrix_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces,
-          d_A_csr_row_index, d_turbSrc_A, d_turbSrc_b, d_A_csr, d_b, d_A_csr, d_b);
+    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
+    fvc_grad_vector_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_cells,
+          dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_face_vector, dataBase_.d_boundary_velocity,
+          dataBase_.d_volume, dataBase_.d_grad, dataBase_.d_grad_boundary_init);
+
+    correct_boundary_conditions<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
+          dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_face_vector, dataBase_.d_boundary_face, 
+          dataBase_.d_grad_boundary_init, dataBase_.d_grad_boundary);
+    clock_t end = std::clock();
+    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
+}
+
+void dfUEqn::dev2T()
+{
+    clock_t start = std::clock();
+    // launch CUDA kernal
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    dev2_t_tensor<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, dataBase_.d_grad);
+     
+    blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block;
+    dev2_t_tensor<<<blocks_per_grid, threads_per_block, 0, stream>>>(dataBase_.num_boundary_faces, dataBase_.d_grad_boundary);
+    clock_t end = std::clock();
+    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
+}
+
+void dfUEqn::fvc_div_tensor(const double *nuEff)
+{
+    clock_t start = std::clock();
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_nuEff, nuEff, cell_bytes, cudaMemcpyHostToDevice, stream));
+    clock_t end = std::clock();
+    time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC);
+
+    // launch cuda kernel
+    start = std::clock();
+    size_t threads_per_block = 512;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    fvc_div_tensor_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
+          d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
+          dataBase_.d_nuEff, dataBase_.d_rho_new, dataBase_.d_face_vector, dataBase_.d_grad, dataBase_.d_weight,
+          dataBase_.d_volume, 1., d_b, d_b);
+
+    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
+    fvc_div_tensor_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_cells,
+          dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
+          dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_face_vector, dataBase_.d_grad_boundary, 
+          dataBase_.d_volume, 1., d_b, d_b);
+    end = std::clock();
+    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
+}
+
+void dfUEqn::fvm_laplacian()
+{
+    clock_t start = std::clock();
+    // launch CUDA kernels
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    fvm_laplacian_uncorrected_vector_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces,
+          d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, dataBase_.d_rho_new, dataBase_.d_nuEff, dataBase_.d_weight, 
+          dataBase_.d_face, dataBase_.d_deltaCoeffs, -1., d_A_csr, d_A_csr);
+
+    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
+    fvm_laplacian_uncorrected_vector_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces, num_boundary_cells,
+          d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, 
+          dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_face, dataBase_.d_laplac_internal_coeffs, 
+          dataBase_.d_laplac_boundary_coeffs, -1., d_A_csr, d_b, d_A_csr, d_b);
+    clock_t end = std::clock();
+    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
 }
 
-void dfMatrix::checkValue(bool print)
+void dfUEqn::checkValue(bool print)
 {
     checkCudaErrors(cudaMemcpyAsync(h_A_csr, d_A_csr, csr_value_vec_bytes, cudaMemcpyDeviceToHost, stream));
     checkCudaErrors(cudaMemcpyAsync(h_b, d_b, cell_vec_bytes, cudaMemcpyDeviceToHost, stream));
@@ -1357,7 +1037,7 @@ void dfMatrix::checkValue(bool print)
     checkCudaErrors(cudaStreamSynchronize(stream));
     if (print)
     {
-        for (int i = 0; i < (2*num_surfaces + num_cells); i++)
+        for (int i = 0; i < (num_faces + num_cells); i++)
             fprintf(stderr, "h_A_csr[%d]: %.15lf\n", i, h_A_csr[i]);
         for (int i = 0; i < num_cells * 3; i++)
             fprintf(stderr, "h_b[%d]: %.15lf\n", i, h_b[i]);
@@ -1370,23 +1050,23 @@ void dfMatrix::checkValue(bool print)
     }
     int readfile = 0;
     double *of_b = new double[3*num_cells];
-    double *of_A = new double[2*num_surfaces + num_cells];
+    double *of_A = new double[num_faces + num_cells];
     readfile = fread(of_b, num_cells * 3 * sizeof(double), 1, fp);
-    readfile = fread(of_A, (2*num_surfaces + num_cells) * sizeof(double), 1, fp);
+    readfile = fread(of_A, (num_faces + num_cells) * sizeof(double), 1, fp);
 
-    std::vector<double> h_A_of_init_vec(num_cells+2*num_surfaces);
-    std::copy(of_A, of_A + num_cells+2*num_surfaces, h_A_of_init_vec.begin());
+    std::vector<double> h_A_of_init_vec(num_cells+num_faces);
+    std::copy(of_A, of_A + num_cells+num_faces, h_A_of_init_vec.begin());
 
-    std::vector<double> h_A_of_vec_1mtx(2*num_surfaces + num_cells, 0);
-    for (int i = 0; i < 2*num_surfaces + num_cells; i++)
+    std::vector<double> h_A_of_vec_1mtx(num_faces + num_cells, 0);
+    for (int i = 0; i < num_faces + num_cells; i++)
     {
-        h_A_of_vec_1mtx[i] = h_A_of_init_vec[tmpPermutatedList[i]];
+        h_A_of_vec_1mtx[i] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i]];
     }
 
-    std::vector<double> h_A_of_vec((2*num_surfaces + num_cells)*3);
+    std::vector<double> h_A_of_vec((num_faces + num_cells)*3);
     for (int i =0; i < 3; i ++)
     {
-        std::copy(h_A_of_vec_1mtx.begin(), h_A_of_vec_1mtx.end(), h_A_of_vec.begin()+i*(2*num_surfaces + num_cells));
+        std::copy(h_A_of_vec_1mtx.begin(), h_A_of_vec_1mtx.end(), h_A_of_vec.begin()+i*(num_faces + num_cells));
     }
 
     // b
@@ -1410,7 +1090,7 @@ void dfMatrix::checkValue(bool print)
 
     if (print)
     {
-        for (int i = 0; i < (2*num_surfaces + num_cells); i++)
+        for (int i = 0; i < (num_faces + num_cells); i++)
             fprintf(stderr, "h_A_of_vec_1mtx[%d]: %.15lf\n", i, h_A_of_vec_1mtx[i]);
         for (int i = 0; i < 3*num_cells; i++)
             fprintf(stderr, "h_b_of_vec[%d]: %.15lf\n", i, h_b_of_vec[i]);
@@ -1418,12 +1098,12 @@ void dfMatrix::checkValue(bool print)
 
     // check
     fprintf(stderr, "check of h_A_csr\n");
-    checkVectorEqual(2*num_surfaces + num_cells, h_A_of_vec_1mtx.data(), h_A_csr, 1e-5);
+    checkVectorEqual(num_faces + num_cells, h_A_of_vec_1mtx.data(), h_A_csr, 1e-5);
     fprintf(stderr, "check of h_b\n");
     checkVectorEqual(3*num_cells, h_b_of_vec.data(), h_b, 1e-5);
 }
 
-void dfMatrix::solve()
+void dfUEqn::solve()
 {
     // for (size_t i = 0; i < num_cells; i++)
     //     fprintf(stderr, "h_velocity_old[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_velocity_old[3*i], 
@@ -1468,7 +1148,8 @@ void dfMatrix::solve()
     //     fprintf(stderr, "h_velocity_after[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_psi[i], 
     //     h_psi[num_cells + i], h_psi[num_cells*2 + i]);
 }
-void dfMatrix::updatePsi(double* Psi)
+
+void dfUEqn::updatePsi(double* Psi)
 {
     checkCudaErrors(cudaStreamSynchronize(stream));
     for (size_t i = 0; i < num_cells; i++)
@@ -1478,243 +1159,7 @@ void dfMatrix::updatePsi(double* Psi)
         Psi[i*3 + 2] = h_psi[num_cells*2 + i];
     }
 }
-void dfMatrix::fvc_grad_vector()
-{
-    clock_t start = std::clock();
-    // launch CUDA kernal
-    size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    fvc_grad_vector_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-          d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-          d_face_vector, d_velocity_old, d_weight, d_volume, d_grad);
-
-    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-    fvc_grad_vector_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_cells,
-          d_boundary_cell_offset, d_boundary_cell_id, d_boundary_face_vector, d_boundary_velocity, d_volume, 
-          d_grad, d_grad_boundary_init);
-
-    correct_boundary_conditions<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
-          d_boundary_cell_offset, d_boundary_cell_id, d_boundary_face_vector, d_boundary_face, 
-          d_grad_boundary_init, d_grad_boundary);
-    clock_t end = std::clock();
-    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
-}
-void dfMatrix::divDevRhoReff()
-{
-    clock_t start = std::clock();
-    // launch CUDA kernal
-    size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    dev2_t_tensor<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, d_grad);
-     
-    blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block;
-    dev2_t_tensor<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_faces, d_grad_boundary);
-    clock_t end = std::clock();
-    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
-
-    #ifdef DEBUG
-        double* h_grad = new double[num_cells * 9];
-        double* h_grad_boundary = new double[num_boundary_faces * 9];
-
-        // check value of internal field
-        checkCudaErrors(cudaMemcpy(h_grad, d_grad, num_cells * 9 * sizeof(double), cudaMemcpyDeviceToHost));
-        for (int i = 0; i < num_cells; i++)
-            fprintf(stderr, "fvc_grad_tensor_T[%d]: (%.2lf, %.2lf, %.2lf, %.2lf, %.2lf, %.2lf, %.2lf, %.2lf, %.2lf)\n", i, 
-            h_grad[i * 9 + 0], h_grad[i * 9 + 1], h_grad[i * 9 + 2], 
-            h_grad[i * 9 + 3], h_grad[i * 9 + 4], h_grad[i * 9 + 5],
-            h_grad[i * 9 + 6], h_grad[i * 9 + 7], h_grad[i * 9 + 8]);
-        fprintf(stderr, "error of dev2(T(fvc::grad(U)))\n");
-        checkVectorEqual(num_cells * 9, ref_grad_T, h_grad, 1);
-        
-        // check value of boundary field
-        checkCudaErrors(cudaMemcpy(h_grad_boundary, d_grad_boundary, num_boundary_faces * 9 * sizeof(double), cudaMemcpyDeviceToHost));
-        for (int i = 0; i < num_boundary_faces; i++)
-            fprintf(stderr, "df_boundary_gradU_T[%d]: (%.2lf, %.2lf, %.2lf, %.2lf, %.2lf, %.2lf, %.2lf, %.2lf, %.2lf)\n", i, 
-            h_grad_boundary[i * 9 + 0], h_grad_boundary[i * 9 + 1], h_grad_boundary[i * 9 + 2], 
-            h_grad_boundary[i * 9 + 3], h_grad_boundary[i * 9 + 4], h_grad_boundary[i * 9 + 5],
-            h_grad_boundary[i * 9 + 6], h_grad_boundary[i * 9 + 7], h_grad_boundary[i * 9 + 8]);
-
-        std::vector<double> boundary_gradU_T(num_boundary_faces*9);
-        for (int i = 0; i < num_boundary_faces; i++)
-        {
-            boundary_gradU_T[9*i] = ref_grad_T_boundary[9*boundPermutationList[i]];
-            boundary_gradU_T[9*i+1] = ref_grad_T_boundary[9*boundPermutationList[i]+1];
-            boundary_gradU_T[9*i+2] = ref_grad_T_boundary[9*boundPermutationList[i]+2];
-            boundary_gradU_T[9*i+3] = ref_grad_T_boundary[9*boundPermutationList[i]+3];
-            boundary_gradU_T[9*i+4] = ref_grad_T_boundary[9*boundPermutationList[i]+4];
-            boundary_gradU_T[9*i+5] = ref_grad_T_boundary[9*boundPermutationList[i]+5];
-            boundary_gradU_T[9*i+6] = ref_grad_T_boundary[9*boundPermutationList[i]+6];
-            boundary_gradU_T[9*i+7] = ref_grad_T_boundary[9*boundPermutationList[i]+7];
-            boundary_gradU_T[9*i+8] = ref_grad_T_boundary[9*boundPermutationList[i]+8];
-        }
-        for (int i = 0; i < num_boundary_faces; i++)
-            fprintf(stderr, "of_boundary_gradU_T[%d]: (%.2lf, %.2lf, %.2lf, %.2lf, %.2lf, %.2lf, %.2lf, %.2lf, %.2lf)\n", i, 
-            boundary_gradU_T[i * 9 + 0], boundary_gradU_T[i * 9 + 1], boundary_gradU_T[i * 9 + 2], 
-            boundary_gradU_T[i * 9 + 3], boundary_gradU_T[i * 9 + 4], boundary_gradU_T[i * 9 + 5],
-            boundary_gradU_T[i * 9 + 6], boundary_gradU_T[i * 9 + 7], boundary_gradU_T[i * 9 + 8]);
-        
-        fprintf(stderr, "error of dev2(T(fvc::grad(U))) boundary\n");
-        checkVectorEqual(num_boundary_faces * 9, boundary_gradU_T.data(), h_grad_boundary, 1);
-    #endif
-}
 
-void dfMatrix::fvc_div_tensor(const double *nuEff)
-{
-    #ifdef DEBUG
-        double* d_b_test = nullptr, *h_b_test = new double[3*num_cells];
-        double* d_grad_test = nullptr, *h_grad_test = new double[9*num_cells];
-        double* d_grad_boundary_test = nullptr,*h_grad_boundary_test = new double[num_boundary_faces * 9];
-        double* h_weight_test = new double[num_faces];
-
-        checkCudaErrors(cudaMalloc((void**)&d_b_test, cell_vec_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_grad_test, 9*num_cells*sizeof(double)));
-        checkCudaErrors(cudaMalloc((void**)&d_grad_boundary_test, 9*num_boundary_faces*sizeof(double)));
-
-        checkCudaErrors(cudaMemsetAsync(d_b_test, 0, cell_vec_bytes, stream));
-    #endif
-    clock_t start = std::clock();
-    checkCudaErrors(cudaMemcpyAsync(d_nuEff, nuEff, cell_bytes, cudaMemcpyHostToDevice, stream));
-    clock_t end = std::clock();
-    time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC);
-
-    // launch cuda kernel
-    start = std::clock();
-    size_t threads_per_block = 512;
-    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    fvc_div_tensor_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-          d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-          d_nuEff, d_rho_new, d_face_vector, d_grad, d_weight, d_volume, 1., d_b, d_b);
-
-    blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block;
-    fvc_div_tensor_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_cells,
-          d_boundary_cell_offset, d_boundary_cell_id,
-          d_boundary_nuEff, d_boundary_rho, d_boundary_face_vector, d_grad_boundary, d_volume, 1., d_b, d_b);
-    end = std::clock();
-    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
-
-    #ifdef DEBUG
-        checkCudaErrors(cudaMemcpyAsync(h_b_test, d_b_test, cell_vec_bytes, cudaMemcpyDeviceToHost, stream));
-        checkCudaErrors(cudaMemcpyAsync(h_grad_test, d_grad_test, 9*num_cells*sizeof(double), cudaMemcpyDeviceToHost, stream));
-        checkCudaErrors(cudaMemcpyAsync(h_grad_boundary_test, d_grad_boundary_test, 9*num_boundary_faces*sizeof(double), cudaMemcpyDeviceToHost, stream));
-        checkCudaErrors(cudaMemcpyAsync(h_weight_test, d_weight, num_faces*sizeof(double), cudaMemcpyDeviceToHost, stream));
-
-        for (int i = 0; i < num_cells; i++)
-            fprintf(stderr, "h_b[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_b_test[i], h_b_test[num_cells + i],
-            h_b_test[2*num_cells + i]);
-        
-        fprintf(stderr,"error of fvc_div\n");
-        std::vector<double> of_ref_div_init_vec(3*num_cells);
-        std::copy(of_ref_div, of_ref_div + 3*num_cells, of_ref_div_init_vec.begin());
-        std::vector<double> of_ref_div_vec;
-        for (int i = 0; i < 3*num_cells; i+=3)
-        {
-            of_ref_div_vec.push_back(of_ref_div_init_vec[i]);
-        }
-        // fill RHS_y
-        for (int i = 1; i < 3*num_cells; i+=3)
-        {
-            of_ref_div_vec.push_back(of_ref_div_init_vec[i]);
-        }
-        // fill RHS_z
-        for (int i = 2; i < 3*num_cells; i+=3)
-        {
-            of_ref_div_vec.push_back(of_ref_div_init_vec[i]);
-        }
-        checkVectorEqual(3*num_cells, of_ref_div_vec.data(), h_b_test, 0.1);
-
-        for (int i = 0; i < num_cells; i++)
-            fprintf(stderr, "fvc_grad_test[%d]: (%.7lf, %.7lf, %.7lf, %.7lf, %.7lf, %.7lf, %.7lf, %.7lf, %.7lf)\n", i, 
-            h_grad_test[i * 9 + 0], h_grad_test[i * 9 + 1], h_grad_test[i * 9 + 2], 
-            h_grad_test[i * 9 + 3], h_grad_test[i * 9 + 4], h_grad_test[i * 9 + 5],
-            h_grad_test[i * 9 + 6], h_grad_test[i * 9 + 7], h_grad_test[i * 9 + 8]);
-        fprintf(stderr,"error of 'this->alpha_*this->rho_*this->nuEff())*dev2(T(fvc::grad(U))'\n");
-        checkVectorEqual(9*num_cells, of_ref_grad, h_grad_test, 0.1);
-
-        for (int i = 0; i < num_boundary_faces; i++)
-            fprintf(stderr, "fvc_grad_boundary_test[%d]: (%.9lf, %.9lf, %.9lf, %.9lf, %.9lf, %.9lf, %.9lf, %.9lf, %.9lf)\n", i, 
-            h_grad_boundary_test[i * 9 + 0], h_grad_boundary_test[i * 9 + 1], h_grad_boundary_test[i * 9 + 2],
-            h_grad_boundary_test[i * 9 + 3], h_grad_boundary_test[i * 9 + 4], h_grad_boundary_test[i * 9 + 5],
-            h_grad_boundary_test[i * 9 + 6], h_grad_boundary_test[i * 9 + 7], h_grad_boundary_test[i * 9 + 8]);
-
-        std::vector<double> boundary_coeff_gradU(num_boundary_faces*9);
-        for (int i = 0; i < num_boundary_faces; i++)
-        {
-            boundary_coeff_gradU[9*i] = of_ref_boundary[9*boundPermutationList[i]];
-            boundary_coeff_gradU[9*i+1] = of_ref_boundary[9*boundPermutationList[i]+1];
-            boundary_coeff_gradU[9*i+2] = of_ref_boundary[9*boundPermutationList[i]+2];
-            boundary_coeff_gradU[9*i+3] = of_ref_boundary[9*boundPermutationList[i]+3];
-            boundary_coeff_gradU[9*i+4] = of_ref_boundary[9*boundPermutationList[i]+4];
-            boundary_coeff_gradU[9*i+5] = of_ref_boundary[9*boundPermutationList[i]+5];
-            boundary_coeff_gradU[9*i+6] = of_ref_boundary[9*boundPermutationList[i]+6];
-            boundary_coeff_gradU[9*i+7] = of_ref_boundary[9*boundPermutationList[i]+7];
-            boundary_coeff_gradU[9*i+8] = of_ref_boundary[9*boundPermutationList[i]+8];
-        }
-        for (int i = 0; i < num_boundary_faces; i++)
-            printf("of_boundary_gradU[%d]: (%.9lf, %.9lf, %.9lf, %.9lf, %.9lf, %.9lf, %.9lf, %.9lf, %.9lf)\n", i, 
-            boundary_coeff_gradU[i * 9 + 0], boundary_coeff_gradU[i * 9 + 1], boundary_coeff_gradU[i * 9 + 2], 
-            boundary_coeff_gradU[i * 9 + 3], boundary_coeff_gradU[i * 9 + 4], boundary_coeff_gradU[i * 9 + 5],
-            boundary_coeff_gradU[i * 9 + 6], boundary_coeff_gradU[i * 9 + 7], boundary_coeff_gradU[i * 9 + 8]);
-        
-        fprintf(stderr,"error of boundary of fvc::div\n");
-        checkVectorEqual(9*num_boundary_faces, boundary_coeff_gradU.data(), h_grad_boundary_test, 1e-5);
-    #endif
-}
-
-void dfMatrix::fvm_laplacian()
-{
-    #ifdef DEBUG
-        double* d_b_test = nullptr, *h_b_test = new double[3*num_cells];
-        double* d_A_csr_test = nullptr, *h_A_csr_test = new double[(2*num_surfaces + num_cells)*3];
-
-        checkCudaErrors(cudaMalloc((void**)&d_b_test, cell_vec_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_A_csr_test, csr_value_vec_bytes));
-
-        checkCudaErrors(cudaMemsetAsync(d_A_csr_test, 0, csr_value_vec_bytes, stream));
-        checkCudaErrors(cudaMemsetAsync(d_b_test, 0, cell_vec_bytes, stream));
-    #endif
+dfUEqn::~dfUEqn(){
     
-    clock_t start = std::clock();
-    // launch CUDA kernels
-    size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    fvm_laplacian_uncorrected_vector_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces,
-          d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, d_rho_new, d_nuEff, d_weight, d_face, d_deltaCoeffs, -1., d_A_csr, d_A_csr);
-
-    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-    fvm_laplacian_uncorrected_vector_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces, num_boundary_cells,
-          d_A_csr_row_index, d_A_csr_diag_index, d_boundary_cell_offset, d_boundary_cell_id, 
-          d_boundary_nuEff, d_boundary_rho, d_boundary_face, d_laplac_internal_coeffs, d_laplac_boundary_coeffs,
-          -1., d_A_csr, d_b, d_A_csr, d_b);
-    clock_t end = std::clock();
-    time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC);
-    
-    #ifdef DEBUG
-        checkCudaErrors(cudaMemcpy(h_A_csr_test, d_A_csr_test, csr_value_vec_bytes, cudaMemcpyDeviceToHost));
-
-        char *input_file = "of_ref_laplac.txt";
-        FILE *fp = fopen(input_file, "rb+");
-        if (fp == NULL) {
-            fprintf(stderr, "Failed to open input file: %s!\n", input_file);
-        }
-        int readfile = 0;
-        double *of_A_laplac = new double[2*num_surfaces + num_cells];
-        readfile = fread(of_A_laplac, (2*num_surfaces + num_cells) * sizeof(double), 1, fp);
-        
-        std::vector<double> h_A_of_laplac_init_vec(num_cells+2*num_surfaces);
-        std::copy(of_A_laplac, of_A_laplac + num_cells+2*num_surfaces, h_A_of_laplac_init_vec.begin());
-
-        std::vector<double> h_A_of_vec_1mtx(2*num_surfaces + num_cells, 0);
-        for (int i = 0; i < 2*num_surfaces + num_cells; i++)
-        {
-            h_A_of_vec_1mtx[i] = h_A_of_laplac_init_vec[tmpPermutatedList[i]];
-        }
-
-        std::vector<double> h_A_of_vec((2*num_surfaces + num_cells)*3);
-        for (int i =0; i < 3; i ++)
-        {
-            std::copy(h_A_of_vec_1mtx.begin(), h_A_of_vec_1mtx.end(), h_A_of_vec.begin()+i*(2*num_surfaces + num_cells));
-        }
-        fprintf(stderr,"error of fvm::laplacian\n");
-        checkVectorEqual((2*num_surfaces + num_cells)*3, h_A_of_vec.data(), h_A_csr_test, 1e-5);
-    #endif
-}
+}
\ No newline at end of file
diff --git a/src_gpu/ueqn.cu b/src_gpu/ueqn.cu
deleted file mode 100644
index 23401eef5..000000000
--- a/src_gpu/ueqn.cu
+++ /dev/null
@@ -1,1097 +0,0 @@
-#include <stdio.h>
-#include <unistd.h>
-#include <sys/time.h>
-#include <cuda_runtime.h>
-#include <cuda_profiler_api.h>
-
-static const char *_cudaGetErrorEnum(cudaError_t error) {
-  return cudaGetErrorName(error);
-}
-
-template <typename T>
-void check(T result, char const *const func, const char *const file,
-        int const line) {
-  if (result) {
-    fprintf(stderr, "cuda error at %s:%d code=%d(%s) \"%s\" \n", file, line,
-            static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
-    exit(EXIT_FAILURE);
-  }
-}
-
-#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
-
-static void checkVectorEqual(int count, double* basevec, double* vec, double max_relative_error) {
-  for (size_t i = 0; i < count; ++i)
-  {
-    double abs_diff = fabs(basevec[i] - vec[i]);
-    double rel_diff = fabs(basevec[i] - vec[i]) / fabs(basevec[i]);
-    if (abs_diff > 1e-16 && rel_diff > max_relative_error) {
-      fprintf(stderr, "mismatch index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff);
-    }
-  }
-}
-
-__global__ void fvm_ddt(int num_cells, int num_faces, const double rdelta_t,
-        const int* csr_row_index, const int* csr_diag_index,
-        const double* rho_old, const double* rho_new, const double* volume, const double* velocity_old,
-        const double* A_csr_input, const double* b_input, double* A_csr_output, double* b_output) {
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells) return;
-
-    // A_csr has one more element in each row: itself
-    int row_index = csr_row_index[index];
-    int diag_index = csr_diag_index[index];
-
-    int csr_dim = num_cells + num_faces;
-    int csr_index = row_index + diag_index;
-    double ddt_diag = rdelta_t * rho_new[index] * volume[index];
-    A_csr_output[csr_dim * 0 + csr_index] = A_csr_input[csr_dim * 0 + csr_index] + ddt_diag;
-    A_csr_output[csr_dim * 1 + csr_index] = A_csr_input[csr_dim * 1 + csr_index] + ddt_diag;
-    A_csr_output[csr_dim * 2 + csr_index] = A_csr_input[csr_dim * 2 + csr_index] + ddt_diag;
-
-    double ddt_part_term = rdelta_t * rho_old[index] * volume[index];
-    b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + ddt_part_term * velocity_old[index * 3 + 0];
-    b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + ddt_part_term * velocity_old[index * 3 + 1];
-    b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + ddt_part_term * velocity_old[index * 3 + 2];
-}
-
-__global__ void fvm_div_internal(int num_cells, int num_faces,
-        const int* csr_row_index, const int* csr_diag_index,
-        const double* weight, const double* phi,
-        const double* A_csr_input, const double* b_input, double* A_csr_output, double* b_output) {
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells) return;
-
-    // A_csr has one more element in each row: itself
-    int row_index = csr_row_index[index];
-    int next_row_index = csr_row_index[index + 1];
-    int diag_index = csr_diag_index[index];
-    int neighbor_offset = csr_row_index[index] - index;
-    int csr_dim = num_cells + num_faces;
-
-    double div_diag = 0;
-    for (int i = row_index; i < next_row_index; i++) {
-      int inner_index = i - row_index;
-      // lower
-      if (inner_index < diag_index) {
-        int neighbor_index = neighbor_offset + inner_index;
-        double w = weight[neighbor_index];
-        double f = phi[neighbor_index];
-        A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + (-w) * f;
-        A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + (-w) * f;
-        A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + (-w) * f;
-        // lower neighbors contribute to sum of -1
-        div_diag += (w - 1) * f;
-      }
-      // upper
-      if (inner_index > diag_index) {
-        // upper, index - 1, consider of diag
-        int neighbor_index = neighbor_offset + inner_index - 1;
-        double w = weight[neighbor_index];
-        double f = phi[neighbor_index];
-        A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + (1 - w) * f;
-        A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + (1 - w) * f;
-        A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + (1 - w) * f;
-        // upper neighbors contribute to sum of 1
-        div_diag += w * f;
-      }
-    }
-    A_csr_output[csr_dim * 0 + row_index + diag_index] = A_csr_input[csr_dim * 0 + row_index + diag_index] + div_diag; // diag
-    A_csr_output[csr_dim * 1 + row_index + diag_index] = A_csr_input[csr_dim * 1 + row_index + diag_index] + div_diag; // diag
-    A_csr_output[csr_dim * 2 + row_index + diag_index] = A_csr_input[csr_dim * 2 + row_index + diag_index] + div_diag; // diag
-}
-
-__global__ void fvm_div_boundary(int num_cells, int num_faces, int num_boundary_cells,
-        const int* csr_row_index, const int* csr_diag_index,
-        const int* boundary_cell_offset, const int* boundary_cell_id, 
-        const double* internal_coeffs, const double* boundary_coeffs,
-        const double* A_csr_input, const double* b_input, double* A_csr_output, double* b_output) {
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_cells) return;
-
-    int cell_offset = boundary_cell_offset[index]; // face index
-    int cell_index = boundary_cell_id[cell_offset];
-    int loop_size = boundary_cell_offset[index + 1] - cell_offset;
-
-    int row_index = csr_row_index[cell_index];
-    int diag_index = csr_diag_index[cell_index];
-    int csr_dim = num_cells + num_faces;
-    int csr_index = row_index + diag_index;
-
-    // construct internalCoeffs & boundaryCoeffs
-    double internal_coeffs_x = 0;
-    double internal_coeffs_y = 0;
-    double internal_coeffs_z = 0;
-    double boundary_coeffs_x = 0;
-    double boundary_coeffs_y = 0;
-    double boundary_coeffs_z = 0;
-    for (int i = 0; i < loop_size; i++) {
-        internal_coeffs_x += internal_coeffs[(cell_offset + i) * 3 + 0];
-        internal_coeffs_y += internal_coeffs[(cell_offset + i) * 3 + 1];
-        internal_coeffs_z += internal_coeffs[(cell_offset + i) * 3 + 2];
-        boundary_coeffs_x += boundary_coeffs[(cell_offset + i) * 3 + 0];
-        boundary_coeffs_y += boundary_coeffs[(cell_offset + i) * 3 + 1];
-        boundary_coeffs_z += boundary_coeffs[(cell_offset + i) * 3 + 2];
-    }
-
-    A_csr_output[csr_dim * 0 + csr_index] = A_csr_input[csr_dim * 0 + csr_index] + internal_coeffs_x;
-    A_csr_output[csr_dim * 1 + csr_index] = A_csr_input[csr_dim * 1 + csr_index] + internal_coeffs_y;
-    A_csr_output[csr_dim * 2 + csr_index] = A_csr_input[csr_dim * 2 + csr_index] + internal_coeffs_z;
-    b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] + boundary_coeffs_x;
-    b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] + boundary_coeffs_y;
-    b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + boundary_coeffs_z;
-}
-
-__global__ void fvc_grad_internal_face(int num_cells,
-        const int* csr_row_index, const int* csr_col_index, const int* csr_diag_index,
-        const double* face_vector, const double* weight, const double* pressure,
-        const double* b_input, double* b_output) {
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells) return;
-
-    // A_csr has one more element in each row: itself
-    int row_index = csr_row_index[index];
-    int next_row_index = csr_row_index[index + 1];
-    int diag_index = csr_diag_index[index];
-    int neighbor_offset = csr_row_index[index] - index;
-
-    double own_cell_p = pressure[index];
-    double grad_bx = 0;
-    double grad_by = 0;
-    double grad_bz = 0;
-    double grad_bx_low = 0;
-    double grad_bx_upp = 0;
-    double grad_by_low = 0;
-    double grad_by_upp = 0;
-    double grad_bz_low = 0;
-    double grad_bz_upp = 0;
-    for (int i = row_index; i < next_row_index; i++) {
-      int inner_index = i - row_index;
-      // lower
-      if (inner_index < diag_index) {
-        int neighbor_index = neighbor_offset + inner_index;
-        double w = weight[neighbor_index];
-        double sfx = face_vector[neighbor_index * 3 + 0];
-        double sfy = face_vector[neighbor_index * 3 + 1];
-        double sfz = face_vector[neighbor_index * 3 + 2];
-        int neighbor_cell_id = csr_col_index[row_index + inner_index];
-        double neighbor_cell_p = pressure[neighbor_cell_id];
-        double face_p = (1 - w) * own_cell_p + w * neighbor_cell_p;
-        grad_bx_low -= face_p * sfx;
-        grad_by_low -= face_p * sfy;
-        grad_bz_low -= face_p * sfz;
-      }
-      // upper
-      if (inner_index > diag_index) {
-        int neighbor_index = neighbor_offset + inner_index - 1;
-        double w = weight[neighbor_index];
-        double sfx = face_vector[neighbor_index * 3 + 0];
-        double sfy = face_vector[neighbor_index * 3 + 1];
-        double sfz = face_vector[neighbor_index * 3 + 2];
-        int neighbor_cell_id = csr_col_index[row_index + inner_index + 1];
-        double neighbor_cell_p = pressure[neighbor_cell_id];
-        double face_p = (1 - w) * own_cell_p + w * neighbor_cell_p;
-        grad_bx_upp += face_p * sfx;
-        grad_by_upp += face_p * sfy;
-        grad_bz_upp += face_p * sfz;
-      }
-    }
-    b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + grad_bx_low + grad_bx_upp;
-    b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + grad_by_low + grad_by_upp;
-    b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + grad_bz_low + grad_bz_upp;
-}
-
-__global__ void fvc_grad_boundary_face(int num_cells, int num_boundary_cells,
-        const int* boundary_cell_offset, const int* boundary_cell_id, 
-        const double* boundary_face_vector, const double* boundary_pressure,
-        const double* b_input, double* b_output) {
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_cells) return;
-
-    int cell_offset = boundary_cell_offset[index];
-    int next_cell_offset = boundary_cell_offset[index + 1];
-    int cell_index = boundary_cell_id[cell_offset];
-
-    // compute boundary gradient
-    double grad_bx = 0; 
-    double grad_by = 0; 
-    double grad_bz = 0; 
-    for (int i = cell_offset; i < next_cell_offset; i++) {
-      double sfx = boundary_face_vector[i * 3 + 0];
-      double sfy = boundary_face_vector[i * 3 + 1];
-      double sfz = boundary_face_vector[i * 3 + 2];
-      double face_p = boundary_pressure[i];
-      grad_bx += face_p * sfx;
-      grad_by += face_p * sfy;
-      grad_bz += face_p * sfz;
-    }
-
-    //// correct the boundary gradient
-    //double nx = boundary_face_vector[face_index * 3 + 0] / magSf[face_index];
-    //double ny = boundary_face_vector[face_index * 3 + 1] / magSf[face_index];
-    //double nz = boundary_face_vector[face_index * 3 + 2] / magSf[face_index];
-    //double sn_grad = 0;
-    //double grad_correction = sn_grad * volume[cell_index] - (nx * grad_bx + ny * grad_by + nz * grad_bz);
-    //grad_bx += nx * grad_correction; 
-    //grad_by += ny * grad_correction; 
-    //grad_bz += nz * grad_correction; 
-
-    b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] + grad_bx;
-    b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] + grad_by;
-    b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + grad_bz;
-}
-
-__global__ void add_fvMatrix(int num_cells, int num_faces,
-        const int* csr_row_index,
-        const double* turbSrc_A, const double* turbSrc_b,
-        const double* A_csr_input, const double* b_input, double* A_csr_output, double* b_output) {
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells) return;
-
-    int row_index = csr_row_index[index];
-    int next_row_index = csr_row_index[index + 1];
-    int csr_dim = num_cells + num_faces;
-    double A_entry;
-
-    for (int i = row_index; i < next_row_index; i++)
-    {
-      A_entry = turbSrc_A[i];
-      A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + A_entry;
-      A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + A_entry;
-      A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + A_entry;
-    }
-    b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + turbSrc_b[num_cells * 0 + index];
-    b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + turbSrc_b[num_cells * 1 + index];
-    b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + turbSrc_b[num_cells * 2 + index];
-}
-
-/*
-// the implement fo assemble kernel is outdated.
-__global__ void assemble(int num_cells, int num_faces, const double rdelta_t, const int* csr_row_index, const int* csr_diag_index,
-        const double* rho_old, const double* rho_new, const double* volume, const double* velocity_old,
-        const double* weight, const double* phi, const double* face_vector, const double* pressure,
-        double* A_csr, double* b) {
-
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells) return;
-
-    // A_csr has one more element in each row: itself
-    int row_index = csr_row_index[index];
-    int next_row_index = csr_row_index[index + 1];
-    int diag_index = csr_diag_index[index];
-    int neighbor_offset = csr_row_index[index] - index;
-    int csr_dim = num_cells + num_faces;
- 
-    double div_diag = 0;
-    double grad_bx = 0;
-    double grad_by = 0;
-    double grad_bz = 0;
-    for (int i = row_index; i < next_row_index; i++) {
-      int inner_index = i - row_index;
-      // lower
-      if (inner_index < diag_index) {
-        int neighbor_index = neighbor_offset + inner_index;
-        double w = weight[neighbor_index];
-        double f = phi[neighbor_index];
-        double sfx = face_vector[neighbor_index * 3 + 0];
-        double sfy = face_vector[neighbor_index * 3 + 1];
-        double sfz = face_vector[neighbor_index * 3 + 2];
-        double p = pressure[neighbor_index];
-        A_csr[csr_dim * 0 + i] += (-w) * f;
-        A_csr[csr_dim * 1 + i] += (-w) * f;
-        A_csr[csr_dim * 2 + i] += (-w) * f;
-        // lower neighbors contribute to sum of -1
-        div_diag += (w - 1) * f;
-        grad_bx -= p * sfx;
-        grad_by -= p * sfy;
-        grad_bz -= p * sfz;
-      }
-      // upper
-      if (inner_index > diag_index) {
-        // upper, index - 1, consider of diag
-        int neighbor_index = neighbor_offset + inner_index - 1;
-        double w = weight[neighbor_index];
-        double f = phi[neighbor_index];
-        double sfx = face_vector[neighbor_index * 3 + 0];
-        double sfy = face_vector[neighbor_index * 3 + 1];
-        double sfz = face_vector[neighbor_index * 3 + 2];
-        double p = pressure[neighbor_index];
-        A_csr[csr_dim * 0 + i] += (1 - w) * f;
-        A_csr[csr_dim * 1 + i] += (1 - w) * f;
-        A_csr[csr_dim * 2 + i] += (1 - w) * f;
-        // upper neighbors contribute to sum of 1
-        div_diag += w * f;
-        grad_bx += p * sfx;
-        grad_by += p * sfy;
-        grad_bz += p * sfz;
-      }
-    }
-
-    double ddt_diag = rdelta_t * rho_new[index] * volume[index];
-    A_csr[csr_dim * 0 + row_index + diag_index] = ddt_diag + div_diag; // diag of A
-    A_csr[csr_dim * 1 + row_index + diag_index] = ddt_diag + div_diag; // diag of A
-    A_csr[csr_dim * 2 + row_index + diag_index] = ddt_diag + div_diag; // diag of A
-
-    double ddt_bx = rdelta_t * rho_old[index] * velocity_old[index * 3 + 0] * volume[index];
-    double ddt_by = rdelta_t * rho_old[index] * velocity_old[index * 3 + 1] * volume[index];
-    double ddt_bz = rdelta_t * rho_old[index] * velocity_old[index * 3 + 2] * volume[index];
-    b[num_cells * 0 + index] = ddt_bx + grad_bx;
-    b[num_cells * 1 + index] = ddt_by + grad_by;
-    b[num_cells * 2 + index] = ddt_bz + grad_bz;
-}
-*/
-int main(int argc, char* argv[]) {
-  // Print the array length to be used, and compute its size
-  bool validation_mode = false;
-  int w = 128;
-  int h = 128;
-  int d = 128;
-  char* input_file = NULL;
-
-  if (argc > 1) {
-    validation_mode = true;
-    w = atoi(argv[1]);
-    h = atoi(argv[2]);
-    d = atoi(argv[3]);
-    input_file = argv[4];
-  }
-
-  fprintf(stderr, "mesh(w, h, d) is (%d, %d, %d)\n", w, h, d);
-
-  // Get num_cells, num_faces, num_boundary_cells, num_boundary_faces
-  int num_cells = w * h * d;
-  int num_faces = num_cells * 6;
-  int num_boundary_cells = 0;
-  int num_boundary_faces = 0;
-  for (int i = 0; i < num_cells; ++i) {
-    int ww = i % w;
-    int hh = (i % (w * h)) / w;
-    int dd = i / (w * h);
-
-    bool boundary_flag = false;
-    if (ww == 0) {
-      num_boundary_faces++;
-      boundary_flag = true;
-    }
-    if (ww == (w - 1)) {
-      num_boundary_faces++;
-      boundary_flag = true;
-    }
-    if (hh == 0) {
-      num_boundary_faces++;
-      boundary_flag = true;
-    }
-    if (hh == (h - 1)) {
-      num_boundary_faces++;
-      boundary_flag = true;
-    }
-    if (dd == 0) {
-      num_boundary_faces++;
-      boundary_flag = true;
-    }
-    if (dd == (d - 1)) {
-      num_boundary_faces++;
-      boundary_flag = true;
-    }
-
-    if (boundary_flag)
-        num_boundary_cells++;
-  }
-  num_faces -= num_boundary_faces;
-
-  // Load num_cells, num_faces, num_boundary_cells and num_boundary_faces
-  FILE *fp = NULL;
-  if (validation_mode) {
-    fp = fopen(input_file, "rb+");
-    if (fp == NULL) {
-      fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-      exit(EXIT_FAILURE);
-    }
-
-    int readfile = 0;
-    int input_num_cells = 0;
-    int input_num_faces = 0;
-    int input_num_boundary_cells = 0;
-    int input_num_boundary_faces = 0;
-    readfile = fread(&input_num_cells, sizeof(int), 1, fp);
-    if(readfile == 0) {
-      fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-      exit(EXIT_FAILURE);
-    }
-    readfile = fread(&input_num_faces, sizeof(int), 1, fp);
-    if(readfile == 0) {
-      fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-      exit(EXIT_FAILURE);
-    }
-    readfile = fread(&input_num_boundary_cells, sizeof(int), 1, fp);
-    if(readfile == 0) {
-      fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-      exit(EXIT_FAILURE);
-    }
-    readfile = fread(&input_num_boundary_faces, sizeof(int), 1, fp);
-    if(readfile == 0) {
-      fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-      exit(EXIT_FAILURE);
-    }
-    //if ((input_num_cells != num_cells) || (input_num_faces != num_faces) || (input_num_boundary_cells != num_boundary_cells)) {
-    // temp
-    if ((input_num_cells != num_cells) || (input_num_faces * 2 != num_faces
-                || (input_num_boundary_cells != num_boundary_cells) || (input_num_boundary_faces != num_boundary_faces))) {
-      fprintf(stderr, "Input info mismatched:\n");
-      fprintf(stderr, "input_num_cells: %d, num_cells: %d\n", input_num_cells, num_cells);
-      fprintf(stderr, "input_num_faces: %d, num_faces: %d\n", input_num_faces, num_faces);
-      fprintf(stderr, "input_num_boundary_cells: %d, num_boundary_cells: %d\n", input_num_boundary_cells, num_boundary_cells);
-      fprintf(stderr, "input_num_boundary_faces: %d, num_boundary_faces: %d\n", input_num_boundary_faces, num_boundary_faces);
-      exit(EXIT_FAILURE);
-    }
-    num_faces = input_num_faces * 2;
-  }
-
-  size_t cell_bytes = num_cells * sizeof(double);
-  size_t cell_vec_bytes = num_cells * 3 * sizeof(double);
-  size_t cell_index_bytes = num_cells * sizeof(int);
-
-  size_t face_bytes = num_faces * sizeof(double);
-  size_t face_vec_bytes = num_faces * 3 * sizeof(double);
-
-  size_t boundary_cell_bytes = num_boundary_cells * sizeof(double);
-  size_t boundary_cell_vec_bytes = num_boundary_cells * 3 * sizeof(double);
-  size_t boundary_cell_index_bytes = num_boundary_cells * sizeof(int);
-
-  size_t boundary_face_bytes = num_boundary_faces * sizeof(double);
-  size_t boundary_face_vec_bytes = num_boundary_faces * 3 * sizeof(double);
-  size_t boundary_face_index_bytes = num_boundary_faces * sizeof(int);
-
-  // A_csr has one more element in each row: itself
-  size_t csr_row_index_bytes = (num_cells + 1) * sizeof(int);
-  size_t csr_col_index_bytes = (num_cells + num_faces) * sizeof(int);
-  size_t csr_value_bytes = (num_cells + num_faces) * sizeof(double);
-  size_t csr_value_vec_bytes = (num_cells + num_faces) * 3 * sizeof(double);
-
-  // csr_row_index, csr_col_index, csr_diag_index are pre-computed on cpu, not computed on gpu.
-  // values of each row are stored continued.
-  // row_index stores the start offset of each row.
-  // col_index stores the column number of each row
-  // diag_index stores the serial number(in a row) of the diag element.
-  int* h_A_csr_row_index = (int*)malloc(csr_row_index_bytes);
-  int* h_A_csr_col_index = (int*)malloc(csr_col_index_bytes);
-  int* h_A_csr_diag_index = (int*)malloc(cell_index_bytes);
-
-  double* h_rho_old = (double*)malloc(cell_bytes);
-  double* h_rho_new = (double*)malloc(cell_bytes);
-  double* h_volume = (double*)malloc(cell_bytes);
-  double* h_pressure = (double*)malloc(cell_bytes);
-  double* h_velocity_old = (double*)malloc(cell_vec_bytes);
-  double* h_weight = (double*)malloc(face_bytes);
-  double* h_phi = (double*)malloc(face_bytes);
-  double* h_face_vector = (double*)malloc(face_vec_bytes);
-
-  int* h_boundary_cell_offset = (int*)malloc((num_boundary_cells+1) * sizeof(int));
-  int* h_boundary_cell_id = (int*)malloc(boundary_face_bytes);
-
-  double* h_boundary_pressure = (double*)malloc(boundary_face_bytes);
-  double* h_boundary_face_vector = (double*)malloc(boundary_face_vec_bytes);
-  double* h_internal_coeffs = (double*)malloc(boundary_face_vec_bytes);
-  double* h_boundary_coeffs = (double*)malloc(boundary_face_vec_bytes);
-
-  double* h_A_csr = (double*)malloc(csr_value_vec_bytes);
-  double* h_b = (double*)malloc(cell_vec_bytes);
-  double* h_A_csr_validation = (double*)malloc(csr_value_vec_bytes);
-  double* h_b_validation = (double*)malloc(cell_vec_bytes);
-  double* h_grad_validation = (double*)malloc(cell_vec_bytes);
-
-  double* h_turbSrc_A = (double*)malloc(csr_value_bytes);
-  double* h_turbSrc_b = (double*)malloc(cell_vec_bytes);
-
-  double *b_openfoam = (double*)malloc(cell_vec_bytes);
-  double *A_openfoam = (double*)malloc(csr_value_vec_bytes);
-
-  // Verify that allocations succeeded
-  if (h_A_csr_row_index == NULL || h_A_csr_col_index == NULL || h_A_csr_diag_index == NULL
-          || h_rho_old == NULL || h_rho_new == NULL || h_volume == NULL || h_pressure == NULL || h_velocity_old == NULL
-          || h_weight == NULL || h_phi == NULL || h_face_vector == NULL
-          || h_boundary_cell_offset == NULL || h_boundary_cell_id == NULL
-          || h_boundary_pressure == NULL || h_boundary_face_vector == NULL
-          || h_internal_coeffs == NULL || h_boundary_coeffs == NULL
-          || h_A_csr == NULL || h_b == NULL || h_A_csr_validation == NULL || h_b_validation == NULL) {
-    fprintf(stderr, "Failed to allocate host arrays!\n");
-    exit(EXIT_FAILURE);
-  }
-
-  // Load input data
-  double rdelta_t = 1.0 / 0.0001;
-  if (validation_mode) {
-    int readfile = 0;
-    readfile = fread((void*)&rdelta_t, sizeof(double), 1, fp);
-    if(readfile == 0) {
-      fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-      exit(EXIT_FAILURE);
-    }
-    fprintf(stderr, "rdelta_t: %lf\n", rdelta_t);
-
-    readfile = fread(h_A_csr_row_index, csr_row_index_bytes, 1, fp);
-    if(readfile == 0) {
-      fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-      exit(EXIT_FAILURE);
-    }
-    // for (int i = 0; i < num_cells + 1; i++)
-    //   fprintf(stderr, "h_A_csr_row_index[%d]: %d\n", i, h_A_csr_row_index[i]);
-
-    readfile = fread(h_A_csr_col_index, csr_col_index_bytes, 1, fp);
-    if(readfile == 0) {
-      fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-      exit(EXIT_FAILURE);
-    }
-    // for (int i = 0; i < num_cells + num_faces; i++)
-    //   fprintf(stderr, "h_A_csr_col_index[%d]: %d\n", i, h_A_csr_col_index[i]);
-
-    readfile = fread(h_A_csr_diag_index, cell_index_bytes, 1, fp);
-    if(readfile == 0) {
-      fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-      exit(EXIT_FAILURE);
-    }
-    // for (int i = 0; i < num_cells; i++)
-    //  fprintf(stderr, "h_A_csr_diag_index[%d]: %d\n", i, h_A_csr_diag_index[i]);
-
-    readfile = fread(h_rho_old, cell_bytes, 1, fp);
-    if(readfile == 0) {
-      fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-      exit(EXIT_FAILURE);
-    }
-    //for (int i = 0; i < num_cells; i++)
-    //  fprintf(stderr, "h_rho_old[%d]: %lf\n", i, h_rho_old[i]);
-
-    readfile = fread(h_rho_new, cell_bytes, 1, fp);
-    if(readfile == 0) {
-      fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-      exit(EXIT_FAILURE);
-    }
-    //for (int i = 0; i < num_cells; i++)
-    //  fprintf(stderr, "h_rho_new[%d]: %lf\n", i, h_rho_new[i]);
-
-    readfile = fread(h_volume, cell_bytes, 1, fp);
-    if(readfile == 0) {
-      fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-      exit(EXIT_FAILURE);
-    }
-    //for (int i = 0; i < num_cells; i++)
-    //  fprintf(stderr, "h_volume[%d]: %.16lf\n", i, h_volume[i]);
-
-    readfile = fread(h_pressure, cell_bytes, 1, fp);
-    if(readfile == 0) {
-      fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-      exit(EXIT_FAILURE);
-    }
-    // for (int i = 0; i < num_cells; i++)
-    //  fprintf(stderr, "h_pressure[%d]: %.30lf\n", i, h_pressure[i]);
-
-    readfile = fread(h_velocity_old, cell_vec_bytes, 1, fp);
-    if(readfile == 0) {
-      fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-      exit(EXIT_FAILURE);
-    }
-    //for (int i = 0; i < num_cells; i++)
-    //  fprintf(stderr, "h_velocity[%d]: (%lf, %lf, %lf)\n", i, h_volume[i * 3 + 0], h_volume[i * 3 + 1], h_volume[i * 3 + 2]);
-
-    readfile = fread(h_weight, face_bytes, 1, fp);
-    if(readfile == 0) {
-      fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-      exit(EXIT_FAILURE);
-    }
-    // for (int i = 0; i < num_faces; i++)
-    //  fprintf(stderr, "h_weight[%d]: %.20lf\n", i, h_weight[i]);
-
-    readfile = fread(h_phi, face_bytes, 1, fp);
-    if(readfile == 0) {
-      fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-      exit(EXIT_FAILURE);
-    }
-    //for (int i = 0; i < num_faces; i++)
-    //  fprintf(stderr, "h_phi[%d]: %lf\n", i, h_phi[i]);
-
-    readfile = fread(h_face_vector, face_vec_bytes, 1, fp);
-    if(readfile == 0) {
-      fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-      exit(EXIT_FAILURE);
-    }
-    // for (int i = 0; i < num_faces; i++)
-    //  fprintf(stderr, "h_face_vector[%d]: (%.20lf, %.20lf, %.20lf)\n", i, h_face_vector[i * 3 + 0], h_face_vector[i * 3 + 1], h_face_vector[i * 3 + 2]);
-
-    readfile = fread(h_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int), 1, fp);
-    if(readfile == 0) {
-      fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-      exit(EXIT_FAILURE);
-    }
-    // for (int i = 0; i < num_boundary_cells; i++)
-    //  fprintf(stderr, "h_boundary_cell_offset[%d]: %d\n", i, h_boundary_cell_offset[i]);
-
-    readfile = fread(h_boundary_cell_id, boundary_face_index_bytes, 1, fp);
-    if(readfile == 0) {
-      fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-      exit(EXIT_FAILURE);
-    }
-    // for (int i = 0; i < num_boundary_faces; i++)
-    //  fprintf(stderr, "h_boundary_cell_id[%d]: %d\n", i, h_boundary_cell_id[i]);
-
-    readfile = fread(h_boundary_pressure, boundary_face_bytes, 1, fp);
-    if(readfile == 0) {
-      fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-      exit(EXIT_FAILURE);
-    }
-    // for (int i = 0; i < num_boundary_faces; i++)
-    //  fprintf(stderr, "h_boundary_pressure[%d]: %lf\n", i, h_boundary_pressure[i]);
-
-    readfile = fread(h_boundary_face_vector, boundary_face_vec_bytes, 1, fp);
-    if(readfile == 0) {
-      fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-      exit(EXIT_FAILURE);
-    }
-    // for (int i = 0; i < num_boundary_faces; i++)
-    //  fprintf(stderr, "h_boundary_face_vector[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_boundary_face_vector[i * 3 + 0], h_boundary_face_vector[i * 3 + 1], h_boundary_face_vector[i * 3 + 2]);
-
-    readfile = fread(h_internal_coeffs, boundary_face_vec_bytes, 1, fp);
-    if(readfile == 0) {
-      fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-      exit(EXIT_FAILURE);
-    }
-    // for (int i = 0; i < num_boundary_faces; i++)
-    //  fprintf(stderr, "h_internal_coeffs[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_internal_coeffs[i * 3 + 0], h_internal_coeffs[i * 3 + 1], h_internal_coeffs[i * 3 + 2]);
-
-    readfile = fread(h_boundary_coeffs, boundary_face_vec_bytes, 1, fp);
-    if(readfile == 0) {
-      fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-      exit(EXIT_FAILURE);
-    }
-    //for (int i = 0; i < num_boundary_faces; i++)
-    //  fprintf(stderr, "h_boundary_coeffs[%d]: (%lf, %lf, %lf)\n", i, h_boundary_coeffs[i * 3 + 0], h_boundary_coeffs[i * 3 + 1], h_boundary_coeffs[i * 3 + 2]);
-
-    readfile = fread(h_A_csr_validation, csr_value_vec_bytes, 1, fp);
-    if(readfile == 0) {
-     fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-     exit(EXIT_FAILURE);
-    }
-    //for (int i = 0; i < (num_cells + num_faces) * 3; i++)
-    //  fprintf(stderr, "h_A_csr_validation[%d]: %.30lf\n", i, h_A_csr_validation[i]);
-
-    readfile = fread(h_b_validation, cell_vec_bytes, 1, fp);
-    if(readfile == 0) {
-     fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-     exit(EXIT_FAILURE);
-    }
-    // for (int i = 0; i < num_cells * 3; i++)
-    //  fprintf(stderr, "h_b_validation[%d]: %lf\n", i, h_b_validation[i]);
-
-    readfile = fread(h_grad_validation, cell_vec_bytes, 1, fp);
-    if(readfile == 0) {
-     fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-     exit(EXIT_FAILURE);
-    }
-    // for (int i = 0; i < num_cells * 3; i++)
-    //  fprintf(stderr, "h_grad_validation[%d]: %lf\n", i, h_grad_validation[i]);
-
-    readfile = fread(h_turbSrc_A, csr_value_bytes, 1, fp);
-    if(readfile == 0) {
-     fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-     exit(EXIT_FAILURE);
-    }
-    // for (int i = 0; i < (num_cells + num_faces); i++)
-    //  fprintf(stderr, "h_turbSrc_A[%d]: %.30lf\n", i, h_turbSrc_A[i]);
-
-    readfile = fread(h_turbSrc_b, cell_vec_bytes, 1, fp);
-    if(readfile == 0) {
-     fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file);
-     exit(EXIT_FAILURE);
-    }
-    // for (int i = 0; i < num_cells * 3; i++)
-    //  fprintf(stderr, "h_turbSrc_b[%d]: %lf\n", i, h_turbSrc_b[i]);
-
-    // compare with of results
-    FILE *fp1 = NULL;
-    fp1 = fopen("of_finalresult.txt", "rb+");
-    readfile = fread(A_openfoam, csr_value_vec_bytes, 1, fp1);
-    readfile = fread(b_openfoam, cell_vec_bytes, 1, fp1);
-
-  } else {
-    // Initialize row index
-    h_A_csr_row_index[0] = 0;
-    for (int i = 0; i < num_cells; ++i) {
-      int ww = i % w;
-      int hh = (i % (w * h)) / w;
-      int dd = i / (w * h);
-      int neighbor_number = 6;
-
-      if (ww == 0) {
-        neighbor_number -= 1;
-      }
-      if (ww == (w - 1)) {
-        neighbor_number -= 1;
-      }
-      if (hh == 0) {
-        neighbor_number -= 1;
-      }
-      if (hh == (h - 1)) {
-        neighbor_number -= 1;
-      }
-      if (dd == 0) {
-        neighbor_number -= 1;
-      }
-      if (dd == (d - 1)) {
-        neighbor_number -= 1;
-      }
-
-      h_A_csr_row_index[i + 1] = h_A_csr_row_index[i] + neighbor_number;
-    }
-
-    // Initialize cell fields
-    for (int i = 0; i < num_cells; ++i) {
-      h_rho_old[i] = rand() / (double)RAND_MAX;
-      h_rho_new[i] = rand() / (double)RAND_MAX;
-      h_volume[i] = rand() / (double)RAND_MAX;
-      h_pressure[i] = rand() / (double)RAND_MAX;
-      h_velocity_old[i * 3 + 0] = rand() / (double)RAND_MAX;
-      h_velocity_old[i * 3 + 1] = rand() / (double)RAND_MAX;
-      h_velocity_old[i * 3 + 2] = rand() / (double)RAND_MAX;
-    }
-
-    // Initialize A_csr_col_index, A_csr_diag_index, and the face fields.
-    for (int i = 0; i < num_cells; ++i) {
-      int row_offset =  h_A_csr_row_index[i];
-      int neighbor_offset = row_offset - i;
-
-      double weight = 0.5;
-
-      // Make sure that the values are placed in each row with the following order.
-      // (x, y, z - 1)--> column: (i - w * h)
-      // (x, y - 1, z)--> column: (i - w)
-      // (x - 1, y, z)--> column: (i - 1)
-      // (x, y, z)    --> column: (i) //only in A_csr_index and A_csr_diag_index
-      // (x + 1, y, z)--> column: (i + 1)
-      // (x, y + 1, z)--> column: (i + w)
-      // (x, y, z + 1)--> column: (i + w * h)
-
-      // (x, y, z - 1)
-      // (x, y - 1, z)
-      // (x - 1, y, z)
-      int num_lower = 0;
-      if (i - w * h >= 0) {
-        num_lower++;
-        h_A_csr_col_index[row_offset++] = i - w * h;
-
-        int face_index = neighbor_offset++;
-        h_weight[face_index] = weight;
-        h_phi[face_index] = rand() / (double)RAND_MAX;
-        h_face_vector[face_index * 3 + 0] = rand() / (double)RAND_MAX;
-        h_face_vector[face_index * 3 + 1] = rand() / (double)RAND_MAX;
-        h_face_vector[face_index * 3 + 2] = rand() / (double)RAND_MAX;
-      }
-      if (i - w >= 0) {
-        num_lower++;
-        h_A_csr_col_index[row_offset++] = i - w;
-
-        int face_index = neighbor_offset++;
-        h_weight[face_index] = weight;
-        h_phi[face_index] = rand() / (double)RAND_MAX;
-        h_face_vector[face_index * 3 + 0] = rand() / (double)RAND_MAX;
-        h_face_vector[face_index * 3 + 1] = rand() / (double)RAND_MAX;
-        h_face_vector[face_index * 3 + 2] = rand() / (double)RAND_MAX;
-      }
-      if (i - 1 >= 0) {
-        num_lower++;
-        h_A_csr_col_index[row_offset++] = i - 1;
-
-        int face_index = neighbor_offset++;
-        h_weight[face_index] = weight;
-        h_phi[face_index] = rand() / (double)RAND_MAX;
-        h_face_vector[face_index * 3 + 0] = rand() / (double)RAND_MAX;
-        h_face_vector[face_index * 3 + 1] = rand() / (double)RAND_MAX;
-        h_face_vector[face_index * 3 + 2] = rand() / (double)RAND_MAX;
-      }
-
-      // (x, y, z)
-      h_A_csr_col_index[row_offset++] = i;
-      h_A_csr_diag_index[i] = num_lower;
-
-      // (x + 1, y, z)
-      // (x, y + 1, z)
-      // (x, y, z + 1)
-      if (i + 1 < num_cells) {
-        h_A_csr_col_index[row_offset++] = i + 1;
-
-        int face_index = neighbor_offset++;
-        h_weight[face_index] = weight;
-        h_face_vector[face_index * 3 + 0] = rand() / (double)RAND_MAX;
-        h_face_vector[face_index * 3 + 1] = rand() / (double)RAND_MAX;
-        h_face_vector[face_index * 3 + 2] = rand() / (double)RAND_MAX;
-        h_face_vector[face_index] = rand() / (double)RAND_MAX;
-      }
-      if (i + w < num_cells) {
-        h_A_csr_col_index[row_offset++] = i + w;
-
-        int face_index = neighbor_offset++;
-        h_weight[face_index] = weight;
-        h_phi[face_index] = rand() / (double)RAND_MAX;
-        h_face_vector[face_index * 3 + 0] = rand() / (double)RAND_MAX;
-        h_face_vector[face_index * 3 + 1] = rand() / (double)RAND_MAX;
-        h_face_vector[face_index * 3 + 2] = rand() / (double)RAND_MAX;
-      }
-      if (i + w * h < num_cells) {
-        h_A_csr_col_index[row_offset++] = i + w * h;
-
-        int face_index = neighbor_offset++;
-        h_weight[face_index] = weight;
-        h_phi[face_index] = rand() / (double)RAND_MAX;
-        h_face_vector[face_index * 3 + 0] = rand() / (double)RAND_MAX;
-        h_face_vector[face_index * 3 + 1] = rand() / (double)RAND_MAX;
-        h_face_vector[face_index * 3 + 2] = rand() / (double)RAND_MAX;
-      }
-    }
-  }
-
-  if (fp) {
-    fclose(fp);
-  }
-
-  int total_bytes = 0;
-  int copy_bytes = 0;
-
-  // Allocate the device arrays
-  int* d_A_csr_row_index = NULL;
-  int* d_A_csr_col_index = NULL;
-  int* d_A_csr_diag_index = NULL;
-  checkCudaErrors(cudaMalloc((void**)&d_A_csr_row_index, csr_row_index_bytes));
-  checkCudaErrors(cudaMalloc((void**)&d_A_csr_col_index, csr_col_index_bytes));
-  checkCudaErrors(cudaMalloc((void**)&d_A_csr_diag_index, cell_index_bytes));
-  total_bytes += (csr_row_index_bytes + csr_col_index_bytes + cell_index_bytes);
-
-  double* d_rho_old = NULL;
-  double* d_rho_new = NULL;
-  double* d_volume = NULL;
-  double* d_pressure = NULL;
-  double* d_velocity_old = NULL;
-  double* d_weight = NULL;
-  double* d_phi = NULL;
-  double* d_face_vector = NULL;
-  checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_bytes));
-  checkCudaErrors(cudaMalloc((void**)&d_rho_new, cell_bytes));
-  checkCudaErrors(cudaMalloc((void**)&d_volume, cell_bytes));
-  checkCudaErrors(cudaMalloc((void**)&d_pressure, cell_bytes));
-  checkCudaErrors(cudaMalloc((void**)&d_velocity_old, cell_vec_bytes));
-  checkCudaErrors(cudaMalloc((void**)&d_weight, face_bytes));
-  checkCudaErrors(cudaMalloc((void**)&d_phi, face_bytes));
-  checkCudaErrors(cudaMalloc((void**)&d_face_vector, face_vec_bytes));
-  total_bytes += (cell_bytes * 4 + face_bytes * 2 + cell_vec_bytes + face_vec_bytes);
-
-  int* d_boundary_cell_offset = NULL;
-  int* d_boundary_cell_id = NULL;
-  checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int)));
-  checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_id, boundary_face_bytes));
-  total_bytes += (boundary_cell_index_bytes + (num_boundary_cells+1) * sizeof(int));
-
-  double* d_boundary_pressure = NULL;
-  double* d_boundary_face_vector = NULL;
-  double* d_internal_coeffs = NULL;
-  double* d_boundary_coeffs = NULL;
-  checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure, boundary_face_bytes));
-  checkCudaErrors(cudaMalloc((void**)&d_boundary_face_vector, boundary_face_vec_bytes));
-  checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, boundary_face_vec_bytes));
-  checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, boundary_face_vec_bytes));
-  total_bytes += (boundary_face_bytes + boundary_face_vec_bytes * 3);
-
-  double* d_A_csr = NULL;
-  double* d_b = NULL;
-  checkCudaErrors(cudaMalloc((void**)&d_A_csr, csr_value_vec_bytes));
-  checkCudaErrors(cudaMalloc((void**)&d_b, cell_vec_bytes));
-  total_bytes += (csr_value_vec_bytes + cell_vec_bytes);
-
-  double* d_turbSrc_A = NULL;
-  double* d_turbSrc_b = NULL;
-  checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A, csr_value_bytes));
-  checkCudaErrors(cudaMalloc((void**)&d_turbSrc_b, cell_vec_bytes));
-  total_bytes += (csr_value_bytes + cell_vec_bytes);
-
-  fprintf(stderr, "Total bytes malloc on GPU: %.2fMB\n", total_bytes * 1.0 / 1024 / 1024);
-
-  // Create cuda stream
-  cudaStream_t stream;
-  checkCudaErrors(cudaStreamCreate(&stream));
-
-  struct timeval start, end;
-  float duration = 0;
-  gettimeofday(&start, NULL);
-
-  // Copy the host input array in host memory to the device input array in device memory
-  printf("Copy input arrays from the host memory to the CUDA device\n");
-  checkCudaErrors(cudaMemcpyAsync(d_A_csr_row_index, h_A_csr_row_index, csr_row_index_bytes, cudaMemcpyHostToDevice, stream));
-  checkCudaErrors(cudaMemcpyAsync(d_A_csr_col_index, h_A_csr_col_index, csr_col_index_bytes, cudaMemcpyHostToDevice, stream));
-  checkCudaErrors(cudaMemcpyAsync(d_A_csr_diag_index, h_A_csr_diag_index, cell_index_bytes, cudaMemcpyHostToDevice, stream));
-  copy_bytes += (csr_row_index_bytes + csr_col_index_bytes + cell_index_bytes);
-
-  checkCudaErrors(cudaMemcpyAsync(d_rho_old, h_rho_old, cell_bytes, cudaMemcpyHostToDevice, stream));
-  checkCudaErrors(cudaMemcpyAsync(d_rho_new, h_rho_new, cell_bytes, cudaMemcpyHostToDevice, stream));
-  checkCudaErrors(cudaMemcpyAsync(d_volume, h_volume, cell_bytes, cudaMemcpyHostToDevice, stream));
-  checkCudaErrors(cudaMemcpyAsync(d_pressure, h_pressure, cell_bytes, cudaMemcpyHostToDevice, stream));
-  checkCudaErrors(cudaMemcpyAsync(d_velocity_old, h_velocity_old, cell_vec_bytes, cudaMemcpyHostToDevice, stream));
-  checkCudaErrors(cudaMemcpyAsync(d_weight, h_weight, face_bytes, cudaMemcpyHostToDevice, stream));
-  checkCudaErrors(cudaMemcpyAsync(d_phi, h_phi, face_bytes, cudaMemcpyHostToDevice, stream));
-  checkCudaErrors(cudaMemcpyAsync(d_face_vector, h_face_vector, face_vec_bytes, cudaMemcpyHostToDevice, stream));
-  copy_bytes += (cell_bytes * 4 + cell_vec_bytes + face_bytes * 2 + face_vec_bytes);
-
-  checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_offset, h_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int), cudaMemcpyHostToDevice, stream));
-  checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_id, h_boundary_cell_id, boundary_face_index_bytes, cudaMemcpyHostToDevice, stream));
-  copy_bytes += ((num_boundary_cells+1) * sizeof(int) + boundary_face_index_bytes);
-
-  checkCudaErrors(cudaMemcpyAsync(d_boundary_pressure, h_boundary_pressure, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
-  checkCudaErrors(cudaMemcpyAsync(d_boundary_face_vector, h_boundary_face_vector, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream));
-  checkCudaErrors(cudaMemcpyAsync(d_internal_coeffs, h_internal_coeffs, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream));
-  checkCudaErrors(cudaMemcpyAsync(d_boundary_coeffs, h_boundary_coeffs, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream));
-  copy_bytes += (boundary_face_bytes + boundary_face_vec_bytes * 3);
-
-  checkCudaErrors(cudaMemcpyAsync(d_turbSrc_A, h_turbSrc_A, csr_value_bytes, cudaMemcpyHostToDevice, stream));
-  checkCudaErrors(cudaMemcpyAsync(d_turbSrc_b, h_turbSrc_b, cell_vec_bytes, cudaMemcpyHostToDevice, stream));
-  copy_bytes += (csr_value_bytes + cell_vec_bytes);
-
-  // Synchronize stream
-  checkCudaErrors(cudaStreamSynchronize(stream));
-  gettimeofday(&end, NULL);
-  duration = (end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001;
-  fprintf(stderr, "memcpy time cost %lf ms.\n", duration);
-  fprintf(stderr, "bandwidth %lf GB/s.\n", (copy_bytes * 1.0 / 1024 / 1024 / 1024) / (duration / 1000));
-
-  size_t threads_per_block = 1024;
-  size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-  printf("CUDA kernel launch with %d blocks of %d threads\n", blocks_per_grid, threads_per_block);
-
-  /**
-   * start term-level
-   */
-  gettimeofday(&start, NULL);
-
-  // Memset A and b to zero
-  checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, csr_value_vec_bytes, stream));
-  checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream));
-
-  // Launch the fvm_ddt Kernel
-  fvm_ddt<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces, rdelta_t,
-          d_A_csr_row_index, d_A_csr_diag_index,
-          d_rho_old, d_rho_new, d_volume, d_velocity_old, d_A_csr, d_b, d_A_csr, d_b);
-  // Launch the fvm_div Kernel
-  fvm_div_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces,
-          d_A_csr_row_index, d_A_csr_diag_index,
-          d_weight, d_phi, d_A_csr, d_b, d_A_csr, d_b);
-  blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-  fvm_div_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces, num_boundary_cells,
-          d_A_csr_row_index, d_A_csr_diag_index,
-          d_boundary_cell_offset, d_boundary_cell_id,
-          d_internal_coeffs, d_boundary_coeffs, d_A_csr, d_b, d_A_csr, d_b);
-  // Launch the fvc_grad Kernel
-  blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-  fvc_grad_internal_face<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-          d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-          d_face_vector, d_weight, d_pressure, d_b, d_b);
-  blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-  fvc_grad_boundary_face<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_cells,
-          d_boundary_cell_offset, d_boundary_cell_id,
-          d_boundary_face_vector, d_boundary_pressure, d_b, d_b);
-  blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-  add_fvMatrix<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces,
-          d_A_csr_row_index, d_turbSrc_A, d_turbSrc_b, d_A_csr, d_b, d_A_csr, d_b);
-
-  // Synchronize stream
-  checkCudaErrors(cudaStreamSynchronize(stream));
-  gettimeofday(&end, NULL);
-  duration = (end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001;
-  fprintf(stderr, "assemble on term level, time cost %lf ms.\n", duration);
-  /**
-   * end term-level
-   */
-
-  /**
-   * start equation-level
-   */
-  //gettimeofday(&start, NULL);
-
-  //// Memset A and b to zero
-  //checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, csr_value_vec_bytes, stream));
-  //checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream));
-
-  //// Note: the implement fo assemble kernel is outdated.
-  //// Launch the assemble Kernel
-  //assemble<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces, rdelta_t, d_A_csr_row_index, d_A_csr_diag_index,
-  //        d_rho_old, d_rho_new, d_volume, d_velocity_old, d_weight, d_phi, d_face_vector, d_pressure, d_A_csr, d_b);
-
-  //// Synchronize stream
-  //checkCudaErrors(cudaStreamSynchronize(stream));
-  //gettimeofday(&end, NULL);
-  //duration = (end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001;
-  //fprintf(stderr, "assemble on equation level, time cost %lf ms.\n", duration);
-  /**
-   * end equation-level
-   */
-
-  // Sleep 10 seconds, so that user can check gpu memory usage through nvidia-smi.
-  //sleep(10);
-
-  // Check result
-  if (validation_mode) {
-    checkCudaErrors(cudaMemcpyAsync(h_A_csr, d_A_csr, csr_value_vec_bytes, cudaMemcpyDeviceToHost, stream));
-    checkCudaErrors(cudaMemcpyAsync(h_b, d_b, cell_vec_bytes, cudaMemcpyDeviceToHost, stream));
-    checkCudaErrors(cudaStreamSynchronize(stream));
-    // fprintf(stderr, "check of h_A_csr\n");
-    // checkVectorEqual((num_cells + num_faces) * 3, h_A_csr_validation, h_A_csr, 1e-5);
-    // fprintf(stderr, "check of h_b\n");
-    // checkVectorEqual(num_cells * 3, h_b_validation, h_b, 1e-5);
-    fprintf(stderr, "check of h_A_csr\n");
-    checkVectorEqual((num_cells + num_faces) * 3, A_openfoam, h_A_csr, 1e-5);
-    fprintf(stderr, "check of h_b\n");
-    checkVectorEqual(num_cells * 3, b_openfoam, h_b, 1e-5);
-  }
-
-  // Free device global memory
-  checkCudaErrors(cudaFree(d_A_csr_row_index));
-  checkCudaErrors(cudaFree(d_A_csr_col_index));
-  checkCudaErrors(cudaFree(d_A_csr_diag_index));
-  checkCudaErrors(cudaFree(d_rho_old));
-  checkCudaErrors(cudaFree(d_rho_new));
-  checkCudaErrors(cudaFree(d_volume));
-  checkCudaErrors(cudaFree(d_pressure));
-  checkCudaErrors(cudaFree(d_velocity_old));
-  checkCudaErrors(cudaFree(d_weight));
-  checkCudaErrors(cudaFree(d_phi));
-  checkCudaErrors(cudaFree(d_face_vector));
-  checkCudaErrors(cudaFree(d_boundary_cell_offset));
-  checkCudaErrors(cudaFree(d_boundary_cell_id));
-  checkCudaErrors(cudaFree(d_boundary_pressure));
-  checkCudaErrors(cudaFree(d_boundary_face_vector));
-  checkCudaErrors(cudaFree(d_internal_coeffs));
-  checkCudaErrors(cudaFree(d_boundary_coeffs));
-  checkCudaErrors(cudaFree(d_A_csr));
-  checkCudaErrors(cudaFree(d_b));
-  checkCudaErrors(cudaStreamDestroy(stream));
-
-  // Free host memory
-  free(h_A_csr_row_index);
-  free(h_A_csr_col_index);
-  free(h_A_csr_diag_index);
-  free(h_rho_old);
-  free(h_rho_new);
-  free(h_volume);
-  free(h_pressure);
-  free(h_velocity_old);
-  free(h_weight);
-  free(h_phi);
-  free(h_face_vector);
-  free(h_boundary_cell_offset);
-  free(h_boundary_cell_id);
-  free(h_boundary_pressure);
-  free(h_boundary_face_vector);
-  free(h_internal_coeffs);
-  free(h_boundary_coeffs);
-  free(h_A_csr);
-  free(h_b);
-  free(h_A_csr_validation);
-  free(h_b_validation);
-
-  printf("Done\n");
-  return 0;
-}
-