diff --git a/GPUTest/GPUTestBase.H b/GPUTest/GPUTestBase.H new file mode 100644 index 000000000..2520485a7 --- /dev/null +++ b/GPUTest/GPUTestBase.H @@ -0,0 +1,646 @@ + +enum initType{ + original, + randomInit +}; + +struct testGPUDataBase { + // some fvm ops don't use d_source; + // some fvm ops don't use d_internal_coeffs and d_boundary_coeffs; + // all the fvc ops only use d_source + double *d_lower = nullptr; + double *d_upper = nullptr; + double *d_diag = nullptr; + double *d_source = nullptr; + double *d_internal_coeffs = nullptr; + double *d_boundary_coeffs = nullptr; + + double *d_value_internal_coeffs = nullptr; + double *d_value_boundary_coeffs = nullptr; + double *d_gradient_internal_coeffs = nullptr; + double *d_gradient_boundary_coeffs = nullptr; + + std::vector patch_type; + + // constructor + testGPUDataBase() {} + + // deconstructor + ~testGPUDataBase() { + if (d_lower) checkCudaErrors(cudaFree(d_lower)); + if (d_upper) checkCudaErrors(cudaFree(d_upper)); + if (d_diag) checkCudaErrors(cudaFree(d_diag)); + if (d_source) checkCudaErrors(cudaFree(d_source)); + if (d_internal_coeffs) checkCudaErrors(cudaFree(d_internal_coeffs)); + if (d_boundary_coeffs) checkCudaErrors(cudaFree(d_boundary_coeffs)); + + if (d_value_internal_coeffs) checkCudaErrors(cudaFree(d_value_internal_coeffs)); + if (d_value_boundary_coeffs) checkCudaErrors(cudaFree(d_value_boundary_coeffs)); + if (d_gradient_internal_coeffs) checkCudaErrors(cudaFree(d_gradient_internal_coeffs)); + if (d_gradient_boundary_coeffs) checkCudaErrors(cudaFree(d_gradient_boundary_coeffs)); + } +}; + +template +void getTypeInfo(size_t *stride, size_t *internal_size, size_t *boundary_size) { + size_t s = 1; + bool isVol = false; + if (typeid(T) == typeid(surfaceScalarField)) { + s = 1; + isVol = false; + } else if (typeid(T) == typeid(surfaceVectorField)) { + s = 3; + isVol = false; + } else if (typeid(T) == typeid(surfaceTensorField)) { + s = 9; + isVol = false; + } else if (typeid(T) == 
typeid(volScalarField)) { + s = 1; + isVol = true; + } else if (typeid(T) == typeid(volVectorField)) { + s = 3; + isVol = true; + } else if (typeid(T) == typeid(volTensorField)) { + s = 9; + isVol = true; + } else { + fprintf(stderr, "ERROR! Unsupported field type()!\n"); + exit(EXIT_FAILURE); + } + *stride = s; + *internal_size = (isVol ? dfDataBase.num_cells : dfDataBase.num_surfaces) * s; + *boundary_size = dfDataBase.num_boundary_surfaces * s; +} + + +template +void getFieldPtr(std::queue& fieldPtrQue, T& field){ + fieldPtrQue.push(&field[0]); + forAll(field.boundaryField(), patchi){ + auto& patchField = field.boundaryFieldRef()[patchi]; + fieldPtrQue.push(&patchField[0]); + } +}; + +// template +// void getFieldPtr(std::vector& fieldPtrQue, T& field){ +// fieldPtrQue.push_back(&field[0]); +// forAll(field.boundaryField(), patchi){ +// auto& patchField = field.boundaryFieldRef()[patchi]; +// fieldPtrQue.push_back(&patchField[0]); +// Info << "patchi " << patchi << endl; +// } +// }; + + +template +void randomInitField(T& field) { + size_t stride = 0; + size_t internal_size = 0; + size_t boundary_size = 0; + getTypeInfo(&stride, &internal_size, &boundary_size); + size_t internal_value_bytes = internal_size * sizeof(double) * stride; + std::queue fieldPtrQue; + // std::vector fieldPtrQue; + getFieldPtr(fieldPtrQue, field); + + // random init field value to (-0.5, 0.5) + // internal + double *&field_internal_ptr = fieldPtrQue.front(); fieldPtrQue.pop(); + // double *field_internal_ptr = fieldPtrQue[0]; + std::vector init_field_internal; + init_field_internal.resize(internal_size * stride); + for (size_t i = 0; i < internal_size * stride; i++) { + init_field_internal[i] = (rand() % 10000 - 5000) / 10000.0; + } + memcpy(field_internal_ptr, init_field_internal.data(), internal_value_bytes); + // boundary + int ptrIndex = 1; + forAll(field.boundaryField(), patchi) + { + auto& patchField = field.boundaryFieldRef()[patchi]; + size_t patchsize = patchField.size(); + 
double *&field_boundary_ptr = fieldPtrQue.front(); fieldPtrQue.pop(); + // double *field_boundary_ptr = fieldPtrQue[ptrIndex]; + // ptrIndex ++; + std::vector init_field_boundary; + init_field_boundary.resize(patchsize * stride); + for (size_t i = 0; i < patchsize * stride; i++) { + init_field_boundary[i] = (rand() % 10000 - 5000) / 10000.0; + } + memcpy(field_boundary_ptr, init_field_boundary.data(), patchsize * stride * sizeof(double)); + } +} + +template +void uploadRegisteredField(dfMatrixDataBase& dfDataBase, const T& field, const char* fieldAlias) { + size_t stride = 0; + size_t internal_size = 0; + size_t boundary_size = 0; + getTypeInfo(&stride, &internal_size, &boundary_size); + size_t internal_value_bytes = internal_size * sizeof(double); + size_t boundary_value_bytes = boundary_size * sizeof(double); + + double *h_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::internal); + double *h_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::boundary); + double *d_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::internal); + double *d_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::boundary); + + // internal + memcpy(h_internal_field, &field[0], internal_value_bytes); + // boundary + int offset = 0; + forAll(field.boundaryField(), patchi) + { + const auto& patchField = field.boundaryField()[patchi]; + int patchsize = patchField.size(); + memcpy(h_boundary_field + offset * stride, &patchField[0], patchsize * stride * sizeof(double)); + offset += patchsize; + } + // transfer + checkCudaErrors(cudaMemcpyAsync(d_internal_field, h_internal_field, internal_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, boundary_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); +} + +template +void uploadField(dfMatrixDataBase& dfDataBase, const T& field, double *d_field, 
double *d_boundary_field) { + size_t stride = 0; + size_t internal_size = 0; + size_t boundary_size = 0; + getTypeInfo(&stride, &internal_size, &boundary_size); + size_t internal_value_bytes = internal_size * sizeof(double); + size_t boundary_value_bytes = boundary_size * sizeof(double); + + std::vector h_boundary_field; + h_boundary_field.resize(boundary_size); + int offset = 0; + forAll(field.boundaryField(), patchi) + { + const auto& patchField = field.boundaryField()[patchi]; + int patchsize = patchField.size(); + memcpy(h_boundary_field.data() + offset * stride, &patchField[0], patchsize * stride * sizeof(double)); + offset += patchsize; + } + checkCudaErrors(cudaMemcpyAsync(d_field, &field[0], internal_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field.data(), boundary_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); +} + +template +void buildTestGPUDataBase(const dfMatrixDataBase& dfDataBase, testGPUDataBase& testData, const T& field, + bool lowerFlag, bool upperFlag, bool diagFlag, bool sourceFlag, bool internalCoeffsFlag, bool boundaryCoeffsFlag, + bool valueInternalCoeffsFlag, bool valueBoundaryCoeffsFlag, bool gradientInternalCoeffsFlag, bool gradientBoundaryCoeffsFlag) { + if ((typeid(T) != typeid(volScalarField)) && (typeid(T) != typeid(volVectorField))) { + fprintf(stderr, "ERROR! Unsupported field type()!\n"); + exit(EXIT_FAILURE); + } + bool isVec = (typeid(T) == typeid(volVectorField)); + size_t stride = isVec ? 
3 : 1; + + // ldu + if (lowerFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_lower, dfDataBase.surface_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_lower, 0, dfDataBase.surface_value_bytes)); + } + if (upperFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_upper, dfDataBase.surface_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_upper, 0, dfDataBase.surface_value_bytes)); + } + if (diagFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_diag, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_diag, 0, dfDataBase.cell_value_bytes)); + } + if (sourceFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_source, dfDataBase.cell_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_source, 0, dfDataBase.cell_value_bytes * stride)); + } + if (internalCoeffsFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_internal_coeffs, dfDataBase.boundary_surface_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_internal_coeffs, 0, dfDataBase.boundary_surface_value_bytes * stride)); + } + if (boundaryCoeffsFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_boundary_coeffs, 0, dfDataBase.boundary_surface_value_bytes * stride)); + } + // boundary coeffs + if (valueInternalCoeffsFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_value_internal_coeffs, dfDataBase.boundary_surface_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_value_internal_coeffs, 0, dfDataBase.boundary_surface_value_bytes * stride)); + } + if (valueBoundaryCoeffsFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_value_boundary_coeffs, dfDataBase.boundary_surface_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_value_boundary_coeffs, 0, dfDataBase.boundary_surface_value_bytes * stride)); + } + if (gradientInternalCoeffsFlag) { + 
checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_internal_coeffs, dfDataBase.boundary_surface_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_gradient_internal_coeffs, 0, dfDataBase.boundary_surface_value_bytes * stride)); + } + if (gradientBoundaryCoeffsFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_boundary_coeffs, dfDataBase.boundary_surface_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_gradient_boundary_coeffs, 0, dfDataBase.boundary_surface_value_bytes * stride)); + } + // patch type + testData.patch_type.resize(dfDataBase.num_patches); + forAll(field.boundaryField(), patchi) + { + constructBoundarySelectorPerPatch(&(testData.patch_type[patchi]), field.boundaryField()[patchi].type()); + } +} + +// TODO: It seems that compareResult of scalar and vector can't be merged +void compareResultVector(const dfMatrixDataBase& dfDataBase, const testGPUDataBase& testData, fvVectorMatrix& dfMatrix, bool printFlag) { + //if ((typeid(T) != typeid(fvScalarMatrix)) && (typeid(T) != typeid(fvVectorMatrix))) { + // fprintf(stderr, "ERROR! Unsupported field type()!\n"); + // exit(EXIT_FAILURE); + //} + //bool isVec = (typeid(T) == typeid(fvVectorMatrix)); + //size_t stride = isVec ? 
3 : 1; + + size_t stride = 3; + if (testData.d_lower) { + std::vector h_lower; + h_lower.resize(dfDataBase.num_surfaces); + checkCudaErrors(cudaMemcpy(h_lower.data(), testData.d_lower, dfDataBase.surface_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_surfaces, &dfMatrix.lower()[0], h_lower.data(), 1e-14, printFlag); + } + if (testData.d_upper) { + std::vector h_upper; + h_upper.resize(dfDataBase.num_surfaces); + checkCudaErrors(cudaMemcpy(h_upper.data(), testData.d_upper, dfDataBase.surface_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_surfaces, &dfMatrix.upper()[0], h_upper.data(), 1e-14, printFlag); + } + if (testData.d_diag) { + std::vector h_diag; + h_diag.resize(dfDataBase.num_cells); + checkCudaErrors(cudaMemcpy(h_diag.data(), testData.d_diag, dfDataBase.cell_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells, &dfMatrix.diag()[0], h_diag.data(), 1e-14, printFlag); + } + if (testData.d_source) { + std::vector h_source; + h_source.resize(dfDataBase.num_cells * stride); + checkCudaErrors(cudaMemcpy(h_source.data(), testData.d_source, dfDataBase.cell_value_bytes * stride, cudaMemcpyDeviceToHost)); + //void *source_ptr = isVec ? (&dfMatrix.source()[0][0]) : (&dfMatrix.source()[0]); + double *source_ptr = &dfMatrix.source()[0][0]; + checkVectorEqual(dfDataBase.num_cells * stride, source_ptr, h_source.data(), 1e-14, printFlag); + } + if (testData.d_internal_coeffs) { + std::vector h_internal_coeffs; + h_internal_coeffs.resize(dfDataBase.num_boundary_surfaces * stride); + checkCudaErrors(cudaMemcpy(h_internal_coeffs.data(), testData.d_internal_coeffs, dfDataBase.boundary_surface_value_bytes * stride, cudaMemcpyDeviceToHost)); + std::vector cpu_internal_coeffs(dfDataBase.num_boundary_surfaces * stride); + int offset = 0; + for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) + { + int patchsize = dfDataBase.patch_size[patchi]; + //const void* internal_coeff_ptr = isVec ? 
(&dfMatrix.internalCoeffs()[patchi][0][0]) : (&dfMatrix.internalCoeffs()[patchi][0]); + const void* internal_coeff_ptr = &dfMatrix.internalCoeffs()[patchi][0][0]; + memcpy(cpu_internal_coeffs.data() + offset * stride, internal_coeff_ptr, patchsize * stride * sizeof(double)); + offset += patchsize; + } + checkVectorEqual(dfDataBase.num_boundary_surfaces * stride, cpu_internal_coeffs.data(), h_internal_coeffs.data(), 1e-14, printFlag); + } + if (testData.d_boundary_coeffs) { + std::vector h_boundary_coeffs; + h_boundary_coeffs.resize(dfDataBase.num_boundary_surfaces * stride); + checkCudaErrors(cudaMemcpy(h_boundary_coeffs.data(), testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_bytes * stride, cudaMemcpyDeviceToHost)); + std::vector cpu_boundary_coeffs(dfDataBase.num_boundary_surfaces * stride); + int offset = 0; + for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) + { + int patchsize = dfDataBase.patch_size[patchi]; + //const void* boundary_coeff_ptr = isVec ? (&dfMatrix.boundaryCoeffs()[patchi][0][0]) : (&dfMatrix.boundaryCoeffs()[patchi][0]); + const void* boundary_coeff_ptr = &dfMatrix.boundaryCoeffs()[patchi][0][0]; + memcpy(cpu_boundary_coeffs.data() + offset * stride, boundary_coeff_ptr, patchsize * stride * sizeof(double)); + offset += patchsize; + } + checkVectorEqual(dfDataBase.num_boundary_surfaces * stride, cpu_boundary_coeffs.data(), h_boundary_coeffs.data(), 1e-14, printFlag); + } +} + +// unittest of fvm::ddt(rho, U) +void test_fvm_ddt_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& rho, volVectorField& U, initType type) { + if (type == initType::randomInit) { + rho.oldTime(); + randomInitField(rho); + } + + // run CPU + // fvVectorMatrix dfMatrix = fvm::ddt(rho, U); + fvVectorMatrix dfMatrix = EulerDdtSchemeFvmDdt(rho, U); + + // prepare for run GPU + // prepare rho, rho.old, U + uploadRegisteredField(dfDataBase, rho, "rho"); + uploadRegisteredField(dfDataBase, rho.oldTime(), "rho_old"); + 
uploadRegisteredField(dfDataBase, U.oldTime(), "u"); + // prepare testData + testGPUDataBase testData; + // only use diag and source + buildTestGPUDataBase(dfDataBase, testData, U, false, false, true, true, false, false, false, false, false, false); + // run GPU + fvm_ddt_vector(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.rdelta_t, + dfDataBase.d_rho, dfDataBase.d_rho_old, dfDataBase.d_u, dfDataBase.d_volume, + testData.d_diag, testData.d_source); + + // compare result + bool printFlag = false; + compareResultVector(dfDataBase, testData, dfMatrix, printFlag); +} + +// unittest of fvm::div(phi, U) +void test_fvm_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfaceScalarField& phi, volVectorField& U, initType type) { + if (type == initType::randomInit) { + phi.oldTime(); + randomInitField(phi); + } + + // run CPU + // fvVectorMatrix dfMatrix = fvm::div(phi, U); + fvVectorMatrix dfMatrix = gaussConvectionSchemeFvmDiv(phi, U); + + // prepare for run GPU + // prepare phi field + uploadRegisteredField(dfDataBase, phi, "phi"); + // prepare testData + testGPUDataBase testData; + // not use source + // gradient_internal_coeffs, gradient_boundary_coeffs are not needed actually, but update_boundary_coeffs_vector will access them + buildTestGPUDataBase(dfDataBase, testData, U, true, true, true, false, true, true, true, true, true, true); + // prepare boundary coeffs + // TODO: updating boundary coeffs should be complemented later + update_boundary_coeffs_vector(dfDataBase.stream, dfDataBase.num_patches, + dfDataBase.patch_size.data(), testData.patch_type.data(), + testData.d_value_internal_coeffs, testData.d_value_boundary_coeffs, + testData.d_gradient_internal_coeffs, testData.d_gradient_boundary_coeffs); + + // run GPU + fvm_div_vector(dfDataBase.stream, dfDataBase.num_surfaces, dfDataBase.d_owner, dfDataBase.d_neighbor, + dfDataBase.d_phi, dfDataBase.d_weight, + testData.d_lower, testData.d_upper, testData.d_diag, // end for internal + 
dfDataBase.num_patches, dfDataBase.patch_size.data(), testData.patch_type.data(), + dfDataBase.d_boundary_phi, testData.d_value_internal_coeffs, testData.d_value_boundary_coeffs, + testData.d_internal_coeffs, testData.d_boundary_coeffs); + + // compare result + bool printFlag = false; + compareResultVector(dfDataBase, testData, dfMatrix, printFlag); +} + +// unittest of fvm::laplacian(gamma, vf) +void test_fvm_laplacian_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, + volScalarField& gamma, volVectorField& U, initType type) +{ + if (type == initType::randomInit) { + gamma.oldTime(); + randomInitField(gamma); + } + + // run CPU + // fvVectorMatrix dfMatrix = fvm::laplacian(gamma, U); + fvVectorMatrix dfMatrix = gaussLaplacianSchemeFvmLaplacian(gamma, U); + + // prepare for run GPU + // prepare gamma on GPU + double *d_gamma = nullptr; + double *d_boundary_gamma = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_gamma, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_gamma, dfDataBase.boundary_surface_value_bytes)); + uploadField(dfDataBase, gamma, d_gamma, d_boundary_gamma); + // prepare testData + testGPUDataBase testData; + // not use source + // value_internal_coeffs, value_boundary_coeffs are not needed actually, but update_boundary_coeffs_vector will access them + buildTestGPUDataBase(dfDataBase, testData, U, true, true, true, false, true, true, true, true, true, true); + // prepare boundary coeffs + // TODO: updating boundary coeffs should be complemented later + update_boundary_coeffs_vector(dfDataBase.stream, dfDataBase.num_patches, + dfDataBase.patch_size.data(), testData.patch_type.data(), + testData.d_value_internal_coeffs, testData.d_value_boundary_coeffs, + testData.d_gradient_internal_coeffs, testData.d_gradient_boundary_coeffs); + + // run GPU + fvm_laplacian_vector(dfDataBase.stream, dfDataBase.num_surfaces, + dfDataBase.d_owner, dfDataBase.d_neighbor, + dfDataBase.d_weight, dfDataBase.d_mag_sf, 
dfDataBase.d_delta_coeffs, d_gamma, + testData.d_lower, testData.d_upper, testData.d_diag, // end for internal + dfDataBase.num_patches, dfDataBase.patch_size.data(), testData.patch_type.data(), + dfDataBase.d_boundary_mag_sf, d_boundary_gamma, + testData.d_gradient_internal_coeffs, testData.d_gradient_boundary_coeffs, + testData.d_internal_coeffs, testData.d_boundary_coeffs); + + // compare result + bool printFlag = false; + compareResultVector(dfDataBase, testData, dfMatrix, printFlag); + + // free resources + checkCudaErrors(cudaFree(d_gamma)); + checkCudaErrors(cudaFree(d_boundary_gamma)); +} + +// unittest of fvc::ddt(rho, K) +void test_fvc_ddt_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& rho, volScalarField& K, initType type) { + if (type == initType::randomInit) { + rho.oldTime(); + randomInitField(rho); + K.oldTime(); + randomInitField(K); + } + + // run CPU + // volScalarField fvc_ouput_scalar = fvc::ddt(rho, K); + volScalarField fvc_ouput_scalar = EulerDdtSchemeFvcDdt(rho, K); + + // prepare for run GPU + // prepare rho, rho.old on GPU + uploadRegisteredField(dfDataBase, rho, "rho"); + uploadRegisteredField(dfDataBase, rho.oldTime(), "rho_old"); + // prepare K, K_old on GPU + double *d_K = nullptr; + double *d_K_old = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_K, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_K_old, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMemcpyAsync(d_K, &K[0], dfDataBase.cell_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(d_K_old, &K.oldTime()[0], dfDataBase.cell_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + // there is no need for fvc ops to build testGPUDataBase, just build d_fvc_ouput_scalar directly. 
+ double *d_fvc_ouput_scalar = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_scalar, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_scalar, 0, dfDataBase.cell_value_bytes)); + // run GPU + // fvc_ddt doesn't consider to add fvc_output to source yet, which needs (fvc_output * volume * sign). + fvc_ddt_scalar(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.rdelta_t, + dfDataBase.d_rho, dfDataBase.d_rho_old, d_K, d_K_old, + d_fvc_ouput_scalar); + + // compare result + bool printFlag = false; + std::vector h_fvc_ouput_scalar; + h_fvc_ouput_scalar.resize(dfDataBase.num_cells); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_scalar.data(), d_fvc_ouput_scalar, dfDataBase.cell_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells, &fvc_ouput_scalar[0], h_fvc_ouput_scalar.data(), 1e-14, printFlag); + + // free resources + checkCudaErrors(cudaFree(d_K)); + checkCudaErrors(cudaFree(d_K_old)); + checkCudaErrors(cudaFree(d_fvc_ouput_scalar)); +} + +// unittest of fvc::grad(U) +void test_fvc_grad_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volVectorField& U, initType type) { + if (type == initType::randomInit) { + U.oldTime(); + randomInitField(U); + } + + // run CPU + // volTensorField fvc_ouput_tensor = fvc::grad(U); + volTensorField fvc_ouput_tensor = gaussGradSchemeGrad(U); + + // prepare for run GPU + // prepare U on GPU + uploadRegisteredField(dfDataBase, U, "u"); + + double *d_fvc_ouput_tensor = nullptr, *d_fvc_ouput_boundary_tensor = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_tensor, dfDataBase.cell_value_tsr_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_boundary_tensor, dfDataBase.boundary_surface_value_tsr_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_tensor, 0, dfDataBase.cell_value_tsr_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_boundary_tensor, 0, dfDataBase.boundary_surface_value_tsr_bytes)); + + // only need patch_type + testGPUDataBase testData; + 
buildTestGPUDataBase(dfDataBase, testData, U, false, false, false, false, false, false, false, false, false, false); + + fvc_grad_vector(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.num_surfaces, + dfDataBase.d_owner, dfDataBase.d_neighbor, + dfDataBase.d_weight, dfDataBase.d_sf, dfDataBase.d_u, d_fvc_ouput_tensor, + dfDataBase.num_patches, dfDataBase.patch_size.data(), testData.patch_type.data(), + dfDataBase.d_boundary_face_cell, dfDataBase.d_boundary_u, dfDataBase.d_boundary_sf, + dfDataBase.d_volume, dfDataBase.d_boundary_mag_sf, d_fvc_ouput_boundary_tensor, dfDataBase.d_boundary_delta_coeffs); + + // compare result + bool printFlag = false; + std::vector h_fvc_ouput_tensor(dfDataBase.num_cells * 9); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_tensor.data(), d_fvc_ouput_tensor, dfDataBase.cell_value_tsr_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells * 9, &fvc_ouput_tensor[0][0], h_fvc_ouput_tensor.data(), 1e-14, printFlag); +} + +// unittest of fvc::div(phi) +void test_fvc_div_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfaceScalarField& phi, initType type) { + if (type == initType::randomInit) { + phi.oldTime(); + randomInitField(phi); + } + + // run CPU + volScalarField fvc_ouput_scalar = fvc::div(phi); + // volScalarField fvc_ouput_scalar = gaussConvectionSchemeFvcDiv(phi); + + // prepare for run GPU + // prepare phi on GPU + uploadRegisteredField(dfDataBase, phi, "phi"); + + double *d_fvc_ouput_scalar = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_scalar, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_scalar, 0, dfDataBase.cell_value_bytes)); + + fvc_div_surface_scalar(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.num_surfaces, dfDataBase.num_boundary_surfaces, + dfDataBase.d_owner, dfDataBase.d_neighbor, + dfDataBase.d_phi, dfDataBase.d_boundary_face_cell, dfDataBase.d_boundary_phi, dfDataBase.d_volume, d_fvc_ouput_scalar); + + // compare result + bool printFlag = 
false; + std::vector h_fvc_ouput_scalar(dfDataBase.num_cells); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_scalar.data(), d_fvc_ouput_scalar, dfDataBase.cell_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells, &fvc_ouput_scalar[0], h_fvc_ouput_scalar.data(), 1e-14, printFlag); +} + +// unittest of fvc::div(U) +void test_fvc_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volVectorField& U, initType type) { + if (type == initType::randomInit) { + U.oldTime(); + randomInitField(U); + } + + // run CPU + // volScalarField fvc_ouput_scalar = fvc::div(U); + volScalarField fvc_ouput_scalar = gaussDivFvcdiv(U); + + // prepare for run GPU + // prepare phi on GPU + uploadRegisteredField(dfDataBase, U, "u"); + + double *d_fvc_ouput_scalar = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_scalar, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_scalar, 0, dfDataBase.cell_value_bytes)); + + // only need patch_type + testGPUDataBase testData; + buildTestGPUDataBase(dfDataBase, testData, U, false, false, false, false, false, false, false, false, false, false); + + fvc_div_cell_vector(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.num_surfaces, + dfDataBase.d_owner, dfDataBase.d_neighbor, + dfDataBase.d_weight, dfDataBase.d_sf, dfDataBase.d_u, d_fvc_ouput_scalar, + dfDataBase.num_patches, dfDataBase.patch_size.data(), testData.patch_type.data(), + dfDataBase.d_boundary_face_cell, dfDataBase.d_boundary_u, dfDataBase.d_boundary_sf, + dfDataBase.d_volume); + + // compare result + bool printFlag = false; + std::vector h_fvc_ouput_scalar(dfDataBase.num_cells); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_scalar.data(), d_fvc_ouput_scalar, dfDataBase.cell_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells, &fvc_ouput_scalar[0], h_fvc_ouput_scalar.data(), 1e-14, printFlag); +} + +// unittest of fvc::grad(p) +void test_fvc_grad_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, 
volScalarField& p, initType type) { + if (type == initType::randomInit) { + p.oldTime(); + randomInitField(p); + } + + // run CPU + // volVectorField fvc_ouput_vector = fvc::grad(p); + volVectorField fvc_ouput_vector = gaussGradSchemeGrad(p); + + // prepare for run GPU + // prepare p on GPU + uploadRegisteredField(dfDataBase, p, "p"); + + double *d_fvc_ouput_vector = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_vector, dfDataBase.cell_value_vec_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_vector, 0, dfDataBase.cell_value_vec_bytes)); + + // only need patch_type + testGPUDataBase testData; + buildTestGPUDataBase(dfDataBase, testData, p, false, false, false, false, false, false, false, false, false, false); + + fvc_grad_cell_scalar(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.num_surfaces, + dfDataBase.d_owner, dfDataBase.d_neighbor, + dfDataBase.d_weight, dfDataBase.d_sf, dfDataBase.d_p, d_fvc_ouput_vector, + dfDataBase.num_patches, dfDataBase.patch_size.data(), testData.patch_type.data(), + dfDataBase.d_boundary_face_cell, dfDataBase.d_boundary_p, dfDataBase.d_boundary_sf, dfDataBase.d_volume); + + // compare result + bool printFlag = false; + std::vector h_fvc_ouput_vector(dfDataBase.num_cells * 3); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_vector.data(), d_fvc_ouput_vector, dfDataBase.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells * 3, &fvc_ouput_vector[0][0], h_fvc_ouput_vector.data(), 1e-14, printFlag); +} + + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // +template <> +void getFieldPtr(std::queue& fieldPtrQue, volVectorField& field) { + fieldPtrQue.push(&field[0][0]); + forAll(field.boundaryField(), patchi){ + auto& patchField = field.boundaryFieldRef()[patchi]; + fieldPtrQue.push(&patchField[0][0]); + } +}; + +template <> +void getFieldPtr(std::queue& fieldPtrQue, volTensorField& field) { + fieldPtrQue.push(&field[0][0]); + forAll(field.boundaryField(), 
patchi){ + auto& patchField = field.boundaryFieldRef()[patchi]; + fieldPtrQue.push(&patchField[0][0]); + } +}; \ No newline at end of file diff --git a/GPUTest/GPUTestRefBase.H b/GPUTest/GPUTestRefBase.H new file mode 100644 index 000000000..754219e64 --- /dev/null +++ b/GPUTest/GPUTestRefBase.H @@ -0,0 +1,63 @@ + +// unittest of fvc::grad(U) +void test_fvc_grad_vector_orig(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volVectorField& U, initType type, + dfMatrixDataBaseOrig* dfDataBaseOrig) +{ + if (type == initType::randomInit) { + U.oldTime(); + randomInitField(U); + } + + // run CPU + // volTensorField fvc_ouput_tensor = fvc::grad(U); + volTensorField fvc_ouput_tensor = gaussGradSchemeGrad(U); + + // prepare for run GPU + // prepare U on GPU + uploadRegisteredField(dfDataBase, U, "u"); + + double *d_fvc_ouput_tensor = nullptr, *d_fvc_ouput_boundary_tensor = nullptr, *d_fvc_ouput_boundary_tensor_init = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_tensor, dfDataBase.cell_value_tsr_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_boundary_tensor, dfDataBase.boundary_surface_value_tsr_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_boundary_tensor_init, dfDataBase.boundary_surface_value_tsr_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_tensor, 0, dfDataBase.cell_value_tsr_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_boundary_tensor, 0, dfDataBase.boundary_surface_value_tsr_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_boundary_tensor_init, 0, dfDataBase.boundary_surface_value_tsr_bytes)); + + fvc_grad_vector_orig(dfDataBase.stream, dfDataBaseOrig, dfDataBase, d_fvc_ouput_tensor, d_fvc_ouput_boundary_tensor_init, d_fvc_ouput_boundary_tensor); + + // compare result + bool printFlag = false; + std::vector h_fvc_ouput_tensor(dfDataBase.num_cells * 9); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_tensor.data(), d_fvc_ouput_tensor, dfDataBase.cell_value_tsr_bytes, cudaMemcpyDeviceToHost)); + 
checkVectorEqual(dfDataBase.num_cells * 9, &fvc_ouput_tensor[0][0], h_fvc_ouput_tensor.data(), 1e-14, printFlag); +} + +void test_fvc_grad_scalar_orig(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& p, initType type, + dfMatrixDataBaseOrig* dfDataBaseOrig) +{ + if (type == initType::randomInit) { + p.oldTime(); + randomInitField(p); + } + + // run CPU + // volVectorField fvc_ouput_vector = fvc::grad(p); + volVectorField fvc_ouput_vector = gaussGradSchemeGrad(p); + + // prepare for run GPU + // prepare p on GPU + uploadRegisteredField(dfDataBase, p, "p"); + + double *d_fvc_ouput_vector = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_vector, dfDataBase.cell_value_vec_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_vector, 0, dfDataBase.cell_value_vec_bytes)); + + fvc_grad_scalar_orig(dfDataBase.stream, dfDataBaseOrig, dfDataBase, d_fvc_ouput_vector); + + // compare result + bool printFlag = false; + std::vector h_fvc_ouput_vector(dfDataBase.num_cells * 3); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_vector.data(), d_fvc_ouput_vector, dfDataBase.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells * 3, &fvc_ouput_vector[0][0], h_fvc_ouput_vector.data(), 1e-14, printFlag); +} \ No newline at end of file diff --git a/GPUTest/Make/files b/GPUTest/Make/files new file mode 100644 index 000000000..d78085ff8 --- /dev/null +++ b/GPUTest/Make/files @@ -0,0 +1,4 @@ +unittest.C + +EXE = $(DF_APPBIN)/unitTest + diff --git a/GPUTest/Make/options b/GPUTest/Make/options new file mode 100644 index 000000000..e8e07b6a5 --- /dev/null +++ b/GPUTest/Make/options @@ -0,0 +1,50 @@ +-include $(GENERAL_RULES)/mplibType + +EXE_INC = -std=c++14 \ + -g \ + -fopenmp \ + -Wno-unused-variable \ + -Wno-unused-but-set-variable \ + -Wno-old-style-cast \ + $(PFLAGS) $(PINC) \ + $(if $(LIBTORCH_ROOT),-DUSE_LIBTORCH,) \ + $(if $(PYTHON_INC_DIR),-DUSE_PYTORCH,) \ + -I$(LIB_SRC)/transportModels/compressible/lnInclude \ + 
-I$(LIB_SRC)/thermophysicalModels/basic/lnInclude \ + -I$(LIB_SRC)/TurbulenceModels/turbulenceModels/lnInclude \ + -I$(LIB_SRC)/TurbulenceModels/compressible/lnInclude \ + -I$(LIB_SRC)/finiteVolume/cfdTools \ + -I$(LIB_SRC)/finiteVolume/lnInclude \ + -I$(LIB_SRC)/meshTools/lnInclude \ + -I$(LIB_SRC)/sampling/lnInclude \ + -I$(LIB_SRC)/dynamicFvMesh/lnInclude \ + -I$(LIB_SRC)/Pstream/mpi \ + -I$(DF_SRC)/dfCanteraMixture/lnInclude \ + -I$(DF_SRC)/dfChemistryModel/lnInclude \ + -I$(DF_SRC)/dfCombustionModels/lnInclude \ + -I$(CANTERA_ROOT)/include \ + -I$(DF_ROOT)/src_gpu \ + -I$(DF_ROOT)/src_gpu_orig \ + -I$(DF_ROOT)/GPUTestRef/lnInclude \ + -I/usr/local/cuda-11.6/include \ + -I$(AMGX_DIR)/include + +EXE_LIBS = \ + -lcompressibleTransportModels \ + -lturbulenceModels \ + -lfiniteVolume \ + -lmeshTools \ + -lsampling \ + -L$(DF_LIBBIN) \ + -ldfFluidThermophysicalModels \ + -ldfCompressibleTurbulenceModels \ + -ldfCanteraMixture \ + -ldfChemistryModel \ + -ldfCombustionModels \ + -ldfGenMatrix \ + $(CANTERA_ROOT)/lib/libcantera.so \ + /usr/local/cuda-11.6/lib64/libcudart.so \ + $(AMGX_DIR)/build/libamgxsh.so \ + $(DF_ROOT)/src_gpu/build/libdfMatrix.so \ + $(DF_ROOT)/src_gpu_orig/build/libdfMatrixOrig.so + diff --git a/GPUTest/correctPhi.H b/GPUTest/correctPhi.H new file mode 100644 index 000000000..3cd82d29e --- /dev/null +++ b/GPUTest/correctPhi.H @@ -0,0 +1,12 @@ +CorrectPhi +( + U, + phi, + p, + rho, + psi, + dimensionedScalar("rAUf", dimTime, 1), + divrhoU(), + pimple, + true +); diff --git a/GPUTest/createFields.H b/GPUTest/createFields.H new file mode 100644 index 000000000..9e750c334 --- /dev/null +++ b/GPUTest/createFields.H @@ -0,0 +1,176 @@ +#include "createRDeltaT.H" + +Info<< "Reading thermophysical properties\n" << endl; + +// fluidThermo* pThermo = new hePsiThermo(mesh, word::null); +fluidThermo* pThermo = new heRhoThermo(mesh, word::null); +fluidThermo& thermo = *pThermo; +// thermo.validate(args.executable(), "ha"); + +const volScalarField& psi = 
thermo.psi(); +volScalarField& p = thermo.p(); +volScalarField& T = thermo.T(); +volScalarField rho +( + IOobject + ( + "rho", + runTime.timeName(), + mesh, + IOobject::READ_IF_PRESENT, + IOobject::AUTO_WRITE + ), + thermo.rho() +); + + +Info<< "Reading field U\n" << endl; +volVectorField U +( + IOobject + ( + "U", + runTime.timeName(), + mesh, + IOobject::MUST_READ, + IOobject::AUTO_WRITE + ), + mesh +); + +#include "compressibleCreatePhi.H" + +pressureControl pressureControl(p, rho, pimple.dict(), false); + +mesh.setFluxRequired(p.name()); + +Info<< "Creating turbulence model\n" << endl; +autoPtr turbulence +( + compressible::turbulenceModel::New + ( + rho, + U, + phi, + thermo + ) +); + +Info<< "Creating field dpdt\n" << endl; +volScalarField dpdt +( + IOobject + ( + "dpdt", + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedScalar("dpdt",p.dimensions()/dimTime, 0) +); + + +Info<< "Creating reaction model\n" << endl; +autoPtr> combustion +( + CombustionModel::New(thermo, turbulence()) +); +Info<< "end Creating reaction model\n" << endl; + + +const word combModelName(mesh.objectRegistry::lookupObject("combustionProperties").lookup("combustionModel")); +Info << "Combustion Model Name is confirmed as "<< combModelName << endl; + +const word turbName(mesh.objectRegistry::lookupObject("turbulenceProperties").lookup("simulationType")); + +dfChemistryModel* chemistry = combustion->chemistry(); +PtrList& Y = chemistry->Y(); +const word inertSpecie(chemistry->lookup("inertSpecie")); +const label inertIndex(chemistry->species()[inertSpecie]); +chemistry->setEnergyName("ha"); +chemistry->updateEnergy(); + + +chemistry->correctThermo(); +Info<< "At initial time, min/max(T) = " << min(T).value() << ", " << max(T).value() << endl; + +//for dpdt + +Info<< "Creating field kinetic energy K\n" << endl; +volScalarField K("K", 0.5*magSqr(U)); + +multivariateSurfaceInterpolationScheme::fieldTable fields; + +if(combModelName!="flareFGM") 
+{ +forAll(Y, i) +{ + fields.add(Y[i]); +} +fields.add(thermo.he()); +} + + +const scalar Sct = chemistry->lookupOrDefault("Sct", 1.); +volScalarField diffAlphaD +( + IOobject + ( + "diffAlphaD", + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedScalar(dimEnergy/dimTime/dimVolume, 0) +); +volVectorField hDiffCorrFlux +( + IOobject + ( + "hDiffCorrFlux", + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedVector(dimensionSet(1,0,-3,0,0,0,0), Zero) +); +volVectorField sumYDiffError +( + IOobject + ( + "sumYDiffError", + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedVector("sumYDiffError", dimDynamicViscosity/dimLength, Zero) +); + +IOdictionary CanteraTorchProperties +( + IOobject + ( + "CanteraTorchProperties", + runTime.constant(), + mesh, + IOobject::MUST_READ, + IOobject::NO_WRITE + ) +); +const Switch splitting = CanteraTorchProperties.lookupOrDefault("splittingStrategy", false); +#ifdef USE_PYTORCH + const Switch log_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("log", false); + const Switch torch_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("torch", false); +#endif +#ifdef USE_LIBTORCH + const Switch log_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("log", false); + const Switch torch_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("torch", false); +#endif diff --git a/GPUTest/createGPUSolver.H b/GPUTest/createGPUSolver.H new file mode 100644 index 000000000..516386473 --- /dev/null +++ b/GPUTest/createGPUSolver.H @@ -0,0 +1,114 @@ + +dfMatrixDataBase dfDataBase; + +void createGPUBase(fvMesh& mesh, PtrList& Y) { + // prepare constant values: num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, num_species, rdelta_t + const labelUList& owner = mesh.owner(); + const labelUList& neighbour = mesh.neighbour(); + int 
num_cells = mesh.nCells(); + int num_surfaces = neighbour.size(); + int num_boundary_surfaces = 0; + int num_patches = 0; + std::vector patch_size; + forAll(mesh.boundary(), patchi) { + labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); + int patchsize = sub_boundary.size(); + patch_size.push_back(patchsize); + num_boundary_surfaces += patchsize; + num_patches++; + } + // TODO: get deltaT fomr time API + double rDeltaT = 1 / 1e-6; + dfDataBase.setConstantValues(num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, Y.size(), rDeltaT); + + // prepare constant indexes: owner, neighbor + dfDataBase.setConstantIndexes(&owner[0], &neighbour[0]); + + // prepare internal and boundary of sf, mag_sf, weights, delta_coeffs, volume + double *boundary_sf = new double[3 * num_boundary_surfaces]; + double *boundary_mag_sf = new double[num_boundary_surfaces]; + double *boundary_delta_coeffs = new double[num_boundary_surfaces]; + int *boundary_face_cell = new int[num_boundary_surfaces]; + int offset = 0; + forAll(mesh.boundary(), patchi) { + const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; + const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; + const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; + const labelUList& pFaceCells = mesh.boundary()[patchi].faceCells(); + + int patchsize = pMagSf.size(); + + memcpy(boundary_sf + 3*offset, &pSf[0][0], 3*patchsize*sizeof(double)); + memcpy(boundary_mag_sf + offset, &pMagSf[0], patchsize*sizeof(double)); + memcpy(boundary_delta_coeffs + offset, &pDeltaCoeffs[0], patchsize*sizeof(double)); + memcpy(boundary_face_cell + offset, &pFaceCells[0], patchsize * sizeof(int)); + offset += patchsize; + } + + dfDataBase.createConstantFieldsInternal(); + dfDataBase.createConstantFieldsBoundary(); + dfDataBase.initConstantFieldsInternal(&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.surfaceInterpolation::weights()[0], &mesh.nonOrthDeltaCoeffs()[0], &mesh.V()[0]); + 
dfDataBase.initConstantFieldsBoundary(boundary_sf, boundary_mag_sf, boundary_delta_coeffs, boundary_face_cell); + + // prepare internal and boundary of Y + dfDataBase.createNonConstantFieldsInternal(); + dfDataBase.createNonConstantFieldsBoundary(); + forAll(Y, speciesI) { + volScalarField& Yi = Y[speciesI]; + memcpy(dfDataBase.h_y + speciesI * num_cells, &Yi[0], num_cells * sizeof(double)); + offset = 0; + forAll(Yi.boundaryField(), patchi) { + const scalarField& patchYi = Yi.boundaryField()[patchi]; + int patchsize = patchYi.size(); + memcpy(dfDataBase.h_boundary_y + speciesI * num_boundary_surfaces + offset, &patchYi[0], patchsize*sizeof(double)); + offset += patchsize; + } + } + dfDataBase.initNonConstantFieldsInternal(dfDataBase.h_y); + dfDataBase.initNonConstantFieldsBoundary(dfDataBase.h_boundary_y); +}; + + +dfMatrixDataBaseOrig* createGPUBaseOrig(fvMesh& mesh, PtrList& Y, volVectorField& U) { + const labelUList& owner = mesh.owner(); + const labelUList& neighbour = mesh.neighbour(); + int num_cells = mesh.nCells(); + int num_surfaces = neighbour.size(); + + std::vector boundaryCellIndex; + std::vector boundary_face_vector_init; + std::vector boundary_face_init; + std::vector boundary_deltaCoeffs_init; + std::vector> patchTypes; + std::vector patchTypeU, patchTypeY; + int num_boundary_faces = 0; + int patchSize; + forAll(mesh.boundary(), patchi) + { + labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); + patchSize = sub_boundary.size(); + const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; + const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; + const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; + + boundaryCellIndex.insert(boundaryCellIndex.end(), &sub_boundary[0], &sub_boundary[0]+patchSize); + boundary_face_vector_init.insert(boundary_face_vector_init.end(), &pSf[0][0], &pSf[0][0]+3*patchSize); + boundary_face_init.insert(boundary_face_init.end(), &pMagSf[0], &pMagSf[0]+patchSize); + 
boundary_deltaCoeffs_init.insert(boundary_deltaCoeffs_init.end(), &pDeltaCoeffs[0], &pDeltaCoeffs[0]+patchSize); + num_boundary_faces += patchSize; + + constructBoundarySelector(patchTypeU, U.boundaryField()[patchi].type(), patchSize); + constructBoundarySelector(patchTypeY, Y[0].boundaryField()[patchi].type(), patchSize); + } + patchTypes.emplace_back(patchTypeU); + patchTypes.emplace_back(patchTypeY); + + int num_boundary_cells; + + dfMatrixDataBaseOrig* dfDataBase = new dfMatrixDataBaseOrig(num_surfaces, num_cells, num_boundary_faces, Y.size(), num_boundary_cells, + &neighbour[0], &owner[0], &mesh.V()[0], &mesh.surfaceInterpolation::weights()[0], + &mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.nonOrthDeltaCoeffs()[0], boundary_face_vector_init, + boundary_face_init, boundary_deltaCoeffs_init, boundaryCellIndex, patchTypes); + + return dfDataBase; +} \ No newline at end of file diff --git a/GPUTest/setRDeltaT.H b/GPUTest/setRDeltaT.H new file mode 100644 index 000000000..074d05e3d --- /dev/null +++ b/GPUTest/setRDeltaT.H @@ -0,0 +1,85 @@ +{ + volScalarField& rDeltaT = trDeltaT.ref(); + + const dictionary& pimpleDict = pimple.dict(); + + scalar maxCo + ( + pimpleDict.lookupOrDefault("maxCo", 0.8) + ); + + scalar rDeltaTSmoothingCoeff + ( + pimpleDict.lookupOrDefault("rDeltaTSmoothingCoeff", 0.02) + ); + + scalar rDeltaTDampingCoeff + ( + pimpleDict.lookupOrDefault("rDeltaTDampingCoeff", 1.0) + ); + + scalar maxDeltaT + ( + pimpleDict.lookupOrDefault("maxDeltaT", great) + ); + + volScalarField rDeltaT0("rDeltaT0", rDeltaT); + + // Set the reciprocal time-step from the local Courant number + rDeltaT.ref() = max + ( + 1/dimensionedScalar(dimTime, maxDeltaT), + fvc::surfaceSum(mag(phi))()() + /((2*maxCo)*mesh.V()*rho()) + ); + + if (pimple.transonic()) + { + surfaceScalarField phid + ( + "phid", + fvc::interpolate(psi)*fvc::flux(U) + ); + + rDeltaT.ref() = max + ( + rDeltaT(), + fvc::surfaceSum(mag(phid))()() + /((2*maxCo)*mesh.V()*psi()) + ); + } + + // Update tho 
boundary values of the reciprocal time-step + rDeltaT.correctBoundaryConditions(); + + Info<< "Flow time scale min/max = " + << gMin(1/rDeltaT.primitiveField()) + << ", " << gMax(1/rDeltaT.primitiveField()) << endl; + + if (rDeltaTSmoothingCoeff < 1.0) + { + fvc::smooth(rDeltaT, rDeltaTSmoothingCoeff); + } + + Info<< "Smoothed flow time scale min/max = " + << gMin(1/rDeltaT.primitiveField()) + << ", " << gMax(1/rDeltaT.primitiveField()) << endl; + + // Limit rate of change of time scale + // - reduce as much as required + // - only increase at a fraction of old time scale + if + ( + rDeltaTDampingCoeff < 1.0 + && runTime.timeIndex() > runTime.startTimeIndex() + 1 + ) + { + rDeltaT = + rDeltaT0 + *max(rDeltaT/rDeltaT0, scalar(1) - rDeltaTDampingCoeff); + + Info<< "Damped flow time scale min/max = " + << gMin(1/rDeltaT.primitiveField()) + << ", " << gMax(1/rDeltaT.primitiveField()) << endl; + } +} diff --git a/GPUTest/setRootCase2.H b/GPUTest/setRootCase2.H new file mode 100644 index 000000000..45d966e63 --- /dev/null +++ b/GPUTest/setRootCase2.H @@ -0,0 +1,5 @@ +Foam::argList args(argc,argv,true,true,/*initialise=*/false); +if (!args.checkRootCase()) +{ + Foam::FatalError.exit(); +} \ No newline at end of file diff --git a/GPUTest/unittest.C b/GPUTest/unittest.C new file mode 100644 index 000000000..80eafef9d --- /dev/null +++ b/GPUTest/unittest.C @@ -0,0 +1,169 @@ +/*---------------------------------------------------------------------------*\ + ========= | + \\ / F ield | OpenFOAM: The Open Source CFD Toolbox + \\ / O peration | Website: https://openfoam.org + \\ / A nd | Copyright (C) 2011-2019 OpenFOAM Foundation + \\/ M anipulation | +------------------------------------------------------------------------------- +License + This file is part of OpenFOAM. 
+ + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . + +Application + unittest + +Description + GPU unittest + +\*---------------------------------------------------------------------------*/ + +#include "dfChemistryModel.H" +#include "CanteraMixture.H" +// #include "hePsiThermo.H" +#include "heRhoThermo.H" + +#include "fvCFD.H" +#include "fluidThermo.H" +#include "turbulentFluidThermoModel.H" +#include "pimpleControl.H" +#include "pressureControl.H" +#include "localEulerDdtScheme.H" +#include "fvcSmooth.H" +#include "PstreamGlobals.H" +#include "basicThermo.H" +#include "CombustionModel.H" + +#include +#include +#include "upwind.H" + +// debug +#include "GenFvMatrix.H" +#include +#include + +#include "dfMatrixDataBase.H" +#include "dfMatrixDataBaseOrig.H" +#include "dfMatrixOpBase.H" +#include "dfMatrixOpBaseOrig.H" +#include "createGPUSolver.H" +#include "GPUTestBase.H" +#include "GPUTestRefBase.H" + +int main(int argc, char *argv[]) +{ +#ifdef USE_PYTORCH + pybind11::scoped_interpreter guard{};//start python interpreter +#endif + #include "postProcess.H" + + // #include "setRootCaseLists.H" + #include "listOptions.H" + #include "setRootCase2.H" + #include "listOutput.H" + + #include "createTime.H" + #include "createMesh.H" + #include "createDyMControls.H" + #include "initContinuityErrs.H" + #include "createFields.H" + #include "createRhoUfIfPresent.H" + + turbulence->validate(); + + if (!LTS) + { + #include "compressibleCourantNo.H" 
+ #include "setInitialDeltaT.H" + } + + // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + { + #include "readDyMControls.H" + + if (LTS) + { + #include "setRDeltaT.H" + } + else + { + #include "compressibleCourantNo.H" + #include "setDeltaT.H" + } + + createGPUBase(mesh, Y); + DEBUG_TRACE; + dfMatrixDataBaseOrig* dfDataBaseOrig = createGPUBaseOrig(mesh, Y, U); + DEBUG_TRACE; + + // unittest of fvm::ddt(rho, U) + test_fvm_ddt_vector(dfDataBase, mesh, rho, U, initType::original); + DEBUG_TRACE; + test_fvm_ddt_vector(dfDataBase, mesh, rho, U, initType::randomInit); + DEBUG_TRACE; + + // unittest of fvm::div(phi, U) + test_fvm_div_vector(dfDataBase, mesh, phi, U, initType::original); + DEBUG_TRACE; + test_fvm_div_vector(dfDataBase, mesh, phi, U, initType::randomInit); + DEBUG_TRACE; + + // unittest of fvm::laplacian(gamma, U) + const tmp nuEff_tmp(turbulence->nuEff()); + const volScalarField& nuEff = nuEff_tmp(); + volScalarField gamma = rho * nuEff; + test_fvm_laplacian_vector(dfDataBase, mesh, gamma, U, initType::original); + DEBUG_TRACE; + test_fvm_laplacian_vector(dfDataBase, mesh, gamma, U, initType::randomInit); + DEBUG_TRACE; + + // unittest of fvc::ddt(rho, K) + K = 0.5*magSqr(U); + test_fvc_ddt_scalar(dfDataBase, mesh, rho, K, initType::original); + DEBUG_TRACE; + test_fvc_ddt_scalar(dfDataBase, mesh, rho, K, initType::randomInit); + DEBUG_TRACE; + + // unittest of fvc::grad(U) + test_fvc_grad_vector(dfDataBase, mesh, U, initType::original); + DEBUG_TRACE; + // test_fvc_grad_vector(dfDataBase, mesh, U, initType::randomInit); + // DEBUG_TRACE; + test_fvc_grad_vector_orig(dfDataBase, mesh, U, initType::original, dfDataBaseOrig); + DEBUG_TRACE; + + // unittest of fvc::div(phi) + test_fvc_div_scalar(dfDataBase, mesh, phi, initType::original); + DEBUG_TRACE; + test_fvc_div_scalar(dfDataBase, mesh, phi, initType::randomInit); + DEBUG_TRACE; + + // unittest of fvc::div(U) + test_fvc_div_vector(dfDataBase, mesh, U, initType::original); + 
DEBUG_TRACE; + // test_fvc_div_vector(dfDataBase, mesh, U, initType::randomInit); + // DEBUG_TRACE; + + // unittest of fvc::grad(p) + test_fvc_grad_scalar(dfDataBase, mesh, p, initType::original); + DEBUG_TRACE; + test_fvc_grad_scalar(dfDataBase, mesh, p, initType::randomInit); + DEBUG_TRACE; + test_fvc_grad_scalar_orig(dfDataBase, mesh, p, initType::original, dfDataBaseOrig); + DEBUG_TRACE + } + return 0; +} + diff --git a/GPUTestRef/EulerDdtScheme.C b/GPUTestRef/EulerDdtScheme.C new file mode 100644 index 000000000..0875e0033 --- /dev/null +++ b/GPUTestRef/EulerDdtScheme.C @@ -0,0 +1,322 @@ +/*---------------------------------------------------------------------------*\ + ========= | + \\ / F ield | OpenFOAM: The Open Source CFD Toolbox + \\ / O peration | Website: https://openfoam.org + \\ / A nd | Copyright (C) 2011-2018 OpenFOAM Foundation + \\/ M anipulation | +------------------------------------------------------------------------------- +License + This file is part of OpenFOAM. + + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . 
+ +\*---------------------------------------------------------------------------*/ + +#include "GenFvMatrix.H" + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +namespace Foam +{ + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +// namespace fv +// { + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +template +tmp> +EulerDdtSchemeFvmDdt +( + const volScalarField& rho, + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + + tmp> tfvm + ( + new fvMatrix + ( + vf, + rho.dimensions()*vf.dimensions()*dimVol/dimTime + ) + ); + fvMatrix& fvm = tfvm.ref(); + + scalar rDeltaT = 1.0/mesh.time().deltaTValue(); + + fvm.diag() = rDeltaT*rho.primitiveField()*mesh.Vsc(); + + if (mesh.moving()) + { + fvm.source() = rDeltaT + *rho.oldTime().primitiveField() + *vf.oldTime().primitiveField()*mesh.Vsc0(); + } + else + { + fvm.source() = rDeltaT + *rho.oldTime().primitiveField() + *vf.oldTime().primitiveField()*mesh.Vsc(); + } + return tfvm; +} + +template +tmp> +EulerDdtSchemeFvcDdt +( + const volScalarField& rho, + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + + dimensionedScalar rDeltaT = 1.0/mesh.time().deltaT(); + + IOobject ddtIOobject + ( + "ddt("+rho.name()+','+vf.name()+')', + mesh.time().timeName(), + mesh + ); + + if (mesh.moving()) + { + return tmp> + ( + new GeometricField + ( + ddtIOobject, + rDeltaT* + ( + rho()*vf() + - rho.oldTime()() + *vf.oldTime()()*mesh.Vsc0()/mesh.Vsc() + ), + rDeltaT.value()* + ( + rho.boundaryField()*vf.boundaryField() + - rho.oldTime().boundaryField() + *vf.oldTime().boundaryField() + ) + ) + ); + } + else + { + return tmp> + ( + new GeometricField + ( + ddtIOobject, + rDeltaT*(rho*vf - rho.oldTime()*vf.oldTime()) + ) + ); + } +} + + +tmp +EulerDdtSchemeFvcDdtCorr +( + const volScalarField& rho, + const volVectorField& U, + const surfaceScalarField& phi, + const autoPtr& Uf +) +{ + Info << 
"EulerDdtSchemeFvcDdtCorr start" << endl; + + const fvMesh& mesh = U.mesh(); + + dimensionedScalar rDeltaT = 1.0/mesh.time().deltaT(); + + GeometricField rhoU0 + ( + rho.oldTime() * U.oldTime() + ); + + surfaceScalarField phiCorr + ( + phi.oldTime() - fvc::dotInterpolate(mesh.Sf(), rhoU0) + ); + + return tmp + ( + new surfaceScalarField + ( + IOobject + ( + "ddtCorr(" + + rho.name() + ',' + U.name() + ',' + phi.name() + ')', + mesh.time().timeName(), + mesh + ), + EulerDdtSchemeFvcDdtPhiCoeff + ( + rhoU0, + phi.oldTime(), + phiCorr, + rho.oldTime() + )*rDeltaT*phiCorr + ) + ); + +} + +tmp +EulerDdtSchemeFvcDdtPhiCoeff +( + const volVectorField& U, + const surfaceScalarField& phi, + const surfaceScalarField& phiCorr, + const volScalarField& rho +) +{ + const fvMesh& mesh = U.mesh(); + tmp tddtCouplingCoeff = scalar(1) - min(mag(phiCorr)/(mag(phi) + dimensionedScalar("small", phi.dimensions(), SMALL)),scalar(1)); + + surfaceScalarField& ddtCouplingCoeff = tddtCouplingCoeff.ref(); + + surfaceScalarField::Boundary& ccbf = ddtCouplingCoeff.boundaryFieldRef(); + + forAll(U.boundaryField(), patchi) + { + if + ( U.boundaryField()[patchi].fixesValue() + || isA(mesh.boundary()[patchi]) + ) + { + ccbf[patchi] = 0.0; + } + } + + return tddtCouplingCoeff; +} + +template +tmp> +EulerDdtSchemeFvmDdt +( + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + + tmp> tfvm + ( + new fvMatrix + ( + vf, + vf.dimensions()*dimVol/dimTime + ) + ); + + fvMatrix& fvm = tfvm.ref(); + + scalar rDeltaT = 1.0/mesh.time().deltaTValue(); + + fvm.diag() = rDeltaT*mesh.Vsc(); + + if (mesh.moving()) + { + fvm.source() = rDeltaT*vf.oldTime().primitiveField()*mesh.Vsc0(); + } + else + { + fvm.source() = rDeltaT*vf.oldTime().primitiveField()*mesh.Vsc(); + } + + return tfvm; +} + +template +tmp> +EulerDdtSchemeFvcDdt +( + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + + dimensionedScalar rDeltaT = 1.0/mesh.time().deltaT(); + + IOobject ddtIOobject + ( + 
"ddt("+vf.name()+')', + mesh.time().timeName(), + mesh + ); + + return tmp> + ( + new GeometricField + ( + ddtIOobject, + rDeltaT*(vf - vf.oldTime()) + ) + ); +} + +template +tmp> +EulerDdtSchemeFvmDdt +( + const volScalarField& rho, + const GeometricField& vf +); + +template +tmp> +EulerDdtSchemeFvmDdt +( + const volScalarField& rho, + const GeometricField& vf +); + +template +tmp> +EulerDdtSchemeFvcDdt +( + const volScalarField& rho, + const GeometricField& vf +); + +template +tmp> +EulerDdtSchemeFvmDdt +( + const GeometricField& vf +); + +template +tmp> +EulerDdtSchemeFvcDdt +( + const GeometricField& vf +); + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +// } // End namespace fv + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +} // End namespace Foam + +// ************************************************************************* // \ No newline at end of file diff --git a/GPUTestRef/GenFvMatrix.H b/GPUTestRef/GenFvMatrix.H new file mode 100644 index 000000000..d76fa94d9 --- /dev/null +++ b/GPUTestRef/GenFvMatrix.H @@ -0,0 +1,261 @@ +#pragma once + +#include "tmp.H" +#include "dimensionedType.H" +#include "volFieldsFwd.H" +#include "surfaceFieldsFwd.H" +#include "typeInfo.H" +#include "runTimeSelectionTables.H" +#include "fvMatrices.H" +#include "fvMesh.H" +#include "turbulentFluidThermoModel.H" +#include "CombustionModel.H" +#include +#include +#include "PstreamGlobals.H" + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +namespace Foam +{ + + +// namespace fv +// { + +// fvm::ddt +template +tmp> +EulerDdtSchemeFvmDdt +( + const volScalarField& rho, + const GeometricField& vf +); + +template +tmp> +EulerDdtSchemeFvmDdt +( + const GeometricField& vf +); + +// fvc::ddt +template +tmp> +EulerDdtSchemeFvcDdt +( + const volScalarField& rho, + const GeometricField& vf +); + +template +tmp> +EulerDdtSchemeFvcDdt +( + const GeometricField& vf +); + +// 
fvc::ddtCorr +tmp +EulerDdtSchemeFvcDdtCorr +( + const volScalarField& rho, + const volVectorField& U, + const surfaceScalarField& phi, + const autoPtr& Uf +); + +tmp +EulerDdtSchemeFvcDdtPhiCoeff +( + const volVectorField& U, + const surfaceScalarField& phi, + const surfaceScalarField& phiCorr, + const volScalarField& rho +); + +template +Foam::tmp> +UEqn_H +( + fvMatrix& UEqn +); + +tmp +rAUConstructor +( + fvMatrix& UEqn +); + +tmp +rhorAUfConstructor +( + const volScalarField& rhorAU, + const surfaceScalarField& linear_weights +); + +tmp +phiHbyAConstructor +( + const volScalarField& rho, + const volVectorField& HbyA, + const surfaceScalarField& rhorAUf, + const surfaceScalarField& tddtCorr, + const surfaceScalarField& linear_weights +); + + +// fvm::div +template +tmp> +gaussConvectionSchemeFvmDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +); + +template +tmp> +gaussConvectionSchemeFvmDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf, + const word& name +); + +// fvc::div +template +tmp> +gaussConvectionSchemeFvcDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +); + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const GeometricField& ssf +); + +template +tmp +< + GeometricField + < + typename innerProduct::type, fvPatchField, volMesh + > +> +gaussDivFvcdiv +( + const GeometricField& vf +); + +// fvc::grad +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradSchemeGrad +( + const GeometricField& vsf +); + + +template +tmp> +gaussLaplacianSchemeFvmLaplacianUncorrected +( + const surfaceScalarField& gammaMagSf, + const surfaceScalarField& deltaCoeffs, + const GeometricField& vf +); + +// fvm::laplacian +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gammaScalarVol, + const GeometricField& vf +); + +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gamma, + const GeometricField& 
vf +); + +// turbulence->divDevRhoReff(U) +tmp +turbulenceModelLinearViscousStressDivDevRhoReff +( + volVectorField& U, + compressible::turbulenceModel& turbulence +); + +tmp +GenMatrix_U( + const volScalarField& rho, + volVectorField& U, + const surfaceScalarField& phi, + const volScalarField& p, + compressible::turbulenceModel& turbulence +); + +tmp +GenMatrix_Y( + const volScalarField& rho, + volScalarField& Yi, + const surfaceScalarField& phi, + const surfaceScalarField& phiUc, + const volScalarField& rhoD, + const volScalarField& mut, + const Switch splitting, + const scalar Sct, + CombustionModel& combustion, + fv::convectionScheme& mvConvection +); + +tmp +GenMatrix_E( + const volScalarField& rho, + volScalarField& he, + const surfaceScalarField& phi, + const volScalarField& K, + const volScalarField& dpdt, + const volScalarField& alphaEff, + const volScalarField& diffAlphaD, + const volVectorField& hDiffCorrFlux, + const surfaceScalarField& linear_weights +); + +tmp +GenMatrix_p( + const volScalarField& rho, + volScalarField& p, + const surfaceScalarField& phiHbyA, + const surfaceScalarField& rhorAUf, + const volScalarField& phi +); + + +void check_fvmatrix_equal(fvScalarMatrix& a,fvScalarMatrix& b); +void check_fvmatrix_equal(fvVectorMatrix& a,fvVectorMatrix& b); + +void check_field_equal(Field& a, Field& b); + + +} // End namespace Foam + + +// ************************************************************************* // \ No newline at end of file diff --git a/GPUTestRef/Make/files b/GPUTestRef/Make/files new file mode 100644 index 000000000..314f1f495 --- /dev/null +++ b/GPUTestRef/Make/files @@ -0,0 +1,6 @@ +gaussGrad.C +gaussConvectionScheme.C +gaussLaplacianScheme.C +EulerDdtScheme.C + +LIB = $(DF_LIBBIN)/libdfGenMatrix \ No newline at end of file diff --git a/GPUTestRef/Make/options b/GPUTestRef/Make/options new file mode 100644 index 000000000..0523a67e8 --- /dev/null +++ b/GPUTestRef/Make/options @@ -0,0 +1,31 @@ +-include 
$(GENERAL_RULES)/mplibType + +EXE_INC = \ + -g \ + $(PFLAGS) $(PINC) \ + -I$(LIB_SRC)/transportModels/compressible/lnInclude \ + -I$(LIB_SRC)/thermophysicalModels/basic/lnInclude \ + -I$(LIB_SRC)/TurbulenceModels/turbulenceModels/lnInclude \ + -I$(LIB_SRC)/TurbulenceModels/compressible/lnInclude \ + -I$(LIB_SRC)/finiteVolume/cfdTools \ + -I$(LIB_SRC)/finiteVolume/lnInclude \ + -I$(LIB_SRC)/meshTools/lnInclude \ + -I$(LIB_SRC)/sampling/lnInclude \ + -I$(LIB_SRC)/dynamicFvMesh/lnInclude \ + -I$(LIB_SRC)/Pstream/mpi \ + -I$(DF_SRC)/dfCanteraMixture/lnInclude \ + -I$(DF_SRC)/dfChemistryModel/lnInclude \ + -I$(DF_SRC)/dfCombustionModels/lnInclude \ + -I$(LIB_SRC)/parallel/decompose/decompositionMethods/lnInclude \ + -I$(LIB_SRC)/meshTools/lnInclude \ + -I$(LIB_SRC)/fileFormats/lnInclude \ + -I$(LIB_SRC)/triSurface/lnInclude \ + -I$(LIB_SRC)/surfMesh/lnInclude \ + -I$(LIB_SRC)/dynamicMesh/lnInclude \ + -I$(LIB_SRC)/finiteVolume/lnInclude \ + -I$(CANTERA_ROOT)/include + +EXE_LIBS = \ + -lOpenFOAM \ + -ltriSurface \ + -lmeshTools \ No newline at end of file diff --git a/GPUTestRef/gaussConvectionScheme.C b/GPUTestRef/gaussConvectionScheme.C new file mode 100644 index 000000000..b8157d2d1 --- /dev/null +++ b/GPUTestRef/gaussConvectionScheme.C @@ -0,0 +1,351 @@ +/*---------------------------------------------------------------------------*\ + ========= | + \\ / F ield | OpenFOAM: The Open Source CFD Toolbox + \\ / O peration | Website: https://openfoam.org + \\ / A nd | Copyright (C) 2011-2018 OpenFOAM Foundation + \\/ M anipulation | +------------------------------------------------------------------------------- +License + This file is part of OpenFOAM. + + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . + +\*---------------------------------------------------------------------------*/ + +#include "GenFvMatrix.H" +#include "fvcSurfaceIntegrate.H" +#include "fvMatrices.H" +#include "gaussConvectionScheme.H" + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +namespace Foam +{ + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +template +tmp> +gaussConvectionSchemeFvmDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf, + const word& name +) +{ + const fvMesh& mesh = vf.mesh(); + + tmp> cs = fv::convectionScheme::New(mesh,faceFlux,mesh.divScheme(name)); + fv::gaussConvectionScheme& gcs = dynamic_cast&>(cs.ref()); + + tmp tweights = gcs.interpScheme().weights(vf); + const surfaceScalarField& weights = tweights(); + + tmp> tfvm + ( + new fvMatrix + ( + vf, + faceFlux.dimensions()*vf.dimensions() + ) + ); + fvMatrix& fvm = tfvm.ref(); + fvm.lower() = -weights.primitiveField()*faceFlux.primitiveField(); + fvm.upper() = fvm.lower() + faceFlux.primitiveField(); + fvm.negSumDiag(); + forAll(vf.boundaryField(), patchi) + { + const fvPatchField& psf = vf.boundaryField()[patchi]; + const fvsPatchScalarField& patchFlux = faceFlux.boundaryField()[patchi]; + const fvsPatchScalarField& pw = weights.boundaryField()[patchi]; + + fvm.internalCoeffs()[patchi] = patchFlux*psf.valueInternalCoeffs(pw); + fvm.boundaryCoeffs()[patchi] = -patchFlux*psf.valueBoundaryCoeffs(pw); + } + if (gcs.interpScheme().corrected()) + { + fvm += 
fvc::surfaceIntegrate(faceFlux*gcs.interpScheme().correction(vf)); + } + return tfvm; +} + +template +tmp> +gaussConvectionSchemeFvmDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +) +{ + word name("div("+faceFlux.name()+','+vf.name()+')'); + return gaussConvectionSchemeFvmDiv(faceFlux,vf,name); +} + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +) +{ + word name("div("+faceFlux.name()+','+vf.name()+')'); + return gaussConvectionSchemeFvcDiv(faceFlux, vf, name); +} + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf, + const word& name +) +{ + Info << "gaussConvectionSchemeFvcDiv start" << endl; + + const fvMesh& mesh = vf.mesh(); + + Istream& divIntScheme = mesh.divScheme(name); + word divScheme(divIntScheme); + + tmp> tinterpScheme_ = + surfaceInterpolationScheme::New(mesh, faceFlux, divIntScheme); + + // tmp> tinterpScheme_ = + // tmp> + // ( + // new linear(mesh) + // ); + + + // surfaceInterpolationScheme interpScheme_ = tinterpScheme_.ref(); + + tmp> tConvection + ( + fvc::surfaceIntegrate(gaussConvectionSchemeFlux(faceFlux, vf, tinterpScheme_)) + ); + + tConvection.ref().rename + ( + "convection(" + faceFlux.name() + ',' + vf.name() + ')' + ); + + return tConvection; +} + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const GeometricField& ssf +) +{ + return tmp> + ( + new GeometricField + ( + "div("+ssf.name()+')', + fvcSurfaceIntegrate(ssf) + ) + ); +} + +template +tmp +< + GeometricField + < + typename innerProduct::type, fvPatchField, volMesh + > +> +gaussDivFvcdiv +( + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + Istream& divIntScheme = mesh.divScheme("div("+vf.name()+')'); + word divScheme(divIntScheme); + + tmp> tinterpScheme_ = + surfaceInterpolationScheme::New(mesh, divIntScheme); + + tmp + < + GeometricField + ::type, fvPatchField, volMesh> + > tDiv + ( + 
fvcSurfaceIntegrate + ( + (tinterpScheme_().dotInterpolate(mesh.Sf(), vf))() + ) + ); + + + return tDiv; +} + +template +tmp> +fvcSurfaceIntegrate +( + const GeometricField& ssf +) +{ + const fvMesh& mesh = ssf.mesh(); + + tmp> tvf + ( + new GeometricField + ( + IOobject + ( + "surfaceIntegrate("+ssf.name()+')', + ssf.instance(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensioned + ( + "0", + ssf.dimensions()/dimVol, + Zero + ), + extrapolatedCalculatedFvPatchField::typeName + ) + ); + GeometricField& vf = tvf.ref(); + + fvcSurfaceIntegrate(vf.primitiveFieldRef(), ssf); + vf.correctBoundaryConditions(); + + return tvf; +} + +template +void fvcSurfaceIntegrate +( + Field& ivf, + const GeometricField& ssf +) +{ + const fvMesh& mesh = ssf.mesh(); + + const labelUList& owner = mesh.owner(); + const labelUList& neighbour = mesh.neighbour(); + + const Field& issf = ssf; + + forAll(owner, facei) + { + ivf[owner[facei]] += issf[facei]; + ivf[neighbour[facei]] -= issf[facei]; + } + + forAll(mesh.boundary(), patchi) + { + const labelUList& pFaceCells = + mesh.boundary()[patchi].faceCells(); + + const fvsPatchField& pssf = ssf.boundaryField()[patchi]; + + forAll(mesh.boundary()[patchi], facei) + { + ivf[pFaceCells[facei]] += pssf[facei]; + } + } + + ivf /= mesh.Vsc(); +} + +template +tmp> +gaussConvectionSchemeFlux +( + const surfaceScalarField& faceFlux, + const GeometricField& vf, + tmp> tinterpScheme +) +{ + Info << vf.name() <> +gaussConvectionSchemeFvmDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +); + +template +tmp> +gaussConvectionSchemeFvmDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +); + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +); + + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const GeometricField& ssf +); + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const GeometricField& ssf +); + +template +tmp +< + 
GeometricField + < + typename innerProduct::type, fvPatchField, volMesh + > +> +gaussDivFvcdiv +( + const GeometricField& vf +); + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +} // End namespace Foam + +// ************************************************************************* // \ No newline at end of file diff --git a/GPUTestRef/gaussGrad.C b/GPUTestRef/gaussGrad.C new file mode 100644 index 000000000..401eab38b --- /dev/null +++ b/GPUTestRef/gaussGrad.C @@ -0,0 +1,332 @@ +/*---------------------------------------------------------------------------*\ + ========= | + \\ / F ield | OpenFOAM: The Open Source CFD Toolbox + \\ / O peration | Website: https://openfoam.org + \\ / A nd | Copyright (C) 2011-2018 OpenFOAM Foundation + \\/ M anipulation | +------------------------------------------------------------------------------- +License + This file is part of OpenFOAM. + + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . 
+ +\*---------------------------------------------------------------------------*/ + +#include "gaussGrad.H" +#include "extrapolatedCalculatedFvPatchField.H" + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +namespace Foam +{ + + + +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradSchemeGrad +( + const GeometricField& vsf +) +{ + return gaussGradSchemeGrad(vsf, "grad(" + vsf.name() + ')'); +} + +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradSchemeGrad +( + const GeometricField& vsf, + const word& name +) +{ + const fvMesh& mesh = vsf.mesh(); + + typedef typename outerProduct::type GradType; + typedef GeometricField GradFieldType; + + if (!mesh.changing() && mesh.cache(name)) + { + if (!mesh.objectRegistry::template foundObject(name)) + { + solution::cachePrintMessage("Calculating and caching", name, vsf); + tmp tgGrad = gaussGradCalcGrad(vsf, name); + regIOobject::store(tgGrad.ptr()); + } + + solution::cachePrintMessage("Retrieving", name, vsf); + GradFieldType& gGrad = + mesh.objectRegistry::template lookupObjectRef + ( + name + ); + + if (gGrad.upToDate(vsf)) + { + return gGrad; + } + else + { + solution::cachePrintMessage("Deleting", name, vsf); + gGrad.release(); + delete &gGrad; + + solution::cachePrintMessage("Recalculating", name, vsf); + tmp tgGrad = gaussGradCalcGrad(vsf, name); + + solution::cachePrintMessage("Storing", name, vsf); + regIOobject::store(tgGrad.ptr()); + GradFieldType& gGrad = + mesh.objectRegistry::template lookupObjectRef + ( + name + ); + + return gGrad; + } + } + else + { + if (mesh.objectRegistry::template foundObject(name)) + { + GradFieldType& gGrad = + mesh.objectRegistry::template lookupObjectRef + ( + name + ); + + if (gGrad.ownedByRegistry()) + { + solution::cachePrintMessage("Deleting", name, vsf); + gGrad.release(); + delete &gGrad; + } + } + + 
solution::cachePrintMessage("Calculating", name, vsf); + return gaussGradCalcGrad(vsf, name); + } +} + +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradCalcGrad +( + const GeometricField& vsf, + const word& name +) +{ + const fvMesh& mesh = vsf.mesh(); + + tmp> tinterpScheme_ = + tmp> + ( + new linear(mesh) + ); + + typedef typename outerProduct::type GradType; + + tmp> tinterpolate = tinterpScheme_().interpolate(vsf); + + tmp> tgGrad + ( + gaussGradGradf(tinterpolate.ref(), name) + ); + GeometricField& gGrad = tgGrad.ref(); + + gaussGradCorrectBoundaryConditions(vsf, gGrad); + + return tgGrad; +} + +template +void gaussGradCorrectBoundaryConditions +( + const GeometricField& vsf, + GeometricField + < + typename outerProduct::type, fvPatchField, volMesh + >& gGrad +) +{ + typename GeometricField + < + typename outerProduct::type, fvPatchField, volMesh + >::Boundary& gGradbf = gGrad.boundaryFieldRef(); + + forAll(vsf.boundaryField(), patchi) + { + if (!vsf.boundaryField()[patchi].coupled()) + { + const vectorField n + ( + vsf.mesh().Sf().boundaryField()[patchi] + / vsf.mesh().magSf().boundaryField()[patchi] + ); + + gGradbf[patchi] += n * + ( + vsf.boundaryField()[patchi].snGrad() + - (n & gGradbf[patchi]) + ); + } + } +} + +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradGradf +( + const GeometricField& ssf, + const word& name +) +{ + typedef typename outerProduct::type GradType; + + const fvMesh& mesh = ssf.mesh(); + + tmp> tgGrad + ( + new GeometricField + ( + IOobject + ( + name, + ssf.instance(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensioned + ( + "0", + ssf.dimensions()/dimLength, + Zero + ), + extrapolatedCalculatedFvPatchField::typeName + ) + ); + GeometricField& gGrad = tgGrad.ref(); + + const labelUList& owner = mesh.owner(); + const labelUList& neighbour = mesh.neighbour(); + const vectorField& Sf = 
mesh.Sf(); + + Field& igGrad = gGrad; + const Field& issf = ssf; + + forAll(owner, facei) + { + GradType Sfssf = Sf[facei]*issf[facei]; + + igGrad[owner[facei]] += Sfssf; + igGrad[neighbour[facei]] -= Sfssf; + } + + forAll(mesh.boundary(), patchi) + { + const labelUList& pFaceCells = + mesh.boundary()[patchi].faceCells(); + + const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; + + const fvsPatchField& pssf = ssf.boundaryField()[patchi]; + + forAll(mesh.boundary()[patchi], facei) + { + igGrad[pFaceCells[facei]] += pSf[facei]*pssf[facei]; + if (pFaceCells[facei] == 0) + { + // Info << "CPU add = " << pSf[facei]*pssf[facei] << endl; + // Info << "surface CPU = " << pSf[facei] << endl; + // Info << "field CPU = " << pssf[facei] << endl; + } + } + } + + igGrad /= mesh.V(); + + gGrad.correctBoundaryConditions(); + + return tgGrad; +} + + +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradSchemeGrad +( + const GeometricField& vsf +); + + +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradSchemeGrad +( + const GeometricField& vsf +); + +} // End namespace Foam + +// ************************************************************************* // \ No newline at end of file diff --git a/GPUTestRef/gaussLaplacianScheme.C b/GPUTestRef/gaussLaplacianScheme.C new file mode 100644 index 000000000..ed321ceda --- /dev/null +++ b/GPUTestRef/gaussLaplacianScheme.C @@ -0,0 +1,273 @@ +/*---------------------------------------------------------------------------* + ========= | + / F ield | OpenFOAM: The Open Source CFD Toolbox + / O peration | Website: https://openfoam.org + / A nd | Copyright (C) 2011-2018 OpenFOAM Foundation +/ M anipulation | +------------------------------------------------------------------------------- +License + This file is part of OpenFOAM. 
+ + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . + +*---------------------------------------------------------------------------*/ + +#include "gaussLaplacianScheme.H" +#include "surfaceInterpolate.H" +#include "fvcDiv.H" +#include "fvcGrad.H" +#include "fvMatrices.H" +#include "snGradScheme.H" +#include "linear.H" +#include "orthogonalSnGrad.H" + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +namespace Foam +{ + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +template +tmp> +gaussLaplacianSchemeFvmLaplacianUncorrected +( + const surfaceScalarField& gammaMagSf, + const surfaceScalarField& deltaCoeffs, + const GeometricField& vf +) +{ + tmp> tfvm + ( + new fvMatrix + ( + vf, + deltaCoeffs.dimensions()*gammaMagSf.dimensions()*vf.dimensions() + ) + ); + fvMatrix& fvm = tfvm.ref(); + + fvm.upper() = deltaCoeffs.primitiveField()*gammaMagSf.primitiveField(); + fvm.negSumDiag(); + + forAll(vf.boundaryField(), patchi) + { + const fvPatchField& pvf = vf.boundaryField()[patchi]; + const fvsPatchScalarField& pGamma = gammaMagSf.boundaryField()[patchi]; + const fvsPatchScalarField& pDeltaCoeffs = + deltaCoeffs.boundaryField()[patchi]; + + if (pvf.coupled()) + { + fvm.internalCoeffs()[patchi] = + pGamma*pvf.gradientInternalCoeffs(pDeltaCoeffs); + fvm.boundaryCoeffs()[patchi] = + 
-pGamma*pvf.gradientBoundaryCoeffs(pDeltaCoeffs); + } + else + { + fvm.internalCoeffs()[patchi] = pGamma*pvf.gradientInternalCoeffs(); + fvm.boundaryCoeffs()[patchi] = -pGamma*pvf.gradientBoundaryCoeffs(); + } + } + + return tfvm; +} + + +template +tmp> +gaussLaplacianSchemeGammaSnGradCorr +( + const surfaceVectorField& SfGammaCorr, + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + + tmp> tgammaSnGradCorr + ( + new GeometricField + ( + IOobject + ( + "gammaSnGradCorr("+vf.name()+')', + vf.instance(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + SfGammaCorr.dimensions() + *vf.dimensions()*mesh.deltaCoeffs().dimensions() + ) + ); + + for (direction cmpt = 0; cmpt < pTraits::nComponents; cmpt++) + { + tgammaSnGradCorr.ref().replace + ( + cmpt, + fvc::dotInterpolate(SfGammaCorr, fvc::grad(vf.component(cmpt))) + ); + } + + return tgammaSnGradCorr; +} + +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gammaScalarVol, + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + tmp> tinterpGammaScheme_(new linear(mesh)); + tmp> tsnGradScheme_(new fv::orthogonalSnGrad(mesh)); + + tmp> tgamma = tinterpGammaScheme_().interpolate(gammaScalarVol); + const GeometricField& gamma = tgamma.ref(); + + GeometricField gammaMagSf + ( + gamma*mesh.magSf() + ); + + tmp> tfvm = gaussLaplacianSchemeFvmLaplacianUncorrected + ( + gammaMagSf, + tsnGradScheme_().deltaCoeffs(vf), + vf + ); + fvMatrix& fvm = tfvm.ref(); + + if (tsnGradScheme_().corrected()) + { + if (mesh.fluxRequired(vf.name())) + { + fvm.faceFluxCorrectionPtr() = new + GeometricField + ( + gammaMagSf*tsnGradScheme_().correction(vf) + ); + + fvm.source() -= + mesh.V()* + fvc::div + ( + *fvm.faceFluxCorrectionPtr() + )().primitiveField(); + } + else + { + fvm.source() -= + mesh.V()* + fvc::div + ( + gammaMagSf*tsnGradScheme_().correction(vf) + )().primitiveField(); + } + } + return tfvm; +} + +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const 
GeometricField& gamma, + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + tmp> tsnGradScheme_(new fv::orthogonalSnGrad(mesh)); + + GeometricField gammaMagSf + ( + gamma*mesh.magSf() + ); + + tmp> tfvm = gaussLaplacianSchemeFvmLaplacianUncorrected + ( + gammaMagSf, + tsnGradScheme_().deltaCoeffs(vf), + vf + ); + fvMatrix& fvm = tfvm.ref(); + + if (tsnGradScheme_().corrected()) + { + if (mesh.fluxRequired(vf.name())) + { + fvm.faceFluxCorrectionPtr() = new + GeometricField + ( + gammaMagSf*tsnGradScheme_().correction(vf) + ); + + fvm.source() -= + mesh.V()* + fvc::div + ( + *fvm.faceFluxCorrectionPtr() + )().primitiveField(); + } + else + { + fvm.source() -= + mesh.V()* + fvc::div + ( + gammaMagSf*tsnGradScheme_().correction(vf) + )().primitiveField(); + } + } + return tfvm; +} + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gammaScalarVol, + const GeometricField& vf +); + +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gammaScalarVol, + const GeometricField& vf +); + +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gamma, + const GeometricField& vf +); + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +} // End namespace Foam + +// ************************************************************************* // \ No newline at end of file diff --git a/applications/solvers/dfLowMachFoam/Make/options b/applications/solvers/dfLowMachFoam/Make/options index e2a57bd00..bda93210e 100644 --- a/applications/solvers/dfLowMachFoam/Make/options +++ b/applications/solvers/dfLowMachFoam/Make/options @@ -9,7 +9,6 @@ EXE_INC = -std=c++14 \ $(PFLAGS) $(PINC) \ $(if $(LIBTORCH_ROOT),-DUSE_LIBTORCH,) \ $(if $(PYTHON_INC_DIR),-DUSE_PYTORCH,) \ - $(if $(AMGX_DIR),-DGPUSolver_,) \ -I$(LIB_SRC)/transportModels/compressible/lnInclude \ 
-I$(LIB_SRC)/thermophysicalModels/basic/lnInclude \ -I$(LIB_SRC)/TurbulenceModels/turbulenceModels/lnInclude \ @@ -29,7 +28,8 @@ EXE_INC = -std=c++14 \ $(PYTHON_INC_DIR) \ $(if $(AMGX_DIR), -I$(DF_ROOT)/src_gpu,) \ $(if $(AMGX_DIR), -I/usr/local/cuda-11.6/include,) \ - $(if $(AMGX_DIR), -I$(AMGX_DIR)/include,) + $(if $(AMGX_DIR), -I$(AMGX_DIR)/include,) \ + -I$(DF_ROOT)/GPUTestRef/lnInclude \ EXE_LIBS = \ -lcompressibleTransportModels \ @@ -43,6 +43,7 @@ EXE_LIBS = \ -ldfCanteraMixture \ -ldfChemistryModel \ -ldfCombustionModels \ + -ldfGenMatrix \ $(CANTERA_ROOT)/lib/libcantera.so \ $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libtorch.so,) \ $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libc10.so,) \ diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H index c3ee91068..38934abdb 100644 --- a/applications/solvers/dfLowMachFoam/UEqn.H +++ b/applications/solvers/dfLowMachFoam/UEqn.H @@ -86,6 +86,121 @@ // K = 0.5*magSqr(U); // } // UEqn_GPU.checkValue(true); +#elif defined GPUSolverNew_ + const tmp nuEff_tmp(turbulence->nuEff()); + const volScalarField& nuEff = nuEff_tmp(); + + // run CPU, for temp + tmp tUEqn + ( + fvm::ddt(rho, U) + + + fvm::div(phi, U) + + + turbulence->divDevRhoReff(U) + == -fvc::grad(p) + ); + fvVectorMatrix& UEqn = tUEqn.ref(); + + // run GPU + // preProcess + // TODO: preProcessForRhoEqn for temp, now we only transfer phi(instead of rhoEqn) used by fvm::div(phi, U) + UEqn_GPU.sync(); + double *h_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::internal); + double *h_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::internal); + double *h_boundary_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::boundary); + memcpy(h_rho, &rho[0], dfDataBase.cell_value_bytes); + memcpy(h_phi, &phi[0], dfDataBase.surface_value_bytes); + int offset = 0; + forAll(phi.boundaryField(), patchi) + { + const fvsPatchScalarField& patchPhi = phi.boundaryField()[patchi]; + int 
patchsize = patchPhi.size(); + memcpy(h_boundary_phi + offset, &patchPhi[0], patchsize * sizeof(double)); + offset += patchsize; + } + UEqn_GPU.preProcessForRhoEqn(h_rho, h_phi, h_boundary_phi); + DEBUG_TRACE; + + TICK_START; + // preparing u, p, nu_eff, and rho.boundary used by UEqn_GPU.preProcess() + double *h_u = dfDataBase.getFieldPointer("u", location::cpu, position::internal); + double *h_boundary_u = dfDataBase.getFieldPointer("u", location::cpu, position::boundary); + double *h_p = dfDataBase.getFieldPointer("p", location::cpu, position::internal); + double *h_boundary_p = dfDataBase.getFieldPointer("p", location::cpu, position::boundary); + double *h_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::internal); + double *h_boundary_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::boundary); + double *h_boundary_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::boundary); + TICK_STOP(get pointer); + + TICK_START; + U.oldTime(); + memcpy(h_u, &U.oldTime()[0][0], dfDataBase.cell_value_vec_bytes); + memcpy(h_p, &p[0], dfDataBase.cell_value_bytes); + memcpy(h_nu_eff, &nuEff[0], dfDataBase.cell_value_bytes); + TICK_STOP(copy to pinned memory); + + TICK_START; + offset = 0; + forAll(U.boundaryField(), patchi) + { + const fvPatchVectorField& patchU = U.boundaryField()[patchi]; + const fvPatchScalarField& patchP = p.boundaryField()[patchi]; + const fvPatchScalarField& patchNuEff = nuEff.boundaryField()[patchi]; + const fvPatchScalarField& patchRho = rho.boundaryField()[patchi]; + int patchsize = patchU.size(); + memcpy(h_boundary_u + offset * 3, &patchU[0][0], patchsize * 3 * sizeof(double)); + memcpy(h_boundary_p + offset, &patchP[0], patchsize * sizeof(double)); + memcpy(h_boundary_nu_eff + offset, &patchNuEff[0], patchsize * sizeof(double)); + memcpy(h_boundary_rho + offset, &patchRho[0], patchsize * sizeof(double)); + offset += patchsize; + } + TICK_STOP(CPU prepare boundary time); + + TICK_START; + 
UEqn_GPU.preProcess(h_u, h_boundary_u, h_p, h_boundary_p, h_nu_eff, h_boundary_nu_eff, h_boundary_rho); + DEBUG_TRACE; + UEqn_GPU.sync(); + TICK_STOP(GPU preProcess time); + + // process + TICK_START; + UEqn_GPU.process(); + DEBUG_TRACE; + UEqn_GPU.sync(); + TICK_STOP(GPU process time); + + TICK_START; + UEqn_GPU.solve(); + TICK_STOP(GPU solve time); + + // postProcess + TICK_START; + UEqn_GPU.postProcess(h_u); + U.correctBoundaryConditions(); + DEBUG_TRACE; + TICK_STOP(post process time); + + // checkResult + // TODO: for temp, now we compare ldu, finally we compare csr + std::vector h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); + std::vector h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); + offset = 0; + for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) + { + int patchsize = dfDataBase.patch_size[patchi]; + const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0]; + const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0]; + memcpy(h_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double)); + memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double)); + offset += patchsize; + } + bool printFlag = false; + UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0], + h_internal_coeffs.data(), h_boundary_coeffs.data(), + // &DivTensor[0][0], + printFlag); + DEBUG_TRACE; #else start1 = std::clock(); tmp tUEqn diff --git a/applications/solvers/dfLowMachFoam/createGPUSolver.H b/applications/solvers/dfLowMachFoam/createGPUSolver.H new file mode 100644 index 000000000..94fff1125 --- /dev/null +++ b/applications/solvers/dfLowMachFoam/createGPUSolver.H @@ -0,0 +1,97 @@ +dfMatrixDataBase dfDataBase; +//dfRhoEqn rhoEqn_GPU; +dfUEqn UEqn_GPU(dfDataBase); +//dfYEqn YEqn_GPU; +//dfEEqn EEqn_GPU; + +void createGPUBase(fvMesh& mesh, PtrList& Y) { + // prepare constant values: num_cells, num_surfaces, 
num_boundary_surfaces, num_patches, patch_size, num_species, rdelta_t + const labelUList& owner = mesh.owner(); + const labelUList& neighbour = mesh.neighbour(); + int num_cells = mesh.nCells(); + int num_surfaces = neighbour.size(); + int num_boundary_surfaces = 0; + int num_patches = 0; + std::vector patch_size; + forAll(mesh.boundary(), patchi) { + labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); + int patchsize = sub_boundary.size(); + patch_size.push_back(patchsize); + num_boundary_surfaces += patchsize; + num_patches++; + } + // TODO: get deltaT fomr time API + double rDeltaT = 1 / 1e-6; + dfDataBase.setConstantValues(num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, Y.size(), rDeltaT); + + // prepare constant indexes: owner, neighbor + dfDataBase.setConstantIndexes(&owner[0], &neighbour[0]); + + // prepare internal and boundary of sf, mag_sf, weights, delta_coeffs, volume + double *boundary_sf = new double[3 * num_boundary_surfaces]; + double *boundary_mag_sf = new double[num_boundary_surfaces]; + double *boundary_delta_coeffs = new double[num_boundary_surfaces]; + int *boundary_face_cell = new int[num_boundary_surfaces]; + int offset = 0; + forAll(mesh.boundary(), patchi) { + const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; + const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; + const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; + const labelUList& pFaceCells = mesh.boundary()[patchi].faceCells(); + + int patchsize = pMagSf.size(); + + memcpy(boundary_sf + 3*offset, &pSf[0][0], 3*patchsize*sizeof(double)); + memcpy(boundary_mag_sf + offset, &pMagSf[0], patchsize*sizeof(double)); + memcpy(boundary_delta_coeffs + offset, &pDeltaCoeffs[0], patchsize*sizeof(double)); + memcpy(boundary_face_cell + offset, &pFaceCells[0], patchsize * sizeof(int)); + offset += patchsize; + } + + dfDataBase.createConstantFieldsInternal(); + dfDataBase.createConstantFieldsBoundary(); + 
dfDataBase.initConstantFieldsInternal(&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.surfaceInterpolation::weights()[0], &mesh.nonOrthDeltaCoeffs()[0], &mesh.V()[0]); + dfDataBase.initConstantFieldsBoundary(boundary_sf, boundary_mag_sf, boundary_delta_coeffs, boundary_face_cell); + + // prepare internal and boundary of Y + dfDataBase.createNonConstantFieldsInternal(); + dfDataBase.createNonConstantFieldsBoundary(); + forAll(Y, speciesI) { + volScalarField& Yi = Y[speciesI]; + memcpy(dfDataBase.h_y + speciesI * num_cells, &Yi[0], num_cells * sizeof(double)); + offset = 0; + forAll(Yi.boundaryField(), patchi) { + const scalarField& patchYi = Yi.boundaryField()[patchi]; + int patchsize = patchYi.size(); + memcpy(dfDataBase.h_boundary_y + speciesI * num_boundary_surfaces + offset, &patchYi[0], patchsize*sizeof(double)); + offset += patchsize; + } + } + dfDataBase.initNonConstantFieldsInternal(dfDataBase.h_y); + dfDataBase.initNonConstantFieldsBoundary(dfDataBase.h_boundary_y); +} + +void createGPUUEqn(const IOdictionary& CanteraTorchProperties, const volVectorField& U) { + // prepare mode_string and setting_path + string mode_string = "dDDI"; + string settingPath; + settingPath = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string("")); + UEqn_GPU.setConstantValues(mode_string, settingPath); + + // prepare patch_type + std::vector patch_type; + patch_type.resize(dfDataBase.num_patches); + forAll(U.boundaryField(), patchi) + { + constructBoundarySelectorPerPatch(&(patch_type[patchi]), U.boundaryField()[patchi].type()); + } + UEqn_GPU.setConstantFields(patch_type); + + // prepare internal and boundary of xxx + UEqn_GPU.createNonConstantFieldsInternal(); + UEqn_GPU.createNonConstantFieldsBoundary(); + UEqn_GPU.createNonConstantLduAndCsrFields(); + // UEqn_GPU has no internal non-constant fields to be init + // UEqn_GPU.initNonConstantFieldsInternal(); + UEqn_GPU.initNonConstantFieldsBoundary(); +} diff --git 
a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C index db6b25b18..6ea4251af 100644 --- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C +++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C @@ -60,14 +60,34 @@ Description #include "basicThermo.H" #include "CombustionModel.H" -#ifdef GPUSolver_ +#define GPUSolverNew_ +#define TIME + +#ifdef GPUSolverNew_ #include "dfUEqn.H" -#include "dfYEqn.H" -#include "dfRhoEqn.H" -#include "dfEEqn.H" +// #include "dfYEqn.H" +// #include "dfRhoEqn.H" +// #include "dfEEqn.H" +#include "dfMatrixDataBase.H" +#include "dfMatrixOpBase.H" #include #include + +#include "createGPUSolver.H" + #include "upwind.H" +#include "GenFvMatrix.H" +#endif + +#ifdef TIME + #define TICK_START \ + start_new = std::clock(); + #define TICK_STOP(prefix) \ + stop_new = std::clock(); \ + Foam::Info << #prefix << " time = " << double(stop_new - start_new) / double(CLOCKS_PER_SEC) << " s" << Foam::endl; +#else + #define TICK_START + #define TICK_STOP(prefix) #endif // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // @@ -148,6 +168,8 @@ int main(int argc, char *argv[]) label timeIndex = 0; clock_t start, end, start1, end1, start2, end2; + clock_t start_new, stop_new; + double time_new = 0; turbulence->validate(); @@ -158,9 +180,11 @@ int main(int argc, char *argv[]) } start1 = std::clock(); - #ifdef GPUSolver_ - #include "createdfSolver.H" - #endif +#ifdef GPUSolverNew_ + createGPUBase(mesh, Y); + createGPUUEqn(CanteraTorchProperties, U); +#endif + end1 = std::clock(); time_monitor_init += double(end1 - start1) / double(CLOCKS_PER_SEC); @@ -187,7 +211,9 @@ int main(int argc, char *argv[]) runTime++; Info<< "Time = " << runTime.timeName() << nl << endl; - +#ifdef GPUSolverNew_ + dfDataBase.preTimeStep(&rho.oldTime()[0]); +#endif clock_t loop_start = std::clock(); // --- Pressure-velocity PIMPLE corrector loop while (pimple.loop()) @@ -276,6 +302,10 @@ int main(int argc, 
char *argv[]) rho = thermo.rho(); +#ifdef GPUSolverNew_ + dfDataBase.postTimeStep(); +#endif + runTime.write(); Info<< "========Time Spent in diffenet parts========"<< endl; Info<< "loop Time = " << loop_time << " s" << endl; diff --git a/applications/solvers/dfLowMachFoam/new_UEqn.H b/applications/solvers/dfLowMachFoam/new_UEqn.H new file mode 100644 index 000000000..41b804a4b --- /dev/null +++ b/applications/solvers/dfLowMachFoam/new_UEqn.H @@ -0,0 +1,116 @@ +#ifdef GPUSolver_ +const tmp nuEff_tmp(turbulence->nuEff()); +const volScalarField& nuEff = nuEff_tmp(); + +// run CPU, for temp +tmp tUEqn +( + fvm::ddt(rho, U) + fvm::div(phi, U) + + turbulence->divDevRhoReff(U) + == -fvc::grad(p) +); +// tmp tUEqn_ref // test turbulence->divDevRhoReff(U) +// ( +// - fvc::div((turbulence->rho()*turbulence->nuEff())*dev2(Foam::T(fvc::grad(U)))) +// - fvm::laplacian(turbulence->rho()*turbulence->nuEff(), U) +// ); + +fvVectorMatrix& UEqn = tUEqn.ref(); + +// run GPU +// preProcess +// TODO: preProcessForRhoEqn for temp, now we only transfer phi(instead of rhoEqn) used by fvm::div(phi, U) +UEqn_GPU.sync(); +double *h_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::internal); +double *h_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::internal); +double *h_boundary_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::boundary); +memcpy(h_rho, &rho[0], dfDataBase.cell_value_bytes); +memcpy(h_phi, &phi[0], dfDataBase.surface_value_bytes); +int offset = 0; +forAll(phi.boundaryField(), patchi) +{ + const fvsPatchScalarField& patchPhi = phi.boundaryField()[patchi]; + int patchsize = patchPhi.size(); + memcpy(h_boundary_phi + offset, &patchPhi[0], patchsize * sizeof(double)); + offset += patchsize; +} +UEqn_GPU.preProcessForRhoEqn(h_rho, h_phi, h_boundary_phi); +DEBUG_TRACE; +clock_t start = std::clock(); +// preparing u, p, nu_eff, and rho.boundary used by UEqn_GPU.preProcess() +double *h_u = dfDataBase.getFieldPointer("u", 
location::cpu, position::internal); +double *h_boundary_u = dfDataBase.getFieldPointer("u", location::cpu, position::boundary); +double *h_p = dfDataBase.getFieldPointer("p", location::cpu, position::internal); +double *h_boundary_p = dfDataBase.getFieldPointer("p", location::cpu, position::boundary); +double *h_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::internal); +double *h_boundary_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::boundary); +double *h_boundary_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::boundary); +double end = std::clock(); +Info << "get pointer" << double(end - start) / double(CLOCKS_PER_SEC) << endl; + +start = std::clock(); +memcpy(h_u, &U.oldTime()[0][0], dfDataBase.cell_value_vec_bytes); +memcpy(h_p, &p[0], dfDataBase.cell_value_bytes); +memcpy(h_nu_eff, &nuEff[0], dfDataBase.cell_value_bytes); +end = std::clock(); +Info << "copy to pinned memory" << double(end - start) / double(CLOCKS_PER_SEC) << endl; + +start = std::clock(); +offset = 0; +forAll(U.boundaryField(), patchi) +{ + const fvPatchVectorField& patchU = U.boundaryField()[patchi]; + const fvPatchScalarField& patchP = p.boundaryField()[patchi]; + const fvPatchScalarField& patchNuEff = nuEff.boundaryField()[patchi]; + const fvPatchScalarField& patchRho = rho.boundaryField()[patchi]; + int patchsize = patchU.size(); + memcpy(h_boundary_u + offset * 3, &patchU[0][0], patchsize * 3 * sizeof(double)); + memcpy(h_boundary_p + offset, &patchP[0], patchsize * sizeof(double)); + memcpy(h_boundary_nu_eff + offset, &patchNuEff[0], patchsize * sizeof(double)); + memcpy(h_boundary_rho + offset, &patchRho[0], patchsize * sizeof(double)); + offset += patchsize; +} +end = std::clock(); +Info << "CPU prepare boundary time" << double(end - start) / double(CLOCKS_PER_SEC) << endl; + +start = std::clock(); +UEqn_GPU.preProcess(h_u, h_boundary_u, h_p, h_boundary_p, h_nu_eff, h_boundary_nu_eff, h_boundary_rho); +DEBUG_TRACE; 
+UEqn_GPU.sync(); +end = std::clock(); +Info << "GPU preProcess time" << double(end - start) / double(CLOCKS_PER_SEC) << endl; + +// process +start = std::clock(); +UEqn_GPU.process(); +end = std::clock(); +DEBUG_TRACE; +UEqn_GPU.sync(); +// end = std::clock(); +Info << "GPU process time" << double(end - start) / double(CLOCKS_PER_SEC) << endl; + +// postProcess +UEqn_GPU.postProcess(h_u); +DEBUG_TRACE; + +// checkResult +// TODO: for temp, now we compare ldu, finally we compare csr +std::vector h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); +std::vector h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); +offset = 0; +for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) +{ + int patchsize = dfDataBase.patch_size[patchi]; + const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0]; + const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0]; + memcpy(h_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double)); + memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double)); + offset += patchsize; +} +bool printFlag = true; +UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0], + h_internal_coeffs.data(), h_boundary_coeffs.data(), + // &DivTensor[0][0], + printFlag); +DEBUG_TRACE; +#endif diff --git a/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C new file mode 100644 index 000000000..7d867687f --- /dev/null +++ b/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C @@ -0,0 +1,113 @@ +/*---------------------------------------------------------------------------*\ + ========= | + \\ / F ield | OpenFOAM: The Open Source CFD Toolbox + \\ / O peration | Website: https://openfoam.org + \\ / A nd | Copyright (C) 2011-2019 OpenFOAM Foundation + \\/ M anipulation | +------------------------------------------------------------------------------- 
+License + This file is part of OpenFOAM. + + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . + +Application + unittest + +Description + GPU unittest + +\*---------------------------------------------------------------------------*/ + +#include "dfChemistryModel.H" +#include "CanteraMixture.H" +// #include "hePsiThermo.H" +#include "heRhoThermo.H" + +#include "fvCFD.H" +#include "fluidThermo.H" +#include "turbulentFluidThermoModel.H" +#include "pimpleControl.H" +#include "pressureControl.H" +#include "localEulerDdtScheme.H" +#include "fvcSmooth.H" +#include "PstreamGlobals.H" +#include "basicThermo.H" +#include "CombustionModel.H" + +#include +#include +#include "upwind.H" + +#include "dfMatrixDataBase.H" +#include "dfMatrixOpBase.H" +#include "GenFvMatrix.H" +#include "dfUEqn.H" +#include "createGPUSolver.H" + +#define GPUSolver_ + +int main(int argc, char *argv[]) +{ +#ifdef USE_PYTORCH + pybind11::scoped_interpreter guard{};//start python interpreter +#endif + #include "postProcess.H" + + // #include "setRootCaseLists.H" + #include "listOptions.H" + #include "setRootCase2.H" + #include "listOutput.H" + + #include "createTime.H" + #include "createMesh.H" + #include "createDyMControls.H" + #include "initContinuityErrs.H" + #include "createFields.H" + #include "createRhoUfIfPresent.H" + + turbulence->validate(); + + if (!LTS) + { + #include "compressibleCourantNo.H" + #include "setInitialDeltaT.H" + } + + // * * * * * * * * * * * * * * 
* * * * * * * * * * * * * * * * * * * * * // + { + #include "readDyMControls.H" + + if (LTS) + { + #include "setRDeltaT.H" + } + else + { + #include "compressibleCourantNo.H" + #include "setDeltaT.H" + } + + createGPUBase(mesh, Y); + createGPUUEqn(CanteraTorchProperties, U); + + // for (int timestep = 0; timestep < 10; timestep++) { + dfDataBase.preTimeStep(&rho.oldTime()[0]); + #include "new_UEqn.H" + dfDataBase.postTimeStep(); + // } + } + return 0; +} + + diff --git a/src_gpu/CMakeLists.txt b/src_gpu/CMakeLists.txt index 6e4a7efef..03a7fe6db 100644 --- a/src_gpu/CMakeLists.txt +++ b/src_gpu/CMakeLists.txt @@ -12,6 +12,8 @@ find_package(MPI REQUIRED) find_package(CUDAToolkit REQUIRED) find_library(LIBAMGXSH amgxsh PATHS $ENV{AMGX_DIR}/build) +add_compile_options(-arch=sm_70 -fmad=false) + include_directories( ${MPI_INCLUDE_PATH} ${CUDA_INCLUDE_DIRS} @@ -19,13 +21,11 @@ include_directories( ) add_library(${PROJECT_NAME} - SHARED - dfUEqn.cu - dfRhoEqn.cu - dfYEqn.cu - dfEEqn.cu + SHARED AmgXSolver.cu - dfMatrixDataBase.cu) + dfMatrixDataBase.cu + dfMatrixOpBase.cu + dfUEqn.cu) target_link_libraries(${PROJECT_NAME} ${MPI_LIBRARIES} diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 8efb4bf62..cac7264a8 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -12,7 +12,9 @@ #include #include #include +#include +#define DEBUG_TRACE fprintf(stderr, "%s %d\n", __FILE__, __LINE__); static const char *_cudaGetErrorEnum(cudaError_t error) { return cudaGetErrorName(error); @@ -30,17 +32,29 @@ void check(T result, char const *const func, const char *const file, #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) -inline void checkVectorEqual(int count, const double* basevec, double* vec, double max_relative_error) { +inline void checkVectorEqual(int count, const double* basevec, double* vec, double max_relative_error, bool print = false) { for (size_t i = 0; i < count; ++i) { double abs_diff = fabs(basevec[i] - 
vec[i]); double rel_diff = fabs(basevec[i] - vec[i]) / fabs(basevec[i]); + if (print) + fprintf(stderr, "index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff); // if (abs_diff > 1e-12 && rel_diff > max_relative_error && !std::isinf(rel_diff)) if (abs_diff > 1e-15 && rel_diff > max_relative_error) - fprintf(stderr, "mismatch index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff); + fprintf(stderr, "mismatch index %d, cpu data: %.30lf, gpu data: %.30lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff); } } +enum location { + cpu, + gpu +}; + +enum position { + internal, + boundary +}; + enum boundaryConditions{ zeroGradient, fixedValue, @@ -48,594 +62,147 @@ enum boundaryConditions{ empty }; -void constructBoundarySelector(std::vector& patchTypeSelector, const std::string& patchTypeStr, const int patchSize); +void constructBoundarySelectorPerPatch(int *patchTypeSelector, const std::string& patchTypeStr); struct dfMatrixDataBase { - // - cuda resource + // cuda resource cudaStream_t stream; - // - number of cell size - int num_cells; - // - number of face size - int num_surfaces; - // - number of offdiagnal entry size (2*num_surfaces) - int num_faces; - // - number of boundary cells - int num_boundary_cells; - // - number of boundary faces - int num_boundary_faces; - - int num_species; - - // - mesh variables - // - csr_row_index - int *h_A_csr_row_index=nullptr, *d_A_csr_row_index=nullptr; - // - csr_col_index - int *h_A_csr_col_index=nullptr, *d_A_csr_col_index=nullptr; - // - csr_diag_index - int *h_A_csr_diag_index=nullptr, *d_A_csr_diag_index=nullptr; - - // - the pre-permutated and post-permutated interpolation weight list - std::vector h_weight_vec_init, h_weight_vec; - // - the pre-permutated and post-permutated flux (phi) list - std::vector h_phi_vec_init, h_phi_vec; - // - the pre-permutated and post-permutated cell face vector list - std::vector 
h_face_vector_vec_init, h_face_vector_vec; - std::vector h_face_vec_init, h_face_vec; - std::vector h_deltaCoeffs_vec_init, h_deltaCoeffs_vec; - // - the host pointer to rho_new, rho_old, velocity_old, pressure and volume list - double *h_rho_new = nullptr, *h_rho_old = nullptr, *h_velocity_old = nullptr, - *h_pressure = nullptr; - const double *h_volume = nullptr; - // - the host pointer to the pre-permutated and post-permutated interpolation weight list - double *h_weight_init = nullptr, *h_weight = nullptr; - // - the host pointer to the pre-permutated and post-permutated flux (phi) list - double *h_phi_init = nullptr, *h_phi = nullptr; - // - the host pointer to the pre-permutated and post-permutated cell face vector list - double *h_face_vector_init = nullptr, *h_face_vector = nullptr; - double *h_face_init = nullptr, *h_face = nullptr; - double *h_deltaCoeffs_init = nullptr, *h_deltaCoeffs = nullptr; - // - the device pointer to rho_new, rho_old, velocity_old, pressure and volume list - double *d_rho_new = nullptr, *d_rho_old = nullptr, *d_velocity_old = nullptr, - *d_pressure = nullptr, *d_volume = nullptr; - // - the device pointer to Y(vector Yi) - //std::vector d_Y; - double *d_Y = nullptr; - // - the device pointer to the pre-permutated and post-permutated interpolation weight list - double *d_weight_init = nullptr, *d_weight = nullptr; - double *d_weight_upwind = nullptr; - // - the device pointer to the pre-permutated and post-permutated flux (phi) list - double *d_phi_init = nullptr, *d_phi = nullptr; - // - the device pointer to the pre-permutated and post-permutated cell face vector list - double *d_face_vector_init = nullptr, *d_face_vector = nullptr; - double *d_face_init = nullptr, *d_face = nullptr; - double *d_deltaCoeffs_init = nullptr, *d_deltaCoeffs = nullptr; - std::vector d_rhoD_vector; - - double *d_hDiffCorrFlux = nullptr; - double *d_diffAlphaD = nullptr; - double *d_rhoD = nullptr; - double *d_alpha = nullptr; - - double rdelta_t = 
1/1e-6; - - /** - * @brief boundary related variables - */ - int *h_boundary_cell_offset = nullptr, *d_boundary_cell_offset=nullptr; - int *h_boundary_cell_id = nullptr, *d_boundary_cell_id = nullptr; - double *h_internal_coeffs = nullptr, *h_boundary_coeffs = nullptr, - *h_boundary_pressure = nullptr, *h_boundary_face_vector = nullptr, - *h_boundary_face = nullptr, *d_boundary_face = nullptr, - *h_boundary_deltaCoeffs = nullptr, *d_boundary_deltaCoeffs = nullptr, - *d_internal_coeffs = nullptr, *d_boundary_coeffs = nullptr, - *d_internal_coeffs_init = nullptr, *d_boundary_coeffs_init = nullptr, - *d_laplac_internal_coeffs = nullptr, *d_laplac_boundary_coeffs = nullptr, - *d_laplac_internal_coeffs_init = nullptr, *d_laplac_boundary_coeffs_init = nullptr, - *d_boundary_pressure = nullptr, *d_boundary_face_vector = nullptr, - *d_boundary_pressure_init = nullptr, - *d_boundary_phi = nullptr, *d_boundary_phi_init = nullptr, - *d_boundary_velocity = nullptr, *d_boundary_velocity_init = nullptr, - *d_boundary_nuEff = nullptr, *d_boundary_nuEff_init = nullptr, - *d_boundary_rho = nullptr, *d_boundary_rho_init = nullptr; - std::vector d_boundary_Y_vector; - std::vector d_boundary_Y_init_vector; - std::vector d_internal_coeffs_Y_vector; - std::vector d_boundary_coeffs_Y_vector; - std::vector d_laplac_internal_coeffs_Y_vector; - std::vector d_laplac_boundary_coeffs_Y_vector; - double *d_internal_coeffs_Y = nullptr; - double *d_boundary_coeffs_Y = nullptr; - double *d_laplac_internal_coeffs_Y = nullptr; - double *d_laplac_boundary_coeffs_Y = nullptr; - std::vector d_boundary_rhoD_vector; - double *d_boundary_mut_sct = nullptr; - double *d_boundary_rhoD = nullptr; - double *d_boundary_alpha = nullptr; - - double *d_boundary_hDiffCorrFlux = nullptr; - int *d_boundary_UpatchType = nullptr; - int *d_boundary_YpatchType = nullptr; - - std::vector boundPermutationList; - std::vector ueqn_internalCoeffs, ueqn_boundaryCoeffs; - std::vector boundary_face_vector; - std::vector 
boundary_pressure; - std::vector boundary_face; - std::vector boundary_deltaCoeffs; - std::vector> patch_type_init; - std::vector> patch_type; - - // - the device pointer to the permutated index list - std::vector permedIndex; - int *d_permedIndex=nullptr; - int *d_bouPermedIndex = nullptr; - - - // bytesize - // - bytes of diagnal entries - size_t cell_bytes; - // - bytes of diagnal entries (vector) - size_t cell_vec_bytes; - // - bytes of diagnal index - size_t cell_index_bytes; - // - bytes of diagnal index - size_t face_bytes; - size_t face_vec_bytes; - size_t face_index_bytes; - - size_t boundary_cell_bytes; - size_t boundary_cell_vec_bytes; - size_t boundary_cell_index_bytes; - - size_t boundary_face_bytes; - size_t boundary_face_vec_bytes; - size_t boundary_face_index_bytes; - - // A_csr has one more element in each row: itself - size_t csr_row_index_bytes; - size_t csr_col_index_bytes; - size_t csr_value_bytes; - size_t csr_value_vec_bytes; - - // extra matrix information - double *d_turbSrc_A = nullptr, *d_turbSrc_b = nullptr, *d_turbSrc_A_init = nullptr; - std::vector h_turbSrc_init_mtx_vec, h_turbSrc_init_1mtx; - std::vector h_turbSrc_init_src_vec, h_turbSrc_src_vec; - std::vector tmpPermutatedList; - int * d_tmpPermutatedList = nullptr; - - // double *h_A_csr = nullptr, *h_b = nullptr, *h_psi = nullptr; - // double *d_A_csr = nullptr, *d_b = nullptr, *d_psi = nullptr; - - int num_iteration; - - double time_monitor_CPU; - double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test; - - double* d_grad = nullptr; - double* d_grad_boundary = nullptr, *d_grad_boundary_init = nullptr; - double* d_nuEff = nullptr; + // constant values -- basic + int num_cells = 0; + int num_surfaces = 0; + int num_boundary_surfaces = 0; + int num_patches = 0; + int num_species = 0; + std::vector patch_size; + double rdelta_t = 0; + + // constant values -- ldu bytesize + size_t cell_value_bytes = 0; + size_t cell_value_vec_bytes = 0; + size_t 
cell_value_tsr_bytes = 0; + size_t cell_index_bytes = 0; + size_t surface_value_bytes = 0; + size_t surface_index_bytes = 0; + size_t surface_value_vec_bytes = 0; + size_t boundary_surface_value_bytes = 0; + size_t boundary_surface_value_vec_bytes = 0; + size_t boundary_surface_value_tsr_bytes = 0; + size_t boundary_surface_index_bytes = 0; + + // constant values -- csr bytesize + size_t csr_row_index_bytes = 0; + size_t csr_col_index_bytes = 0; + size_t csr_value_bytes = 0; + size_t csr_value_vec_bytes = 0; + + // constant indexes + int *d_owner = nullptr; + int *d_neighbor = nullptr; + int *d_lower_to_csr_index = nullptr; + int *d_diag_to_csr_index= nullptr; + int *d_upper_to_csr_index= nullptr; + int *d_csr_row_index= nullptr; + int *d_csr_col_index= nullptr; + + // constant fields - internal + double *d_sf = nullptr; + double *d_mag_sf = nullptr; + double *d_weight = nullptr; + double *d_delta_coeffs = nullptr; + double *d_volume = nullptr; + + // constant fields - boundary + double *d_boundary_sf = nullptr; + double *d_boundary_mag_sf = nullptr; + double *d_boundary_weight = nullptr; + double *d_boundary_delta_coeffs = nullptr; + int *d_boundary_face_cell = nullptr; + + // non-constant fields - internal + // TODO: further estimate + // fields solved by eqns - new + double *d_rho = nullptr; + double *d_u = nullptr; + double *d_y = nullptr; + double *d_he = nullptr; + double *d_p = nullptr; + // fields solved by eqns - old + // TODO: not all fields need to store oldTime + double *d_rho_old = nullptr; + //double *d_u_old = nullptr; + //double *d_y_old = nullptr; + //double *d_he_old = nullptr; + //double *d_p_old = nullptr; + // other shared fields between eqns + double *d_phi = nullptr; + // computed on GPU, used on CPU, need memcpyd2h - host + double *h_rho = nullptr; + double *h_rho_old = nullptr; + double *h_u= nullptr; + double *h_y= nullptr; + double *h_he= nullptr; + // computed on CPU, used on GPU, need memcpyh2d - host + double *h_p= nullptr; + double 
*h_phi= nullptr; + + // non-constant fields - boundary + // TODO: further estimate + // fields solved by eqns - new + double *d_boundary_rho = nullptr; + double *d_boundary_u = nullptr; + double *d_boundary_y = nullptr; + double *d_boundary_he = nullptr; + double *d_boundary_p = nullptr; + // fields solved by eqns - old + double *d_boundary_rho_old = nullptr; + //double *d_boundary_u_old = nullptr; + //double *d_boundary_y_old = nullptr; + //double *d_boundary_he_old = nullptr; + //double *d_boundary_p_old = nullptr; + // other shared fields between eqns + double *d_boundary_phi = nullptr; + // computed on GPU, used on CPU, need memcpyd2h - host + double *h_boundary_rho = nullptr; + double *h_boundary_rho_old = nullptr; + double *h_boundary_u= nullptr; + double *h_boundary_y= nullptr; + double *h_boundary_he= nullptr; + // computed on CPU, used on GPU, need memcpyh2d - host + double *h_boundary_p= nullptr; + double *h_boundary_phi= nullptr; + + std::unordered_map fieldPointerMap; // constructor dfMatrixDataBase(); - dfMatrixDataBase(int num_surfaces, int num_cells, int num_boundary_faces, int num_species, int & num_boundary_cells_output, - const int *neighbour, const int *owner, const double* volume, const double* weight, const double* face_vector, const double* face, - const double* deltaCoeffs, std::vector boundary_face_vector_init, std::vector boundary_face_init, - std::vector boundary_deltaCoeffs_init, std::vector boundary_cell_id_init, std::vector> patch_type_init) - : num_cells(num_cells), num_faces(num_surfaces*2), num_surfaces(num_surfaces), num_species(num_species), num_iteration(0), - num_boundary_faces(num_boundary_faces), h_volume(volume), patch_type_init(patch_type_init) - { - // create cuda stream - checkCudaErrors(cudaStreamCreate(&stream)); - - // allocate field pointer in pin memory - cudaMallocHost(&h_phi_init, num_faces * sizeof(double)); - cudaMallocHost(&h_rho_old, num_cells * sizeof(double)); - - h_weight_vec_init.resize(num_faces); - 
h_weight_vec.resize(num_faces); - h_face_vector_vec_init.resize(num_faces*3); - h_face_vector_vec.resize(num_faces*3); - h_face_vec_init.resize(num_faces); - h_face_vec.resize(num_faces); - h_deltaCoeffs_vec_init.resize(num_faces); - h_deltaCoeffs_vec.resize(num_faces); - h_turbSrc_init_mtx_vec.resize(num_faces + num_cells); - h_turbSrc_init_1mtx.resize(num_faces + num_cells); - h_turbSrc_init_src_vec.resize(3*num_cells); - h_turbSrc_src_vec.resize(3*num_cells); - - // byte sizes - cell_bytes = num_cells * sizeof(double); - cell_vec_bytes = num_cells * 3 * sizeof(double); - cell_index_bytes = num_cells * sizeof(int); - - face_bytes = num_faces * sizeof(double); - face_vec_bytes = num_faces * 3 * sizeof(double); - face_index_bytes = num_faces * sizeof(int); - - // A_csr has one more element in each row: itself - csr_row_index_bytes = (num_cells + 1) * sizeof(int); - csr_col_index_bytes = (num_cells + num_faces) * sizeof(int); - csr_value_bytes = (num_cells + num_faces) * sizeof(double); - csr_value_vec_bytes = (num_cells + num_faces) * 3 * sizeof(double); - - /************************construct mesh variables****************************/ - /** - * 1. 
h_csr_row_index & h_csr_diag_index - */ - std::vector h_mtxEntry_perRow_vec(num_cells); - std::vector h_csr_diag_index_vec(num_cells); - std::vector h_csr_row_index_vec(num_cells + 1, 0); - - for (int faceI = 0; faceI < num_surfaces; faceI++) - { - h_csr_diag_index_vec[neighbour[faceI]]++; - h_mtxEntry_perRow_vec[neighbour[faceI]]++; - h_mtxEntry_perRow_vec[owner[faceI]]++; - } - - // - consider diagnal element in each row - std::transform(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_mtxEntry_perRow_vec.begin(), [](int n) - {return n + 1;}); - // - construct h_csr_row_index & h_csr_diag_index - std::partial_sum(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_csr_row_index_vec.begin()+1); - // - assign h_csr_row_index & h_csr_diag_index - h_A_csr_row_index = h_csr_row_index_vec.data(); - h_A_csr_diag_index = h_csr_diag_index_vec.data(); - - /** - * 2. h_csr_col_index - */ - std::vector rowIndex(num_faces + num_cells), colIndex(num_faces + num_cells), diagIndex(num_cells); - std::iota(diagIndex.begin(), diagIndex.end(), 0); - - // initialize the RowIndex (rowIndex of lower + upper + diagnal) - std::copy(neighbour, neighbour + num_surfaces, rowIndex.begin()); - std::copy(owner, owner + num_surfaces, rowIndex.begin() + num_surfaces); - std::copy(diagIndex.begin(), diagIndex.end(), rowIndex.begin() + num_faces); - // initialize the ColIndex (colIndex of lower + upper + diagnal) - std::copy(owner, owner + num_surfaces, colIndex.begin()); - std::copy(neighbour, neighbour + num_surfaces, colIndex.begin() + num_surfaces); - std::copy(diagIndex.begin(), diagIndex.end(), colIndex.begin() + num_faces); - - // - construct hashTable for sorting - std::multimap rowColPair; - for (int i = 0; i < 2*num_surfaces+num_cells; i++) - { - rowColPair.insert(std::make_pair(rowIndex[i], colIndex[i])); - } - // - sort - std::vector> globalPerm(rowColPair.begin(), rowColPair.end()); - std::sort(globalPerm.begin(), globalPerm.end(), [] - (const std::pair& pair1, 
const std::pair& pair2){ - if (pair1.first != pair2.first) { - return pair1.first < pair2.first; - } else { - return pair1.second < pair2.second; - } - }); - - std::vector h_csr_col_index_vec; - std::transform(globalPerm.begin(), globalPerm.end(), std::back_inserter(h_csr_col_index_vec), [] - (const std::pair& pair) { - return pair.second; - }); - h_A_csr_col_index = h_csr_col_index_vec.data(); - - // construct a tmp permutated List for add fvMatrix - std::vector tmp_permutation(2*num_surfaces + num_cells); - std::vector tmp_rowIndex(2*num_surfaces + num_cells); - std::iota(tmp_permutation.begin(), tmp_permutation.end(), 0); - std::copy(neighbour, neighbour + num_surfaces, tmp_rowIndex.begin()); - std::copy(diagIndex.begin(), diagIndex.end(), tmp_rowIndex.begin() + num_surfaces); - std::copy(owner, owner + num_surfaces, tmp_rowIndex.begin() + num_surfaces + num_cells); - std::multimap tmpPair; - for (int i = 0; i < 2*num_surfaces+num_cells; i++) - { - tmpPair.insert(std::make_pair(tmp_rowIndex[i], tmp_permutation[i])); - } - std::vector> tmpPerm(tmpPair.begin(), tmpPair.end()); - std::sort(tmpPerm.begin(), tmpPerm.end(), [] - (const std::pair& pair1, const std::pair& pair2){ - if (pair1.first != pair2.first) { - return pair1.first < pair2.first; - } else { - return pair1.second < pair2.second; - } - }); - std::transform(tmpPerm.begin(), tmpPerm.end(), std::back_inserter(tmpPermutatedList), [] - (const std::pair& pair) { - return pair.second; - }); - - /** - * 3. 
boundary imformations - */ - // get boundPermutation and offset lists - std::vector boundPermutationListInit(num_boundary_faces); - std::vector boundOffsetList; - std::iota(boundPermutationListInit.begin(), boundPermutationListInit.end(), 0); - - // - construct hashTable for sorting - std::multimap boundPermutation; - for (int i = 0; i < num_boundary_faces; i++) - { - boundPermutation.insert(std::make_pair(boundary_cell_id_init[i], boundPermutationListInit[i])); - } - - // - sort - std::vector> boundPermPair(boundPermutation.begin(), boundPermutation.end()); - std::sort(boundPermPair.begin(), boundPermPair.end(), [] - (const std::pair& pair1, const std::pair& pair2){ - if (pair1.first != pair2.first) { - return pair1.first < pair2.first; - } else { - return pair1.second < pair2.second; - } - }); - - // - construct boundPermedIndex and boundary_cell_id - std::vector boundary_cell_id; - boundPermutationList.clear(); - std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundary_cell_id), [] - (const std::pair& pair) { - return pair.first; - }); - std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundPermutationList), [] - (const std::pair& pair) { - return pair.second; - }); - - // construct boundary_cell_offset - std::map countMap; - std::vector boundaryCellcount; - for (const auto& cellIndex : boundary_cell_id) - ++ countMap[cellIndex]; - for (const auto& [cellIndex, count] : countMap) - boundaryCellcount.push_back(count); - - num_boundary_cells = boundaryCellcount.size(); - num_boundary_cells_output = num_boundary_cells; - - std::vector boundary_cell_offset(boundaryCellcount.size() + 1, 0); - std::partial_sum(boundaryCellcount.begin(), boundaryCellcount.end(), boundary_cell_offset.begin()+1); - - // assign h_boundary_cell_offset & h_boundary_cell_id - h_boundary_cell_offset = boundary_cell_offset.data(); - h_boundary_cell_id = boundary_cell_id.data(); - - // - boundary_cell_bytes = num_boundary_cells * 
sizeof(double); - boundary_cell_vec_bytes = num_boundary_cells * 3 * sizeof(double); - boundary_cell_index_bytes = num_boundary_cells * sizeof(int); - - boundary_face_bytes = num_boundary_faces * sizeof(double); - boundary_face_vec_bytes = num_boundary_faces * 3 * sizeof(double); - boundary_face_index_bytes = num_boundary_faces * sizeof(int); - - ueqn_internalCoeffs.resize(3*num_boundary_faces); - ueqn_boundaryCoeffs.resize(3*num_boundary_faces); - - boundary_face_vector.resize(3*num_boundary_faces); - boundary_pressure.resize(num_boundary_faces); - boundary_face.resize(num_boundary_faces); - boundary_deltaCoeffs.resize(num_boundary_faces); - - patch_type.resize(2); - patch_type[0].resize(num_boundary_faces); - patch_type[1].resize(num_boundary_faces); - - /** - * 4. permutation list for field variables - */ - std::vector offdiagRowIndex(2*num_surfaces), permIndex(2*num_surfaces); - // - initialize the offdiagRowIndex (rowIndex of lower + rowIndex of upper) - std::copy(neighbour, neighbour + num_surfaces, offdiagRowIndex.begin()); - std::copy(owner, owner + num_surfaces, offdiagRowIndex.begin() + num_surfaces); - - // - initialize the permIndex (0, 1, ..., 2*num_surfaces) - std::iota(permIndex.begin(), permIndex.end(), 0); - - // - construct hashTable for sorting - std::multimap permutation; - for (int i = 0; i < 2*num_surfaces; i++) - { - permutation.insert(std::make_pair(offdiagRowIndex[i], permIndex[i])); - } - // - sort - std::vector> permPair(permutation.begin(), permutation.end()); - std::sort(permPair.begin(), permPair.end(), [] - (const std::pair& pair1, const std::pair& pair2){ - if (pair1.first != pair2.first) { - return pair1.first < pair2.first; - } else { - return pair1.second < pair2.second; - } - }); - // - form permedIndex list - std::transform(permPair.begin(), permPair.end(), std::back_inserter(permedIndex), [] - (const std::pair& pair) { - return pair.second; - }); - - // copy and permutate cell variables - std::copy(weight, weight + 
num_surfaces, h_weight_vec_init.begin()); - std::copy(weight, weight + num_surfaces, h_weight_vec_init.begin() + num_surfaces); - std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin()); - std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin() + 3*num_surfaces); - std::copy(face, face + num_surfaces, h_face_vec_init.begin()); - std::copy(face, face + num_surfaces, h_face_vec_init.begin() + num_surfaces); - std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin()); - std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin() + num_surfaces); - for (int i = 0; i < num_faces; i++) - { - h_weight_vec[i] = h_weight_vec_init[permedIndex[i]]; - h_face_vec[i] = h_face_vec_init[permedIndex[i]]; - h_deltaCoeffs_vec[i] = h_deltaCoeffs_vec_init[permedIndex[i]]; - h_face_vector_vec[i*3] = h_face_vector_vec_init[3*permedIndex[i]]; - h_face_vector_vec[i*3+1] = h_face_vector_vec_init[3*permedIndex[i]+1]; - h_face_vector_vec[i*3+2] = h_face_vector_vec_init[3*permedIndex[i]+2]; - } - h_weight = h_weight_vec.data(); - h_face_vector = h_face_vector_vec.data(); - h_face = h_face_vec.data(); - h_deltaCoeffs = h_deltaCoeffs_vec.data(); - - for (int i = 0; i < num_boundary_faces; i++) - { - boundary_face_vector[3*i] = boundary_face_vector_init[3*boundPermutationList[i]]; - boundary_face_vector[3*i+1] = boundary_face_vector_init[3*boundPermutationList[i]+1]; - boundary_face_vector[3*i+2] = boundary_face_vector_init[3*boundPermutationList[i]+2]; - boundary_face[i] = boundary_face_init[boundPermutationList[i]]; - boundary_deltaCoeffs[i] = boundary_deltaCoeffs_init[boundPermutationList[i]]; - patch_type[0][i] = patch_type_init[0][boundPermutationList[i]]; - patch_type[1][i] = patch_type_init[1][boundPermutationList[i]]; - } - h_boundary_face_vector = boundary_face_vector.data(); - h_boundary_face = boundary_face.data(); - h_boundary_deltaCoeffs = boundary_deltaCoeffs.data(); - - 
/************************allocate memory on device****************************/ - int total_bytes = 0; - - checkCudaErrors(cudaMalloc((void**)&d_A_csr_row_index, csr_row_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_A_csr_col_index, csr_col_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_A_csr_diag_index, cell_index_bytes)); - total_bytes += (csr_row_index_bytes + csr_col_index_bytes + cell_index_bytes); - - //d_Y.resize(num_species); - d_rhoD_vector.resize(num_species); - d_boundary_Y_vector.resize(num_species); - d_boundary_Y_init_vector.resize(num_species); - d_internal_coeffs_Y_vector.resize(num_species); - d_boundary_coeffs_Y_vector.resize(num_species); - d_laplac_internal_coeffs_Y_vector.resize(num_species); - d_laplac_boundary_coeffs_Y_vector.resize(num_species); - d_boundary_rhoD_vector.resize(num_species); - - for (size_t i = 0; i < num_species; ++i){ - //checkCudaErrors(cudaMalloc((void**)&d_Y[i], cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_rhoD_vector[i], cell_bytes)); - } - checkCudaErrors(cudaMalloc((void**)&d_Y, cell_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_rho_new, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_volume, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_pressure, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_velocity_old, cell_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_weight, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_weight_upwind, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_face, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_deltaCoeffs, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_phi, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_phi_init, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_face_vector, face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_nuEff, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_rhoD, cell_bytes * 
num_species)); - checkCudaErrors(cudaMalloc((void**)&d_alpha, cell_bytes)); - total_bytes += (cell_bytes * (5 + 2*num_species) + face_bytes * 6 + cell_vec_bytes + face_vec_bytes); - - checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int))); - checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_id, boundary_face_index_bytes)); - total_bytes += (boundary_face_index_bytes + (num_boundary_cells+1) * sizeof(int)); - - checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure_init, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_phi_init, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_phi, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity_init, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_face_vector, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_face, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_deltaCoeffs, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_init, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_init, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_init, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_init, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff, 
boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff_init, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_rho, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_init, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_mut_sct, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_UpatchType, boundary_face_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_YpatchType, boundary_face_index_bytes)); - for (size_t i = 0; i < num_species; ++i){ - checkCudaErrors(cudaMalloc((void**)&d_boundary_Y_vector[i], boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_Y_init_vector[i], boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_Y_vector[i], boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_Y_vector[i], boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_Y_vector[i], boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_Y_vector[i], boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_rhoD_vector[i], boundary_face_bytes)); - } - checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_Y, boundary_face_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_Y, boundary_face_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_Y, boundary_face_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_Y, boundary_face_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_rhoD, boundary_face_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_alpha, boundary_face_bytes)); - - total_bytes += (boundary_face_bytes*10 + boundary_face_vec_bytes * 11); - - // checkCudaErrors(cudaMalloc((void**)&d_A_csr, csr_value_vec_bytes)); - // checkCudaErrors(cudaMalloc((void**)&d_b, 
cell_vec_bytes)); - // checkCudaErrors(cudaMalloc((void**)&d_psi, cell_vec_bytes)); - total_bytes += (boundary_face_bytes + boundary_face_vec_bytes * 3); - - checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A, csr_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A_init, csr_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_turbSrc_b, cell_vec_bytes)); - total_bytes += (2*csr_value_bytes + cell_vec_bytes); - - checkCudaErrors(cudaMalloc((void**)&d_permedIndex, face_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_bouPermedIndex, boundary_face_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_tmpPermutatedList, csr_col_index_bytes)); - total_bytes += (face_index_bytes + boundary_face_index_bytes + csr_col_index_bytes); - - checkCudaErrors(cudaMalloc((void**)&d_grad, num_cells * 9 * sizeof(double))); - checkCudaErrors(cudaMalloc((void**)&d_grad_boundary, boundary_face_bytes * 9)); - checkCudaErrors(cudaMalloc((void**)&d_grad_boundary_init, boundary_cell_bytes * 9)); - total_bytes += (num_cells * 9 * sizeof(double) + boundary_face_bytes * 9 + boundary_cell_bytes * 9); // FIXME: rename - checkCudaErrors(cudaMalloc((void**)&d_hDiffCorrFlux, 3 * cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_diffAlphaD, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_hDiffCorrFlux, 3 * boundary_face_bytes)); + // deconstructor + ~dfMatrixDataBase(); - fprintf(stderr, "Total bytes malloc on GPU: %.2fMB\n", total_bytes * 1.0 / 1024 / 1024); + // member function + void setConstantValues(int num_cells, int num_surfaces, int num_boundary_surfaces, + int num_patches, std::vector patch_size, + int num_species, double rdelta_t); + void setConstantIndexes(const int *owner, const int *neighbor); - checkCudaErrors(cudaMemcpyAsync(d_A_csr_row_index, h_A_csr_row_index, csr_row_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_A_csr_col_index, h_A_csr_col_index, csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); - 
checkCudaErrors(cudaMemcpyAsync(d_A_csr_diag_index, h_A_csr_diag_index, cell_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_offset, h_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int), cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_id, h_boundary_cell_id, boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + void createConstantFieldsInternal(); + void createConstantFieldsBoundary(); + void initConstantFieldsInternal(const double *sf, const double *mag_sf, + const double *weight, const double *delta_coeffs, const double *volume); + void initConstantFieldsBoundary(const double *boundary_sf, const double *boundary_mag_sf, + const double *boundary_delta_coeffs, const int *boundary_face_cell); - checkCudaErrors(cudaMemcpyAsync(d_volume, h_volume, cell_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_weight, h_weight, face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_face, h_face, face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_deltaCoeffs, h_deltaCoeffs, face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_face_vector, h_face_vector, face_vec_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_face_vector, h_boundary_face_vector, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_face, h_boundary_face, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_deltaCoeffs, h_boundary_deltaCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_UpatchType, patch_type[0].data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_YpatchType, patch_type[1].data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, 
stream)); + void createNonConstantFieldsInternal(); + void createNonConstantFieldsBoundary(); + void initNonConstantFieldsInternal(const double *y); + void initNonConstantFieldsBoundary(const double *boundary_y); - checkCudaErrors(cudaMemcpyAsync(d_permedIndex, permedIndex.data(), face_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_bouPermedIndex, boundPermutationList.data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_tmpPermutatedList, tmpPermutatedList.data(), csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); - }; + void preTimeStep(const double *rho_old); + void postTimeStep(); - ~dfMatrixDataBase(){ - std::cout << "Destructor called." << std::endl; - // TODO: free pointers - - }; + // getter + double* getFieldPointer(const char* fieldAlias, location loc, position pos); }; diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index d4f5a7ab0..4e49faf99 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -1,8 +1,6 @@ #include "dfMatrixDataBase.H" - -void constructBoundarySelector(std::vector& patchTypeSelector, const std::string& patchTypeStr, - const int patchSize) +void constructBoundarySelectorPerPatch(int *patchTypeSelector, const std::string& patchTypeStr) { boundaryConditions patchCondition; std::vector tmpSelector; @@ -22,27 +20,315 @@ void constructBoundarySelector(std::vector& patchTypeSelector, const std::s switch (patchCondition){ case zeroGradient: { - tmpSelector.resize(patchSize, 0); - patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); + *patchTypeSelector = 0; break; } case fixedValue: { - tmpSelector.resize(patchSize, 1); - patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); + *patchTypeSelector = 1; break; } case empty: { - tmpSelector.resize(patchSize, 2); - patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), 
tmpSelector.end()); + *patchTypeSelector = 2; break; } case coupled: { - tmpSelector.resize(patchSize, 3); - patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); + *patchTypeSelector = 3; break; } } } + +dfMatrixDataBase::dfMatrixDataBase() { + checkCudaErrors(cudaStreamCreate(&stream)); +} + +dfMatrixDataBase::~dfMatrixDataBase() { + // destroy cuda resources + checkCudaErrors(cudaStreamDestroy(stream)); + // TODO: free pointers +} + +void dfMatrixDataBase::setConstantValues(int num_cells, int num_surfaces, int num_boundary_surfaces, + int num_patches, std::vector patch_size, + int num_species, double rdelta_t) { + // constant values -- basic + this->num_cells = num_cells; + this->num_surfaces = num_surfaces; + this->num_boundary_surfaces = num_boundary_surfaces; + this->num_patches = num_patches; + this->patch_size = patch_size; + this->num_species = num_species; + this->rdelta_t = rdelta_t; + + // constant values -- ldu bytesize + cell_value_bytes = num_cells * sizeof(double); + cell_value_vec_bytes = num_cells * 3 * sizeof(double); + cell_value_tsr_bytes = num_cells * 9 * sizeof(double); + cell_index_bytes = num_cells * sizeof(int); + surface_value_bytes = num_surfaces * sizeof(double); + surface_index_bytes = num_surfaces * sizeof(int); + surface_value_vec_bytes = num_surfaces * 3 * sizeof(double); + boundary_surface_value_bytes = num_boundary_surfaces * sizeof(double); + boundary_surface_value_vec_bytes = num_boundary_surfaces * 3 * sizeof(double); + boundary_surface_value_tsr_bytes = num_boundary_surfaces * 9 * sizeof(double); + boundary_surface_index_bytes = num_boundary_surfaces * sizeof(int); + + // constant values -- csr bytesize + csr_row_index_bytes = (num_cells + 1) * sizeof(int); + csr_col_index_bytes = (num_cells + num_surfaces * 2) * sizeof(int); + csr_value_bytes = (num_cells + num_surfaces * 2) * sizeof(double); + csr_value_vec_bytes = (num_cells + num_surfaces * 2) * 3 * sizeof(double); +} + +void 
dfMatrixDataBase::setConstantIndexes(const int *owner, const int *neighbor) { + // build d_owner, d_neighbor + checkCudaErrors(cudaMalloc((void**)&d_owner, surface_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_neighbor, surface_index_bytes)); + checkCudaErrors(cudaMemcpyAsync(d_owner, owner, surface_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_neighbor, neighbor, surface_index_bytes, cudaMemcpyHostToDevice, stream)); + + + // build d_lower_to_csr_index, d_diag_to_csr_index, d_upper_to_csr_index + std::vector upperNum(num_cells, 0); + std::vector lowerNum(num_cells, 0); + std::vector lowerPermListInit(num_surfaces); + + int *upperOffset = (int*)calloc(num_cells + 1, sizeof(int)); + int *lowerOffset = (int*)calloc(num_cells + 1, sizeof(int)); + + for(int faceI = 0; faceI < num_surfaces; ++faceI){ + upperNum[owner[faceI]] ++; + lowerNum[neighbor[faceI]] ++; + } + std::partial_sum(upperNum.begin(), upperNum.end(), + upperOffset+1); + std::partial_sum(lowerNum.begin(), lowerNum.end(), + lowerOffset+1); + + std::iota(lowerPermListInit.begin(), lowerPermListInit.end(), 0); + + std::multimap permutation; + for (int i = 0; i < num_surfaces; ++i){ + permutation.insert(std::make_pair(neighbor[i], lowerPermListInit[i])); + } + std::vector> permPair(permutation.begin(), permutation.end()); + std::sort(permPair.begin(), permPair.end(), [] + (const std::pair& pair1, const std::pair& pair2){ + if (pair1.first != pair2.first) { + return pair1.first < pair2.first; + } else { + return pair1.second < pair2.second; + } + }); + + std::vector lowerPermList; + std::transform(permPair.begin(), permPair.end(), std::back_inserter(lowerPermList), [] + (const std::pair& pair) { + return pair.second; + }); + + std::vector lowCSRIndex, uppCSRIndex, diagCSRIndex, CSRRowIndex, CSRColIndex; + int uppIndexInCSR = 0, uppIndexInLdu = 0, lowIndexInCSR = 0, lowIndexInLdu = 0, lowNumInLdu = 0; + CSRColIndex.resize(2 * num_surfaces + num_cells); + 
lowCSRIndex.resize(num_surfaces); + for (int i = 0; i < num_cells; ++i) { + int numUppPerRow = upperOffset[i + 1] - upperOffset[i]; + int numLowPerRow = lowerOffset[i + 1] - lowerOffset[i]; + int numNZBefore = upperOffset[i] + lowerOffset[i] + i; // add diag + // csr row index + CSRRowIndex.push_back(numNZBefore); + // upper + for (int j = 0; j < numUppPerRow; ++j) { + uppIndexInCSR = numNZBefore + numLowPerRow + 1 + j; // 1 means diag + uppCSRIndex.push_back(uppIndexInCSR); + CSRColIndex[uppIndexInCSR] = neighbor[uppIndexInLdu]; // fill upper entry in CSRColIndex + uppIndexInLdu ++; + } + // lower + for (int j = 0; j < numLowPerRow; ++j) { + lowIndexInCSR = numNZBefore + j; + lowIndexInLdu = lowerPermList[lowNumInLdu]; + lowCSRIndex[lowIndexInLdu] = lowIndexInCSR; + CSRColIndex[lowIndexInCSR] = owner[lowIndexInLdu]; // fill lower entry in CSRColIndex + lowNumInLdu ++; + } + // diag + int diagIndexInCSR = numNZBefore + numLowPerRow; + diagCSRIndex.push_back(diagIndexInCSR); + CSRColIndex[diagIndexInCSR] = i; // fill diag entry in CSRColIndex + } + CSRRowIndex.push_back(2 * num_surfaces + num_cells); + + checkCudaErrors(cudaMalloc((void**)&d_lower_to_csr_index, surface_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_diag_to_csr_index, cell_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_upper_to_csr_index, surface_index_bytes)); + checkCudaErrors(cudaMemcpyAsync(d_lower_to_csr_index, lowCSRIndex.data(), surface_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_diag_to_csr_index, diagCSRIndex.data(), cell_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_upper_to_csr_index, uppCSRIndex.data(), surface_index_bytes, cudaMemcpyHostToDevice, stream)); + + + // build d_csr_row_index, d_csr_col_index + checkCudaErrors(cudaMalloc((void**)&d_csr_row_index, csr_row_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_csr_col_index, csr_col_index_bytes)); + 
checkCudaErrors(cudaMemcpyAsync(d_csr_row_index, CSRRowIndex.data(), csr_row_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_csr_col_index, CSRColIndex.data(), csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); +} + +void dfMatrixDataBase::createConstantFieldsInternal() { + checkCudaErrors(cudaMalloc((void**)&d_sf, surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_mag_sf, surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_weight, surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_delta_coeffs, surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_volume, cell_value_bytes)); + fieldPointerMap["d_sf"] = d_sf; + fieldPointerMap["d_mag_sf"] = d_mag_sf; + fieldPointerMap["d_weight"] = d_weight; + fieldPointerMap["d_delta_coeffs"] = d_delta_coeffs; + fieldPointerMap["d_volume"] = d_volume; +} + +void dfMatrixDataBase::createConstantFieldsBoundary() { + checkCudaErrors(cudaMalloc((void**)&d_boundary_sf, boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_mag_sf, boundary_surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_delta_coeffs, boundary_surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_face_cell, boundary_surface_index_bytes)); + fieldPointerMap["d_boundary_sf"] = d_boundary_sf; + fieldPointerMap["d_boundary_mag_sf"] = d_boundary_mag_sf; + fieldPointerMap["d_boundary_delta_coeffs"] = d_boundary_delta_coeffs; +} + +void dfMatrixDataBase::initConstantFieldsInternal(const double *sf, const double *mag_sf, + const double *weight, const double *delta_coeffs, const double *volume) { + checkCudaErrors(cudaMemcpyAsync(d_sf, sf, surface_value_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_mag_sf, mag_sf, surface_value_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_weight, weight, surface_value_bytes, cudaMemcpyHostToDevice, stream)); + 
checkCudaErrors(cudaMemcpyAsync(d_delta_coeffs, delta_coeffs, surface_value_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_volume, volume, cell_value_bytes, cudaMemcpyHostToDevice, stream)); +} + +void dfMatrixDataBase::initConstantFieldsBoundary(const double *boundary_sf, const double *boundary_mag_sf, + const double *boundary_delta_coeffs, const int *boundary_face_cell) { + checkCudaErrors(cudaMemcpyAsync(d_boundary_sf, boundary_sf, boundary_surface_value_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_mag_sf, boundary_mag_sf, boundary_surface_value_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_delta_coeffs, boundary_delta_coeffs, boundary_surface_value_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_face_cell, boundary_face_cell, boundary_surface_index_bytes, cudaMemcpyHostToDevice, stream)); +} + +void dfMatrixDataBase::createNonConstantFieldsInternal() { + checkCudaErrors(cudaMalloc((void**)&d_rho, cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_u, cell_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_y, cell_value_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_he, cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_p, cell_value_bytes)); + fieldPointerMap["d_rho"] = d_rho; + fieldPointerMap["d_u"] = d_u; + fieldPointerMap["d_y"] = d_y; + fieldPointerMap["d_he"] = d_he; + fieldPointerMap["d_p"] = d_p; + + checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_value_bytes)); + fieldPointerMap["d_rho_old"] = d_rho_old; + // checkCudaErrors(cudaMalloc((void**)&d_u_old, cell_value_vec_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_y_old, cell_value_bytes * num_species)); + // checkCudaErrors(cudaMalloc((void**)&d_he_old, cell_value_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_p_old, cell_value_bytes)); + + checkCudaErrors(cudaMalloc((void**)&d_phi, 
surface_value_bytes)); + fieldPointerMap["d_phi"] = d_phi; + + // computed on GPU, used on CPU, need memcpyd2h + checkCudaErrors(cudaMallocHost((void**)&h_rho, cell_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_rho_old, cell_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_u, cell_value_vec_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_y, cell_value_bytes * num_species)); + checkCudaErrors(cudaMallocHost((void**)&h_he, cell_value_bytes)); + fieldPointerMap["h_rho"] = h_rho; + fieldPointerMap["h_rho_old"] = h_rho_old; + fieldPointerMap["h_u"] = h_u; + fieldPointerMap["h_y"] = h_y; + fieldPointerMap["h_he"] = h_he; + + // computed on CPU, used on GPU, need memcpyh2d + checkCudaErrors(cudaMallocHost((void**)&h_p, cell_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_phi, surface_value_bytes)); + fieldPointerMap["h_p"] = h_p; + fieldPointerMap["h_phi"] = h_phi; +} + +void dfMatrixDataBase::createNonConstantFieldsBoundary() { + checkCudaErrors(cudaMalloc((void**)&d_boundary_rho, boundary_surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_u, boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_y, boundary_surface_value_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_he, boundary_surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_p, boundary_surface_value_bytes)); + fieldPointerMap["d_boundary_rho"] = d_boundary_rho; + fieldPointerMap["d_boundary_u"] = d_boundary_u; + fieldPointerMap["d_boundary_y"] = d_boundary_y; + fieldPointerMap["d_boundary_he"] = d_boundary_he; + fieldPointerMap["d_boundary_p"] = d_boundary_p; + + checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_old, boundary_surface_value_bytes)); + fieldPointerMap["d_boundary_rho_old"] = d_boundary_rho_old; + // checkCudaErrors(cudaMalloc((void**)&d_boundary_u_old, boundary_surface_value_vec_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_boundary_y_old, 
boundary_surface_value_bytes * num_species)); + // checkCudaErrors(cudaMalloc((void**)&d_boundary_he_old, boundary_surface_value_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_boundary_p_old, boundary_surface_value_bytes)); + + checkCudaErrors(cudaMalloc((void**)&d_boundary_phi, boundary_surface_value_bytes)); + fieldPointerMap["d_boundary_phi"] = d_boundary_phi; + + // computed on GPU, used on CPU, need memcpyd2h + checkCudaErrors(cudaMallocHost((void**)&h_boundary_rho, boundary_surface_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_boundary_rho_old, boundary_surface_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_boundary_u, boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_boundary_y, boundary_surface_value_bytes * num_species)); + checkCudaErrors(cudaMallocHost((void**)&h_boundary_he, boundary_surface_value_bytes)); + fieldPointerMap["h_boundary_rho"] = h_boundary_rho; + fieldPointerMap["h_boundary_rho_old"] = h_boundary_rho_old; + fieldPointerMap["h_boundary_u"] = h_boundary_u; + fieldPointerMap["h_boundary_y"] = h_boundary_y; + fieldPointerMap["h_boundary_he"] = h_boundary_he; + + // computed on CPU, used on GPU, need memcpyh2d + checkCudaErrors(cudaMallocHost((void**)&h_boundary_p, boundary_surface_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_boundary_phi, boundary_surface_value_bytes)); + fieldPointerMap["h_boundary_p"] = h_boundary_p; + fieldPointerMap["h_boundary_phi"] = h_boundary_phi; +} + +void dfMatrixDataBase::initNonConstantFieldsInternal(const double *y) { + checkCudaErrors(cudaMemcpyAsync(d_y, y, cell_value_bytes * num_species, cudaMemcpyHostToDevice, stream)); +} + +void dfMatrixDataBase::initNonConstantFieldsBoundary(const double *boundary_y) { + checkCudaErrors(cudaMemcpyAsync(d_boundary_y, boundary_y, boundary_surface_value_bytes* num_species, cudaMemcpyHostToDevice, stream)); +} + +void dfMatrixDataBase::preTimeStep(const double *rho_old) { + 
checkCudaErrors(cudaMemcpyAsync(d_rho_old, rho_old, cell_value_bytes, cudaMemcpyHostToDevice, stream)); +} + +void dfMatrixDataBase::postTimeStep() {} + +double* dfMatrixDataBase::getFieldPointer(const char* fieldAlias, location loc, position pos) { + char mergedName[256]; + if (pos == position::internal) { + sprintf(mergedName, "%s_%s", (loc == location::cpu) ? "h" : "d", fieldAlias); + } else if (pos == position::boundary) { + sprintf(mergedName, "%s_boundary_%s", (loc == location::cpu) ? "h" : "d", fieldAlias); + } + + double *pointer = nullptr; + if (fieldPointerMap.find(std::string(mergedName)) != fieldPointerMap.end()) { + pointer = fieldPointerMap[std::string(mergedName)]; + } + if (pointer == nullptr) { + fprintf(stderr, "Warning! getFieldPointer of %s returns nullptr!\n", mergedName); + } + //fprintf(stderr, "fieldAlias: %s, mergedName: %s, pointer: %p\n", fieldAlias, mergedName, pointer); + + return pointer; +} diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H new file mode 100644 index 000000000..71dd82c38 --- /dev/null +++ b/src_gpu/dfMatrixOpBase.H @@ -0,0 +1,88 @@ +#pragma once +// #define TIME_GPU + +// tools +void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output); +void permute_vector_h2d(cudaStream_t stream, int num_cells, const double *input, double *output); + +void field_multiply_scalar(cudaStream_t stream, + int num_cells, const double *input1, const double *input2, double *output, + int num_boundary_surfaces, const double *boundary_input1, const double *boundary_input2, double *boundary_output); + +void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volume, const double *fvc_output, double *source); + +void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surface, + const int* boundary_cell_face, const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, + const double *lower, const double *upper, 
const double *diag, const double *source, + const double *internal_coeffs, const double *boundary_coeffs, + double *A, double *b, double *diag_vec); + +void update_boundary_coeffs_vector(cudaStream_t stream, int num_boundary_surfaces, int num_patches, + const int *patch_size, const int *patch_type, + double *value_internal_coeffs, double *value_boundary_coeffs, + double *gradient_internal_coeffs, double *gradient_boundary_coeffs); + +// fvm ops + +void fvm_ddt_vector(cudaStream_t stream, int num_cells, double rDeltaT, + const double *rho, const double *rho_old, const double *vf, const double *volume, + double *diag, double *source, double sign = 1.); + +void fvm_div_vector(cudaStream_t stream, int num_surfaces, int num_boundary_sourfaces, + const int *lowerAddr, const int *upperAddr, + const double *phi, const double *weight, + double *lower, double *upper, double *diag, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, + double *internal_coeffs, double *boundary_coeffs, double sign = 1.); + +void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma, + double *lower, double *upper, double *diag, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const double *boundary_mag_sf, const double *boundary_gamma, + const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, + double *internal_coeffs, double *boundary_coeffs, double sign = 1.); + +// fvc ops +// fvc_ddt doesn't consider to add fvc_output to source yet, which needs (fvc_output * volume * sign). 
// Explicit Euler time derivative of a scalar field (adds into output).
void fvc_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT,
        const double *rho, const double *rho_old, const double *vf, const double *vf_old,
        double *output, double sign = 1.);

// Explicit gradient of a vector field (Gauss), incl. boundary correction.
void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces,
        const int *lowerAddr, const int *upperAddr,
        const double *weight, const double *Sf, const double *vf, double *output, // end for internal
        int num_patches, const int *patch_size, const int *patch_type,
        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf,
        const double *volume, const double *boundary_mag_Sf, double *boundary_output,
        const double *boundary_deltaCoeffs, double sign = 1.);

// Explicit divergence of a surface scalar flux.
void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces,
        const int *lowerAddr, const int *upperAddr, const double *ssf, const int *boundary_cell_face,
        const double *boundary_ssf, const double *volume, double *output, double sign = 1.);

// Explicit divergence of a cell-centred vector field.
void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces,
        const int *lowerAddr, const int *upperAddr,
        const double *weight, const double *Sf, const double *vf, double *output, // end for internal
        int num_patches, const int *patch_size, const int *patch_type,
        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf,
        const double *volume, double sign = 1.);

// Explicit divergence of a cell-centred tensor field.
void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces,
        const int *lowerAddr, const int *upperAddr,
        const double *weight, const double *Sf, const double *vf, double *output, // end for internal
        int num_patches, const int *patch_size, const int *patch_type,
        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf,
        const double *volume, double sign = 1.);

// Explicit gradient of a cell-centred scalar field.
void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces,
        const int *lowerAddr, const int *upperAddr,
        const double *weight, const double *Sf, const double *vf, double *output, // end for internal
        int num_patches, const int *patch_size, const int *patch_type,
        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign = 1.);

// others
void scale_dev2T_tensor(cudaStream_t stream, int num_cells, const double *vf1, double *vf2,
        int num_boundary_surfaces, const double *boundary_vf1, double *boundary_vf2);
diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu
new file mode 100644
index 000000000..e3616fac3
--- /dev/null
+++ b/src_gpu/dfMatrixOpBase.cu
@@ -0,0 +1,1286 @@
#include "dfMatrixOpBase.H"
#include "dfMatrixDataBase.H"

// NOTE(review): the bracketed header name was lost in extraction; the event /
// profiler calls below suggest <cuda_runtime.h> — confirm against upstream.
#include <cuda_runtime.h>
#include "cuda_profiler_api.h"

// Optional per-kernel timing via CUDA events (enabled with -DTIME_GPU).
#ifdef TIME_GPU
    #define TICK_INIT_EVENT \
        float time_elapsed_kernel=0;\
        cudaEvent_t start_kernel, stop_kernel;\
        checkCudaErrors(cudaEventCreate(&start_kernel));\
        checkCudaErrors(cudaEventCreate(&stop_kernel));

    #define TICK_START_EVENT \
        checkCudaErrors(cudaEventRecord(start_kernel,0));

    #define TICK_END_EVENT(prefix) \
        checkCudaErrors(cudaEventRecord(stop_kernel,0));\
        checkCudaErrors(cudaEventSynchronize(start_kernel));\
        checkCudaErrors(cudaEventSynchronize(stop_kernel));\
        checkCudaErrors(cudaEventElapsedTime(&time_elapsed_kernel,start_kernel,stop_kernel));\
        printf("try %s 执行时间:%lf(ms)\n", #prefix, time_elapsed_kernel);
#else
    #define TICK_INIT_EVENT
    #define TICK_START_EVENT
    #define TICK_END_EVENT(prefix)
#endif

// No-op kernel used to warm the device up before timed runs.
__global__ void warmup(int num_cells)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;
}

// Repack a vector field from structure-of-arrays (device layout,
// [component][cell]) to array-of-structures ([cell][component]) for the host.
__global__ void permute_vector_d2h_kernel(int num_cells, const double *input, double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    output[index * 3 + 0] = input[num_cells * 0 + index];
    output[index * 3 + 1] = input[num_cells * 1 + index];
    output[index * 3 + 2] = input[num_cells * 2 + index];
}

// Inverse of permute_vector_d2h_kernel: AoS (host) -> SoA (device).
__global__ void permute_vector_h2d_kernel(int num_cells, const double *input, double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    output[num_cells * 0 + index] = input[index * 3 + 0];
    output[num_cells * 1 + index] = input[index * 3 + 1];
    output[num_cells * 2 + index] = input[index * 3 + 2];
}

// Element-wise product of two scalar fields, internal and boundary parts in
// one launch (grid must cover max(num_cells, num_boundary_surfaces)).
__global__ void field_multiply_scalar_kernel(int num_cells, int num_boundary_surfaces,
        const double *input1, const double *input2, double *output,
        const double *boundary_input1, const double *boundary_input2, double *boundary_output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index < num_cells) {
        output[index] = input1[index] * input2[index];
    }
    if (index < num_boundary_surfaces) {
        boundary_output[index] = boundary_input1[index] * boundary_input2[index];
    }
}

// Accumulate an fvc result (AoS vector) into the matrix source term.
// NOTE(review): the volume-weighted form is commented out upstream; the
// current contract adds fvc_output directly — confirm which is intended.
__global__ void fvc_to_source_vector_kernel(int num_cells, const double *volume, const double *fvc_output, double *source)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    // source[index * 3 + i] += fvc_output[index * 3 + i] * volume[index];
    source[index * 3 + 0] += fvc_output[index * 3 + 0];
    source[index * 3 + 1] += fvc_output[index * 3 + 1];
    source[index * 3 + 2] += fvc_output[index * 3 + 2];
}

// zeroGradient patch: valueInternal=1, valueBoundary=0, gradInternal=0,
// gradBoundary=0 for all three components (SoA layout over boundary faces).
__global__ void update_boundary_coeffs_zeroGradient_vector(int num_boundary_surfaces, int num, int offset,
        double *value_internal_coeffs, double *value_boundary_coeffs,
        double *gradient_internal_coeffs, double *gradient_boundary_coeffs)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    int start_index = offset + index;
    value_internal_coeffs[num_boundary_surfaces * 0 + start_index] = 1;
    value_internal_coeffs[num_boundary_surfaces * 1 + start_index] = 1;
    value_internal_coeffs[num_boundary_surfaces * 2 + start_index] = 1;
    value_boundary_coeffs[num_boundary_surfaces * 0 + start_index] = 0;
    value_boundary_coeffs[num_boundary_surfaces * 1 + start_index] = 0;
    value_boundary_coeffs[num_boundary_surfaces * 2 + start_index] = 0;
    gradient_internal_coeffs[num_boundary_surfaces * 0 + start_index] = 0;
    gradient_internal_coeffs[num_boundary_surfaces * 1 + start_index] = 0;
    gradient_internal_coeffs[num_boundary_surfaces * 2 + start_index] = 0;
    gradient_boundary_coeffs[num_boundary_surfaces * 0 + start_index] = 0;
    gradient_boundary_coeffs[num_boundary_surfaces * 1 + start_index] = 0;
    gradient_boundary_coeffs[num_boundary_surfaces * 2 + start_index] = 0;
}

// In-place vf2 := vf1 * dev2(vf2^T): transpose vf2, subtract (2/3)*trace on
// the diagonal and scale every component by vf1 (SoA tensor layout).
__global__ void scale_dev2t_tensor_kernel(int num, const double *vf1, double *vf2)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    double scale = vf1[index];
    double val_xx = vf2[num * 0 + index];
    double val_xy = vf2[num * 1 + index];
    double val_xz = vf2[num * 2 + index];
    double val_yx = vf2[num * 3 + index];
    double val_yy = vf2[num * 4 + index];
    double val_yz = vf2[num * 5 + index];
    double val_zx = vf2[num * 6 + index];
    double val_zy = vf2[num * 7 + index];
    double val_zz = vf2[num * 8 + index];
    double trace_coeff = (2. / 3.) * (val_xx + val_yy + val_zz);
    vf2[num * 0 + index] = scale * (val_xx - trace_coeff);
    vf2[num * 1 + index] = scale * val_yx;
    vf2[num * 2 + index] = scale * val_zx;
    vf2[num * 3 + index] = scale * val_xy;
    vf2[num * 4 + index] = scale * (val_yy - trace_coeff);
    vf2[num * 5 + index] = scale * val_zy;
    vf2[num * 6 + index] = scale * val_xz;
    vf2[num * 7 + index] = scale * val_yz;
    vf2[num * 8 + index] = scale * (val_zz - trace_coeff);
}

// fvm::ddt for a vector field: diag += rDeltaT*rho*V,
// source += rDeltaT*rho_old*vf*V (per component, SoA), both scaled by sign.
__global__ void fvm_ddt_vector_kernel(int num_cells, double rDeltaT,
        const double *rho, const double *rho_old, const double *vf, const double *volume,
        double *diag, double *source, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    double vol = volume[index];
    double rho_old_kernel = rho_old[index];

    diag[index] += rDeltaT * rho[index] * vol * sign;
    // TODO: skip moving
    source[num_cells * 0 + index] += rDeltaT * rho_old_kernel * vf[num_cells * 0 + index] * vol * sign;
    source[num_cells * 1 + index] += rDeltaT * rho_old_kernel * vf[num_cells * 1 + index] * vol * sign;
    source[num_cells * 2 + index] += rDeltaT * rho_old_kernel * vf[num_cells * 2 + index] * vol * sign;
}

// fvm::div internal faces: lower = -w*phi, upper = (1-w)*phi; the negated
// off-diagonals are accumulated onto owner/neighbor diagonals atomically
// (several faces touch the same cell).
__global__ void fvm_div_vector_internal(int num_surfaces,
        const int *lower_index, const int *upper_index,
        const double *phi, const double *weight,
        double *lower, double *upper, double *diag, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_surfaces)
        return;

    double w = weight[index];
    double f = phi[index];

    double lower_value = (-w) * f * sign;
    double upper_value = (1 - w) * f * sign;
    lower[index] += lower_value;
    upper[index] += upper_value;

    int owner = lower_index[index];
    int neighbor = upper_index[index];
    atomicAdd(&(diag[owner]), -lower_value);
    atomicAdd(&(diag[neighbor]), -upper_value);
}

// fvm::div patch faces: fold boundary flux through the precomputed
// value coefficients into internal/boundary coeffs (SoA, 3 components).
// TODO: modify the data structure of internal and boundary coeffs
__global__ void fvm_div_vector_boundary(int num_boundary_surfaces, int num, int offset,
        const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs,
        double *internal_coeffs, double *boundary_coeffs, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    int start_index = offset + index;
    double boundary_f = boundary_phi[start_index];
    internal_coeffs[num_boundary_surfaces * 0 + start_index] += boundary_f * value_internal_coeffs[num_boundary_surfaces * 0 + start_index] * sign;
    internal_coeffs[num_boundary_surfaces * 1 + start_index] += boundary_f * value_internal_coeffs[num_boundary_surfaces * 1 + start_index] * sign;
    internal_coeffs[num_boundary_surfaces * 2 + start_index] += boundary_f * value_internal_coeffs[num_boundary_surfaces * 2 + start_index] * sign;
    boundary_coeffs[num_boundary_surfaces * 0 + start_index] += boundary_f * value_boundary_coeffs[num_boundary_surfaces * 0 + start_index] * sign;
    boundary_coeffs[num_boundary_surfaces * 1 + start_index] += boundary_f * value_boundary_coeffs[num_boundary_surfaces * 1 + start_index] * sign;
    boundary_coeffs[num_boundary_surfaces * 2 + start_index] += boundary_f * value_boundary_coeffs[num_boundary_surfaces * 2 + start_index] * sign;
}

// fvm::laplacian internal faces: face gamma is linearly interpolated with w,
// multiplied by |Sf| and the delta coefficient.
__global__ void fvm_laplacian_vector_internal(int num_surfaces,
        const int *lower_index, const int *upper_index,
        const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma,
        double *lower, double *upper, double *diag, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_surfaces)
        return;

    int owner = lower_index[index];
    int neighbor = upper_index[index];

    double w = weight[index];
    double upper_face_gamma = w * gamma[owner] + (1 - w) * gamma[neighbor];
    double upper_value = upper_face_gamma * mag_sf[index] * delta_coeffs[index];

    // laplacian doesn't use the original lower, but use lower = upper
    //double lower_face_gamma = w * gamma[neighbor] + (1 - w) * gamma[owner];
    //double lower_value = lower_face_gamma * mag_sf[index] * delta_coeffs[index];
    double lower_value = upper_value;

    lower_value = lower_value * sign;
    upper_value = upper_value * sign;

    lower[index] += lower_value;
    upper[index] += upper_value;

    atomicAdd(&(diag[owner]), -lower_value);
    atomicAdd(&(diag[neighbor]), -upper_value);
}

// fvm::laplacian patch faces: gamma*|Sf| folded through the gradient
// coefficients into internal/boundary coeffs (SoA, 3 components).
__global__ void fvm_laplacian_vector_boundary(int num_boundary_surfaces, int num, int offset,
        const double *boundary_mag_sf, const double *boundary_gamma,
        const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs,
        double *internal_coeffs, double *boundary_coeffs, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    int start_index = offset + index;
    double boundary_value = boundary_gamma[start_index] * boundary_mag_sf[start_index];
    internal_coeffs[num_boundary_surfaces * 0 + start_index] += boundary_value * gradient_internal_coeffs[num_boundary_surfaces * 0 + start_index] * sign;
    internal_coeffs[num_boundary_surfaces * 1 + start_index] += boundary_value * gradient_internal_coeffs[num_boundary_surfaces * 1 + start_index] * sign;
    internal_coeffs[num_boundary_surfaces * 2 + start_index] += boundary_value * gradient_internal_coeffs[num_boundary_surfaces * 2 + start_index] * sign;
    boundary_coeffs[num_boundary_surfaces * 0 + start_index] += boundary_value * gradient_boundary_coeffs[num_boundary_surfaces * 0 + start_index] * sign;
    boundary_coeffs[num_boundary_surfaces * 1 + start_index] += boundary_value * gradient_boundary_coeffs[num_boundary_surfaces * 1 + start_index] * sign;
    boundary_coeffs[num_boundary_surfaces * 2 + start_index] += boundary_value * gradient_boundary_coeffs[num_boundary_surfaces * 2 + start_index] * sign;
}

// fvc::ddt scalar: output += rDeltaT * (rho*vf - rho_old*vf_old) * sign.
// Numerical note (from upstream): with rho==rho_old and vf==vf_old the fused
// multiply-add can leave a tiny non-zero residue that rDeltaT then amplifies,
// breaking CPU/GPU comparison. Workarounds tried: printing the intermediates
// (way1) or making them volatile (way2) — both defeat fma fusion. The adopted
// fix (way3) is to build with nvcc -fmad=false.
__global__ void fvc_ddt_scalar_kernel(int num_cells, double rDeltaT,
        const double *rho, const double *rho_old, const double *vf, const double *vf_old,
        double *output, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;
    // workaround way3 (use nvcc option -fmad=false)
    output[index] += rDeltaT * (rho[index] * vf[index] - rho_old[index] * vf_old[index]) * sign;
}

// fvc::grad internal faces (Gauss): face value ssf = w*(own-nei)+nei per
// component, outer product with Sf, accumulated +owner/-neighbor atomically
// into the SoA tensor output.
__global__ void fvc_grad_vector_internal(int num_cells, int num_surfaces,
        const int *lower_index, const int *upper_index, const double *face_vector,
        const double *weight, const double *field_vector,
        double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_surfaces)
        return;

    double w = weight[index];
    double Sfx = face_vector[index * 3 + 0];
    double Sfy = face_vector[index * 3 + 1];
    double Sfz = face_vector[index * 3 + 2];

    int owner = lower_index[index];
    int neighbor = upper_index[index];

    double ssfx = (w * (field_vector[num_cells * 0 + owner] - field_vector[num_cells * 0 + neighbor]) + field_vector[num_cells * 0 + neighbor]);
    double ssfy = (w * (field_vector[num_cells * 1 + owner] - field_vector[num_cells * 1 + neighbor]) + field_vector[num_cells * 1 + neighbor]);
    double ssfz = (w * (field_vector[num_cells * 2 + owner] - field_vector[num_cells * 2 + neighbor]) + field_vector[num_cells * 2 + neighbor]);

    double grad_xx = Sfx * ssfx;
    double grad_xy = Sfx * ssfy;
    double grad_xz = Sfx * ssfz;
    double grad_yx = Sfy * ssfx;
    double grad_yy = Sfy * ssfy;
    double grad_yz = Sfy * ssfz;
    double grad_zx = Sfz * ssfx;
    double grad_zy = Sfz * ssfy;
    double grad_zz = Sfz * ssfz;

    // owner/neighbor updates interleaved per component
    atomicAdd(&(output[num_cells * 0 + owner]), grad_xx);
    atomicAdd(&(output[num_cells * 0 + neighbor]), -grad_xx);
    atomicAdd(&(output[num_cells * 1 + owner]), grad_xy);
    atomicAdd(&(output[num_cells * 1 + neighbor]), -grad_xy);
    atomicAdd(&(output[num_cells * 2 + owner]), grad_xz);
    atomicAdd(&(output[num_cells * 2 + neighbor]), -grad_xz);
    atomicAdd(&(output[num_cells * 3 + owner]), grad_yx);
    atomicAdd(&(output[num_cells * 3 + neighbor]), -grad_yx);
    atomicAdd(&(output[num_cells * 4 + owner]), grad_yy);
    atomicAdd(&(output[num_cells * 4 + neighbor]), -grad_yy);
    atomicAdd(&(output[num_cells * 5 + owner]), grad_yz);
    atomicAdd(&(output[num_cells * 5 + neighbor]), -grad_yz);
    atomicAdd(&(output[num_cells * 6 + owner]), grad_zx);
    atomicAdd(&(output[num_cells * 6 + neighbor]), -grad_zx);
    atomicAdd(&(output[num_cells * 7 + owner]), grad_zy);
    atomicAdd(&(output[num_cells * 7 + neighbor]), -grad_zy);
    atomicAdd(&(output[num_cells * 8 + owner]), grad_zz);
    atomicAdd(&(output[num_cells * 8 + neighbor]), -grad_zz);
}

// update boundary of interpolation field
// calculate the grad field
// TODO: this function is implemented for uncoupled boundary conditions
// so it should use the more specific func name
__global__ void fvc_grad_vector_boundary(int num_cells, int num,
        int offset, const int *face2Cells, const double *boundary_face_vector,
        const double *boundary_field_vector, double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    int start_index = offset + index;

    double bouSfx = boundary_face_vector[start_index * 3 + 0];
    double bouSfy = boundary_face_vector[start_index * 3 + 1];
    double bouSfz = boundary_face_vector[start_index * 3 + 2];

    double boussfx = boundary_field_vector[start_index * 3 + 0];
    double boussfy = boundary_field_vector[start_index * 3 + 1];
    double boussfz = boundary_field_vector[start_index * 3 + 2];

    int cellIndex = face2Cells[start_index];

    double grad_xx = bouSfx * boussfx;
    double grad_xy = bouSfx * boussfy;
    double grad_xz = bouSfx * boussfz;
    double grad_yx = bouSfy * boussfx;
    double grad_yy = bouSfy * boussfy;
    double grad_yz = bouSfy * boussfz;
    double grad_zx = bouSfz * boussfx;
    double grad_zy = bouSfz * boussfy;
    double grad_zz = bouSfz * boussfz;

    atomicAdd(&(output[num_cells * 0 + cellIndex]), grad_xx);
    atomicAdd(&(output[num_cells * 1 + cellIndex]), grad_xy);
    atomicAdd(&(output[num_cells * 2 + cellIndex]), grad_xz);
    atomicAdd(&(output[num_cells * 3 + cellIndex]), grad_yx);
    atomicAdd(&(output[num_cells * 4 + cellIndex]), grad_yy);
    atomicAdd(&(output[num_cells * 5 + cellIndex]), grad_yz);
    atomicAdd(&(output[num_cells * 6 + cellIndex]), grad_zx);
    atomicAdd(&(output[num_cells * 7 + cellIndex]), grad_zy);
    atomicAdd(&(output[num_cells * 8 + cellIndex]), grad_zz);
}

// fvc::grad of a scalar, internal faces: interpolated face value times Sf,
// accumulated +owner/-neighbor into the SoA vector output (sign pre-applied).
__global__ void fvc_grad_scalar_internal(int num_cells, int num_surfaces,
        const int *lower_index, const int *upper_index, const double *face_vector,
        const double *weight, const double *vf, double *output, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_surfaces)
        return;

    double w = weight[index];
    double Sfx = face_vector[index * 3 + 0];
    double Sfy = face_vector[index * 3 + 1];
    double Sfz = face_vector[index * 3 + 2];

    int owner = lower_index[index];
    int neighbor = upper_index[index];

    double ssf = (w * (vf[owner] - vf[neighbor]) + vf[neighbor]);

    double grad_x = Sfx * ssf * sign;
    double grad_y = Sfy * ssf * sign;
    double grad_z = Sfz * ssf * sign;

    // owner
    atomicAdd(&(output[num_cells * 0 + owner]), grad_x);
    atomicAdd(&(output[num_cells * 1 + owner]), grad_y);
    atomicAdd(&(output[num_cells * 2 + owner]), grad_z);

    // neighbour
    atomicAdd(&(output[num_cells * 0 + neighbor]), -grad_x);
    atomicAdd(&(output[num_cells * 1 + neighbor]), -grad_y);
    atomicAdd(&(output[num_cells * 2 + neighbor]), -grad_z);
}

// fvc::grad of a scalar, patch faces: boundary value times boundary Sf,
// accumulated onto the owning cell.
__global__ void fvc_grad_scalar_boundary(int num_cells, int num, int offset, const int *face2Cells,
        const double *boundary_face_vector, const double *boundary_vf, double *output, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    int start_index = offset + index;

    double bouvf = boundary_vf[start_index];
    double bouSfx = boundary_face_vector[start_index * 3 + 0];
    double bouSfy = boundary_face_vector[start_index * 3 + 1];
    double bouSfz = boundary_face_vector[start_index * 3 + 2];

    int cellIndex = face2Cells[start_index];

    double grad_x = bouSfx * bouvf;
    double grad_y = bouSfy * bouvf;
    double grad_z = bouSfz * bouvf;

    atomicAdd(&(output[num_cells * 0 + cellIndex]), grad_x * sign);
    atomicAdd(&(output[num_cells * 1 + cellIndex]), grad_y * sign);
    atomicAdd(&(output[num_cells * 2 + cellIndex]), grad_z * sign);
}

// Divide a tensor field (SoA layout) by cell volume and apply sign in place.
__global__ void divide_cell_volume_tsr(int num_cells, const double* volume, double *output, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    double vol = volume[index];
    output[num_cells * 0 + index] = output[num_cells * 0 + index] / vol * sign;
    output[num_cells * 1 + index] = output[num_cells * 1 + index] / vol * sign;
    output[num_cells * 2 + index] = output[num_cells * 2 + index] / vol * sign;
    output[num_cells * 3 + index] = output[num_cells * 3 + index] / vol * sign;
    output[num_cells * 4 + index] = output[num_cells * 4 + index] / vol * sign;
    output[num_cells * 5 + index] = output[num_cells * 5 + index] / vol * sign;
    output[num_cells * 6 + index] = output[num_cells * 6 + index] / vol * sign;
    output[num_cells * 7 + index] = output[num_cells * 7 + index] / vol * sign;
    output[num_cells * 8 + index] = output[num_cells * 8 + index] / vol * sign;
}

// Divide a vector field by cell volume and apply sign in place.
// NOTE(review): this variant indexes AoS ([cell*3+c]) while the tensor variant
// above is SoA — looks intentional per caller layout, but verify.
__global__ void divide_cell_volume_vec(int num_cells, const double* volume, double *output, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    double vol = volume[index];

    output[index * 3 + 0] = output[index * 3 + 0] / vol * sign;
    output[index * 3 + 1] = output[index * 3 + 1] / vol * sign;
    output[index * 3 + 2] = output[index * 3 + 2] / vol * sign;
}

// Divide a scalar field by cell volume and apply sign in place.
__global__ void divide_cell_volume_scalar(int num_cells, const double* volume, double *output, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    double vol = volume[index];

    output[index] = output[index] / vol * sign;
}

// Correct the boundary gradient for zeroGradient patches: replace the
// surface-normal part of the extrapolated cell gradient so the snGrad is 0.
// (Dead loads of vf removed — the zeroGradient correction never uses them.)
__global__ void fvc_grad_vector_correctBC_zeroGradient(int num_cells, int num_boundary_surfaces,
        int num, int offset, const int *face2Cells,
        const double *internal_grad, const double *vf, const double *boundary_sf,
        const double *boundary_mag_sf, double *boundary_grad, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    int start_index = offset + index;

    int cellIndex = face2Cells[start_index];

    double grad_xx = internal_grad[num_cells * 0 + cellIndex];
    double grad_xy = internal_grad[num_cells * 1 + cellIndex];
    double grad_xz = internal_grad[num_cells * 2 + cellIndex];
    double grad_yx = internal_grad[num_cells * 3 + cellIndex];
    double grad_yy = internal_grad[num_cells * 4 + cellIndex];
    double grad_yz = internal_grad[num_cells * 5 + cellIndex];
    double grad_zx = internal_grad[num_cells * 6 + cellIndex];
    double grad_zy = internal_grad[num_cells * 7 + cellIndex];
    double grad_zz = internal_grad[num_cells * 8 + cellIndex];

    double n_x = boundary_sf[start_index * 3 + 0] / boundary_mag_sf[start_index];
    double n_y = boundary_sf[start_index * 3 + 1] / boundary_mag_sf[start_index];
    double n_z = boundary_sf[start_index * 3 + 2] / boundary_mag_sf[start_index];

    double grad_correction_x = - (n_x * grad_xx + n_y * grad_yx + n_z * grad_zx); // sn_grad_x = 0
    double grad_correction_y = - (n_x * grad_xy + n_y * grad_yy + n_z * grad_zy);
    double grad_correction_z = - (n_x * grad_xz + n_y * grad_yz + n_z * grad_zz);

    boundary_grad[num_boundary_surfaces * 0 + start_index] = (grad_xx + n_x * grad_correction_x) * sign;
    boundary_grad[num_boundary_surfaces * 1 + start_index] = (grad_xy + n_x * grad_correction_y) * sign;
    boundary_grad[num_boundary_surfaces * 2 + start_index] = (grad_xz + n_x * grad_correction_z) * sign;
    boundary_grad[num_boundary_surfaces * 3 + start_index] = (grad_yx + n_y * grad_correction_x) * sign;
    boundary_grad[num_boundary_surfaces * 4 + start_index] = (grad_yy + n_y * grad_correction_y) * sign;
    boundary_grad[num_boundary_surfaces * 5 + start_index] = (grad_yz + n_y * grad_correction_z) * sign;
    boundary_grad[num_boundary_surfaces * 6 + start_index] = (grad_zx + n_z * grad_correction_x) * sign;
    boundary_grad[num_boundary_surfaces * 7 + start_index] = (grad_zy + n_z * grad_correction_y) * sign;
    boundary_grad[num_boundary_surfaces * 8 + start_index] = (grad_zz + n_z * grad_correction_z) * sign;
}

// Correct the boundary gradient for fixedValue patches: the surface-normal
// gradient is reconstructed from deltaCoeffs*(boundary value - cell value).
// (Dead loads of vfx/vfy/vfz removed — vf is read directly below.)
// NOTE(review): this kernel indexes internal_grad/boundary_grad as AoS
// ([cell*9+c] / [face*9+c]) while the zeroGradient variant and
// fvc_grad_vector_internal use SoA ([num_cells*c+cell]) — if both consume the
// same buffers one of the two layouts is wrong; confirm with the callers.
__global__ void fvc_grad_vector_correctBC_fixedValue(int num, int offset, const int *face2Cells,
        const double *internal_grad, const double *vf, const double *boundary_sf,
        const double *boundary_mag_sf, double *boundary_grad,
        const double *boundary_deltaCoeffs, const double *boundary_vf, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    int start_index = offset + index;

    int cellIndex = face2Cells[start_index];

    double grad_xx = internal_grad[cellIndex * 9 + 0];
    double grad_xy = internal_grad[cellIndex * 9 + 1];
    double grad_xz = internal_grad[cellIndex * 9 + 2];
    double grad_yx = internal_grad[cellIndex * 9 + 3];
    double grad_yy = internal_grad[cellIndex * 9 + 4];
    double grad_yz = internal_grad[cellIndex * 9 + 5];
    double grad_zx = internal_grad[cellIndex * 9 + 6];
    double grad_zy = internal_grad[cellIndex * 9 + 7];
    double grad_zz = internal_grad[cellIndex * 9 + 8];

    double n_x = boundary_sf[start_index * 3 + 0] / boundary_mag_sf[start_index];
    double n_y = boundary_sf[start_index * 3 + 1] / boundary_mag_sf[start_index];
    double n_z = boundary_sf[start_index * 3 + 2] / boundary_mag_sf[start_index];

    // sn_grad: solving according to fixedValue BC
    double sn_grad_x = boundary_deltaCoeffs[start_index] * (boundary_vf[start_index * 3 + 0] - vf[cellIndex * 3 + 0]);
    double sn_grad_y = boundary_deltaCoeffs[start_index] * (boundary_vf[start_index * 3 + 1] - vf[cellIndex * 3 + 1]);
    double sn_grad_z = boundary_deltaCoeffs[start_index] * (boundary_vf[start_index * 3 + 2] - vf[cellIndex * 3 + 2]);

    double grad_correction_x = sn_grad_x - (n_x * grad_xx + n_y * grad_yx + n_z * grad_zx);
    double grad_correction_y = sn_grad_y - (n_x * grad_xy + n_y * grad_yy + n_z * grad_zy);
    double grad_correction_z = sn_grad_z - (n_x * grad_xz + n_y * grad_yz + n_z * grad_zz);

    boundary_grad[start_index * 9 + 0] = (grad_xx + n_x * grad_correction_x) * sign;
    boundary_grad[start_index * 9 + 1] = (grad_xy + n_x * grad_correction_y) * sign;
    boundary_grad[start_index * 9 + 2] = (grad_xz + n_x * grad_correction_z) * sign;
    boundary_grad[start_index * 9 + 3] = (grad_yx + n_y * grad_correction_x) * sign;
    boundary_grad[start_index * 9 + 4] = (grad_yy + n_y * grad_correction_y) * sign;
    boundary_grad[start_index * 9 + 5] = (grad_yz + n_y * grad_correction_z) * sign;
    boundary_grad[start_index * 9 + 6] = (grad_zx + n_z * grad_correction_x) * sign;
    boundary_grad[start_index * 9 + 7] = (grad_zy + n_z * grad_correction_y) * sign;
    boundary_grad[start_index * 9 + 8] = (grad_zz + n_z * grad_correction_z) * sign;
}

// fvc::div of a surface flux, internal faces: +ssf to owner, -ssf to neighbor.
__global__ void fvc_div_surface_scalar_internal(int num_surfaces,
        const int *lower_index, const int *upper_index, const double *ssf,
        double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_surfaces)
        return;

    int owner = lower_index[index];
    int neighbor = upper_index[index];

    double issf = ssf[index];

    // owner
    atomicAdd(&(output[owner]), issf);

    // neighbor
    atomicAdd(&(output[neighbor]), -issf);
}

// fvc::div of a surface flux, boundary faces: add each face's flux onto its
// owning cell.
__global__ void fvc_div_surface_scalar_boundary(int num_boundary_face, const int *face2Cells,
        const double *boundary_ssf, double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_boundary_face)
        return;

    int cellIndex = face2Cells[index];

    atomicAdd(&(output[cellIndex]), boundary_ssf[index]);
}

// fvc::div of a cell vector, internal faces: dot the interpolated face value
// (AoS layout) with Sf; +owner / -neighbor.
__global__ void fvc_div_cell_vector_internal(int num_surfaces,
        const int *lower_index, const int *upper_index,
        const double *field_vector, const double *weight, const double *face_vector,
        double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_surfaces)
        return;

    double w = weight[index];
    double Sfx = face_vector[index * 3 + 0];
    double Sfy = face_vector[index * 3 + 1];
    double Sfz = face_vector[index * 3 + 2];

    int owner = lower_index[index];
    int neighbor = upper_index[index];

    double ssfx = (w * (field_vector[owner * 3 + 0] - field_vector[neighbor * 3 + 0]) + field_vector[neighbor * 3 + 0]);
    double ssfy = (w * (field_vector[owner * 3 + 1] - field_vector[neighbor * 3 + 1]) + field_vector[neighbor * 3 + 1]);
    double ssfz = (w * (field_vector[owner * 3 + 2] - field_vector[neighbor * 3 + 2]) + field_vector[neighbor * 3 + 2]);

    double div = Sfx * ssfx + Sfy * ssfy + Sfz * ssfz;

    // owner
    atomicAdd(&(output[owner]), div);

    // neighbour
    atomicAdd(&(output[neighbor]), -div);
}

// fvc::div of a cell vector, boundary faces: boundary value dotted with
// boundary Sf, added to the owning cell.
__global__ void fvc_div_cell_vector_boundary(int num, int offset, const int *face2Cells,
        const double *boundary_face_vector, const double *boundary_field_vector, double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    int start_index = offset + index;

    double bouSfx = boundary_face_vector[start_index * 3 + 0];
    double bouSfy = boundary_face_vector[start_index * 3 + 1];
    double bouSfz = boundary_face_vector[start_index * 3 + 2];

    double boussfx = boundary_field_vector[start_index * 3 + 0];
    double boussfy = boundary_field_vector[start_index * 3 + 1];
    double boussfz = boundary_field_vector[start_index * 3 + 2];

    int cellIndex = face2Cells[start_index];

    double bouDiv = bouSfx * boussfx + bouSfy * boussfy + bouSfz * boussfz;

    atomicAdd(&(output[cellIndex]), bouDiv);
}

// fvc::div of a cell tensor, internal faces: interpolate all 9 components
// (SoA layout), contract with Sf row-wise; +owner / -neighbor on the SoA
// vector output, sign pre-applied.
__global__ void fvc_div_cell_tensor_internal(int num_cells, int num_surfaces,
        const int *lower_index, const int *upper_index,
        const double *vf, const double *weight, const double *face_vector,
        double *output, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_surfaces)
        return;

    double w = weight[index];
    double Sfx = face_vector[index * 3 + 0];
    double Sfy = face_vector[index * 3 + 1];
    double Sfz = face_vector[index * 3 + 2];
    int owner = lower_index[index];
    int neighbor = upper_index[index];

    double ssf_xx = (w * (vf[num_cells * 0 + owner] - vf[num_cells * 0 + neighbor]) + vf[num_cells * 0 + neighbor]);
    double ssf_xy = (w * (vf[num_cells * 1 + owner] - vf[num_cells * 1 + neighbor]) + vf[num_cells * 1 + neighbor]);
    double ssf_xz = (w * (vf[num_cells * 2 + owner] - vf[num_cells * 2 + neighbor]) + vf[num_cells * 2 + neighbor]);
    double ssf_yx = (w * (vf[num_cells * 3 + owner] - vf[num_cells * 3 + neighbor]) + vf[num_cells * 3 + neighbor]);
    double ssf_yy = (w * (vf[num_cells * 4 + owner] - vf[num_cells * 4 + neighbor]) + vf[num_cells * 4 + neighbor]);
    double ssf_yz = (w * (vf[num_cells * 5 + owner] - vf[num_cells * 5 + neighbor]) + vf[num_cells * 5 + neighbor]);
    double ssf_zx = (w * (vf[num_cells * 6 + owner] - vf[num_cells * 6 + neighbor]) + vf[num_cells * 6 + neighbor]);
    double ssf_zy = (w * (vf[num_cells * 7 + owner] - vf[num_cells * 7 + neighbor]) + vf[num_cells * 7 + neighbor]);
    double ssf_zz = (w * (vf[num_cells * 8 + owner] - vf[num_cells * 8 + neighbor]) + vf[num_cells * 8 + neighbor]);
    double div_x = (Sfx * ssf_xx + Sfy * ssf_yx + Sfz * ssf_zx) * sign;
    double div_y = (Sfx * ssf_xy + Sfy * ssf_yy + Sfz * ssf_zy) * sign;
    double div_z = (Sfx * ssf_xz + Sfy * ssf_yz + Sfz * ssf_zz) * sign;

    // owner
    atomicAdd(&(output[num_cells * 0 + owner]), div_x);
    atomicAdd(&(output[num_cells * 1 + owner]), div_y);
    atomicAdd(&(output[num_cells * 2 + owner]), div_z);

    // neighbour
    atomicAdd(&(output[num_cells * 0 + neighbor]), -div_x);
    atomicAdd(&(output[num_cells * 1 + neighbor]), -div_y);
    atomicAdd(&(output[num_cells * 2 + neighbor]), -div_z);
}

__global__ void fvc_div_cell_tensor_boundary(int num_cells, int num_boundary_faces, int num, int offset, const int *face2Cells,
        const double *boundary_face_vector, const double *boundary_vf, double *output, double sign)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    int 
start_index = offset + index; + + double bouSfx = boundary_face_vector[start_index * 3 + 0]; + double bouSfy = boundary_face_vector[start_index * 3 + 1]; + double bouSfz = boundary_face_vector[start_index * 3 + 2]; + + double boussf_xx = boundary_vf[num_boundary_faces * 0 + start_index]; + double boussf_xy = boundary_vf[num_boundary_faces * 1 + start_index]; + double boussf_xz = boundary_vf[num_boundary_faces * 2 + start_index]; + double boussf_yx = boundary_vf[num_boundary_faces * 3 + start_index]; + double boussf_yy = boundary_vf[num_boundary_faces * 4 + start_index]; + double boussf_yz = boundary_vf[num_boundary_faces * 5 + start_index]; + double boussf_zx = boundary_vf[num_boundary_faces * 6 + start_index]; + double boussf_zy = boundary_vf[num_boundary_faces * 7 + start_index]; + double boussf_zz = boundary_vf[num_boundary_faces * 8 + start_index]; + int cellIndex = face2Cells[start_index]; + + double bouDiv_x = (bouSfx * boussf_xx + bouSfy * boussf_yx + bouSfz * boussf_zx) * sign; + double bouDiv_y = (bouSfx * boussf_xy + bouSfy * boussf_yy + bouSfz * boussf_zy) * sign; + double bouDiv_z = (bouSfx * boussf_xz + bouSfy * boussf_yz + bouSfz * boussf_zz) * sign; + + atomicAdd(&(output[num_cells * 0 + cellIndex]), bouDiv_x); + atomicAdd(&(output[num_cells * 1 + cellIndex]), bouDiv_y); + atomicAdd(&(output[num_cells * 2 + cellIndex]), bouDiv_z); + + // if (cellIndex == 0) + // { + // // printf("gpu output[0] = %.5e, %.5e, %.5e\n", output[0], output[1], output[2]); + // // printf("gpu output[0] += %.5e, %.5e, %.5e\n", bouDiv_x, bouDiv_y, bouDiv_z); + // printf("gpu bouvf[0] = (%.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e)\n", + // boussf_xx, boussf_xy, boussf_xz, boussf_yx, boussf_yy, boussf_yz, boussf_zx, boussf_zy, boussf_zz); + // printf("gpu bouSf[0] = (%.5e, %.5e, %.5e)\n", bouSfx, bouSfy, bouSfz); + // printf("gpu boufinal[0] = (%.5e, %.5e, %.5e)\n", bouDiv_x, bouDiv_y, bouDiv_z); + // printf("bouIndex = %d\n\n", start_index); + // } + + // if (index 
== 0) + // { + // printf("bou_grad_U = (%.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e)", vf2[0], vf2[1], vf2[2], + // vf2[3], vf2[4], vf2[5], vf2[6], vf2[7], vf2[8]); + // } +} + +__global__ void constructVecDiag(int num_cells, const double *diag, double *diag_vec, + const double *source, double *b) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + diag_vec[num_cells * 0 + index] = diag[index]; + diag_vec[num_cells * 1 + index] = diag[index]; + diag_vec[num_cells * 2 + index] = diag[index]; + + b[num_cells * 0 + index] = source[num_cells * 0 + index]; + b[num_cells * 1 + index] = source[num_cells * 1 + index]; + b[num_cells * 2 + index] = source[num_cells * 2 + index]; +} + +__global__ void addBoundaryDiagSrc(int num_cells, int num_boundary_surfaces, const int *face2Cells, + const double *internal_coeffs, const double *boundary_coeffs, double *diag_vec, double *b) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_surfaces) + return; + + int cellIndex = face2Cells[index]; + + double internalCoeffx = internal_coeffs[num_boundary_surfaces * 0 + index]; + double internalCoeffy = internal_coeffs[num_boundary_surfaces * 1 + index]; + double internalCoeffz = internal_coeffs[num_boundary_surfaces * 2 + index]; + + double boundaryCoeffx = boundary_coeffs[num_boundary_surfaces * 0 + index]; + double boundaryCoeffy = boundary_coeffs[num_boundary_surfaces * 1 + index]; + double boundaryCoeffz = boundary_coeffs[num_boundary_surfaces * 2 + index]; + + atomicAdd(&diag_vec[num_cells * 0 + cellIndex], internalCoeffx); + atomicAdd(&diag_vec[num_cells * 1 + cellIndex], internalCoeffy); + atomicAdd(&diag_vec[num_cells * 2 + cellIndex], internalCoeffz); + + atomicAdd(&b[num_cells * 0 + cellIndex], boundaryCoeffx); + atomicAdd(&b[num_cells * 1 + cellIndex], boundaryCoeffy); + atomicAdd(&b[num_cells * 2 + cellIndex], boundaryCoeffz); +} + +__global__ void ldu_to_csr_offDiag(int num_cells, int 
num_surfaces,
+        const int *lowCSRIndex, const int *uppCSRIndex,
+        const double *lower, const double *upper,
+        double *A_csr)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_surfaces)
+        return;
+
+    int uppIndex = uppCSRIndex[index];
+    int lowIndex = lowCSRIndex[index];
+    // lower/upper are double coefficients: load them as double (previously declared
+    // `int`, which truncated) and reuse the loaded value for all three components.
+    double upp = upper[index];
+    double low = lower[index];
+    A_csr[(num_cells + 2 * num_surfaces) * 0 + uppIndex] = upp;
+    A_csr[(num_cells + 2 * num_surfaces) * 1 + uppIndex] = upp;
+    A_csr[(num_cells + 2 * num_surfaces) * 2 + uppIndex] = upp;
+
+    A_csr[(num_cells + 2 * num_surfaces) * 0 + lowIndex] = low;
+    A_csr[(num_cells + 2 * num_surfaces) * 1 + lowIndex] = low;
+    A_csr[(num_cells + 2 * num_surfaces) * 2 + lowIndex] = low;
+}
+
+// Scatter the (vectorized) diagonal into the CSR matrix, one thread per cell.
+__global__ void ldu_to_csr_Diag(int num_cells, int num_surfaces,
+        const int *diagCSRIndex, const double *diag_vec,
+        double *A_csr)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    int diagIndex = diagCSRIndex[index];
+    A_csr[(num_cells + 2 * num_surfaces) * 0 + diagIndex] = diag_vec[num_cells * 0 + index];
+    A_csr[(num_cells + 2 * num_surfaces) * 1 + diagIndex] = diag_vec[num_cells * 1 + index];
+    A_csr[(num_cells + 2 * num_surfaces) * 2 + diagIndex] = diag_vec[num_cells * 2 + index];
+}
+
+
+// Launch helper: permute a vector field from device (SoA) to host (AoS) layout.
+void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output)
+{
+    size_t threads_per_block = 256;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    permute_vector_d2h_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, input, output);
+}
+
+// Launch helper: permute a vector field from host (AoS) to device (SoA) layout.
+void permute_vector_h2d(cudaStream_t stream, int num_cells, const double *input, double *output)
+{
+    size_t threads_per_block = 256;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    permute_vector_h2d_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, input, output);
+}
+
+void field_multiply_scalar(cudaStream_t stream,
+        int num_cells, const double *input1, const double 
*input2, double *output, + int num_boundary_surfaces, const double *boundary_input1, const double *boundary_input2, double *boundary_output) +{ + TICK_INIT_EVENT; + size_t threads_per_block = 256; + size_t blocks_per_grid = (std::max(num_cells, num_boundary_surfaces) + threads_per_block - 1) / threads_per_block; + TICK_START_EVENT; + field_multiply_scalar_kernel<<>>(num_cells, num_boundary_surfaces, + input1, input2, output, boundary_input1, boundary_input2, boundary_output); + TICK_END_EVENT(field_multiply_scalar_kernel); +} + +void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volume, const double *fvc_output, double *source) +{ + size_t threads_per_block = 256; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvc_to_source_vector_kernel<<>>(num_cells, + volume, fvc_output, source); +} + +void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surface, + const int* boundary_cell_face, const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, + const double *lower, const double *upper, const double *diag, const double *source, + const double *internal_coeffs, const double *boundary_coeffs, + double *A, double *b, double *diag_vec) +{ + // construct new diag with size of 3*num_cells + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + constructVecDiag<<>>(num_cells, diag, diag_vec, source, b); + + // add coeff to source and diagnal + blocks_per_grid = (num_boundary_surface + threads_per_block - 1) / threads_per_block; + addBoundaryDiagSrc<<>>(num_cells, num_boundary_surface, + boundary_cell_face, internal_coeffs, boundary_coeffs, diag_vec, b); + + // convert offdiag + blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + ldu_to_csr_offDiag<<>>(num_cells, num_surfaces, + lower_to_csr_index, upper_to_csr_index, lower, upper, A); + + // convert diag + 
blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + ldu_to_csr_Diag<<>>(num_cells, num_surfaces, + diag_to_csr_index, diag_vec, A); + +} + +void update_boundary_coeffs_vector(cudaStream_t stream, int num_boundary_surfaces, int num_patches, + const int *patch_size, const int *patch_type, + double *value_internal_coeffs, double *value_boundary_coeffs, + double *gradient_internal_coeffs, double *gradient_boundary_coeffs) +{ + size_t threads_per_block = 1024; + size_t blocks_per_grid = 1; + + int offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 256; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just basic patch type now + // TODO: just vector version now + if (patch_type[i] == boundaryConditions::zeroGradient) { + update_boundary_coeffs_zeroGradient_vector<<>>(num_boundary_surfaces, patch_size[i], offset, + value_internal_coeffs, value_boundary_coeffs, gradient_internal_coeffs, gradient_boundary_coeffs); + } else if (0) { + // xxx + fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); + } + offset += patch_size[i]; + } +} + +void fvm_ddt_vector(cudaStream_t stream, int num_cells, double rDeltaT, + const double *rho, const double *rho_old, const double *vf, const double *volume, + double *diag, double *source, double sign) +{ +#ifdef TIME_GPU + printf("#############kernel profile#############\n"); +#endif + TICK_INIT_EVENT; + size_t threads_per_block = 64; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; +#ifdef TIME_GPU + printf("warm up ...\n"); + warmup<<>>(num_cells); +#endif + TICK_START_EVENT; + fvm_ddt_vector_kernel<<>>(num_cells, + rDeltaT, rho, rho_old, vf, volume, diag, source, sign); + TICK_END_EVENT(fvm_ddt_vector_kernel); +} + +void fvm_div_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *phi, const double 
*weight, + double *lower, double *upper, double *diag, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, + double *internal_coeffs, double *boundary_coeffs, double sign) +{ + TICK_INIT_EVENT; + size_t threads_per_block = 256; + size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; +#ifdef TIME_GPU + printf("warm up ...\n"); + warmup<<>>(num_surfaces); +#endif + TICK_START_EVENT; + fvm_div_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, + phi, weight, lower, upper, diag, sign); + TICK_END_EVENT(fvm_div_vector_internal); + + int offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 64; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just basic patch type now + if (patch_type[i] == boundaryConditions::zeroGradient + || patch_type[i] == boundaryConditions::fixedValue) { + // TODO: just vector version now + TICK_START_EVENT; + fvm_div_vector_boundary<<>>(num_boundary_surfaces, patch_size[i], offset, + boundary_phi, value_internal_coeffs, value_boundary_coeffs, + internal_coeffs, boundary_coeffs, sign); + TICK_END_EVENT(fvm_div_vector_boundary); + } else if (0) { + // xxx + fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); + } + offset += patch_size[i]; + } +} + +void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma, + double *lower, double *upper, double *diag, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const double *boundary_mag_sf, const double *boundary_gamma, + const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, + double *internal_coeffs, double 
*boundary_coeffs, double sign) +{ + TICK_INIT_EVENT; + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + TICK_START_EVENT; + fvm_laplacian_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, + weight, mag_sf, delta_coeffs, gamma, lower, upper, diag, sign); + TICK_END_EVENT(fvm_laplacian_vector_internal); + int offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 64; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just basic patch type now + if (patch_type[i] == boundaryConditions::zeroGradient + || patch_type[i] == boundaryConditions::fixedValue) { + // TODO: just vector version now + TICK_START_EVENT; + fvm_laplacian_vector_boundary<<>>(num_boundary_surfaces, patch_size[i], offset, + boundary_mag_sf, boundary_gamma, gradient_internal_coeffs, gradient_boundary_coeffs, + internal_coeffs, boundary_coeffs, sign); + TICK_END_EVENT(fvm_laplacian_vector_boundary); + } else if (0) { + // xxx + fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); + } + offset += patch_size[i]; + } +} + +void fvc_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT, + const double *rho, const double *rho_old, const double *vf, const double *vf_old, + double *output, double sign) +{ + checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * sizeof(double), stream)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvc_ddt_scalar_kernel<<>>(num_cells, + rDeltaT, rho, rho_old, vf, vf_old, output, sign); +} + +void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *Sf, const double *vf, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const int 
*boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, + const double *volume, const double *boundary_mag_Sf, double *boundary_output, + const double *boundary_deltaCoeffs, double sign) +{ + checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 9 * sizeof(double), stream)); + TICK_INIT_EVENT; + size_t threads_per_block = 32; + size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + TICK_START_EVENT; + fvc_grad_vector_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, + Sf, weight, vf, output); + TICK_END_EVENT(fvc_grad_vector_internal); + + int offset = 0; + // finish conctruct grad field except dividing cell volume + for (int i = 0; i < num_patches; i++) { + threads_per_block = 64; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just basic patch type now + if (patch_type[i] == boundaryConditions::zeroGradient + || patch_type[i] == boundaryConditions::fixedValue) { + // TODO: just vector version now + TICK_START_EVENT; + fvc_grad_vector_boundary<<>>(num_cells, + patch_size[i], offset, boundary_cell_face, + boundary_Sf, boundary_vf, output); + TICK_END_EVENT(fvc_grad_vector_boundary); + } else if (0) { + // xxx + fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); + } + offset += patch_size[i]; + } + + // divide cell volume + threads_per_block = 512; + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + TICK_START_EVENT; + divide_cell_volume_tsr<<>>(num_cells, volume, output, sign); + TICK_END_EVENT(divide_cell_volume_tsr); + + // correct boundary conditions + offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 64; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just basic patch type now + if (patch_type[i] == boundaryConditions::zeroGradient) { + // TODO: just vector version now + TICK_START_EVENT; + 
fvc_grad_vector_correctBC_zeroGradient<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_surfaces,
+                    patch_size[i], offset, boundary_cell_face,
+                    output, boundary_vf, boundary_Sf, boundary_mag_Sf, boundary_output, sign);
+            TICK_END_EVENT(fvc_grad_vector_correctBC_zeroGradient);
+        } else if (patch_type[i] == boundaryConditions::fixedValue) {
+            // The fixedValue kernel indexes its `vf` argument by CELL index to form
+            // snGrad = deltaCoeffs * (boundary_vf - vf[cell]); pass the internal field
+            // `vf` here (previously `boundary_vf` was passed twice, reading the
+            // boundary array with cell indices).
+            fvc_grad_vector_correctBC_fixedValue<<<blocks_per_grid, threads_per_block, 0, stream>>>(patch_size[i], offset, boundary_cell_face,
+                    output, vf, boundary_Sf, boundary_mag_Sf, boundary_output, boundary_deltaCoeffs, boundary_vf, sign);
+        } else if (0) {
+            // xxx
+            fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n");
+        }
+        offset += patch_size[i];
+    }
+}
+
+// Scale a tensor field by dev2(T) semantics on both internal and boundary storage.
+void scale_dev2T_tensor(cudaStream_t stream, int num_cells, const double *vf1, double *vf2,
+        int num_boundary_surfaces, const double *boundary_vf1, double *boundary_vf2)
+{
+    TICK_INIT_EVENT;
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    TICK_START_EVENT;
+    scale_dev2t_tensor_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, vf1, vf2);
+    TICK_END_EVENT(scale_dev2t_tensor_kernel);
+
+    blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block;
+    scale_dev2t_tensor_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_surfaces, boundary_vf1, boundary_vf2);
+}
+
+// fvc::div of a surface scalar field: accumulate face fluxes into cells
+// (internal faces via owner/neighbour, boundary faces via face2cell), then
+// divide by the cell volume.
+void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces,
+        const int *lowerAddr, const int *upperAddr, const double *ssf, const int *boundary_cell_face,
+        const double *boundary_ssf, const double *volume, double *output, double sign)
+{
+    checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * sizeof(double), stream));
+
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block;
+    fvc_div_surface_scalar_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_surfaces, lowerAddr, upperAddr, ssf, output);
+
+    threads_per_block = 1024;
+    blocks_per_grid = (num_boundary_surfaces + 
threads_per_block - 1) / threads_per_block; + fvc_div_surface_scalar_boundary<<>>(num_boundary_surfaces, boundary_cell_face, + boundary_ssf, output); + + // divide cell volume + threads_per_block = 1024; + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + divide_cell_volume_scalar<<>>(num_cells, volume, output, sign); +} + +void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *Sf, const double *vf, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, + const double *volume, double sign) +{ + checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * sizeof(double), stream)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + fvc_div_cell_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output); + + int offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 256; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just basic patch type now + if (patch_type[i] == boundaryConditions::zeroGradient + || patch_type[i] == boundaryConditions::fixedValue) { + // TODO: just vector version now + fvc_div_cell_vector_boundary<<>>(patch_size[i], offset, boundary_cell_face, + boundary_Sf, boundary_vf, output); + } else if (0) { + // xxx + fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); + } + offset += patch_size[i]; + } + + // divide cell volume + threads_per_block = 1024; + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + divide_cell_volume_scalar<<>>(num_cells, volume, output, sign); +} + +void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, + 
const int *lowerAddr, const int *upperAddr, + const double *weight, const double *Sf, const double *vf, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, + const double *volume, double sign) +{ + // checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 3 * sizeof(double), stream)); + TICK_INIT_EVENT; + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + TICK_START_EVENT; + fvc_div_cell_tensor_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output, sign); + TICK_END_EVENT(fvc_div_cell_tensor_internal); + + int offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 64; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just basic patch type now + if (patch_type[i] == boundaryConditions::zeroGradient + || patch_type[i] == boundaryConditions::fixedValue) { + // TODO: just vector version now + TICK_START_EVENT; + fvc_div_cell_tensor_boundary<<>>(num_cells, num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, + boundary_Sf, boundary_vf, output, sign); + TICK_END_EVENT(fvc_div_cell_tensor_boundary); + } else if (0) { + // xxx + fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); + } + offset += patch_size[i]; + } + + // // divide cell volume + // threads_per_block = 1024; + // blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + // divide_cell_volume_vec<<>>(num_cells, volume, output, sign); +} + +void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *Sf, const double *vf, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const int 
*boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign) +{ + TICK_INIT_EVENT; + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + TICK_START_EVENT; + fvc_grad_scalar_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, + Sf, weight, vf, output, sign); + TICK_END_EVENT(fvc_grad_scalar_internal); + + int offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 64; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just non-coupled patch type now + if (patch_type[i] == boundaryConditions::zeroGradient + || patch_type[i] == boundaryConditions::fixedValue) { + TICK_START_EVENT; + fvc_grad_scalar_boundary<<>>(num_cells, patch_size[i], offset, boundary_cell_face, + boundary_Sf, boundary_vf, output, sign); + TICK_END_EVENT(fvc_grad_scalar_boundary); + } else if (0) { + // xxx + fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); + } + offset += patch_size[i]; + } + + // // divide cell volume + // threads_per_block = 1024; + // blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + // divide_cell_volume_vec<<>>(num_cells, volume, output, sign); +} diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index ec739db5e..49edc1b7a 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -3,60 +3,114 @@ #include "AmgXSolver.H" #include #include "dfMatrixDataBase.H" +#include "dfMatrixOpBase.H" class dfUEqn { private: - dfMatrixDataBase &dataBase_; - cudaStream_t stream; - AmgXSolver *UxSolver, *UySolver, *UzSolver = nullptr; - int num_iteration; - - // common variables - int num_cells, cell_bytes, num_faces, num_surfaces, cell_vec_bytes, csr_value_vec_bytes, num_boundary_cells; - int *d_A_csr_row_index, *d_A_csr_diag_index, *d_A_csr_col_index; - - // Matrix variables - double *d_A_csr, *d_b, *d_psi, *d_psi_permute, *d_H, *d_H_permute, *d_A; 
- double *h_A_csr, *h_b, *h_psi, *h_H, *h_A = nullptr; - - double *d_ueqn_internal_coeffs, *d_ueqn_boundary_coeffs= nullptr; + dfMatrixDataBase &dataBase_; + + // cuda resource + // one graph for one eqn before using self-developed solver + cudaGraph_t graph; + cudaGraphExec_t graph_instance; + bool graph_created=false; + + // constant values -- basic + std::string mode_string; + std::string setting_path; + + // constant values -- amgx solvers + AmgXSolver *UxSolver = nullptr; + AmgXSolver *UySolver = nullptr; + AmgXSolver *UzSolver = nullptr; + int num_iteration = 0; + + // constant fields - internal + // 无 + + // constant fields - boundary + std::vector patch_type; + + // non-constant fields - internal + // thermophysical fields + double *d_nu_eff = nullptr; + // computed on CPU, used on GPU, need memcpyh2d - host + double *h_nu_eff = nullptr; + // intermediate fields + double *d_grad_u = nullptr; + double *d_rho_nueff = nullptr; + double *d_permute = nullptr; + double *d_fvc_output = nullptr; // TODO: no need anymore + + // non-constant fields - boundary + // thermophysical fields + double *d_boundary_nu_eff = nullptr; + // computed on CPU, used on GPU, need memcpyh2d - host + double *h_boundary_nu_eff = nullptr; + // intermediate fields + double *d_boundary_grad_u = nullptr; + double *d_boundary_rho_nueff = nullptr; + // boundary coeff fields + double *d_value_internal_coeffs = nullptr; + double *d_value_boundary_coeffs= nullptr; + double *d_gradient_internal_coeffs= nullptr; + double *d_gradient_boundary_coeffs= nullptr; + + // non-constant fields - ldu + double *d_lower = nullptr; + double *d_upper = nullptr; + double *d_diag = nullptr; + double *d_source = nullptr; + double *d_internal_coeffs = nullptr; + double *d_boundary_coeffs = nullptr; + double *d_diag_vector = nullptr; + + // non-constant fields - csr + double *d_A = nullptr; + double *d_b = nullptr; // TODO: needless + + // field pointer map + std::unordered_map fieldPointerMap; public: - dfUEqn(); - 
dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile); - ~dfUEqn(); - - void checkValue(bool print); - - void fvm_ddt(double *vector_old); - - void fvm_div(double *boundary_pressure_init, double *boundary_velocity_init, - double *boundary_nuEff_init, double *boundary_rho_init); - - void fvc_grad(double *pressure); - - void fvc_grad_vector(); - - void dev2T(); - - void fvc_div_tensor(const double *nuEff); - - void fvm_laplacian(); - - void A(double *Psi); - - void H(double *Psi); + // 构造函数 + dfUEqn(dfMatrixDataBase &dataBase) + : dataBase_(dataBase) {} + + // 析构函数 + ~dfUEqn(){ + if (graph_created) { + checkCudaErrors(cudaGraphExecDestroy(graph_instance)); + checkCudaErrors(cudaGraphDestroy(graph)); + } + } + + // 成员函数 + + // getter函数 + double* getFieldPointer(const char* fieldAlias, location loc, position pos); + + // 初始化构建 + void setConstantValues(const std::string &mode_string, const std::string &setting_path); + void setConstantFields(const std::vector patch_type); + void createNonConstantFieldsInternal(); + void createNonConstantFieldsBoundary(); + void createNonConstantLduAndCsrFields(); + // dfUEqn has no internal non-constant fields to be init + //void initNonConstantFieldsInternal(xxx); + void initNonConstantFieldsBoundary(); + + // 方程运行 + void preProcessForRhoEqn(const double *h_rho, const double *h_phi, const double *h_boundary_phi); + void preProcess(const double *h_u, const double *h_boundary_u, const double *h_p, const double *h_boundary_p, const double *h_nu_eff, const double *h_boundary_nu_eff, const double *h_boundary_rho); + void process(); + void postProcess(double *h_u); void solve(); + void compareResult(const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs, + // const double *tmpVal, + bool printFlag); void sync(); - - void updatePsi(double *Psi); - - void correctBoundaryConditions(); - - void correctPsi(double *Psi); - 
- void initializeTimeStep(); }; diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 56983e038..d30c06131 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -1,1481 +1,306 @@ #include "dfUEqn.H" -// kernel functions -__global__ void fvm_ddt_kernel(int num_cells, int num_faces, const double rdelta_t, - const int *csr_row_index, const int *csr_diag_index, - const double *rho_old, const double *rho_new, const double *volume, const double *velocity_old, - const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output, double *psi) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - // A_csr has one more element in each row: itself - int row_index = csr_row_index[index]; - int diag_index = csr_diag_index[index]; - - int csr_dim = num_cells + num_faces; - int csr_index = row_index + diag_index; - double ddt_diag = rdelta_t * rho_new[index] * volume[index]; - A_csr_output[csr_dim * 0 + csr_index] = A_csr_input[csr_dim * 0 + csr_index] + ddt_diag; - A_csr_output[csr_dim * 1 + csr_index] = A_csr_input[csr_dim * 1 + csr_index] + ddt_diag; - A_csr_output[csr_dim * 2 + csr_index] = A_csr_input[csr_dim * 2 + csr_index] + ddt_diag; - - double ddt_part_term = rdelta_t * rho_old[index] * volume[index]; - b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + ddt_part_term * velocity_old[index * 3 + 0]; - b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + ddt_part_term * velocity_old[index * 3 + 1]; - b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + ddt_part_term * velocity_old[index * 3 + 2]; - - psi[num_cells * 0 + index] = velocity_old[index * 3 + 0]; - psi[num_cells * 1 + index] = velocity_old[index * 3 + 1]; - psi[num_cells * 2 + index] = velocity_old[index * 3 + 2]; -} - -__global__ void fvm_div_internal(int num_cells, int num_faces, - const int *csr_row_index, const int *csr_diag_index, - const double *weight, const double *phi, - const double 
*A_csr_input, const double *b_input, double *A_csr_output, double *b_output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - // A_csr has one more element in each row: itself - int row_index = csr_row_index[index]; - int next_row_index = csr_row_index[index + 1]; - int diag_index = csr_diag_index[index]; - int neighbor_offset = csr_row_index[index] - index; - int csr_dim = num_cells + num_faces; - - double div_diag = 0; - for (int i = row_index; i < next_row_index; i++) - { - int inner_index = i - row_index; - // lower - if (inner_index < diag_index) - { - int neighbor_index = neighbor_offset + inner_index; - double w = weight[neighbor_index]; - double f = phi[neighbor_index]; - A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + (-w) * f; - A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + (-w) * f; - A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + (-w) * f; - // lower neighbors contribute to sum of -1 - div_diag += (w - 1) * f; - } - // upper - if (inner_index > diag_index) - { - // upper, index - 1, consider of diag - int neighbor_index = neighbor_offset + inner_index - 1; - double w = weight[neighbor_index]; - double f = phi[neighbor_index]; - A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + (1 - w) * f; - A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + (1 - w) * f; - A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + (1 - w) * f; - // upper neighbors contribute to sum of 1 - div_diag += w * f; - } - } - A_csr_output[csr_dim * 0 + row_index + diag_index] = A_csr_input[csr_dim * 0 + row_index + diag_index] + div_diag; // diag - A_csr_output[csr_dim * 1 + row_index + diag_index] = A_csr_input[csr_dim * 1 + row_index + diag_index] + div_diag; // diag - A_csr_output[csr_dim * 2 + row_index + diag_index] = A_csr_input[csr_dim * 2 + row_index + diag_index] + div_diag; // diag -} - -__global__ void fvm_div_boundary(int num_cells, int 
num_faces, int num_boundary_cells, - const int *csr_row_index, const int *csr_diag_index, - const int *boundary_cell_offset, const int *boundary_cell_id, - const double *internal_coeffs, const double *boundary_coeffs, - const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output, - double *ueqn_internal_coeffs, double *ueqn_boundary_coeffs) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) - return; - - int cell_offset = boundary_cell_offset[index]; - int cell_index = boundary_cell_id[cell_offset]; - int loop_size = boundary_cell_offset[index + 1] - cell_offset; - - int row_index = csr_row_index[cell_index]; - int diag_index = csr_diag_index[cell_index]; - int csr_dim = num_cells + num_faces; - int csr_index = row_index + diag_index; - - // construct internalCoeffs & boundaryCoeffs - double internal_coeffs_x = 0; - double internal_coeffs_y = 0; - double internal_coeffs_z = 0; - double boundary_coeffs_x = 0; - double boundary_coeffs_y = 0; - double boundary_coeffs_z = 0; - for (int i = 0; i < loop_size; i++) - { - internal_coeffs_x += internal_coeffs[(cell_offset + i) * 3 + 0]; - internal_coeffs_y += internal_coeffs[(cell_offset + i) * 3 + 1]; - internal_coeffs_z += internal_coeffs[(cell_offset + i) * 3 + 2]; - boundary_coeffs_x += boundary_coeffs[(cell_offset + i) * 3 + 0]; - boundary_coeffs_y += boundary_coeffs[(cell_offset + i) * 3 + 1]; - boundary_coeffs_z += boundary_coeffs[(cell_offset + i) * 3 + 2]; - } - ueqn_internal_coeffs[cell_index * 3 + 0] = internal_coeffs_x; - ueqn_internal_coeffs[cell_index * 3 + 1] = internal_coeffs_y; - ueqn_internal_coeffs[cell_index * 3 + 2] = internal_coeffs_z; - ueqn_boundary_coeffs[cell_index * 3 + 0] = boundary_coeffs_x; - ueqn_boundary_coeffs[cell_index * 3 + 1] = boundary_coeffs_y; - ueqn_boundary_coeffs[cell_index * 3 + 2] = boundary_coeffs_z; - - A_csr_output[csr_dim * 0 + csr_index] = A_csr_input[csr_dim * 0 + csr_index] + internal_coeffs_x; - 
A_csr_output[csr_dim * 1 + csr_index] = A_csr_input[csr_dim * 1 + csr_index] + internal_coeffs_y; - A_csr_output[csr_dim * 2 + csr_index] = A_csr_input[csr_dim * 2 + csr_index] + internal_coeffs_z; - b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] + boundary_coeffs_x; - b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] + boundary_coeffs_y; - b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + boundary_coeffs_z; -} - -__global__ void fvc_grad_internal_face(int num_cells, - const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, - const double *face_vector, const double *weight, const double *pressure, - const double *b_input, double *b_output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - // A_csr has one more element in each row: itself - int row_index = csr_row_index[index]; - int next_row_index = csr_row_index[index + 1]; - int diag_index = csr_diag_index[index]; - int neighbor_offset = csr_row_index[index] - index; - - double own_cell_p = pressure[index]; - double grad_bx = 0; - double grad_by = 0; - double grad_bz = 0; - double grad_bx_low = 0; - double grad_bx_upp = 0; - double grad_by_low = 0; - double grad_by_upp = 0; - double grad_bz_low = 0; - double grad_bz_upp = 0; - for (int i = row_index; i < next_row_index; i++) - { - int inner_index = i - row_index; - // lower - if (inner_index < diag_index) - { - int neighbor_index = neighbor_offset + inner_index; - double w = weight[neighbor_index]; - double sfx = face_vector[neighbor_index * 3 + 0]; - double sfy = face_vector[neighbor_index * 3 + 1]; - double sfz = face_vector[neighbor_index * 3 + 2]; - int neighbor_cell_id = csr_col_index[row_index + inner_index]; - double neighbor_cell_p = pressure[neighbor_cell_id]; - double face_p = (1 - w) * own_cell_p + w * neighbor_cell_p; - grad_bx_low -= face_p * sfx; - grad_by_low -= face_p * sfy; - grad_bz_low -= face_p * 
sfz; - } - // upper - if (inner_index > diag_index) - { - int neighbor_index = neighbor_offset + inner_index - 1; - double w = weight[neighbor_index]; - double sfx = face_vector[neighbor_index * 3 + 0]; - double sfy = face_vector[neighbor_index * 3 + 1]; - double sfz = face_vector[neighbor_index * 3 + 2]; - int neighbor_cell_id = csr_col_index[row_index + inner_index]; - double neighbor_cell_p = pressure[neighbor_cell_id]; - double face_p = w * own_cell_p + (1 - w) * neighbor_cell_p; - grad_bx_upp += face_p * sfx; - grad_by_upp += face_p * sfy; - grad_bz_upp += face_p * sfz; - } - } - b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] - grad_bx_low - grad_bx_upp; - b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] - grad_by_low - grad_by_upp; - b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] - grad_bz_low - grad_bz_upp; -} - -__global__ void fvc_grad_boundary_face(int num_cells, int num_boundary_cells, - const int *boundary_cell_offset, const int *boundary_cell_id, - const double *boundary_face_vector, const double *boundary_pressure, - const double *b_input, double *b_output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) - return; - - int cell_offset = boundary_cell_offset[index]; - int next_cell_offset = boundary_cell_offset[index + 1]; - int cell_index = boundary_cell_id[cell_offset]; - - // compute boundary gradient - double grad_bx = 0; - double grad_by = 0; - double grad_bz = 0; - for (int i = cell_offset; i < next_cell_offset; i++) - { - double sfx = boundary_face_vector[i * 3 + 0]; - double sfy = boundary_face_vector[i * 3 + 1]; - double sfz = boundary_face_vector[i * 3 + 2]; - double face_p = boundary_pressure[i]; - grad_bx += face_p * sfx; - grad_by += face_p * sfy; - grad_bz += face_p * sfz; - } - - //// correct the boundary gradient - // double nx = boundary_face_vector[face_index * 3 + 0] / magSf[face_index]; - // double ny = boundary_face_vector[face_index * 3 + 
1] / magSf[face_index]; - // double nz = boundary_face_vector[face_index * 3 + 2] / magSf[face_index]; - // double sn_grad = 0; - // double grad_correction = sn_grad * volume[cell_index] - (nx * grad_bx + ny * grad_by + nz * grad_bz); - // grad_bx += nx * grad_correction; - // grad_by += ny * grad_correction; - // grad_bz += nz * grad_correction; - - b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] - grad_bx; - b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] - grad_by; - b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] - grad_bz; +void dfUEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path) { + this->mode_string = mode_string; + this->setting_path = setting_path; + UxSolver = new AmgXSolver(mode_string, setting_path); + UySolver = new AmgXSolver(mode_string, setting_path); + UzSolver = new AmgXSolver(mode_string, setting_path); } -__global__ void add_fvMatrix_kernel(int num_cells, int num_faces, - const int *csr_row_index, - const double *turbSrc_A, const double *turbSrc_b, - const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - int row_index = csr_row_index[index]; - int next_row_index = csr_row_index[index + 1]; - int csr_dim = num_cells + num_faces; - double A_entry; - - for (int i = row_index; i < next_row_index; i++) - { - A_entry = turbSrc_A[i]; - A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + A_entry; - A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + A_entry; - A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + A_entry; - } - b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + turbSrc_b[index * 3 + 0]; - b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + turbSrc_b[index * 3 + 1]; - b_output[num_cells * 2 + index] = b_input[num_cells * 2 + 
index] + turbSrc_b[index * 3 + 2]; -} - -__global__ void offdiagPermutation(const int num_faces, const int *permedIndex, - const double *d_phi_init, double *d_phi) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_faces) - return; - - int p = permedIndex[index]; - d_phi[index] = d_phi_init[p]; -} - -__global__ void boundaryPermutation(const int num_boundary_faces, const int *bouPermedIndex, - const double *boundary_pressure_init, const double *boundary_velocity_init, - double *boundary_pressure, double *boundary_velocity, - double *boundary_nuEff_init, double *boundary_nuEff, - double *boundary_rho_init, double *boundary_rho) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_faces) - return; - - int p = bouPermedIndex[index]; - boundary_velocity[3 * index] = boundary_velocity_init[3 * p]; - boundary_velocity[3 * index + 1] = boundary_velocity_init[3 * p + 1]; - boundary_velocity[3 * index + 2] = boundary_velocity_init[3 * p + 2]; - boundary_pressure[index] = boundary_pressure_init[p]; - boundary_rho[index] = boundary_rho_init[p]; - boundary_nuEff[index] = boundary_nuEff_init[p]; +void dfUEqn::setConstantFields(const std::vector patch_type) { + this->patch_type = patch_type; } -__global__ void fvc_grad_vector_internal(int num_cells, - const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, - const double *sf, const double *vf, const double *tlambdas, const double *volume, - double *grad) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - // A_csr has one more element in each row: itself - int row_index = csr_row_index[index]; - int row_elements = csr_row_index[index + 1] - row_index; - int diag_index = csr_diag_index[index]; - int neighbor_offset = csr_row_index[index] - index; - - double own_vf_x = vf[index * 3 + 0]; - double own_vf_y = vf[index * 3 + 1]; - double own_vf_z = vf[index * 3 + 2]; - double grad_xx = 0; - double grad_xy = 0; - 
double grad_xz = 0; - double grad_yx = 0; - double grad_yy = 0; - double grad_yz = 0; - double grad_zx = 0; - double grad_zy = 0; - double grad_zz = 0; - // lower - for (int i = 0; i < diag_index; i++) - { - int neighbor_index = neighbor_offset + i; - int neighbor_cell_id = csr_col_index[row_index + i]; - double w = tlambdas[neighbor_index]; - double sf_x = sf[neighbor_index * 3 + 0]; - double sf_y = sf[neighbor_index * 3 + 1]; - double sf_z = sf[neighbor_index * 3 + 2]; - double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0]; - double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1]; - double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2]; - double face_x = (1 - w) * own_vf_x + w * neighbor_vf_x; - double face_y = (1 - w) * own_vf_y + w * neighbor_vf_y; - double face_z = (1 - w) * own_vf_z + w * neighbor_vf_z; - grad_xx -= sf_x * face_x; - grad_xy -= sf_x * face_y; - grad_xz -= sf_x * face_z; - grad_yx -= sf_y * face_x; - grad_yy -= sf_y * face_y; - grad_yz -= sf_y * face_z; - grad_zx -= sf_z * face_x; - grad_zy -= sf_z * face_y; - grad_zz -= sf_z * face_z; - } - // upper - for (int i = diag_index + 1; i < row_elements; i++) - { - int neighbor_index = neighbor_offset + i - 1; - int neighbor_cell_id = csr_col_index[row_index + i]; - double w = tlambdas[neighbor_index]; - double sf_x = sf[neighbor_index * 3 + 0]; - double sf_y = sf[neighbor_index * 3 + 1]; - double sf_z = sf[neighbor_index * 3 + 2]; - double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0]; - double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1]; - double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2]; - double face_x = w * own_vf_x + (1 - w) * neighbor_vf_x; - double face_y = w * own_vf_y + (1 - w) * neighbor_vf_y; - double face_z = w * own_vf_z + (1 - w) * neighbor_vf_z; - grad_xx += sf_x * face_x; - grad_xy += sf_x * face_y; - grad_xz += sf_x * face_z; - grad_yx += sf_y * face_x; - grad_yy += sf_y * face_y; - grad_yz += sf_y * face_z; - grad_zx += sf_z * face_x; - grad_zy += sf_z * face_y; - grad_zz += sf_z * 
face_z; - // if (index == 0) - // { - // printf("grad_xx = %.20lf\n", grad_xx); - // // printf("sf_x = %.20lf\n", sf_x); - // // printf("face_x = %.20lf\n", face_x); - // } - } - double vol = volume[index]; - grad[index * 9 + 0] = grad_xx / vol; - grad[index * 9 + 1] = grad_xy / vol; - grad[index * 9 + 2] = grad_xz / vol; - grad[index * 9 + 3] = grad_yx / vol; - grad[index * 9 + 4] = grad_yy / vol; - grad[index * 9 + 5] = grad_yz / vol; - grad[index * 9 + 6] = grad_zx / vol; - grad[index * 9 + 7] = grad_zy / vol; - grad[index * 9 + 8] = grad_zz / vol; -} - -__global__ void fvc_grad_vector_boundary(int num_cells, int num_boundary_cells, - const int *boundary_cell_offset, const int *boundary_cell_id, - const double *boundary_sf, const double *boundary_vf, const double *volume, - double *grad, double *grad_boundary_init) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) - return; - - int cell_offset = boundary_cell_offset[index]; - int next_cell_offset = boundary_cell_offset[index + 1]; - int cell_index = boundary_cell_id[cell_offset]; - - double grad_xx = 0; - double grad_xy = 0; - double grad_xz = 0; - double grad_yx = 0; - double grad_yy = 0; - double grad_yz = 0; - double grad_zx = 0; - double grad_zy = 0; - double grad_zz = 0; - for (int i = cell_offset; i < next_cell_offset; i++) - { - double sf_x = boundary_sf[i * 3 + 0]; - double sf_y = boundary_sf[i * 3 + 1]; - double sf_z = boundary_sf[i * 3 + 2]; - double vf_x = boundary_vf[i * 3 + 0]; - double vf_y = boundary_vf[i * 3 + 1]; - double vf_z = boundary_vf[i * 3 + 2]; - grad_xx += sf_x * vf_x; - grad_xy += sf_x * vf_y; - grad_xz += sf_x * vf_z; - grad_yx += sf_y * vf_x; - grad_yy += sf_y * vf_y; - grad_yz += sf_y * vf_z; - grad_zx += sf_z * vf_x; - grad_zy += sf_z * vf_y; - grad_zz += sf_z * vf_z; - } - - double vol = volume[cell_index]; - - grad[cell_index * 9 + 0] += grad_xx / vol; - grad[cell_index * 9 + 1] += grad_xy / vol; - grad[cell_index * 9 + 2] += grad_xz / 
vol; - grad[cell_index * 9 + 3] += grad_yx / vol; - grad[cell_index * 9 + 4] += grad_yy / vol; - grad[cell_index * 9 + 5] += grad_yz / vol; - grad[cell_index * 9 + 6] += grad_zx / vol; - grad[cell_index * 9 + 7] += grad_zy / vol; - grad[cell_index * 9 + 8] += grad_zz / vol; +void dfUEqn::createNonConstantFieldsInternal() { + // thermophysical fields + checkCudaErrors(cudaMalloc((void**)&d_nu_eff, dataBase_.cell_value_bytes)); + // computed on CPU, used on GPU, need memcpyh2d + checkCudaErrors(cudaMallocHost((void**)&h_nu_eff , dataBase_.cell_value_bytes)); + // intermediate fields + checkCudaErrors(cudaMalloc((void**)&d_grad_u, dataBase_.cell_value_tsr_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_rho_nueff, dataBase_.cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_fvc_output, dataBase_.cell_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_permute, dataBase_.cell_value_vec_bytes)); - grad_boundary_init[index * 9 + 0] = grad[cell_index * 9 + 0]; - grad_boundary_init[index * 9 + 1] = grad[cell_index * 9 + 1]; - grad_boundary_init[index * 9 + 2] = grad[cell_index * 9 + 2]; - grad_boundary_init[index * 9 + 3] = grad[cell_index * 9 + 3]; - grad_boundary_init[index * 9 + 4] = grad[cell_index * 9 + 4]; - grad_boundary_init[index * 9 + 5] = grad[cell_index * 9 + 5]; - grad_boundary_init[index * 9 + 6] = grad[cell_index * 9 + 6]; - grad_boundary_init[index * 9 + 7] = grad[cell_index * 9 + 7]; - grad_boundary_init[index * 9 + 8] = grad[cell_index * 9 + 8]; - // if (index == 1) - // { - // printf("grad[1] = (%lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf)\n", grad[index * 9 + 0], grad[index * 9 + 1], grad[index * 9 + 2], - // grad[index * 9 + 3], grad[index * 9 + 4], grad[index * 9 + 5], grad[index * 9 + 6], grad[index * 9 + 7], grad[index * 9 + 8]); - // } + // getter for h_nu_eff + fieldPointerMap["h_nu_eff"] = h_nu_eff; } - -__global__ void correct_boundary_conditions(int num_boundary_cells, - const int *boundary_cell_offset, const int 
*boundary_cell_id, - const double *boundary_sf, const double *mag_sf, - double *boundary_grad_init, double *boundary_grad, const double *boundary_deltaCoeffs, - const double *internal_velocity, const double *boundary_velocity, const int *U_patch_type) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) - return; - - int cell_offset = boundary_cell_offset[index]; - int next_cell_offset = boundary_cell_offset[index + 1]; - int cell_index = boundary_cell_id[cell_offset]; - - // initialize boundary_grad - double grad_xx = boundary_grad_init[index * 9 + 0]; - double grad_xy = boundary_grad_init[index * 9 + 1]; - double grad_xz = boundary_grad_init[index * 9 + 2]; - double grad_yx = boundary_grad_init[index * 9 + 3]; - double grad_yy = boundary_grad_init[index * 9 + 4]; - double grad_yz = boundary_grad_init[index * 9 + 5]; - double grad_zx = boundary_grad_init[index * 9 + 6]; - double grad_zy = boundary_grad_init[index * 9 + 7]; - double grad_zz = boundary_grad_init[index * 9 + 8]; - - double internal_U_x = internal_velocity[cell_index * 3 + 0]; - double internal_U_y = internal_velocity[cell_index * 3 + 1]; - double internal_U_z = internal_velocity[cell_index * 3 + 2]; - - for (int i = cell_offset; i < next_cell_offset; i++) - { - // OpenFoam code - // const vectorField n - // ( - // vsf.mesh().Sf().boundaryField()[patchi] - // / vsf.mesh().magSf().boundaryField()[patchi] - // ); - // gGradbf[patchi] += n * - // ( - // vsf.boundaryField()[patchi].snGrad() - // - (n & gGradbf[patchi]) - // ); - // template // fixedValue - // Foam::tmp> Foam::fvPatchField::snGrad() const - // { - // return patch_.deltaCoeffs()*(*this - patchInternalField()); - // } - - double n_x = boundary_sf[i * 3 + 0] / mag_sf[i]; - double n_y = boundary_sf[i * 3 + 1] / mag_sf[i]; - double n_z = boundary_sf[i * 3 + 2] / mag_sf[i]; - - double sn_grad_x, sn_grad_y, sn_grad_z; - int patchIndex = U_patch_type[i]; - if (patchIndex == 0) { // zeroGradient - sn_grad_x = 
0; - sn_grad_y = 0; - sn_grad_z = 0; - } else if (patchIndex == 1) { // fixedValue - sn_grad_x = boundary_deltaCoeffs[i] * (boundary_velocity[i * 3 + 0] - internal_velocity[cell_index * 3 + 0]); - sn_grad_y = boundary_deltaCoeffs[i] * (boundary_velocity[i * 3 + 1] - internal_velocity[cell_index * 3 + 1]); - sn_grad_z = boundary_deltaCoeffs[i] * (boundary_velocity[i * 3 + 2] - internal_velocity[cell_index * 3 + 2]); - // if (index == 1) - // { - // printf("cell_index = %d\n", cell_index); - // printf("boundary_velocity = %e\n", boundary_velocity[i * 3 + 1]); - // printf("internal_velocity = %e\n", internal_velocity[cell_index * 3 + 0]); - // } - - } - // TODO: implement other BCs - double grad_correction_x = sn_grad_x - (n_x * grad_xx + n_y * grad_yx + n_z * grad_zx); - double grad_correction_y = sn_grad_y - (n_x * grad_xy + n_y * grad_yy + n_z * grad_zy); - double grad_correction_z = sn_grad_z - (n_x * grad_xz + n_y * grad_yz + n_z * grad_zz); - boundary_grad[i * 9 + 0] = grad_xx + n_x * grad_correction_x; - boundary_grad[i * 9 + 1] = grad_xy + n_x * grad_correction_y; - boundary_grad[i * 9 + 2] = grad_xz + n_x * grad_correction_z; - boundary_grad[i * 9 + 3] = grad_yx + n_y * grad_correction_x; - boundary_grad[i * 9 + 4] = grad_yy + n_y * grad_correction_y; - boundary_grad[i * 9 + 5] = grad_yz + n_y * grad_correction_z; - boundary_grad[i * 9 + 6] = grad_zx + n_z * grad_correction_x; - boundary_grad[i * 9 + 7] = grad_zy + n_z * grad_correction_y; - boundary_grad[i * 9 + 8] = grad_zz + n_z * grad_correction_z; - // if (index == 1) - // { - // printf("boundary_grad = (%lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf)\n", boundary_grad[i * 9 + 0], boundary_grad[i * 9 + 1], boundary_grad[i * 9 + 2], - // boundary_grad[i * 9 + 3], boundary_grad[i * 9 + 4], boundary_grad[i * 9 + 5], boundary_grad[i * 9 + 6], boundary_grad[i * 9 + 7], boundary_grad[i * 9 + 8]); - // } +void dfUEqn::createNonConstantFieldsBoundary() { + // thermophysical fields + 
checkCudaErrors(cudaMalloc((void**)&d_boundary_nu_eff, dataBase_.boundary_surface_value_bytes)); + // computed on CPU, used on GPU, need memcpyh2d + checkCudaErrors(cudaMallocHost((void**)&h_boundary_nu_eff, dataBase_.boundary_surface_value_bytes)); + // intermediate fields + checkCudaErrors(cudaMalloc((void**)&d_boundary_grad_u, dataBase_.boundary_surface_value_tsr_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_nueff, dataBase_.boundary_surface_value_bytes)); + // boundary coeff fields + checkCudaErrors(cudaMalloc((void**)&d_value_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_value_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes)); + + // getter for h_boundary_nu_eff + fieldPointerMap["h_boundary_nu_eff"] = h_boundary_nu_eff; +} + +void dfUEqn::createNonConstantLduAndCsrFields() { + checkCudaErrors(cudaMalloc((void**)&d_lower, dataBase_.surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_upper, dataBase_.surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_diag, dataBase_.cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_source, dataBase_.cell_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_diag_vector, dataBase_.cell_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_A, dataBase_.csr_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_b, dataBase_.cell_value_vec_bytes)); +} + +void dfUEqn::initNonConstantFieldsBoundary() { + update_boundary_coeffs_vector(dataBase_.stream, 
dataBase_.num_boundary_surfaces, dataBase_.num_patches, + dataBase_.patch_size.data(), patch_type.data(), + d_value_internal_coeffs, d_value_boundary_coeffs, + d_gradient_internal_coeffs, d_gradient_boundary_coeffs); +} + +void dfUEqn::preProcessForRhoEqn(const double *h_rho, const double *h_phi, const double *h_boundary_phi) { + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_rho, h_rho, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi, h_phi, dataBase_.surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_phi, h_boundary_phi, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); +} + +void dfUEqn::preProcess(const double *h_u, const double *h_boundary_u, const double *h_p, const double *h_boundary_p, + const double *h_nu_eff, const double *h_boundary_nu_eff, const double *h_boundary_rho) { + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_u, h_u, dataBase_.cell_value_vec_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_u, h_boundary_u, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_p, h_p, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_p, h_boundary_p, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(d_nu_eff, h_nu_eff, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_nu_eff, h_boundary_nu_eff, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_rho, h_boundary_rho, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + + 
checkCudaErrors(cudaMemsetAsync(d_lower, 0, dataBase_.surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_upper, 0, dataBase_.surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_diag, 0, dataBase_.cell_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_source, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_internal_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_boundary_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_A, 0, dataBase_.csr_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_b, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_diag_vector, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); // TODO: maybe a better way +} + +void dfUEqn::process() { + //使用event计算时间 + float time_elapsed=0; + cudaEvent_t start,stop; + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + checkCudaErrors(cudaEventRecord(start,0)); + +#ifndef TIME_GPU + if(!graph_created) { + DEBUG_TRACE; + checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); +#endif + + permute_vector_h2d(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, d_permute); + fvm_ddt_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, + dataBase_.d_rho, dataBase_.d_rho_old, d_permute, dataBase_.d_volume, + d_diag, d_source, 1.); + fvm_div_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, + dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_phi, dataBase_.d_weight, + d_lower, d_upper, d_diag, // end for internal + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_phi, d_value_internal_coeffs, d_value_boundary_coeffs, + d_internal_coeffs, 
d_boundary_coeffs, 1.); + field_multiply_scalar(dataBase_.stream, + dataBase_.num_cells, dataBase_.d_rho, d_nu_eff, d_rho_nueff, // end for internal + dataBase_.num_boundary_surfaces, dataBase_.d_boundary_rho, d_boundary_nu_eff, d_boundary_rho_nueff); + fvm_laplacian_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, + dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_weight, dataBase_.d_mag_sf, dataBase_.d_delta_coeffs, d_rho_nueff, + d_lower, d_upper, d_diag, // end for internal + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_mag_sf, d_boundary_rho_nueff, + d_gradient_internal_coeffs, d_gradient_boundary_coeffs, + d_internal_coeffs, d_boundary_coeffs, -1); + fvc_grad_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, + dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_weight, dataBase_.d_sf, d_permute, d_grad_u, + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_face_cell, dataBase_.d_boundary_u, dataBase_.d_boundary_sf, + dataBase_.d_volume, dataBase_.d_boundary_mag_sf, d_boundary_grad_u, dataBase_.d_boundary_delta_coeffs); + scale_dev2T_tensor(dataBase_.stream, dataBase_.num_cells, d_rho_nueff, d_grad_u, // end for internal + dataBase_.num_boundary_surfaces, d_boundary_rho_nueff, d_boundary_grad_u); + fvc_div_cell_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, + dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_source, // end for internal + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_face_cell, d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume); + // fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, + // dataBase_.d_volume, d_fvc_output, d_source); + fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, 
dataBase_.num_surfaces, + dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_source, + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume, -1.); + // fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, + // dataBase_.d_volume, d_fvc_output, d_source); + +#ifndef TIME_GPU + checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); + checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0)); + graph_created = true; } -} + DEBUG_TRACE; + checkCudaErrors(cudaGraphLaunch(graph_instance, dataBase_.stream)); +#endif -__global__ void dev2_t_tensor(int num, double *tensor) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num) - return; + checkCudaErrors(cudaEventRecord(stop,0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); + fprintf(stderr, "ueqn process time:%f(ms)\n",time_elapsed); - double t_xx = tensor[index * 9 + 0]; - double t_xy = tensor[index * 9 + 1]; - double t_xz = tensor[index * 9 + 2]; - double t_yx = tensor[index * 9 + 3]; - double t_yy = tensor[index * 9 + 4]; - double t_yz = tensor[index * 9 + 5]; - double t_zx = tensor[index * 9 + 6]; - double t_zy = tensor[index * 9 + 7]; - double t_zz = tensor[index * 9 + 8]; - double trace_coeff = (2. / 3.) 
* (t_xx + t_yy + t_zz); - tensor[index * 9 + 0] = t_xx - trace_coeff; - tensor[index * 9 + 1] = t_yx; - tensor[index * 9 + 2] = t_zx; - tensor[index * 9 + 3] = t_xy; - tensor[index * 9 + 4] = t_yy - trace_coeff; - tensor[index * 9 + 5] = t_zy; - tensor[index * 9 + 6] = t_xz; - tensor[index * 9 + 7] = t_yz; - tensor[index * 9 + 8] = t_zz - trace_coeff; + //solve(); } -__global__ void fvc_div_tensor_internal(int num_cells, - const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, - const double *scalar0, const double *scalar1, - const double *sf, const double *vf, const double *tlambdas, const double *volume, - const double sign, const double *b_input, double *b_output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - // A_csr has one more element in each row: itself - int row_index = csr_row_index[index]; - int row_elements = csr_row_index[index + 1] - row_index; - int diag_index = csr_diag_index[index]; - int neighbor_offset = csr_row_index[index] - index; - - double coeff_own = scalar0[index] * scalar1[index]; - - double own_vf_xx = vf[index * 9 + 0]; - double own_vf_xy = vf[index * 9 + 1]; - double own_vf_xz = vf[index * 9 + 2]; - double own_vf_yx = vf[index * 9 + 3]; - double own_vf_yy = vf[index * 9 + 4]; - double own_vf_yz = vf[index * 9 + 5]; - double own_vf_zx = vf[index * 9 + 6]; - double own_vf_zy = vf[index * 9 + 7]; - double own_vf_zz = vf[index * 9 + 8]; - double sum_x = 0; - double sum_y = 0; - double sum_z = 0; - - // lower - for (int i = 0; i < diag_index; i++) - { - int neighbor_index = neighbor_offset + i; - int neighbor_cell_id = csr_col_index[row_index + i]; - double coeff_nei = scalar0[neighbor_cell_id] * scalar1[neighbor_cell_id]; - double w = tlambdas[neighbor_index]; - double sf_x = sf[neighbor_index * 3 + 0]; - double sf_y = sf[neighbor_index * 3 + 1]; - double sf_z = sf[neighbor_index * 3 + 2]; - double neighbor_vf_xx = vf[neighbor_cell_id * 9 + 0]; - double 
neighbor_vf_xy = vf[neighbor_cell_id * 9 + 1]; - double neighbor_vf_xz = vf[neighbor_cell_id * 9 + 2]; - double neighbor_vf_yx = vf[neighbor_cell_id * 9 + 3]; - double neighbor_vf_yy = vf[neighbor_cell_id * 9 + 4]; - double neighbor_vf_yz = vf[neighbor_cell_id * 9 + 5]; - double neighbor_vf_zx = vf[neighbor_cell_id * 9 + 6]; - double neighbor_vf_zy = vf[neighbor_cell_id * 9 + 7]; - double neighbor_vf_zz = vf[neighbor_cell_id * 9 + 8]; - double face_xx = (1 - w) * own_vf_xx * coeff_own + w * neighbor_vf_xx * coeff_nei; - double face_xy = (1 - w) * own_vf_xy * coeff_own + w * neighbor_vf_xy * coeff_nei; - double face_xz = (1 - w) * own_vf_xz * coeff_own + w * neighbor_vf_xz * coeff_nei; - double face_yx = (1 - w) * own_vf_yx * coeff_own + w * neighbor_vf_yx * coeff_nei; - double face_yy = (1 - w) * own_vf_yy * coeff_own + w * neighbor_vf_yy * coeff_nei; - double face_yz = (1 - w) * own_vf_yz * coeff_own + w * neighbor_vf_yz * coeff_nei; - double face_zx = (1 - w) * own_vf_zx * coeff_own + w * neighbor_vf_zx * coeff_nei; - double face_zy = (1 - w) * own_vf_zy * coeff_own + w * neighbor_vf_zy * coeff_nei; - double face_zz = (1 - w) * own_vf_zz * coeff_own + w * neighbor_vf_zz * coeff_nei; - sum_x -= sf_x * face_xx + sf_y * face_yx + sf_z * face_zx; - sum_y -= sf_x * face_xy + sf_y * face_yy + sf_z * face_zy; - sum_z -= sf_x * face_xz + sf_y * face_yz + sf_z * face_zz; - } - // upper - for (int i = diag_index + 1; i < row_elements; i++) - { - int neighbor_index = neighbor_offset + i - 1; - int neighbor_cell_id = csr_col_index[row_index + i]; - double coeff_nei = scalar0[neighbor_cell_id] * scalar1[neighbor_cell_id]; - double w = tlambdas[neighbor_index]; - double sf_x = sf[neighbor_index * 3 + 0]; - double sf_y = sf[neighbor_index * 3 + 1]; - double sf_z = sf[neighbor_index * 3 + 2]; - double neighbor_vf_xx = vf[neighbor_cell_id * 9 + 0]; - double neighbor_vf_xy = vf[neighbor_cell_id * 9 + 1]; - double neighbor_vf_xz = vf[neighbor_cell_id * 9 + 2]; - double 
neighbor_vf_yx = vf[neighbor_cell_id * 9 + 3]; - double neighbor_vf_yy = vf[neighbor_cell_id * 9 + 4]; - double neighbor_vf_yz = vf[neighbor_cell_id * 9 + 5]; - double neighbor_vf_zx = vf[neighbor_cell_id * 9 + 6]; - double neighbor_vf_zy = vf[neighbor_cell_id * 9 + 7]; - double neighbor_vf_zz = vf[neighbor_cell_id * 9 + 8]; - double face_xx = w * own_vf_xx * coeff_own + (1 - w) * neighbor_vf_xx * coeff_nei; - double face_xy = w * own_vf_xy * coeff_own + (1 - w) * neighbor_vf_xy * coeff_nei; - double face_xz = w * own_vf_xz * coeff_own + (1 - w) * neighbor_vf_xz * coeff_nei; - double face_yx = w * own_vf_yx * coeff_own + (1 - w) * neighbor_vf_yx * coeff_nei; - double face_yy = w * own_vf_yy * coeff_own + (1 - w) * neighbor_vf_yy * coeff_nei; - double face_yz = w * own_vf_yz * coeff_own + (1 - w) * neighbor_vf_yz * coeff_nei; - double face_zx = w * own_vf_zx * coeff_own + (1 - w) * neighbor_vf_zx * coeff_nei; - double face_zy = w * own_vf_zy * coeff_own + (1 - w) * neighbor_vf_zy * coeff_nei; - double face_zz = w * own_vf_zz * coeff_own + (1 - w) * neighbor_vf_zz * coeff_nei; - sum_x += sf_x * face_xx + sf_y * face_yx + sf_z * face_zx; - sum_y += sf_x * face_xy + sf_y * face_yy + sf_z * face_zy; - sum_z += sf_x * face_xz + sf_y * face_yz + sf_z * face_zz; - } - double vol = volume[index]; - b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + sum_x * sign; - b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + sum_y * sign; - b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + sum_z * sign; -} - -__global__ void fvc_div_tensor_boundary(int num_cells, int num_boundary_cells, - const int *boundary_cell_offset, const int *boundary_cell_id, - const double *boundary_scalar0, const double *boundary_scalar1, - const double *boundary_sf, const double *boundary_vf, const double *volume, - const double sign, const double *b_input, double *b_output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= 
num_boundary_cells) - return; - - int cell_offset = boundary_cell_offset[index]; - int next_cell_offset = boundary_cell_offset[index + 1]; - int cell_index = boundary_cell_id[cell_offset]; - - // OpenFoam code - // Foam::surfaceInterpolationScheme::dotInterpolate - // if (vf.boundaryField()[pi].coupled()) - // { - // psf = - // pSf - // & ( - // pLambda*vf.boundaryField()[pi].patchInternalField() - // + (1.0 - pLambda)*vf.boundaryField()[pi].patchNeighbourField() - // ); - // } - // else - // { - // psf = pSf & vf.boundaryField()[pi]; - // } - // tmp> surfaceIntegrate - // forAll(mesh.boundary()[patchi], facei) - // { - // ivf[pFaceCells[facei]] += pssf[facei]; - // } - double sum_x = 0; - double sum_y = 0; - double sum_z = 0; - for (int i = cell_offset; i < next_cell_offset; i++) - { - double sf_x = boundary_sf[i * 3 + 0]; - double sf_y = boundary_sf[i * 3 + 1]; - double sf_z = boundary_sf[i * 3 + 2]; - double face_xx = boundary_vf[i * 9 + 0]; - double face_xy = boundary_vf[i * 9 + 1]; - double face_xz = boundary_vf[i * 9 + 2]; - double face_yx = boundary_vf[i * 9 + 3]; - double face_yy = boundary_vf[i * 9 + 4]; - double face_yz = boundary_vf[i * 9 + 5]; - double face_zx = boundary_vf[i * 9 + 6]; - double face_zy = boundary_vf[i * 9 + 7]; - double face_zz = boundary_vf[i * 9 + 8]; - - // if not coupled - double coeff = boundary_scalar0[i] * boundary_scalar1[i]; - sum_x += (sf_x * face_xx + sf_y * face_yx + sf_z * face_zx) * coeff; - sum_y += (sf_x * face_xy + sf_y * face_yy + sf_z * face_zy) * coeff; - sum_z += (sf_x * face_xz + sf_y * face_yz + sf_z * face_zz) * coeff; - } - double vol = volume[cell_index]; - b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] + sum_x * sign; - b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] + sum_y * sign; - b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + sum_z * sign; -} - -__global__ void fvm_laplacian_uncorrected_vector_internal(int num_cells, int 
num_faces, - const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, - const double *scalar0, const double *scalar1, const double *weight, - const double *magsf, const double *distance, - const double sign, const double *A_csr_input, double *A_csr_output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - // A_csr has one more element in each row: itself - int row_index = csr_row_index[index]; - int row_elements = csr_row_index[index + 1] - row_index; - int diag_index = csr_diag_index[index]; - int neighbor_offset = csr_row_index[index] - index; - int csr_dim = num_cells + num_faces; - - double own_scalar0 = scalar0[index]; - double own_scalar1 = scalar1[index]; - double own_coeff = own_scalar0 * own_scalar1; - - // fvm.upper() = deltaCoeffs.primitiveField()*gammaMagSf.primitiveField(); - // fvm.negSumDiag(); - double sum_diag = 0; - // lower - for (int i = 0; i < diag_index; i++) - { - int neighbor_index = neighbor_offset + i; - int neighbor_cell_id = csr_col_index[i + row_index]; - double w = weight[neighbor_index]; - double nei_scalar0 = scalar0[neighbor_cell_id]; - double nei_scalar1 = scalar1[neighbor_cell_id]; - double nei_coeff = nei_scalar0 * nei_scalar1; - double gamma = w * (nei_coeff - own_coeff) + own_coeff; - double gamma_magsf = gamma * magsf[neighbor_index]; - double coeff = gamma_magsf * distance[neighbor_index]; - A_csr_output[csr_dim * 0 + row_index + i] = A_csr_input[csr_dim * 0 + row_index + i] + coeff * sign; - A_csr_output[csr_dim * 1 + row_index + i] = A_csr_input[csr_dim * 1 + row_index + i] + coeff * sign; - A_csr_output[csr_dim * 2 + row_index + i] = A_csr_input[csr_dim * 2 + row_index + i] + coeff * sign; - - sum_diag += (-coeff); - } - // upper - for (int i = diag_index + 1; i < row_elements; i++) - { - int neighbor_index = neighbor_offset + i - 1; - int neighbor_cell_id = csr_col_index[i + row_index]; - double w = weight[neighbor_index]; - double nei_scalar0 = 
scalar0[neighbor_cell_id]; - double nei_scalar1 = scalar1[neighbor_cell_id]; - double nei_coeff = nei_scalar0 * nei_scalar1; - double gamma = w * (own_coeff - nei_coeff) + nei_coeff; - double gamma_magsf = gamma * magsf[neighbor_index]; - double coeff = gamma_magsf * distance[neighbor_index]; - A_csr_output[csr_dim * 0 + row_index + i] = A_csr_input[csr_dim * 0 + row_index + i] + coeff * sign; - A_csr_output[csr_dim * 1 + row_index + i] = A_csr_input[csr_dim * 1 + row_index + i] + coeff * sign; - A_csr_output[csr_dim * 2 + row_index + i] = A_csr_input[csr_dim * 2 + row_index + i] + coeff * sign; - sum_diag += (-coeff); - } - A_csr_output[csr_dim * 0 + row_index + diag_index] = A_csr_input[csr_dim * 0 + row_index + diag_index] + sum_diag * sign; // diag - A_csr_output[csr_dim * 1 + row_index + diag_index] = A_csr_input[csr_dim * 1 + row_index + diag_index] + sum_diag * sign; // diag - A_csr_output[csr_dim * 2 + row_index + diag_index] = A_csr_input[csr_dim * 2 + row_index + diag_index] + sum_diag * sign; // diag -} - -__global__ void fvm_laplacian_uncorrected_vector_boundary(int num_cells, int num_faces, int num_boundary_cells, - const int *csr_row_index, const int *csr_diag_index, - const int *boundary_cell_offset, const int *boundary_cell_id, - const double *boundary_scalar0, const double *boundary_scalar1, - const double *boundary_magsf, const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, - const double sign, const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output, - double *ueqn_internal_coeffs, double *ueqn_boundary_coeffs) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) - return; - - int cell_offset = boundary_cell_offset[index]; - int next_cell_offset = boundary_cell_offset[index + 1]; - int cell_index = boundary_cell_id[cell_offset]; - - int row_index = csr_row_index[cell_index]; - int diag_index = csr_diag_index[cell_index]; - int csr_dim = num_cells + 
num_faces; - int csr_index = row_index + diag_index; - - // OpenFoam code - // if (pvf.coupled()) - // { - // fvm.internalCoeffs()[patchi] = - // pGamma*pvf.gradientInternalCoeffs(pDeltaCoeffs); - // fvm.boundaryCoeffs()[patchi] = - // -pGamma*pvf.gradientBoundaryCoeffs(pDeltaCoeffs); - // } - // else - // { - // fvm.internalCoeffs()[patchi] = pGamma*pvf.gradientInternalCoeffs(); - // fvm.boundaryCoeffs()[patchi] = - - // pGamma*pvf.gradientBoundaryCoeffs(); - // } - double internal_coeffs_x = 0; - double internal_coeffs_y = 0; - double internal_coeffs_z = 0; - double boundary_coeffs_x = 0; - double boundary_coeffs_y = 0; - double boundary_coeffs_z = 0; - for (int i = cell_offset; i < next_cell_offset; i++) - { - double gamma = boundary_scalar0[i] * boundary_scalar1[i]; - double gamma_magsf = gamma * boundary_magsf[i]; - internal_coeffs_x += gamma_magsf * gradient_internal_coeffs[i * 3 + 0]; - internal_coeffs_y += gamma_magsf * gradient_internal_coeffs[i * 3 + 1]; - internal_coeffs_z += gamma_magsf * gradient_internal_coeffs[i * 3 + 2]; - boundary_coeffs_x -= gamma_magsf * gradient_boundary_coeffs[i * 3 + 0]; - boundary_coeffs_y -= gamma_magsf * gradient_boundary_coeffs[i * 3 + 1]; - boundary_coeffs_z -= gamma_magsf * gradient_boundary_coeffs[i * 3 + 2]; - } - - ueqn_internal_coeffs[cell_index * 3 + 0] += internal_coeffs_x * sign; - ueqn_internal_coeffs[cell_index * 3 + 1] += internal_coeffs_y * sign; - ueqn_internal_coeffs[cell_index * 3 + 2] += internal_coeffs_z * sign; - ueqn_boundary_coeffs[cell_index * 3 + 0] += boundary_coeffs_x * sign; - ueqn_boundary_coeffs[cell_index * 3 + 1] += boundary_coeffs_y * sign; - ueqn_boundary_coeffs[cell_index * 3 + 2] += boundary_coeffs_z * sign; - - A_csr_output[csr_dim * 0 + csr_index] = A_csr_input[csr_dim * 0 + csr_index] + internal_coeffs_x * sign; - A_csr_output[csr_dim * 1 + csr_index] = A_csr_input[csr_dim * 1 + csr_index] + internal_coeffs_y * sign; - A_csr_output[csr_dim * 2 + csr_index] = A_csr_input[csr_dim * 2 + 
csr_index] + internal_coeffs_z * sign; - b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] + boundary_coeffs_x * sign; - b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] + boundary_coeffs_y * sign; - b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + boundary_coeffs_z * sign; -} - -__global__ void addBoundaryDiag(int num_cells, int num_boundary_cells, - const int *csr_row_index, const int *csr_diag_index, - const int *boundary_cell_offset, const int *boundary_cell_id, - const double *ueqn_internal_coeffs, const double *ueqn_boundary_coeffs, - const double *psi, double *H) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) - return; - - int cell_offset = boundary_cell_offset[index]; - int next_cell_offset = boundary_cell_offset[index + 1]; - int cell_index = boundary_cell_id[cell_offset]; - - // addBoundaryDiag(boundaryDiagCmpt, cmpt); // add internal coeffs - // boundaryDiagCmpt.negate(); - double internal_x = ueqn_internal_coeffs[cell_index * 3 + 0]; - double internal_y = ueqn_internal_coeffs[cell_index * 3 + 1]; - double internal_z = ueqn_internal_coeffs[cell_index * 3 + 2]; - - // addCmptAvBoundaryDiag(boundaryDiagCmpt); - double ave_internal = (internal_x + internal_y + internal_z) / 3; - - H[num_cells * 0 + cell_index] = (-internal_x + ave_internal) * psi[num_cells * 0 + cell_index]; - H[num_cells * 1 + cell_index] = (-internal_y + ave_internal) * psi[num_cells * 1 + cell_index]; - H[num_cells * 2 + cell_index] = (-internal_z + ave_internal) * psi[num_cells * 2 + cell_index]; -} - -__global__ void permute_psi_d2h(int num_cells, const double *input, double *output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - output[index * 3 + 0] = input[num_cells * 0 + index]; - output[index * 3 + 1] = input[num_cells * 1 + index]; - output[index * 3 + 2] = input[num_cells * 2 + index]; -} - -__global__ void 
permute_psi_h2d(int num_cells, const double *input, double *output) +void dfUEqn::sync() { - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - output[num_cells * 0 + index] = input[index * 3 + 0]; - output[num_cells * 1 + index] = input[index * 3 + 1]; - output[num_cells * 2 + index] = input[index * 3 + 2]; + checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); } -__global__ void lduMatrix_H(int num_cells, - const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, - const double *volume, const double *psi, const double *A_csr, const double *b, - const double *ueqn_boundary_coeffs, double *H) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; +void dfUEqn::solve() { + ldu_to_csr(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, + dataBase_.d_boundary_face_cell, + dataBase_.d_lower_to_csr_index, dataBase_.d_upper_to_csr_index, dataBase_.d_diag_to_csr_index, + d_lower, d_upper, d_diag, d_source, d_internal_coeffs, d_boundary_coeffs, d_A, d_b, d_diag_vector); - // A_csr has one more element in each row: itself - int row_index = csr_row_index[index]; - int row_elements = csr_row_index[index + 1] - row_index; - int diag_index = csr_diag_index[index]; - int neighbor_offset = csr_row_index[index] - index; + int nNz = dataBase_.num_cells + dataBase_.num_surfaces * 2; // matrix entries + sync(); - double APsi_x = 0.; - double APsi_y = 0.; - double APsi_z = 0.; - // lower - for (int i = 0; i < diag_index; i++) + if (num_iteration == 0) // first interation { - int neighbor_cell_id = csr_col_index[i + row_index]; - APsi_x += A_csr[row_index + i] * psi[num_cells * 0 + neighbor_cell_id]; - APsi_y += A_csr[row_index + i] * psi[num_cells * 1 + neighbor_cell_id]; - APsi_z += A_csr[row_index + i] * psi[num_cells * 2 + neighbor_cell_id]; - } - // upper - for (int i = diag_index + 1; i < row_elements; i++) - { - int neighbor_cell_id 
= csr_col_index[i + row_index]; - APsi_x += A_csr[row_index + i] * psi[num_cells * 0 + neighbor_cell_id]; - APsi_y += A_csr[row_index + i] * psi[num_cells * 1 + neighbor_cell_id]; - APsi_z += A_csr[row_index + i] * psi[num_cells * 2 + neighbor_cell_id]; - } - - H[num_cells * 0 + index] = H[num_cells * 0 + index] - APsi_x + b[num_cells * 0 + index]; - H[num_cells * 1 + index] = H[num_cells * 1 + index] - APsi_y + b[num_cells * 1 + index]; - H[num_cells * 2 + index] = H[num_cells * 2 + index] - APsi_z + b[num_cells * 2 + index]; - - double vol = volume[index]; - H[num_cells * 0 + index] = H[num_cells * 0 + index] / vol; - H[num_cells * 1 + index] = H[num_cells * 1 + index] / vol; - H[num_cells * 2 + index] = H[num_cells * 2 + index] / vol; -} - -__global__ void addBoundarySource(int num_cells, int num_boundary_cells, - const int *csr_row_index, const int *csr_diag_index, - const int *boundary_cell_offset, const int *boundary_cell_id, - const double *ueqn_internal_coeffs, const double *ueqn_boundary_coeffs, - const double *volume, double *H) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) - return; - - int cell_offset = boundary_cell_offset[index]; - int cell_index = boundary_cell_id[cell_offset]; - - double vol = volume[index]; - - H[num_cells * 0 + index] = H[num_cells * 0 + index] + ueqn_boundary_coeffs[cell_index * 3 + 0] / vol; - H[num_cells * 1 + index] = H[num_cells * 1 + index] + ueqn_boundary_coeffs[cell_index * 3 + 1] / vol; - H[num_cells * 2 + index] = H[num_cells * 2 + index] + ueqn_boundary_coeffs[cell_index * 3 + 2] / vol; -} - -__global__ void addAveInternaltoDiag(int num_cells, int num_boundary_cells, - const int *csr_row_index, const int *csr_diag_index, - const int *boundary_cell_offset, const int *boundary_cell_id, - const double *ueqn_internal_coeffs, const double *ueqn_boundary_coeffs, double *A) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) - return; - - int 
cell_offset = boundary_cell_offset[index]; - int next_cell_offset = boundary_cell_offset[index + 1]; - int cell_index = boundary_cell_id[cell_offset]; - - double internal_x = ueqn_internal_coeffs[cell_index * 3 + 0]; - double internal_y = ueqn_internal_coeffs[cell_index * 3 + 1]; - double internal_z = ueqn_internal_coeffs[cell_index * 3 + 2]; - - double ave_internal = (internal_x + internal_y + internal_z) / 3; - - A[cell_index] = ave_internal; -} - -__global__ void addDiagDivVolume(int num_cells, const int *csr_row_index, - const int *csr_diag_index, const double *A_csr, const double *volume, - double *ueqn_internal_coeffs, const double *A_input, double *A_output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - int row_index = csr_row_index[index]; - int diag_index = csr_diag_index[index]; - int csr_index = row_index + diag_index; - - double vol = volume[index]; - - A_output[index] = (A_input[index] + A_csr[csr_index] - ueqn_internal_coeffs[index * 3]) / vol; -} - -__global__ void ueqn_update_BoundaryCoeffs_kernel(int num_boundary_faces, const double *boundary_phi, double *internal_coeffs, - double *boundary_coeffs, double *laplac_internal_coeffs, - double *laplac_boundary_coeffs, const int *U_patch_type, - const double *boundary_velocity, const double *boundary_deltaCoeffs) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_faces) - return; - - int patchIndex = U_patch_type[index]; - if (patchIndex == 0) { // zeroGradient - double bouPhi = boundary_phi[index]; - internal_coeffs[index * 3 + 0] = bouPhi * 1.; // valueInternalCoeffs = 1. - internal_coeffs[index * 3 + 1] = bouPhi * 1.; - internal_coeffs[index * 3 + 2] = bouPhi * 1.; - boundary_coeffs[index * 3 + 0] = -bouPhi * 0.; // valueBoundaryCoeffs = 0. - boundary_coeffs[index * 3 + 1] = -bouPhi * 0.; - boundary_coeffs[index * 3 + 2] = -bouPhi * 0.; - laplac_internal_coeffs[index * 3 + 0] = 0.; // gradientInternalCoeffs = 0. 
- laplac_internal_coeffs[index * 3 + 1] = 0.; - laplac_internal_coeffs[index * 3 + 2] = 0.; - laplac_boundary_coeffs[index * 3 + 0] = 0.; // gradientBoundaryCoeffs = 0. - laplac_boundary_coeffs[index * 3 + 1] = 0.; - laplac_boundary_coeffs[index * 3 + 2] = 0.; - } else if (patchIndex == 1) { // fixedValue - double bouDeltaCoeffs = boundary_deltaCoeffs[index]; - double bouPhi = boundary_phi[index]; - internal_coeffs[index * 3 + 0] = bouPhi * 0.; // valueInternalCoeffs = 0. - internal_coeffs[index * 3 + 1] = bouPhi * 0.; - internal_coeffs[index * 3 + 2] = bouPhi * 0.; - boundary_coeffs[index * 3 + 0] = -bouPhi * boundary_velocity[index * 3 + 0]; // valueBoundaryCoeffs = boundaryValue - boundary_coeffs[index * 3 + 1] = -bouPhi * boundary_velocity[index * 3 + 1]; - boundary_coeffs[index * 3 + 2] = -bouPhi * boundary_velocity[index * 3 + 2]; - laplac_internal_coeffs[index * 3 + 0] = -1 * bouDeltaCoeffs; // gradientInternalCoeffs = -1 * boundaryDeltaCoeffs - laplac_internal_coeffs[index * 3 + 1] = -1 * bouDeltaCoeffs; - laplac_internal_coeffs[index * 3 + 2] = -1 * bouDeltaCoeffs; - laplac_boundary_coeffs[index * 3 + 0] = bouDeltaCoeffs * boundary_velocity[index * 3 + 0]; // gradientBoundaryCoeffs = boundaryDeltaCoeffs * boundaryValue - laplac_boundary_coeffs[index * 3 + 1] = bouDeltaCoeffs * boundary_velocity[index * 3 + 1]; - laplac_boundary_coeffs[index * 3 + 2] = bouDeltaCoeffs * boundary_velocity[index * 3 + 2]; - } else if (patchIndex == 2) { // empty - double bouPhi = boundary_phi[index]; - internal_coeffs[index * 3 + 0] = bouPhi * 0.; // valueInternalCoeffs = 0. - internal_coeffs[index * 3 + 1] = bouPhi * 0.; - internal_coeffs[index * 3 + 2] = bouPhi * 0.; - boundary_coeffs[index * 3 + 0] = -bouPhi * 0.; // valueBoundaryCoeffs = 0. - boundary_coeffs[index * 3 + 1] = -bouPhi * 0.; - boundary_coeffs[index * 3 + 2] = -bouPhi * 0.; - laplac_internal_coeffs[index * 3 + 0] = 0.; // gradientInternalCoeffs = 0. 
- laplac_internal_coeffs[index * 3 + 1] = 0.; - laplac_internal_coeffs[index * 3 + 2] = 0.; - laplac_boundary_coeffs[index * 3 + 0] = 0.; // gradientBoundaryCoeffs = 0. - laplac_boundary_coeffs[index * 3 + 1] = 0.; - laplac_boundary_coeffs[index * 3 + 2] = 0.; + printf("Initializing AmgX Linear Solver\n"); + UxSolver->setOperator(dataBase_.num_cells, nNz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A); + UySolver->setOperator(dataBase_.num_cells, nNz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A + nNz); + UzSolver->setOperator(dataBase_.num_cells, nNz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A + 2 * nNz); } - // TODO implement coupled conditions -} - -__global__ void ueqn_correct_BoundaryConditions_kernel(int num_cells, int num_boundary_cells, - const int *boundary_cell_offset, const int *boundary_cell_id, - const double *velocity, double *boundary_velocity, const int *U_patch_type) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) - return; - - int cell_offset = boundary_cell_offset[index]; - int next_cell_offset = boundary_cell_offset[index + 1]; - int cell_index = boundary_cell_id[cell_offset]; - - for (int i = cell_offset; i < next_cell_offset; i++) + else { - int patchIndex = U_patch_type[i]; - switch (patchIndex) - { - case 0: // zeroGradient - { - boundary_velocity[i * 3 + 0] = velocity[cell_index]; - boundary_velocity[i * 3 + 1] = velocity[num_cells * 1 + cell_index]; - boundary_velocity[i * 3 + 2] = velocity[num_cells * 2 + cell_index]; - break; - } - case 1: - break; - case 2: - break; - // TODO implement coupled conditions - } + UxSolver->updateOperator(dataBase_.num_cells, nNz, d_A); + UySolver->updateOperator(dataBase_.num_cells, nNz, d_A + nNz); + UzSolver->updateOperator(dataBase_.num_cells, nNz, d_A + 2 * nNz); } + UxSolver->solve(dataBase_.num_cells, d_permute, d_b); + UySolver->solve(dataBase_.num_cells, d_permute + dataBase_.num_cells, d_b + dataBase_.num_cells); + 
UzSolver->solve(dataBase_.num_cells, d_permute + 2 * dataBase_.num_cells, d_b + 2 * dataBase_.num_cells); + num_iteration++; } -// constructor -dfUEqn::dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile) - : dataBase_(dataBase) -{ - stream = dataBase_.stream; - - UxSolver = new AmgXSolver(modeStr, cfgFile); - UySolver = new AmgXSolver(modeStr, cfgFile); - UzSolver = new AmgXSolver(modeStr, cfgFile); - - num_cells = dataBase_.num_cells; - cell_bytes = dataBase_.cell_bytes; - num_faces = dataBase_.num_faces; - cell_vec_bytes = dataBase_.cell_vec_bytes; - csr_value_vec_bytes = dataBase_.csr_value_vec_bytes; - num_boundary_cells = dataBase_.num_boundary_cells; - num_surfaces = dataBase_.num_surfaces; - - d_A_csr_row_index = dataBase_.d_A_csr_row_index; - d_A_csr_diag_index = dataBase_.d_A_csr_diag_index; - d_A_csr_col_index = dataBase_.d_A_csr_col_index; - - h_A_csr = new double[(num_cells + num_faces) * 3]; - h_b = new double[num_cells * 3]; - cudaMallocHost(&h_psi, cell_vec_bytes); - cudaMallocHost(&h_H, cell_vec_bytes); - cudaMallocHost(&h_A, cell_bytes); - - checkCudaErrors(cudaMalloc((void **)&d_A_csr, csr_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void **)&d_b, cell_vec_bytes)); - checkCudaErrors(cudaMalloc((void **)&d_psi, cell_vec_bytes)); - checkCudaErrors(cudaMalloc((void **)&d_psi_permute, cell_vec_bytes)); - checkCudaErrors(cudaMalloc((void **)&d_H, cell_vec_bytes)); - checkCudaErrors(cudaMalloc((void **)&d_H_permute, cell_vec_bytes)); - checkCudaErrors(cudaMalloc((void **)&d_A, cell_bytes)); - checkCudaErrors(cudaMalloc((void **)&d_ueqn_internal_coeffs, cell_vec_bytes)); - checkCudaErrors(cudaMalloc((void **)&d_ueqn_boundary_coeffs, cell_vec_bytes)); -} - -void dfUEqn::fvm_ddt(double *vector_old) -{ - // Copy the host input array in host memory to the device input array in device memory - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_velocity_old, vector_old, cell_vec_bytes, cudaMemcpyHostToDevice, stream)); - 
size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - fvm_ddt_kernel<<>>(num_cells, num_faces, dataBase_.rdelta_t, - d_A_csr_row_index, d_A_csr_diag_index, - dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, dataBase_.d_velocity_old, d_A_csr, d_b, d_A_csr, d_b, d_psi); -} - -void dfUEqn::fvm_div(double *boundary_pressure_init, double *boundary_velocity_init, - double *boundary_nuEff_init, double *boundary_rho_init) -{ - // copy and permutate boundary variable - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_velocity_init, boundary_velocity_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_pressure_init, boundary_pressure_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_nuEff_init, boundary_nuEff_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_rho_init, boundary_rho_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - - size_t threads_per_block = 1024; - size_t blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block; - boundaryPermutation<<>>(dataBase_.num_boundary_faces, dataBase_.d_bouPermedIndex, dataBase_.d_boundary_pressure_init, - dataBase_.d_boundary_velocity_init, dataBase_.d_boundary_pressure, dataBase_.d_boundary_velocity, - dataBase_.d_boundary_nuEff_init, dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho_init, dataBase_.d_boundary_rho); - - // initialize boundary coeffs (must after the update of d_boundary_velocity) - threads_per_block = 1024; - blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block; - ueqn_update_BoundaryCoeffs_kernel<<>>(dataBase_.num_boundary_faces, dataBase_.d_boundary_phi, - dataBase_.d_internal_coeffs, 
dataBase_.d_boundary_coeffs, - dataBase_.d_laplac_internal_coeffs, dataBase_.d_laplac_boundary_coeffs, - dataBase_.d_boundary_UpatchType, dataBase_.d_boundary_velocity, dataBase_.d_boundary_deltaCoeffs); - - blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - fvm_div_internal<<>>(num_cells, num_faces, - d_A_csr_row_index, d_A_csr_diag_index, - dataBase_.d_weight, dataBase_.d_phi, d_A_csr, d_b, d_A_csr, d_b); - blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; - fvm_div_boundary<<>>(num_cells, num_faces, num_boundary_cells, - d_A_csr_row_index, d_A_csr_diag_index, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs, d_A_csr, d_b, d_A_csr, d_b, - d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs); -} - -void dfUEqn::fvc_grad(double *pressure) -{ - // Copy the host input array in host memory to the device input array in device memory - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_pressure, pressure, cell_bytes, cudaMemcpyHostToDevice, stream)); - - // launch cuda kernel - size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - fvc_grad_internal_face<<>>(num_cells, - d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - dataBase_.d_face_vector, dataBase_.d_weight, dataBase_.d_pressure, d_b, d_b); - blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; - fvc_grad_boundary_face<<>>(num_cells, num_boundary_cells, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - dataBase_.d_boundary_face_vector, dataBase_.d_boundary_pressure, d_b, d_b); -} - -void dfUEqn::fvc_grad_vector() -{ - size_t threads_per_block = 512; - size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - fvc_grad_vector_internal<<>>(num_cells, - d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - dataBase_.d_face_vector, 
dataBase_.d_velocity_old, dataBase_.d_weight, dataBase_.d_volume, dataBase_.d_grad); - - blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; - fvc_grad_vector_boundary<<>>(num_cells, num_boundary_cells, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_face_vector, dataBase_.d_boundary_velocity, - dataBase_.d_volume, dataBase_.d_grad, dataBase_.d_grad_boundary_init); - - correct_boundary_conditions<<>>(num_boundary_cells, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_face_vector, dataBase_.d_boundary_face, - dataBase_.d_grad_boundary_init, dataBase_.d_grad_boundary, dataBase_.d_boundary_deltaCoeffs, dataBase_.d_velocity_old, - dataBase_.d_boundary_velocity, dataBase_.d_boundary_UpatchType); -} - -void dfUEqn::dev2T() -{ - size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - dev2_t_tensor<<>>(num_cells, dataBase_.d_grad); - - blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block; - dev2_t_tensor<<>>(dataBase_.num_boundary_faces, dataBase_.d_grad_boundary); -} - -void dfUEqn::fvc_div_tensor(const double *nuEff) -{ - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_nuEff, nuEff, cell_bytes, cudaMemcpyHostToDevice, stream)); - size_t threads_per_block = 512; - size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - fvc_div_tensor_internal<<>>(num_cells, - d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - dataBase_.d_nuEff, dataBase_.d_rho_new, dataBase_.d_face_vector, dataBase_.d_grad, dataBase_.d_weight, - dataBase_.d_volume, 1., d_b, d_b); +void dfUEqn::postProcess(double *h_u) { // TODO: Here may be a bug + permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, d_permute, dataBase_.d_u); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_u, d_permute, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, 
dataBase_.stream)); + checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); - blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; - fvc_div_tensor_boundary<<>>(num_cells, num_boundary_cells, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_face_vector, dataBase_.d_grad_boundary, - dataBase_.d_volume, 1., d_b, d_b); + // some boundary conditions may also need vf.boundary, deltaCoeffs.boundary, and weight.boundary + update_boundary_coeffs_vector(dataBase_.stream, dataBase_.num_boundary_surfaces, dataBase_.num_patches, + dataBase_.patch_size.data(), patch_type.data(), + d_value_internal_coeffs, d_value_boundary_coeffs, + d_gradient_internal_coeffs, d_gradient_boundary_coeffs); } -void dfUEqn::fvm_laplacian() -{ - size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - fvm_laplacian_uncorrected_vector_internal<<>>(num_cells, num_faces, - d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, dataBase_.d_rho_new, dataBase_.d_nuEff, dataBase_.d_weight, - dataBase_.d_face, dataBase_.d_deltaCoeffs, -1., d_A_csr, d_A_csr); - - blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; - fvm_laplacian_uncorrected_vector_boundary<<>>(num_cells, num_faces, num_boundary_cells, - d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_face, dataBase_.d_laplac_internal_coeffs, - dataBase_.d_laplac_boundary_coeffs, -1., d_A_csr, d_b, d_A_csr, d_b, d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs); -} - -void dfUEqn::A(double *Psi) -{ - checkCudaErrors(cudaMemsetAsync(d_A, 0, cell_bytes, stream)); - - size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; - 
addAveInternaltoDiag<<>>(num_cells, num_boundary_cells, d_A_csr_row_index, d_A_csr_diag_index, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs, d_A); - blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - addDiagDivVolume<<>>(num_cells, d_A_csr_row_index, d_A_csr_diag_index, d_A_csr, - dataBase_.d_volume, d_ueqn_internal_coeffs, d_A, d_A); - - checkCudaErrors(cudaMemcpyAsync(h_A, d_A, cell_bytes, cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); - memcpy(Psi, h_A, cell_bytes); -} - -void dfUEqn::H(double *Psi) -{ - checkCudaErrors(cudaMemsetAsync(d_H, 0, cell_bytes * 3, stream)); - size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; - addBoundaryDiag<<>>(num_cells, num_boundary_cells, d_A_csr_row_index, d_A_csr_diag_index, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs, - d_psi, d_H); - - blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - lduMatrix_H<<>>(num_cells, d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - dataBase_.d_volume, d_psi, d_A_csr, d_b, d_ueqn_boundary_coeffs, d_H); - - blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - permute_psi_d2h<<>>(num_cells, d_H, d_H_permute); - - checkCudaErrors(cudaMemcpyAsync(h_H, d_H_permute, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); - - memcpy(Psi, h_H, cell_vec_bytes); -} - -void dfUEqn::initializeTimeStep() -{ - // initialize matrix value - checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, csr_value_vec_bytes, stream)); - checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream)); -} - -void dfUEqn::checkValue(bool print) -{ - checkCudaErrors(cudaMemcpyAsync(h_A_csr, d_A_csr, csr_value_vec_bytes, cudaMemcpyDeviceToHost, stream)); - 
checkCudaErrors(cudaMemcpyAsync(h_b, d_b, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); - - // Synchronize stream - checkCudaErrors(cudaStreamSynchronize(stream)); - if (print) - { - for (int i = 0; i < (num_faces + num_cells); i++) - fprintf(stderr, "h_A_csr[%d]: (%.10lf, %.10lf, %.10lf)\n", i, h_A_csr[i], h_A_csr[i + (num_faces + num_cells)], h_A_csr[i + 2 * (num_faces + num_cells)]); - for (int i = 0; i < num_cells; i++) - fprintf(stderr, "h_b[%d]: (%.10lf, %.10lf, %.10lf)\n", i, h_b[i], h_b[i + num_cells], h_b[i + 2 * num_cells]); +double* dfUEqn::getFieldPointer(const char* fieldAlias, location loc, position pos) { + char mergedName[256]; + if (pos == position::internal) { + sprintf(mergedName, "%s_%s", (loc == location::cpu) ? "h" : "d", fieldAlias); + } else if (pos == position::boundary) { + sprintf(mergedName, "%s_boundary_%s", (loc == location::cpu) ? "h" : "d", fieldAlias); } - char *input_file = "of_output.txt"; - FILE *fp = fopen(input_file, "rb+"); - if (fp == NULL) - { - fprintf(stderr, "Failed to open input file: %s!\n", input_file); + double *pointer = nullptr; + if (fieldPointerMap.find(std::string(mergedName)) != fieldPointerMap.end()) { + pointer = fieldPointerMap[std::string(mergedName)]; } - int readfile = 0; - double *of_b = new double[3 * num_cells]; - double *of_A = new double[3 * (num_faces + num_cells)]; - readfile = fread(of_b, num_cells * 3 * sizeof(double), 1, fp); - readfile = fread(of_A, (num_faces + num_cells) * sizeof(double) * 3, 1, fp); - - std::vector h_A_of_init_vec(3 * (num_cells + num_faces)); - std::copy(of_A, of_A + (num_cells + num_faces) * 3, h_A_of_init_vec.begin()); - - std::vector h_A_of_vec_perm(3 * (num_faces + num_cells), 0); - for (int i = 0; i < num_faces + num_cells; i++) - { - h_A_of_vec_perm[i] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i]]; - h_A_of_vec_perm[i + num_faces + num_cells] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i] + num_faces + num_cells]; - h_A_of_vec_perm[i + 2 * (num_faces + 
num_cells)] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i] + 2 * (num_faces + num_cells)]; + if (pointer == nullptr) { + fprintf(stderr, "Warning! getFieldPointer of %s returns nullptr!\n", mergedName); } - - // b - std::vector h_b_of_init_vec(3 * num_cells); - std::copy(of_b, of_b + 3 * num_cells, h_b_of_init_vec.begin()); - std::vector h_b_of_vec; - for (int i = 0; i < 3 * num_cells; i += 3) - { - h_b_of_vec.push_back(h_b_of_init_vec[i]); + //fprintf(stderr, "fieldAlias: %s, mergedName: %s, pointer: %p\n", fieldAlias, mergedName, pointer); + + return pointer; +} + +void dfUEqn::compareResult(const double *lower, const double *upper, const double *diag, + const double *source, const double *internal_coeffs, const double *boundary_coeffs, + // const double *tmpVal, + bool printFlag) +{ + DEBUG_TRACE; + std::vector h_lower; + h_lower.resize(dataBase_.num_surfaces); + checkCudaErrors(cudaMemcpy(h_lower.data(), d_lower, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost)); + fprintf(stderr, "check h_lower"); + checkVectorEqual(dataBase_.num_surfaces, lower, h_lower.data(), 1e-14, printFlag); + DEBUG_TRACE; + + std::vector h_upper; + h_upper.resize(dataBase_.num_surfaces); + checkCudaErrors(cudaMemcpy(h_upper.data(), d_upper, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost)); + fprintf(stderr, "check h_upper"); + checkVectorEqual(dataBase_.num_surfaces, upper, h_upper.data(), 1e-14, printFlag); + DEBUG_TRACE; + + std::vector h_diag; + h_diag.resize(dataBase_.num_cells); + checkCudaErrors(cudaMemcpy(h_diag.data(), d_diag, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost)); + fprintf(stderr, "check h_diag"); + checkVectorEqual(dataBase_.num_cells, diag, h_diag.data(), 1e-14, printFlag); + DEBUG_TRACE; + + std::vector h_source, h_source_ref; + h_source.resize(dataBase_.num_cells * 3); + h_source_ref.resize(dataBase_.num_cells * 3); + for (int i = 0; i < dataBase_.num_cells; i++) { + h_source_ref[0 * dataBase_.num_cells + i] = source[i * 3 + 0]; + 
h_source_ref[1 * dataBase_.num_cells + i] = source[i * 3 + 1]; + h_source_ref[2 * dataBase_.num_cells + i] = source[i * 3 + 2]; } - // fill RHS_y - for (int i = 1; i < 3 * num_cells; i += 3) - { - h_b_of_vec.push_back(h_b_of_init_vec[i]); + checkCudaErrors(cudaMemcpy(h_source.data(), d_source, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); + fprintf(stderr, "check h_source"); + checkVectorEqual(dataBase_.num_cells * 3, h_source_ref.data(), h_source.data(), 1e-14, printFlag); + DEBUG_TRACE; + + std::vector h_internal_coeffs, h_internal_coeffs_ref; + h_internal_coeffs.resize(dataBase_.num_boundary_surfaces * 3); + h_internal_coeffs_ref.resize(dataBase_.num_boundary_surfaces * 3); + for (int i = 0; i < dataBase_.num_boundary_surfaces; i++) { + h_internal_coeffs_ref[0 * dataBase_.num_boundary_surfaces + i] = internal_coeffs[i * 3 + 0]; + h_internal_coeffs_ref[1 * dataBase_.num_boundary_surfaces + i] = internal_coeffs[i * 3 + 1]; + h_internal_coeffs_ref[2 * dataBase_.num_boundary_surfaces + i] = internal_coeffs[i * 3 + 2]; } - // fill RHS_z - for (int i = 2; i < 3 * num_cells; i += 3) - { - h_b_of_vec.push_back(h_b_of_init_vec[i]); + checkCudaErrors(cudaMemcpy(h_internal_coeffs.data(), d_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dataBase_.num_boundary_surfaces * 3, h_internal_coeffs_ref.data(), h_internal_coeffs.data(), 1e-14, printFlag); + DEBUG_TRACE; + + std::vector h_boundary_coeffs, h_boundary_coeffs_ref; + h_boundary_coeffs.resize(dataBase_.num_boundary_surfaces * 3); + h_boundary_coeffs_ref.resize(dataBase_.num_boundary_surfaces * 3); + for (int i = 0; i < dataBase_.num_boundary_surfaces; i++) { + h_boundary_coeffs_ref[0 * dataBase_.num_boundary_surfaces + i] = boundary_coeffs[i * 3 + 0]; + h_boundary_coeffs_ref[1 * dataBase_.num_boundary_surfaces + i] = boundary_coeffs[i * 3 + 1]; + h_boundary_coeffs_ref[2 * dataBase_.num_boundary_surfaces + i] = boundary_coeffs[i * 3 + 2]; } + 
checkCudaErrors(cudaMemcpy(h_boundary_coeffs.data(), d_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dataBase_.num_boundary_surfaces * 3, h_boundary_coeffs_ref.data(), h_boundary_coeffs.data(), 1e-14, printFlag); + DEBUG_TRACE; - if (print) - { - for (int i = 0; i < (num_faces + num_cells); i++) - printf("h_A_of_vec[%d]:(%.10lf, %.10lf, %.10lf)\n", i, h_A_of_vec_perm[i], h_A_of_vec_perm[i + (num_faces + num_cells)], h_A_of_vec_perm[i + (num_faces + num_cells) * 2]); - for (int i = 0; i < num_cells; i++) - printf("h_b_of_vec[%d]: (%.10lf, %.10lf, %.10lf)\n", i, of_b[i * 3], of_b[i * 3 + 1], of_b[i * 3 + 2]); - } - - // check - // fprintf(stderr, "check of h_A_csr\n"); - // checkVectorEqual(num_faces + num_cells, h_A_of_vec_1mtx.data(), h_A_csr, 1e-5); - // fprintf(stderr, "check of h_b\n"); - // checkVectorEqual(3 * num_cells, h_b_of_vec.data(), h_b, 1e-5); -} - -void dfUEqn::solve() -{ - // for (size_t i = 0; i < num_cells; i++) - // fprintf(stderr, "h_velocity_old[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_velocity_old[3*i], - // h_velocity_old[3*i + 1], h_velocity_old[3*i + 2]); - // constructor AmgXSolver at first interation - // Synchronize stream - // checkCudaErrors(cudaMemcpyAsync(h_b, d_b, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); - // checkCudaErrors(cudaMemcpyAsync(h_psi, d_psi, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); - - checkCudaErrors(cudaStreamSynchronize(stream)); - - // nvtxRangePush("solve"); - - int nNz = num_cells + num_faces; // matrix entries - if (num_iteration == 0) // first interation - { - printf("Initializing AmgX Linear Solver\n"); - UxSolver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr); - UySolver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr + nNz); - UzSolver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr + 2 * nNz); - } - else - { - UxSolver->updateOperator(num_cells, nNz, 
d_A_csr); - UySolver->updateOperator(num_cells, nNz, d_A_csr + nNz); - UzSolver->updateOperator(num_cells, nNz, d_A_csr + 2 * nNz); - } - UxSolver->solve(num_cells, d_psi, d_b); - UySolver->solve(num_cells, d_psi + num_cells, d_b + num_cells); - UzSolver->solve(num_cells, d_psi + 2 * num_cells, d_b + 2 * num_cells); - num_iteration++; - - size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - permute_psi_d2h<<>>(num_cells, d_psi, d_psi_permute); - checkCudaErrors(cudaMemcpyAsync(h_psi, d_psi_permute, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); - // for (size_t i = 0; i < num_cells; i++) - // fprintf(stderr, "h_velocity_after[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_psi[i], - // h_psi[num_cells + i], h_psi[num_cells*2 + i]); + // std::vector h_tmpVal; + // h_tmpVal.resize(dataBase_.num_cells * 3); + // checkCudaErrors(cudaMemcpy(h_tmpVal.data(), d_fvc_output, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); + // checkVectorEqual(dataBase_.num_cells * 3, tmpVal, h_tmpVal.data(), 1e-14, printFlag); + // DEBUG_TRACE; } -void dfUEqn::sync() -{ - checkCudaErrors(cudaStreamSynchronize(stream)); -} - -void dfUEqn::updatePsi(double *Psi) -{ - checkCudaErrors(cudaStreamSynchronize(stream)); - memcpy(Psi, h_psi, cell_vec_bytes); -} - -void dfUEqn::correctBoundaryConditions() -{ - size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; - ueqn_correct_BoundaryConditions_kernel<<>>(num_cells, num_boundary_cells, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - d_psi, dataBase_.d_boundary_velocity, dataBase_.d_boundary_UpatchType); -} - -// correct volecity in pEqn -void dfUEqn::correctPsi(double *Psi) -{ - memcpy(h_psi, Psi, cell_vec_bytes); - checkCudaErrors(cudaMemcpyAsync(d_psi_permute, h_psi, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); - - size_t threads_per_block = 1024; - size_t blocks_per_grid = 
(num_cells + threads_per_block - 1) / threads_per_block; - permute_psi_h2d<<>>(num_cells, d_psi_permute, d_psi); -} - -dfUEqn::~dfUEqn() -{ -} diff --git a/src_gpu_orig/AmgXSolver.H b/src_gpu_orig/AmgXSolver.H new file mode 100644 index 000000000..190808934 --- /dev/null +++ b/src_gpu_orig/AmgXSolver.H @@ -0,0 +1,310 @@ +/** + * \file AmgXSolver.hpp + * \brief Definition of class AmgXSolver. + * \author Pi-Yueh Chuang (pychuang@gwu.edu) + * \author Matt Martineau (mmartineau@nvidia.com) + * \date 2015-09-01 + * \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba. + * \copyright Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * This project is released under MIT License. + */ + + +#ifndef __AMGX_SOLVER_H__ +#define __AMGX_SOLVER_H__ + +// CUDA +#include + +// STL +# include +# include +# include + +// AmgX +# include + +// PETSc +// # include + + +/** \brief A macro to check the returned CUDA error code. + * + * \param call [in] Function call to CUDA API. + */ +# define CHECK(call) \ +do \ +{ \ + const cudaError_t error_code = call; \ + if (error_code != cudaSuccess) \ + { \ + printf("CUDA Error:\n"); \ + printf(" File: %s\n", __FILE__); \ + printf(" Line: %d\n", __LINE__); \ + printf(" Error code: %d\n", error_code); \ + printf(" Error text: %s\n", \ + cudaGetErrorString(error_code)); \ + exit(1); \ + } \ +} while (0) + + + + + + +/** \brief A wrapper class for coupling PETSc and AmgX. + * + * This class is a wrapper of AmgX library for PETSc. PETSc users only need to + * pass a PETSc matrix and vectors into an AmgXSolver instance to solve their + * linear systems. The class is designed specifically for the situation where + * the number of MPI processes is more than the number of GPU devices. + * + * Eaxmple usage: + * \code + * int main(int argc, char **argv) + * { + * // initialize matrix A, RHS, etc using PETSc + * ... 
+ * + * // create an instance of the solver wrapper + * AmgXSolver solver; + * // initialize the instance with communicator, executation mode, and config file + * solver.initialize(comm, mode, file); + * // set matrix A. Currently it only accept PETSc AIJ matrix + * solver.setA(A); + * // solve. x and rhs are PETSc vectors. unkns will be the final result in the end + * solver.solve(unks, rhs); + * // get number of iterations + * int iters; + * solver.getIters(iters); + * // get residual at the last iteration + * double res; + * solver.getResidual(iters, res); + * // finalization + * solver.finalize(); + * + * // other codes + * .... + * + * return 0; + * } + * \endcode + */ +class AmgXSolver +{ + public: + + /** \brief Default constructor. */ + AmgXSolver() = default; + + /** \brief Construct a AmgXSolver instance. + * + * \param comm [in] MPI communicator. + * \param modeStr [in] A string; target mode of AmgX (e.g., dDDI). + * \param cfgFile [in] A string; the path to AmgX configuration file. + */ + AmgXSolver + ( + const std::string &modeStr, + const std::string &cfgFile + ); + + /** \brief Destructor. */ + ~AmgXSolver(); + + /** \brief Initialize a AmgXSolver instance. + * + * \param comm [in] MPI communicator. + * \param modeStr [in] A string; target mode of AmgX (e.g., dDDI). + * \param cfgFile [in] A string; the path to AmgX configuration file. + * + */ + void initialize + ( + const std::string &modeStr, + const std::string &cfgFile + ); + + + /** \brief Finalize this instance. + * + * This function destroys AmgX data. When there are more than one + * AmgXSolver instances, the last one destroyed is also in charge of + * destroying the shared resource object and finalizing AmgX. + * + */ + void finalize(); + + /** \brief Set up the matrix used by AmgX. + * + * This function sets up the AmgX matrix from the provided CSR data + * structures and partition data. + * + * \param nGlobalRows [in] The number of global rows. 
+ * \param nLocalRows [in] The number of local rows on this rank. + * \param nLocalNz [in] The total number of non zero entries locally. + * \param rowOffsets [in] The local CSR matrix row offsets. + * \param colIndicesGlobal [in] The global CSR matrix column indices. + * \param values [in] The local CSR matrix values. + * id of the owning rank for each row. + * + */ + void setOperator + ( + const int nRows, + const int nNz, + const int *rowIndex, + const int *colIndex, + const double *value + ); + + /** \brief Re-sets up an existing AmgX matrix. + * + * Replaces the matrix coefficients with the provided values and performs + * a resetup for the AmgX matrix. + * + * \param nLocalRows [in] The number of local rows on this rank. + * \param nLocalNz [in] The total number of non zero entries locally. + * \param values [in] The local CSR matrix values. + * + */ + void updateOperator + ( + const int nRows, + const int nNz, + const double *value + ); + + /** \brief Solve the linear system. + * + * \p p vector will be used as an initial guess and will be updated to the + * solution by the end of solving. + * + * For cases that use more MPI processes than the number of GPUs, this + * function will do data gathering before solving and data scattering + * after the solving. + * + * \param nLocalRows [in] The number of rows owned by this rank. + * \param pscalar [in, out] The unknown array. + * \param bscalar [in] The RHS array. + * \param matrix [in,out] The AmgX CSR matrix, A. + * + */ + void solve + ( + int nRows, + double* psi, + const double* rhs + ); + + /** \brief Solve the linear system. + * + * \p p vector will be used as an initial guess and will be updated to the + * solution by the end of solving. + * + * For cases that use more MPI processes than the number of GPUs, this + * function will do data gathering before solving and data scattering + * after the solving. + * + * \param nLocalRows [in] The number of rows owned by this rank. 
+ * \param p [in, out] The unknown vector. + * \param b [in] The RHS vector. + * \param matrix [in,out] The AmgX CSR matrix, A. + * + */ + // void solve + // ( + // int nLocalRows, + // Vec& p, + // Vec& b, + // AmgXCSRMatrix& matrix + // ); + + + /** \brief Get the number of iterations of the last solving. + * + * \param iter [out] Number of iterations. + * + */ + void getIters + ( + int &iter + ); + + /** \brief Get the residual at a specific iteration during the last solving. + * + * \param iter [in] Target iteration. + * \param res [out] Returned residual. + * + */ + void getResidual + ( + const int &iter, + double &res + ); + + + private: + + /** \brief Current count of AmgXSolver instances. + * + * This static variable is used to count the number of instances. The + * fisrt instance is responsable for initializing AmgX library and the + * resource instance. + */ + static int count; + + /** \brief A flag indicating if this instance has been initialized. */ + bool isInitialised = false; + + /** \brief A parameter used by AmgX. */ + int ring; + + /** \brief AmgX solver mode. */ + AMGX_Mode mode; + + /** \brief AmgX config object. */ + AMGX_config_handle cfg = nullptr; + + /** \brief AmgX matrix object. */ + AMGX_matrix_handle AmgXA = nullptr; + + /** \brief AmgX vector object representing unknowns. */ + AMGX_vector_handle AmgXP = nullptr; + + /** \brief AmgX vector object representing RHS. */ + AMGX_vector_handle AmgXRHS = nullptr; + + /** \brief AmgX solver object. */ + AMGX_solver_handle solver = nullptr; + + /** \brief AmgX resource object. + * + * Due to the design of AmgX library, using more than one resource + * instance may cause some problems. So we make the resource instance + * as a static member to keep only one instance. + */ + static AMGX_resources_handle rsrc; + + /** \brief Set AmgX solver mode based on the user-provided string. + * + * Available modes are: dDDI, dDFI, dFFI, hDDI, hDFI, hFFI. + * + * \param modeStr [in] a std::string. 
+ */ + void setMode(const std::string &modeStr); + + /** \brief Perform necessary initialization of AmgX. + * + * This function initializes AmgX for current instance. Based on + * \ref AmgXSolver::count "count", only the instance initialized first + * is in charge of initializing AmgX and the resource instance. + * + * \param cfgFile [in] Path to AmgX solver configuration file. + */ + void initAmgX(const std::string &cfgFile); +}; + +#endif + diff --git a/src_gpu_orig/AmgXSolver.cu b/src_gpu_orig/AmgXSolver.cu new file mode 100644 index 000000000..b0076e5c3 --- /dev/null +++ b/src_gpu_orig/AmgXSolver.cu @@ -0,0 +1,296 @@ +/** + * \file AmgXSolver.cpp + * \brief Definition of member functions of the class AmgXSolver. + * \author Pi-Yueh Chuang (pychuang@gwu.edu) + * \author Matt Martineau (mmartineau@nvidia.com) + * \date 2015-09-01 + * \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba. + * \copyright Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * This project is released under MIT License. 
+ */ + +// AmgXWrapper +#include "AmgXSolver.H" +#include +#include + +// initialize AmgXSolver::count to 0 +int AmgXSolver::count = 0; + +// initialize AmgXSolver::rsrc to nullptr; +AMGX_resources_handle AmgXSolver::rsrc = nullptr; + + +/* \implements AmgXSolver::AmgXSolver */ +AmgXSolver::AmgXSolver(const std::string &modeStr, const std::string &cfgFile) +{ + initialize(modeStr, cfgFile); +} + + +/* \implements AmgXSolver::~AmgXSolver */ +AmgXSolver::~AmgXSolver() +{ + if (isInitialised) finalize(); +} + + +/* \implements AmgXSolver::initialize */ +void AmgXSolver::initialize(const std::string &modeStr, const std::string &cfgFile) +{ + + // if this instance has already been initialized, skip + if (isInitialised) { + fprintf(stderr, + "This AmgXSolver instance has been initialized on this process.\n"); + exit(0); + } + + // increase the number of AmgXSolver instances + count += 1; + + // get the mode of AmgX solver + setMode(modeStr); + + // initialize AmgX + initAmgX(cfgFile); + + // a bool indicating if this instance is initialized + isInitialised = true; + + return; +} + +/* \implements AmgXSolver::setMode */ +void AmgXSolver::setMode(const std::string &modeStr) +{ + if (modeStr == "dDDI") + mode = AMGX_mode_dDDI; + else if (modeStr == "dDFI") + mode = AMGX_mode_dDFI; + else if (modeStr == "dFFI") + mode = AMGX_mode_dFFI; + else if (modeStr[0] == 'h') { + printf("CPU mode, %s, is not supported in this wrapper!", + modeStr.c_str()); + exit(0); + } + else { + printf("%s is not an available mode! 
Available modes are: " + "dDDI, dDFI, dFFI.\n", modeStr.c_str()); + exit(0); + } +} + + +/* \implements AmgXSolver::initAmgX */ + void AmgXSolver::initAmgX(const std::string &cfgFile) +{ + // only the first instance (AmgX solver) is in charge of initializing AmgX + if (count == 1) + { + // initialize AmgX + AMGX_SAFE_CALL(AMGX_initialize()); + + // intialize AmgX plugings + AMGX_SAFE_CALL(AMGX_initialize_plugins()); + + // let AmgX to handle errors returned + AMGX_SAFE_CALL(AMGX_install_signal_handler()); + } + + // create an AmgX configure object + AMGX_SAFE_CALL(AMGX_config_create_from_file(&cfg, cfgFile.c_str())); + + // let AmgX handle returned error codes internally + AMGX_SAFE_CALL(AMGX_config_add_parameters(&cfg, "exception_handling=1")); + + // create an AmgX resource object, only the first instance is in charge + if (count == 1) AMGX_resources_create_simple(&rsrc, cfg); + + // create AmgX vector object for unknowns and RHS + AMGX_vector_create(&AmgXP, rsrc, mode); + AMGX_vector_create(&AmgXRHS, rsrc, mode); + + // create AmgX matrix object for unknowns and RHS + AMGX_matrix_create(&AmgXA, rsrc, mode); + + // create an AmgX solver object + AMGX_solver_create(&solver, rsrc, mode, cfg); + + // obtain the default number of rings based on current configuration + AMGX_config_get_default_number_of_rings(cfg, &ring); +} + +/* \implements AmgXSolver::finalize */ +void AmgXSolver::finalize() +{ + // skip if this instance has not been initialised + if (!isInitialised) + { + fprintf(stderr, + "This AmgXWrapper has not been initialised. 
" + "Please initialise it before finalization.\n"); + exit(0); + } + + // destroy solver instance + AMGX_solver_destroy(solver); + + // destroy matrix instance + AMGX_matrix_destroy(AmgXA); + + // destroy RHS and unknown vectors + AMGX_vector_destroy(AmgXP); + AMGX_vector_destroy(AmgXRHS); + + // only the last instance need to destroy resource and finalizing AmgX + if (count == 1) + { + AMGX_resources_destroy(rsrc); + AMGX_SAFE_CALL(AMGX_config_destroy(cfg)); + + AMGX_SAFE_CALL(AMGX_finalize_plugins()); + AMGX_SAFE_CALL(AMGX_finalize()); + } + else + { + AMGX_config_destroy(cfg); + } + + // decrease the number of instances + count -= 1; + + // change status + isInitialised = false; +} + +/* \implements AmgXSolver::setOperator */ +void AmgXSolver::setOperator +( + const int nRows, + const int nNz, + const int *rowIndex, + const int *colIndex, + const double *value +) +{ + + // Check the matrix size is not larger than tolerated by AmgX + if(nRows > std::numeric_limits::max()) + { + fprintf(stderr, + "AmgX does not support a global number of rows greater than " + "what can be stored in 32 bits (nGlobalRows = %d).\n", + nRows); + exit(0); + } + + if (nNz > std::numeric_limits::max()) + { + fprintf(stderr, + "AmgX does not support non-zeros per (consolidated) rank greater than" + "what can be stored in 32 bits (nLocalNz = %d).\n", + nNz); + exit(0); + } + + // upload matrix A to AmgX + AMGX_matrix_upload_all( + AmgXA, nRows, nNz, 1, 1, rowIndex, colIndex, value, nullptr); + + // bind the matrix A to the solver + AMGX_solver_setup(solver, AmgXA); + + // connect (bind) vectors to the matrix + AMGX_vector_bind(AmgXP, AmgXA); + AMGX_vector_bind(AmgXRHS, AmgXA); +} + + +/* \implements AmgXSolver::updateOperator */ +void AmgXSolver::updateOperator +( + const int nRows, + const int nNz, + const double *value +) +{ + + // Replace the coefficients for the CSR matrix A within AmgX + AMGX_matrix_replace_coefficients(AmgXA, nRows, nNz, value, nullptr); + + // Re-setup the solver (a 
reduced overhead setup that accounts for consistent matrix structure) + AMGX_solver_resetup(solver, AmgXA); +} + +/* \implements AmgXSolver::solve */ +// void AmgXSolver::solve( +// int nLocalRows, Vec& p, Vec& b, AmgXCSRMatrix& matrix) +// { +// double* pscalar; +// double* bscalar; + +// // get pointers to the raw data of local vectors +// VecGetArray(p, &pscalar); +// VecGetArray(b, &bscalar); + +// solve(nLocalRows, pscalar, bscalar, matrix); + +// VecRestoreArray(p, &pscalar); +// VecRestoreArray(b, &bscalar); +// } + + +/* \implements AmgXSolver::solve */ +void AmgXSolver::solve( + int nRows, double* psi, const double* rhs) +{ + // Upload potentially consolidated vectors to AmgX + AMGX_vector_upload(AmgXP, nRows, 1, psi); + AMGX_vector_upload(AmgXRHS, nRows, 1, rhs); + + // Solve + AMGX_solver_solve(solver, AmgXRHS, AmgXP); + + // Get the status of the solver + AMGX_SOLVE_STATUS status; + AMGX_solver_get_status(solver, &status); + + // Check whether the solver successfully solved the problem + if (status != AMGX_SOLVE_SUCCESS) + { + fprintf(stderr, "AmgX solver failed to solve the system! 
" + "The error code is %d.\n", + status); + } + + // Download data from device + AMGX_vector_download(AmgXP, psi); + + // get norm and iteration number + double irnorm = 0., rnorm = 0.; + int nIters = 0; + getResidual(0, irnorm); + getIters(nIters); + getResidual(nIters, rnorm); + printf("Initial residual = %.10lf, Final residual = %.5e, No Iterations %d\n", irnorm, rnorm, nIters); + +} + + +/* \implements AmgXSolver::getIters */ +void AmgXSolver::getIters(int &iter) +{ + // only processes using AmgX will try to get # of iterations + AMGX_solver_get_iterations_number(solver, &iter); +} + + +/* \implements AmgXSolver::getResidual */ +void AmgXSolver::getResidual(const int &iter, double &res) +{ + // only processes using AmgX will try to get residual + AMGX_solver_get_iteration_residual(solver, iter, 0, &res); +} + diff --git a/src_gpu_orig/CMakeLists.txt b/src_gpu_orig/CMakeLists.txt new file mode 100644 index 000000000..3a6d59825 --- /dev/null +++ b/src_gpu_orig/CMakeLists.txt @@ -0,0 +1,38 @@ +# +# dfMatrix CMake configuration +# +cmake_minimum_required(VERSION 3.5) + +project(dfMatrixOrig LANGUAGES CXX CUDA) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +find_package(CUDA REQUIRED) +find_package(MPI REQUIRED) +find_package(CUDAToolkit REQUIRED) +find_library(LIBAMGXSH amgxsh PATHS $ENV{AMGX_DIR}/build) + +add_compile_options(-arch=sm_70 -fmad=false) + +include_directories( + ${MPI_INCLUDE_PATH} + ${CUDA_INCLUDE_DIRS} + $ENV{AMGX_DIR}/include + $ENV{DF_ROOT}/src_gpu +) + +add_library(${PROJECT_NAME} + SHARED + dfMatrixDataBaseOrig.cu + dfMatrixOpBaseOrig.cu) + +target_link_libraries(${PROJECT_NAME} + ${MPI_LIBRARIES} + ${CUDA_LIBRARIES} + ${LIBAMGXSH} +) +target_compile_options(dfMatrixOrig PUBLIC -g) +option(DFMATRIX_ENABLE_DETAILED_DEBUG "Enable detailed debug build" OFF) +if (DFMATRIX_ENABLE_DETAILED_DEBUG) + target_compile_definitions(${PROJECT_NAME} PRIVATE DEBUG) +endif() diff --git a/src_gpu/GPUMesh.H b/src_gpu_orig/GPUMesh.H similarity 
index 100% rename from src_gpu/GPUMesh.H rename to src_gpu_orig/GPUMesh.H diff --git a/src_gpu/GPUfield.H b/src_gpu_orig/GPUfield.H similarity index 100% rename from src_gpu/GPUfield.H rename to src_gpu_orig/GPUfield.H diff --git a/src_gpu/GPUfield.cpp b/src_gpu_orig/GPUfield.cpp similarity index 100% rename from src_gpu/GPUfield.cpp rename to src_gpu_orig/GPUfield.cpp diff --git a/src_gpu/dfEEqn.H b/src_gpu_orig/dfEEqn.H similarity index 100% rename from src_gpu/dfEEqn.H rename to src_gpu_orig/dfEEqn.H diff --git a/src_gpu/dfEEqn.cu b/src_gpu_orig/dfEEqn.cu similarity index 100% rename from src_gpu/dfEEqn.cu rename to src_gpu_orig/dfEEqn.cu diff --git a/src_gpu_orig/dfMatrixDataBaseOrig.H b/src_gpu_orig/dfMatrixDataBaseOrig.H new file mode 100644 index 000000000..e4a06d861 --- /dev/null +++ b/src_gpu_orig/dfMatrixDataBaseOrig.H @@ -0,0 +1,607 @@ +#pragma once + +#include +#include +#include "cuda_profiler_api.h" +#include +#include "nvtx3/nvToolsExt.h" +#include +#include +#include +#include +#include +#include +#include +#include "dfMatrixDataBase.H" + + +void constructBoundarySelector(std::vector& patchTypeSelector, const std::string& patchTypeStr, const int patchSize); + +struct dfMatrixDataBaseOrig +{ + // - cuda resource + cudaStream_t stream; + + // - number of cell size + int num_cells; + // - number of face size + int num_surfaces; + // - number of offdiagnal entry size (2*num_surfaces) + int num_faces; + // - number of boundary cells + int num_boundary_cells; + // - number of boundary faces + int num_boundary_faces; + + int num_species; + + // - mesh variables + // - csr_row_index + int *h_A_csr_row_index=nullptr, *d_A_csr_row_index=nullptr; + // - csr_col_index + int *h_A_csr_col_index=nullptr, *d_A_csr_col_index=nullptr; + // - csr_diag_index + int *h_A_csr_diag_index=nullptr, *d_A_csr_diag_index=nullptr; + + // - the pre-permutated and post-permutated interpolation weight list + std::vector h_weight_vec_init, h_weight_vec; + // - the pre-permutated and 
post-permutated flux (phi) list + std::vector h_phi_vec_init, h_phi_vec; + // - the pre-permutated and post-permutated cell face vector list + std::vector h_face_vector_vec_init, h_face_vector_vec; + std::vector h_face_vec_init, h_face_vec; + std::vector h_deltaCoeffs_vec_init, h_deltaCoeffs_vec; + // - the host pointer to rho_new, rho_old, velocity_old, pressure and volume list + double *h_rho_new = nullptr, *h_rho_old = nullptr, *h_velocity_old = nullptr, + *h_pressure = nullptr; + const double *h_volume = nullptr; + // - the host pointer to the pre-permutated and post-permutated interpolation weight list + double *h_weight_init = nullptr, *h_weight = nullptr; + // - the host pointer to the pre-permutated and post-permutated flux (phi) list + double *h_phi_init = nullptr, *h_phi = nullptr; + // - the host pointer to the pre-permutated and post-permutated cell face vector list + double *h_face_vector_init = nullptr, *h_face_vector = nullptr; + double *h_face_init = nullptr, *h_face = nullptr; + double *h_deltaCoeffs_init = nullptr, *h_deltaCoeffs = nullptr; + // - the device pointer to rho_new, rho_old, velocity_old, pressure and volume list + double *d_rho_new = nullptr, *d_rho_old = nullptr, *d_velocity_old = nullptr, + *d_pressure = nullptr, *d_volume = nullptr; + // - the device pointer to Y(vector Yi) + //std::vector d_Y; + double *d_Y = nullptr; + // - the device pointer to the pre-permutated and post-permutated interpolation weight list + double *d_weight_init = nullptr, *d_weight = nullptr; + double *d_weight_upwind = nullptr; + // - the device pointer to the pre-permutated and post-permutated flux (phi) list + double *d_phi_init = nullptr, *d_phi = nullptr; + // - the device pointer to the pre-permutated and post-permutated cell face vector list + double *d_face_vector_init = nullptr, *d_face_vector = nullptr; + double *d_face_init = nullptr, *d_face = nullptr; + double *d_deltaCoeffs_init = nullptr, *d_deltaCoeffs = nullptr; + std::vector d_rhoD_vector; 
+ + double *d_hDiffCorrFlux = nullptr; + double *d_diffAlphaD = nullptr; + double *d_rhoD = nullptr; + double *d_alpha = nullptr; + + double rdelta_t = 1/1e-6; + + /** + * @brief boundary related variables + */ + int *h_boundary_cell_offset = nullptr, *d_boundary_cell_offset=nullptr; + int *h_boundary_cell_id = nullptr, *d_boundary_cell_id = nullptr; + double *h_internal_coeffs = nullptr, *h_boundary_coeffs = nullptr, + *h_boundary_pressure = nullptr, *h_boundary_face_vector = nullptr, + *h_boundary_face = nullptr, *d_boundary_face = nullptr, + *h_boundary_deltaCoeffs = nullptr, *d_boundary_deltaCoeffs = nullptr, + *d_internal_coeffs = nullptr, *d_boundary_coeffs = nullptr, + *d_internal_coeffs_init = nullptr, *d_boundary_coeffs_init = nullptr, + *d_laplac_internal_coeffs = nullptr, *d_laplac_boundary_coeffs = nullptr, + *d_laplac_internal_coeffs_init = nullptr, *d_laplac_boundary_coeffs_init = nullptr, + *d_boundary_pressure = nullptr, *d_boundary_face_vector = nullptr, + *d_boundary_pressure_init = nullptr, + *d_boundary_phi = nullptr, *d_boundary_phi_init = nullptr, + *d_boundary_velocity = nullptr, *d_boundary_velocity_init = nullptr, + *d_boundary_nuEff = nullptr, *d_boundary_nuEff_init = nullptr, + *d_boundary_rho = nullptr, *d_boundary_rho_init = nullptr; + std::vector d_boundary_Y_vector; + std::vector d_boundary_Y_init_vector; + std::vector d_internal_coeffs_Y_vector; + std::vector d_boundary_coeffs_Y_vector; + std::vector d_laplac_internal_coeffs_Y_vector; + std::vector d_laplac_boundary_coeffs_Y_vector; + double *d_internal_coeffs_Y = nullptr; + double *d_boundary_coeffs_Y = nullptr; + double *d_laplac_internal_coeffs_Y = nullptr; + double *d_laplac_boundary_coeffs_Y = nullptr; + std::vector d_boundary_rhoD_vector; + double *d_boundary_mut_sct = nullptr; + double *d_boundary_rhoD = nullptr; + double *d_boundary_alpha = nullptr; + + double *d_boundary_hDiffCorrFlux = nullptr; + int *d_boundary_UpatchType = nullptr; + int *d_boundary_YpatchType = nullptr; 
+ + std::vector boundPermutationList; + std::vector ueqn_internalCoeffs, ueqn_boundaryCoeffs; + std::vector boundary_face_vector; + std::vector boundary_pressure; + std::vector boundary_face; + std::vector boundary_deltaCoeffs; + std::vector> patch_type_init; + std::vector> patch_type; + + // - the device pointer to the permutated index list + std::vector permedIndex; + int *d_permedIndex=nullptr; + int *d_bouPermedIndex = nullptr; + + + // bytesize + // - bytes of diagnal entries + size_t cell_bytes; + // - bytes of diagnal entries (vector) + size_t cell_vec_bytes; + // - bytes of diagnal index + size_t cell_index_bytes; + // - bytes of diagnal index + size_t face_bytes; + size_t face_vec_bytes; + size_t face_index_bytes; + + size_t boundary_cell_bytes; + size_t boundary_cell_vec_bytes; + size_t boundary_cell_index_bytes; + + size_t boundary_face_bytes; + size_t boundary_face_vec_bytes; + size_t boundary_face_index_bytes; + + // A_csr has one more element in each row: itself + size_t csr_row_index_bytes; + size_t csr_col_index_bytes; + size_t csr_value_bytes; + size_t csr_value_vec_bytes; + + // extra matrix information + double *d_turbSrc_A = nullptr, *d_turbSrc_b = nullptr, *d_turbSrc_A_init = nullptr; + std::vector h_turbSrc_init_mtx_vec, h_turbSrc_init_1mtx; + std::vector h_turbSrc_init_src_vec, h_turbSrc_src_vec; + std::vector tmpPermutatedList; + int * d_tmpPermutatedList = nullptr; + + // double *h_A_csr = nullptr, *h_b = nullptr, *h_psi = nullptr; + // double *d_A_csr = nullptr, *d_b = nullptr, *d_psi = nullptr; + + int num_iteration; + + double time_monitor_CPU; + double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test; + + double* d_grad = nullptr; + double* d_grad_boundary = nullptr, *d_grad_boundary_init = nullptr; + double* d_nuEff = nullptr; + + // constructor + dfMatrixDataBaseOrig(); + dfMatrixDataBaseOrig(int num_surfaces, int num_cells, int num_boundary_faces, int num_species, int & num_boundary_cells_output, + const 
int *neighbour, const int *owner, const double* volume, const double* weight, const double* face_vector, const double* face, + const double* deltaCoeffs, std::vector boundary_face_vector_init, std::vector boundary_face_init, + std::vector boundary_deltaCoeffs_init, std::vector boundary_cell_id_init, std::vector> patch_type_init) + : num_cells(num_cells), num_faces(num_surfaces*2), num_surfaces(num_surfaces), num_species(num_species), num_iteration(0), + num_boundary_faces(num_boundary_faces), h_volume(volume), patch_type_init(patch_type_init) + { + // create cuda stream + checkCudaErrors(cudaStreamCreate(&stream)); + + // allocate field pointer in pin memory + cudaMallocHost(&h_phi_init, num_faces * sizeof(double)); + cudaMallocHost(&h_rho_old, num_cells * sizeof(double)); + + h_weight_vec_init.resize(num_faces); + h_weight_vec.resize(num_faces); + h_face_vector_vec_init.resize(num_faces*3); + h_face_vector_vec.resize(num_faces*3); + h_face_vec_init.resize(num_faces); + h_face_vec.resize(num_faces); + h_deltaCoeffs_vec_init.resize(num_faces); + h_deltaCoeffs_vec.resize(num_faces); + h_turbSrc_init_mtx_vec.resize(num_faces + num_cells); + h_turbSrc_init_1mtx.resize(num_faces + num_cells); + h_turbSrc_init_src_vec.resize(3*num_cells); + h_turbSrc_src_vec.resize(3*num_cells); + + // byte sizes + cell_bytes = num_cells * sizeof(double); + cell_vec_bytes = num_cells * 3 * sizeof(double); + cell_index_bytes = num_cells * sizeof(int); + + face_bytes = num_faces * sizeof(double); + face_vec_bytes = num_faces * 3 * sizeof(double); + face_index_bytes = num_faces * sizeof(int); + + // A_csr has one more element in each row: itself + csr_row_index_bytes = (num_cells + 1) * sizeof(int); + csr_col_index_bytes = (num_cells + num_faces) * sizeof(int); + csr_value_bytes = (num_cells + num_faces) * sizeof(double); + csr_value_vec_bytes = (num_cells + num_faces) * 3 * sizeof(double); + + /************************construct mesh variables****************************/ + /** + * 1. 
h_csr_row_index & h_csr_diag_index + */ + std::vector h_mtxEntry_perRow_vec(num_cells); + std::vector h_csr_diag_index_vec(num_cells); + std::vector h_csr_row_index_vec(num_cells + 1, 0); + + for (int faceI = 0; faceI < num_surfaces; faceI++) + { + h_csr_diag_index_vec[neighbour[faceI]]++; + h_mtxEntry_perRow_vec[neighbour[faceI]]++; + h_mtxEntry_perRow_vec[owner[faceI]]++; + } + + // - consider diagnal element in each row + std::transform(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_mtxEntry_perRow_vec.begin(), [](int n) + {return n + 1;}); + // - construct h_csr_row_index & h_csr_diag_index + std::partial_sum(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_csr_row_index_vec.begin()+1); + // - assign h_csr_row_index & h_csr_diag_index + h_A_csr_row_index = h_csr_row_index_vec.data(); + h_A_csr_diag_index = h_csr_diag_index_vec.data(); + + /** + * 2. h_csr_col_index + */ + std::vector rowIndex(num_faces + num_cells), colIndex(num_faces + num_cells), diagIndex(num_cells); + std::iota(diagIndex.begin(), diagIndex.end(), 0); + + // initialize the RowIndex (rowIndex of lower + upper + diagnal) + std::copy(neighbour, neighbour + num_surfaces, rowIndex.begin()); + std::copy(owner, owner + num_surfaces, rowIndex.begin() + num_surfaces); + std::copy(diagIndex.begin(), diagIndex.end(), rowIndex.begin() + num_faces); + // initialize the ColIndex (colIndex of lower + upper + diagnal) + std::copy(owner, owner + num_surfaces, colIndex.begin()); + std::copy(neighbour, neighbour + num_surfaces, colIndex.begin() + num_surfaces); + std::copy(diagIndex.begin(), diagIndex.end(), colIndex.begin() + num_faces); + + // - construct hashTable for sorting + std::multimap rowColPair; + for (int i = 0; i < 2*num_surfaces+num_cells; i++) + { + rowColPair.insert(std::make_pair(rowIndex[i], colIndex[i])); + } + // - sort + std::vector> globalPerm(rowColPair.begin(), rowColPair.end()); + std::sort(globalPerm.begin(), globalPerm.end(), [] + (const std::pair& pair1, 
const std::pair& pair2){ + if (pair1.first != pair2.first) { + return pair1.first < pair2.first; + } else { + return pair1.second < pair2.second; + } + }); + + std::vector h_csr_col_index_vec; + std::transform(globalPerm.begin(), globalPerm.end(), std::back_inserter(h_csr_col_index_vec), [] + (const std::pair& pair) { + return pair.second; + }); + h_A_csr_col_index = h_csr_col_index_vec.data(); + + // construct a tmp permutated List for add fvMatrix + std::vector tmp_permutation(2*num_surfaces + num_cells); + std::vector tmp_rowIndex(2*num_surfaces + num_cells); + std::iota(tmp_permutation.begin(), tmp_permutation.end(), 0); + std::copy(neighbour, neighbour + num_surfaces, tmp_rowIndex.begin()); + std::copy(diagIndex.begin(), diagIndex.end(), tmp_rowIndex.begin() + num_surfaces); + std::copy(owner, owner + num_surfaces, tmp_rowIndex.begin() + num_surfaces + num_cells); + std::multimap tmpPair; + for (int i = 0; i < 2*num_surfaces+num_cells; i++) + { + tmpPair.insert(std::make_pair(tmp_rowIndex[i], tmp_permutation[i])); + } + std::vector> tmpPerm(tmpPair.begin(), tmpPair.end()); + std::sort(tmpPerm.begin(), tmpPerm.end(), [] + (const std::pair& pair1, const std::pair& pair2){ + if (pair1.first != pair2.first) { + return pair1.first < pair2.first; + } else { + return pair1.second < pair2.second; + } + }); + std::transform(tmpPerm.begin(), tmpPerm.end(), std::back_inserter(tmpPermutatedList), [] + (const std::pair& pair) { + return pair.second; + }); + + /** + * 3. 
boundary imformations + */ + // get boundPermutation and offset lists + std::vector boundPermutationListInit(num_boundary_faces); + std::vector boundOffsetList; + std::iota(boundPermutationListInit.begin(), boundPermutationListInit.end(), 0); + + // - construct hashTable for sorting + std::multimap boundPermutation; + for (int i = 0; i < num_boundary_faces; i++) + { + boundPermutation.insert(std::make_pair(boundary_cell_id_init[i], boundPermutationListInit[i])); + } + + // - sort + std::vector> boundPermPair(boundPermutation.begin(), boundPermutation.end()); + std::sort(boundPermPair.begin(), boundPermPair.end(), [] + (const std::pair& pair1, const std::pair& pair2){ + if (pair1.first != pair2.first) { + return pair1.first < pair2.first; + } else { + return pair1.second < pair2.second; + } + }); + + // - construct boundPermedIndex and boundary_cell_id + std::vector boundary_cell_id; + boundPermutationList.clear(); + std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundary_cell_id), [] + (const std::pair& pair) { + return pair.first; + }); + std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundPermutationList), [] + (const std::pair& pair) { + return pair.second; + }); + + // construct boundary_cell_offset + std::map countMap; + std::vector boundaryCellcount; + for (const auto& cellIndex : boundary_cell_id) + ++ countMap[cellIndex]; + for (const auto& [cellIndex, count] : countMap) + boundaryCellcount.push_back(count); + + num_boundary_cells = boundaryCellcount.size(); + num_boundary_cells_output = num_boundary_cells; + + std::vector boundary_cell_offset(boundaryCellcount.size() + 1, 0); + std::partial_sum(boundaryCellcount.begin(), boundaryCellcount.end(), boundary_cell_offset.begin()+1); + + // assign h_boundary_cell_offset & h_boundary_cell_id + h_boundary_cell_offset = boundary_cell_offset.data(); + h_boundary_cell_id = boundary_cell_id.data(); + + // + boundary_cell_bytes = num_boundary_cells * 
sizeof(double); + boundary_cell_vec_bytes = num_boundary_cells * 3 * sizeof(double); + boundary_cell_index_bytes = num_boundary_cells * sizeof(int); + + boundary_face_bytes = num_boundary_faces * sizeof(double); + boundary_face_vec_bytes = num_boundary_faces * 3 * sizeof(double); + boundary_face_index_bytes = num_boundary_faces * sizeof(int); + + ueqn_internalCoeffs.resize(3*num_boundary_faces); + ueqn_boundaryCoeffs.resize(3*num_boundary_faces); + + boundary_face_vector.resize(3*num_boundary_faces); + boundary_pressure.resize(num_boundary_faces); + boundary_face.resize(num_boundary_faces); + boundary_deltaCoeffs.resize(num_boundary_faces); + + patch_type.resize(2); + patch_type[0].resize(num_boundary_faces); + patch_type[1].resize(num_boundary_faces); + + /** + * 4. permutation list for field variables + */ + std::vector offdiagRowIndex(2*num_surfaces), permIndex(2*num_surfaces); + // - initialize the offdiagRowIndex (rowIndex of lower + rowIndex of upper) + std::copy(neighbour, neighbour + num_surfaces, offdiagRowIndex.begin()); + std::copy(owner, owner + num_surfaces, offdiagRowIndex.begin() + num_surfaces); + + // - initialize the permIndex (0, 1, ..., 2*num_surfaces) + std::iota(permIndex.begin(), permIndex.end(), 0); + + // - construct hashTable for sorting + std::multimap permutation; + for (int i = 0; i < 2*num_surfaces; i++) + { + permutation.insert(std::make_pair(offdiagRowIndex[i], permIndex[i])); + } + // - sort + std::vector> permPair(permutation.begin(), permutation.end()); + std::sort(permPair.begin(), permPair.end(), [] + (const std::pair& pair1, const std::pair& pair2){ + if (pair1.first != pair2.first) { + return pair1.first < pair2.first; + } else { + return pair1.second < pair2.second; + } + }); + // - form permedIndex list + std::transform(permPair.begin(), permPair.end(), std::back_inserter(permedIndex), [] + (const std::pair& pair) { + return pair.second; + }); + + // copy and permutate cell variables + std::copy(weight, weight + 
num_surfaces, h_weight_vec_init.begin()); + std::copy(weight, weight + num_surfaces, h_weight_vec_init.begin() + num_surfaces); + std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin()); + std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin() + 3*num_surfaces); + std::copy(face, face + num_surfaces, h_face_vec_init.begin()); + std::copy(face, face + num_surfaces, h_face_vec_init.begin() + num_surfaces); + std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin()); + std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin() + num_surfaces); + for (int i = 0; i < num_faces; i++) + { + h_weight_vec[i] = h_weight_vec_init[permedIndex[i]]; + h_face_vec[i] = h_face_vec_init[permedIndex[i]]; + h_deltaCoeffs_vec[i] = h_deltaCoeffs_vec_init[permedIndex[i]]; + h_face_vector_vec[i*3] = h_face_vector_vec_init[3*permedIndex[i]]; + h_face_vector_vec[i*3+1] = h_face_vector_vec_init[3*permedIndex[i]+1]; + h_face_vector_vec[i*3+2] = h_face_vector_vec_init[3*permedIndex[i]+2]; + } + h_weight = h_weight_vec.data(); + h_face_vector = h_face_vector_vec.data(); + h_face = h_face_vec.data(); + h_deltaCoeffs = h_deltaCoeffs_vec.data(); + + for (int i = 0; i < num_boundary_faces; i++) + { + boundary_face_vector[3*i] = boundary_face_vector_init[3*boundPermutationList[i]]; + boundary_face_vector[3*i+1] = boundary_face_vector_init[3*boundPermutationList[i]+1]; + boundary_face_vector[3*i+2] = boundary_face_vector_init[3*boundPermutationList[i]+2]; + boundary_face[i] = boundary_face_init[boundPermutationList[i]]; + boundary_deltaCoeffs[i] = boundary_deltaCoeffs_init[boundPermutationList[i]]; + patch_type[0][i] = patch_type_init[0][boundPermutationList[i]]; + patch_type[1][i] = patch_type_init[1][boundPermutationList[i]]; + } + h_boundary_face_vector = boundary_face_vector.data(); + h_boundary_face = boundary_face.data(); + h_boundary_deltaCoeffs = boundary_deltaCoeffs.data(); + + 
/************************allocate memory on device****************************/ + int total_bytes = 0; + + checkCudaErrors(cudaMalloc((void**)&d_A_csr_row_index, csr_row_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_A_csr_col_index, csr_col_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_A_csr_diag_index, cell_index_bytes)); + total_bytes += (csr_row_index_bytes + csr_col_index_bytes + cell_index_bytes); + + //d_Y.resize(num_species); + d_rhoD_vector.resize(num_species); + d_boundary_Y_vector.resize(num_species); + d_boundary_Y_init_vector.resize(num_species); + d_internal_coeffs_Y_vector.resize(num_species); + d_boundary_coeffs_Y_vector.resize(num_species); + d_laplac_internal_coeffs_Y_vector.resize(num_species); + d_laplac_boundary_coeffs_Y_vector.resize(num_species); + d_boundary_rhoD_vector.resize(num_species); + + for (size_t i = 0; i < num_species; ++i){ + //checkCudaErrors(cudaMalloc((void**)&d_Y[i], cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_rhoD_vector[i], cell_bytes)); + } + checkCudaErrors(cudaMalloc((void**)&d_Y, cell_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_rho_new, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_volume, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_pressure, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_velocity_old, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_weight, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_weight_upwind, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_face, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_deltaCoeffs, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_phi, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_phi_init, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_face_vector, face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_nuEff, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_rhoD, cell_bytes * 
num_species)); + checkCudaErrors(cudaMalloc((void**)&d_alpha, cell_bytes)); + total_bytes += (cell_bytes * (5 + 2*num_species) + face_bytes * 6 + cell_vec_bytes + face_vec_bytes); + + checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int))); + checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_id, boundary_face_index_bytes)); + total_bytes += (boundary_face_index_bytes + (num_boundary_cells+1) * sizeof(int)); + + checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure_init, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_phi_init, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_phi, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity_init, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_face_vector, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_face, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_deltaCoeffs, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_init, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_init, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_init, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_init, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff, 
boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff_init, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_rho, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_init, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_mut_sct, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_UpatchType, boundary_face_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_YpatchType, boundary_face_index_bytes)); + for (size_t i = 0; i < num_species; ++i){ + checkCudaErrors(cudaMalloc((void**)&d_boundary_Y_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_Y_init_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_Y_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_Y_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_Y_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_Y_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_rhoD_vector[i], boundary_face_bytes)); + } + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_Y, boundary_face_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_Y, boundary_face_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_Y, boundary_face_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_Y, boundary_face_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_rhoD, boundary_face_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_alpha, boundary_face_bytes)); + + total_bytes += (boundary_face_bytes*10 + boundary_face_vec_bytes * 11); + + // checkCudaErrors(cudaMalloc((void**)&d_A_csr, csr_value_vec_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_b, 
cell_vec_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_psi, cell_vec_bytes)); + total_bytes += (boundary_face_bytes + boundary_face_vec_bytes * 3); + + checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A, csr_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A_init, csr_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_turbSrc_b, cell_vec_bytes)); + total_bytes += (2*csr_value_bytes + cell_vec_bytes); + + checkCudaErrors(cudaMalloc((void**)&d_permedIndex, face_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_bouPermedIndex, boundary_face_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_tmpPermutatedList, csr_col_index_bytes)); + total_bytes += (face_index_bytes + boundary_face_index_bytes + csr_col_index_bytes); + + checkCudaErrors(cudaMalloc((void**)&d_grad, num_cells * 9 * sizeof(double))); + checkCudaErrors(cudaMalloc((void**)&d_grad_boundary, boundary_face_bytes * 9)); + checkCudaErrors(cudaMalloc((void**)&d_grad_boundary_init, boundary_cell_bytes * 9)); + total_bytes += (num_cells * 9 * sizeof(double) + boundary_face_bytes * 9 + boundary_cell_bytes * 9); // FIXME: rename + + checkCudaErrors(cudaMalloc((void**)&d_hDiffCorrFlux, 3 * cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_diffAlphaD, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_hDiffCorrFlux, 3 * boundary_face_bytes)); + + fprintf(stderr, "Total bytes malloc on GPU: %.2fMB\n", total_bytes * 1.0 / 1024 / 1024); + + checkCudaErrors(cudaMemcpyAsync(d_A_csr_row_index, h_A_csr_row_index, csr_row_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_A_csr_col_index, h_A_csr_col_index, csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_A_csr_diag_index, h_A_csr_diag_index, cell_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_offset, h_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int), cudaMemcpyHostToDevice, stream)); + 
checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_id, h_boundary_cell_id, boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + + checkCudaErrors(cudaMemcpyAsync(d_volume, h_volume, cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_weight, h_weight, face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_face, h_face, face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_deltaCoeffs, h_deltaCoeffs, face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_face_vector, h_face_vector, face_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_face_vector, h_boundary_face_vector, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_face, h_boundary_face, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_deltaCoeffs, h_boundary_deltaCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_UpatchType, patch_type[0].data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_YpatchType, patch_type[1].data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + + checkCudaErrors(cudaMemcpyAsync(d_permedIndex, permedIndex.data(), face_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_bouPermedIndex, boundPermutationList.data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_tmpPermutatedList, tmpPermutatedList.data(), csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); + }; + + ~dfMatrixDataBaseOrig(){ + std::cout << "Destructor called." 
<< std::endl; + // TODO: free pointers + }; +}; + diff --git a/src_gpu_orig/dfMatrixDataBaseOrig.cu b/src_gpu_orig/dfMatrixDataBaseOrig.cu new file mode 100644 index 000000000..7eb0ba593 --- /dev/null +++ b/src_gpu_orig/dfMatrixDataBaseOrig.cu @@ -0,0 +1,48 @@ +#include "dfMatrixDataBaseOrig.H" + + +void constructBoundarySelector(std::vector& patchTypeSelector, const std::string& patchTypeStr, + const int patchSize) +{ + boundaryConditions patchCondition; + std::vector tmpSelector; + static std::map BCMap = { + {"zeroGradient", zeroGradient}, + {"fixedValue", fixedValue}, + {"empty", empty}, + {"coupled", coupled} + }; + auto iter = BCMap.find(patchTypeStr); + if (iter != BCMap.end()) { + patchCondition = iter->second; + } else { + throw std::runtime_error("Unknown boundary condition: " + patchTypeStr); + } + // zeroGradient labeled as 0, fixedValue labeled as 1, coupled labeled as 2 + switch (patchCondition){ + case zeroGradient: + { + tmpSelector.resize(patchSize, 0); + patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); + break; + } + case fixedValue: + { + tmpSelector.resize(patchSize, 1); + patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); + break; + } + case empty: + { + tmpSelector.resize(patchSize, 2); + patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); + break; + } + case coupled: + { + tmpSelector.resize(patchSize, 3); + patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); + break; + } + } +} diff --git a/src_gpu_orig/dfMatrixOpBaseOrig.H b/src_gpu_orig/dfMatrixOpBaseOrig.H new file mode 100644 index 000000000..0f61b558b --- /dev/null +++ b/src_gpu_orig/dfMatrixOpBaseOrig.H @@ -0,0 +1,9 @@ +#pragma once + +#include "dfMatrixDataBaseOrig.H" +#include "dfMatrixDataBase.H" + +void fvc_grad_vector_orig(cudaStream_t stream, dfMatrixDataBaseOrig* dataBaseOrig, dfMatrixDataBase& dataBase, double *d_grad, + 
double *d_grad_boundary_init, double *d_grad_boundary); + +void fvc_grad_scalar_orig(cudaStream_t stream, dfMatrixDataBaseOrig* dataBaseOrig, dfMatrixDataBase& dataBase, double *d_grad); \ No newline at end of file diff --git a/src_gpu_orig/dfMatrixOpBaseOrig.cu b/src_gpu_orig/dfMatrixOpBaseOrig.cu new file mode 100644 index 000000000..95737ab12 --- /dev/null +++ b/src_gpu_orig/dfMatrixOpBaseOrig.cu @@ -0,0 +1,460 @@ +#include "dfMatrixOpBaseOrig.H" + + +__global__ void fvc_grad_vector_internal(int num_cells, + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *sf, const double *vf, const double *tlambdas, const double *volume, + double *grad) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int row_elements = csr_row_index[index + 1] - row_index; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double own_vf_x = vf[index * 3 + 0]; + double own_vf_y = vf[index * 3 + 1]; + double own_vf_z = vf[index * 3 + 2]; + double grad_xx = 0; + double grad_xy = 0; + double grad_xz = 0; + double grad_yx = 0; + double grad_yy = 0; + double grad_yz = 0; + double grad_zx = 0; + double grad_zy = 0; + double grad_zz = 0; + // lower + for (int i = 0; i < diag_index; i++) + { + int neighbor_index = neighbor_offset + i; + int neighbor_cell_id = csr_col_index[row_index + i]; + double w = tlambdas[neighbor_index]; + double sf_x = sf[neighbor_index * 3 + 0]; + double sf_y = sf[neighbor_index * 3 + 1]; + double sf_z = sf[neighbor_index * 3 + 2]; + double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0]; + double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1]; + double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2]; + double face_x = (1 - w) * own_vf_x + w * neighbor_vf_x; + double face_y = (1 - w) * own_vf_y + w * neighbor_vf_y; + double face_z = (1 - w) * own_vf_z + w 
* neighbor_vf_z; + grad_xx -= sf_x * face_x; + grad_xy -= sf_x * face_y; + grad_xz -= sf_x * face_z; + grad_yx -= sf_y * face_x; + grad_yy -= sf_y * face_y; + grad_yz -= sf_y * face_z; + grad_zx -= sf_z * face_x; + grad_zy -= sf_z * face_y; + grad_zz -= sf_z * face_z; + } + // upper + for (int i = diag_index + 1; i < row_elements; i++) + { + int neighbor_index = neighbor_offset + i - 1; + int neighbor_cell_id = csr_col_index[row_index + i]; + double w = tlambdas[neighbor_index]; + double sf_x = sf[neighbor_index * 3 + 0]; + double sf_y = sf[neighbor_index * 3 + 1]; + double sf_z = sf[neighbor_index * 3 + 2]; + double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0]; + double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1]; + double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2]; + double face_x = w * own_vf_x + (1 - w) * neighbor_vf_x; + double face_y = w * own_vf_y + (1 - w) * neighbor_vf_y; + double face_z = w * own_vf_z + (1 - w) * neighbor_vf_z; + grad_xx += sf_x * face_x; + grad_xy += sf_x * face_y; + grad_xz += sf_x * face_z; + grad_yx += sf_y * face_x; + grad_yy += sf_y * face_y; + grad_yz += sf_y * face_z; + grad_zx += sf_z * face_x; + grad_zy += sf_z * face_y; + grad_zz += sf_z * face_z; + // if (index == 0) + // { + // printf("grad_xx = %.20lf\n", grad_xx); + // // printf("sf_x = %.20lf\n", sf_x); + // // printf("face_x = %.20lf\n", face_x); + // } + } + double vol = volume[index]; + grad[index * 9 + 0] = grad_xx / vol; + grad[index * 9 + 1] = grad_xy / vol; + grad[index * 9 + 2] = grad_xz / vol; + grad[index * 9 + 3] = grad_yx / vol; + grad[index * 9 + 4] = grad_yy / vol; + grad[index * 9 + 5] = grad_yz / vol; + grad[index * 9 + 6] = grad_zx / vol; + grad[index * 9 + 7] = grad_zy / vol; + grad[index * 9 + 8] = grad_zz / vol; + + + // if (index == 2257) + // { + // printf("grad[2257] = (%.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e)\n", grad[index * 9 + 0], grad[index * 9 + 1], grad[index * 9 + 2], + // grad[index * 9 + 3], grad[index * 9 + 4], grad[index * 
9 + 5], grad[index * 9 + 6], grad[index * 9 + 7], grad[index * 9 + 8]); + // } +} + +__global__ void fvc_grad_vector_boundary(int num_cells, int num_boundary_cells, const int *bouPermedIndex, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_sf, const double *boundary_vf, const double *volume, + double *grad, double *grad_boundary_init) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + double grad_xx = 0; + double grad_xy = 0; + double grad_xz = 0; + double grad_yx = 0; + double grad_yy = 0; + double grad_yz = 0; + double grad_zx = 0; + double grad_zy = 0; + double grad_zz = 0; + for (int i = cell_offset; i < next_cell_offset; i++) + { + int p = bouPermedIndex[i]; + double sf_x = boundary_sf[i * 3 + 0]; + double sf_y = boundary_sf[i * 3 + 1]; + double sf_z = boundary_sf[i * 3 + 2]; + double vf_x = boundary_vf[p * 3 + 0]; + double vf_y = boundary_vf[p * 3 + 1]; + double vf_z = boundary_vf[p * 3 + 2]; + grad_xx += sf_x * vf_x; + grad_xy += sf_x * vf_y; + grad_xz += sf_x * vf_z; + grad_yx += sf_y * vf_x; + grad_yy += sf_y * vf_y; + grad_yz += sf_y * vf_z; + grad_zx += sf_z * vf_x; + grad_zy += sf_z * vf_y; + grad_zz += sf_z * vf_z; + } + + double vol = volume[cell_index]; + + grad[cell_index * 9 + 0] += grad_xx / vol; + grad[cell_index * 9 + 1] += grad_xy / vol; + grad[cell_index * 9 + 2] += grad_xz / vol; + grad[cell_index * 9 + 3] += grad_yx / vol; + grad[cell_index * 9 + 4] += grad_yy / vol; + grad[cell_index * 9 + 5] += grad_yz / vol; + grad[cell_index * 9 + 6] += grad_zx / vol; + grad[cell_index * 9 + 7] += grad_zy / vol; + grad[cell_index * 9 + 8] += grad_zz / vol; + + grad_boundary_init[index * 9 + 0] = grad[cell_index * 9 + 0]; + grad_boundary_init[index * 9 + 1] = grad[cell_index * 9 + 1]; + 
grad_boundary_init[index * 9 + 2] = grad[cell_index * 9 + 2]; + grad_boundary_init[index * 9 + 3] = grad[cell_index * 9 + 3]; + grad_boundary_init[index * 9 + 4] = grad[cell_index * 9 + 4]; + grad_boundary_init[index * 9 + 5] = grad[cell_index * 9 + 5]; + grad_boundary_init[index * 9 + 6] = grad[cell_index * 9 + 6]; + grad_boundary_init[index * 9 + 7] = grad[cell_index * 9 + 7]; + grad_boundary_init[index * 9 + 8] = grad[cell_index * 9 + 8]; + + // if (index == 0) + // { + // printf("grad[0] = (%.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e)\n", grad[index * 9 + 0], grad[index * 9 + 1], grad[index * 9 + 2], + // grad[index * 9 + 3], grad[index * 9 + 4], grad[index * 9 + 5], grad[index * 9 + 6], grad[index * 9 + 7], grad[index * 9 + 8]); + // } +} + +__global__ void correct_boundary_conditions(int num_boundary_cells, const int *bouPermedIndex, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_sf, const double *mag_sf, + double *boundary_grad_init, double *boundary_grad, const double *boundary_deltaCoeffs, + const double *internal_velocity, const double *boundary_velocity, const int *U_patch_type) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + // initialize boundary_grad + double grad_xx = boundary_grad_init[index * 9 + 0]; + double grad_xy = boundary_grad_init[index * 9 + 1]; + double grad_xz = boundary_grad_init[index * 9 + 2]; + double grad_yx = boundary_grad_init[index * 9 + 3]; + double grad_yy = boundary_grad_init[index * 9 + 4]; + double grad_yz = boundary_grad_init[index * 9 + 5]; + double grad_zx = boundary_grad_init[index * 9 + 6]; + double grad_zy = boundary_grad_init[index * 9 + 7]; + double grad_zz = boundary_grad_init[index * 9 + 8]; + + double internal_U_x = internal_velocity[cell_index * 3 + 
0]; + double internal_U_y = internal_velocity[cell_index * 3 + 1]; + double internal_U_z = internal_velocity[cell_index * 3 + 2]; + + for (int i = cell_offset; i < next_cell_offset; i++) + { + // OpenFoam code + // const vectorField n + // ( + // vsf.mesh().Sf().boundaryField()[patchi] + // / vsf.mesh().magSf().boundaryField()[patchi] + // ); + // gGradbf[patchi] += n * + // ( + // vsf.boundaryField()[patchi].snGrad() + // - (n & gGradbf[patchi]) + // ); + // template // fixedValue + // Foam::tmp> Foam::fvPatchField::snGrad() const + // { + // return patch_.deltaCoeffs()*(*this - patchInternalField()); + // } + + double n_x = boundary_sf[i * 3 + 0] / mag_sf[i]; + double n_y = boundary_sf[i * 3 + 1] / mag_sf[i]; + double n_z = boundary_sf[i * 3 + 2] / mag_sf[i]; + + int p = bouPermedIndex[i]; + + double sn_grad_x, sn_grad_y, sn_grad_z; + int patchIndex = U_patch_type[i]; + if (patchIndex == 0) { // zeroGradient + sn_grad_x = 0; + sn_grad_y = 0; + sn_grad_z = 0; + } else if (patchIndex == 1) { // fixedValue + sn_grad_x = boundary_deltaCoeffs[i] * (boundary_velocity[p * 3 + 0] - internal_velocity[cell_index * 3 + 0]); + sn_grad_y = boundary_deltaCoeffs[i] * (boundary_velocity[p * 3 + 1] - internal_velocity[cell_index * 3 + 1]); + sn_grad_z = boundary_deltaCoeffs[i] * (boundary_velocity[p * 3 + 2] - internal_velocity[cell_index * 3 + 2]); + // if (index == 1) + // { + // printf("cell_index = %d\n", cell_index); + // printf("boundary_velocity = %e\n", boundary_velocity[i * 3 + 1]); + // printf("internal_velocity = %e\n", internal_velocity[cell_index * 3 + 0]); + // } + + } + // TODO: implement other BCs + double grad_correction_x = sn_grad_x - (n_x * grad_xx + n_y * grad_yx + n_z * grad_zx); + double grad_correction_y = sn_grad_y - (n_x * grad_xy + n_y * grad_yy + n_z * grad_zy); + double grad_correction_z = sn_grad_z - (n_x * grad_xz + n_y * grad_yz + n_z * grad_zz); + boundary_grad[i * 9 + 0] = grad_xx + n_x * grad_correction_x; + boundary_grad[i * 9 + 1] = grad_xy + 
n_x * grad_correction_y; + boundary_grad[i * 9 + 2] = grad_xz + n_x * grad_correction_z; + boundary_grad[i * 9 + 3] = grad_yx + n_y * grad_correction_x; + boundary_grad[i * 9 + 4] = grad_yy + n_y * grad_correction_y; + boundary_grad[i * 9 + 5] = grad_yz + n_y * grad_correction_z; + boundary_grad[i * 9 + 6] = grad_zx + n_z * grad_correction_x; + boundary_grad[i * 9 + 7] = grad_zy + n_z * grad_correction_y; + boundary_grad[i * 9 + 8] = grad_zz + n_z * grad_correction_z; + + } +} + +__global__ void fvc_grad_scalar_internal(int num_cells, + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *face_vector, const double *weight, const double *pressure, const double *volume, + const double *b_input, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int next_row_index = csr_row_index[index + 1]; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double own_cell_p = pressure[index]; + double grad_bx = 0; + double grad_by = 0; + double grad_bz = 0; + double grad_bx_low = 0; + double grad_bx_upp = 0; + double grad_by_low = 0; + double grad_by_upp = 0; + double grad_bz_low = 0; + double grad_bz_upp = 0; + for (int i = row_index; i < next_row_index; i++) + { + int inner_index = i - row_index; + // lower + if (inner_index < diag_index) + { + int neighbor_index = neighbor_offset + inner_index; + double w = weight[neighbor_index]; + double sfx = face_vector[neighbor_index * 3 + 0]; + double sfy = face_vector[neighbor_index * 3 + 1]; + double sfz = face_vector[neighbor_index * 3 + 2]; + int neighbor_cell_id = csr_col_index[row_index + inner_index]; + double neighbor_cell_p = pressure[neighbor_cell_id]; + double face_p = (1 - w) * own_cell_p + w * neighbor_cell_p; + grad_bx_low -= face_p * sfx; + grad_by_low -= face_p * sfy; + grad_bz_low 
-= face_p * sfz; + } + // upper + if (inner_index > diag_index) + { + int neighbor_index = neighbor_offset + inner_index - 1; + double w = weight[neighbor_index]; + double sfx = face_vector[neighbor_index * 3 + 0]; + double sfy = face_vector[neighbor_index * 3 + 1]; + double sfz = face_vector[neighbor_index * 3 + 2]; + int neighbor_cell_id = csr_col_index[row_index + inner_index]; + double neighbor_cell_p = pressure[neighbor_cell_id]; + double face_p = w * own_cell_p + (1 - w) * neighbor_cell_p; + grad_bx_upp += face_p * sfx; + grad_by_upp += face_p * sfy; + grad_bz_upp += face_p * sfz; + } + } + double vol = volume[index]; + b_output[index * 3 + 0] = b_input[index * 3 + 0] + (grad_bx_low + grad_bx_upp) / vol; + b_output[index * 3 + 1] = b_input[index * 3 + 1] + (grad_by_low + grad_by_upp) / vol; + b_output[index * 3 + 2] = b_input[index * 3 + 2] + (grad_bz_low + grad_bz_upp) / vol; + // b_output[index * 3 + 0] = b_input[index * 3 + 0] + grad_bx_low + grad_bx_upp; + // b_output[index * 3 + 1] = b_input[index * 3 + 1] + grad_by_low + grad_by_upp; + // b_output[index * 3 + 2] = b_input[index * 3 + 2] + grad_bz_low + grad_bz_upp; + +} + +__global__ void fvc_grad_scalar_boundary(int num_cells, int num_boundary_cells, const int *bouPermedIndex, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_face_vector, const double *boundary_pressure, const double *volume, + const double *b_input, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + // compute boundary gradient + double grad_bx = 0; + double grad_by = 0; + double grad_bz = 0; + for (int i = cell_offset; i < next_cell_offset; i++) + { + int p = bouPermedIndex[i]; + double sfx = boundary_face_vector[i * 3 + 0]; + double sfy = boundary_face_vector[i * 3 + 
1]; + double sfz = boundary_face_vector[i * 3 + 2]; + double face_p = boundary_pressure[p]; + grad_bx += face_p * sfx; + grad_by += face_p * sfy; + grad_bz += face_p * sfz; + } + + //// correct the boundary gradient + // double nx = boundary_face_vector[face_index * 3 + 0] / magSf[face_index]; + // double ny = boundary_face_vector[face_index * 3 + 1] / magSf[face_index]; + // double nz = boundary_face_vector[face_index * 3 + 2] / magSf[face_index]; + // double sn_grad = 0; + // double grad_correction = sn_grad * volume[cell_index] - (nx * grad_bx + ny * grad_by + nz * grad_bz); + // grad_bx += nx * grad_correction; + // grad_by += ny * grad_correction; + // grad_bz += nz * grad_correction; + + double vol = volume[cell_index]; + b_output[cell_index * 3 + 0] = b_input[cell_index * 3 + 0] + grad_bx / vol; + b_output[cell_index * 3 + 1] = b_input[cell_index * 3 + 1] + grad_by / vol; + b_output[cell_index * 3 + 2] = b_input[cell_index * 3 + 2] + grad_bz / vol; +} + + +void fvc_grad_vector_orig(cudaStream_t stream, dfMatrixDataBaseOrig* dataBaseOrig, dfMatrixDataBase& dataBase, double *d_grad, + double *d_grad_boundary_init, double *d_grad_boundary) +{ + float time_elapsed = 0; + cudaEvent_t start, stop; + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + checkCudaErrors(cudaEventRecord(start, 0)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (dataBase.num_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_vector_internal<<>>(dataBase.num_cells, + dataBaseOrig->d_A_csr_row_index, dataBaseOrig->d_A_csr_col_index, dataBaseOrig->d_A_csr_diag_index, + dataBaseOrig->d_face_vector, dataBase.d_u, dataBaseOrig->d_weight, dataBaseOrig->d_volume, d_grad); + + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("\nfvc_grad_vector_orig internal 
执行时间:%f(ms)\n", time_elapsed); + + + checkCudaErrors(cudaEventRecord(start, 0)); + blocks_per_grid = (dataBaseOrig->num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_vector_boundary<<>>(dataBase.num_cells, dataBaseOrig->num_boundary_cells, dataBaseOrig->d_bouPermedIndex, + dataBaseOrig->d_boundary_cell_offset, dataBaseOrig->d_boundary_cell_id, dataBaseOrig->d_boundary_face_vector, + dataBase.d_boundary_u, dataBase.d_volume, d_grad, d_grad_boundary_init); + + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("fvc_grad_vector_orig boundary1 执行时间:%f(ms)\n", time_elapsed); + + + checkCudaErrors(cudaEventRecord(start, 0)); + correct_boundary_conditions<<>>(dataBaseOrig->num_boundary_cells, dataBaseOrig->d_bouPermedIndex, + dataBaseOrig->d_boundary_cell_offset, dataBaseOrig->d_boundary_cell_id, dataBaseOrig->d_boundary_face_vector, + dataBaseOrig->d_boundary_face, d_grad_boundary_init, d_grad_boundary, dataBaseOrig->d_boundary_deltaCoeffs, + dataBase.d_u, dataBase.d_boundary_u, dataBaseOrig->d_boundary_UpatchType); + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("fvc_grad_vector_orig boundary2 执行时间:%f(ms)\n", time_elapsed); +} + +void fvc_grad_scalar_orig(cudaStream_t stream, dfMatrixDataBaseOrig* dataBaseOrig, dfMatrixDataBase& dataBase, double *d_grad) +{ + float time_elapsed = 0; + cudaEvent_t start, stop; + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + checkCudaErrors(cudaEventRecord(start, 0)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (dataBase.num_cells + threads_per_block - 1) / threads_per_block; + 
fvc_grad_scalar_internal<<>>(dataBase.num_cells, + dataBaseOrig->d_A_csr_row_index, dataBaseOrig->d_A_csr_col_index, dataBaseOrig->d_A_csr_diag_index, + dataBaseOrig->d_face_vector, dataBaseOrig->d_weight, dataBase.d_p, dataBaseOrig->d_volume, d_grad, d_grad); + + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("\nfvc_grad_scalar_orig internal 执行时间:%f(ms)\n", time_elapsed); + + checkCudaErrors(cudaEventRecord(start, 0)); + + blocks_per_grid = (dataBaseOrig->num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_scalar_boundary<<>>(dataBase.num_cells, dataBaseOrig->num_boundary_cells, dataBaseOrig->d_bouPermedIndex, + dataBaseOrig->d_boundary_cell_offset, dataBaseOrig->d_boundary_cell_id, + dataBaseOrig->d_boundary_face_vector, dataBase.d_boundary_p, dataBaseOrig->d_volume, d_grad, d_grad); + + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("fvc_grad_scalar_orig boundary 执行时间:%f(ms)\n", time_elapsed); +} \ No newline at end of file diff --git a/src_gpu/dfRhoEqn.H b/src_gpu_orig/dfRhoEqn.H similarity index 100% rename from src_gpu/dfRhoEqn.H rename to src_gpu_orig/dfRhoEqn.H diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu_orig/dfRhoEqn.cu similarity index 100% rename from src_gpu/dfRhoEqn.cu rename to src_gpu_orig/dfRhoEqn.cu diff --git a/src_gpu_orig/dfUEqn.H b/src_gpu_orig/dfUEqn.H new file mode 100644 index 000000000..ec739db5e --- /dev/null +++ b/src_gpu_orig/dfUEqn.H @@ -0,0 +1,62 @@ +#pragma once + +#include "AmgXSolver.H" +#include +#include "dfMatrixDataBase.H" + +class dfUEqn +{ +private: + dfMatrixDataBase &dataBase_; + cudaStream_t stream; + AmgXSolver *UxSolver, *UySolver, *UzSolver = nullptr; + int 
num_iteration; + + // common variables + int num_cells, cell_bytes, num_faces, num_surfaces, cell_vec_bytes, csr_value_vec_bytes, num_boundary_cells; + int *d_A_csr_row_index, *d_A_csr_diag_index, *d_A_csr_col_index; + + // Matrix variables + double *d_A_csr, *d_b, *d_psi, *d_psi_permute, *d_H, *d_H_permute, *d_A; + double *h_A_csr, *h_b, *h_psi, *h_H, *h_A = nullptr; + + double *d_ueqn_internal_coeffs, *d_ueqn_boundary_coeffs= nullptr; + +public: + dfUEqn(); + dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile); + ~dfUEqn(); + + void checkValue(bool print); + + void fvm_ddt(double *vector_old); + + void fvm_div(double *boundary_pressure_init, double *boundary_velocity_init, + double *boundary_nuEff_init, double *boundary_rho_init); + + void fvc_grad(double *pressure); + + void fvc_grad_vector(); + + void dev2T(); + + void fvc_div_tensor(const double *nuEff); + + void fvm_laplacian(); + + void A(double *Psi); + + void H(double *Psi); + + void solve(); + + void sync(); + + void updatePsi(double *Psi); + + void correctBoundaryConditions(); + + void correctPsi(double *Psi); + + void initializeTimeStep(); +}; diff --git a/src_gpu_orig/dfUEqn.cu b/src_gpu_orig/dfUEqn.cu new file mode 100644 index 000000000..56983e038 --- /dev/null +++ b/src_gpu_orig/dfUEqn.cu @@ -0,0 +1,1481 @@ +#include "dfUEqn.H" + +// kernel functions +__global__ void fvm_ddt_kernel(int num_cells, int num_faces, const double rdelta_t, + const int *csr_row_index, const int *csr_diag_index, + const double *rho_old, const double *rho_new, const double *volume, const double *velocity_old, + const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output, double *psi) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int diag_index = csr_diag_index[index]; + + int csr_dim = num_cells + num_faces; + int 
csr_index = row_index + diag_index; + double ddt_diag = rdelta_t * rho_new[index] * volume[index]; + A_csr_output[csr_dim * 0 + csr_index] = A_csr_input[csr_dim * 0 + csr_index] + ddt_diag; + A_csr_output[csr_dim * 1 + csr_index] = A_csr_input[csr_dim * 1 + csr_index] + ddt_diag; + A_csr_output[csr_dim * 2 + csr_index] = A_csr_input[csr_dim * 2 + csr_index] + ddt_diag; + + double ddt_part_term = rdelta_t * rho_old[index] * volume[index]; + b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + ddt_part_term * velocity_old[index * 3 + 0]; + b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + ddt_part_term * velocity_old[index * 3 + 1]; + b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + ddt_part_term * velocity_old[index * 3 + 2]; + + psi[num_cells * 0 + index] = velocity_old[index * 3 + 0]; + psi[num_cells * 1 + index] = velocity_old[index * 3 + 1]; + psi[num_cells * 2 + index] = velocity_old[index * 3 + 2]; +} + +__global__ void fvm_div_internal(int num_cells, int num_faces, + const int *csr_row_index, const int *csr_diag_index, + const double *weight, const double *phi, + const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int next_row_index = csr_row_index[index + 1]; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + int csr_dim = num_cells + num_faces; + + double div_diag = 0; + for (int i = row_index; i < next_row_index; i++) + { + int inner_index = i - row_index; + // lower + if (inner_index < diag_index) + { + int neighbor_index = neighbor_offset + inner_index; + double w = weight[neighbor_index]; + double f = phi[neighbor_index]; + A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + (-w) * f; + A_csr_output[csr_dim * 1 + i] = 
A_csr_input[csr_dim * 1 + i] + (-w) * f; + A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + (-w) * f; + // lower neighbors contribute to sum of -1 + div_diag += (w - 1) * f; + } + // upper + if (inner_index > diag_index) + { + // upper, index - 1, consider of diag + int neighbor_index = neighbor_offset + inner_index - 1; + double w = weight[neighbor_index]; + double f = phi[neighbor_index]; + A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + (1 - w) * f; + A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + (1 - w) * f; + A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + (1 - w) * f; + // upper neighbors contribute to sum of 1 + div_diag += w * f; + } + } + A_csr_output[csr_dim * 0 + row_index + diag_index] = A_csr_input[csr_dim * 0 + row_index + diag_index] + div_diag; // diag + A_csr_output[csr_dim * 1 + row_index + diag_index] = A_csr_input[csr_dim * 1 + row_index + diag_index] + div_diag; // diag + A_csr_output[csr_dim * 2 + row_index + diag_index] = A_csr_input[csr_dim * 2 + row_index + diag_index] + div_diag; // diag +} + +__global__ void fvm_div_boundary(int num_cells, int num_faces, int num_boundary_cells, + const int *csr_row_index, const int *csr_diag_index, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *internal_coeffs, const double *boundary_coeffs, + const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output, + double *ueqn_internal_coeffs, double *ueqn_boundary_coeffs) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int cell_index = boundary_cell_id[cell_offset]; + int loop_size = boundary_cell_offset[index + 1] - cell_offset; + + int row_index = csr_row_index[cell_index]; + int diag_index = csr_diag_index[cell_index]; + int csr_dim = num_cells + num_faces; + int csr_index = row_index + diag_index; + + // construct internalCoeffs & 
boundaryCoeffs + double internal_coeffs_x = 0; + double internal_coeffs_y = 0; + double internal_coeffs_z = 0; + double boundary_coeffs_x = 0; + double boundary_coeffs_y = 0; + double boundary_coeffs_z = 0; + for (int i = 0; i < loop_size; i++) + { + internal_coeffs_x += internal_coeffs[(cell_offset + i) * 3 + 0]; + internal_coeffs_y += internal_coeffs[(cell_offset + i) * 3 + 1]; + internal_coeffs_z += internal_coeffs[(cell_offset + i) * 3 + 2]; + boundary_coeffs_x += boundary_coeffs[(cell_offset + i) * 3 + 0]; + boundary_coeffs_y += boundary_coeffs[(cell_offset + i) * 3 + 1]; + boundary_coeffs_z += boundary_coeffs[(cell_offset + i) * 3 + 2]; + } + ueqn_internal_coeffs[cell_index * 3 + 0] = internal_coeffs_x; + ueqn_internal_coeffs[cell_index * 3 + 1] = internal_coeffs_y; + ueqn_internal_coeffs[cell_index * 3 + 2] = internal_coeffs_z; + ueqn_boundary_coeffs[cell_index * 3 + 0] = boundary_coeffs_x; + ueqn_boundary_coeffs[cell_index * 3 + 1] = boundary_coeffs_y; + ueqn_boundary_coeffs[cell_index * 3 + 2] = boundary_coeffs_z; + + A_csr_output[csr_dim * 0 + csr_index] = A_csr_input[csr_dim * 0 + csr_index] + internal_coeffs_x; + A_csr_output[csr_dim * 1 + csr_index] = A_csr_input[csr_dim * 1 + csr_index] + internal_coeffs_y; + A_csr_output[csr_dim * 2 + csr_index] = A_csr_input[csr_dim * 2 + csr_index] + internal_coeffs_z; + b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] + boundary_coeffs_x; + b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] + boundary_coeffs_y; + b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + boundary_coeffs_z; +} + +__global__ void fvc_grad_internal_face(int num_cells, + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *face_vector, const double *weight, const double *pressure, + const double *b_input, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr 
has one more element in each row: itself + int row_index = csr_row_index[index]; + int next_row_index = csr_row_index[index + 1]; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double own_cell_p = pressure[index]; + double grad_bx = 0; + double grad_by = 0; + double grad_bz = 0; + double grad_bx_low = 0; + double grad_bx_upp = 0; + double grad_by_low = 0; + double grad_by_upp = 0; + double grad_bz_low = 0; + double grad_bz_upp = 0; + for (int i = row_index; i < next_row_index; i++) + { + int inner_index = i - row_index; + // lower + if (inner_index < diag_index) + { + int neighbor_index = neighbor_offset + inner_index; + double w = weight[neighbor_index]; + double sfx = face_vector[neighbor_index * 3 + 0]; + double sfy = face_vector[neighbor_index * 3 + 1]; + double sfz = face_vector[neighbor_index * 3 + 2]; + int neighbor_cell_id = csr_col_index[row_index + inner_index]; + double neighbor_cell_p = pressure[neighbor_cell_id]; + double face_p = (1 - w) * own_cell_p + w * neighbor_cell_p; + grad_bx_low -= face_p * sfx; + grad_by_low -= face_p * sfy; + grad_bz_low -= face_p * sfz; + } + // upper + if (inner_index > diag_index) + { + int neighbor_index = neighbor_offset + inner_index - 1; + double w = weight[neighbor_index]; + double sfx = face_vector[neighbor_index * 3 + 0]; + double sfy = face_vector[neighbor_index * 3 + 1]; + double sfz = face_vector[neighbor_index * 3 + 2]; + int neighbor_cell_id = csr_col_index[row_index + inner_index]; + double neighbor_cell_p = pressure[neighbor_cell_id]; + double face_p = w * own_cell_p + (1 - w) * neighbor_cell_p; + grad_bx_upp += face_p * sfx; + grad_by_upp += face_p * sfy; + grad_bz_upp += face_p * sfz; + } + } + b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] - grad_bx_low - grad_bx_upp; + b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] - grad_by_low - grad_by_upp; + b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] - 
grad_bz_low - grad_bz_upp; +} + +__global__ void fvc_grad_boundary_face(int num_cells, int num_boundary_cells, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_face_vector, const double *boundary_pressure, + const double *b_input, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + // compute boundary gradient + double grad_bx = 0; + double grad_by = 0; + double grad_bz = 0; + for (int i = cell_offset; i < next_cell_offset; i++) + { + double sfx = boundary_face_vector[i * 3 + 0]; + double sfy = boundary_face_vector[i * 3 + 1]; + double sfz = boundary_face_vector[i * 3 + 2]; + double face_p = boundary_pressure[i]; + grad_bx += face_p * sfx; + grad_by += face_p * sfy; + grad_bz += face_p * sfz; + } + + //// correct the boundary gradient + // double nx = boundary_face_vector[face_index * 3 + 0] / magSf[face_index]; + // double ny = boundary_face_vector[face_index * 3 + 1] / magSf[face_index]; + // double nz = boundary_face_vector[face_index * 3 + 2] / magSf[face_index]; + // double sn_grad = 0; + // double grad_correction = sn_grad * volume[cell_index] - (nx * grad_bx + ny * grad_by + nz * grad_bz); + // grad_bx += nx * grad_correction; + // grad_by += ny * grad_correction; + // grad_bz += nz * grad_correction; + + b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] - grad_bx; + b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] - grad_by; + b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] - grad_bz; +} + +__global__ void add_fvMatrix_kernel(int num_cells, int num_faces, + const int *csr_row_index, + const double *turbSrc_A, const double *turbSrc_b, + const double *A_csr_input, const double *b_input, double *A_csr_output, double 
*b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + int row_index = csr_row_index[index]; + int next_row_index = csr_row_index[index + 1]; + int csr_dim = num_cells + num_faces; + double A_entry; + + for (int i = row_index; i < next_row_index; i++) + { + A_entry = turbSrc_A[i]; + A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + A_entry; + A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + A_entry; + A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + A_entry; + } + b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + turbSrc_b[index * 3 + 0]; + b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + turbSrc_b[index * 3 + 1]; + b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + turbSrc_b[index * 3 + 2]; +} + +__global__ void offdiagPermutation(const int num_faces, const int *permedIndex, + const double *d_phi_init, double *d_phi) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_faces) + return; + + int p = permedIndex[index]; + d_phi[index] = d_phi_init[p]; +} + +__global__ void boundaryPermutation(const int num_boundary_faces, const int *bouPermedIndex, + const double *boundary_pressure_init, const double *boundary_velocity_init, + double *boundary_pressure, double *boundary_velocity, + double *boundary_nuEff_init, double *boundary_nuEff, + double *boundary_rho_init, double *boundary_rho) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_faces) + return; + + int p = bouPermedIndex[index]; + boundary_velocity[3 * index] = boundary_velocity_init[3 * p]; + boundary_velocity[3 * index + 1] = boundary_velocity_init[3 * p + 1]; + boundary_velocity[3 * index + 2] = boundary_velocity_init[3 * p + 2]; + boundary_pressure[index] = boundary_pressure_init[p]; + boundary_rho[index] = boundary_rho_init[p]; + boundary_nuEff[index] = boundary_nuEff_init[p]; +} + +__global__ void 
fvc_grad_vector_internal(int num_cells,
        const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
        const double *sf, const double *vf, const double *tlambdas, const double *volume,
        double *grad)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    // A_csr has one more element in each row: itself
    int row_index = csr_row_index[index];
    int row_elements = csr_row_index[index + 1] - row_index;
    int diag_index = csr_diag_index[index];
    // face-indexed arrays (sf, tlambdas) skip the per-row diagonal entry,
    // hence the "- index" when mapping CSR positions to face positions
    int neighbor_offset = csr_row_index[index] - index;

    double own_vf_x = vf[index * 3 + 0];
    double own_vf_y = vf[index * 3 + 1];
    double own_vf_z = vf[index * 3 + 2];
    double grad_xx = 0;
    double grad_xy = 0;
    double grad_xz = 0;
    double grad_yx = 0;
    double grad_yy = 0;
    double grad_yz = 0;
    double grad_zx = 0;
    double grad_zy = 0;
    double grad_zz = 0;
    // lower: faces where this cell is the neighbour, so the face value is
    // interpolated with weights swapped and Sf points away (contribution
    // subtracted)
    for (int i = 0; i < diag_index; i++)
    {
        int neighbor_index = neighbor_offset + i;
        int neighbor_cell_id = csr_col_index[row_index + i];
        double w = tlambdas[neighbor_index];
        double sf_x = sf[neighbor_index * 3 + 0];
        double sf_y = sf[neighbor_index * 3 + 1];
        double sf_z = sf[neighbor_index * 3 + 2];
        double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0];
        double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1];
        double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2];
        double face_x = (1 - w) * own_vf_x + w * neighbor_vf_x;
        double face_y = (1 - w) * own_vf_y + w * neighbor_vf_y;
        double face_z = (1 - w) * own_vf_z + w * neighbor_vf_z;
        grad_xx -= sf_x * face_x;
        grad_xy -= sf_x * face_y;
        grad_xz -= sf_x * face_z;
        grad_yx -= sf_y * face_x;
        grad_yy -= sf_y * face_y;
        grad_yz -= sf_y * face_z;
        grad_zx -= sf_z * face_x;
        grad_zy -= sf_z * face_y;
        grad_zz -= sf_z * face_z;
    }
    // upper: faces owned by this cell (contribution added); the extra "- 1"
    // accounts for the diagonal entry sitting between lower and upper parts
    for (int i = diag_index + 1; i < row_elements; i++)
    {
        int neighbor_index = neighbor_offset + i - 1;
        int neighbor_cell_id = csr_col_index[row_index + i];
        double w = tlambdas[neighbor_index];
        double sf_x = sf[neighbor_index * 3 + 0];
        double sf_y = sf[neighbor_index * 3 + 1];
        double sf_z = sf[neighbor_index * 3 + 2];
        double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0];
        double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1];
        double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2];
        double face_x = w * own_vf_x + (1 - w) * neighbor_vf_x;
        double face_y = w * own_vf_y + (1 - w) * neighbor_vf_y;
        double face_z = w * own_vf_z + (1 - w) * neighbor_vf_z;
        grad_xx += sf_x * face_x;
        grad_xy += sf_x * face_y;
        grad_xz += sf_x * face_z;
        grad_yx += sf_y * face_x;
        grad_yy += sf_y * face_y;
        grad_yz += sf_y * face_z;
        grad_zx += sf_z * face_x;
        grad_zy += sf_z * face_y;
        grad_zz += sf_z * face_z;
        // if (index == 0)
        // {
        //     printf("grad_xx = %.20lf\n", grad_xx);
        //     // printf("sf_x = %.20lf\n", sf_x);
        //     // printf("face_x = %.20lf\n", face_x);
        // }
    }
    // divide the surface integral by the cell volume (Gauss theorem)
    double vol = volume[index];
    grad[index * 9 + 0] = grad_xx / vol;
    grad[index * 9 + 1] = grad_xy / vol;
    grad[index * 9 + 2] = grad_xz / vol;
    grad[index * 9 + 3] = grad_yx / vol;
    grad[index * 9 + 4] = grad_yy / vol;
    grad[index * 9 + 5] = grad_yz / vol;
    grad[index * 9 + 6] = grad_zx / vol;
    grad[index * 9 + 7] = grad_zy / vol;
    grad[index * 9 + 8] = grad_zz / vol;
}

// One thread per boundary cell: add the boundary-face part of grad(U) to the
// cell tensor, then snapshot the completed tensor into grad_boundary_init
// (indexed by boundary-cell position) for the later boundary correction pass.
__global__ void fvc_grad_vector_boundary(int num_cells, int num_boundary_cells,
        const int *boundary_cell_offset, const int *boundary_cell_id,
        const double *boundary_sf, const double *boundary_vf, const double *volume,
        double *grad, double *grad_boundary_init)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_boundary_cells)
        return;

    int cell_offset = boundary_cell_offset[index];
    int next_cell_offset = boundary_cell_offset[index + 1];
    int cell_index = boundary_cell_id[cell_offset];

    double grad_xx = 0;
    double grad_xy = 0;
    double grad_xz = 0;
    double grad_yx = 0;
    double grad_yy = 0;
    double grad_yz = 0;
    double grad_zx = 0;
    double grad_zy = 0;
    double grad_zz = 0;
    for (int i = cell_offset; i < next_cell_offset; i++)
    {
        double sf_x = boundary_sf[i * 3 + 0];
        double sf_y = boundary_sf[i * 3 + 1];
        double sf_z = boundary_sf[i * 3 + 2];
        double vf_x = boundary_vf[i * 3 + 0];
        double vf_y = boundary_vf[i * 3 + 1];
        double vf_z = boundary_vf[i * 3 + 2];
        grad_xx += sf_x * vf_x;
        grad_xy += sf_x * vf_y;
        grad_xz += sf_x * vf_z;
        grad_yx += sf_y * vf_x;
        grad_yy += sf_y * vf_y;
        grad_yz += sf_y * vf_z;
        grad_zx += sf_z * vf_x;
        grad_zy += sf_z * vf_y;
        grad_zz += sf_z * vf_z;
    }

    double vol = volume[cell_index];

    // accumulate onto the internal-face result; each boundary cell appears
    // once in this kernel, so the read-modify-write is race-free
    grad[cell_index * 9 + 0] += grad_xx / vol;
    grad[cell_index * 9 + 1] += grad_xy / vol;
    grad[cell_index * 9 + 2] += grad_xz / vol;
    grad[cell_index * 9 + 3] += grad_yx / vol;
    grad[cell_index * 9 + 4] += grad_yy / vol;
    grad[cell_index * 9 + 5] += grad_yz / vol;
    grad[cell_index * 9 + 6] += grad_zx / vol;
    grad[cell_index * 9 + 7] += grad_zy / vol;
    grad[cell_index * 9 + 8] += grad_zz / vol;

    grad_boundary_init[index * 9 + 0] = grad[cell_index * 9 + 0];
    grad_boundary_init[index * 9 + 1] = grad[cell_index * 9 + 1];
    grad_boundary_init[index * 9 + 2] = grad[cell_index * 9 + 2];
    grad_boundary_init[index * 9 + 3] = grad[cell_index * 9 + 3];
    grad_boundary_init[index * 9 + 4] = grad[cell_index * 9 + 4];
    grad_boundary_init[index * 9 + 5] = grad[cell_index * 9 + 5];
    grad_boundary_init[index * 9 + 6] = grad[cell_index * 9 + 6];
    grad_boundary_init[index * 9 + 7] = grad[cell_index * 9 + 7];
    grad_boundary_init[index * 9 + 8] = grad[cell_index * 9 + 8];
    // if (index == 1)
    // {
    //     printf("grad[1] = (%lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf)\n", grad[index * 9 + 0], grad[index * 9 + 1], grad[index * 9 + 2],
    //            grad[index * 9 + 3], grad[index * 9 + 4], grad[index * 9 + 5], grad[index * 9 + 6], grad[index * 9 + 7], grad[index * 9 + 8]);
    // }
}

// Correct the boundary-patch grad(U) tensors with the normal component of the
// surface-normal gradient implied by each patch's boundary condition.
__global__ void correct_boundary_conditions(int num_boundary_cells,
        const int
*boundary_cell_offset, const int *boundary_cell_id,
        const double *boundary_sf, const double *mag_sf,
        double *boundary_grad_init, double *boundary_grad, const double *boundary_deltaCoeffs,
        const double *internal_velocity, const double *boundary_velocity, const int *U_patch_type)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_boundary_cells)
        return;

    int cell_offset = boundary_cell_offset[index];
    int next_cell_offset = boundary_cell_offset[index + 1];
    int cell_index = boundary_cell_id[cell_offset];

    // initialize boundary_grad from the uncorrected per-boundary-cell tensor
    double grad_xx = boundary_grad_init[index * 9 + 0];
    double grad_xy = boundary_grad_init[index * 9 + 1];
    double grad_xz = boundary_grad_init[index * 9 + 2];
    double grad_yx = boundary_grad_init[index * 9 + 3];
    double grad_yy = boundary_grad_init[index * 9 + 4];
    double grad_yz = boundary_grad_init[index * 9 + 5];
    double grad_zx = boundary_grad_init[index * 9 + 6];
    double grad_zy = boundary_grad_init[index * 9 + 7];
    double grad_zz = boundary_grad_init[index * 9 + 8];

    // owner-cell velocity (AoS layout for internal_velocity here)
    double internal_U_x = internal_velocity[cell_index * 3 + 0];
    double internal_U_y = internal_velocity[cell_index * 3 + 1];
    double internal_U_z = internal_velocity[cell_index * 3 + 2];

    for (int i = cell_offset; i < next_cell_offset; i++)
    {
        // OpenFOAM reference:
        //   const vectorField n(Sf.boundaryField()[patchi] / magSf.boundaryField()[patchi]);
        //   gGradbf[patchi] += n * (vsf.boundaryField()[patchi].snGrad() - (n & gGradbf[patchi]));
        // with fixedValue snGrad() = deltaCoeffs * (*this - patchInternalField())

        double n_x = boundary_sf[i * 3 + 0] / mag_sf[i];
        double n_y = boundary_sf[i * 3 + 1] / mag_sf[i];
        double n_z = boundary_sf[i * 3 + 2] / mag_sf[i];

        // BUG FIX: previously left uninitialized when the patch type was
        // neither 0 nor 1 (e.g. type 2 "empty" handled by sibling kernels),
        // which fed garbage into the correction. Zero is the zeroGradient
        // snGrad and a safe default for unimplemented BCs.
        double sn_grad_x = 0;
        double sn_grad_y = 0;
        double sn_grad_z = 0;
        int patchIndex = U_patch_type[i];
        if (patchIndex == 0) { // zeroGradient: snGrad stays zero
        } else if (patchIndex == 1) { // fixedValue
            sn_grad_x = boundary_deltaCoeffs[i] * (boundary_velocity[i * 3 + 0] - internal_U_x);
            sn_grad_y = boundary_deltaCoeffs[i] * (boundary_velocity[i * 3 + 1] - internal_U_y);
            sn_grad_z = boundary_deltaCoeffs[i] * (boundary_velocity[i * 3 + 2] - internal_U_z);
        }
        // TODO: implement other BCs
        double grad_correction_x = sn_grad_x - (n_x * grad_xx + n_y * grad_yx + n_z * grad_zx);
        double grad_correction_y = sn_grad_y - (n_x * grad_xy + n_y * grad_yy + n_z * grad_zy);
        double grad_correction_z = sn_grad_z - (n_x * grad_xz + n_y * grad_yz + n_z * grad_zz);
        boundary_grad[i * 9 + 0] = grad_xx + n_x * grad_correction_x;
        boundary_grad[i * 9 + 1] = grad_xy + n_x * grad_correction_y;
        boundary_grad[i * 9 + 2] = grad_xz + n_x * grad_correction_z;
        boundary_grad[i * 9 + 3] = grad_yx + n_y * grad_correction_x;
        boundary_grad[i * 9 + 4] = grad_yy + n_y * grad_correction_y;
        boundary_grad[i * 9 + 5] = grad_yz + n_y * grad_correction_z;
        boundary_grad[i * 9 + 6] = grad_zx + n_z * grad_correction_x;
        boundary_grad[i * 9 + 7] = grad_zy + n_z * grad_correction_y;
        boundary_grad[i * 9 + 8] = grad_zz + n_z * grad_correction_z;
    }
}

// In-place dev2 of the transpose: T := T^T - (2/3) tr(T) I, one tensor/thread.
__global__ void dev2_t_tensor(int num, double *tensor)
{
    int
index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    double t_xx = tensor[index * 9 + 0];
    double t_xy = tensor[index * 9 + 1];
    double t_xz = tensor[index * 9 + 2];
    double t_yx = tensor[index * 9 + 3];
    double t_yy = tensor[index * 9 + 4];
    double t_yz = tensor[index * 9 + 5];
    double t_zx = tensor[index * 9 + 6];
    double t_zy = tensor[index * 9 + 7];
    double t_zz = tensor[index * 9 + 8];
    double trace_coeff = (2. / 3.) * (t_xx + t_yy + t_zz);
    // store the transpose minus (2/3)*trace on the diagonal
    tensor[index * 9 + 0] = t_xx - trace_coeff;
    tensor[index * 9 + 1] = t_yx;
    tensor[index * 9 + 2] = t_zx;
    tensor[index * 9 + 3] = t_xy;
    tensor[index * 9 + 4] = t_yy - trace_coeff;
    tensor[index * 9 + 5] = t_zy;
    tensor[index * 9 + 6] = t_xz;
    tensor[index * 9 + 7] = t_yz;
    tensor[index * 9 + 8] = t_zz - trace_coeff;
}

// Explicit divergence of (scalar0*scalar1)*vf (vf: tensor per cell, AoS 9
// doubles) over internal faces, accumulated into the SoA source vector b with
// the given sign. Boundary faces are handled by fvc_div_tensor_boundary.
__global__ void fvc_div_tensor_internal(int num_cells,
        const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
        const double *scalar0, const double *scalar1,
        const double *sf, const double *vf, const double *tlambdas, const double *volume,
        const double sign, const double *b_input, double *b_output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    // A_csr has one more element in each row: itself
    int row_index = csr_row_index[index];
    int row_elements = csr_row_index[index + 1] - row_index;
    int diag_index = csr_diag_index[index];
    // face arrays skip the diagonal entries, hence "- index"
    int neighbor_offset = csr_row_index[index] - index;

    double coeff_own = scalar0[index] * scalar1[index];

    double own_vf_xx = vf[index * 9 + 0];
    double own_vf_xy = vf[index * 9 + 1];
    double own_vf_xz = vf[index * 9 + 2];
    double own_vf_yx = vf[index * 9 + 3];
    double own_vf_yy = vf[index * 9 + 4];
    double own_vf_yz = vf[index * 9 + 5];
    double own_vf_zx = vf[index * 9 + 6];
    double own_vf_zy = vf[index * 9 + 7];
    double own_vf_zz = vf[index * 9 + 8];
    double sum_x = 0;
    double sum_y = 0;
    double sum_z = 0;

    // lower: this cell is the neighbour; interpolation weights swapped,
    // contribution subtracted
    for (int i = 0; i < diag_index; i++)
    {
        int neighbor_index = neighbor_offset + i;
        int neighbor_cell_id = csr_col_index[row_index + i];
        double coeff_nei = scalar0[neighbor_cell_id] * scalar1[neighbor_cell_id];
        double w = tlambdas[neighbor_index];
        double sf_x = sf[neighbor_index * 3 + 0];
        double sf_y = sf[neighbor_index * 3 + 1];
        double sf_z = sf[neighbor_index * 3 + 2];
        double neighbor_vf_xx = vf[neighbor_cell_id * 9 + 0];
        double neighbor_vf_xy = vf[neighbor_cell_id * 9 + 1];
        double neighbor_vf_xz = vf[neighbor_cell_id * 9 + 2];
        double neighbor_vf_yx = vf[neighbor_cell_id * 9 + 3];
        double neighbor_vf_yy = vf[neighbor_cell_id * 9 + 4];
        double neighbor_vf_yz = vf[neighbor_cell_id * 9 + 5];
        double neighbor_vf_zx = vf[neighbor_cell_id * 9 + 6];
        double neighbor_vf_zy = vf[neighbor_cell_id * 9 + 7];
        double neighbor_vf_zz = vf[neighbor_cell_id * 9 + 8];
        double face_xx = (1 - w) * own_vf_xx * coeff_own + w * neighbor_vf_xx * coeff_nei;
        double face_xy = (1 - w) * own_vf_xy * coeff_own + w * neighbor_vf_xy * coeff_nei;
        double face_xz = (1 - w) * own_vf_xz * coeff_own + w * neighbor_vf_xz * coeff_nei;
        double face_yx = (1 - w) * own_vf_yx * coeff_own + w * neighbor_vf_yx * coeff_nei;
        double face_yy = (1 - w) * own_vf_yy * coeff_own + w * neighbor_vf_yy * coeff_nei;
        double face_yz = (1 - w) * own_vf_yz * coeff_own + w * neighbor_vf_yz * coeff_nei;
        double face_zx = (1 - w) * own_vf_zx * coeff_own + w * neighbor_vf_zx * coeff_nei;
        double face_zy = (1 - w) * own_vf_zy * coeff_own + w * neighbor_vf_zy * coeff_nei;
        double face_zz = (1 - w) * own_vf_zz * coeff_own + w * neighbor_vf_zz * coeff_nei;
        sum_x -= sf_x * face_xx + sf_y * face_yx + sf_z * face_zx;
        sum_y -= sf_x * face_xy + sf_y * face_yy + sf_z * face_zy;
        sum_z -= sf_x * face_xz + sf_y * face_yz + sf_z * face_zz;
    }
    // upper: faces owned by this cell; contribution added
    for (int i = diag_index + 1; i < row_elements; i++)
    {
        int neighbor_index = neighbor_offset + i - 1;
        int neighbor_cell_id = csr_col_index[row_index + i];
        double coeff_nei = scalar0[neighbor_cell_id] * scalar1[neighbor_cell_id];
        double w = tlambdas[neighbor_index];
        double sf_x = sf[neighbor_index * 3 + 0];
        double sf_y = sf[neighbor_index * 3 + 1];
        double sf_z = sf[neighbor_index * 3 + 2];
        double neighbor_vf_xx = vf[neighbor_cell_id * 9 + 0];
        double neighbor_vf_xy = vf[neighbor_cell_id * 9 + 1];
        double neighbor_vf_xz = vf[neighbor_cell_id * 9 + 2];
        double neighbor_vf_yx = vf[neighbor_cell_id * 9 + 3];
        double neighbor_vf_yy = vf[neighbor_cell_id * 9 + 4];
        double neighbor_vf_yz = vf[neighbor_cell_id * 9 + 5];
        double neighbor_vf_zx = vf[neighbor_cell_id * 9 + 6];
        double neighbor_vf_zy = vf[neighbor_cell_id * 9 + 7];
        double neighbor_vf_zz = vf[neighbor_cell_id * 9 + 8];
        double face_xx = w * own_vf_xx * coeff_own + (1 - w) * neighbor_vf_xx * coeff_nei;
        double face_xy = w * own_vf_xy * coeff_own + (1 - w) * neighbor_vf_xy * coeff_nei;
        double face_xz = w * own_vf_xz * coeff_own + (1 - w) * neighbor_vf_xz * coeff_nei;
        double face_yx = w * own_vf_yx * coeff_own + (1 - w) * neighbor_vf_yx * coeff_nei;
        double face_yy = w * own_vf_yy * coeff_own + (1 - w) * neighbor_vf_yy * coeff_nei;
        double face_yz = w * own_vf_yz * coeff_own + (1 - w) * neighbor_vf_yz * coeff_nei;
        double face_zx = w * own_vf_zx * coeff_own + (1 - w) * neighbor_vf_zx * coeff_nei;
        double face_zy = w * own_vf_zy * coeff_own + (1 - w) * neighbor_vf_zy * coeff_nei;
        double face_zz = w * own_vf_zz * coeff_own + (1 - w) * neighbor_vf_zz * coeff_nei;
        sum_x += sf_x * face_xx + sf_y * face_yx + sf_z * face_zx;
        sum_y += sf_x * face_xy + sf_y * face_yy + sf_z * face_zy;
        sum_z += sf_x * face_xz + sf_y * face_yz + sf_z * face_zz;
    }
    // NOTE(review): vol is computed but never used — confirm whether division
    // by the cell volume was intentionally omitted here (b may be
    // volume-integrated elsewhere) or is missing.
    double vol = volume[index];
    b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + sum_x * sign;
    b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + sum_y * sign;
    b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + sum_z * sign;
}

// Boundary-face part of the explicit tensor divergence: per boundary cell,
// accumulate (Sf & vf_face) * (scalar0*scalar1) into the SoA source vector b.
__global__ void fvc_div_tensor_boundary(int num_cells, int num_boundary_cells,
        const int *boundary_cell_offset, const int *boundary_cell_id,
        const double *boundary_scalar0, const double *boundary_scalar1,
        const double *boundary_sf, const double *boundary_vf, const double *volume,
        const double sign, const double *b_input, double *b_output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_boundary_cells)
        return;

    int cell_offset = boundary_cell_offset[index];
    int next_cell_offset = boundary_cell_offset[index + 1];
    int cell_index = boundary_cell_id[cell_offset];

    // OpenFOAM reference:
    // Foam::surfaceInterpolationScheme::dotInterpolate
    //   if (vf.boundaryField()[pi].coupled())
    //       psf = pSf & (pLambda*patchInternalField() + (1-pLambda)*patchNeighbourField());
    //   else
    //       psf = pSf & vf.boundaryField()[pi];
    // surfaceIntegrate:
    //   forAll(mesh.boundary()[patchi], facei)
    //       ivf[pFaceCells[facei]] += pssf[facei];
    double sum_x = 0;
    double sum_y = 0;
    double sum_z = 0;
    for (int i = cell_offset; i < next_cell_offset; i++)
    {
        double sf_x = boundary_sf[i * 3 + 0];
        double sf_y = boundary_sf[i * 3 + 1];
        double sf_z = boundary_sf[i * 3 + 2];
        double face_xx = boundary_vf[i * 9 + 0];
        double face_xy = boundary_vf[i * 9 + 1];
        double face_xz = boundary_vf[i * 9 + 2];
        double face_yx = boundary_vf[i * 9 + 3];
        double face_yy = boundary_vf[i * 9 + 4];
        double face_yz = boundary_vf[i * 9 + 5];
        double face_zx = boundary_vf[i * 9 + 6];
        double face_zy = boundary_vf[i * 9 + 7];
        double face_zz = boundary_vf[i * 9 + 8];

        // if not coupled
        double coeff = boundary_scalar0[i] * boundary_scalar1[i];
        sum_x += (sf_x * face_xx + sf_y * face_yx + sf_z * face_zx) * coeff;
        sum_y += (sf_x * face_xy + sf_y * face_yy + sf_z * face_zy) * coeff;
        sum_z += (sf_x * face_xz + sf_y * face_yz + sf_z * face_zz) * coeff;
    }
    // NOTE(review): vol is unused, mirroring fvc_div_tensor_internal — confirm
    // whether division by the cell volume was intentionally omitted.
    double vol = volume[cell_index];
    b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] + sum_x * sign;
    b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] + sum_y * sign;
    b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + sum_z * sign;
}

// Implicit (fvm) uncorrected Laplacian of gamma = scalar0*scalar1 on a vector
// field: off-diagonal coeffs are gamma_face * |Sf| * deltaCoeffs, diagonal is
// the negated row sum (negSumDiag). All three components share one sparsity
// pattern, stored csr_dim apart.
__global__ void fvm_laplacian_uncorrected_vector_internal(int num_cells, int num_faces,
        const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
        const double *scalar0, const double *scalar1, const double *weight,
        const double *magsf, const double *distance,
        const double sign, const double *A_csr_input, double *A_csr_output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    // A_csr has one more element in each row: itself
    int row_index = csr_row_index[index];
    int row_elements = csr_row_index[index + 1] - row_index;
    int diag_index = csr_diag_index[index];
    int neighbor_offset = csr_row_index[index] - index;
    int csr_dim = num_cells + num_faces;

    double own_scalar0 = scalar0[index];
    double own_scalar1 = scalar1[index];
    double own_coeff = own_scalar0 * own_scalar1;

    // fvm.upper() = deltaCoeffs.primitiveField()*gammaMagSf.primitiveField();
    // fvm.negSumDiag();
    double sum_diag = 0;
    // lower
    for (int i = 0; i < diag_index; i++)
    {
        int neighbor_index = neighbor_offset + i;
        int neighbor_cell_id = csr_col_index[i + row_index];
        double w = weight[neighbor_index];
        double nei_scalar0 = scalar0[neighbor_cell_id];
        double nei_scalar1 = scalar1[neighbor_cell_id];
        double nei_coeff = nei_scalar0 * nei_scalar1;
        // linear interpolation of gamma to the face
        double gamma = w * (nei_coeff - own_coeff) + own_coeff;
        double gamma_magsf = gamma * magsf[neighbor_index];
        double coeff = gamma_magsf * distance[neighbor_index];
        A_csr_output[csr_dim * 0 + row_index + i] = A_csr_input[csr_dim * 0 + row_index + i] + coeff * sign;
        A_csr_output[csr_dim * 1 + row_index + i] =
A_csr_input[csr_dim * 1 + row_index + i] + coeff * sign;
        A_csr_output[csr_dim * 2 + row_index + i] = A_csr_input[csr_dim * 2 + row_index + i] + coeff * sign;

        sum_diag += (-coeff);
    }
    // upper
    for (int i = diag_index + 1; i < row_elements; i++)
    {
        int neighbor_index = neighbor_offset + i - 1;
        int neighbor_cell_id = csr_col_index[i + row_index];
        double w = weight[neighbor_index];
        double nei_scalar0 = scalar0[neighbor_cell_id];
        double nei_scalar1 = scalar1[neighbor_cell_id];
        double nei_coeff = nei_scalar0 * nei_scalar1;
        double gamma = w * (own_coeff - nei_coeff) + nei_coeff;
        double gamma_magsf = gamma * magsf[neighbor_index];
        double coeff = gamma_magsf * distance[neighbor_index];
        A_csr_output[csr_dim * 0 + row_index + i] = A_csr_input[csr_dim * 0 + row_index + i] + coeff * sign;
        A_csr_output[csr_dim * 1 + row_index + i] = A_csr_input[csr_dim * 1 + row_index + i] + coeff * sign;
        A_csr_output[csr_dim * 2 + row_index + i] = A_csr_input[csr_dim * 2 + row_index + i] + coeff * sign;
        sum_diag += (-coeff);
    }
    // negSumDiag: diagonal takes the negated sum of the off-diagonal coeffs
    A_csr_output[csr_dim * 0 + row_index + diag_index] = A_csr_input[csr_dim * 0 + row_index + diag_index] + sum_diag * sign; // diag
    A_csr_output[csr_dim * 1 + row_index + diag_index] = A_csr_input[csr_dim * 1 + row_index + diag_index] + sum_diag * sign; // diag
    A_csr_output[csr_dim * 2 + row_index + diag_index] = A_csr_input[csr_dim * 2 + row_index + diag_index] + sum_diag * sign; // diag
}

// Boundary contribution of the implicit Laplacian: per boundary cell, fold
// gamma*|Sf|*gradientInternalCoeffs into the diagonal and
// -gamma*|Sf|*gradientBoundaryCoeffs into the RHS, while also accumulating
// the raw internal/boundary coeffs for later H/A assembly.
__global__ void fvm_laplacian_uncorrected_vector_boundary(int num_cells, int num_faces, int num_boundary_cells,
        const int *csr_row_index, const int *csr_diag_index,
        const int *boundary_cell_offset, const int *boundary_cell_id,
        const double *boundary_scalar0, const double *boundary_scalar1,
        const double *boundary_magsf, const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs,
        const double sign, const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output,
        double *ueqn_internal_coeffs, double *ueqn_boundary_coeffs)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_boundary_cells)
        return;

    int cell_offset = boundary_cell_offset[index];
    int next_cell_offset = boundary_cell_offset[index + 1];
    int cell_index = boundary_cell_id[cell_offset];

    int row_index = csr_row_index[cell_index];
    int diag_index = csr_diag_index[cell_index];
    int csr_dim = num_cells + num_faces;
    int csr_index = row_index + diag_index;

    // OpenFOAM reference:
    // if (pvf.coupled())
    // {
    //     fvm.internalCoeffs()[patchi] = pGamma*pvf.gradientInternalCoeffs(pDeltaCoeffs);
    //     fvm.boundaryCoeffs()[patchi] = -pGamma*pvf.gradientBoundaryCoeffs(pDeltaCoeffs);
    // }
    // else
    // {
    //     fvm.internalCoeffs()[patchi] = pGamma*pvf.gradientInternalCoeffs();
    //     fvm.boundaryCoeffs()[patchi] = -pGamma*pvf.gradientBoundaryCoeffs();
    // }
    double internal_coeffs_x = 0;
    double internal_coeffs_y = 0;
    double internal_coeffs_z = 0;
    double boundary_coeffs_x = 0;
    double boundary_coeffs_y = 0;
    double boundary_coeffs_z = 0;
    for (int i = cell_offset; i < next_cell_offset; i++)
    {
        double gamma = boundary_scalar0[i] * boundary_scalar1[i];
        double gamma_magsf = gamma * boundary_magsf[i];
        internal_coeffs_x += gamma_magsf * gradient_internal_coeffs[i * 3 + 0];
        internal_coeffs_y += gamma_magsf * gradient_internal_coeffs[i * 3 + 1];
        internal_coeffs_z += gamma_magsf * gradient_internal_coeffs[i * 3 + 2];
        boundary_coeffs_x -= gamma_magsf * gradient_boundary_coeffs[i * 3 + 0];
        boundary_coeffs_y -= gamma_magsf * gradient_boundary_coeffs[i * 3 + 1];
        boundary_coeffs_z -= gamma_magsf * gradient_boundary_coeffs[i * 3 + 2];
    }

    // each boundary cell appears once, so these accumulations are race-free
    ueqn_internal_coeffs[cell_index * 3 + 0] += internal_coeffs_x * sign;
    ueqn_internal_coeffs[cell_index * 3 + 1] += internal_coeffs_y * sign;
    ueqn_internal_coeffs[cell_index * 3 + 2] += internal_coeffs_z * sign;
    ueqn_boundary_coeffs[cell_index * 3 + 0] += boundary_coeffs_x * sign;
    ueqn_boundary_coeffs[cell_index * 3 + 1] += boundary_coeffs_y * sign;
    ueqn_boundary_coeffs[cell_index * 3 + 2] += boundary_coeffs_z * sign;

    A_csr_output[csr_dim * 0 + csr_index] = A_csr_input[csr_dim * 0 + csr_index] + internal_coeffs_x * sign;
    A_csr_output[csr_dim * 1 + csr_index] = A_csr_input[csr_dim * 1 + csr_index] + internal_coeffs_y * sign;
    A_csr_output[csr_dim * 2 + csr_index] = A_csr_input[csr_dim * 2 + csr_index] + internal_coeffs_z * sign;
    b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] + boundary_coeffs_x * sign;
    b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] + boundary_coeffs_y * sign;
    b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + boundary_coeffs_z * sign;
}

// Seed H for boundary cells with -(internalCoeffs - cmptAv(internalCoeffs))*psi,
// mirroring OpenFOAM's addBoundaryDiag + negate + addCmptAvBoundaryDiag.
__global__ void addBoundaryDiag(int num_cells, int num_boundary_cells,
        const int *csr_row_index, const int *csr_diag_index,
        const int *boundary_cell_offset, const int *boundary_cell_id,
        const double *ueqn_internal_coeffs, const double *ueqn_boundary_coeffs,
        const double *psi, double *H)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_boundary_cells)
        return;

    int cell_offset = boundary_cell_offset[index];
    int next_cell_offset = boundary_cell_offset[index + 1];
    int cell_index = boundary_cell_id[cell_offset];

    // addBoundaryDiag(boundaryDiagCmpt, cmpt); // add internal coeffs
    // boundaryDiagCmpt.negate();
    double internal_x = ueqn_internal_coeffs[cell_index * 3 + 0];
    double internal_y = ueqn_internal_coeffs[cell_index * 3 + 1];
    double internal_z = ueqn_internal_coeffs[cell_index * 3 + 2];

    // addCmptAvBoundaryDiag(boundaryDiagCmpt);
    double ave_internal = (internal_x + internal_y + internal_z) / 3;

    H[num_cells * 0 + cell_index] = (-internal_x + ave_internal) * psi[num_cells * 0 + cell_index];
    H[num_cells * 1 + cell_index] = (-internal_y + ave_internal) * psi[num_cells * 1 + cell_index];
    H[num_cells * 2 + cell_index] =
(-internal_z + ave_internal) * psi[num_cells * 2 + cell_index];
}

// Repack psi from SoA (component-major) to AoS (xyz per cell) before D2H copy.
__global__ void permute_psi_d2h(int num_cells, const double *input, double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    output[index * 3 + 0] = input[num_cells * 0 + index];
    output[index * 3 + 1] = input[num_cells * 1 + index];
    output[index * 3 + 2] = input[num_cells * 2 + index];
}

// Repack psi from AoS (xyz per cell) to SoA (component-major) after H2D copy.
__global__ void permute_psi_h2d(int num_cells, const double *input, double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    output[num_cells * 0 + index] = input[index * 3 + 0];
    output[num_cells * 1 + index] = input[index * 3 + 1];
    output[num_cells * 2 + index] = input[index * 3 + 2];
}

// H := (H - offdiag(A)*psi + b) / V, per cell and per component (SoA layout).
// Only the off-diagonal CSR entries are applied (diagonal skipped).
// NOTE(review): parameter ueqn_boundary_coeffs is unused here — its
// contribution is applied by addBoundarySource; confirm that is intended.
__global__ void lduMatrix_H(int num_cells,
        const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
        const double *volume, const double *psi, const double *A_csr, const double *b,
        const double *ueqn_boundary_coeffs, double *H)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    // A_csr has one more element in each row: itself
    int row_index = csr_row_index[index];
    int row_elements = csr_row_index[index + 1] - row_index;
    int diag_index = csr_diag_index[index];
    int neighbor_offset = csr_row_index[index] - index;

    double APsi_x = 0.;
    double APsi_y = 0.;
    double APsi_z = 0.;
    // lower
    for (int i = 0; i < diag_index; i++)
    {
        int neighbor_cell_id = csr_col_index[i + row_index];
        APsi_x += A_csr[row_index + i] * psi[num_cells * 0 + neighbor_cell_id];
        APsi_y += A_csr[row_index + i] * psi[num_cells * 1 + neighbor_cell_id];
        APsi_z += A_csr[row_index + i] * psi[num_cells * 2 + neighbor_cell_id];
    }
    // upper
    for (int i = diag_index + 1; i < row_elements; i++)
    {
        int neighbor_cell_id = csr_col_index[i + row_index];
        APsi_x += A_csr[row_index + i] * psi[num_cells * 0 + neighbor_cell_id];
        APsi_y += A_csr[row_index + i] * psi[num_cells * 1 + neighbor_cell_id];
        APsi_z += A_csr[row_index + i] * psi[num_cells * 2 + neighbor_cell_id];
    }

    H[num_cells * 0 + index] = H[num_cells * 0 + index] - APsi_x + b[num_cells * 0 + index];
    H[num_cells * 1 + index] = H[num_cells * 1 + index] - APsi_y + b[num_cells * 1 + index];
    H[num_cells * 2 + index] = H[num_cells * 2 + index] - APsi_z + b[num_cells * 2 + index];

    double vol = volume[index];
    H[num_cells * 0 + index] = H[num_cells * 0 + index] / vol;
    H[num_cells * 1 + index] = H[num_cells * 1 + index] / vol;
    H[num_cells * 2 + index] = H[num_cells * 2 + index] / vol;
}

// Add the volume-normalized boundary coeffs into H for each boundary cell.
__global__ void addBoundarySource(int num_cells, int num_boundary_cells,
        const int *csr_row_index, const int *csr_diag_index,
        const int *boundary_cell_offset, const int *boundary_cell_id,
        const double *ueqn_internal_coeffs, const double *ueqn_boundary_coeffs,
        const double *volume, double *H)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_boundary_cells)
        return;

    int cell_offset = boundary_cell_offset[index];
    int cell_index = boundary_cell_id[cell_offset];

    // BUG FIX: volume and H are per-cell arrays and must be indexed by the
    // mesh cell id (cell_index), not by the boundary-cell list position
    // (index), consistent with addBoundaryDiag and addAveInternaltoDiag.
    double vol = volume[cell_index];

    H[num_cells * 0 + cell_index] = H[num_cells * 0 + cell_index] + ueqn_boundary_coeffs[cell_index * 3 + 0] / vol;
    H[num_cells * 1 + cell_index] = H[num_cells * 1 + cell_index] + ueqn_boundary_coeffs[cell_index * 3 + 1] / vol;
    H[num_cells * 2 + cell_index] = H[num_cells * 2 + cell_index] + ueqn_boundary_coeffs[cell_index * 3 + 2] / vol;
}

// Store the component-average of the boundary internal coeffs into A for each
// boundary cell.
__global__ void addAveInternaltoDiag(int num_cells, int num_boundary_cells,
        const int *csr_row_index, const int *csr_diag_index,
        const int *boundary_cell_offset, const int *boundary_cell_id,
        const double *ueqn_internal_coeffs, const double *ueqn_boundary_coeffs, double *A)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_boundary_cells)
        return;

    int cell_offset = boundary_cell_offset[index];
    int next_cell_offset = boundary_cell_offset[index + 1];
    int cell_index =
boundary_cell_id[cell_offset];

    double internal_x = ueqn_internal_coeffs[cell_index * 3 + 0];
    double internal_y = ueqn_internal_coeffs[cell_index * 3 + 1];
    double internal_z = ueqn_internal_coeffs[cell_index * 3 + 2];

    double ave_internal = (internal_x + internal_y + internal_z) / 3;

    // NOTE(review): assigns rather than accumulates despite the "add" in the
    // kernel name — confirm A is expected to be zero/overwritable here.
    A[cell_index] = ave_internal;
}

// A := (A + diag(A_csr) - internalCoeffs) / V, per cell.
__global__ void addDiagDivVolume(int num_cells, const int *csr_row_index,
        const int *csr_diag_index, const double *A_csr, const double *volume,
        double *ueqn_internal_coeffs, const double *A_input, double *A_output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    int row_index = csr_row_index[index];
    int diag_index = csr_diag_index[index];
    int csr_index = row_index + diag_index;

    double vol = volume[index];

    // NOTE(review): only the x component (index * 3) of the internal coeffs is
    // subtracted — confirm whether a per-component A was intended instead.
    A_output[index] = (A_input[index] + A_csr[csr_index] - ueqn_internal_coeffs[index * 3]) / vol;
}

// Per boundary face, fill the value/gradient internal & boundary coefficient
// tables used by fvm::div and fvm::laplacian, according to the patch type:
//   0 = zeroGradient, 1 = fixedValue, 2 = empty.
__global__ void ueqn_update_BoundaryCoeffs_kernel(int num_boundary_faces, const double *boundary_phi, double *internal_coeffs,
        double *boundary_coeffs, double *laplac_internal_coeffs,
        double *laplac_boundary_coeffs, const int *U_patch_type,
        const double *boundary_velocity, const double *boundary_deltaCoeffs)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_boundary_faces)
        return;

    int patchIndex = U_patch_type[index];
    if (patchIndex == 0) { // zeroGradient
        double bouPhi = boundary_phi[index];
        internal_coeffs[index * 3 + 0] = bouPhi * 1.; // valueInternalCoeffs = 1.
        internal_coeffs[index * 3 + 1] = bouPhi * 1.;
        internal_coeffs[index * 3 + 2] = bouPhi * 1.;
        boundary_coeffs[index * 3 + 0] = -bouPhi * 0.; // valueBoundaryCoeffs = 0.
        boundary_coeffs[index * 3 + 1] = -bouPhi * 0.;
        boundary_coeffs[index * 3 + 2] = -bouPhi * 0.;
        laplac_internal_coeffs[index * 3 + 0] = 0.; // gradientInternalCoeffs = 0.
        laplac_internal_coeffs[index * 3 + 1] = 0.;
        laplac_internal_coeffs[index * 3 + 2] = 0.;
        laplac_boundary_coeffs[index * 3 + 0] = 0.; // gradientBoundaryCoeffs = 0.
        laplac_boundary_coeffs[index * 3 + 1] = 0.;
        laplac_boundary_coeffs[index * 3 + 2] = 0.;
    } else if (patchIndex == 1) { // fixedValue
        double bouDeltaCoeffs = boundary_deltaCoeffs[index];
        double bouPhi = boundary_phi[index];
        internal_coeffs[index * 3 + 0] = bouPhi * 0.; // valueInternalCoeffs = 0.
        internal_coeffs[index * 3 + 1] = bouPhi * 0.;
        internal_coeffs[index * 3 + 2] = bouPhi * 0.;
        boundary_coeffs[index * 3 + 0] = -bouPhi * boundary_velocity[index * 3 + 0]; // valueBoundaryCoeffs = boundaryValue
        boundary_coeffs[index * 3 + 1] = -bouPhi * boundary_velocity[index * 3 + 1];
        boundary_coeffs[index * 3 + 2] = -bouPhi * boundary_velocity[index * 3 + 2];
        laplac_internal_coeffs[index * 3 + 0] = -1 * bouDeltaCoeffs; // gradientInternalCoeffs = -1 * boundaryDeltaCoeffs
        laplac_internal_coeffs[index * 3 + 1] = -1 * bouDeltaCoeffs;
        laplac_internal_coeffs[index * 3 + 2] = -1 * bouDeltaCoeffs;
        laplac_boundary_coeffs[index * 3 + 0] = bouDeltaCoeffs * boundary_velocity[index * 3 + 0]; // gradientBoundaryCoeffs = boundaryDeltaCoeffs * boundaryValue
        laplac_boundary_coeffs[index * 3 + 1] = bouDeltaCoeffs * boundary_velocity[index * 3 + 1];
        laplac_boundary_coeffs[index * 3 + 2] = bouDeltaCoeffs * boundary_velocity[index * 3 + 2];
    } else if (patchIndex == 2) { // empty
        double bouPhi = boundary_phi[index];
        internal_coeffs[index * 3 + 0] = bouPhi * 0.; // valueInternalCoeffs = 0.
        internal_coeffs[index * 3 + 1] = bouPhi * 0.;
        internal_coeffs[index * 3 + 2] = bouPhi * 0.;
        boundary_coeffs[index * 3 + 0] = -bouPhi * 0.; // valueBoundaryCoeffs = 0.
        boundary_coeffs[index * 3 + 1] = -bouPhi * 0.;
        boundary_coeffs[index * 3 + 2] = -bouPhi * 0.;
        laplac_internal_coeffs[index * 3 + 0] = 0.; // gradientInternalCoeffs = 0.
+ laplac_internal_coeffs[index * 3 + 1] = 0.; + laplac_internal_coeffs[index * 3 + 2] = 0.; + laplac_boundary_coeffs[index * 3 + 0] = 0.; // gradientBoundaryCoeffs = 0. + laplac_boundary_coeffs[index * 3 + 1] = 0.; + laplac_boundary_coeffs[index * 3 + 2] = 0.; + } + // TODO implement coupled conditions +} + +__global__ void ueqn_correct_BoundaryConditions_kernel(int num_cells, int num_boundary_cells, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *velocity, double *boundary_velocity, const int *U_patch_type) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + for (int i = cell_offset; i < next_cell_offset; i++) + { + int patchIndex = U_patch_type[i]; + switch (patchIndex) + { + case 0: // zeroGradient + { + boundary_velocity[i * 3 + 0] = velocity[cell_index]; + boundary_velocity[i * 3 + 1] = velocity[num_cells * 1 + cell_index]; + boundary_velocity[i * 3 + 2] = velocity[num_cells * 2 + cell_index]; + break; + } + case 1: + break; + case 2: + break; + // TODO implement coupled conditions + } + } +} + +// constructor +dfUEqn::dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile) + : dataBase_(dataBase) +{ + stream = dataBase_.stream; + + UxSolver = new AmgXSolver(modeStr, cfgFile); + UySolver = new AmgXSolver(modeStr, cfgFile); + UzSolver = new AmgXSolver(modeStr, cfgFile); + + num_cells = dataBase_.num_cells; + cell_bytes = dataBase_.cell_bytes; + num_faces = dataBase_.num_faces; + cell_vec_bytes = dataBase_.cell_vec_bytes; + csr_value_vec_bytes = dataBase_.csr_value_vec_bytes; + num_boundary_cells = dataBase_.num_boundary_cells; + num_surfaces = dataBase_.num_surfaces; + + d_A_csr_row_index = dataBase_.d_A_csr_row_index; + d_A_csr_diag_index = dataBase_.d_A_csr_diag_index; + 
d_A_csr_col_index = dataBase_.d_A_csr_col_index; + + h_A_csr = new double[(num_cells + num_faces) * 3]; + h_b = new double[num_cells * 3]; + cudaMallocHost(&h_psi, cell_vec_bytes); + cudaMallocHost(&h_H, cell_vec_bytes); + cudaMallocHost(&h_A, cell_bytes); + + checkCudaErrors(cudaMalloc((void **)&d_A_csr, csr_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_b, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_psi, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_psi_permute, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_H, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_H_permute, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_A, cell_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_ueqn_internal_coeffs, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_ueqn_boundary_coeffs, cell_vec_bytes)); +} + +void dfUEqn::fvm_ddt(double *vector_old) +{ + // Copy the host input array in host memory to the device input array in device memory + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_velocity_old, vector_old, cell_vec_bytes, cudaMemcpyHostToDevice, stream)); + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvm_ddt_kernel<<>>(num_cells, num_faces, dataBase_.rdelta_t, + d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, dataBase_.d_velocity_old, d_A_csr, d_b, d_A_csr, d_b, d_psi); +} + +void dfUEqn::fvm_div(double *boundary_pressure_init, double *boundary_velocity_init, + double *boundary_nuEff_init, double *boundary_rho_init) +{ + // copy and permutate boundary variable + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_velocity_init, boundary_velocity_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_pressure_init, boundary_pressure_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, 
stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_nuEff_init, boundary_nuEff_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_rho_init, boundary_rho_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block; + boundaryPermutation<<>>(dataBase_.num_boundary_faces, dataBase_.d_bouPermedIndex, dataBase_.d_boundary_pressure_init, + dataBase_.d_boundary_velocity_init, dataBase_.d_boundary_pressure, dataBase_.d_boundary_velocity, + dataBase_.d_boundary_nuEff_init, dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho_init, dataBase_.d_boundary_rho); + + // initialize boundary coeffs (must after the update of d_boundary_velocity) + threads_per_block = 1024; + blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block; + ueqn_update_BoundaryCoeffs_kernel<<>>(dataBase_.num_boundary_faces, dataBase_.d_boundary_phi, + dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs, + dataBase_.d_laplac_internal_coeffs, dataBase_.d_laplac_boundary_coeffs, + dataBase_.d_boundary_UpatchType, dataBase_.d_boundary_velocity, dataBase_.d_boundary_deltaCoeffs); + + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvm_div_internal<<>>(num_cells, num_faces, + d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_weight, dataBase_.d_phi, d_A_csr, d_b, d_A_csr, d_b); + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvm_div_boundary<<>>(num_cells, num_faces, num_boundary_cells, + d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs, d_A_csr, d_b, d_A_csr, d_b, + d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs); +} + +void dfUEqn::fvc_grad(double 
*pressure) +{ + // Copy the host input array in host memory to the device input array in device memory + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_pressure, pressure, cell_bytes, cudaMemcpyHostToDevice, stream)); + + // launch cuda kernel + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_internal_face<<>>(num_cells, + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + dataBase_.d_face_vector, dataBase_.d_weight, dataBase_.d_pressure, d_b, d_b); + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_boundary_face<<>>(num_cells, num_boundary_cells, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_boundary_face_vector, dataBase_.d_boundary_pressure, d_b, d_b); +} + +void dfUEqn::fvc_grad_vector() +{ + size_t threads_per_block = 512; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_vector_internal<<>>(num_cells, + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + dataBase_.d_face_vector, dataBase_.d_velocity_old, dataBase_.d_weight, dataBase_.d_volume, dataBase_.d_grad); + + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_vector_boundary<<>>(num_cells, num_boundary_cells, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_face_vector, dataBase_.d_boundary_velocity, + dataBase_.d_volume, dataBase_.d_grad, dataBase_.d_grad_boundary_init); + + correct_boundary_conditions<<>>(num_boundary_cells, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_face_vector, dataBase_.d_boundary_face, + dataBase_.d_grad_boundary_init, dataBase_.d_grad_boundary, dataBase_.d_boundary_deltaCoeffs, dataBase_.d_velocity_old, + dataBase_.d_boundary_velocity, dataBase_.d_boundary_UpatchType); +} + +void dfUEqn::dev2T() +{ + size_t threads_per_block = 1024; + 
size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + dev2_t_tensor<<>>(num_cells, dataBase_.d_grad); + + blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block; + dev2_t_tensor<<>>(dataBase_.num_boundary_faces, dataBase_.d_grad_boundary); +} + +void dfUEqn::fvc_div_tensor(const double *nuEff) +{ + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_nuEff, nuEff, cell_bytes, cudaMemcpyHostToDevice, stream)); + size_t threads_per_block = 512; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvc_div_tensor_internal<<>>(num_cells, + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + dataBase_.d_nuEff, dataBase_.d_rho_new, dataBase_.d_face_vector, dataBase_.d_grad, dataBase_.d_weight, + dataBase_.d_volume, 1., d_b, d_b); + + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvc_div_tensor_boundary<<>>(num_cells, num_boundary_cells, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_face_vector, dataBase_.d_grad_boundary, + dataBase_.d_volume, 1., d_b, d_b); +} + +void dfUEqn::fvm_laplacian() +{ + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvm_laplacian_uncorrected_vector_internal<<>>(num_cells, num_faces, + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, dataBase_.d_rho_new, dataBase_.d_nuEff, dataBase_.d_weight, + dataBase_.d_face, dataBase_.d_deltaCoeffs, -1., d_A_csr, d_A_csr); + + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvm_laplacian_uncorrected_vector_boundary<<>>(num_cells, num_faces, num_boundary_cells, + d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_face, 
dataBase_.d_laplac_internal_coeffs, + dataBase_.d_laplac_boundary_coeffs, -1., d_A_csr, d_b, d_A_csr, d_b, d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs); +} + +void dfUEqn::A(double *Psi) +{ + checkCudaErrors(cudaMemsetAsync(d_A, 0, cell_bytes, stream)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + addAveInternaltoDiag<<>>(num_cells, num_boundary_cells, d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs, d_A); + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + addDiagDivVolume<<>>(num_cells, d_A_csr_row_index, d_A_csr_diag_index, d_A_csr, + dataBase_.d_volume, d_ueqn_internal_coeffs, d_A, d_A); + + checkCudaErrors(cudaMemcpyAsync(h_A, d_A, cell_bytes, cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + memcpy(Psi, h_A, cell_bytes); +} + +void dfUEqn::H(double *Psi) +{ + checkCudaErrors(cudaMemsetAsync(d_H, 0, cell_bytes * 3, stream)); + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + addBoundaryDiag<<>>(num_cells, num_boundary_cells, d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs, + d_psi, d_H); + + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + lduMatrix_H<<>>(num_cells, d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + dataBase_.d_volume, d_psi, d_A_csr, d_b, d_ueqn_boundary_coeffs, d_H); + + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + permute_psi_d2h<<>>(num_cells, d_H, d_H_permute); + + checkCudaErrors(cudaMemcpyAsync(h_H, d_H_permute, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaStreamSynchronize(stream)); + + memcpy(Psi, h_H, 
cell_vec_bytes); +} + +void dfUEqn::initializeTimeStep() +{ + // initialize matrix value + checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, csr_value_vec_bytes, stream)); + checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream)); +} + +void dfUEqn::checkValue(bool print) +{ + checkCudaErrors(cudaMemcpyAsync(h_A_csr, d_A_csr, csr_value_vec_bytes, cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaMemcpyAsync(h_b, d_b, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); + + // Synchronize stream + checkCudaErrors(cudaStreamSynchronize(stream)); + if (print) + { + for (int i = 0; i < (num_faces + num_cells); i++) + fprintf(stderr, "h_A_csr[%d]: (%.10lf, %.10lf, %.10lf)\n", i, h_A_csr[i], h_A_csr[i + (num_faces + num_cells)], h_A_csr[i + 2 * (num_faces + num_cells)]); + for (int i = 0; i < num_cells; i++) + fprintf(stderr, "h_b[%d]: (%.10lf, %.10lf, %.10lf)\n", i, h_b[i], h_b[i + num_cells], h_b[i + 2 * num_cells]); + } + + char *input_file = "of_output.txt"; + FILE *fp = fopen(input_file, "rb+"); + if (fp == NULL) + { + fprintf(stderr, "Failed to open input file: %s!\n", input_file); + } + int readfile = 0; + double *of_b = new double[3 * num_cells]; + double *of_A = new double[3 * (num_faces + num_cells)]; + readfile = fread(of_b, num_cells * 3 * sizeof(double), 1, fp); + readfile = fread(of_A, (num_faces + num_cells) * sizeof(double) * 3, 1, fp); + + std::vector h_A_of_init_vec(3 * (num_cells + num_faces)); + std::copy(of_A, of_A + (num_cells + num_faces) * 3, h_A_of_init_vec.begin()); + + std::vector h_A_of_vec_perm(3 * (num_faces + num_cells), 0); + for (int i = 0; i < num_faces + num_cells; i++) + { + h_A_of_vec_perm[i] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i]]; + h_A_of_vec_perm[i + num_faces + num_cells] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i] + num_faces + num_cells]; + h_A_of_vec_perm[i + 2 * (num_faces + num_cells)] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i] + 2 * (num_faces + num_cells)]; + } + + // b + std::vector 
h_b_of_init_vec(3 * num_cells); + std::copy(of_b, of_b + 3 * num_cells, h_b_of_init_vec.begin()); + std::vector h_b_of_vec; + for (int i = 0; i < 3 * num_cells; i += 3) + { + h_b_of_vec.push_back(h_b_of_init_vec[i]); + } + // fill RHS_y + for (int i = 1; i < 3 * num_cells; i += 3) + { + h_b_of_vec.push_back(h_b_of_init_vec[i]); + } + // fill RHS_z + for (int i = 2; i < 3 * num_cells; i += 3) + { + h_b_of_vec.push_back(h_b_of_init_vec[i]); + } + + if (print) + { + for (int i = 0; i < (num_faces + num_cells); i++) + printf("h_A_of_vec[%d]:(%.10lf, %.10lf, %.10lf)\n", i, h_A_of_vec_perm[i], h_A_of_vec_perm[i + (num_faces + num_cells)], h_A_of_vec_perm[i + (num_faces + num_cells) * 2]); + for (int i = 0; i < num_cells; i++) + printf("h_b_of_vec[%d]: (%.10lf, %.10lf, %.10lf)\n", i, of_b[i * 3], of_b[i * 3 + 1], of_b[i * 3 + 2]); + } + + // check + // fprintf(stderr, "check of h_A_csr\n"); + // checkVectorEqual(num_faces + num_cells, h_A_of_vec_1mtx.data(), h_A_csr, 1e-5); + // fprintf(stderr, "check of h_b\n"); + // checkVectorEqual(3 * num_cells, h_b_of_vec.data(), h_b, 1e-5); +} + +void dfUEqn::solve() +{ + // for (size_t i = 0; i < num_cells; i++) + // fprintf(stderr, "h_velocity_old[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_velocity_old[3*i], + // h_velocity_old[3*i + 1], h_velocity_old[3*i + 2]); + // constructor AmgXSolver at first interation + // Synchronize stream + // checkCudaErrors(cudaMemcpyAsync(h_b, d_b, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); + // checkCudaErrors(cudaMemcpyAsync(h_psi, d_psi, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); + + checkCudaErrors(cudaStreamSynchronize(stream)); + + // nvtxRangePush("solve"); + + int nNz = num_cells + num_faces; // matrix entries + if (num_iteration == 0) // first interation + { + printf("Initializing AmgX Linear Solver\n"); + UxSolver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr); + UySolver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr + 
nNz); + UzSolver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr + 2 * nNz); + } + else + { + UxSolver->updateOperator(num_cells, nNz, d_A_csr); + UySolver->updateOperator(num_cells, nNz, d_A_csr + nNz); + UzSolver->updateOperator(num_cells, nNz, d_A_csr + 2 * nNz); + } + UxSolver->solve(num_cells, d_psi, d_b); + UySolver->solve(num_cells, d_psi + num_cells, d_b + num_cells); + UzSolver->solve(num_cells, d_psi + 2 * num_cells, d_b + 2 * num_cells); + num_iteration++; + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + permute_psi_d2h<<>>(num_cells, d_psi, d_psi_permute); + checkCudaErrors(cudaMemcpyAsync(h_psi, d_psi_permute, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); + // for (size_t i = 0; i < num_cells; i++) + // fprintf(stderr, "h_velocity_after[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_psi[i], + // h_psi[num_cells + i], h_psi[num_cells*2 + i]); +} + +void dfUEqn::sync() +{ + checkCudaErrors(cudaStreamSynchronize(stream)); +} + +void dfUEqn::updatePsi(double *Psi) +{ + checkCudaErrors(cudaStreamSynchronize(stream)); + memcpy(Psi, h_psi, cell_vec_bytes); +} + +void dfUEqn::correctBoundaryConditions() +{ + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + ueqn_correct_BoundaryConditions_kernel<<>>(num_cells, num_boundary_cells, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + d_psi, dataBase_.d_boundary_velocity, dataBase_.d_boundary_UpatchType); +} + +// correct volecity in pEqn +void dfUEqn::correctPsi(double *Psi) +{ + memcpy(h_psi, Psi, cell_vec_bytes); + checkCudaErrors(cudaMemcpyAsync(d_psi_permute, h_psi, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + permute_psi_h2d<<>>(num_cells, d_psi_permute, d_psi); +} + +dfUEqn::~dfUEqn() +{ +} 
diff --git a/src_gpu/dfYEqn.H b/src_gpu_orig/dfYEqn.H similarity index 100% rename from src_gpu/dfYEqn.H rename to src_gpu_orig/dfYEqn.H diff --git a/src_gpu/dfYEqn.cu b/src_gpu_orig/dfYEqn.cu similarity index 100% rename from src_gpu/dfYEqn.cu rename to src_gpu_orig/dfYEqn.cu