diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H index 5554e7128..51277cd73 100644 --- a/applications/solvers/dfLowMachFoam/EEqn.H +++ b/applications/solvers/dfLowMachFoam/EEqn.H @@ -93,7 +93,7 @@ // EEqn_GPU.compareResult(&EEqn.lower()[0], &EEqn.upper()[0], &EEqn.diag()[0], &EEqn.source()[0], // h_internal_coeffs.data(), h_boundary_coeffs.data(), printFlag); // DEBUG_TRACE; - EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag); + //EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag); } delete h_boundary_he_tmp; diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H index 2a059161e..a6244c870 100644 --- a/applications/solvers/dfLowMachFoam/UEqn.H +++ b/applications/solvers/dfLowMachFoam/UEqn.H @@ -118,7 +118,7 @@ // h_internal_coeffs.data(), h_boundary_coeffs.data(), // // &gradU[0][0], h_boundary_gradU, // printFlag); - UEqn_GPU.compareU(&U[0][0], h_boundary_u_tmp, printFlag); + //UEqn_GPU.compareU(&U[0][0], h_boundary_u_tmp, printFlag); } DEBUG_TRACE; #endif diff --git a/applications/solvers/dfLowMachFoam/createGPUSolver.H b/applications/solvers/dfLowMachFoam/createGPUSolver.H index d72027dc3..7714fcbd1 100644 --- a/applications/solvers/dfLowMachFoam/createGPUSolver.H +++ b/applications/solvers/dfLowMachFoam/createGPUSolver.H @@ -99,7 +99,7 @@ void initNccl() { ncclInit(PstreamGlobals::MPI_COMM_FOAM, nccl_comm, nccl_id, &nRanks, &myRank, &localRank, &mpi_init_flag); } -void createGPUBase(fvMesh& mesh, PtrList& Y) { +void createGPUBase(const IOdictionary& CanteraTorchProperties, fvMesh& mesh, PtrList& Y) { // prepare constant values: num_cells, num_surfaces, num_boundary_surfaces, // num_patches, patch_size, num_species, rdelta_t const labelUList& owner = mesh.owner(); @@ -152,6 +152,7 @@ void createGPUBase(fvMesh& mesh, PtrList& Y) { double rDeltaT = 1 / 1e-6; dfDataBase.setConstantValues(num_cells, num_total_cells, num_surfaces, num_boundary_surfaces, 
num_patches, nProcValues, patch_size, Y.size(), rDeltaT); + // wyr: now there is no other place to prepare nccl info, thus mpi must be initialized at beginning. label flag_mpi_init; MPI_Initialized(&flag_mpi_init); @@ -187,6 +188,14 @@ void createGPUBase(fvMesh& mesh, PtrList& Y) { // prepare cuda resources dfDataBase.prepareCudaResources(); + // setup amgx solvers + string mode_string = "dDDI"; + string u_setting_path; + u_setting_path = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string("")); + string p_setting_path; + p_setting_path = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("pEqnSettingPath", string("")); + dfDataBase.setAmgxSolvers(mode_string, u_setting_path, p_setting_path); + // prepare constant indexes: owner, neighbor, procRows, procCols if (Pstream::parRun()) { @@ -373,12 +382,14 @@ void createGPURhoEqn(const volScalarField& rho, const surfaceScalarField& phi) { offset += patchsize; } } + rhoEqn_GPU.setConstantValues(); rhoEqn_GPU.setConstantFields(patch_type); rhoEqn_GPU.initNonConstantFields(h_rho, h_phi, h_boundary_rho, h_boundary_phi); rhoEqn_GPU.createNonConstantLduAndCsrFields(); } void createGPUUEqn(const IOdictionary& CanteraTorchProperties, const volVectorField& U) { + // TODO need remove amgx solver setting // prepare mode_string and setting_path string mode_string = "dDDI"; string settingPath; @@ -431,6 +442,7 @@ void createGPUUEqn(const IOdictionary& CanteraTorchProperties, const volVectorFi void createGPUYEqn(const IOdictionary& CanteraTorchProperties, PtrList& Y, const int inertIndex) { DEBUG_TRACE; + // TODO need remove amgx solver setting // prepare mode_string and setting_path string mode_string = "dDDI"; string settingPath; @@ -481,6 +493,7 @@ void createGPUYEqn(const IOdictionary& CanteraTorchProperties, PtrListstream = dataBase_.stream; this->mode_string = mode_string; this->setting_path = setting_path; ESolver = new AmgXSolver(mode_string, setting_path, dataBase_.localRank); 
@@ -88,12 +89,16 @@ void dfEEqn::initNonConstantFields(const double *he, const double *boundary_he) } void dfEEqn::cleanCudaResources() { - if (graph_created) { +#ifdef USE_GRAPH + if (pre_graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance_pre)); - checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_pre)); + } + if (post_graph_created) { + checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_post)); } +#endif } void dfEEqn::preProcess(const double *h_he, const double *h_k, const double *h_k_old, const double *h_dpdt, const double *h_boundary_k, const double *h_boundary_heGradient) @@ -101,15 +106,10 @@ void dfEEqn::preProcess(const double *h_he, const double *h_k, const double *h_k } void dfEEqn::process() { - //使用event计算时间 - float time_elapsed=0; - cudaEvent_t start,stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - checkCudaErrors(cudaEventRecord(start,0)); - -#ifndef TIME_GPU - if(!graph_created) { + TICK_INIT_EVENT; + TICK_START_EVENT; +#ifdef USE_GRAPH + if(!pre_graph_created) { DEBUG_TRACE; checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif @@ -125,7 +125,7 @@ void dfEEqn::process() { checkCudaErrors(cudaMallocAsync((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMallocAsync((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); - checkCudaErrors(cudaMallocAsync((void**)&d_boundary_heGradient, dataBase_.num_gradientEnergy_boundary_surfaces, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_heGradient, sizeof(double) * num_gradientEnergy_boundary_surfaces, dataBase_.stream)); checkCudaErrors(cudaMallocAsync((void**)&d_source, dataBase_.cell_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMallocAsync((void**)&d_internal_coeffs, 
dataBase_.boundary_surface_value_bytes, dataBase_.stream)); @@ -190,58 +190,71 @@ void dfEEqn::process() { dataBase_.num_Nz, dataBase_.d_boundary_face_cell, dataBase_.d_ldu_to_csr_index, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_he.data(), d_ldu, d_source, d_internal_coeffs, d_boundary_coeffs, d_A); #endif -#ifndef TIME_GPU +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_pre)); checkCudaErrors(cudaGraphInstantiate(&graph_instance_pre, graph_pre, NULL, NULL, 0)); + pre_graph_created = true; } DEBUG_TRACE; checkCudaErrors(cudaGraphLaunch(graph_instance_pre, dataBase_.stream)); #endif - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - if(!mpi_init_flag || myRank == 0) - fprintf(stderr, "eeqn assembly time:%f(ms)\n",time_elapsed); - - checkCudaErrors(cudaEventRecord(start,0)); + TICK_END_EVENT(EEqn assembly); + + TICK_START_EVENT; #ifndef DEBUG_CHECK_LDU solve(); #endif - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - if(!mpi_init_flag || myRank == 0) - fprintf(stderr, "eeqn solve time:%f(ms)\n",time_elapsed); - - checkCudaErrors(cudaEventRecord(start,0)); -#ifndef TIME_GPU - if(!graph_created) { + TICK_END_EVENT(EEqn solve); + +#ifdef USE_GRAPH + if(!post_graph_created) { checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif + TICK_START_EVENT; correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_he.data(), dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_face_cell, dataBase_.d_he, dataBase_.d_boundary_he, 
dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_weight, dataBase_.d_boundary_T, dataBase_.d_boundary_y, d_boundary_heGradient, &thermo_); + TICK_END_EVENT(EEqn post process correctBC); + + TICK_START_EVENT; + // copy he to host + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_he, dataBase_.d_he, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_he, dataBase_.d_boundary_he, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + TICK_END_EVENT(EEqn post process copy back); -#ifndef TIME_GPU + TICK_START_EVENT; +#ifdef STREAM_ALLOCATOR + // thermophysical fields + checkCudaErrors(cudaFreeAsync(d_dpdt, dataBase_.stream)); + // fiv weight fieldsFree + //checkCudaErrors(cudaFreeAsync(d_phi_special_weight, dataBase_.stream)); + // boundary coeffs + checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream)); + + checkCudaErrors(cudaFreeAsync(d_boundary_heGradient, dataBase_.stream)); + + checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream)); +#endif + TICK_END_EVENT(EEqn post process free); +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_post)); checkCudaErrors(cudaGraphInstantiate(&graph_instance_post, graph_post, NULL, NULL, 0)); - graph_created = true; + post_graph_created = true; } checkCudaErrors(cudaGraphLaunch(graph_instance_post, dataBase_.stream)); #endif - - 
checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - if(!mpi_init_flag || myRank == 0) - fprintf(stderr, "eeqn post process time: %f(ms)\n\n",time_elapsed); + sync(); } void dfEEqn::eeqn_calculate_energy_gradient(dfThermo& GPUThermo, int num_cells, int num_species, @@ -252,6 +265,7 @@ void dfEEqn::eeqn_calculate_energy_gradient(dfThermo& GPUThermo, int num_cells, { int bou_offset = 0, gradient_offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; if (patch_type[i] == boundaryConditions::gradientEnergy) { GPUThermo.calculateEnergyGradient(patch_size[i], num_cells, num_species, num_boundary_surfaces, bou_offset, gradient_offset, face2Cells, T, p, y, boundary_delta_coeffs, boundary_p, boundary_y, boundary_thermo_gradient); @@ -336,45 +350,8 @@ void dfEEqn::sync() void dfEEqn::solve() { - sync(); - - if (num_iteration == 0) // first interation - { - printf("Initializing AmgX Linear Solver\n"); - ESolver->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A); - } - else - { - ESolver->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A); - } - ESolver->solve(dataBase_.num_cells, dataBase_.d_he, d_source); + dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A, dataBase_.d_he, d_source); num_iteration++; } -void dfEEqn::postProcess(double *h_he, double *h_boundary_he) -{ -#ifdef STREAM_ALLOCATOR - // thermophysical fields - checkCudaErrors(cudaFreeAsync(d_dpdt, dataBase_.stream)); - // fiv weight fieldsFree - //checkCudaErrors(cudaFreeAsync(d_phi_special_weight, dataBase_.stream)); - // boundary coeffs - checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream)); - 
checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream)); - - checkCudaErrors(cudaFreeAsync(d_boundary_heGradient, dataBase_.stream)); - - checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_internal_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_boundary_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream)); -#endif - - // copy he to host - checkCudaErrors(cudaMemcpyAsync(h_he, dataBase_.d_he, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - checkCudaErrors(cudaMemcpyAsync(h_boundary_he, dataBase_.d_boundary_he, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - sync(); -} +void dfEEqn::postProcess(double *h_he, double *h_boundary_he) {} diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 286583bc7..4ebecfa6c 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -16,11 +16,15 @@ #include #include +#include "AmgXSolver.H" +#include + +//#define DEBUG_ +//#define DEBUG_CHECK_LDU + extern int myRank; -extern int mpi_init_flag; #define GPU_DEBUG_ - #ifdef GPU_DEBUG_ #define DEBUG_TRACE fprintf(stderr, "myRank[%d] %s %d\n", myRank, __FILE__, __LINE__); #else @@ -59,6 +63,11 @@ inline void checkVectorEqual(int count, const double* basevec, double* vec, doub } } +enum AMGXSetting { + u_setting, + p_setting +}; + enum location { cpu, gpu @@ -146,6 +155,10 @@ struct dfMatrixDataBase int *d_csr_row_index= nullptr; int *d_csr_col_index= nullptr; + // amgx solvers + AmgXSolver *u_setting_solver = nullptr; + AmgXSolver *p_setting_solver = nullptr; + // constant fields - internal double *d_sf = nullptr; double *d_mag_sf = nullptr; @@ -277,6 +290,8 @@ struct dfMatrixDataBase std::vector patch_size, int num_species, double rdelta_t); 
void setConstantIndexes(const int *owner, const int *neighbor, const int *procRows, const int *procCols, int globalOffset); + void setAmgxSolvers(const std::string &mode_string, const std::string &u_setting_path, const std::string &p_setting_path); + void solve(int num_iteration, AMGXSetting setting, double *d_A, double *d_x, double *d_b); void setCyclicInfo(std::vector &cyclicNeighbor); // when use cyclic boundary void createConstantFieldsInternal(); diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index f813ae664..64dfd99fb 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -146,6 +146,25 @@ void dfMatrixDataBase::setConstantValues(int num_cells, int num_total_cells, int csr_value_vec_bytes = num_Nz * 3 * sizeof(double); } +void dfMatrixDataBase::setAmgxSolvers(const std::string &mode_string, const std::string &u_setting_path, const std::string &p_setting_path) { + // amgx solvers + u_setting_solver = new AmgXSolver(mode_string, u_setting_path, localRank); + p_setting_solver = new AmgXSolver(mode_string, p_setting_path, localRank); +} + +void dfMatrixDataBase::solve(int num_iteration, AMGXSetting setting, double *d_A, double *d_x, double *d_b) { + AmgXSolver *solver = (setting == AMGXSetting::u_setting) ? 
u_setting_solver : p_setting_solver; + if (num_iteration == 0) // first interation + { + solver->setOperator(num_cells, num_total_cells, num_Nz, d_csr_row_index, d_csr_col_index, d_A); + } + else + { + solver->updateOperator(num_cells, num_Nz, d_A); + } + solver->solve(num_cells, d_x, d_b); +} + void dfMatrixDataBase::setCyclicInfo(std::vector &cyclicNeighbor) { this->cyclicNeighbor = cyclicNeighbor; diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index 09d5d6281..5a61980f9 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -2,8 +2,84 @@ #include #include #include "dfThermo.H" -// #define TIME_GPU -// #define DEBUG_CHECK_LDU +class dfThermo; + +// macros need to be in a certain order + +// sequence 0: STREAM_ALLOCATOR can be open or not +#ifndef STREAM_ALLOCATOR + #define STREAM_ALLOCATOR +#endif + +// sequence 0: USE_GRAPH can be open or not +#ifndef USE_GRAPH + #define USE_GRAPH +#endif + +// sequence 0: TIME_GPU can be open or not +#ifndef TIME_GPU + #define TIME_GPU +#endif + +// sequence 1: TIME_GPU and USE_GRAPH can not be open at the same time +#if (defined TIME_GPU) && (defined USE_GRAPH) + #undef USE_GRAPH +#endif + +// sequence 2: STREAM_ALLOCATOR must be open if USE_GRAPH is open +#if (defined USE_GRAPH) && (!defined STREAM_ALLOCATOR) + #define STREAM_ALLOCATOR +#endif + +extern int myRank; + +#define PRINT_PTR(x) { \ + fprintf(stderr, "rank[%d], %s %d, print ptr %s: %p\n", myRank, __FILE__, __LINE__, #x, x); \ +} + +extern __global__ void warmup(); + +#ifdef TIME_GPU + #define WARM_UP \ + warmup<<<10, 1024, 0, stream>>>(); + + #define TICK_INIT_EVENT \ + float time_elapsed_kernel=0;\ + cudaEvent_t start_kernel, stop_kernel;\ + checkCudaErrors(cudaEventCreate(&start_kernel));\ + checkCudaErrors(cudaEventCreate(&stop_kernel)); + + #define TICK_START_EVENT \ + checkCudaErrors(cudaEventRecord(start_kernel,stream)); + + #define TICK_END_EVENT(prefix) \ + checkCudaErrors(cudaEventRecord(stop_kernel,stream));\ + 
checkCudaErrors(cudaEventSynchronize(start_kernel));\ + checkCudaErrors(cudaEventSynchronize(stop_kernel));\ + checkCudaErrors(cudaEventElapsedTime(&time_elapsed_kernel,start_kernel,stop_kernel));\ + fprintf(stderr, "rank[%d], name: %s, time: %lf(ms)\n", myRank, #prefix, time_elapsed_kernel); + +/* + // the usage description: + // if you want to profile the first kernel, please use WARM_UP before TICK_INIT_EVENT. + // otherwise there is no need to use WARM_UP + WARM_UP; + // init event + TICK_INIT_EVENT; + // start event + TICK_START_EVENT; + // call your kernel, or kernels, or wrapper functions, e.g.: + my_kernel<<>>(num, input, output); + // end event with your specified name string, e.g.: + TICK_END_EVENT(my_kernel); +*/ + +#else + #define WARM_UP + #define TICK_INIT_EVENT + #define TICK_START_EVENT + #define TICK_END_EVENT(prefix) +#endif // tools void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output); @@ -253,3 +329,4 @@ void fvMtx_flux(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces void solve_explicit_scalar(cudaStream_t stream, int num_cells, const double *diag, const double *source, double *psi); + diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index 98146436c..6e520f7c6 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -5,32 +5,10 @@ #include #include "cuda_profiler_api.h" -#ifdef TIME_GPU - #define TICK_INIT_EVENT \ - float time_elapsed_kernel=0;\ - cudaEvent_t start_kernel, stop_kernel;\ - checkCudaErrors(cudaEventCreate(&start_kernel));\ - checkCudaErrors(cudaEventCreate(&stop_kernel)); - - #define TICK_START_EVENT \ - checkCudaErrors(cudaEventRecord(start_kernel,0)); - - #define TICK_END_EVENT(prefix) \ - checkCudaErrors(cudaEventRecord(stop_kernel,0));\ - checkCudaErrors(cudaEventSynchronize(start_kernel));\ - checkCudaErrors(cudaEventSynchronize(stop_kernel));\ - checkCudaErrors(cudaEventElapsedTime(&time_elapsed_kernel,start_kernel,stop_kernel));\ 
- printf("try %s 执行时间:%lf(ms)\n", #prefix, time_elapsed_kernel); -#else - #define TICK_INIT_EVENT - #define TICK_START_EVENT - #define TICK_END_EVENT(prefix) -#endif - -__global__ void warmup(int num_cells) +__global__ void warmup() { int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) + if (index >= 10240) return; } @@ -363,10 +341,13 @@ void correct_boundary_conditions_processor_scalar(cudaStream_t stream, ncclComm_ correct_internal_boundary_field_scalar<<>>(num, offset, vf, boundary_cell_face, vf_boundary); + TICK_INIT_EVENT; + TICK_START_EVENT; checkNcclErrors(ncclGroupStart()); checkNcclErrors(ncclSend(vf_boundary + internal_start_index, num, ncclDouble, peer, comm, stream)); checkNcclErrors(ncclRecv(vf_boundary + neighbor_start_index, num, ncclDouble, peer, comm, stream)); checkNcclErrors(ncclGroupEnd()); + TICK_END_EVENT(nccl scalar); //checkCudaErrors(cudaStreamSynchronize(stream)); } @@ -382,12 +363,15 @@ void correct_boundary_conditions_processor_vector(cudaStream_t stream, ncclComm_ correct_internal_boundary_field_vector<<>>(num, offset, num_boundary_surfaces, num_cells, vf, boundary_cell_face, vf_boundary); + TICK_INIT_EVENT; + TICK_START_EVENT; checkNcclErrors(ncclGroupStart()); for (int i = 0; i < 3; i++) { checkNcclErrors(ncclSend(vf_boundary + num_boundary_surfaces * i + internal_start_index, num, ncclDouble, peer, comm, stream)); checkNcclErrors(ncclRecv(vf_boundary + num_boundary_surfaces * i + neighbor_start_index, num, ncclDouble, peer, comm, stream)); } checkNcclErrors(ncclGroupEnd()); + TICK_END_EVENT(nccl vector); //checkCudaErrors(cudaStreamSynchronize(stream)); } @@ -1282,13 +1266,17 @@ void fvc_grad_vector_correctBC_processor(cudaStream_t stream, ncclComm_t comm, correct_internal_boundary_field_tensor<<>>(num, offset, num_boundary_surfaces, num_cells, internal_grad, face2Cells, boundary_grad); + + TICK_INIT_EVENT; + TICK_START_EVENT; checkNcclErrors(ncclGroupStart()); for (int i = 0; i < 9; i++) { 
checkNcclErrors(ncclSend(boundary_grad + num_boundary_surfaces * i + internal_start_index, num, ncclDouble, peer, comm, stream)); checkNcclErrors(ncclRecv(boundary_grad + num_boundary_surfaces * i + neighbor_start_index, num, ncclDouble, peer, comm, stream)); } checkNcclErrors(ncclGroupEnd()); - checkCudaErrors(cudaStreamSynchronize(stream)); + TICK_END_EVENT(nccl tensor); + //checkCudaErrors(cudaStreamSynchronize(stream)); } __global__ void fvc_grad_cell_scalar_correctBC_zeroGradient(int num_cells, int num_boundary_surfaces, @@ -2094,13 +2082,10 @@ void field_add_scalar(cudaStream_t stream, int num, const double *input1, const double *input2, double *output, int num_boundary_surfaces, const double *boundary_input1, const double *boundary_input2, double *boundary_output) { - TICK_INIT_EVENT; size_t threads_per_block = 256; size_t blocks_per_grid = (std::max(num, num_boundary_surfaces) + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; field_add_scalar_kernel<<>>(num, num_boundary_surfaces, input1, input2, output, boundary_input1, boundary_input2, boundary_output); - TICK_END_EVENT(field_add_scalar_kernel); } void field_add_vector(cudaStream_t stream, @@ -2109,7 +2094,6 @@ void field_add_vector(cudaStream_t stream, { size_t threads_per_block = 256; size_t blocks_per_grid = (std::max(num_cells, num_boundary_surfaces) + threads_per_block - 1) / threads_per_block; - field_add_vector_kernel<<>>(num_cells, num_boundary_surfaces, input1, input2, output, boundary_input1, boundary_input2, boundary_output, sign); } @@ -2127,13 +2111,10 @@ void field_multiply_scalar(cudaStream_t stream, int num_cells, const double *input1, const double *input2, double *output, int num_boundary_surfaces, const double *boundary_input1, const double *boundary_input2, double *boundary_output) { - TICK_INIT_EVENT; size_t threads_per_block = 256; size_t blocks_per_grid = (std::max(num_cells, num_boundary_surfaces) + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; 
field_multiply_scalar_kernel<<>>(num_cells, num_boundary_surfaces, input1, input2, output, boundary_input1, boundary_input2, boundary_output); - TICK_END_EVENT(field_multiply_scalar_kernel); } void vector_half_mag_square(cudaStream_t stream, int num_cells, const double *vec_input, double *scalar_output, @@ -2177,13 +2158,10 @@ void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volu void fvc_to_source_scalar(cudaStream_t stream, int num_cells, const double *volume, const double *fvc_output, double *source, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 256; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_to_source_scalar_kernel<<>>(num_cells, volume, fvc_output, source, sign); - TICK_END_EVENT(fvc_to_source_scalar_kernel); } void ldu_to_csr_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surface, int num_Nz, @@ -2199,6 +2177,7 @@ void ldu_to_csr_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int int bou_offset = 0, ext_offset = 0; size_t threads_per_block, blocks_per_grid; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; if (patch_type[i] == boundaryConditions::processor || patch_type[i] == boundaryConditions::processorCyclic) { threads_per_block = 64; @@ -2222,6 +2201,7 @@ void ldu_to_csr_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int // add coeff to source and diagnal bou_offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::processor @@ -2250,21 +2230,16 @@ void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int num_bo const int* boundary_cell_face, const int *ldu_to_csr_index, const int *diag_to_csr_index, const double *ldu, const double *internal_coeffs, const double *boundary_coeffs, 
double *source, double *A) { - TICK_INIT_EVENT; // construct diag int nNz = num_cells + 2 * num_surfaces; size_t threads_per_block = 1024; size_t blocks_per_grid = (nNz + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; ldu_to_csr_kernel<<>>(nNz, ldu_to_csr_index, ldu, A); - TICK_END_EVENT(ldu_to_csr_kernel); // add coeff to source and diagnal blocks_per_grid = (num_boundary_surface + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; addBoundaryDiagSrc<<>>(num_cells, num_surfaces, num_boundary_surface, boundary_cell_face, internal_coeffs, boundary_coeffs, diag_to_csr_index, A, source); - TICK_END_EVENT(addBoundaryDiagSrc); } void update_boundary_coeffs_scalar(cudaStream_t stream, @@ -2279,6 +2254,7 @@ void update_boundary_coeffs_scalar(cudaStream_t stream, int offset = 0; int gradient_offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2328,6 +2304,7 @@ void correct_boundary_conditions_scalar(cudaStream_t stream, ncclComm_t comm, int offset = 0; int gradient_offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::zeroGradient @@ -2374,6 +2351,7 @@ void correct_boundary_conditions_vector(cudaStream_t stream, ncclComm_t comm, int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::zeroGradient @@ -2414,6 +2392,7 @@ void update_boundary_coeffs_vector(cudaStream_t stream, int num_boundary_surface int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; 
blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2453,50 +2432,30 @@ void fvm_ddt_vol_scalar_vol_scalar(cudaStream_t stream, int num_cells, double rD const double *rho, const double *rho_old, const double *vf, const double *volume, double *diag, double *source, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvm_ddt_vol_scalar_vol_scalar_kernel<<>>(num_cells, rDeltaT, rho, rho_old, vf, volume, diag, source, sign); - TICK_END_EVENT(fvm_ddt_vol_scalar_vol_scalar_kernel); } void fvm_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT, const double *vf_old, const double *volume, double *diag, double *source, double sign) { -#ifdef TIME_GPU - printf("#############kernel profile#############\n"); -#endif - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; -#ifdef TIME_GPU - printf("warm up ...\n"); - warmup<<>>(num_cells); -#endif - TICK_START_EVENT; fvm_ddt_scalar_kernel<<>>(num_cells, rDeltaT, vf_old, volume, diag, source, sign); - TICK_END_EVENT(fvm_ddt_scalar_kernel); } void fvm_ddt_vector(cudaStream_t stream, int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *volume, double *diag, double *source, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 64; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; -#ifdef TIME_GPU - printf("warm up ...\n"); - warmup<<>>(num_cells); -#endif - TICK_START_EVENT; fvm_ddt_vector_kernel<<>>(num_cells, rDeltaT, rho, rho_old, vf, volume, diag, source, sign); - TICK_END_EVENT(fvm_ddt_vector_kernel); } void fvm_div_scalar(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, @@ -2506,16 +2465,14 @@ void fvm_div_scalar(cudaStream_t stream, 
int num_surfaces, const int *lowerAddr, const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvm_div_scalar_internal<<>>(num_surfaces, lowerAddr, upperAddr, phi, weight, lower, upper, diag, sign); - TICK_END_EVENT(fvm_div_scalar_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2536,20 +2493,14 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surf const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 256; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; -#ifdef TIME_GPU - printf("warm up ...\n"); - warmup<<>>(num_surfaces); -#endif - TICK_START_EVENT; fvm_div_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, phi, weight, lower, upper, diag, sign); - TICK_END_EVENT(fvm_div_vector_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; fvm_div_vector_boundary<<>>(num_boundary_surfaces, patch_size[i], offset, @@ -2576,6 +2527,7 @@ void fvm_laplacian_scalar(cudaStream_t stream, int num_surfaces, int num_boundar weight, mag_sf, delta_coeffs, gamma, lower, upper, diag, sign); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + 
threads_per_block - 1) / threads_per_block; fvm_laplacian_scalar_boundary<<>>(patch_size[i], offset, @@ -2596,15 +2548,13 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundar const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvm_laplacian_internal<<>>(num_surfaces, lowerAddr, upperAddr, weight, mag_sf, delta_coeffs, gamma, lower, upper, diag, sign); - TICK_END_EVENT(fvm_laplacian_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2633,6 +2583,7 @@ void fvm_laplacian_surface_scalar_vol_scalar(cudaStream_t stream, int num_surfac int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; fvm_laplacian_surface_scalar_boundary<<>>(patch_size[i], offset, @@ -2651,13 +2602,10 @@ void fvc_ddt_vol_scalar_vol_scalar(cudaStream_t stream, int num_cells, double rD const double *rho, const double *rho_old, const double *vf, const double *vf_old, const double *volume, double *output, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_ddt_vol_scalar_vol_scalar_kernel<<>>(num_cells, rDeltaT, rho, rho_old, vf, vf_old, volume, output, sign); - TICK_END_EVENT(fvc_ddt_vol_scalar_vol_scalar_kernel); } void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, @@ -2670,17 +2618,15 @@ void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, const int 
*cyclicNeighbor, const int *patchSizeOffset, const double *boundary_deltaCoeffs, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 32; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_grad_vector_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output); - TICK_END_EVENT(fvc_grad_vector_internal); int offset = 0; // finish conctruct grad field except dividing cell volume for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2689,10 +2635,8 @@ void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, || patch_type[i] == boundaryConditions::calculated || patch_type[i] == boundaryConditions::cyclic) { // TODO: just vector version now - TICK_START_EVENT; fvc_grad_vector_boundary_zeroGradient<<>>(num_boundary_surfaces, num_cells, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output); - TICK_END_EVENT(fvc_grad_vector_boundary_zeroGradient); } else if (patch_type[i] == boundaryConditions::processor || patch_type[i] == boundaryConditions::processorCyclic) { fvc_grad_vector_boundary_processor<<>>(num_boundary_surfaces, num_cells, @@ -2709,23 +2653,20 @@ void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, // divide cell volume threads_per_block = 512; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; divide_cell_volume_tsr<<>>(num_cells, volume, output); - TICK_END_EVENT(divide_cell_volume_tsr); // correct boundary conditions offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now if (patch_type[i] == boundaryConditions::zeroGradient) { // TODO: just vector version now - 
TICK_START_EVENT; fvc_grad_vector_correctBC_zeroGradient<<>>(num_cells, num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, output, boundary_vf, boundary_Sf, boundary_mag_Sf, boundary_output); - TICK_END_EVENT(fvc_grad_vector_correctBC_zeroGradient); } else if (patch_type[i] == boundaryConditions::fixedValue) { // TODO: implement fixedValue version fvc_grad_vector_correctBC_fixedValue<<>>( @@ -2757,12 +2698,9 @@ void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, void scale_dev2T_tensor(cudaStream_t stream, int num_cells, const double *vf1, double *vf2, int num_boundary_surfaces, const double *boundary_vf1, double *boundary_vf2) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; scale_dev2t_tensor_kernel<<>>(num_cells, vf1, vf2); - TICK_END_EVENT(scale_dev2t_tensor_kernel); blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; scale_dev2t_tensor_kernel<<>>(num_boundary_surfaces, boundary_vf1, boundary_vf2); @@ -2773,15 +2711,13 @@ void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces int num_patches, const int *patch_size, const int *patch_type, const double *boundary_ssf, const double *volume, double *output, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_div_surface_scalar_internal<<>>(num_surfaces, lowerAddr, upperAddr, ssf, output, sign); - TICK_END_EVENT(fvc_div_surface_scalar_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; fvc_div_surface_scalar_boundary<<>>(patch_size[i], offset, boundary_cell_face, @@ -2802,16 +2738,14 @@ void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int 
num_surfaces, i const double *boundary_weight, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_div_cell_vector_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output, sign); - TICK_END_EVENT(fvc_div_cell_vector_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2837,16 +2771,14 @@ void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, i const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_div_cell_tensor_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output, sign); - TICK_END_EVENT(fvc_div_cell_tensor_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2855,11 +2787,9 @@ void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, i || patch_type[i] == boundaryConditions::calculated || patch_type[i] == boundaryConditions::cyclic) { // TODO: just vector version now - TICK_START_EVENT; fvc_div_cell_tensor_boundary_zeroGradient<<>>(num_cells, num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output, sign); - TICK_END_EVENT(fvc_div_cell_tensor_boundary_zeroGradient); } else if (patch_type[i] == boundaryConditions::processor 
|| patch_type[i] == boundaryConditions::processorCyclic) { fvc_div_cell_tensor_boundary_processor<<>>(num_cells, num_boundary_surfaces, @@ -2881,16 +2811,14 @@ void fvc_div_surface_scalar_vol_scalar(cudaStream_t stream, int num_surfaces, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_ssf, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_div_surface_scalar_vol_scalar_internal<<>>(num_surfaces, lowerAddr, upperAddr, weight, ssf, vf, output, sign); - TICK_END_EVENT(fvc_div_surface_scalar_vol_scalar_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; fvc_div_surface_scalar_vol_scalar_boundary<<>>( @@ -2908,16 +2836,14 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_patches, const int *patch_size, const int *patch_type, const double *boundary_weight, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_grad_scalar_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output, sign); - TICK_END_EVENT(fvc_grad_scalar_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just non-coupled patch type now @@ -2925,10 +2851,8 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, || patch_type[i] == boundaryConditions::fixedValue || patch_type[i] == boundaryConditions::calculated || 
patch_type[i] == boundaryConditions::cyclic) { - TICK_START_EVENT; fvc_grad_scalar_boundary_zeroGradient<<>>(num_boundary_surfaces, num_cells, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output, sign); - TICK_END_EVENT(fvc_grad_scalar_boundary_zeroGradient); } else if (patch_type[i] == boundaryConditions::processor || patch_type[i] == boundaryConditions::processorCyclic) { fvc_grad_scalar_boundary_processor<<>>(num_boundary_surfaces, num_cells, patch_size[i], offset, boundary_cell_face, @@ -2950,16 +2874,14 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, bool dividVol, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_grad_scalar_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output, sign); - TICK_END_EVENT(fvc_grad_scalar_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just non-coupled patch type now @@ -2967,10 +2889,8 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, || patch_type[i] == boundaryConditions::fixedValue || patch_type[i] == boundaryConditions::calculated || patch_type[i] == boundaryConditions::cyclic) { - TICK_START_EVENT; fvc_grad_scalar_boundary_zeroGradient<<>>(num_boundary_surfaces, num_cells, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output, sign); - TICK_END_EVENT(fvc_grad_scalar_boundary_zeroGradient); } else if (patch_type[i] == boundaryConditions::processor || patch_type[i] == boundaryConditions::processorCyclic) { fvc_grad_scalar_boundary_processor<<>>(num_boundary_surfaces, num_cells, patch_size[i], 
offset, boundary_cell_face, @@ -3011,6 +2931,7 @@ void fvc_grad_cell_scalar_withBC(cudaStream_t stream, ncclComm_t comm, const int // correct boundary conditions int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -3064,6 +2985,7 @@ void fvc_laplacian_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -3091,26 +3013,22 @@ void fvc_flux(cudaStream_t stream, int num_cells, int num_surfaces, int num_boun const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, double *boundary_output, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_flux_internal_kernel<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output, sign); - TICK_END_EVENT(fvc_flux_internal_kernel); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: maybe do not need loop boundarys if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue || patch_type[i] == boundaryConditions::gradientEnergy) { - TICK_START_EVENT; fvc_flux_boundary_kernel<<>>(num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, boundary_output, sign); - TICK_END_EVENT(fvc_flux_boundary_kernel); } else { // xxx fprintf(stderr, "%s %d, boundaryConditions other than zeroGradient are not support yet!\n", __FILE__, 
__LINE__); @@ -3132,6 +3050,7 @@ void fvc_interpolate(cudaStream_t stream, int num_cells, int num_surfaces, int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: maybe do not need loop boundarys @@ -3169,19 +3088,14 @@ void fvMtx_A(cudaStream_t stream, int num_cells, int num_surfaces, int num_bound double *A_pEqn) { checkCudaErrors(cudaMemcpyAsync(A_pEqn, diag, num_cells * sizeof(double), cudaMemcpyDeviceToDevice, stream)); - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; addAveInternaltoDiag<<>>(num_cells, num_boundary_surfaces, boundary_cell_face, internal_coeffs, A_pEqn); - TICK_END_EVENT(addAveInternaltoDiag); blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; divide_cell_volume_scalar<<>>(num_cells, volume, A_pEqn); - TICK_END_EVENT(divide_cell_volume_scalar); } void fvMtx_H(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, @@ -3191,42 +3105,34 @@ void fvMtx_H(cudaStream_t stream, int num_cells, int num_surfaces, int num_bound const double *lower, const double *upper, const double *source, const double *psi, double *H_pEqn, double *H_pEqn_perm) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; checkCudaErrors(cudaMemcpyAsync(H_pEqn, source, num_cells * 3 * sizeof(double), cudaMemcpyDeviceToDevice, stream)); addBoundaryDiag<<>>(num_cells, num_boundary_surfaces, boundary_cell_face, internal_coffs, psi, H_pEqn); - TICK_END_EVENT(addBoundaryDiag); blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; lduMatrix_H<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, lower, 
upper, psi, H_pEqn); - TICK_END_EVENT(lduMatrix_H); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just non-coupled patch type now if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { - TICK_START_EVENT; addBoundarySrc_unCoupled<<>>(num_cells, patch_size[i], offset, num_boundary_surfaces, boundary_cell_face, boundary_coeffs, H_pEqn); - TICK_END_EVENT(addBoundarySrc_unCoupled); } else { fprintf(stderr, "%s %d, boundaryConditions other than zeroGradient are not support yet!\n", __FILE__, __LINE__); } offset += patch_size[i]; } blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; divideVol_permute_vec<<>>(num_cells, volume, H_pEqn, H_pEqn_perm); - TICK_END_EVENT(divideVol_permute_vec); } void fvMtx_flux(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, @@ -3242,6 +3148,7 @@ void fvMtx_flux(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::processor @@ -3264,11 +3171,8 @@ void fvMtx_flux(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces void solve_explicit_scalar(cudaStream_t stream, int num_cells, const double *diag, const double *source, double *psi) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; solve_explicit_scalar_kernel<<>>(num_cells, diag, source, psi); - TICK_END_EVENT(solve_explicit_scalar_kernel); } diff --git a/src_gpu/dfRhoEqn.H b/src_gpu/dfRhoEqn.H index fe22f2aa2..c38347977 100644 --- a/src_gpu/dfRhoEqn.H +++ 
b/src_gpu/dfRhoEqn.H @@ -9,10 +9,13 @@ private: dfMatrixDataBase &dataBase_; // cuda resource + cudaStream_t stream; +#ifdef USE_GRAPH // one graph for one eqn before using self-developed solver cudaGraph_t graph; cudaGraphExec_t graph_instance; bool graph_created=false; +#endif // constant fields - boundary std::vector patch_type; @@ -32,6 +35,7 @@ public: // member function // initialization + void setConstantValues(); void setConstantFields(const std::vector patch_type); void initNonConstantFields(const double *rho, const double *phi, const double *boundary_rho, const double *boudnary_phi); diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu/dfRhoEqn.cu index 8a1d2e36e..57d32c9a4 100644 --- a/src_gpu/dfRhoEqn.cu +++ b/src_gpu/dfRhoEqn.cu @@ -9,6 +9,10 @@ void dfRhoEqn::createNonConstantLduAndCsrFields() #endif } +void dfRhoEqn::setConstantValues() { + this->stream = dataBase_.stream; +} + void dfRhoEqn::setConstantFields(const std::vector patch_type) { this->patch_type = patch_type; } @@ -22,10 +26,12 @@ void dfRhoEqn::initNonConstantFields(const double *rho, const double *phi, } void dfRhoEqn::cleanCudaResources() { +#ifdef USE_GRAPH if (graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance)); checkCudaErrors(cudaGraphDestroy(graph)); } +#endif } void dfRhoEqn::preProcess() @@ -34,18 +40,13 @@ void dfRhoEqn::preProcess() void dfRhoEqn::process() { - // 使用event计算时间 - float time_elapsed=0; - cudaEvent_t start,stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - checkCudaErrors(cudaEventRecord(start,0)); - -// #ifndef TIME_GPU -// if(!graph_created) { -// DEBUG_TRACE; -// checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); -// #endif + TICK_INIT_EVENT; + TICK_START_EVENT; +#ifdef USE_GRAPH + if(!graph_created) { + DEBUG_TRACE; + checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); +#endif #ifdef STREAM_ALLOCATOR 
checkCudaErrors(cudaMallocAsync((void**)&d_source, dataBase_.cell_value_bytes, dataBase_.stream)); @@ -69,21 +70,25 @@ void dfRhoEqn::process() dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_face_cell, dataBase_.d_rho, dataBase_.d_boundary_rho, dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_weight); - -// #ifndef TIME_GPU -// checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); -// checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0)); -// graph_created = true; -// } -// checkCudaErrors(cudaGraphLaunch(graph_instance, dataBase_.stream)); -// #endif - - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - if(!mpi_init_flag || myRank == 0) - fprintf(stderr, "rhoEqn process time:%f(ms)\n\n",time_elapsed); +#ifdef USE_GRAPH + checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); + checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0)); + graph_created = true; + } + checkCudaErrors(cudaGraphLaunch(graph_instance, dataBase_.stream)); +#endif + TICK_END_EVENT(rhoEqn process); + + TICK_START_EVENT; +#ifdef STREAM_ALLOCATOR + checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_diag, dataBase_.stream)); +#endif + TICK_END_EVENT(rhoEqn post process free); + TICK_START_EVENT; + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_rho, dataBase_.d_rho, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + TICK_END_EVENT(rhoEqn post process copy back); + sync(); } void dfRhoEqn::sync() @@ -96,16 +101,7 @@ void dfRhoEqn::solve() solve_explicit_scalar(dataBase_.stream, dataBase_.num_cells, d_diag, d_source, dataBase_.d_rho); } -void dfRhoEqn::postProcess(double 
*h_rho) -{ -#ifdef STREAM_ALLOCATOR - checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_diag, dataBase_.stream)); -#endif - checkCudaErrors(cudaMemcpyAsync(h_rho, dataBase_.d_rho, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - sync(); - // TODO: correct boundary conditions -} +void dfRhoEqn::postProcess(double *h_rho) {} void dfRhoEqn::compareResult(const double *diag, const double *source, bool printFlag) { diff --git a/src_gpu/dfThermo.H b/src_gpu/dfThermo.H index b833f937a..e1f9c0f58 100644 --- a/src_gpu/dfThermo.H +++ b/src_gpu/dfThermo.H @@ -1,6 +1,7 @@ #pragma once #include "dfMatrixDataBase.H" +#include "dfMatrixOpBase.H" void init_const_coeff_ptr(std::vector>& nasa_coeffs, std::vector>& viscosity_coeffs, std::vector>& thermal_conductivity_coeffs, std::vector>& binary_diffusion_coeffs, diff --git a/src_gpu/dfThermo.cu b/src_gpu/dfThermo.cu index 0cd3aa992..9e3aba78d 100644 --- a/src_gpu/dfThermo.cu +++ b/src_gpu/dfThermo.cu @@ -6,28 +6,6 @@ #include #include "device_launch_parameters.h" -#ifdef TIME_GPU - #define TICK_INIT_EVENT \ - float time_elapsed_kernel=0;\ - cudaEvent_t start_kernel, stop_kernel;\ - checkCudaErrors(cudaEventCreate(&start_kernel));\ - checkCudaErrors(cudaEventCreate(&stop_kernel)); - - #define TICK_START_EVENT \ - checkCudaErrors(cudaEventRecord(start_kernel,0)); - - #define TICK_END_EVENT(prefix) \ - checkCudaErrors(cudaEventRecord(stop_kernel,0));\ - checkCudaErrors(cudaEventSynchronize(start_kernel));\ - checkCudaErrors(cudaEventSynchronize(stop_kernel));\ - checkCudaErrors(cudaEventElapsedTime(&time_elapsed_kernel,start_kernel,stop_kernel));\ - printf("try %s 执行时间:%lf(ms)\n", #prefix, time_elapsed_kernel); -#else - #define TICK_INIT_EVENT - #define TICK_START_EVENT - #define TICK_END_EVENT(prefix) -#endif - #define GAS_CANSTANT 8314.46261815324 #define SQRT8 2.8284271247461903 #define NUM_SPECIES 7 @@ -45,8 +23,8 @@ void init_const_coeff_ptr(std::vector>& 
nasa_coeffs, std::ve std::vector>& thermal_conductivity_coeffs, std::vector>& binary_diffusion_coeffs, std::vector& molecular_weights) { - double *d_tmp; - checkCudaErrors(cudaMalloc((void**)&d_tmp, sizeof(double) * NUM_SPECIES * 15)); + //double *d_tmp; + //checkCudaErrors(cudaMalloc((void**)&d_tmp, sizeof(double) * NUM_SPECIES * 15)); double nasa_coeffs_tmp[NUM_SPECIES*15]; double viscosity_coeffs_tmp[NUM_SPECIES*5]; double thermal_conductivity_coeffs_tmp[NUM_SPECIES*5]; @@ -366,6 +344,7 @@ void dfThermo::setConstantValue(std::string mechanism_file, int num_cells, int n initCoeffsfromBinaryFile(fp); stream = dataBase_.stream; +#ifndef STREAM_ALLOCATOR checkCudaErrors(cudaMalloc((void**)&d_mole_fraction, sizeof(double) * num_species * num_cells)); checkCudaErrors(cudaMalloc((void**)&d_boundary_mole_fraction, sizeof(double) * num_species * dataBase_.num_boundary_surfaces)); checkCudaErrors(cudaMalloc((void**)&d_mean_mole_weight, sizeof(double) * num_cells)); @@ -377,10 +356,10 @@ void dfThermo::setConstantValue(std::string mechanism_file, int num_cells, int n checkCudaErrors(cudaMalloc((void**)&d_boundary_species_viscosities, sizeof(double) * num_species * dataBase_.num_boundary_surfaces)); checkCudaErrors(cudaMalloc((void**)&d_species_thermal_conductivities, sizeof(double) * num_species * num_cells)); checkCudaErrors(cudaMalloc((void**)&d_boundary_species_thermal_conductivities, sizeof(double) * num_species * dataBase_.num_boundary_surfaces)); +#endif checkCudaErrors(cudaMalloc((void**)&d_psip0, sizeof(double) * num_cells)); checkCudaErrors(cudaMalloc((void**)&d_boundary_psip0, sizeof(double) * dataBase_.num_boundary_surfaces)); - std::cout << "dfThermo initialized" << std::endl; } @@ -454,55 +433,40 @@ void dfThermo::calculateTPolyGPU(int threads_per_block, int num_thread, int num_ void dfThermo::calculatePsiGPU(int threads_per_block, int num_thread, const double *T, const double *mean_mole_weight, double *d_psi, int offset) { - TICK_INIT_EVENT; size_t 
blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; calculate_psi_kernel<<>>(num_thread, offset, T, mean_mole_weight, d_psi); - TICK_END_EVENT(calculate_psi_kernel); } void dfThermo::calculateRhoGPU(int threads_per_block, int num_thread, const double *p, const double *psi, double *rho, int offset) { - TICK_INIT_EVENT; size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; calculate_rho_kernel<<>>(num_cells, offset, p, psi, rho); - TICK_END_EVENT(calculate_rho_kernel); } void dfThermo::calculateViscosityGPU(int threads_per_block, int num_thread, int num_total, const double *T, const double *mole_fraction, const double *T_poly, double *species_viscosities, double *viscosity, int offset) { - TICK_INIT_EVENT; size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; calculate_viscosity_kernel<<>>(num_thread, num_total, num_species, offset, T_poly, T, mole_fraction, species_viscosities, viscosity); - TICK_END_EVENT(calculate_viscosity_kernel); } void dfThermo::calculateThermoConductivityGPU(int threads_per_block, int num_thread, int num_total, const double *T, const double *T_poly, const double *d_y, const double *mole_fraction, double *species_thermal_conductivities, double *thermal_conductivity, int offset) { - TICK_INIT_EVENT; size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; calculate_thermoConductivity_kernel<<>>(num_thread, num_total, num_species, offset, d_nasa_coeffs, d_y, T_poly, T, mole_fraction, species_thermal_conductivities, thermal_conductivity); - TICK_END_EVENT(calculate_thermoConductivity_kernel); } void dfThermo::calculateTemperatureGPU(int threads_per_block, int num_thread, int num_total, const double *T_init, const double *target_h, double *T, const double *d_mass_fraction, int offset, double atol, double rtol, int max_iter) { - TICK_INIT_EVENT; size_t 
blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; calculate_temperature_kernel<<>>(num_thread, num_total, num_species, offset, T_init, target_h, d_mass_fraction, T, atol, rtol, max_iter); - TICK_END_EVENT(calculate_temperature_kernel); } void dfThermo::calculateEnthalpyGPU(int threads_per_block, int num_thread, const double *T, double *enthalpy, const double *d_mass_fraction, int offset) @@ -530,6 +494,20 @@ void dfThermo::updateEnergy() void dfThermo::correctThermo() { +#ifdef STREAM_ALLOCATOR + checkCudaErrors(cudaMallocAsync((void**)&d_mole_fraction, sizeof(double) * num_species * num_cells, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_mole_fraction, sizeof(double) * num_species * dataBase_.num_boundary_surfaces, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_mean_mole_weight, sizeof(double) * num_cells, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_mean_mole_weight, sizeof(double) * dataBase_.num_boundary_surfaces, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_T_poly, sizeof(double) * 5 * num_cells, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_T_poly, sizeof(double) * 5 * dataBase_.num_boundary_surfaces, stream)); + + checkCudaErrors(cudaMallocAsync((void**)&d_species_viscosities, sizeof(double) * num_species * num_cells, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_species_viscosities, sizeof(double) * num_species * dataBase_.num_boundary_surfaces, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_species_thermal_conductivities, sizeof(double) * num_species * num_cells, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_species_thermal_conductivities, sizeof(double) * num_species * dataBase_.num_boundary_surfaces, stream)); +#endif + setMassFraction(dataBase_.d_y, dataBase_.d_boundary_y); // internal field int cell_thread = 512, boundary_thread = 32; @@ -544,6 +522,7 @@ void dfThermo::correctThermo() // 
boundary field int offset = 0; for (int i = 0; i < dataBase_.num_patches; i++) { + if (dataBase_.patch_size[i] == 0) continue; if (dataBase_.patch_type_T[i] == boundaryConditions::fixedValue) { calculateTPolyGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.num_boundary_surfaces, dataBase_.d_boundary_T, d_boundary_T_poly, offset); calculateEnthalpyGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.d_boundary_T, dataBase_.d_boundary_he, @@ -587,6 +566,19 @@ void dfThermo::correctThermo() } else { offset += dataBase_.patch_size[i]; } } +#ifdef STREAM_ALLOCATOR + checkCudaErrors(cudaFreeAsync(d_mole_fraction, stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_mole_fraction, stream)); + checkCudaErrors(cudaFreeAsync(d_mean_mole_weight, stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_mean_mole_weight, stream)); + checkCudaErrors(cudaFreeAsync(d_T_poly, stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_T_poly, stream)); + + checkCudaErrors(cudaFreeAsync(d_species_viscosities, stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_species_viscosities, stream)); + checkCudaErrors(cudaFreeAsync(d_species_thermal_conductivities, stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_species_thermal_conductivities, stream)); +#endif } void dfThermo::updateRho() @@ -620,7 +612,6 @@ void dfThermo::calculateEnergyGradient(int num_thread, int num_cells, int num_sp { size_t threads_per_block = 256; size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; - calculate_energy_gradient_kernel<<>>(num_thread, num_cells, num_species, num_boundary_surfaces, bou_offset, gradient_offset, face2Cells, T, p, y, boundary_p, boundary_y, boundary_delta_coeffs, boundary_thermo_gradient); } diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index 3525ef8ec..ef3c8133f 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -11,10 +11,14 @@ private: dfMatrixDataBase &dataBase_; // cuda resource + cudaStream_t stream; +#ifdef USE_GRAPH // one graph for one eqn 
before using self-developed solver cudaGraph_t graph_pre, graph_post; cudaGraphExec_t graph_instance_pre, graph_instance_post; - bool graph_created=false; + bool pre_graph_created=false; + bool post_graph_created=false; +#endif // constant values -- basic std::string mode_string; @@ -111,7 +115,7 @@ public: void preProcessForRhoEqn(const double *h_rho, const double *h_phi, const double *h_boundary_phi); void preProcess(const double *h_u, const double *h_boundary_u, const double *h_p, const double *h_boundary_p, const double *h_nu_eff, const double *h_boundary_nu_eff); void process(); - void postProcess(double *h_u); + void postProcess(); void A(double *Psi); void H(double *Psi); diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 26bdfb3d8..131c566d8 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -303,6 +303,7 @@ __global__ void ueqn_divide_cell_volume_vec(int num_cells, const double* volume, } void dfUEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path) { + this->stream = dataBase_.stream; this->mode_string = mode_string; this->setting_path = setting_path; UxSolver = new AmgXSolver(mode_string, setting_path, dataBase_.localRank); @@ -335,8 +336,8 @@ void dfUEqn::createNonConstantFieldsInternal() { checkCudaErrors(cudaMalloc((void**)&d_grad_u, dataBase_.cell_value_tsr_bytes)); checkCudaErrors(cudaMalloc((void**)&d_rho_nueff, dataBase_.cell_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_fvc_output, dataBase_.cell_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_u_host_order, dataBase_.cell_value_vec_bytes)); #endif + checkCudaErrors(cudaMalloc((void**)&d_u_host_order, dataBase_.cell_value_vec_bytes)); // computed on CPU, used on GPU, need memcpyh2d checkCudaErrors(cudaMallocHost((void**)&h_nu_eff , dataBase_.cell_value_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_A_pEqn , dataBase_.cell_value_bytes)); @@ -358,9 +359,9 @@ void dfUEqn::createNonConstantFieldsBoundary() { 
checkCudaErrors(cudaMalloc((void**)&d_value_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes)); +#endif // intermediate boundary checkCudaErrors(cudaMalloc((void**)&d_boundary_u_host_order, dataBase_.boundary_surface_value_vec_bytes)); -#endif // computed on CPU, used on GPU, need memcpyh2d checkCudaErrors(cudaMallocHost((void**)&h_boundary_nu_eff, dataBase_.boundary_surface_value_bytes)); @@ -384,10 +385,6 @@ void dfUEqn::createNonConstantLduAndCsrFields() { checkCudaErrors(cudaMalloc((void**)&d_A_pEqn, dataBase_.cell_value_bytes)); // TODO: delete redundant variables checkCudaErrors(cudaMalloc((void**)&d_H_pEqn, dataBase_.cell_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_H_pEqn_perm, dataBase_.cell_value_vec_bytes)); - - checkCudaErrors(cudaMalloc((void**)&d_u_host_order, dataBase_.cell_value_vec_bytes)); - // intermediate boundary fields - checkCudaErrors(cudaMalloc((void**)&d_boundary_u_host_order, dataBase_.boundary_surface_value_vec_bytes)); } void dfUEqn::initNonConstantFieldsInternal(const double *u, const double *boundary_u) @@ -407,12 +404,16 @@ void dfUEqn::initNonConstantFieldsBoundary() { } void dfUEqn::cleanCudaResources() { - if (graph_created) { +#ifdef USE_GRAPH + if (pre_graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance_pre)); - checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_pre)); + } + if (post_graph_created) { + checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_post)); } +#endif } void dfUEqn::preProcessForRhoEqn(const double *h_rho, const double *h_phi, const double *h_boundary_phi) { @@ -427,18 +428,13 @@ void dfUEqn::preProcess(const double *h_u, const double *h_boundary_u, const dou } 
void dfUEqn::process() { - //使用event计算时间 - float time_elapsed=0; - cudaEvent_t start,stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - checkCudaErrors(cudaEventRecord(start,0)); - -// #ifndef TIME_GPU -// if(!graph_created) { -// DEBUG_TRACE; -// checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); -// #endif + TICK_INIT_EVENT; + TICK_START_EVENT; +#ifdef USE_GRAPH + if(!pre_graph_created) { + DEBUG_TRACE; + checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); +#endif #ifdef STREAM_ALLOCATOR // thermophysical fields @@ -540,58 +536,70 @@ void dfUEqn::process() { d_ldu, d_extern, d_source, d_internal_coeffs, d_boundary_coeffs, dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), d_A, d_b); #endif -// #ifndef TIME_GPU -// checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_pre)); -// checkCudaErrors(cudaGraphInstantiate(&graph_instance_pre, graph_pre, NULL, NULL, 0)); -// } -// DEBUG_TRACE; -// checkCudaErrors(cudaGraphLaunch(graph_instance_pre, dataBase_.stream)); -// #endif - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - if(!mpi_init_flag || myRank == 0) - fprintf(stderr, "ueqn assembly time:%f(ms)\n",time_elapsed); - - checkCudaErrors(cudaEventRecord(start,0)); +#ifdef USE_GRAPH + checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_pre)); + checkCudaErrors(cudaGraphInstantiate(&graph_instance_pre, graph_pre, NULL, NULL, 0)); + pre_graph_created = true; + } + DEBUG_TRACE; + checkCudaErrors(cudaGraphLaunch(graph_instance_pre, dataBase_.stream)); +#endif + TICK_END_EVENT(UEqn assembly); + + TICK_START_EVENT; #ifndef DEBUG_CHECK_LDU solve(); #endif - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - 
checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - if(!mpi_init_flag || myRank == 0) - fprintf(stderr, "ueqn solve time:%f(ms)\n",time_elapsed); - - checkCudaErrors(cudaEventRecord(start,0)); -// #ifndef TIME_GPU -// if(!graph_created) { -// checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); -// #endif + TICK_END_EVENT(UEqn solve); +#ifdef USE_GRAPH + if(!post_graph_created) { + checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); +#endif + + TICK_START_EVENT; correct_boundary_conditions_vector(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_boundary_surfaces, dataBase_.num_cells, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_weight, dataBase_.d_boundary_face_cell, dataBase_.d_u, dataBase_.d_boundary_u, dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data()); vector_half_mag_square(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, dataBase_.d_k, dataBase_.num_boundary_surfaces, dataBase_.d_boundary_u, dataBase_.d_boundary_k); + TICK_END_EVENT(UEqn post process correctBC); -// #ifndef TIME_GPU -// checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_post)); -// checkCudaErrors(cudaGraphInstantiate(&graph_instance_post, graph_post, NULL, NULL, 0)); -// graph_created = true; -// } -// checkCudaErrors(cudaGraphLaunch(graph_instance_post, dataBase_.stream)); -// #endif + TICK_START_EVENT; +#ifdef STREAM_ALLOCATOR + // free + // thermophysical fields + checkCudaErrors(cudaFreeAsync(d_nu_eff, dataBase_.stream)); + // intermediate fields + checkCudaErrors(cudaFreeAsync(d_grad_u, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_rho_nueff, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_fvc_output, dataBase_.stream)); - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - 
checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - if(!mpi_init_flag || myRank == 0) - fprintf(stderr, "ueqn post process time:%f(ms)\n\n",time_elapsed); + // thermophysical fields + checkCudaErrors(cudaFreeAsync(d_boundary_nu_eff, dataBase_.stream)); + // intermediate fields + checkCudaErrors(cudaFreeAsync(d_boundary_grad_u, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_rho_nueff, dataBase_.stream)); + // boundary coeff fields + checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream)); + + checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream)); +#endif + TICK_END_EVENT(UEqn post process free); + +#ifdef USE_GRAPH + checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_post)); + checkCudaErrors(cudaGraphInstantiate(&graph_instance_post, graph_post, NULL, NULL, 0)); + post_graph_created = true; + } + checkCudaErrors(cudaGraphLaunch(graph_instance_post, dataBase_.stream)); +#endif + sync(); } void dfUEqn::sync() @@ -600,114 +608,19 @@ void dfUEqn::sync() } void dfUEqn::solve() { - sync(); - - // double h_A[dataBase_.num_Nz * 3]; - // double h_b[dataBase_.num_cells * 3]; - // checkCudaErrors(cudaMemcpyAsync(h_A, d_A, dataBase_.csr_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - // checkCudaErrors(cudaMemcpyAsync(h_b, d_b, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - // sync(); - - // char *input_file = "of_output.txt"; - // FILE *fp = fopen(input_file, "rb+"); - // if (fp == NULL) - // { - // fprintf(stderr, "Failed to open input file: %s!\n", input_file); - // } - // int readfile = 0; - // double *of_b = new double[3 * 
dataBase_.num_cells]; - // double *of_A = new double[3 * dataBase_.num_Nz]; - // readfile = fread(of_b, dataBase_.cell_value_vec_bytes, 1, fp); - // readfile = fread(of_A, dataBase_.csr_value_vec_bytes, 1, fp); - - // std::vector h_A_of_init_vec(3 * dataBase_.num_Nz); - // std::copy(of_A, of_A + dataBase_.num_Nz * 3, h_A_of_init_vec.begin()); - - // // std::vector h_A_of_vec_perm(3 * dataBase_.num_Nz, 0); - // // for (int i = 0; i < dataBase_.num_Nz; i++) - // // { - // // h_A_of_vec_perm[i] = h_A_of_init_vec[dataBase_.lduCSRIndex[i]]; - // // h_A_of_vec_perm[i + dataBase_.num_Nz] = h_A_of_init_vec[dataBase_.lduCSRIndex[i] + dataBase_.num_Nz]; - // // h_A_of_vec_perm[i + 2 * dataBase_.num_Nz] = h_A_of_init_vec[dataBase_.lduCSRIndex[i] + 2 * dataBase_.num_Nz]; - // // } - // std::vector h_A_of_vec_1mtx(dataBase_.num_Nz, 0); - // for (int i = 0; i < dataBase_.num_Nz; i++) { - // h_A_of_vec_1mtx[i] = h_A_of_init_vec[dataBase_.lduCSRIndex[i]]; - // } - - - // // b - // std::vector h_b_of_init_vec(3 * dataBase_.num_cells); - // std::copy(of_b, of_b + 3 * dataBase_.num_cells, h_b_of_init_vec.begin()); - // std::vector h_b_of_vec; - // for (int i = 0; i < 3 * dataBase_.num_cells; i += 3) - // { - // h_b_of_vec.push_back(h_b_of_init_vec[i]); - // } - // // fill RHS_y - // for (int i = 1; i < 3 * dataBase_.num_cells; i += 3) - // { - // h_b_of_vec.push_back(h_b_of_init_vec[i]); - // } - // // fill RHS_z - // for (int i = 2; i < 3 * dataBase_.num_cells; i += 3) - // { - // h_b_of_vec.push_back(h_b_of_init_vec[i]); - // } - - // fprintf(stderr, "check of h_A_csr\n"); - // checkVectorEqual(dataBase_.num_Nz, h_A_of_vec_1mtx.data(), h_A, 1e-14, false); - // fprintf(stderr, "check of h_b\n"); - // checkVectorEqual(3 * dataBase_.num_cells, h_b_of_vec.data(), h_b, 1e-14, false); - - - if (num_iteration == 0) // first interation - { - printf("Initializing AmgX Linear Solver\n"); - UxSolver->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, 
dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A); - UySolver->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A + dataBase_.num_Nz); - UzSolver->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A + 2 * dataBase_.num_Nz); - } - else - { - UxSolver->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A); - UySolver->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A + dataBase_.num_Nz); - UzSolver->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A + 2 * dataBase_.num_Nz); - } - UxSolver->solve(dataBase_.num_cells, dataBase_.d_u, d_b); - UySolver->solve(dataBase_.num_cells, dataBase_.d_u + dataBase_.num_cells, d_b + dataBase_.num_cells); - UzSolver->solve(dataBase_.num_cells, dataBase_.d_u + 2 * dataBase_.num_cells, d_b + 2 * dataBase_.num_cells); + dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A, dataBase_.d_u, d_b); + dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A + dataBase_.num_Nz, dataBase_.d_u + dataBase_.num_cells, d_b + dataBase_.num_cells); + dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A + 2 * dataBase_.num_Nz, dataBase_.d_u + 2 * dataBase_.num_cells, d_b + 2 * dataBase_.num_cells); num_iteration++; } -void dfUEqn::postProcess(double *h_u) { +void dfUEqn::postProcess() { + // postProcess of dfUEqn can not be moved to the end of dfUEqn::process(), + // because dataBase_.d_u is modified in dfpEqn and we only need the result of the last change + // copy u and boundary_u to host permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, d_u_host_order); - checkCudaErrors(cudaMemcpyAsync(h_u, d_u_host_order, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); - -#ifdef STREAM_ALLOCATOR - // free - // thermophysical fields - 
checkCudaErrors(cudaFreeAsync(d_nu_eff, dataBase_.stream)); - // intermediate fields - checkCudaErrors(cudaFreeAsync(d_grad_u, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_rho_nueff, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_fvc_output, dataBase_.stream)); - - // thermophysical fields - checkCudaErrors(cudaFreeAsync(d_boundary_nu_eff, dataBase_.stream)); - // intermediate fields - checkCudaErrors(cudaFreeAsync(d_boundary_grad_u, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_boundary_rho_nueff, dataBase_.stream)); - // boundary coeff fields - checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream)); - - checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream)); -#endif + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_u, d_u_host_order, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + sync(); } void dfUEqn::A(double *Psi) { @@ -761,6 +674,7 @@ void dfUEqn::UEqnGetHbyA(cudaStream_t stream, ncclComm_t comm, const int *neighb int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just non-coupled patch type now @@ -796,6 +710,7 @@ void dfUEqn::UEqnGetHbyA(cudaStream_t stream, ncclComm_t comm, const int *neighb // constrainHbyA offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just non-coupled patch type now @@ -829,6 +744,7 @@ void dfUEqn::ueqn_ldu_to_csr(cudaStream_t stream, int num_cells, int num_surface int bou_offset 
= 0, ext_offset = 0; size_t threads_per_block, blocks_per_grid; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; if (patch_type[i] == boundaryConditions::processor || patch_type[i] == boundaryConditions::processorCyclic) { threads_per_block = 64; @@ -858,6 +774,7 @@ void dfUEqn::ueqn_ldu_to_csr(cudaStream_t stream, int num_cells, int num_surface // add coeff to source and diagnal bou_offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::processor diff --git a/src_gpu/dfYEqn.H b/src_gpu/dfYEqn.H index 1064e313c..ab89abde6 100644 --- a/src_gpu/dfYEqn.H +++ b/src_gpu/dfYEqn.H @@ -11,10 +11,13 @@ private: dfMatrixDataBase &dataBase_; // cuda resource + cudaStream_t stream; +#ifdef USE_GRAPH // one graph for one eqn before using self-developed solver cudaGraph_t graph; cudaGraphExec_t graph_instance; bool graph_created=false; +#endif // constant values -- basic std::string mode_string; @@ -117,7 +120,7 @@ public: const double *h_mut_sct, const double *h_boundary_mut_sct); void process(); void postProcess(double *h_y, double *h_boundary_y); - void solve(int solverIndex, int speciesIndex); + void solve(int speciesIndex); void sync(); // 方程特化版离散函数 diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu index ddfb19455..c81977b9c 100644 --- a/src_gpu/dfYEqn.cu +++ b/src_gpu/dfYEqn.cu @@ -318,6 +318,7 @@ double* dfYEqn::getFieldPointer(const char* fieldAlias, location loc, position p } void dfYEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path, const int inertIndex) { + this->stream = dataBase_.stream; this->mode_string = mode_string; this->setting_path = setting_path; this->inertIndex = inertIndex; @@ -405,13 +406,6 @@ void dfYEqn::createNonConstantLduAndCsrFields() { void dfYEqn::initNonConstantFieldsInternal(const double *y) { 
checkCudaErrors(cudaMemcpyAsync(dataBase_.d_y, y, dataBase_.cell_value_bytes * dataBase_.num_species, cudaMemcpyHostToDevice, dataBase_.stream)); - - // UnityLewis - checkCudaErrors(cudaMemsetAsync(d_hai, 0, dataBase_.cell_value_bytes * dataBase_.num_species, dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_boundary_hai, 0, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, dataBase_.stream)); - // laminar - checkCudaErrors(cudaMemsetAsync(d_mut_sct, 0, dataBase_.cell_value_bytes, dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_boundary_mut_sct, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); } void dfYEqn::initNonConstantFieldsBoundary(const double *boundary_y) { @@ -429,10 +423,12 @@ void dfYEqn::initNonConstantFieldsBoundary(const double *boundary_y) { } void dfYEqn::cleanCudaResources() { +#ifdef USE_GRAPH if (graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance)); checkCudaErrors(cudaGraphDestroy(graph)); } +#endif } void dfYEqn::preProcess(const double *h_rhoD, const double *h_boundary_rhoD, @@ -444,14 +440,9 @@ void dfYEqn::preProcess(const double *h_rhoD, const double *h_boundary_rhoD, } void dfYEqn::process() { - //使用event计算时间 - float time_elapsed=0; - cudaEvent_t start,stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - checkCudaErrors(cudaEventRecord(start,0)); - -#ifndef TIME_GPU + TICK_INIT_EVENT; + TICK_START_EVENT; +#ifdef USE_GRAPH if(!graph_created) { DEBUG_TRACE; checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); @@ -486,6 +477,14 @@ void dfYEqn::process() { checkCudaErrors(cudaMallocAsync((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMallocAsync((void**)&d_A, dataBase_.csr_value_bytes, dataBase_.stream)); #endif + + // UnityLewis + checkCudaErrors(cudaMemsetAsync(d_hai, 0, dataBase_.cell_value_bytes * dataBase_.num_species, dataBase_.stream)); + 
checkCudaErrors(cudaMemsetAsync(d_boundary_hai, 0, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, dataBase_.stream)); + // laminar + checkCudaErrors(cudaMemsetAsync(d_mut_sct, 0, dataBase_.cell_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_boundary_mut_sct, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(dataBase_.d_diff_alphaD, 0, dataBase_.cell_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(dataBase_.d_boundary_diff_alphaD, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(d_grad_y, 0, dataBase_.cell_value_vec_bytes * dataBase_.num_species, dataBase_.stream)); @@ -542,7 +541,7 @@ void dfYEqn::process() { // double *d_boundary_DEff = d_boundary_rhoD; yeqn_compute_DEff(dataBase_.stream, dataBase_.num_species, dataBase_.num_cells, dataBase_.num_boundary_surfaces, d_lewis_number, dataBase_.d_thermo_alpha, d_mut_sct, d_DEff, dataBase_.d_boundary_thermo_alpha, d_boundary_mut_sct, d_boundary_DEff); -#ifndef TIME_GPU +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0)); graph_created = true; @@ -611,16 +610,20 @@ void dfYEqn::process() { ldu_to_csr_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.num_Nz, dataBase_.d_boundary_face_cell, dataBase_.d_ldu_to_csr_index, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), d_ldu, d_source, d_internal_coeffs, d_boundary_coeffs, d_A); - solve(solverIndex, s); - solverIndex ++; + // TODO: with the solver provided by dataBase_, solverIndex is no longer needed.
+ //solve(solverIndex, s); + //solverIndex ++; + solve(s); #endif } if (s == dataBase_.num_species - 1) num_iteration++; } + TICK_END_EVENT(YEqn assembly and solve for all species); + + TICK_START_EVENT; // compute y_inertIndex yeqn_compute_y_inertIndex(dataBase_.stream, dataBase_.num_species, inertIndex, dataBase_.num_cells, dataBase_.d_y); - // correct boundary conditions for (int s = 0; s < dataBase_.num_species; s++) { correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), @@ -629,39 +632,18 @@ void dfYEqn::process() { dataBase_.d_y + dataBase_.num_cells * s, dataBase_.d_boundary_y + dataBase_.num_boundary_surfaces * s, dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_weight); } + TICK_END_EVENT(YEqn post process for all species correctBC); - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - if(!mpi_init_flag || myRank == 0) - fprintf(stderr, "yeqn process time: %f(ms)\n\n",time_elapsed); -} - -void dfYEqn::solve(int solverIndex, int speciesIndex) { - if (num_iteration == 0) // first interation - { - fprintf(stderr, "Initializing AmgX Linear Solver\n"); - DEBUG_TRACE; - YSolverSet[solverIndex]->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A); - DEBUG_TRACE; - } - else - { - DEBUG_TRACE; - YSolverSet[solverIndex]->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A); - DEBUG_TRACE; - } - // use d_source as d_b - DEBUG_TRACE; - YSolverSet[solverIndex]->solve(dataBase_.num_cells, dataBase_.d_y + dataBase_.num_cells * speciesIndex, d_source); - DEBUG_TRACE; -} + TICK_START_EVENT; + // copy y and boundary_y to host + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_y, dataBase_.d_y, dataBase_.cell_value_bytes * dataBase_.num_species, 
cudaMemcpyDeviceToHost, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_y, dataBase_.d_boundary_y, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); + TICK_END_EVENT(YEqn post process for all species copy back); -void dfYEqn::postProcess(double *h_y, double *h_boundary_y) { + TICK_START_EVENT; #ifdef STREAM_ALLOCATOR // thermophysical fields - checkCudaErrors(cudaFreeAsync(d_rhoD, dataBase_.stream)); + //checkCudaErrors(cudaFreeAsync(d_rhoD, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_hai, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_mut_sct, dataBase_.stream)); // intermediate fields @@ -672,7 +654,7 @@ void dfYEqn::postProcess(double *h_y, double *h_boundary_y) { checkCudaErrors(cudaFreeAsync(d_permute, dataBase_.stream)); // thermophysical fields - checkCudaErrors(cudaFreeAsync(d_boundary_rhoD, dataBase_.stream)); + //checkCudaErrors(cudaFreeAsync(d_boundary_rhoD, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_boundary_hai, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_boundary_mut_sct, dataBase_.stream)); // intermediate fields @@ -693,12 +675,19 @@ void dfYEqn::postProcess(double *h_y, double *h_boundary_y) { checkCudaErrors(cudaFreeAsync(d_boundary_coeffs, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); #endif - // copy y to host - checkCudaErrors(cudaMemcpyAsync(h_y, dataBase_.d_y, dataBase_.cell_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); - checkCudaErrors(cudaMemcpyAsync(h_boundary_y, dataBase_.d_boundary_y, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); - checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); + TICK_END_EVENT(YEqn post process for all species free); + sync(); } +void dfYEqn::solve(int speciesIndex) { + TICK_INIT_EVENT; + TICK_START_EVENT; + dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A, 
dataBase_.d_y + dataBase_.num_cells * speciesIndex, d_source); + TICK_END_EVENT(YEqn solve one specie); +} + +void dfYEqn::postProcess(double *h_y, double *h_boundary_y) {} + void dfYEqn::sync() { checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); } @@ -745,6 +734,7 @@ void dfYEqn::yeqn_fvc_laplacian_scalar(cudaStream_t stream, ncclComm_t comm, con int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -788,6 +778,7 @@ void dfYEqn::yeqn_fvc_laplacian_scalar(cudaStream_t stream, ncclComm_t comm, con boundary_cell_face, output, boundary_output); offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; if (patch_type[i] == boundaryConditions::processor) { correct_boundary_conditions_processor_scalar(stream, comm, neighbor_peer[i], patch_size[i], offset, output, boundary_cell_face, boundary_output); diff --git a/src_gpu/dfpEqn.H b/src_gpu/dfpEqn.H index fe58e6608..79a414eac 100644 --- a/src_gpu/dfpEqn.H +++ b/src_gpu/dfpEqn.H @@ -11,10 +11,14 @@ private: dfMatrixDataBase &dataBase_; // cuda resource + cudaStream_t stream; +#ifdef USE_GRAPH // one graph for one eqn before using self-developed solver cudaGraph_t graph_pre, graph_post; cudaGraphExec_t graph_instance_pre, graph_instance_post; - bool graph_created=false; + bool pre_graph_created=false; + bool post_graph_created=false; +#endif // constant values -- basic std::string mode_string; @@ -92,7 +96,7 @@ public: void preProcess(double *h_phi, double *h_boundary_phi); void correctPsi(const double *h_thermoPsi, double *h_boundary_thermoPsi); // tmp void correctP(const double *h_p, double *h_boundary_p); // tmp - void getFlux(); + //void getFlux(); void process(); void postProcess(); diff --git a/src_gpu/dfpEqn.cu b/src_gpu/dfpEqn.cu index 5bfc1c17a..fed764a36 100644 --- a/src_gpu/dfpEqn.cu +++ 
b/src_gpu/dfpEqn.cu @@ -294,6 +294,7 @@ double* dfpEqn::getFieldPointer(const char* fieldAlias, location loc, position p } void dfpEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path) { + this->stream = dataBase_.stream; this->mode_string = mode_string; this->setting_path = setting_path; pSolver = new AmgXSolver(mode_string, setting_path, dataBase_.localRank); @@ -305,13 +306,17 @@ void dfpEqn::setConstantFields(const std::vector patch_type_U, const std::v } void dfpEqn::createNonConstantFieldsInternal() { +#ifndef STREAM_ALLOCATOR // intermediate fields checkCudaErrors(cudaMalloc((void**)&d_rhorAUf, dataBase_.surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_phiHbyA, dataBase_.surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_flux, dataBase_.surface_value_bytes)); +#endif } void dfpEqn::createNonConstantFieldsBoundary() { +#ifndef STREAM_ALLOCATOR + // boundary coeffs checkCudaErrors(cudaMalloc((void**)&d_value_internal_coeffs, dataBase_.boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_value_boundary_coeffs, dataBase_.boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_bytes)); @@ -320,18 +325,22 @@ void dfpEqn::createNonConstantFieldsBoundary() { checkCudaErrors(cudaMalloc((void**)&d_boundary_rhorAUf, dataBase_.boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_phiHbyA, dataBase_.boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_flux, dataBase_.boundary_surface_value_bytes)); +#endif } void dfpEqn::createNonConstantLduAndCsrFields() { + // ldu and csr checkCudaErrors(cudaMalloc((void**)&d_ldu, dataBase_.csr_value_bytes)); d_lower = d_ldu; d_diag = d_ldu + dataBase_.num_surfaces; d_upper = d_ldu + dataBase_.num_cells + dataBase_.num_surfaces; d_extern = d_ldu + dataBase_.num_cells + 2 * dataBase_.num_surfaces; +#ifndef STREAM_ALLOCATOR 
checkCudaErrors(cudaMalloc((void**)&d_source, dataBase_.cell_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_A, dataBase_.csr_value_bytes)); +#endif } void dfpEqn::initNonConstantFields(const double *p, const double *boundary_p){ @@ -340,12 +349,16 @@ void dfpEqn::initNonConstantFields(const double *p, const double *boundary_p){ } void dfpEqn::cleanCudaResources() { - if (graph_created) { +#ifdef USE_GRAPH + if (pre_graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance_pre)); - checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_pre)); + } + if (post_graph_created) { + checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_post)); } +#endif } // tmp @@ -364,19 +377,37 @@ void dfpEqn::correctP(const double *h_p, double *h_boundary_p) { }; void dfpEqn::process() { - - float time_elapsed=0; - cudaEvent_t start,stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - checkCudaErrors(cudaEventRecord(start,0)); - -#ifndef TIME_GPU - if(!graph_created) { + TICK_INIT_EVENT; + TICK_START_EVENT; +#ifdef USE_GRAPH + if(!pre_graph_created) { DEBUG_TRACE; checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif +#ifdef STREAM_ALLOCATOR + // intermediate fields + checkCudaErrors(cudaMallocAsync((void**)&d_rhorAUf, dataBase_.surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_phiHbyA, dataBase_.surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_flux, dataBase_.surface_value_bytes, dataBase_.stream)); + + // boundary coeffs + checkCudaErrors(cudaMallocAsync((void**)&d_value_internal_coeffs, dataBase_.boundary_surface_value_bytes, 
dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_value_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + // intermediate boundary fields + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_rhorAUf, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_phiHbyA, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_flux, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + + // ldu and csr + checkCudaErrors(cudaMallocAsync((void**)&d_source, dataBase_.cell_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_A, dataBase_.csr_value_bytes, dataBase_.stream)); +#endif + checkCudaErrors(cudaMemsetAsync(d_ldu, 0, dataBase_.csr_value_bytes, dataBase_.stream)); // d_ldu contains d_lower, d_diag, and d_upper checkCudaErrors(cudaMemsetAsync(d_source, 0, dataBase_.cell_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(d_internal_coeffs, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); @@ -428,93 +459,101 @@ void dfpEqn::process() { dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), d_ldu, d_source, d_internal_coeffs, d_boundary_coeffs, d_A); -#ifndef TIME_GPU +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_pre)); checkCudaErrors(cudaGraphInstantiate(&graph_instance_pre, graph_pre, NULL, NULL, 0)); + 
pre_graph_created = true; } DEBUG_TRACE; checkCudaErrors(cudaGraphLaunch(graph_instance_pre, dataBase_.stream)); #endif - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - if(!mpi_init_flag || myRank == 0) - fprintf(stderr, "peqn assembly time:%f(ms)\n",time_elapsed); - - checkCudaErrors(cudaEventRecord(start,0)); + TICK_END_EVENT(pEqn assembly); + + TICK_START_EVENT; solve(); - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - if(!mpi_init_flag || myRank == 0) - fprintf(stderr, "peqn solve time:%f(ms)\n",time_elapsed); - - checkCudaErrors(cudaEventRecord(start,0)); -#ifndef TIME_GPU - if(!graph_created) { + TICK_END_EVENT(pEqn solve); + +#ifdef USE_GRAPH + if(!post_graph_created) { checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif - correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), - dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(), - patch_type_p.data(), dataBase_.d_boundary_delta_coeffs, - dataBase_.d_boundary_face_cell, dataBase_.d_p, dataBase_.d_boundary_p, - dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_weight); - // update phi - fvMtx_flux(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, - d_lower, d_upper, dataBase_.d_p, d_flux, - dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), - dataBase_.d_boundary_face_cell, d_internal_coeffs, d_boundary_coeffs, dataBase_.cyclicNeighbor.data(), - dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_p, d_boundary_flux); - 
field_add_scalar(dataBase_.stream, dataBase_.num_surfaces, d_phiHbyA, d_flux, dataBase_.d_phi, - dataBase_.num_boundary_surfaces, d_boundary_phiHbyA, d_boundary_flux, dataBase_.d_boundary_phi); - // correct U - checkCudaErrors(cudaMemsetAsync(dataBase_.d_u, 0., dataBase_.cell_value_vec_bytes, dataBase_.stream)); - // TODO: may do not need to calculate boundary fields - fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, - dataBase_.d_owner, dataBase_.d_neighbor, - dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, dataBase_.d_u, - dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), dataBase_.d_boundary_weight, - dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume, true); - scalar_field_multiply_vector_field(dataBase_.stream, dataBase_.num_cells, dataBase_.d_rAU, dataBase_.d_u, dataBase_.d_u); - field_add_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.d_HbyA, dataBase_.d_u, dataBase_.d_u, -1.); - correct_boundary_conditions_vector(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_boundary_surfaces, - dataBase_.num_cells, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_U.data(), dataBase_.d_boundary_weight, - dataBase_.d_boundary_face_cell, dataBase_.d_u, dataBase_.d_boundary_u, - dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data()); - vector_half_mag_square(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, dataBase_.d_k, dataBase_.num_boundary_surfaces, - dataBase_.d_boundary_u, dataBase_.d_boundary_k); - // calculate dpdt - fvc_ddt_scalar_field(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, dataBase_.d_p, dataBase_.d_p_old, dataBase_.d_volume, dataBase_.d_dpdt, 1.); - - #ifndef TIME_GPU + TICK_START_EVENT; + correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), + dataBase_.num_boundary_surfaces, 
dataBase_.num_patches, dataBase_.patch_size.data(), + patch_type_p.data(), dataBase_.d_boundary_delta_coeffs, + dataBase_.d_boundary_face_cell, dataBase_.d_p, dataBase_.d_boundary_p, + dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_weight); + // update phi + fvMtx_flux(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, + d_lower, d_upper, dataBase_.d_p, d_flux, + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), + dataBase_.d_boundary_face_cell, d_internal_coeffs, d_boundary_coeffs, dataBase_.cyclicNeighbor.data(), + dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_p, d_boundary_flux); + field_add_scalar(dataBase_.stream, dataBase_.num_surfaces, d_phiHbyA, d_flux, dataBase_.d_phi, + dataBase_.num_boundary_surfaces, d_boundary_phiHbyA, d_boundary_flux, dataBase_.d_boundary_phi); + // correct U + checkCudaErrors(cudaMemsetAsync(dataBase_.d_u, 0., dataBase_.cell_value_vec_bytes, dataBase_.stream)); + // TODO: may not need to calculate boundary fields + fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, + dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, dataBase_.d_u, + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), dataBase_.d_boundary_weight, + dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume, true); + scalar_field_multiply_vector_field(dataBase_.stream, dataBase_.num_cells, dataBase_.d_rAU, dataBase_.d_u, dataBase_.d_u); + field_add_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.d_HbyA, dataBase_.d_u, dataBase_.d_u, -1.); + correct_boundary_conditions_vector(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_boundary_surfaces, + dataBase_.num_cells, dataBase_.num_patches, dataBase_.patch_size.data(), 
patch_type_U.data(), dataBase_.d_boundary_weight, + dataBase_.d_boundary_face_cell, dataBase_.d_u, dataBase_.d_boundary_u, + dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data()); + vector_half_mag_square(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, dataBase_.d_k, dataBase_.num_boundary_surfaces, + dataBase_.d_boundary_u, dataBase_.d_boundary_k); + // calculate dpdt + fvc_ddt_scalar_field(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, dataBase_.d_p, dataBase_.d_p_old, dataBase_.d_volume, dataBase_.d_dpdt, 1.); + +#ifdef STREAM_ALLOCATOR + // intermediate fields + checkCudaErrors(cudaFreeAsync(d_rhorAUf, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_phiHbyA, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_flux, dataBase_.stream)); + + // boundary coeffs + checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream)); + // intermediate boundary fields + checkCudaErrors(cudaFreeAsync(d_boundary_rhorAUf, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_phiHbyA, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_flux, dataBase_.stream)); + + // ldu and csr + checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); +#endif + TICK_END_EVENT(pEqn post process all); + +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_post)); checkCudaErrors(cudaGraphInstantiate(&graph_instance_post, graph_post, NULL, NULL, 0)); - graph_created = true; + post_graph_created = true; } checkCudaErrors(cudaGraphLaunch(graph_instance_post, 
dataBase_.stream)); #endif - - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - if(!mpi_init_flag || myRank == 0) - fprintf(stderr, "peqn post process time:%f(ms)\n\n",time_elapsed); + sync(); } void dfpEqn::postProcess() {} -void dfpEqn::getFlux() -{ - fvMtx_flux(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, - d_lower, d_upper, dataBase_.d_p, d_flux, - dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), - dataBase_.d_boundary_face_cell, d_internal_coeffs, d_boundary_coeffs, dataBase_.cyclicNeighbor.data(), - dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_p, d_boundary_flux); - sync(); -} +//void dfpEqn::getFlux() +//{ +// fvMtx_flux(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, +// d_lower, d_upper, dataBase_.d_p, d_flux, +// dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), +// dataBase_.d_boundary_face_cell, d_internal_coeffs, d_boundary_coeffs, dataBase_.d_boundary_p, d_boundary_flux); +// sync(); +//} void dfpEqn::getrhorAUf(cudaStream_t stream, int num_cells, int num_surfaces, const int *lowerAddr, const int *upperAddr, @@ -529,6 +568,7 @@ void dfpEqn::getrhorAUf(cudaStream_t stream, int num_cells, int num_surfaces, int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: maybe do not need loop boundarys @@ -567,6 +607,7 @@ void dfpEqn::getphiHbyA(cudaStream_t stream, int num_cells, int num_surfaces, in int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 
1) / threads_per_block; if (patch_type[i] == boundaryConditions::processor @@ -586,6 +627,7 @@ void dfpEqn::getphiHbyA(cudaStream_t stream, int num_cells, int num_surfaces, in offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::processor @@ -606,6 +648,7 @@ void dfpEqn::getphiHbyA(cudaStream_t stream, int num_cells, int num_surfaces, in offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::extrapolated @@ -639,30 +682,7 @@ void dfpEqn::sync() void dfpEqn::solve() { - sync(); - - // double *h_A_csr = new double[dataBase_.num_Nz]; - // checkCudaErrors(cudaMemcpy(h_A_csr, d_A, dataBase_.csr_value_bytes, cudaMemcpyDeviceToHost)); - // for (int i = 0; i < dataBase_.num_Nz; i++) - // fprintf(stderr, "h_A_csr[%d]: %.10lf\n", i, h_A_csr[i]); - // delete[] h_A_csr; - - // double *h_b = new double[dataBase_.num_cells]; - // checkCudaErrors(cudaMemcpy(h_b, d_source, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost)); - // for (int i = 0; i < dataBase_.num_cells; i++) - // fprintf(stderr, "h_b[%d]: %.10lf\n", i, h_b[i]); - // delete[] h_b; - - if (num_iteration == 0) // first interation - { - printf("Initializing AmgX Linear Solver\n"); - pSolver->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A); - } - else - { - pSolver->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A); - } - pSolver->solve(dataBase_.num_cells, dataBase_.d_p, d_source); + dataBase_.solve(num_iteration, AMGXSetting::p_setting, d_A, dataBase_.d_p, d_source); num_iteration++; }