From 29c6059855b82fcb81fbb60ec3adfe2b6ea8da53 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Thu, 19 Oct 2023 18:17:50 +0800 Subject: [PATCH 1/6] refactor for timing --- .../solvers/dfLowMachFoam/createGPUSolver.H | 1 + src_gpu/dfEEqn.H | 4 +- src_gpu/dfEEqn.cu | 110 ++++++------- src_gpu/dfMatrixDataBase.H | 7 +- src_gpu/dfMatrixOpBase.H | 59 ++++++- src_gpu/dfMatrixOpBase.cu | 148 +----------------- src_gpu/dfRhoEqn.H | 2 + src_gpu/dfRhoEqn.cu | 42 +++-- src_gpu/dfThermo.H | 1 + src_gpu/dfThermo.cu | 38 ----- src_gpu/dfUEqn.H | 4 +- src_gpu/dfUEqn.cu | 120 +++++++------- src_gpu/dfYEqn.H | 1 + src_gpu/dfYEqn.cu | 78 +++++---- src_gpu/dfpEqn.H | 4 +- src_gpu/dfpEqn.cu | 119 +++++++------- 16 files changed, 288 insertions(+), 450 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/createGPUSolver.H b/applications/solvers/dfLowMachFoam/createGPUSolver.H index 272d23c20..226b8f3f3 100644 --- a/applications/solvers/dfLowMachFoam/createGPUSolver.H +++ b/applications/solvers/dfLowMachFoam/createGPUSolver.H @@ -323,6 +323,7 @@ void createGPURhoEqn(const volScalarField& rho, const surfaceScalarField& phi) { offset += patchsize; } } + rhoEqn_GPU.setConstantValues(); rhoEqn_GPU.setConstantFields(patch_type); rhoEqn_GPU.initNonConstantFields(h_rho, h_phi, h_boundary_rho, h_boundary_phi); rhoEqn_GPU.createNonConstantLduAndCsrFields(); diff --git a/src_gpu/dfEEqn.H b/src_gpu/dfEEqn.H index f19f3a600..63616b414 100644 --- a/src_gpu/dfEEqn.H +++ b/src_gpu/dfEEqn.H @@ -12,10 +12,12 @@ class dfEEqn dfThermo &thermo_; // cuda resource + cudaStream_t stream; // one graph for one eqn before using self-developed solver cudaGraph_t graph_pre, graph_post; cudaGraphExec_t graph_instance_pre, graph_instance_post; - bool graph_created=false; + bool pre_graph_created=false; + bool post_graph_created=false; // constant values -- basic std::string mode_string; diff --git a/src_gpu/dfEEqn.cu b/src_gpu/dfEEqn.cu index 91a4facd4..b9f92d526 100644 --- a/src_gpu/dfEEqn.cu +++ 
b/src_gpu/dfEEqn.cu @@ -21,6 +21,7 @@ double* dfEEqn::getFieldPointer(const char* fieldAlias, location loc, position p } void dfEEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path) { + this->stream = dataBase_.stream; this->mode_string = mode_string; this->setting_path = setting_path; ESolver = new AmgXSolver(mode_string, setting_path, dataBase_.localRank); @@ -88,10 +89,12 @@ void dfEEqn::initNonConstantFields(const double *he, const double *boundary_he) } void dfEEqn::cleanCudaResources() { - if (graph_created) { + if (pre_graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance_pre)); - checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_pre)); + } + if (post_graph_created) { + checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_post)); } } @@ -101,15 +104,10 @@ void dfEEqn::preProcess(const double *h_he, const double *h_k, const double *h_k } void dfEEqn::process() { - //使用event计算时间 - float time_elapsed=0; - cudaEvent_t start,stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - checkCudaErrors(cudaEventRecord(start,0)); - -#ifndef TIME_GPU - if(!graph_created) { + TICK_INIT_EVENT; + TICK_START_EVENT; +#ifdef USE_GRAPH + if(!pre_graph_created) { DEBUG_TRACE; checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif @@ -190,32 +188,25 @@ void dfEEqn::process() { dataBase_.num_Nz, dataBase_.d_boundary_face_cell, dataBase_.d_ldu_to_csr_index, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_he.data(), d_ldu, d_source, d_internal_coeffs, d_boundary_coeffs, d_A); #endif -#ifndef TIME_GPU +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_pre)); checkCudaErrors(cudaGraphInstantiate(&graph_instance_pre, graph_pre, NULL, NULL, 0)); + pre_graph_created = true; } DEBUG_TRACE; 
checkCudaErrors(cudaGraphLaunch(graph_instance_pre, dataBase_.stream)); #endif - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - fprintf(stderr, "eeqn assembly time:%f(ms)\n",time_elapsed); + TICK_END_EVENT(EEqn assembly); - checkCudaErrors(cudaEventRecord(start,0)); + TICK_START_EVENT; #ifndef DEBUG_CHECK_LDU solve(); #endif - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - fprintf(stderr, "eeqn solve time:%f(ms)\n",time_elapsed); - - checkCudaErrors(cudaEventRecord(start,0)); -#ifndef TIME_GPU - if(!graph_created) { + TICK_END_EVENT(EEqn solve); + + TICK_START_EVENT; +#ifdef USE_GRAPH + if(!post_graph_created) { checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif @@ -224,19 +215,38 @@ void dfEEqn::process() { patch_type_he.data(), dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_face_cell, dataBase_.d_he, dataBase_.d_boundary_he, d_boundary_heGradient); -#ifndef TIME_GPU + // copy he and boundary_he to host + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_he, dataBase_.d_he, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_he, dataBase_.d_boundary_he, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + +#ifdef STREAM_ALLOCATOR + // thermophysical fields + checkCudaErrors(cudaFreeAsync(d_dpdt, dataBase_.stream)); + // fiv weight fieldsFree + //checkCudaErrors(cudaFreeAsync(d_phi_special_weight, dataBase_.stream)); + // boundary coeffs + checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream)); + 
checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream)); + + checkCudaErrors(cudaFreeAsync(d_boundary_heGradient, dataBase_.stream)); + + checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream)); +#endif +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_post)); checkCudaErrors(cudaGraphInstantiate(&graph_instance_post, graph_post, NULL, NULL, 0)); - graph_created = true; + post_graph_created = true; } checkCudaErrors(cudaGraphLaunch(graph_instance_post, dataBase_.stream)); #endif - - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - fprintf(stderr, "eeqn post process time: %f(ms)\n\n",time_elapsed); + TICK_END_EVENT(EEqn post process); + sync(); } void dfEEqn::eeqn_calculate_energy_gradient(dfThermo& GPUThermo, int num_cells, int num_species, @@ -330,7 +340,7 @@ void dfEEqn::sync() void dfEEqn::solve() { - sync(); + sync(); // TODO can be removed if (num_iteration == 0) // first interation { @@ -345,30 +355,4 @@ void dfEEqn::solve() num_iteration++; } -void dfEEqn::postProcess(double *h_he, double *h_boundary_he) -{ -#ifdef STREAM_ALLOCATOR - // thermophysical fields - checkCudaErrors(cudaFreeAsync(d_dpdt, dataBase_.stream)); - // fiv weight fieldsFree - //checkCudaErrors(cudaFreeAsync(d_phi_special_weight, dataBase_.stream)); - // boundary coeffs - checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream)); - 
checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream)); - - checkCudaErrors(cudaFreeAsync(d_boundary_heGradient, dataBase_.stream)); - - checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_internal_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_boundary_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream)); -#endif - - // copy he to host - checkCudaErrors(cudaMemcpyAsync(h_he, dataBase_.d_he, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - checkCudaErrors(cudaMemcpyAsync(h_boundary_he, dataBase_.d_boundary_he, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - sync(); -} +void dfEEqn::postProcess(double *h_he, double *h_boundary_he) {} diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 574305840..def2f4a89 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -16,11 +16,12 @@ #include #include -extern int myRank; +#define DEBUG_ +#define DEBUG_CHECK_LDU -// #define GPU_DEBUG_ +extern int myRank; -#ifdef GPU_DEBUG_ +#ifdef DEBUG_ #define DEBUG_TRACE fprintf(stderr, "myRank[%d] %s %d\n", myRank, __FILE__, __LINE__); #else #define DEBUG_TRACE diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index 405fd28be..195e86225 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -1,8 +1,62 @@ #pragma once #include #include -// #define TIME_GPU -// #define DEBUG_CHECK_LDU + +#ifndef USE_GRAPH + #define USE_GRAPH +#endif + +#ifndef TIME_GPU + #define TIME_GPU + #if (defined TIME_GPU) && (defined USE_GRAPH) + #undef USE_GRAPH + #endif +#endif + +extern int myRank; +extern __global__ void warmup(); + +#ifdef TIME_GPU + #define WARM_UP \ + warmup<<<10, 1024, 0, stream>>>(); + + #define TICK_INIT_EVENT 
\ + float time_elapsed_kernel=0;\ + cudaEvent_t start_kernel, stop_kernel;\ + checkCudaErrors(cudaEventCreate(&start_kernel));\ + checkCudaErrors(cudaEventCreate(&stop_kernel)); + + #define TICK_START_EVENT \ + checkCudaErrors(cudaEventRecord(start_kernel,stream)); + + #define TICK_END_EVENT(prefix) \ + checkCudaErrors(cudaEventRecord(stop_kernel,stream));\ + checkCudaErrors(cudaEventSynchronize(start_kernel));\ + checkCudaErrors(cudaEventSynchronize(stop_kernel));\ + checkCudaErrors(cudaEventElapsedTime(&time_elapsed_kernel,start_kernel,stop_kernel));\ + fprintf(stderr, "rank[%d], name: %s, time: %lf(ms)\n", myRank, #prefix, time_elapsed_kernel); + +/* + // the usage description: + // if you want to profile the first kernel, please use WARM_UP before TICK_INIT_EVENT. + // otherwise there is no need to use WARM_UP + WARM_UP; + // init event + TICK_INIT_EVENT; + // start event + TICK_START_EVENT; + // call your kernel, or kernels, or wrapper functions, e.g.: + my_kernel<<>>(num, input, output); + // end event with your specified name string, e.g.: + TICK_END_EVENT(my_kernel); +*/ + +#else + #define WARM_UP + #define TICK_INIT_EVENT + #define TICK_START_EVENT + #define TICK_END_EVENT(prefix) +#endif // tools void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output); @@ -247,3 +301,4 @@ void fvMtx_flux(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces void solve_explicit_scalar(cudaStream_t stream, int num_cells, const double *diag, const double *source, double *psi); + diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index bf2f3be7e..9d9d33566 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -5,32 +5,10 @@ #include #include "cuda_profiler_api.h" -#ifdef TIME_GPU - #define TICK_INIT_EVENT \ - float time_elapsed_kernel=0;\ - cudaEvent_t start_kernel, stop_kernel;\ - checkCudaErrors(cudaEventCreate(&start_kernel));\ - checkCudaErrors(cudaEventCreate(&stop_kernel)); - - 
#define TICK_START_EVENT \ - checkCudaErrors(cudaEventRecord(start_kernel,0)); - - #define TICK_END_EVENT(prefix) \ - checkCudaErrors(cudaEventRecord(stop_kernel,0));\ - checkCudaErrors(cudaEventSynchronize(start_kernel));\ - checkCudaErrors(cudaEventSynchronize(stop_kernel));\ - checkCudaErrors(cudaEventElapsedTime(&time_elapsed_kernel,start_kernel,stop_kernel));\ - printf("try %s 执行时间:%lf(ms)\n", #prefix, time_elapsed_kernel); -#else - #define TICK_INIT_EVENT - #define TICK_START_EVENT - #define TICK_END_EVENT(prefix) -#endif - -__global__ void warmup(int num_cells) +__global__ void warmup() { int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) + if (index >= 10240) return; } @@ -1951,13 +1929,10 @@ void field_add_scalar(cudaStream_t stream, int num, const double *input1, const double *input2, double *output, int num_boundary_surfaces, const double *boundary_input1, const double *boundary_input2, double *boundary_output) { - TICK_INIT_EVENT; size_t threads_per_block = 256; size_t blocks_per_grid = (std::max(num, num_boundary_surfaces) + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; field_add_scalar_kernel<<>>(num, num_boundary_surfaces, input1, input2, output, boundary_input1, boundary_input2, boundary_output); - TICK_END_EVENT(field_add_scalar_kernel); } void field_add_vector(cudaStream_t stream, @@ -1966,7 +1941,6 @@ void field_add_vector(cudaStream_t stream, { size_t threads_per_block = 256; size_t blocks_per_grid = (std::max(num_cells, num_boundary_surfaces) + threads_per_block - 1) / threads_per_block; - field_add_vector_kernel<<>>(num_cells, num_boundary_surfaces, input1, input2, output, boundary_input1, boundary_input2, boundary_output, sign); } @@ -1984,13 +1958,10 @@ void field_multiply_scalar(cudaStream_t stream, int num_cells, const double *input1, const double *input2, double *output, int num_boundary_surfaces, const double *boundary_input1, const double *boundary_input2, double *boundary_output) { - 
TICK_INIT_EVENT; size_t threads_per_block = 256; size_t blocks_per_grid = (std::max(num_cells, num_boundary_surfaces) + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; field_multiply_scalar_kernel<<>>(num_cells, num_boundary_surfaces, input1, input2, output, boundary_input1, boundary_input2, boundary_output); - TICK_END_EVENT(field_multiply_scalar_kernel); } void vector_half_mag_square(cudaStream_t stream, int num_cells, const double *vec_input, double *scalar_output, @@ -2034,13 +2005,10 @@ void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volu void fvc_to_source_scalar(cudaStream_t stream, int num_cells, const double *volume, const double *fvc_output, double *source, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 256; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_to_source_scalar_kernel<<>>(num_cells, volume, fvc_output, source, sign); - TICK_END_EVENT(fvc_to_source_scalar_kernel); } void ldu_to_csr_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surface, int num_Nz, @@ -2094,21 +2062,16 @@ void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int num_bo const int* boundary_cell_face, const int *ldu_to_csr_index, const int *diag_to_csr_index, const double *ldu, const double *internal_coeffs, const double *boundary_coeffs, double *source, double *A) { - TICK_INIT_EVENT; // construct diag int nNz = num_cells + 2 * num_surfaces; size_t threads_per_block = 1024; size_t blocks_per_grid = (nNz + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; ldu_to_csr_kernel<<>>(nNz, ldu_to_csr_index, ldu, A); - TICK_END_EVENT(ldu_to_csr_kernel); // add coeff to source and diagnal blocks_per_grid = (num_boundary_surface + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; addBoundaryDiagSrc<<>>(num_cells, num_surfaces, num_boundary_surface, boundary_cell_face, internal_coeffs, boundary_coeffs, 
diag_to_csr_index, A, source); - TICK_END_EVENT(addBoundaryDiagSrc); } void update_boundary_coeffs_scalar(cudaStream_t stream, @@ -2264,50 +2227,30 @@ void fvm_ddt_vol_scalar_vol_scalar(cudaStream_t stream, int num_cells, double rD const double *rho, const double *rho_old, const double *vf, const double *volume, double *diag, double *source, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvm_ddt_vol_scalar_vol_scalar_kernel<<>>(num_cells, rDeltaT, rho, rho_old, vf, volume, diag, source, sign); - TICK_END_EVENT(fvm_ddt_vol_scalar_vol_scalar_kernel); } void fvm_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT, const double *vf_old, const double *volume, double *diag, double *source, double sign) { -#ifdef TIME_GPU - printf("#############kernel profile#############\n"); -#endif - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; -#ifdef TIME_GPU - printf("warm up ...\n"); - warmup<<>>(num_cells); -#endif - TICK_START_EVENT; fvm_ddt_scalar_kernel<<>>(num_cells, rDeltaT, vf_old, volume, diag, source, sign); - TICK_END_EVENT(fvm_ddt_scalar_kernel); } void fvm_ddt_vector(cudaStream_t stream, int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *volume, double *diag, double *source, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 64; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; -#ifdef TIME_GPU - printf("warm up ...\n"); - warmup<<>>(num_cells); -#endif - TICK_START_EVENT; fvm_ddt_vector_kernel<<>>(num_cells, rDeltaT, rho, rho_old, vf, volume, diag, source, sign); - TICK_END_EVENT(fvm_ddt_vector_kernel); } void fvm_div_scalar(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, @@ -2317,13 +2260,10 @@ void 
fvm_div_scalar(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvm_div_scalar_internal<<>>(num_surfaces, lowerAddr, upperAddr, phi, weight, lower, upper, diag, sign); - TICK_END_EVENT(fvm_div_scalar_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -2334,11 +2274,9 @@ void fvm_div_scalar(cudaStream_t stream, int num_surfaces, const int *lowerAddr, || patch_type[i] == boundaryConditions::fixedValue || patch_type[i] == boundaryConditions::gradientEnergy) { // TODO: just vector version now - TICK_START_EVENT; fvm_div_scalar_boundary<<>>(patch_size[i], offset, boundary_phi, value_internal_coeffs, value_boundary_coeffs, internal_coeffs, boundary_coeffs, sign); - TICK_END_EVENT(fvm_div_scalar_boundary); } else if (patch_type[i] == boundaryConditions::processor) { fvm_div_scalar_boundary<<>>(patch_size[i], offset, boundary_phi, value_internal_coeffs, value_boundary_coeffs, @@ -2361,17 +2299,10 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surf const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 256; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; -#ifdef TIME_GPU - printf("warm up ...\n"); - warmup<<>>(num_surfaces); -#endif - TICK_START_EVENT; fvm_div_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, phi, weight, lower, upper, diag, sign); - TICK_END_EVENT(fvm_div_vector_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -2407,13 +2338,10 @@ void 
fvm_laplacian_scalar(cudaStream_t stream, int num_surfaces, int num_boundar const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvm_laplacian_internal<<>>(num_surfaces, lowerAddr, upperAddr, weight, mag_sf, delta_coeffs, gamma, lower, upper, diag, sign); - TICK_END_EVENT(fvm_laplacian_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { threads_per_block = 256; @@ -2423,11 +2351,9 @@ void fvm_laplacian_scalar(cudaStream_t stream, int num_surfaces, int num_boundar || patch_type[i] == boundaryConditions::fixedValue || patch_type[i] == boundaryConditions::gradientEnergy) { // TODO: just vector version now - TICK_START_EVENT; fvm_laplacian_scalar_boundary<<>>(patch_size[i], offset, boundary_mag_sf, boundary_gamma, gradient_internal_coeffs, gradient_boundary_coeffs, internal_coeffs, boundary_coeffs, sign); - TICK_END_EVENT(fvm_laplacian_scalar_boundary); } else if (patch_type[i] == boundaryConditions::processor) { fvm_laplacian_scalar_boundary<<>>(patch_size[i], offset, boundary_mag_sf, boundary_gamma, gradient_internal_coeffs, gradient_boundary_coeffs, @@ -2451,13 +2377,10 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundar const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvm_laplacian_internal<<>>(num_surfaces, lowerAddr, upperAddr, weight, mag_sf, delta_coeffs, gamma, lower, upper, diag, sign); - TICK_END_EVENT(fvm_laplacian_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { threads_per_block = 256; @@ -2466,11 
+2389,9 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundar if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { // TODO: just vector version now - TICK_START_EVENT; fvm_laplacian_vector_boundary<<>>(num_boundary_surfaces, patch_size[i], offset, boundary_mag_sf, boundary_gamma, gradient_internal_coeffs, gradient_boundary_coeffs, internal_coeffs, boundary_coeffs, sign); - TICK_END_EVENT(fvm_laplacian_vector_boundary); } else if (patch_type[i] == boundaryConditions::processor) { fvm_laplacian_vector_boundary_tmp<<>>(num_boundary_surfaces, patch_size[i], offset, boundary_mag_sf, boundary_gamma, gradient_internal_coeffs, gradient_boundary_coeffs, @@ -2518,13 +2439,10 @@ void fvc_ddt_vol_scalar_vol_scalar(cudaStream_t stream, int num_cells, double rD const double *rho, const double *rho_old, const double *vf, const double *vf_old, const double *volume, double *output, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_ddt_vol_scalar_vol_scalar_kernel<<>>(num_cells, rDeltaT, rho, rho_old, vf, vf_old, volume, output, sign); - TICK_END_EVENT(fvc_ddt_vol_scalar_vol_scalar_kernel); } void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, @@ -2536,13 +2454,10 @@ void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, const double *volume, const double *boundary_mag_Sf, double *boundary_output, const double *boundary_deltaCoeffs, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 32; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_grad_vector_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output); - TICK_END_EVENT(fvc_grad_vector_internal); int offset = 0; // finish conctruct grad field except dividing cell volume @@ -2554,10 +2469,8 @@ void 
fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, || patch_type[i] == boundaryConditions::fixedValue || patch_type[i] == boundaryConditions::calculated) { // TODO: just vector version now - TICK_START_EVENT; fvc_grad_vector_boundary_zeroGradient<<>>(num_boundary_surfaces, num_cells, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output); - TICK_END_EVENT(fvc_grad_vector_boundary_zeroGradient); } else if (patch_type[i] == boundaryConditions::processor) { fvc_grad_vector_boundary_processor<<>>(num_boundary_surfaces, num_cells, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_weight, boundary_vf, output); @@ -2573,9 +2486,7 @@ void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, // divide cell volume threads_per_block = 512; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; divide_cell_volume_tsr<<>>(num_cells, volume, output); - TICK_END_EVENT(divide_cell_volume_tsr); // correct boundary conditions offset = 0; @@ -2585,11 +2496,9 @@ void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, // TODO: just basic patch type now if (patch_type[i] == boundaryConditions::zeroGradient) { // TODO: just vector version now - TICK_START_EVENT; fvc_grad_vector_correctBC_zeroGradient<<>>(num_cells, num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, output, boundary_vf, boundary_Sf, boundary_mag_Sf, boundary_output); - TICK_END_EVENT(fvc_grad_vector_correctBC_zeroGradient); } else if (patch_type[i] == boundaryConditions::fixedValue) { // TODO: implement fixedValue version fvc_grad_vector_correctBC_fixedValue<<>>( @@ -2612,12 +2521,9 @@ void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, void scale_dev2T_tensor(cudaStream_t stream, int num_cells, const double *vf1, double *vf2, int num_boundary_surfaces, const double *boundary_vf1, double *boundary_vf2) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / 
threads_per_block; - TICK_START_EVENT; scale_dev2t_tensor_kernel<<>>(num_cells, vf1, vf2); - TICK_END_EVENT(scale_dev2t_tensor_kernel); blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; scale_dev2t_tensor_kernel<<>>(num_boundary_surfaces, boundary_vf1, boundary_vf2); @@ -2628,12 +2534,9 @@ void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces int num_patches, const int *patch_size, const int *patch_type, const double *boundary_ssf, const double *volume, double *output, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_div_surface_scalar_internal<<>>(num_surfaces, lowerAddr, upperAddr, ssf, output, sign); - TICK_END_EVENT(fvc_div_surface_scalar_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -2656,13 +2559,10 @@ void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, i const double *boundary_weight, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_div_cell_vector_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output, sign); - TICK_END_EVENT(fvc_div_cell_vector_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -2674,11 +2574,9 @@ void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, i || patch_type[i] == boundaryConditions::gradientEnergy || patch_type[i] == boundaryConditions::calculated) { // TODO: just vector version now - TICK_START_EVENT; fvc_div_cell_vector_boundary<<>>( num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output, sign); - TICK_END_EVENT(fvc_div_cell_vector_boundary); } else if (patch_type[i] == 
boundaryConditions::processor) { fvc_div_cell_vector_boundary_processor<<>>( num_boundary_surfaces, patch_size[i], offset, @@ -2700,13 +2598,10 @@ void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, i const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_div_cell_tensor_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output, sign); - TICK_END_EVENT(fvc_div_cell_tensor_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -2717,11 +2612,9 @@ void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, i || patch_type[i] == boundaryConditions::fixedValue || patch_type[i] == boundaryConditions::calculated) { // TODO: just vector version now - TICK_START_EVENT; fvc_div_cell_tensor_boundary_zeroGradient<<>>(num_cells, num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output, sign); - TICK_END_EVENT(fvc_div_cell_tensor_boundary_zeroGradient); } else if (patch_type[i] == boundaryConditions::processor) { fvc_div_cell_tensor_boundary_processor<<>>(num_cells, num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, boundary_weight, boundary_Sf, boundary_vf, output, sign); @@ -2742,13 +2635,10 @@ void fvc_div_surface_scalar_vol_scalar(cudaStream_t stream, int num_surfaces, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_ssf, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_div_surface_scalar_vol_scalar_internal<<>>(num_surfaces, lowerAddr, upperAddr, weight, ssf, vf, output, sign); - TICK_END_EVENT(fvc_div_surface_scalar_vol_scalar_internal); int offset = 
0; for (int i = 0; i < num_patches; i++) { @@ -2758,11 +2648,9 @@ void fvc_div_surface_scalar_vol_scalar(cudaStream_t stream, int num_surfaces, if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue || patch_type[i] == boundaryConditions::calculated) { - TICK_START_EVENT; fvc_div_surface_scalar_vol_scalar_boundary<<>>( patch_size[i], offset, boundary_cell_face, boundary_vf, boundary_ssf, output, sign); - TICK_END_EVENT(fvc_div_surface_scalar_vol_scalar_boundary); } else if (patch_type[i] == boundaryConditions::processor) { fvc_div_surface_scalar_vol_scalar_boundary<<>>( patch_size[i], offset, boundary_cell_face, @@ -2783,13 +2671,10 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_patches, const int *patch_size, const int *patch_type, const double *boundary_weight, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_grad_scalar_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output, sign); - TICK_END_EVENT(fvc_grad_scalar_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -2799,10 +2684,8 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue || patch_type[i] == boundaryConditions::calculated) { - TICK_START_EVENT; fvc_grad_scalar_boundary_zeroGradient<<>>(num_boundary_surfaces, num_cells, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output, sign); - TICK_END_EVENT(fvc_grad_scalar_boundary_zeroGradient); } else if (patch_type[i] == boundaryConditions::processor) { fvc_grad_scalar_boundary_processor<<>>(num_boundary_surfaces, num_cells, 
patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_weight, boundary_vf, output, sign); @@ -2823,13 +2706,10 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, bool dividVol, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_grad_scalar_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output, sign); - TICK_END_EVENT(fvc_grad_scalar_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -2839,10 +2719,8 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue || patch_type[i] == boundaryConditions::calculated) { - TICK_START_EVENT; fvc_grad_scalar_boundary_zeroGradient<<>>(num_boundary_surfaces, num_cells, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output, sign); - TICK_END_EVENT(fvc_grad_scalar_boundary_zeroGradient); } else if (patch_type[i] == boundaryConditions::processor) { fvc_grad_scalar_boundary_processor<<>>(num_boundary_surfaces, num_cells, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_weight, boundary_vf, output, sign); @@ -2952,13 +2830,10 @@ void fvc_flux(cudaStream_t stream, int num_cells, int num_surfaces, int num_boun const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, double *boundary_output, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_flux_internal_kernel<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output, sign); - TICK_END_EVENT(fvc_flux_internal_kernel); int offset = 0; for 
(int i = 0; i < num_patches; i++) { @@ -2968,10 +2843,8 @@ void fvc_flux(cudaStream_t stream, int num_cells, int num_surfaces, int num_boun if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue || patch_type[i] == boundaryConditions::gradientEnergy) { - TICK_START_EVENT; fvc_flux_boundary_kernel<<>>(num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, boundary_output, sign); - TICK_END_EVENT(fvc_flux_boundary_kernel); } else { // xxx fprintf(stderr, "%s %d, boundaryConditions other than zeroGradient are not support yet!\n", __FILE__, __LINE__); @@ -3030,19 +2903,14 @@ void fvMtx_A(cudaStream_t stream, int num_cells, int num_surfaces, int num_bound double *A_pEqn) { checkCudaErrors(cudaMemcpyAsync(A_pEqn, diag, num_cells * sizeof(double), cudaMemcpyDeviceToDevice, stream)); - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; addAveInternaltoDiag<<>>(num_cells, num_boundary_surfaces, boundary_cell_face, internal_coeffs, A_pEqn); - TICK_END_EVENT(addAveInternaltoDiag); blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; divide_cell_volume_scalar<<>>(num_cells, volume, A_pEqn); - TICK_END_EVENT(divide_cell_volume_scalar); } void fvMtx_H(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, @@ -3052,21 +2920,16 @@ void fvMtx_H(cudaStream_t stream, int num_cells, int num_surfaces, int num_bound const double *lower, const double *upper, const double *source, const double *psi, double *H_pEqn, double *H_pEqn_perm) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; checkCudaErrors(cudaMemcpyAsync(H_pEqn, source, num_cells * 3 * sizeof(double), cudaMemcpyDeviceToDevice, stream)); 
addBoundaryDiag<<>>(num_cells, num_boundary_surfaces, boundary_cell_face, internal_coffs, psi, H_pEqn); - TICK_END_EVENT(addBoundaryDiag); blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; lduMatrix_H<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, lower, upper, psi, H_pEqn); - TICK_END_EVENT(lduMatrix_H); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -3075,19 +2938,15 @@ void fvMtx_H(cudaStream_t stream, int num_cells, int num_surfaces, int num_bound // TODO: just non-coupled patch type now if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { - TICK_START_EVENT; addBoundarySrc_unCoupled<<>>(num_cells, patch_size[i], offset, num_boundary_surfaces, boundary_cell_face, boundary_coeffs, H_pEqn); - TICK_END_EVENT(addBoundarySrc_unCoupled); } else { fprintf(stderr, "%s %d, boundaryConditions other than zeroGradient are not support yet!\n", __FILE__, __LINE__); } offset += patch_size[i]; } blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; divideVol_permute_vec<<>>(num_cells, volume, H_pEqn, H_pEqn_perm); - TICK_END_EVENT(divideVol_permute_vec); } void fvMtx_flux(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, @@ -3120,11 +2979,8 @@ void fvMtx_flux(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces void solve_explicit_scalar(cudaStream_t stream, int num_cells, const double *diag, const double *source, double *psi) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; solve_explicit_scalar_kernel<<>>(num_cells, diag, source, psi); - TICK_END_EVENT(solve_explicit_scalar_kernel); } diff --git a/src_gpu/dfRhoEqn.H b/src_gpu/dfRhoEqn.H index fe22f2aa2..f79b0584b 100644 --- a/src_gpu/dfRhoEqn.H +++ b/src_gpu/dfRhoEqn.H @@ -9,6 +9,7 @@ private: dfMatrixDataBase &dataBase_; // cuda 
resource + cudaStream_t stream; // one graph for one eqn before using self-developed solver cudaGraph_t graph; cudaGraphExec_t graph_instance; @@ -32,6 +33,7 @@ public: // member function // initialization + void setConstantValues(); void setConstantFields(const std::vector patch_type); void initNonConstantFields(const double *rho, const double *phi, const double *boundary_rho, const double *boudnary_phi); diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu/dfRhoEqn.cu index 365e102f0..0babea810 100644 --- a/src_gpu/dfRhoEqn.cu +++ b/src_gpu/dfRhoEqn.cu @@ -9,6 +9,10 @@ void dfRhoEqn::createNonConstantLduAndCsrFields() #endif } +void dfRhoEqn::setConstantValues() { + this->stream = dataBase_.stream; +} + void dfRhoEqn::setConstantFields(const std::vector patch_type) { this->patch_type = patch_type; } @@ -34,14 +38,9 @@ void dfRhoEqn::preProcess() void dfRhoEqn::process() { - // 使用event计算时间 - float time_elapsed=0; - cudaEvent_t start,stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - checkCudaErrors(cudaEventRecord(start,0)); - -#ifndef TIME_GPU + TICK_INIT_EVENT; + TICK_START_EVENT; +#ifdef USE_GRAPH if(!graph_created) { DEBUG_TRACE; checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); @@ -68,19 +67,23 @@ void dfRhoEqn::process() correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_face_cell, dataBase_.d_rho, dataBase_.d_boundary_rho); -#ifndef TIME_GPU +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0)); graph_created = true; } checkCudaErrors(cudaGraphLaunch(graph_instance, dataBase_.stream)); #endif + TICK_END_EVENT(rhoEqn process); - checkCudaErrors(cudaEventRecord(stop,0)); - 
checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - fprintf(stderr, "rhoEqn process time:%f(ms)\n\n",time_elapsed); + TICK_START_EVENT; +#ifdef STREAM_ALLOCATOR + checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_diag, dataBase_.stream)); +#endif + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_rho, dataBase_.d_rho, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + TICK_END_EVENT(rhoEqn post process); + sync(); } void dfRhoEqn::sync() @@ -93,16 +96,7 @@ void dfRhoEqn::solve() solve_explicit_scalar(dataBase_.stream, dataBase_.num_cells, d_diag, d_source, dataBase_.d_rho); } -void dfRhoEqn::postProcess(double *h_rho) -{ -#ifdef STREAM_ALLOCATOR - checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_diag, dataBase_.stream)); -#endif - checkCudaErrors(cudaMemcpyAsync(h_rho, dataBase_.d_rho, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - sync(); - // TODO: correct boundary conditions -} +void dfRhoEqn::postProcess(double *h_rho) {} void dfRhoEqn::compareResult(const double *diag, const double *source, bool printFlag) { diff --git a/src_gpu/dfThermo.H b/src_gpu/dfThermo.H index b833f937a..e1f9c0f58 100644 --- a/src_gpu/dfThermo.H +++ b/src_gpu/dfThermo.H @@ -1,6 +1,7 @@ #pragma once #include "dfMatrixDataBase.H" +#include "dfMatrixOpBase.H" void init_const_coeff_ptr(std::vector>& nasa_coeffs, std::vector>& viscosity_coeffs, std::vector>& thermal_conductivity_coeffs, std::vector>& binary_diffusion_coeffs, diff --git a/src_gpu/dfThermo.cu b/src_gpu/dfThermo.cu index 307d6d611..da62828aa 100644 --- a/src_gpu/dfThermo.cu +++ b/src_gpu/dfThermo.cu @@ -6,28 +6,6 @@ #include #include "device_launch_parameters.h" -#ifdef TIME_GPU - #define TICK_INIT_EVENT \ - float time_elapsed_kernel=0;\ - cudaEvent_t start_kernel, stop_kernel;\ - 
checkCudaErrors(cudaEventCreate(&start_kernel));\ - checkCudaErrors(cudaEventCreate(&stop_kernel)); - - #define TICK_START_EVENT \ - checkCudaErrors(cudaEventRecord(start_kernel,0)); - - #define TICK_END_EVENT(prefix) \ - checkCudaErrors(cudaEventRecord(stop_kernel,0));\ - checkCudaErrors(cudaEventSynchronize(start_kernel));\ - checkCudaErrors(cudaEventSynchronize(stop_kernel));\ - checkCudaErrors(cudaEventElapsedTime(&time_elapsed_kernel,start_kernel,stop_kernel));\ - printf("try %s 执行时间:%lf(ms)\n", #prefix, time_elapsed_kernel); -#else - #define TICK_INIT_EVENT - #define TICK_START_EVENT - #define TICK_END_EVENT(prefix) -#endif - #define GAS_CANSTANT 8314.46261815324 #define SQRT8 2.8284271247461903 #define NUM_SPECIES 7 @@ -454,55 +432,40 @@ void dfThermo::calculateTPolyGPU(int threads_per_block, int num_thread, int num_ void dfThermo::calculatePsiGPU(int threads_per_block, int num_thread, const double *T, const double *mean_mole_weight, double *d_psi, int offset) { - TICK_INIT_EVENT; size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; calculate_psi_kernel<<>>(num_thread, offset, T, mean_mole_weight, d_psi); - TICK_END_EVENT(calculate_psi_kernel); } void dfThermo::calculateRhoGPU(int threads_per_block, int num_thread, const double *p, const double *psi, double *rho, int offset) { - TICK_INIT_EVENT; size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; calculate_rho_kernel<<>>(num_cells, offset, p, psi, rho); - TICK_END_EVENT(calculate_rho_kernel); } void dfThermo::calculateViscosityGPU(int threads_per_block, int num_thread, int num_total, const double *T, const double *mole_fraction, const double *T_poly, double *species_viscosities, double *viscosity, int offset) { - TICK_INIT_EVENT; size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; calculate_viscosity_kernel<<>>(num_thread, num_total, num_species, offset, 
T_poly, T, mole_fraction, species_viscosities, viscosity); - TICK_END_EVENT(calculate_viscosity_kernel); } void dfThermo::calculateThermoConductivityGPU(int threads_per_block, int num_thread, int num_total, const double *T, const double *T_poly, const double *d_y, const double *mole_fraction, double *species_thermal_conductivities, double *thermal_conductivity, int offset) { - TICK_INIT_EVENT; size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; calculate_thermoConductivity_kernel<<>>(num_thread, num_total, num_species, offset, d_nasa_coeffs, d_y, T_poly, T, mole_fraction, species_thermal_conductivities, thermal_conductivity); - TICK_END_EVENT(calculate_thermoConductivity_kernel); } void dfThermo::calculateTemperatureGPU(int threads_per_block, int num_thread, int num_total, const double *T_init, const double *target_h, double *T, const double *d_mass_fraction, int offset, double atol, double rtol, int max_iter) { - TICK_INIT_EVENT; size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; calculate_temperature_kernel<<>>(num_thread, num_total, num_species, offset, T_init, target_h, d_mass_fraction, T, atol, rtol, max_iter); - TICK_END_EVENT(calculate_temperature_kernel); } void dfThermo::calculateEnthalpyGPU(int threads_per_block, int num_thread, const double *T, double *enthalpy, const double *d_mass_fraction, int offset) @@ -619,7 +582,6 @@ void dfThermo::calculateEnergyGradient(int num_thread, int num_cells, int num_sp { size_t threads_per_block = 256; size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; - calculate_energy_gradient_kernel<<>>(num_thread, num_cells, num_species, num_boundary_surfaces, bou_offset, gradient_offset, face2Cells, T, p, y, boundary_p, boundary_y, boundary_delta_coeffs, boundary_thermo_gradient); } diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index e7a16f284..99db6ef60 100644 --- a/src_gpu/dfUEqn.H +++ 
b/src_gpu/dfUEqn.H @@ -11,10 +11,12 @@ private: dfMatrixDataBase &dataBase_; // cuda resource + cudaStream_t stream; // one graph for one eqn before using self-developed solver cudaGraph_t graph_pre, graph_post; cudaGraphExec_t graph_instance_pre, graph_instance_post; - bool graph_created=false; + bool pre_graph_created=false; + bool post_graph_created=false; // constant values -- basic std::string mode_string; diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 9dabdb597..61dc11340 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -265,6 +265,7 @@ __global__ void ueqn_divide_cell_volume_vec(int num_cells, const double* volume, } void dfUEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path) { + this->stream = dataBase_.stream; this->mode_string = mode_string; this->setting_path = setting_path; UxSolver = new AmgXSolver(mode_string, setting_path, dataBase_.localRank); @@ -356,10 +357,12 @@ void dfUEqn::initNonConstantFieldsBoundary() { } void dfUEqn::cleanCudaResources() { - if (graph_created) { + if (pre_graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance_pre)); - checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_pre)); + } + if (post_graph_created) { + checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_post)); } } @@ -376,15 +379,10 @@ void dfUEqn::preProcess(const double *h_u, const double *h_boundary_u, const dou } void dfUEqn::process() { - //使用event计算时间 - float time_elapsed=0; - cudaEvent_t start,stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - checkCudaErrors(cudaEventRecord(start,0)); - -#ifndef TIME_GPU - if(!graph_created) { + TICK_INIT_EVENT; + TICK_START_EVENT; +#ifdef USE_GRAPH + if(!pre_graph_created) { DEBUG_TRACE; checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif @@ -487,54 +485,72 @@ void 
dfUEqn::process() { dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), d_ldu, d_extern, d_source, d_internal_coeffs, d_boundary_coeffs, d_A, d_b); #endif -#ifndef TIME_GPU +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_pre)); checkCudaErrors(cudaGraphInstantiate(&graph_instance_pre, graph_pre, NULL, NULL, 0)); + pre_graph_created = true; } DEBUG_TRACE; checkCudaErrors(cudaGraphLaunch(graph_instance_pre, dataBase_.stream)); #endif - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - fprintf(stderr, "ueqn assembly time:%f(ms)\n",time_elapsed); + TICK_END_EVENT(UEqn assembly); - checkCudaErrors(cudaEventRecord(start,0)); + TICK_START_EVENT; #ifndef DEBUG_CHECK_LDU solve(); #endif - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - fprintf(stderr, "ueqn solve time:%f(ms)\n",time_elapsed); - - checkCudaErrors(cudaEventRecord(start,0)); -#ifndef TIME_GPU - if(!graph_created) { + TICK_END_EVENT(UEqn solve); + + TICK_START_EVENT; +#ifdef USE_GRAPH + if(!post_graph_created) { checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif - correct_boundary_conditions_vector(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_boundary_surfaces, + correct_boundary_conditions_vector(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), + dataBase_.num_boundary_surfaces, dataBase_.num_cells, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_face_cell, dataBase_.d_u, dataBase_.d_boundary_u); vector_half_mag_square(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, dataBase_.d_k, 
dataBase_.num_boundary_surfaces, dataBase_.d_boundary_u, dataBase_.d_boundary_k); + + // copy u and boundary_u to host + permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, d_u_host_order); + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_u, d_u_host_order, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); -#ifndef TIME_GPU +#ifdef STREAM_ALLOCATOR + // free + // thermophysical fields + checkCudaErrors(cudaFreeAsync(d_nu_eff, dataBase_.stream)); + // intermediate fields + checkCudaErrors(cudaFreeAsync(d_grad_u, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_rho_nueff, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_fvc_output, dataBase_.stream)); + + // thermophysical fields + checkCudaErrors(cudaFreeAsync(d_boundary_nu_eff, dataBase_.stream)); + // intermediate fields + checkCudaErrors(cudaFreeAsync(d_boundary_grad_u, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_rho_nueff, dataBase_.stream)); + // boundary coeff fields + checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream)); + + checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream)); +#endif + +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_post)); checkCudaErrors(cudaGraphInstantiate(&graph_instance_post, graph_post, NULL, NULL, 0)); - graph_created = true; + post_graph_created = true; } checkCudaErrors(cudaGraphLaunch(graph_instance_post, dataBase_.stream)); #endif - - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - fprintf(stderr, "ueqn post 
process time:%f(ms)\n\n",time_elapsed); + TICK_END_EVENT(UEqn post process); + sync(); } void dfUEqn::sync() @@ -543,7 +559,7 @@ void dfUEqn::sync() } void dfUEqn::solve() { - sync(); + sync(); // TODO need remove if (num_iteration == 0) // first interation { @@ -564,35 +580,7 @@ void dfUEqn::solve() { num_iteration++; } -void dfUEqn::postProcess(double *h_u) { - permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, d_u_host_order); - checkCudaErrors(cudaMemcpyAsync(h_u, d_u_host_order, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); - -#ifdef STREAM_ALLOCATOR - // free - // thermophysical fields - checkCudaErrors(cudaFreeAsync(d_nu_eff, dataBase_.stream)); - // intermediate fields - checkCudaErrors(cudaFreeAsync(d_grad_u, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_rho_nueff, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_fvc_output, dataBase_.stream)); - - // thermophysical fields - checkCudaErrors(cudaFreeAsync(d_boundary_nu_eff, dataBase_.stream)); - // intermediate fields - checkCudaErrors(cudaFreeAsync(d_boundary_grad_u, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_boundary_rho_nueff, dataBase_.stream)); - // boundary coeff fields - checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream)); - - checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream)); -#endif -} +void dfUEqn::postProcess(double *h_u) {} void dfUEqn::A(double *Psi) { fvMtx_A(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, diff --git a/src_gpu/dfYEqn.H b/src_gpu/dfYEqn.H index c34e969bd..10304bd97 100644 --- 
a/src_gpu/dfYEqn.H +++ b/src_gpu/dfYEqn.H @@ -11,6 +11,7 @@ private: dfMatrixDataBase &dataBase_; // cuda resource + cudaStream_t stream; // one graph for one eqn before using self-developed solver cudaGraph_t graph; cudaGraphExec_t graph_instance; diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu index 432ca641b..cc08edbe1 100644 --- a/src_gpu/dfYEqn.cu +++ b/src_gpu/dfYEqn.cu @@ -285,6 +285,7 @@ double* dfYEqn::getFieldPointer(const char* fieldAlias, location loc, position p } void dfYEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path, const int inertIndex) { + this->stream = dataBase_.stream; this->mode_string = mode_string; this->setting_path = setting_path; this->inertIndex = inertIndex; @@ -411,14 +412,9 @@ void dfYEqn::preProcess(const double *h_rhoD, const double *h_boundary_rhoD, } void dfYEqn::process() { - //使用event计算时间 - float time_elapsed=0; - cudaEvent_t start,stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - checkCudaErrors(cudaEventRecord(start,0)); - -#ifndef TIME_GPU + TICK_INIT_EVENT; + TICK_START_EVENT; +#ifdef USE_GRAPH if(!graph_created) { DEBUG_TRACE; checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); @@ -508,7 +504,7 @@ void dfYEqn::process() { // double *d_boundary_DEff = d_boundary_rhoD; yeqn_compute_DEff(dataBase_.stream, dataBase_.num_species, dataBase_.num_cells, dataBase_.num_boundary_surfaces, d_lewis_number, dataBase_.d_thermo_alpha, d_mut_sct, d_DEff, dataBase_.d_boundary_thermo_alpha, d_boundary_mut_sct, d_boundary_DEff); -#ifndef TIME_GPU +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0)); graph_created = true; @@ -582,6 +578,9 @@ void dfYEqn::process() { if (s == dataBase_.num_species - 1) num_iteration++; } + TICK_END_EVENT(YEqn assembly and solve for all species); + + TICK_START_EVENT; // compute 
y_inertIndex yeqn_compute_y_inertIndex(dataBase_.stream, dataBase_.num_species, inertIndex, dataBase_.num_cells, dataBase_.d_y); @@ -593,34 +592,10 @@ void dfYEqn::process() { dataBase_.d_y + dataBase_.num_cells * s, dataBase_.d_boundary_y + dataBase_.num_boundary_surfaces * s); } - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - fprintf(stderr, "yeqn process time: %f(ms)\n\n",time_elapsed); -} - -void dfYEqn::solve(int solverIndex) { - if (num_iteration == 0) // first interation - { - fprintf(stderr, "Initializing AmgX Linear Solver\n"); - DEBUG_TRACE; - YSolverSet[solverIndex]->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A); - DEBUG_TRACE; - } - else - { - DEBUG_TRACE; - YSolverSet[solverIndex]->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A); - DEBUG_TRACE; - } - // use d_source as d_b - DEBUG_TRACE; - YSolverSet[solverIndex]->solve(dataBase_.num_cells, dataBase_.d_y + dataBase_.num_cells * solverIndex, d_source); - DEBUG_TRACE; -} + // copy y and boundary_y to host + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_y, dataBase_.d_y, dataBase_.cell_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_y, dataBase_.d_boundary_y, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); -void dfYEqn::postProcess(double *h_y, double *h_boundary_y) { #ifdef STREAM_ALLOCATOR // thermophysical fields checkCudaErrors(cudaFreeAsync(d_rhoD, dataBase_.stream)); @@ -655,12 +630,35 @@ void dfYEqn::postProcess(double *h_y, double *h_boundary_y) { checkCudaErrors(cudaFreeAsync(d_boundary_coeffs, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); #endif - // copy y to host - 
checkCudaErrors(cudaMemcpyAsync(h_y, dataBase_.d_y, dataBase_.cell_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); - checkCudaErrors(cudaMemcpyAsync(h_boundary_y, dataBase_.d_boundary_y, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); - checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); + TICK_END_EVENT(YEqn post process for all species); + sync(); +} + +void dfYEqn::solve(int solverIndex) { + TICK_INIT_EVENT; + TICK_START_EVENT; + if (num_iteration == 0) // first interation + { + fprintf(stderr, "Initializing AmgX Linear Solver\n"); + DEBUG_TRACE; + YSolverSet[solverIndex]->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A); + DEBUG_TRACE; + } + else + { + DEBUG_TRACE; + YSolverSet[solverIndex]->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A); + DEBUG_TRACE; + } + // use d_source as d_b + DEBUG_TRACE; + YSolverSet[solverIndex]->solve(dataBase_.num_cells, dataBase_.d_y + dataBase_.num_cells * solverIndex, d_source); + DEBUG_TRACE; + TICK_END_EVENT(YEqn solve one specie); } +void dfYEqn::postProcess(double *h_y, double *h_boundary_y) {} + void dfYEqn::sync() { checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); } diff --git a/src_gpu/dfpEqn.H b/src_gpu/dfpEqn.H index fe58e6608..b8d84640b 100644 --- a/src_gpu/dfpEqn.H +++ b/src_gpu/dfpEqn.H @@ -11,10 +11,12 @@ private: dfMatrixDataBase &dataBase_; // cuda resource + cudaStream_t stream; // one graph for one eqn before using self-developed solver cudaGraph_t graph_pre, graph_post; cudaGraphExec_t graph_instance_pre, graph_instance_post; - bool graph_created=false; + bool pre_graph_created=false; + bool post_graph_created=false; // constant values -- basic std::string mode_string; diff --git a/src_gpu/dfpEqn.cu b/src_gpu/dfpEqn.cu index 139fe3a1a..b8940d878 100644 --- a/src_gpu/dfpEqn.cu +++ b/src_gpu/dfpEqn.cu @@ 
-294,6 +294,7 @@ double* dfpEqn::getFieldPointer(const char* fieldAlias, location loc, position p } void dfpEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path) { + this->stream = dataBase_.stream; this->mode_string = mode_string; this->setting_path = setting_path; pSolver = new AmgXSolver(mode_string, setting_path, dataBase_.localRank); @@ -340,10 +341,12 @@ void dfpEqn::initNonConstantFields(const double *p, const double *boundary_p){ } void dfpEqn::cleanCudaResources() { - if (graph_created) { + if (pre_graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance_pre)); - checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_pre)); + } + if (post_graph_created) { + checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_post)); } } @@ -364,15 +367,10 @@ void dfpEqn::correctP(const double *h_p, double *h_boundary_p) { }; void dfpEqn::process() { - - float time_elapsed=0; - cudaEvent_t start,stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - checkCudaErrors(cudaEventRecord(start,0)); - -#ifndef TIME_GPU - if(!graph_created) { + TICK_INIT_EVENT; + TICK_START_EVENT; +#ifdef USE_GRAPH + if(!pre_graph_created) { DEBUG_TRACE; checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif @@ -428,75 +426,66 @@ void dfpEqn::process() { dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), d_ldu, d_source, d_internal_coeffs, d_boundary_coeffs, d_A); -#ifndef TIME_GPU +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_pre)); checkCudaErrors(cudaGraphInstantiate(&graph_instance_pre, graph_pre, NULL, NULL, 0)); + pre_graph_created = true; } DEBUG_TRACE; checkCudaErrors(cudaGraphLaunch(graph_instance_pre, dataBase_.stream)); #endif - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - 
checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - fprintf(stderr, "peqn assembly time:%f(ms)\n",time_elapsed); - - checkCudaErrors(cudaEventRecord(start,0)); + TICK_END_EVENT(pEqn assembly); + + TICK_START_EVENT; solve(); - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - fprintf(stderr, "peqn solve time:%f(ms)\n",time_elapsed); - - checkCudaErrors(cudaEventRecord(start,0)); -#ifndef TIME_GPU - if(!graph_created) { + TICK_END_EVENT(pEqn solve); + + TICK_START_EVENT; +#ifdef USE_GRAPH + if(!post_graph_created) { checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif - correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), - dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(), - patch_type_p.data(), dataBase_.d_boundary_delta_coeffs, - dataBase_.d_boundary_face_cell, dataBase_.d_p, dataBase_.d_boundary_p); - // update phi - fvMtx_flux(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, - d_lower, d_upper, dataBase_.d_p, d_flux, - dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), - dataBase_.d_boundary_face_cell, d_internal_coeffs, d_boundary_coeffs, dataBase_.d_boundary_p, d_boundary_flux); - field_add_scalar(dataBase_.stream, dataBase_.num_surfaces, d_phiHbyA, d_flux, dataBase_.d_phi, - dataBase_.num_boundary_surfaces, d_boundary_phiHbyA, d_boundary_flux, dataBase_.d_boundary_phi); - // correct U - checkCudaErrors(cudaMemsetAsync(dataBase_.d_u, 0., dataBase_.cell_value_vec_bytes, dataBase_.stream)); - // TODO: may do not need to calculate boundary fields - fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, 
dataBase_.num_boundary_surfaces, - dataBase_.d_owner, dataBase_.d_neighbor, - dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, dataBase_.d_u, - dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), dataBase_.d_boundary_weight, - dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume, true); - scalar_field_multiply_vector_field(dataBase_.stream, dataBase_.num_cells, dataBase_.d_rAU, dataBase_.d_u, dataBase_.d_u); - field_add_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.d_HbyA, dataBase_.d_u, dataBase_.d_u, -1.); - correct_boundary_conditions_vector(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_boundary_surfaces, - dataBase_.num_cells, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_U.data(), dataBase_.d_boundary_face_cell, - dataBase_.d_u, dataBase_.d_boundary_u); - vector_half_mag_square(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, dataBase_.d_k, dataBase_.num_boundary_surfaces, - dataBase_.d_boundary_u, dataBase_.d_boundary_k); - // calculate dpdt - fvc_ddt_scalar_field(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, dataBase_.d_p, dataBase_.d_p_old, dataBase_.d_volume, dataBase_.d_dpdt, 1.); - - #ifndef TIME_GPU + correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), + dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(), + patch_type_p.data(), dataBase_.d_boundary_delta_coeffs, + dataBase_.d_boundary_face_cell, dataBase_.d_p, dataBase_.d_boundary_p); + // update phi + fvMtx_flux(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, + d_lower, d_upper, dataBase_.d_p, d_flux, + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), + dataBase_.d_boundary_face_cell, d_internal_coeffs, d_boundary_coeffs, dataBase_.d_boundary_p, d_boundary_flux); + 
field_add_scalar(dataBase_.stream, dataBase_.num_surfaces, d_phiHbyA, d_flux, dataBase_.d_phi, + dataBase_.num_boundary_surfaces, d_boundary_phiHbyA, d_boundary_flux, dataBase_.d_boundary_phi); + // correct U + checkCudaErrors(cudaMemsetAsync(dataBase_.d_u, 0., dataBase_.cell_value_vec_bytes, dataBase_.stream)); + // TODO: may do not need to calculate boundary fields + fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, + dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, dataBase_.d_u, + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), dataBase_.d_boundary_weight, + dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume, true); + scalar_field_multiply_vector_field(dataBase_.stream, dataBase_.num_cells, dataBase_.d_rAU, dataBase_.d_u, dataBase_.d_u); + field_add_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.d_HbyA, dataBase_.d_u, dataBase_.d_u, -1.); + correct_boundary_conditions_vector(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), + dataBase_.num_boundary_surfaces, + dataBase_.num_cells, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_U.data(), dataBase_.d_boundary_face_cell, + dataBase_.d_u, dataBase_.d_boundary_u); + vector_half_mag_square(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, dataBase_.d_k, dataBase_.num_boundary_surfaces, + dataBase_.d_boundary_u, dataBase_.d_boundary_k); + // calculate dpdt + fvc_ddt_scalar_field(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, + dataBase_.d_p, dataBase_.d_p_old, dataBase_.d_volume, dataBase_.d_dpdt, 1.); + +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_post)); checkCudaErrors(cudaGraphInstantiate(&graph_instance_post, graph_post, NULL, NULL, 0)); - graph_created = true; + post_graph_created = true; } 
checkCudaErrors(cudaGraphLaunch(graph_instance_post, dataBase_.stream)); #endif - - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - fprintf(stderr, "peqn post process time:%f(ms)\n\n",time_elapsed); + TICK_END_EVENT(pEqn post process); + sync(); } void dfpEqn::postProcess() {} From 5b8071353e4c969f19330ea8f7541d4867ae176f Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Mon, 23 Oct 2023 15:32:38 +0800 Subject: [PATCH 2/6] refine time statistic --- src_gpu/dfEEqn.cu | 8 ++++++-- src_gpu/dfMatrixDataBase.H | 4 ++-- src_gpu/dfMatrixOpBase.cu | 12 +++++++++++- src_gpu/dfRhoEqn.cu | 4 +++- src_gpu/dfUEqn.cu | 10 +++++++--- src_gpu/dfYEqn.cu | 6 +++++- src_gpu/dfpEqn.cu | 4 ++-- 7 files changed, 36 insertions(+), 12 deletions(-) diff --git a/src_gpu/dfEEqn.cu b/src_gpu/dfEEqn.cu index b9f92d526..c4c1bc549 100644 --- a/src_gpu/dfEEqn.cu +++ b/src_gpu/dfEEqn.cu @@ -204,21 +204,25 @@ void dfEEqn::process() { #endif TICK_END_EVENT(EEqn solve); - TICK_START_EVENT; #ifdef USE_GRAPH if(!post_graph_created) { checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif + TICK_START_EVENT; correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_he.data(), dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_face_cell, dataBase_.d_he, dataBase_.d_boundary_he, d_boundary_heGradient); + TICK_END_EVENT(EEqn post process correctBC); + TICK_START_EVENT; // copy he and boundary_he to host checkCudaErrors(cudaMemcpyAsync(dataBase_.h_he, dataBase_.d_he, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_he, dataBase_.d_boundary_he, dataBase_.boundary_surface_value_bytes, 
cudaMemcpyDeviceToHost, dataBase_.stream)); + TICK_END_EVENT(EEqn post process copy back); + TICK_START_EVENT; #ifdef STREAM_ALLOCATOR // thermophysical fields checkCudaErrors(cudaFreeAsync(d_dpdt, dataBase_.stream)); @@ -238,6 +242,7 @@ void dfEEqn::process() { checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream)); #endif + TICK_END_EVENT(EEqn post process free); #ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_post)); checkCudaErrors(cudaGraphInstantiate(&graph_instance_post, graph_post, NULL, NULL, 0)); @@ -245,7 +250,6 @@ void dfEEqn::process() { } checkCudaErrors(cudaGraphLaunch(graph_instance_post, dataBase_.stream)); #endif - TICK_END_EVENT(EEqn post process); sync(); } diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index def2f4a89..ba3403ffe 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -16,8 +16,8 @@ #include #include -#define DEBUG_ -#define DEBUG_CHECK_LDU +//#define DEBUG_ +//#define DEBUG_CHECK_LDU extern int myRank; diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index 9d9d33566..dda7f9003 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -297,10 +297,13 @@ void correct_boundary_conditions_processor_scalar(cudaStream_t stream, ncclComm_ correct_internal_boundary_field_scalar<<>>(num, offset, vf, boundary_cell_face, vf_boundary); + TICK_INIT_EVENT; + TICK_START_EVENT; checkNcclErrors(ncclGroupStart()); checkNcclErrors(ncclSend(vf_boundary + internal_start_index, num, ncclDouble, peer, comm, stream)); checkNcclErrors(ncclRecv(vf_boundary + neighbor_start_index, num, ncclDouble, peer, comm, stream)); checkNcclErrors(ncclGroupEnd()); + TICK_END_EVENT(nccl scalar); //checkCudaErrors(cudaStreamSynchronize(stream)); } @@ -316,12 +319,15 @@ void correct_boundary_conditions_processor_vector(cudaStream_t stream, ncclComm_ correct_internal_boundary_field_vector<<>>(num, offset, 
num_boundary_surfaces, num_cells, vf, boundary_cell_face, vf_boundary); + TICK_INIT_EVENT; + TICK_START_EVENT; checkNcclErrors(ncclGroupStart()); for (int i = 0; i < 3; i++) { checkNcclErrors(ncclSend(vf_boundary + num_boundary_surfaces * i + internal_start_index, num, ncclDouble, peer, comm, stream)); checkNcclErrors(ncclRecv(vf_boundary + num_boundary_surfaces * i + neighbor_start_index, num, ncclDouble, peer, comm, stream)); } checkNcclErrors(ncclGroupEnd()); + TICK_END_EVENT(nccl vector); //checkCudaErrors(cudaStreamSynchronize(stream)); } @@ -1179,13 +1185,17 @@ void fvc_grad_vector_correctBC_processor(cudaStream_t stream, ncclComm_t comm, correct_internal_boundary_field_tensor<<>>(num, offset, num_boundary_surfaces, num_cells, internal_grad, face2Cells, boundary_grad); + + TICK_INIT_EVENT; + TICK_START_EVENT; checkNcclErrors(ncclGroupStart()); for (int i = 0; i < 9; i++) { checkNcclErrors(ncclSend(boundary_grad + num_boundary_surfaces * i + internal_start_index, num, ncclDouble, peer, comm, stream)); checkNcclErrors(ncclRecv(boundary_grad + num_boundary_surfaces * i + neighbor_start_index, num, ncclDouble, peer, comm, stream)); } checkNcclErrors(ncclGroupEnd()); - checkCudaErrors(cudaStreamSynchronize(stream)); + TICK_END_EVENT(nccl tensor); + //checkCudaErrors(cudaStreamSynchronize(stream)); } __global__ void fvc_grad_cell_scalar_correctBC_zeroGradient(int num_cells, int num_boundary_surfaces, diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu/dfRhoEqn.cu index 0babea810..3b848b6c7 100644 --- a/src_gpu/dfRhoEqn.cu +++ b/src_gpu/dfRhoEqn.cu @@ -81,8 +81,10 @@ void dfRhoEqn::process() checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_diag, dataBase_.stream)); #endif + TICK_END_EVENT(rhoEqn post process free); + TICK_START_EVENT; checkCudaErrors(cudaMemcpyAsync(dataBase_.h_rho, dataBase_.d_rho, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - TICK_END_EVENT(rhoEqn post process); + TICK_END_EVENT(rhoEqn 
post process copyback); sync(); } diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 61dc11340..76bf3f93d 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -501,23 +501,27 @@ void dfUEqn::process() { #endif TICK_END_EVENT(UEqn solve); - TICK_START_EVENT; #ifdef USE_GRAPH if(!post_graph_created) { checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif + TICK_START_EVENT; correct_boundary_conditions_vector(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_boundary_surfaces, dataBase_.num_cells, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_face_cell, dataBase_.d_u, dataBase_.d_boundary_u); vector_half_mag_square(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, dataBase_.d_k, dataBase_.num_boundary_surfaces, dataBase_.d_boundary_u, dataBase_.d_boundary_k); - + TICK_END_EVENT(UEqn post process correctBC); + + TICK_START_EVENT; // copy u and boundary_u to host permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, d_u_host_order); checkCudaErrors(cudaMemcpyAsync(dataBase_.h_u, d_u_host_order, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + TICK_END_EVENT(UEqn post process copyback); + TICK_START_EVENT; #ifdef STREAM_ALLOCATOR // free // thermophysical fields @@ -541,6 +545,7 @@ void dfUEqn::process() { checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream)); #endif + TICK_END_EVENT(UEqn post process free); #ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_post)); @@ -549,7 +554,6 @@ void dfUEqn::process() { } checkCudaErrors(cudaGraphLaunch(graph_instance_post, dataBase_.stream)); #endif - TICK_END_EVENT(UEqn post process); sync(); } diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu index cc08edbe1..b4c499db8 100644 --- a/src_gpu/dfYEqn.cu +++ b/src_gpu/dfYEqn.cu @@ -591,11 +591,15 @@ void 
dfYEqn::process() { patch_type.data(), dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_face_cell, dataBase_.d_y + dataBase_.num_cells * s, dataBase_.d_boundary_y + dataBase_.num_boundary_surfaces * s); } + TICK_END_EVENT(YEqn post process for all species correctBC); + TICK_START_EVENT; // copy y and boundary_y to host checkCudaErrors(cudaMemcpyAsync(dataBase_.h_y, dataBase_.d_y, dataBase_.cell_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_y, dataBase_.d_boundary_y, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); + TICK_END_EVENT(YEqn post process for all species copy back); + TICK_START_EVENT; #ifdef STREAM_ALLOCATOR // thermophysical fields checkCudaErrors(cudaFreeAsync(d_rhoD, dataBase_.stream)); @@ -630,7 +634,7 @@ void dfYEqn::process() { checkCudaErrors(cudaFreeAsync(d_boundary_coeffs, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); #endif - TICK_END_EVENT(YEqn post process for all species); + TICK_END_EVENT(YEqn post process for all species free); sync(); } diff --git a/src_gpu/dfpEqn.cu b/src_gpu/dfpEqn.cu index b8940d878..71dad23d7 100644 --- a/src_gpu/dfpEqn.cu +++ b/src_gpu/dfpEqn.cu @@ -440,12 +440,12 @@ void dfpEqn::process() { solve(); TICK_END_EVENT(pEqn solve); - TICK_START_EVENT; #ifdef USE_GRAPH if(!post_graph_created) { checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif + TICK_START_EVENT; correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), dataBase_.d_boundary_delta_coeffs, @@ -476,6 +476,7 @@ void dfpEqn::process() { // calculate dpdt fvc_ddt_scalar_field(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, dataBase_.d_p, dataBase_.d_p_old, dataBase_.d_volume, 
dataBase_.d_dpdt, 1.); + TICK_END_EVENT(pEqn post process all); #ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_post)); @@ -484,7 +485,6 @@ void dfpEqn::process() { } checkCudaErrors(cudaGraphLaunch(graph_instance_post, dataBase_.stream)); #endif - TICK_END_EVENT(pEqn post process); sync(); } void dfpEqn::postProcess() {} From 74dcf0c622f7fd5cc799def761557f304330eb22 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Tue, 24 Oct 2023 17:09:21 +0800 Subject: [PATCH 3/6] refactor macros; fix invalid config for empty patches; --- src_gpu/dfEEqn.H | 2 ++ src_gpu/dfEEqn.cu | 5 ++++- src_gpu/dfMatrixOpBase.H | 27 ++++++++++++++++++++++++--- src_gpu/dfMatrixOpBase.cu | 25 +++++++++++++++++++++++++ src_gpu/dfRhoEqn.H | 2 ++ src_gpu/dfRhoEqn.cu | 2 ++ src_gpu/dfThermo.cu | 1 + src_gpu/dfUEqn.H | 2 ++ src_gpu/dfUEqn.cu | 6 ++++++ src_gpu/dfYEqn.H | 2 ++ src_gpu/dfYEqn.cu | 24 ++++++++++++++---------- src_gpu/dfpEqn.H | 2 ++ src_gpu/dfpEqn.cu | 6 ++++++ 13 files changed, 92 insertions(+), 14 deletions(-) diff --git a/src_gpu/dfEEqn.H b/src_gpu/dfEEqn.H index 63616b414..6fac22965 100644 --- a/src_gpu/dfEEqn.H +++ b/src_gpu/dfEEqn.H @@ -13,11 +13,13 @@ class dfEEqn // cuda resource cudaStream_t stream; +#ifdef USE_GRAPH // one graph for one eqn before using self-developed solver cudaGraph_t graph_pre, graph_post; cudaGraphExec_t graph_instance_pre, graph_instance_post; bool pre_graph_created=false; bool post_graph_created=false; +#endif // constant values -- basic std::string mode_string; diff --git a/src_gpu/dfEEqn.cu b/src_gpu/dfEEqn.cu index c4c1bc549..e2b993ba7 100644 --- a/src_gpu/dfEEqn.cu +++ b/src_gpu/dfEEqn.cu @@ -89,6 +89,7 @@ void dfEEqn::initNonConstantFields(const double *he, const double *boundary_he) } void dfEEqn::cleanCudaResources() { +#ifdef USE_GRAPH if (pre_graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance_pre)); checkCudaErrors(cudaGraphDestroy(graph_pre)); @@ -97,6 +98,7 @@ void 
dfEEqn::cleanCudaResources() { checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_post)); } +#endif } void dfEEqn::preProcess(const double *h_he, const double *h_k, const double *h_k_old, const double *h_dpdt, const double *h_boundary_k, const double *h_boundary_heGradient) @@ -123,7 +125,7 @@ void dfEEqn::process() { checkCudaErrors(cudaMallocAsync((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMallocAsync((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); - checkCudaErrors(cudaMallocAsync((void**)&d_boundary_heGradient, dataBase_.num_gradientEnergy_boundary_surfaces, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_heGradient, sizeof(double) * num_gradientEnergy_boundary_surfaces, dataBase_.stream)); checkCudaErrors(cudaMallocAsync((void**)&d_source, dataBase_.cell_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMallocAsync((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); @@ -261,6 +263,7 @@ void dfEEqn::eeqn_calculate_energy_gradient(dfThermo& GPUThermo, int num_cells, { int bou_offset = 0, gradient_offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; if (patch_type[i] == boundaryConditions::gradientEnergy) { GPUThermo.calculateEnergyGradient(patch_size[i], num_cells, num_species, num_boundary_surfaces, bou_offset, gradient_offset, face2Cells, T, p, y, boundary_delta_coeffs, boundary_p, boundary_y, boundary_thermo_gradient); diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index 195e86225..ea8b5db99 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -2,18 +2,39 @@ #include #include +// macros need to be in a certain order + +// sequence 0: STREAM_ALLOCATOR can be open or not +#ifndef STREAM_ALLOCATOR + #define STREAM_ALLOCATOR +#endif + +// sequence 0: USE_GRAPH 
can be open or not #ifndef USE_GRAPH #define USE_GRAPH #endif +// sequence 0: TIME_GPU can be open or not #ifndef TIME_GPU #define TIME_GPU - #if (defined TIME_GPU) && (defined USE_GRAPH) - #undef USE_GRAPH - #endif +#endif + +// sequence 1: TIME_GPU and USE_GRAPH can not be open at the same time +#if (defined TIME_GPU) && (defined USE_GRAPH) + #undef USE_GRAPH +#endif + +// sequence 2: STREAM_ALLOCATOR must be open if USE_GRAPH is open +#if (defined USE_GRAPH) && (!defined STREAM_ALLOCATOR) + #define STREAM_ALLOCATOR #endif extern int myRank; + +#define PRINT_PTR(x) { \ + fprintf(stderr, "rank[%d], %s %d, print ptr %s: %p\n", myRank, __FILE__, __LINE__, #x, x); \ +} + extern __global__ void warmup(); #ifdef TIME_GPU diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index dda7f9003..129fe0fcb 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -2034,6 +2034,7 @@ void ldu_to_csr_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int int bou_offset = 0, ext_offset = 0; size_t threads_per_block, blocks_per_grid; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; if (patch_type[i] == boundaryConditions::processor) { threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; @@ -2049,6 +2050,7 @@ void ldu_to_csr_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int // add coeff to source and diagnal bou_offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::processor) { @@ -2096,6 +2098,7 @@ void update_boundary_coeffs_scalar(cudaStream_t stream, int offset = 0; int gradient_offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // 
TODO: just basic patch type now @@ -2135,6 +2138,7 @@ void correct_boundary_conditions_scalar(cudaStream_t stream, ncclComm_t comm, int offset = 0; int gradient_offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::zeroGradient @@ -2170,6 +2174,7 @@ void correct_boundary_conditions_vector(cudaStream_t stream, ncclComm_t comm, int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::zeroGradient @@ -2202,6 +2207,7 @@ void update_boundary_coeffs_vector(cudaStream_t stream, int num_boundary_surface int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2277,6 +2283,7 @@ void fvm_div_scalar(cudaStream_t stream, int num_surfaces, const int *lowerAddr, int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2316,6 +2323,7 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surf int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2354,6 +2362,7 @@ void fvm_laplacian_scalar(cudaStream_t stream, int num_surfaces, int num_boundar weight, mag_sf, delta_coeffs, gamma, lower, upper, diag, sign); int offset = 0; for (int i = 0; i < num_patches; i++) { + if 
(patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2393,6 +2402,7 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundar weight, mag_sf, delta_coeffs, gamma, lower, upper, diag, sign); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2432,6 +2442,7 @@ void fvm_laplacian_surface_scalar_vol_scalar(cudaStream_t stream, int num_surfac int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; fvm_laplacian_surface_scalar_boundary<<>>(patch_size[i], offset, @@ -2472,6 +2483,7 @@ void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, int offset = 0; // finish conctruct grad field except dividing cell volume for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2501,6 +2513,7 @@ void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, // correct boundary conditions offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2550,6 +2563,7 @@ void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; fvc_div_surface_scalar_boundary<<>>(patch_size[i], offset, 
boundary_cell_face, @@ -2576,6 +2590,7 @@ void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, i int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2615,6 +2630,7 @@ void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, i int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2652,6 +2668,7 @@ void fvc_div_surface_scalar_vol_scalar(cudaStream_t stream, int num_surfaces, int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just non-coupled patch type now @@ -2688,6 +2705,7 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just non-coupled patch type now @@ -2723,6 +2741,7 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just non-coupled patch type now @@ -2769,6 +2788,7 @@ void fvc_grad_cell_scalar_withBC(cudaStream_t stream, ncclComm_t comm, const int // correct boundary conditions int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + 
threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2813,6 +2833,7 @@ void fvc_laplacian_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2847,6 +2868,7 @@ void fvc_flux(cudaStream_t stream, int num_cells, int num_surfaces, int num_boun int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: maybe do not need loop boundarys @@ -2876,6 +2898,7 @@ void fvc_interpolate(cudaStream_t stream, int num_cells, int num_surfaces, int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: maybe do not need loop boundarys @@ -2943,6 +2966,7 @@ void fvMtx_H(cudaStream_t stream, int num_cells, int num_surfaces, int num_bound int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just non-coupled patch type now @@ -2972,6 +2996,7 @@ void fvMtx_flux(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::processor) { diff --git a/src_gpu/dfRhoEqn.H b/src_gpu/dfRhoEqn.H index f79b0584b..c38347977 100644 --- a/src_gpu/dfRhoEqn.H +++ b/src_gpu/dfRhoEqn.H @@ -10,10 +10,12 @@ private: // cuda resource cudaStream_t stream; +#ifdef 
USE_GRAPH // one graph for one eqn before using self-developed solver cudaGraph_t graph; cudaGraphExec_t graph_instance; bool graph_created=false; +#endif // constant fields - boundary std::vector patch_type; diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu/dfRhoEqn.cu index 3b848b6c7..627ef26dc 100644 --- a/src_gpu/dfRhoEqn.cu +++ b/src_gpu/dfRhoEqn.cu @@ -26,10 +26,12 @@ void dfRhoEqn::initNonConstantFields(const double *rho, const double *phi, } void dfRhoEqn::cleanCudaResources() { +#ifdef USE_GRAPH if (graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance)); checkCudaErrors(cudaGraphDestroy(graph)); } +#endif } void dfRhoEqn::preProcess() diff --git a/src_gpu/dfThermo.cu b/src_gpu/dfThermo.cu index da62828aa..4d7eec672 100644 --- a/src_gpu/dfThermo.cu +++ b/src_gpu/dfThermo.cu @@ -507,6 +507,7 @@ void dfThermo::correctThermo() // boundary field int offset = 0; for (int i = 0; i < dataBase_.num_patches; i++) { + if (dataBase_.patch_size[i] == 0) continue; if (dataBase_.patch_type_T[i] == boundaryConditions::fixedValue) { calculateTPolyGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.num_boundary_surfaces, dataBase_.d_boundary_T, d_boundary_T_poly, offset); calculateEnthalpyGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.d_boundary_T, dataBase_.d_boundary_he, diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index 99db6ef60..33ca8ede3 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -12,11 +12,13 @@ private: // cuda resource cudaStream_t stream; +#ifdef USE_GRAPH // one graph for one eqn before using self-developed solver cudaGraph_t graph_pre, graph_post; cudaGraphExec_t graph_instance_pre, graph_instance_post; bool pre_graph_created=false; bool post_graph_created=false; +#endif // constant values -- basic std::string mode_string; diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 76bf3f93d..56682491d 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -357,6 +357,7 @@ void dfUEqn::initNonConstantFieldsBoundary() { } 
void dfUEqn::cleanCudaResources() { +#ifdef USE_GRAPH if (pre_graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance_pre)); checkCudaErrors(cudaGraphDestroy(graph_pre)); @@ -365,6 +366,7 @@ void dfUEqn::cleanCudaResources() { checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_post)); } +#endif } void dfUEqn::preProcessForRhoEqn(const double *h_rho, const double *h_phi, const double *h_boundary_phi) { @@ -636,6 +638,7 @@ void dfUEqn::UEqnGetHbyA(cudaStream_t stream, ncclComm_t comm, const int *neighb int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just non-coupled patch type now @@ -666,6 +669,7 @@ void dfUEqn::UEqnGetHbyA(cudaStream_t stream, ncclComm_t comm, const int *neighb // constrainHbyA offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just non-coupled patch type now @@ -698,6 +702,7 @@ void dfUEqn::ueqn_ldu_to_csr(cudaStream_t stream, int num_cells, int num_surface int bou_offset = 0, ext_offset = 0; size_t threads_per_block, blocks_per_grid; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; if (patch_type[i] == boundaryConditions::processor) { threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; @@ -719,6 +724,7 @@ void dfUEqn::ueqn_ldu_to_csr(cudaStream_t stream, int num_cells, int num_surface // add coeff to source and diagnal bou_offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::processor) { diff --git a/src_gpu/dfYEqn.H b/src_gpu/dfYEqn.H 
index 10304bd97..b83445202 100644 --- a/src_gpu/dfYEqn.H +++ b/src_gpu/dfYEqn.H @@ -12,10 +12,12 @@ private: // cuda resource cudaStream_t stream; +#ifdef USE_GRAPH // one graph for one eqn before using self-developed solver cudaGraph_t graph; cudaGraphExec_t graph_instance; bool graph_created=false; +#endif // constant values -- basic std::string mode_string; diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu index b4c499db8..b3eb211ba 100644 --- a/src_gpu/dfYEqn.cu +++ b/src_gpu/dfYEqn.cu @@ -373,13 +373,6 @@ void dfYEqn::createNonConstantLduAndCsrFields() { void dfYEqn::initNonConstantFieldsInternal(const double *y) { checkCudaErrors(cudaMemcpyAsync(dataBase_.d_y, y, dataBase_.cell_value_bytes * dataBase_.num_species, cudaMemcpyHostToDevice, dataBase_.stream)); - - // UnityLewis - checkCudaErrors(cudaMemsetAsync(d_hai, 0, dataBase_.cell_value_bytes * dataBase_.num_species, dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_boundary_hai, 0, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, dataBase_.stream)); - // laminar - checkCudaErrors(cudaMemsetAsync(d_mut_sct, 0, dataBase_.cell_value_bytes, dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_boundary_mut_sct, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); } void dfYEqn::initNonConstantFieldsBoundary(const double *boundary_y) { @@ -397,10 +390,12 @@ void dfYEqn::initNonConstantFieldsBoundary(const double *boundary_y) { } void dfYEqn::cleanCudaResources() { +#ifdef USE_GRAPH if (graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance)); checkCudaErrors(cudaGraphDestroy(graph)); } +#endif } void dfYEqn::preProcess(const double *h_rhoD, const double *h_boundary_rhoD, @@ -449,6 +444,14 @@ void dfYEqn::process() { checkCudaErrors(cudaMallocAsync((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMallocAsync((void**)&d_A, dataBase_.csr_value_bytes, dataBase_.stream)); #endif + + // UnityLewis + 
checkCudaErrors(cudaMemsetAsync(d_hai, 0, dataBase_.cell_value_bytes * dataBase_.num_species, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_boundary_hai, 0, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, dataBase_.stream)); + // laminar + checkCudaErrors(cudaMemsetAsync(d_mut_sct, 0, dataBase_.cell_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_boundary_mut_sct, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(dataBase_.d_diff_alphaD, 0, dataBase_.cell_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(dataBase_.d_boundary_diff_alphaD, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(d_grad_y, 0, dataBase_.cell_value_vec_bytes * dataBase_.num_species, dataBase_.stream)); @@ -583,7 +586,6 @@ void dfYEqn::process() { TICK_START_EVENT; // compute y_inertIndex yeqn_compute_y_inertIndex(dataBase_.stream, dataBase_.num_species, inertIndex, dataBase_.num_cells, dataBase_.d_y); - // correct boundary conditions for (int s = 0; s < dataBase_.num_species; s++) { correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), @@ -602,7 +604,7 @@ void dfYEqn::process() { TICK_START_EVENT; #ifdef STREAM_ALLOCATOR // thermophysical fields - checkCudaErrors(cudaFreeAsync(d_rhoD, dataBase_.stream)); + //checkCudaErrors(cudaFreeAsync(d_rhoD, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_hai, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_mut_sct, dataBase_.stream)); // intermediate fields @@ -613,7 +615,7 @@ void dfYEqn::process() { checkCudaErrors(cudaFreeAsync(d_permute, dataBase_.stream)); // thermophysical fields - checkCudaErrors(cudaFreeAsync(d_boundary_rhoD, dataBase_.stream)); + //checkCudaErrors(cudaFreeAsync(d_boundary_rhoD, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_boundary_hai, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_boundary_mut_sct, 
dataBase_.stream)); // intermediate fields @@ -709,6 +711,7 @@ void dfYEqn::yeqn_fvc_laplacian_scalar(cudaStream_t stream, ncclComm_t comm, con int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -746,6 +749,7 @@ void dfYEqn::yeqn_fvc_laplacian_scalar(cudaStream_t stream, ncclComm_t comm, con boundary_cell_face, output, boundary_output); offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; if (patch_type[i] == boundaryConditions::processor) { correct_boundary_conditions_processor_scalar(stream, comm, neighbor_peer[i], patch_size[i], offset, output, boundary_cell_face, boundary_output); diff --git a/src_gpu/dfpEqn.H b/src_gpu/dfpEqn.H index b8d84640b..ebbb246f4 100644 --- a/src_gpu/dfpEqn.H +++ b/src_gpu/dfpEqn.H @@ -12,11 +12,13 @@ private: // cuda resource cudaStream_t stream; +#ifdef USE_GRAPH // one graph for one eqn before using self-developed solver cudaGraph_t graph_pre, graph_post; cudaGraphExec_t graph_instance_pre, graph_instance_post; bool pre_graph_created=false; bool post_graph_created=false; +#endif // constant values -- basic std::string mode_string; diff --git a/src_gpu/dfpEqn.cu b/src_gpu/dfpEqn.cu index 71dad23d7..d96380fbd 100644 --- a/src_gpu/dfpEqn.cu +++ b/src_gpu/dfpEqn.cu @@ -341,6 +341,7 @@ void dfpEqn::initNonConstantFields(const double *p, const double *boundary_p){ } void dfpEqn::cleanCudaResources() { +#ifdef USE_GRAPH if (pre_graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance_pre)); checkCudaErrors(cudaGraphDestroy(graph_pre)); @@ -349,6 +350,7 @@ void dfpEqn::cleanCudaResources() { checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_post)); } +#endif } // tmp @@ -511,6 +513,7 @@ void dfpEqn::getrhorAUf(cudaStream_t stream, int num_cells, int num_surfaces, int 
offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: maybe do not need loop boundarys @@ -547,6 +550,7 @@ void dfpEqn::getphiHbyA(cudaStream_t stream, int num_cells, int num_surfaces, in int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::processor) { @@ -565,6 +569,7 @@ void dfpEqn::getphiHbyA(cudaStream_t stream, int num_cells, int num_surfaces, in offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::processor) { @@ -584,6 +589,7 @@ void dfpEqn::getphiHbyA(cudaStream_t stream, int num_cells, int num_surfaces, in offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::extrapolated) { From 4496dd573389cf75d49b521eea4f4601b7437413 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Tue, 24 Oct 2023 19:29:36 +0800 Subject: [PATCH 4/6] use stream allocator in pEqn and thermo --- src_gpu/dfThermo.cu | 34 ++++++++++++++++++++-- src_gpu/dfpEqn.H | 2 +- src_gpu/dfpEqn.cu | 71 ++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 95 insertions(+), 12 deletions(-) diff --git a/src_gpu/dfThermo.cu b/src_gpu/dfThermo.cu index 4d7eec672..92a482e5a 100644 --- a/src_gpu/dfThermo.cu +++ b/src_gpu/dfThermo.cu @@ -23,8 +23,8 @@ void init_const_coeff_ptr(std::vector>& nasa_coeffs, std::ve std::vector>& thermal_conductivity_coeffs, std::vector>& binary_diffusion_coeffs, std::vector& 
molecular_weights) { - double *d_tmp; - checkCudaErrors(cudaMalloc((void**)&d_tmp, sizeof(double) * NUM_SPECIES * 15)); + //double *d_tmp; + //checkCudaErrors(cudaMalloc((void**)&d_tmp, sizeof(double) * NUM_SPECIES * 15)); double nasa_coeffs_tmp[NUM_SPECIES*15]; double viscosity_coeffs_tmp[NUM_SPECIES*5]; double thermal_conductivity_coeffs_tmp[NUM_SPECIES*5]; @@ -344,6 +344,7 @@ void dfThermo::setConstantValue(std::string mechanism_file, int num_cells, int n initCoeffsfromBinaryFile(fp); stream = dataBase_.stream; +#ifndef STREAM_ALLOCATOR checkCudaErrors(cudaMalloc((void**)&d_mole_fraction, sizeof(double) * num_species * num_cells)); checkCudaErrors(cudaMalloc((void**)&d_boundary_mole_fraction, sizeof(double) * num_species * dataBase_.num_boundary_surfaces)); checkCudaErrors(cudaMalloc((void**)&d_mean_mole_weight, sizeof(double) * num_cells)); @@ -355,10 +356,10 @@ void dfThermo::setConstantValue(std::string mechanism_file, int num_cells, int n checkCudaErrors(cudaMalloc((void**)&d_boundary_species_viscosities, sizeof(double) * num_species * dataBase_.num_boundary_surfaces)); checkCudaErrors(cudaMalloc((void**)&d_species_thermal_conductivities, sizeof(double) * num_species * num_cells)); checkCudaErrors(cudaMalloc((void**)&d_boundary_species_thermal_conductivities, sizeof(double) * num_species * dataBase_.num_boundary_surfaces)); +#endif checkCudaErrors(cudaMalloc((void**)&d_psip0, sizeof(double) * num_cells)); checkCudaErrors(cudaMalloc((void**)&d_boundary_psip0, sizeof(double) * dataBase_.num_boundary_surfaces)); - std::cout << "dfThermo initialized" << std::endl; } @@ -493,6 +494,20 @@ void dfThermo::updateEnergy() void dfThermo::correctThermo() { +#ifdef STREAM_ALLOCATOR + checkCudaErrors(cudaMallocAsync((void**)&d_mole_fraction, sizeof(double) * num_species * num_cells, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_mole_fraction, sizeof(double) * num_species * dataBase_.num_boundary_surfaces, stream)); + 
checkCudaErrors(cudaMallocAsync((void**)&d_mean_mole_weight, sizeof(double) * num_cells, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_mean_mole_weight, sizeof(double) * dataBase_.num_boundary_surfaces, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_T_poly, sizeof(double) * 5 * num_cells, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_T_poly, sizeof(double) * 5 * dataBase_.num_boundary_surfaces, stream)); + + checkCudaErrors(cudaMallocAsync((void**)&d_species_viscosities, sizeof(double) * num_species * num_cells, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_species_viscosities, sizeof(double) * num_species * dataBase_.num_boundary_surfaces, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_species_thermal_conductivities, sizeof(double) * num_species * num_cells, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_species_thermal_conductivities, sizeof(double) * num_species * dataBase_.num_boundary_surfaces, stream)); +#endif + setMassFraction(dataBase_.d_y, dataBase_.d_boundary_y); // internal field int cell_thread = 512, boundary_thread = 32; @@ -550,6 +565,19 @@ void dfThermo::correctThermo() } else { offset += dataBase_.patch_size[i]; } } +#ifdef STREAM_ALLOCATOR + checkCudaErrors(cudaFreeAsync(d_mole_fraction, stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_mole_fraction, stream)); + checkCudaErrors(cudaFreeAsync(d_mean_mole_weight, stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_mean_mole_weight, stream)); + checkCudaErrors(cudaFreeAsync(d_T_poly, stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_T_poly, stream)); + + checkCudaErrors(cudaFreeAsync(d_species_viscosities, stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_species_viscosities, stream)); + checkCudaErrors(cudaFreeAsync(d_species_thermal_conductivities, stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_species_thermal_conductivities, stream)); +#endif } void dfThermo::updateRho() diff --git 
a/src_gpu/dfpEqn.H b/src_gpu/dfpEqn.H index ebbb246f4..79a414eac 100644 --- a/src_gpu/dfpEqn.H +++ b/src_gpu/dfpEqn.H @@ -96,7 +96,7 @@ public: void preProcess(double *h_phi, double *h_boundary_phi); void correctPsi(const double *h_thermoPsi, double *h_boundary_thermoPsi); // tmp void correctP(const double *h_p, double *h_boundary_p); // tmp - void getFlux(); + //void getFlux(); void process(); void postProcess(); diff --git a/src_gpu/dfpEqn.cu b/src_gpu/dfpEqn.cu index d96380fbd..3b48f55a5 100644 --- a/src_gpu/dfpEqn.cu +++ b/src_gpu/dfpEqn.cu @@ -306,13 +306,17 @@ void dfpEqn::setConstantFields(const std::vector patch_type_U, const std::v } void dfpEqn::createNonConstantFieldsInternal() { +#ifndef STREAM_ALLOCATOR // intermediate fields checkCudaErrors(cudaMalloc((void**)&d_rhorAUf, dataBase_.surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_phiHbyA, dataBase_.surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_flux, dataBase_.surface_value_bytes)); +#endif } void dfpEqn::createNonConstantFieldsBoundary() { +#ifndef STREAM_ALLOCATOR + // boundary coeffs checkCudaErrors(cudaMalloc((void**)&d_value_internal_coeffs, dataBase_.boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_value_boundary_coeffs, dataBase_.boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_bytes)); @@ -321,18 +325,22 @@ void dfpEqn::createNonConstantFieldsBoundary() { checkCudaErrors(cudaMalloc((void**)&d_boundary_rhorAUf, dataBase_.boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_phiHbyA, dataBase_.boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_flux, dataBase_.boundary_surface_value_bytes)); +#endif } void dfpEqn::createNonConstantLduAndCsrFields() { + // ldu and csr checkCudaErrors(cudaMalloc((void**)&d_ldu, dataBase_.csr_value_bytes)); d_lower = d_ldu; d_diag = d_ldu + dataBase_.num_surfaces; d_upper = d_ldu + 
dataBase_.num_cells + dataBase_.num_surfaces; d_extern = d_ldu + dataBase_.num_cells + 2 * dataBase_.num_surfaces; +#ifndef STREAM_ALLOCATOR checkCudaErrors(cudaMalloc((void**)&d_source, dataBase_.cell_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_A, dataBase_.csr_value_bytes)); +#endif } void dfpEqn::initNonConstantFields(const double *p, const double *boundary_p){ @@ -377,6 +385,29 @@ void dfpEqn::process() { checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif +#ifdef STREAM_ALLOCATOR + // intermediate fields + checkCudaErrors(cudaMallocAsync((void**)&d_rhorAUf, dataBase_.surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_phiHbyA, dataBase_.surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_flux, dataBase_.surface_value_bytes, dataBase_.stream)); + + // boundary coeffs + checkCudaErrors(cudaMallocAsync((void**)&d_value_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_value_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + // intermediate boundary fields + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_rhorAUf, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_phiHbyA, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_flux, dataBase_.boundary_surface_value_bytes, 
dataBase_.stream)); + + // ldu and csr + checkCudaErrors(cudaMallocAsync((void**)&d_source, dataBase_.cell_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_A, dataBase_.csr_value_bytes, dataBase_.stream)); +#endif + checkCudaErrors(cudaMemsetAsync(d_ldu, 0, dataBase_.csr_value_bytes, dataBase_.stream)); // d_ldu contains d_lower, d_diag, and d_upper checkCudaErrors(cudaMemsetAsync(d_source, 0, dataBase_.cell_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(d_internal_coeffs, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); @@ -478,6 +509,30 @@ void dfpEqn::process() { // calculate dpdt fvc_ddt_scalar_field(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, dataBase_.d_p, dataBase_.d_p_old, dataBase_.d_volume, dataBase_.d_dpdt, 1.); + +#ifdef STREAM_ALLOCATOR + // intermediate fields + checkCudaErrors(cudaFreeAsync(d_rhorAUf, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_phiHbyA, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_flux, dataBase_.stream)); + + // boundary coeffs + checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream)); + // intermediate boundary fields + checkCudaErrors(cudaFreeAsync(d_boundary_rhorAUf, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_phiHbyA, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_flux, dataBase_.stream)); + + // ldu and csr + checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream)); + 
checkCudaErrors(cudaFreeAsync(d_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); +#endif + TICK_END_EVENT(pEqn post process all); #ifdef USE_GRAPH @@ -491,14 +546,14 @@ void dfpEqn::process() { } void dfpEqn::postProcess() {} -void dfpEqn::getFlux() -{ - fvMtx_flux(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, - d_lower, d_upper, dataBase_.d_p, d_flux, - dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), - dataBase_.d_boundary_face_cell, d_internal_coeffs, d_boundary_coeffs, dataBase_.d_boundary_p, d_boundary_flux); - sync(); -} +//void dfpEqn::getFlux() +//{ +// fvMtx_flux(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, +// d_lower, d_upper, dataBase_.d_p, d_flux, +// dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), +// dataBase_.d_boundary_face_cell, d_internal_coeffs, d_boundary_coeffs, dataBase_.d_boundary_p, d_boundary_flux); +// sync(); +//} void dfpEqn::getrhorAUf(cudaStream_t stream, int num_cells, int num_surfaces, const int *lowerAddr, const int *upperAddr, From 7ef4ce7ee036ed6a906103fc5ca63af5bfa16b52 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Tue, 24 Oct 2023 20:40:57 +0800 Subject: [PATCH 5/6] reuse amgx solvers --- .../solvers/dfLowMachFoam/createGPUSolver.H | 15 ++++++++++- .../solvers/dfLowMachFoam/dfLowMachFoam.C | 4 ++- src_gpu/dfEEqn.cu | 13 +--------- src_gpu/dfMatrixDataBase.H | 14 +++++++++++ src_gpu/dfMatrixDataBase.cu | 19 ++++++++++++++ src_gpu/dfUEqn.cu | 21 +++------------- src_gpu/dfYEqn.cu | 18 +------------ src_gpu/dfpEqn.cu | 25 +------------------ 8 files changed, 56 insertions(+), 73 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/createGPUSolver.H b/applications/solvers/dfLowMachFoam/createGPUSolver.H index 
226b8f3f3..5369ef94a 100644 --- a/applications/solvers/dfLowMachFoam/createGPUSolver.H +++ b/applications/solvers/dfLowMachFoam/createGPUSolver.H @@ -99,7 +99,7 @@ void initNccl() { ncclInit(PstreamGlobals::MPI_COMM_FOAM, nccl_comm, nccl_id, &nRanks, &myRank, &localRank); } -void createGPUBase(fvMesh& mesh, PtrList& Y) { +void createGPUBase(const IOdictionary& CanteraTorchProperties, fvMesh& mesh, PtrList& Y) { // prepare constant values: num_cells, num_surfaces, num_boundary_surfaces, // num_patches, patch_size, num_species, rdelta_t const labelUList& owner = mesh.owner(); @@ -151,6 +151,7 @@ void createGPUBase(fvMesh& mesh, PtrList& Y) { double rDeltaT = 1 / 1e-6; dfDataBase.setConstantValues(num_cells, num_total_cells, num_surfaces, num_boundary_surfaces, num_patches, nProcValues, patch_size, Y.size(), rDeltaT); + // wyr: now there is no other place to prepare nccl info, thus mpi must be initialized at beginning. label flag_mpi_init; MPI_Initialized(&flag_mpi_init); @@ -169,6 +170,14 @@ void createGPUBase(fvMesh& mesh, PtrList& Y) { // prepare cuda resources dfDataBase.prepareCudaResources(); + // setup amgx solvers + string mode_string = "dDDI"; + string u_setting_path; + u_setting_path = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string("")); + string p_setting_path; + p_setting_path = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("pEqnSettingPath", string("")); + dfDataBase.setAmgxSolvers(mode_string, u_setting_path, p_setting_path); + // prepare constant indexes: owner, neighbor, procRows, procCols if (Pstream::parRun()) { @@ -330,6 +339,7 @@ void createGPURhoEqn(const volScalarField& rho, const surfaceScalarField& phi) { } void createGPUUEqn(const IOdictionary& CanteraTorchProperties, const volVectorField& U) { + // TODO need remove amgx solver setting // prepare mode_string and setting_path string mode_string = "dDDI"; string settingPath; @@ -381,6 +391,7 @@ void createGPUUEqn(const IOdictionary& 
CanteraTorchProperties, const volVectorFi void createGPUYEqn(const IOdictionary& CanteraTorchProperties, PtrList& Y, const int inertIndex) { DEBUG_TRACE; + // TODO need remove amgx solver setting // prepare mode_string and setting_path string mode_string = "dDDI"; string settingPath; @@ -430,6 +441,7 @@ void createGPUYEqn(const IOdictionary& CanteraTorchProperties, PtrListsetOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A); - } - else - { - ESolver->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A); - } - ESolver->solve(dataBase_.num_cells, dataBase_.d_he, d_source); + dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A, dataBase_.d_he, d_source); num_iteration++; } diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index ba3403ffe..bc04208ee 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -16,6 +16,9 @@ #include #include +#include "AmgXSolver.H" +#include + //#define DEBUG_ //#define DEBUG_CHECK_LDU @@ -59,6 +62,11 @@ inline void checkVectorEqual(int count, const double* basevec, double* vec, doub } } +enum AMGXSetting { + u_setting, + p_setting +}; + enum location { cpu, gpu @@ -139,6 +147,10 @@ struct dfMatrixDataBase int *d_csr_row_index= nullptr; int *d_csr_col_index= nullptr; + // amgx solvers + AmgXSolver *u_setting_solver = nullptr; + AmgXSolver *p_setting_solver = nullptr; + // constant fields - internal double *d_sf = nullptr; double *d_mag_sf = nullptr; @@ -270,6 +282,8 @@ struct dfMatrixDataBase std::vector patch_size, int num_species, double rdelta_t); void setConstantIndexes(const int *owner, const int *neighbor, const int *procRows, const int *procCols, int globalOffset); + void setAmgxSolvers(const std::string &mode_string, const std::string &u_setting_path, const std::string &p_setting_path); + void solve(int num_iteration, AMGXSetting setting, double *d_A, double *d_x, double *d_b); void 
createConstantFieldsInternal(); void createConstantFieldsBoundary(); diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index 180d90cff..f4badb935 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -135,6 +135,25 @@ void dfMatrixDataBase::setConstantValues(int num_cells, int num_total_cells, int csr_value_vec_bytes = num_Nz * 3 * sizeof(double); } +void dfMatrixDataBase::setAmgxSolvers(const std::string &mode_string, const std::string &u_setting_path, const std::string &p_setting_path) { + // amgx solvers + u_setting_solver = new AmgXSolver(mode_string, u_setting_path, localRank); + p_setting_solver = new AmgXSolver(mode_string, p_setting_path, localRank); +} + +void dfMatrixDataBase::solve(int num_iteration, AMGXSetting setting, double *d_A, double *d_x, double *d_b) { + AmgXSolver *solver = (setting == AMGXSetting::u_setting) ? u_setting_solver : p_setting_solver; + if (num_iteration == 0) // first interation + { + solver->setOperator(num_cells, num_total_cells, num_Nz, d_csr_row_index, d_csr_col_index, d_A); + } + else + { + solver->updateOperator(num_cells, num_Nz, d_A); + } + solver->solve(num_cells, d_x, d_b); +} + void dfMatrixDataBase::setConstantIndexes(const int *owner, const int *neighbor, const int *procRows, const int *procCols, int globalOffset) { // build d_owner, d_neighbor diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 56682491d..19f68c45b 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -565,24 +565,9 @@ void dfUEqn::sync() } void dfUEqn::solve() { - sync(); // TODO need remove - - if (num_iteration == 0) // first interation - { - printf("Initializing AmgX Linear Solver\n"); - UxSolver->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A); - UySolver->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A + 
dataBase_.num_Nz); - UzSolver->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A + 2 * dataBase_.num_Nz); - } - else - { - UxSolver->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A); - UySolver->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A + dataBase_.num_Nz); - UzSolver->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A + 2 * dataBase_.num_Nz); - } - UxSolver->solve(dataBase_.num_cells, dataBase_.d_u, d_b); - UySolver->solve(dataBase_.num_cells, dataBase_.d_u + dataBase_.num_cells, d_b + dataBase_.num_cells); - UzSolver->solve(dataBase_.num_cells, dataBase_.d_u + 2 * dataBase_.num_cells, d_b + 2 * dataBase_.num_cells); + dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A, dataBase_.d_u, d_b); + dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A + dataBase_.num_Nz, dataBase_.d_u + dataBase_.num_cells, d_b + dataBase_.num_cells); + dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A + 2 * dataBase_.num_Nz, dataBase_.d_u + 2 * dataBase_.num_cells, d_b + 2 * dataBase_.num_cells); num_iteration++; } diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu index b3eb211ba..f16446300 100644 --- a/src_gpu/dfYEqn.cu +++ b/src_gpu/dfYEqn.cu @@ -643,23 +643,7 @@ void dfYEqn::process() { void dfYEqn::solve(int solverIndex) { TICK_INIT_EVENT; TICK_START_EVENT; - if (num_iteration == 0) // first interation - { - fprintf(stderr, "Initializing AmgX Linear Solver\n"); - DEBUG_TRACE; - YSolverSet[solverIndex]->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A); - DEBUG_TRACE; - } - else - { - DEBUG_TRACE; - YSolverSet[solverIndex]->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A); - DEBUG_TRACE; - } - // use d_source as d_b - DEBUG_TRACE; - YSolverSet[solverIndex]->solve(dataBase_.num_cells, dataBase_.d_y + dataBase_.num_cells * solverIndex, d_source); - 
DEBUG_TRACE; + dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A, dataBase_.d_y + dataBase_.num_cells * solverIndex, d_source); TICK_END_EVENT(YEqn solve one specie); } diff --git a/src_gpu/dfpEqn.cu b/src_gpu/dfpEqn.cu index 3b48f55a5..bd0c8ac23 100644 --- a/src_gpu/dfpEqn.cu +++ b/src_gpu/dfpEqn.cu @@ -676,30 +676,7 @@ void dfpEqn::sync() void dfpEqn::solve() { - sync(); - - // double *h_A_csr = new double[dataBase_.num_Nz]; - // checkCudaErrors(cudaMemcpy(h_A_csr, d_A, dataBase_.csr_value_bytes, cudaMemcpyDeviceToHost)); - // for (int i = 0; i < dataBase_.num_Nz; i++) - // fprintf(stderr, "h_A_csr[%d]: %.10lf\n", i, h_A_csr[i]); - // delete[] h_A_csr; - - // double *h_b = new double[dataBase_.num_cells]; - // checkCudaErrors(cudaMemcpy(h_b, d_source, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost)); - // for (int i = 0; i < dataBase_.num_cells; i++) - // fprintf(stderr, "h_b[%d]: %.10lf\n", i, h_b[i]); - // delete[] h_b; - - if (num_iteration == 0) // first interation - { - printf("Initializing AmgX Linear Solver\n"); - pSolver->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A); - } - else - { - pSolver->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A); - } - pSolver->solve(dataBase_.num_cells, dataBase_.d_p, d_source); + dataBase_.solve(num_iteration, AMGXSetting::p_setting, d_A, dataBase_.d_p, d_source); num_iteration++; } From 01ac9fea1143a58cdbac655b1f3f913e3ffb4a61 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Wed, 25 Oct 2023 18:27:09 +0800 Subject: [PATCH 6/6] fix several bugs and align multi-card multi-step result --- applications/solvers/dfLowMachFoam/EEqn.H | 2 +- applications/solvers/dfLowMachFoam/UEqn.H | 2 +- .../solvers/dfLowMachFoam/dfLowMachFoam.C | 8 +++---- applications/solvers/dfLowMachFoam/pEqn_GPU.H | 2 +- src_gpu/AmgXSolver.cu | 6 ++--- src_gpu/dfEEqn.cu | 6 +++++ src_gpu/dfRhoEqn.cu | 2 +- src_gpu/dfUEqn.H | 2 +-
src_gpu/dfUEqn.cu | 23 ++++++++----------- 9 files changed, 27 insertions(+), 26 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H index 5554e7128..51277cd73 100644 --- a/applications/solvers/dfLowMachFoam/EEqn.H +++ b/applications/solvers/dfLowMachFoam/EEqn.H @@ -93,7 +93,7 @@ // EEqn_GPU.compareResult(&EEqn.lower()[0], &EEqn.upper()[0], &EEqn.diag()[0], &EEqn.source()[0], // h_internal_coeffs.data(), h_boundary_coeffs.data(), printFlag); // DEBUG_TRACE; - EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag); + //EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag); } delete h_boundary_he_tmp; diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H index 2a059161e..a6244c870 100644 --- a/applications/solvers/dfLowMachFoam/UEqn.H +++ b/applications/solvers/dfLowMachFoam/UEqn.H @@ -118,7 +118,7 @@ // h_internal_coeffs.data(), h_boundary_coeffs.data(), // // &gradU[0][0], h_boundary_gradU, // printFlag); - UEqn_GPU.compareU(&U[0][0], h_boundary_u_tmp, printFlag); + //UEqn_GPU.compareU(&U[0][0], h_boundary_u_tmp, printFlag); } DEBUG_TRACE; #endif diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C index fcbabae90..5f3c7aea1 100644 --- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C +++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C @@ -369,7 +369,7 @@ int main(int argc, char *argv[]) MPI_Comm_rank(MPI_COMM_WORLD, &rank); } if (!mpi_init_flag || rank == 0) { - // thermo_GPU.compareT(&T[0], h_boundary_T_tmp, printFlag); + thermo_GPU.compareT(&T[0], h_boundary_T_tmp, printFlag); // thermo_GPU.compareHe(&thermo.he()[0], h_boundary_he_tmp, printFlag); // thermo_GPU.comparePsi(&psi[0], h_boundary_thermo_psi_tmp, printFlag); // thermo_GPU.compareAlpha(&alpha[0], h_boundary_thermo_alpha_tmp, printFlag); @@ -470,11 +470,9 @@ int main(int argc, char *argv[]) #ifdef GPUSolverNew_ // write U 
for - double *h_U_tmp = new double[dfDataBase.num_cells * 3]; - UEqn_GPU.postProcess(h_U_tmp); - memcpy(&U[0][0], h_U_tmp, dfDataBase.cell_value_vec_bytes); + UEqn_GPU.postProcess(); + memcpy(&U[0][0], dfDataBase.h_u, dfDataBase.cell_value_vec_bytes); U.correctBoundaryConditions(); - delete h_U_tmp; #endif runTime.write(); diff --git a/applications/solvers/dfLowMachFoam/pEqn_GPU.H b/applications/solvers/dfLowMachFoam/pEqn_GPU.H index 82de66ad2..b982d8475 100644 --- a/applications/solvers/dfLowMachFoam/pEqn_GPU.H +++ b/applications/solvers/dfLowMachFoam/pEqn_GPU.H @@ -154,7 +154,7 @@ UEqn_GPU.sync(); } // pEqn_GPU.correctP(&p[0], h_boundary_p); if (!mpi_init_flag || rank == 0) { - pEqn_GPU.comparep(&p[0], h_boundary_p, false); + //pEqn_GPU.comparep(&p[0], h_boundary_p, false); } delete h_boundary_p; diff --git a/src_gpu/AmgXSolver.cu b/src_gpu/AmgXSolver.cu index c4321f868..8658a268b 100644 --- a/src_gpu/AmgXSolver.cu +++ b/src_gpu/AmgXSolver.cu @@ -54,9 +54,6 @@ void AmgXSolver::initialize(const std::string &modeStr, const std::string &cfgFi // get the mode of AmgX solver setMode(modeStr); - // initialize AmgX - initAmgX(cfgFile, devID); - // check if MPI has been initialized MPI_Initialized(&isMPIEnabled); if (isMPIEnabled) { @@ -64,6 +61,9 @@ void AmgXSolver::initialize(const std::string &modeStr, const std::string &cfgFi mpiWorld = MPI_COMM_WORLD; } + // initialize AmgX + initAmgX(cfgFile, devID); + // a bool indicating if this instance is initialized isInitialised = true; diff --git a/src_gpu/dfEEqn.cu b/src_gpu/dfEEqn.cu index a884a42e4..cb43f8c9a 100644 --- a/src_gpu/dfEEqn.cu +++ b/src_gpu/dfEEqn.cu @@ -220,6 +220,12 @@ void dfEEqn::process() { dataBase_.d_boundary_T, dataBase_.d_boundary_y, d_boundary_heGradient, &thermo_); TICK_END_EVENT(EEqn post process correctBC); + TICK_START_EVENT; + // copy he to host + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_he, dataBase_.d_he, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + 
checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_he, dataBase_.d_boundary_he, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + TICK_END_EVENT(EEqn post process copy back); + TICK_START_EVENT; #ifdef STREAM_ALLOCATOR // thermophysical fields diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu/dfRhoEqn.cu index d0942fe2b..57d32c9a4 100644 --- a/src_gpu/dfRhoEqn.cu +++ b/src_gpu/dfRhoEqn.cu @@ -87,7 +87,7 @@ void dfRhoEqn::process() TICK_END_EVENT(rhoEqn post process free); TICK_START_EVENT; checkCudaErrors(cudaMemcpyAsync(dataBase_.h_rho, dataBase_.d_rho, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - TICK_END_EVENT(rhoEqn post process copyback); + TICK_END_EVENT(rhoEqn post process copy back); sync(); } diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index 3b412ecb4..ef3c8133f 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -115,7 +115,7 @@ public: void preProcessForRhoEqn(const double *h_rho, const double *h_phi, const double *h_boundary_phi); void preProcess(const double *h_u, const double *h_boundary_u, const double *h_p, const double *h_boundary_p, const double *h_nu_eff, const double *h_boundary_nu_eff); void process(); - void postProcess(double *h_u); + void postProcess(); void A(double *Psi); void H(double *Psi); diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index c5336bdc2..131c566d8 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -336,8 +336,8 @@ void dfUEqn::createNonConstantFieldsInternal() { checkCudaErrors(cudaMalloc((void**)&d_grad_u, dataBase_.cell_value_tsr_bytes)); checkCudaErrors(cudaMalloc((void**)&d_rho_nueff, dataBase_.cell_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_fvc_output, dataBase_.cell_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_u_host_order, dataBase_.cell_value_vec_bytes)); #endif + checkCudaErrors(cudaMalloc((void**)&d_u_host_order, dataBase_.cell_value_vec_bytes)); // computed on CPU, used on GPU, need memcpyh2d 
checkCudaErrors(cudaMallocHost((void**)&h_nu_eff , dataBase_.cell_value_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_A_pEqn , dataBase_.cell_value_bytes)); @@ -359,9 +359,9 @@ void dfUEqn::createNonConstantFieldsBoundary() { checkCudaErrors(cudaMalloc((void**)&d_value_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes)); +#endif // intermediate boundary checkCudaErrors(cudaMalloc((void**)&d_boundary_u_host_order, dataBase_.boundary_surface_value_vec_bytes)); -#endif // computed on CPU, used on GPU, need memcpyh2d checkCudaErrors(cudaMallocHost((void**)&h_boundary_nu_eff, dataBase_.boundary_surface_value_bytes)); @@ -385,10 +385,6 @@ void dfUEqn::createNonConstantLduAndCsrFields() { checkCudaErrors(cudaMalloc((void**)&d_A_pEqn, dataBase_.cell_value_bytes)); // TODO: delete redundant variables checkCudaErrors(cudaMalloc((void**)&d_H_pEqn, dataBase_.cell_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_H_pEqn_perm, dataBase_.cell_value_vec_bytes)); - - checkCudaErrors(cudaMalloc((void**)&d_u_host_order, dataBase_.cell_value_vec_bytes)); - // intermediate boundary fields - checkCudaErrors(cudaMalloc((void**)&d_boundary_u_host_order, dataBase_.boundary_surface_value_vec_bytes)); } void dfUEqn::initNonConstantFieldsInternal(const double *u, const double *boundary_u) @@ -570,12 +566,6 @@ void dfUEqn::process() { dataBase_.d_boundary_u, dataBase_.d_boundary_k); TICK_END_EVENT(UEqn post process correctBC); - TICK_START_EVENT; - // copy u and boundary_u to host - permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, d_u_host_order); - checkCudaErrors(cudaMemcpyAsync(dataBase_.h_u, d_u_host_order, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - TICK_END_EVENT(UEqn post process copyback); - 
TICK_START_EVENT; #ifdef STREAM_ALLOCATOR // free @@ -624,7 +614,14 @@ void dfUEqn::solve() { num_iteration++; } -void dfUEqn::postProcess(double *h_u) {} +void dfUEqn::postProcess() { + // postProcess of dfUEqn can not be moved to the end of dfUEqn::process(), + // because dataBase_.d_u is modified in dfpEqn and we only need the result of the last change + // copy u and boundary_u to host + permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, d_u_host_order); + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_u, d_u_host_order, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + sync(); +} void dfUEqn::A(double *Psi) { fvMtx_A(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces,