deepmodeling · maorz1998 · Oct 25, 2023 · Oct 19, 2023 · Oct 23, 2023 · Oct 24, 2023
diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H
@@ -93,7 +93,7 @@
 		// EEqn_GPU.compareResult(&EEqn.lower()[0], &EEqn.upper()[0], &EEqn.diag()[0], &EEqn.source()[0],
 		// 		h_internal_coeffs.data(), h_boundary_coeffs.data(), printFlag);
 		// DEBUG_TRACE;
-        EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag);
+        //EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag);
     }
 
     delete h_boundary_he_tmp;

diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H
@@ -118,7 +118,7 @@
         //     h_internal_coeffs.data(), h_boundary_coeffs.data(), 
         //     // &gradU[0][0], h_boundary_gradU,
         //     printFlag);
-        UEqn_GPU.compareU(&U[0][0], h_boundary_u_tmp, printFlag);
+        //UEqn_GPU.compareU(&U[0][0], h_boundary_u_tmp, printFlag);
     }
     DEBUG_TRACE;
 #endif

diff --git a/applications/solvers/dfLowMachFoam/createGPUSolver.H b/applications/solvers/dfLowMachFoam/createGPUSolver.H
@@ -99,7 +99,7 @@ void initNccl() {
     ncclInit(PstreamGlobals::MPI_COMM_FOAM, nccl_comm, nccl_id, &nRanks, &myRank, &localRank, &mpi_init_flag);
 }
 
-void createGPUBase(fvMesh& mesh, PtrList<volScalarField>& Y) {
+void createGPUBase(const IOdictionary& CanteraTorchProperties, fvMesh& mesh, PtrList<volScalarField>& Y) {
     // prepare constant values: num_cells, num_surfaces, num_boundary_surfaces,
     // num_patches, patch_size, num_species, rdelta_t
     const labelUList& owner = mesh.owner();
@@ -152,6 +152,7 @@ void createGPUBase(fvMesh& mesh, PtrList<volScalarField>& Y) {
     double rDeltaT = 1 / 1e-6;
     dfDataBase.setConstantValues(num_cells, num_total_cells, num_surfaces, num_boundary_surfaces, 
             num_patches, nProcValues, patch_size, Y.size(), rDeltaT);
+
     // wyr: now there is no other place to prepare nccl info, thus mpi must be initialized at beginning.
     label flag_mpi_init;
     MPI_Initialized(&flag_mpi_init);
@@ -187,6 +188,14 @@ void createGPUBase(fvMesh& mesh, PtrList<volScalarField>& Y) {
     // prepare cuda resources
     dfDataBase.prepareCudaResources();
 
+    // setup amgx solvers
+    string mode_string = "dDDI";
+    string u_setting_path;
+    u_setting_path = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string(""));
+    string p_setting_path;
+    p_setting_path = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("pEqnSettingPath", string(""));
+    dfDataBase.setAmgxSolvers(mode_string, u_setting_path, p_setting_path);
+
     // prepare constant indexes: owner, neighbor, procRows, procCols
     if (Pstream::parRun())
     {
@@ -373,12 +382,14 @@ void createGPURhoEqn(const volScalarField& rho, const surfaceScalarField& phi) {
             offset += patchsize;
         }
     }
+    rhoEqn_GPU.setConstantValues();
     rhoEqn_GPU.setConstantFields(patch_type);
     rhoEqn_GPU.initNonConstantFields(h_rho, h_phi, h_boundary_rho, h_boundary_phi);
     rhoEqn_GPU.createNonConstantLduAndCsrFields();
 }
 
 void createGPUUEqn(const IOdictionary& CanteraTorchProperties, const volVectorField& U) {
+    // TODO need remove amgx solver setting
     // prepare mode_string and setting_path
     string mode_string = "dDDI";
     string settingPath;
@@ -431,6 +442,7 @@ void createGPUUEqn(const IOdictionary& CanteraTorchProperties, const volVectorFi
 
 void createGPUYEqn(const IOdictionary& CanteraTorchProperties, PtrList<volScalarField>& Y, const int inertIndex) {
     DEBUG_TRACE;
+    // TODO need remove amgx solver setting
     // prepare mode_string and setting_path
     string mode_string = "dDDI";
     string settingPath;
@@ -481,6 +493,7 @@ void createGPUYEqn(const IOdictionary& CanteraTorchProperties, PtrList<volScalar
 
 void createGPUEEqn(const IOdictionary& CanteraTorchProperties, volScalarField& he, volScalarField& K) {
     DEBUG_TRACE;
+    // TODO need remove amgx solver setting
     // prepare mode_string and setting_path
     string mode_string = "dDDI";
     string settingPath;
@@ -527,6 +540,7 @@ void createGPUEEqn(const IOdictionary& CanteraTorchProperties, volScalarField& h
 
 void createGPUpEqn(const IOdictionary& CanteraTorchProperties, volScalarField& p, const volVectorField& U) {
     DEBUG_TRACE;
+    // TODO need remove amgx solver setting
     // prepare mode_string and setting_path
     string mode_string = "dDDI";
     string settingPath;

diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
@@ -203,7 +203,7 @@ int main(int argc, char *argv[])
     if(mpi_init_flag) {
         initNccl();
     }
-    createGPUBase(mesh, Y);
+    createGPUBase(CanteraTorchProperties, mesh, Y);
     DEBUG_TRACE;
 #endif
 
@@ -369,7 +369,7 @@ int main(int argc, char *argv[])
                     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
                 }
                 if (!mpi_init_flag || rank == 0) {
-                    // thermo_GPU.compareT(&T[0], h_boundary_T_tmp, printFlag);
+                    thermo_GPU.compareT(&T[0], h_boundary_T_tmp, printFlag);
                     // thermo_GPU.compareHe(&thermo.he()[0], h_boundary_he_tmp, printFlag);
                     // thermo_GPU.comparePsi(&psi[0], h_boundary_thermo_psi_tmp, printFlag);
                     // thermo_GPU.compareAlpha(&alpha[0], h_boundary_thermo_alpha_tmp, printFlag);
@@ -452,6 +452,8 @@ int main(int argc, char *argv[])
             }
             end = std::clock();
             time_monitor_turbulence_correct += double(end - start) / double(CLOCKS_PER_SEC);
+            //fprintf(stderr, "sleep for 5s...\n");
+            //usleep(5 * 1000 * 1000);
         }
         clock_t loop_end = std::clock();
         double loop_time = double(loop_end - loop_start) / double(CLOCKS_PER_SEC);
@@ -468,11 +470,9 @@ int main(int argc, char *argv[])
 
 #ifdef GPUSolverNew_
         // write U for
-        double *h_U_tmp = new double[dfDataBase.num_cells * 3];
-        UEqn_GPU.postProcess(h_U_tmp);
-        memcpy(&U[0][0], h_U_tmp, dfDataBase.cell_value_vec_bytes);
+        UEqn_GPU.postProcess();
+        memcpy(&U[0][0], dfDataBase.h_u, dfDataBase.cell_value_vec_bytes);
         U.correctBoundaryConditions();
-        delete h_U_tmp;
 #endif
 
         runTime.write();

diff --git a/applications/solvers/dfLowMachFoam/pEqn_GPU.H b/applications/solvers/dfLowMachFoam/pEqn_GPU.H
@@ -154,7 +154,7 @@ UEqn_GPU.sync();
     }
     // pEqn_GPU.correctP(&p[0], h_boundary_p);
     if (!mpi_init_flag || rank == 0) {
-        pEqn_GPU.comparep(&p[0], h_boundary_p, false);
+        //pEqn_GPU.comparep(&p[0], h_boundary_p, false);
     }
     delete h_boundary_p;
 

diff --git a/src_gpu/AmgXSolver.cu b/src_gpu/AmgXSolver.cu
@@ -54,16 +54,16 @@ void AmgXSolver::initialize(const std::string &modeStr, const std::string &cfgFi
     // get the mode of AmgX solver
     setMode(modeStr);  
 
-    // initialize AmgX
-    initAmgX(cfgFile, devID);  
-
     // check if MPI has been initialized
     MPI_Initialized(&isMPIEnabled);
     if (isMPIEnabled) {
         MPI_Comm_size(MPI_COMM_WORLD, &mpiSize);
         mpiWorld = MPI_COMM_WORLD;
     }
 
+    // initialize AmgX
+    initAmgX(cfgFile, devID);
+
     // a bool indicating if this instance is initialized
     isInitialised = true;
 

diff --git a/src_gpu/dfEEqn.H b/src_gpu/dfEEqn.H
@@ -12,10 +12,14 @@ class dfEEqn
     dfThermo &thermo_;
 
     // cuda resource
+    cudaStream_t stream;
+#ifdef USE_GRAPH
     // one graph for one eqn before using self-developed solver
     cudaGraph_t graph_pre, graph_post;
     cudaGraphExec_t graph_instance_pre, graph_instance_post;
-    bool graph_created=false;
+    bool pre_graph_created=false;
+    bool post_graph_created=false;
+#endif
 
 	// constant values -- basic
 	std::string mode_string;

diff --git a/src_gpu/dfEEqn.cu b/src_gpu/dfEEqn.cu
@@ -21,6 +21,7 @@ double* dfEEqn::getFieldPointer(const char* fieldAlias, location loc, position p
 }
 
 void dfEEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path) {
+    this->stream = dataBase_.stream;
     this->mode_string = mode_string;
     this->setting_path = setting_path;
     ESolver = new AmgXSolver(mode_string, setting_path, dataBase_.localRank);
@@ -88,28 +89,27 @@ void dfEEqn::initNonConstantFields(const double *he, const double *boundary_he)
 }
 
 void dfEEqn::cleanCudaResources() {
-    if (graph_created) {
+#ifdef USE_GRAPH
+    if (pre_graph_created) {
         checkCudaErrors(cudaGraphExecDestroy(graph_instance_pre));
-        checkCudaErrors(cudaGraphExecDestroy(graph_instance_post));
         checkCudaErrors(cudaGraphDestroy(graph_pre));
+    }
+    if (post_graph_created) {
+        checkCudaErrors(cudaGraphExecDestroy(graph_instance_post));
         checkCudaErrors(cudaGraphDestroy(graph_post));
     }
+#endif
 }
 
 void dfEEqn::preProcess(const double *h_he, const double *h_k, const double *h_k_old, const double *h_dpdt, const double *h_boundary_k, const double *h_boundary_heGradient)
 {
 }
 
 void dfEEqn::process() {
-    //使用event计算时间
-    float time_elapsed=0;
-    cudaEvent_t start,stop;
-    checkCudaErrors(cudaEventCreate(&start));
-    checkCudaErrors(cudaEventCreate(&stop));
-    checkCudaErrors(cudaEventRecord(start,0));
-
-#ifndef TIME_GPU
-    if(!graph_created) {
+    TICK_INIT_EVENT;
+    TICK_START_EVENT;
+#ifdef USE_GRAPH
+    if(!pre_graph_created) {
         DEBUG_TRACE;
         checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal));
 #endif
@@ -125,7 +125,7 @@ void dfEEqn::process() {
     checkCudaErrors(cudaMallocAsync((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
     checkCudaErrors(cudaMallocAsync((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
 
-    checkCudaErrors(cudaMallocAsync((void**)&d_boundary_heGradient, dataBase_.num_gradientEnergy_boundary_surfaces, dataBase_.stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_boundary_heGradient, sizeof(double) * num_gradientEnergy_boundary_surfaces, dataBase_.stream));
 
     checkCudaErrors(cudaMallocAsync((void**)&d_source, dataBase_.cell_value_bytes, dataBase_.stream));
     checkCudaErrors(cudaMallocAsync((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
@@ -190,58 +190,71 @@ void dfEEqn::process() {
             dataBase_.num_Nz, dataBase_.d_boundary_face_cell, dataBase_.d_ldu_to_csr_index, dataBase_.num_patches,
             dataBase_.patch_size.data(), patch_type_he.data(), d_ldu, d_source, d_internal_coeffs, d_boundary_coeffs, d_A);
 #endif
-#ifndef TIME_GPU
+#ifdef USE_GRAPH
         checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_pre));
         checkCudaErrors(cudaGraphInstantiate(&graph_instance_pre, graph_pre, NULL, NULL, 0));
+        pre_graph_created = true;
     }
     DEBUG_TRACE;
     checkCudaErrors(cudaGraphLaunch(graph_instance_pre, dataBase_.stream));
 #endif
-    checkCudaErrors(cudaEventRecord(stop,0));
-    checkCudaErrors(cudaEventSynchronize(start));
-    checkCudaErrors(cudaEventSynchronize(stop));
-    checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop));
-    if(!mpi_init_flag || myRank == 0)
-    fprintf(stderr, "eeqn assembly time:%f(ms)\n",time_elapsed);
-
-    checkCudaErrors(cudaEventRecord(start,0));
+    TICK_END_EVENT(EEqn assembly);
+
+    TICK_START_EVENT;
 #ifndef DEBUG_CHECK_LDU
     solve();
 #endif
-    checkCudaErrors(cudaEventRecord(stop,0));
-    checkCudaErrors(cudaEventSynchronize(start));
-    checkCudaErrors(cudaEventSynchronize(stop));
-    checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop));
-    if(!mpi_init_flag || myRank == 0)
-    fprintf(stderr, "eeqn solve time:%f(ms)\n",time_elapsed);
-
-    checkCudaErrors(cudaEventRecord(start,0));
-#ifndef TIME_GPU
-    if(!graph_created) {
+    TICK_END_EVENT(EEqn solve);
+
+#ifdef USE_GRAPH
+    if(!post_graph_created) {
         checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal));
 #endif
 
+        TICK_START_EVENT;
         correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(),
                 dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(),
                 patch_type_he.data(), dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_face_cell,
                 dataBase_.d_he, dataBase_.d_boundary_he, dataBase_.cyclicNeighbor.data(), 
                 dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_weight,
                 dataBase_.d_boundary_T, dataBase_.d_boundary_y, d_boundary_heGradient, &thermo_);
+        TICK_END_EVENT(EEqn post process correctBC);
+
+        TICK_START_EVENT;
+        // copy he to host
+        checkCudaErrors(cudaMemcpyAsync(dataBase_.h_he, dataBase_.d_he, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream));
+        checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_he, dataBase_.d_boundary_he, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream));
+        TICK_END_EVENT(EEqn post process copy back);
 
-#ifndef TIME_GPU
+        TICK_START_EVENT;
+#ifdef STREAM_ALLOCATOR
+        // thermophysical fields
+        checkCudaErrors(cudaFreeAsync(d_dpdt, dataBase_.stream));
+        // fiv weight fieldsFree
+        //checkCudaErrors(cudaFreeAsync(d_phi_special_weight, dataBase_.stream));
+        // boundary coeffs
+        checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream));
+
+        checkCudaErrors(cudaFreeAsync(d_boundary_heGradient, dataBase_.stream));
+
+        checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_internal_coeffs, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_boundary_coeffs, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream));
+#endif
+        TICK_END_EVENT(EEqn post process free);
+#ifdef USE_GRAPH
         checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_post));
         checkCudaErrors(cudaGraphInstantiate(&graph_instance_post, graph_post, NULL, NULL, 0));
-        graph_created = true;
+        post_graph_created = true;
     }
     checkCudaErrors(cudaGraphLaunch(graph_instance_post, dataBase_.stream));
 #endif
-
-    checkCudaErrors(cudaEventRecord(stop,0));
-    checkCudaErrors(cudaEventSynchronize(start));
-    checkCudaErrors(cudaEventSynchronize(stop));
-    checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop));
-    if(!mpi_init_flag || myRank == 0)
-    fprintf(stderr, "eeqn post process time: %f(ms)\n\n",time_elapsed);
+    sync();
 }
 
 void dfEEqn::eeqn_calculate_energy_gradient(dfThermo& GPUThermo, int num_cells, int num_species, 
@@ -252,6 +265,7 @@ void dfEEqn::eeqn_calculate_energy_gradient(dfThermo& GPUThermo, int num_cells,
 {
     int bou_offset = 0, gradient_offset = 0;
     for (int i = 0; i < num_patches; i++) {
+        if (patch_size[i] == 0) continue;
         if (patch_type[i] == boundaryConditions::gradientEnergy) {
             GPUThermo.calculateEnergyGradient(patch_size[i], num_cells, num_species, num_boundary_surfaces, bou_offset, gradient_offset,
                     face2Cells, T, p, y, boundary_delta_coeffs, boundary_p, boundary_y, boundary_thermo_gradient);
@@ -336,45 +350,8 @@ void dfEEqn::sync()
 
 void dfEEqn::solve()
 {
-    sync();
-
-    if (num_iteration == 0)                                     // first interation
-    {
-        printf("Initializing AmgX Linear Solver\n");
-        ESolver->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A);
-    }
-    else
-    {
-        ESolver->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A);
-    }
-    ESolver->solve(dataBase_.num_cells, dataBase_.d_he, d_source);
+    dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A, dataBase_.d_he, d_source);
     num_iteration++;
 }
 
-void dfEEqn::postProcess(double *h_he, double *h_boundary_he)
-{
-#ifdef STREAM_ALLOCATOR
-    // thermophysical fields
-    checkCudaErrors(cudaFreeAsync(d_dpdt, dataBase_.stream));
-    // fiv weight fieldsFree
-    //checkCudaErrors(cudaFreeAsync(d_phi_special_weight, dataBase_.stream));
-    // boundary coeffs
-    checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream));
-    checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream));
-    checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream));
-    checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream));
-
-    checkCudaErrors(cudaFreeAsync(d_boundary_heGradient, dataBase_.stream));
-
-    checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream));
-    checkCudaErrors(cudaFreeAsync(d_internal_coeffs, dataBase_.stream));
-    checkCudaErrors(cudaFreeAsync(d_boundary_coeffs, dataBase_.stream));
-    checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream));
-    checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream));
-#endif
-
-    // copy he to host
-    checkCudaErrors(cudaMemcpyAsync(h_he, dataBase_.d_he, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream));
-    checkCudaErrors(cudaMemcpyAsync(h_boundary_he, dataBase_.d_boundary_he, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream));
-    sync();
-}
+void dfEEqn::postProcess(double *h_he, double *h_boundary_he) {}