From 29c6059855b82fcb81fbb60ec3adfe2b6ea8da53 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Thu, 19 Oct 2023 18:17:50 +0800 Subject: [PATCH 1/6] refactor for timing --- .../solvers/dfLowMachFoam/createGPUSolver.H | 1 + src_gpu/dfEEqn.H | 4 +- src_gpu/dfEEqn.cu | 110 ++++++------- src_gpu/dfMatrixDataBase.H | 7 +- src_gpu/dfMatrixOpBase.H | 59 ++++++- src_gpu/dfMatrixOpBase.cu | 148 +----------------- src_gpu/dfRhoEqn.H | 2 + src_gpu/dfRhoEqn.cu | 42 +++-- src_gpu/dfThermo.H | 1 + src_gpu/dfThermo.cu | 38 ----- src_gpu/dfUEqn.H | 4 +- src_gpu/dfUEqn.cu | 120 +++++++------- src_gpu/dfYEqn.H | 1 + src_gpu/dfYEqn.cu | 78 +++++---- src_gpu/dfpEqn.H | 4 +- src_gpu/dfpEqn.cu | 119 +++++++------- 16 files changed, 288 insertions(+), 450 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/createGPUSolver.H b/applications/solvers/dfLowMachFoam/createGPUSolver.H index 272d23c20..226b8f3f3 100644 --- a/applications/solvers/dfLowMachFoam/createGPUSolver.H +++ b/applications/solvers/dfLowMachFoam/createGPUSolver.H @@ -323,6 +323,7 @@ void createGPURhoEqn(const volScalarField& rho, const surfaceScalarField& phi) { offset += patchsize; } } + rhoEqn_GPU.setConstantValues(); rhoEqn_GPU.setConstantFields(patch_type); rhoEqn_GPU.initNonConstantFields(h_rho, h_phi, h_boundary_rho, h_boundary_phi); rhoEqn_GPU.createNonConstantLduAndCsrFields(); diff --git a/src_gpu/dfEEqn.H b/src_gpu/dfEEqn.H index f19f3a600..63616b414 100644 --- a/src_gpu/dfEEqn.H +++ b/src_gpu/dfEEqn.H @@ -12,10 +12,12 @@ class dfEEqn dfThermo &thermo_; // cuda resource + cudaStream_t stream; // one graph for one eqn before using self-developed solver cudaGraph_t graph_pre, graph_post; cudaGraphExec_t graph_instance_pre, graph_instance_post; - bool graph_created=false; + bool pre_graph_created=false; + bool post_graph_created=false; // constant values -- basic std::string mode_string; diff --git a/src_gpu/dfEEqn.cu b/src_gpu/dfEEqn.cu index 91a4facd4..b9f92d526 100644 --- a/src_gpu/dfEEqn.cu +++ 
b/src_gpu/dfEEqn.cu @@ -21,6 +21,7 @@ double* dfEEqn::getFieldPointer(const char* fieldAlias, location loc, position p } void dfEEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path) { + this->stream = dataBase_.stream; this->mode_string = mode_string; this->setting_path = setting_path; ESolver = new AmgXSolver(mode_string, setting_path, dataBase_.localRank); @@ -88,10 +89,12 @@ void dfEEqn::initNonConstantFields(const double *he, const double *boundary_he) } void dfEEqn::cleanCudaResources() { - if (graph_created) { + if (pre_graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance_pre)); - checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_pre)); + } + if (post_graph_created) { + checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_post)); } } @@ -101,15 +104,10 @@ void dfEEqn::preProcess(const double *h_he, const double *h_k, const double *h_k } void dfEEqn::process() { - //使用event计算时间 - float time_elapsed=0; - cudaEvent_t start,stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - checkCudaErrors(cudaEventRecord(start,0)); - -#ifndef TIME_GPU - if(!graph_created) { + TICK_INIT_EVENT; + TICK_START_EVENT; +#ifdef USE_GRAPH + if(!pre_graph_created) { DEBUG_TRACE; checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif @@ -190,32 +188,25 @@ void dfEEqn::process() { dataBase_.num_Nz, dataBase_.d_boundary_face_cell, dataBase_.d_ldu_to_csr_index, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_he.data(), d_ldu, d_source, d_internal_coeffs, d_boundary_coeffs, d_A); #endif -#ifndef TIME_GPU +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_pre)); checkCudaErrors(cudaGraphInstantiate(&graph_instance_pre, graph_pre, NULL, NULL, 0)); + pre_graph_created = true; } DEBUG_TRACE; 
checkCudaErrors(cudaGraphLaunch(graph_instance_pre, dataBase_.stream)); #endif - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - fprintf(stderr, "eeqn assembly time:%f(ms)\n",time_elapsed); + TICK_END_EVENT(EEqn assembly); - checkCudaErrors(cudaEventRecord(start,0)); + TICK_START_EVENT; #ifndef DEBUG_CHECK_LDU solve(); #endif - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - fprintf(stderr, "eeqn solve time:%f(ms)\n",time_elapsed); - - checkCudaErrors(cudaEventRecord(start,0)); -#ifndef TIME_GPU - if(!graph_created) { + TICK_END_EVENT(EEqn solve); + + TICK_START_EVENT; +#ifdef USE_GRAPH + if(!post_graph_created) { checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif @@ -224,19 +215,38 @@ void dfEEqn::process() { patch_type_he.data(), dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_face_cell, dataBase_.d_he, dataBase_.d_boundary_he, d_boundary_heGradient); -#ifndef TIME_GPU + // copy he and boundary_he to host + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_he, dataBase_.d_he, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_he, dataBase_.d_boundary_he, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + +#ifdef STREAM_ALLOCATOR + // thermophysical fields + checkCudaErrors(cudaFreeAsync(d_dpdt, dataBase_.stream)); + // fiv weight fieldsFree + //checkCudaErrors(cudaFreeAsync(d_phi_special_weight, dataBase_.stream)); + // boundary coeffs + checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream)); + 
checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream)); + + checkCudaErrors(cudaFreeAsync(d_boundary_heGradient, dataBase_.stream)); + + checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream)); +#endif +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_post)); checkCudaErrors(cudaGraphInstantiate(&graph_instance_post, graph_post, NULL, NULL, 0)); - graph_created = true; + post_graph_created = true; } checkCudaErrors(cudaGraphLaunch(graph_instance_post, dataBase_.stream)); #endif - - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - fprintf(stderr, "eeqn post process time: %f(ms)\n\n",time_elapsed); + TICK_END_EVENT(EEqn post process); + sync(); } void dfEEqn::eeqn_calculate_energy_gradient(dfThermo& GPUThermo, int num_cells, int num_species, @@ -330,7 +340,7 @@ void dfEEqn::sync() void dfEEqn::solve() { - sync(); + sync(); // TODO can be removed if (num_iteration == 0) // first interation { @@ -345,30 +355,4 @@ void dfEEqn::solve() num_iteration++; } -void dfEEqn::postProcess(double *h_he, double *h_boundary_he) -{ -#ifdef STREAM_ALLOCATOR - // thermophysical fields - checkCudaErrors(cudaFreeAsync(d_dpdt, dataBase_.stream)); - // fiv weight fieldsFree - //checkCudaErrors(cudaFreeAsync(d_phi_special_weight, dataBase_.stream)); - // boundary coeffs - checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream)); - 
checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream)); - - checkCudaErrors(cudaFreeAsync(d_boundary_heGradient, dataBase_.stream)); - - checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_internal_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_boundary_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream)); -#endif - - // copy he to host - checkCudaErrors(cudaMemcpyAsync(h_he, dataBase_.d_he, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - checkCudaErrors(cudaMemcpyAsync(h_boundary_he, dataBase_.d_boundary_he, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - sync(); -} +void dfEEqn::postProcess(double *h_he, double *h_boundary_he) {} diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 574305840..def2f4a89 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -16,11 +16,12 @@ #include #include -extern int myRank; +#define DEBUG_ +#define DEBUG_CHECK_LDU -// #define GPU_DEBUG_ +extern int myRank; -#ifdef GPU_DEBUG_ +#ifdef DEBUG_ #define DEBUG_TRACE fprintf(stderr, "myRank[%d] %s %d\n", myRank, __FILE__, __LINE__); #else #define DEBUG_TRACE diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index 405fd28be..195e86225 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -1,8 +1,62 @@ #pragma once #include #include -// #define TIME_GPU -// #define DEBUG_CHECK_LDU + +#ifndef USE_GRAPH + #define USE_GRAPH +#endif + +#ifndef TIME_GPU + #define TIME_GPU + #if (defined TIME_GPU) && (defined USE_GRAPH) + #undef USE_GRAPH + #endif +#endif + +extern int myRank; +extern __global__ void warmup(); + +#ifdef TIME_GPU + #define WARM_UP \ + warmup<<<10, 1024, 0, stream>>>(); + + #define TICK_INIT_EVENT 
\ + float time_elapsed_kernel=0;\ + cudaEvent_t start_kernel, stop_kernel;\ + checkCudaErrors(cudaEventCreate(&start_kernel));\ + checkCudaErrors(cudaEventCreate(&stop_kernel)); + + #define TICK_START_EVENT \ + checkCudaErrors(cudaEventRecord(start_kernel,stream)); + + #define TICK_END_EVENT(prefix) \ + checkCudaErrors(cudaEventRecord(stop_kernel,stream));\ + checkCudaErrors(cudaEventSynchronize(start_kernel));\ + checkCudaErrors(cudaEventSynchronize(stop_kernel));\ + checkCudaErrors(cudaEventElapsedTime(&time_elapsed_kernel,start_kernel,stop_kernel));\ + fprintf(stderr, "rank[%d], name: %s, time: %lf(ms)\n", myRank, #prefix, time_elapsed_kernel); + +/* + // the usage description: + // if you want to profile the first kernel, please use WARM_UP before TICK_INIT_EVENT. + // otherwise there is no need to use WARM_UP + WARM_UP; + // init event + TICK_INIT_EVENT; + // start event + TICK_START_EVENT; + // call your kernel, or kernels, or wrapper functions, e.g.: + my_kernel<<>>(num, input, output); + // end event with your specified name string, e.g.: + TICK_END_EVENT(my_kernel); +*/ + +#else + #define WARM_UP + #define TICK_INIT_EVENT + #define TICK_START_EVENT + #define TICK_END_EVENT(prefix) +#endif // tools void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output); @@ -247,3 +301,4 @@ void fvMtx_flux(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces void solve_explicit_scalar(cudaStream_t stream, int num_cells, const double *diag, const double *source, double *psi); + diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index bf2f3be7e..9d9d33566 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -5,32 +5,10 @@ #include #include "cuda_profiler_api.h" -#ifdef TIME_GPU - #define TICK_INIT_EVENT \ - float time_elapsed_kernel=0;\ - cudaEvent_t start_kernel, stop_kernel;\ - checkCudaErrors(cudaEventCreate(&start_kernel));\ - checkCudaErrors(cudaEventCreate(&stop_kernel)); - - 
#define TICK_START_EVENT \ - checkCudaErrors(cudaEventRecord(start_kernel,0)); - - #define TICK_END_EVENT(prefix) \ - checkCudaErrors(cudaEventRecord(stop_kernel,0));\ - checkCudaErrors(cudaEventSynchronize(start_kernel));\ - checkCudaErrors(cudaEventSynchronize(stop_kernel));\ - checkCudaErrors(cudaEventElapsedTime(&time_elapsed_kernel,start_kernel,stop_kernel));\ - printf("try %s 执行时间:%lf(ms)\n", #prefix, time_elapsed_kernel); -#else - #define TICK_INIT_EVENT - #define TICK_START_EVENT - #define TICK_END_EVENT(prefix) -#endif - -__global__ void warmup(int num_cells) +__global__ void warmup() { int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) + if (index >= 10240) return; } @@ -1951,13 +1929,10 @@ void field_add_scalar(cudaStream_t stream, int num, const double *input1, const double *input2, double *output, int num_boundary_surfaces, const double *boundary_input1, const double *boundary_input2, double *boundary_output) { - TICK_INIT_EVENT; size_t threads_per_block = 256; size_t blocks_per_grid = (std::max(num, num_boundary_surfaces) + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; field_add_scalar_kernel<<>>(num, num_boundary_surfaces, input1, input2, output, boundary_input1, boundary_input2, boundary_output); - TICK_END_EVENT(field_add_scalar_kernel); } void field_add_vector(cudaStream_t stream, @@ -1966,7 +1941,6 @@ void field_add_vector(cudaStream_t stream, { size_t threads_per_block = 256; size_t blocks_per_grid = (std::max(num_cells, num_boundary_surfaces) + threads_per_block - 1) / threads_per_block; - field_add_vector_kernel<<>>(num_cells, num_boundary_surfaces, input1, input2, output, boundary_input1, boundary_input2, boundary_output, sign); } @@ -1984,13 +1958,10 @@ void field_multiply_scalar(cudaStream_t stream, int num_cells, const double *input1, const double *input2, double *output, int num_boundary_surfaces, const double *boundary_input1, const double *boundary_input2, double *boundary_output) { - 
TICK_INIT_EVENT; size_t threads_per_block = 256; size_t blocks_per_grid = (std::max(num_cells, num_boundary_surfaces) + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; field_multiply_scalar_kernel<<>>(num_cells, num_boundary_surfaces, input1, input2, output, boundary_input1, boundary_input2, boundary_output); - TICK_END_EVENT(field_multiply_scalar_kernel); } void vector_half_mag_square(cudaStream_t stream, int num_cells, const double *vec_input, double *scalar_output, @@ -2034,13 +2005,10 @@ void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volu void fvc_to_source_scalar(cudaStream_t stream, int num_cells, const double *volume, const double *fvc_output, double *source, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 256; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_to_source_scalar_kernel<<>>(num_cells, volume, fvc_output, source, sign); - TICK_END_EVENT(fvc_to_source_scalar_kernel); } void ldu_to_csr_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surface, int num_Nz, @@ -2094,21 +2062,16 @@ void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int num_bo const int* boundary_cell_face, const int *ldu_to_csr_index, const int *diag_to_csr_index, const double *ldu, const double *internal_coeffs, const double *boundary_coeffs, double *source, double *A) { - TICK_INIT_EVENT; // construct diag int nNz = num_cells + 2 * num_surfaces; size_t threads_per_block = 1024; size_t blocks_per_grid = (nNz + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; ldu_to_csr_kernel<<>>(nNz, ldu_to_csr_index, ldu, A); - TICK_END_EVENT(ldu_to_csr_kernel); // add coeff to source and diagnal blocks_per_grid = (num_boundary_surface + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; addBoundaryDiagSrc<<>>(num_cells, num_surfaces, num_boundary_surface, boundary_cell_face, internal_coeffs, boundary_coeffs, 
diag_to_csr_index, A, source); - TICK_END_EVENT(addBoundaryDiagSrc); } void update_boundary_coeffs_scalar(cudaStream_t stream, @@ -2264,50 +2227,30 @@ void fvm_ddt_vol_scalar_vol_scalar(cudaStream_t stream, int num_cells, double rD const double *rho, const double *rho_old, const double *vf, const double *volume, double *diag, double *source, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvm_ddt_vol_scalar_vol_scalar_kernel<<>>(num_cells, rDeltaT, rho, rho_old, vf, volume, diag, source, sign); - TICK_END_EVENT(fvm_ddt_vol_scalar_vol_scalar_kernel); } void fvm_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT, const double *vf_old, const double *volume, double *diag, double *source, double sign) { -#ifdef TIME_GPU - printf("#############kernel profile#############\n"); -#endif - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; -#ifdef TIME_GPU - printf("warm up ...\n"); - warmup<<>>(num_cells); -#endif - TICK_START_EVENT; fvm_ddt_scalar_kernel<<>>(num_cells, rDeltaT, vf_old, volume, diag, source, sign); - TICK_END_EVENT(fvm_ddt_scalar_kernel); } void fvm_ddt_vector(cudaStream_t stream, int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *volume, double *diag, double *source, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 64; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; -#ifdef TIME_GPU - printf("warm up ...\n"); - warmup<<>>(num_cells); -#endif - TICK_START_EVENT; fvm_ddt_vector_kernel<<>>(num_cells, rDeltaT, rho, rho_old, vf, volume, diag, source, sign); - TICK_END_EVENT(fvm_ddt_vector_kernel); } void fvm_div_scalar(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, @@ -2317,13 +2260,10 @@ void 
fvm_div_scalar(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvm_div_scalar_internal<<>>(num_surfaces, lowerAddr, upperAddr, phi, weight, lower, upper, diag, sign); - TICK_END_EVENT(fvm_div_scalar_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -2334,11 +2274,9 @@ void fvm_div_scalar(cudaStream_t stream, int num_surfaces, const int *lowerAddr, || patch_type[i] == boundaryConditions::fixedValue || patch_type[i] == boundaryConditions::gradientEnergy) { // TODO: just vector version now - TICK_START_EVENT; fvm_div_scalar_boundary<<>>(patch_size[i], offset, boundary_phi, value_internal_coeffs, value_boundary_coeffs, internal_coeffs, boundary_coeffs, sign); - TICK_END_EVENT(fvm_div_scalar_boundary); } else if (patch_type[i] == boundaryConditions::processor) { fvm_div_scalar_boundary<<>>(patch_size[i], offset, boundary_phi, value_internal_coeffs, value_boundary_coeffs, @@ -2361,17 +2299,10 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surf const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 256; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; -#ifdef TIME_GPU - printf("warm up ...\n"); - warmup<<>>(num_surfaces); -#endif - TICK_START_EVENT; fvm_div_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, phi, weight, lower, upper, diag, sign); - TICK_END_EVENT(fvm_div_vector_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -2407,13 +2338,10 @@ void 
fvm_laplacian_scalar(cudaStream_t stream, int num_surfaces, int num_boundar const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvm_laplacian_internal<<>>(num_surfaces, lowerAddr, upperAddr, weight, mag_sf, delta_coeffs, gamma, lower, upper, diag, sign); - TICK_END_EVENT(fvm_laplacian_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { threads_per_block = 256; @@ -2423,11 +2351,9 @@ void fvm_laplacian_scalar(cudaStream_t stream, int num_surfaces, int num_boundar || patch_type[i] == boundaryConditions::fixedValue || patch_type[i] == boundaryConditions::gradientEnergy) { // TODO: just vector version now - TICK_START_EVENT; fvm_laplacian_scalar_boundary<<>>(patch_size[i], offset, boundary_mag_sf, boundary_gamma, gradient_internal_coeffs, gradient_boundary_coeffs, internal_coeffs, boundary_coeffs, sign); - TICK_END_EVENT(fvm_laplacian_scalar_boundary); } else if (patch_type[i] == boundaryConditions::processor) { fvm_laplacian_scalar_boundary<<>>(patch_size[i], offset, boundary_mag_sf, boundary_gamma, gradient_internal_coeffs, gradient_boundary_coeffs, @@ -2451,13 +2377,10 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundar const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvm_laplacian_internal<<>>(num_surfaces, lowerAddr, upperAddr, weight, mag_sf, delta_coeffs, gamma, lower, upper, diag, sign); - TICK_END_EVENT(fvm_laplacian_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { threads_per_block = 256; @@ -2466,11 
+2389,9 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundar if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { // TODO: just vector version now - TICK_START_EVENT; fvm_laplacian_vector_boundary<<>>(num_boundary_surfaces, patch_size[i], offset, boundary_mag_sf, boundary_gamma, gradient_internal_coeffs, gradient_boundary_coeffs, internal_coeffs, boundary_coeffs, sign); - TICK_END_EVENT(fvm_laplacian_vector_boundary); } else if (patch_type[i] == boundaryConditions::processor) { fvm_laplacian_vector_boundary_tmp<<>>(num_boundary_surfaces, patch_size[i], offset, boundary_mag_sf, boundary_gamma, gradient_internal_coeffs, gradient_boundary_coeffs, @@ -2518,13 +2439,10 @@ void fvc_ddt_vol_scalar_vol_scalar(cudaStream_t stream, int num_cells, double rD const double *rho, const double *rho_old, const double *vf, const double *vf_old, const double *volume, double *output, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_ddt_vol_scalar_vol_scalar_kernel<<>>(num_cells, rDeltaT, rho, rho_old, vf, vf_old, volume, output, sign); - TICK_END_EVENT(fvc_ddt_vol_scalar_vol_scalar_kernel); } void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, @@ -2536,13 +2454,10 @@ void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, const double *volume, const double *boundary_mag_Sf, double *boundary_output, const double *boundary_deltaCoeffs, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 32; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_grad_vector_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output); - TICK_END_EVENT(fvc_grad_vector_internal); int offset = 0; // finish conctruct grad field except dividing cell volume @@ -2554,10 +2469,8 @@ void 
fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, || patch_type[i] == boundaryConditions::fixedValue || patch_type[i] == boundaryConditions::calculated) { // TODO: just vector version now - TICK_START_EVENT; fvc_grad_vector_boundary_zeroGradient<<>>(num_boundary_surfaces, num_cells, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output); - TICK_END_EVENT(fvc_grad_vector_boundary_zeroGradient); } else if (patch_type[i] == boundaryConditions::processor) { fvc_grad_vector_boundary_processor<<>>(num_boundary_surfaces, num_cells, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_weight, boundary_vf, output); @@ -2573,9 +2486,7 @@ void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, // divide cell volume threads_per_block = 512; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; divide_cell_volume_tsr<<>>(num_cells, volume, output); - TICK_END_EVENT(divide_cell_volume_tsr); // correct boundary conditions offset = 0; @@ -2585,11 +2496,9 @@ void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, // TODO: just basic patch type now if (patch_type[i] == boundaryConditions::zeroGradient) { // TODO: just vector version now - TICK_START_EVENT; fvc_grad_vector_correctBC_zeroGradient<<>>(num_cells, num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, output, boundary_vf, boundary_Sf, boundary_mag_Sf, boundary_output); - TICK_END_EVENT(fvc_grad_vector_correctBC_zeroGradient); } else if (patch_type[i] == boundaryConditions::fixedValue) { // TODO: implement fixedValue version fvc_grad_vector_correctBC_fixedValue<<>>( @@ -2612,12 +2521,9 @@ void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, void scale_dev2T_tensor(cudaStream_t stream, int num_cells, const double *vf1, double *vf2, int num_boundary_surfaces, const double *boundary_vf1, double *boundary_vf2) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / 
threads_per_block; - TICK_START_EVENT; scale_dev2t_tensor_kernel<<>>(num_cells, vf1, vf2); - TICK_END_EVENT(scale_dev2t_tensor_kernel); blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; scale_dev2t_tensor_kernel<<>>(num_boundary_surfaces, boundary_vf1, boundary_vf2); @@ -2628,12 +2534,9 @@ void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces int num_patches, const int *patch_size, const int *patch_type, const double *boundary_ssf, const double *volume, double *output, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_div_surface_scalar_internal<<>>(num_surfaces, lowerAddr, upperAddr, ssf, output, sign); - TICK_END_EVENT(fvc_div_surface_scalar_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -2656,13 +2559,10 @@ void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, i const double *boundary_weight, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_div_cell_vector_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output, sign); - TICK_END_EVENT(fvc_div_cell_vector_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -2674,11 +2574,9 @@ void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, i || patch_type[i] == boundaryConditions::gradientEnergy || patch_type[i] == boundaryConditions::calculated) { // TODO: just vector version now - TICK_START_EVENT; fvc_div_cell_vector_boundary<<>>( num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output, sign); - TICK_END_EVENT(fvc_div_cell_vector_boundary); } else if (patch_type[i] == 
boundaryConditions::processor) { fvc_div_cell_vector_boundary_processor<<>>( num_boundary_surfaces, patch_size[i], offset, @@ -2700,13 +2598,10 @@ void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, i const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_div_cell_tensor_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output, sign); - TICK_END_EVENT(fvc_div_cell_tensor_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -2717,11 +2612,9 @@ void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, i || patch_type[i] == boundaryConditions::fixedValue || patch_type[i] == boundaryConditions::calculated) { // TODO: just vector version now - TICK_START_EVENT; fvc_div_cell_tensor_boundary_zeroGradient<<>>(num_cells, num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output, sign); - TICK_END_EVENT(fvc_div_cell_tensor_boundary_zeroGradient); } else if (patch_type[i] == boundaryConditions::processor) { fvc_div_cell_tensor_boundary_processor<<>>(num_cells, num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, boundary_weight, boundary_Sf, boundary_vf, output, sign); @@ -2742,13 +2635,10 @@ void fvc_div_surface_scalar_vol_scalar(cudaStream_t stream, int num_surfaces, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_ssf, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_div_surface_scalar_vol_scalar_internal<<>>(num_surfaces, lowerAddr, upperAddr, weight, ssf, vf, output, sign); - TICK_END_EVENT(fvc_div_surface_scalar_vol_scalar_internal); int offset = 
0; for (int i = 0; i < num_patches; i++) { @@ -2758,11 +2648,9 @@ void fvc_div_surface_scalar_vol_scalar(cudaStream_t stream, int num_surfaces, if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue || patch_type[i] == boundaryConditions::calculated) { - TICK_START_EVENT; fvc_div_surface_scalar_vol_scalar_boundary<<>>( patch_size[i], offset, boundary_cell_face, boundary_vf, boundary_ssf, output, sign); - TICK_END_EVENT(fvc_div_surface_scalar_vol_scalar_boundary); } else if (patch_type[i] == boundaryConditions::processor) { fvc_div_surface_scalar_vol_scalar_boundary<<>>( patch_size[i], offset, boundary_cell_face, @@ -2783,13 +2671,10 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_patches, const int *patch_size, const int *patch_type, const double *boundary_weight, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_grad_scalar_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output, sign); - TICK_END_EVENT(fvc_grad_scalar_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -2799,10 +2684,8 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue || patch_type[i] == boundaryConditions::calculated) { - TICK_START_EVENT; fvc_grad_scalar_boundary_zeroGradient<<>>(num_boundary_surfaces, num_cells, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output, sign); - TICK_END_EVENT(fvc_grad_scalar_boundary_zeroGradient); } else if (patch_type[i] == boundaryConditions::processor) { fvc_grad_scalar_boundary_processor<<>>(num_boundary_surfaces, num_cells, 
patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_weight, boundary_vf, output, sign); @@ -2823,13 +2706,10 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, bool dividVol, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_grad_scalar_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output, sign); - TICK_END_EVENT(fvc_grad_scalar_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -2839,10 +2719,8 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue || patch_type[i] == boundaryConditions::calculated) { - TICK_START_EVENT; fvc_grad_scalar_boundary_zeroGradient<<>>(num_boundary_surfaces, num_cells, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output, sign); - TICK_END_EVENT(fvc_grad_scalar_boundary_zeroGradient); } else if (patch_type[i] == boundaryConditions::processor) { fvc_grad_scalar_boundary_processor<<>>(num_boundary_surfaces, num_cells, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_weight, boundary_vf, output, sign); @@ -2952,13 +2830,10 @@ void fvc_flux(cudaStream_t stream, int num_cells, int num_surfaces, int num_boun const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, double *boundary_output, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_flux_internal_kernel<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output, sign); - TICK_END_EVENT(fvc_flux_internal_kernel); int offset = 0; for 
(int i = 0; i < num_patches; i++) { @@ -2968,10 +2843,8 @@ void fvc_flux(cudaStream_t stream, int num_cells, int num_surfaces, int num_boun if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue || patch_type[i] == boundaryConditions::gradientEnergy) { - TICK_START_EVENT; fvc_flux_boundary_kernel<<>>(num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, boundary_output, sign); - TICK_END_EVENT(fvc_flux_boundary_kernel); } else { // xxx fprintf(stderr, "%s %d, boundaryConditions other than zeroGradient are not support yet!\n", __FILE__, __LINE__); @@ -3030,19 +2903,14 @@ void fvMtx_A(cudaStream_t stream, int num_cells, int num_surfaces, int num_bound double *A_pEqn) { checkCudaErrors(cudaMemcpyAsync(A_pEqn, diag, num_cells * sizeof(double), cudaMemcpyDeviceToDevice, stream)); - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; addAveInternaltoDiag<<>>(num_cells, num_boundary_surfaces, boundary_cell_face, internal_coeffs, A_pEqn); - TICK_END_EVENT(addAveInternaltoDiag); blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; divide_cell_volume_scalar<<>>(num_cells, volume, A_pEqn); - TICK_END_EVENT(divide_cell_volume_scalar); } void fvMtx_H(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, @@ -3052,21 +2920,16 @@ void fvMtx_H(cudaStream_t stream, int num_cells, int num_surfaces, int num_bound const double *lower, const double *upper, const double *source, const double *psi, double *H_pEqn, double *H_pEqn_perm) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; checkCudaErrors(cudaMemcpyAsync(H_pEqn, source, num_cells * 3 * sizeof(double), cudaMemcpyDeviceToDevice, stream)); 
addBoundaryDiag<<>>(num_cells, num_boundary_surfaces, boundary_cell_face, internal_coffs, psi, H_pEqn); - TICK_END_EVENT(addBoundaryDiag); blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; lduMatrix_H<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, lower, upper, psi, H_pEqn); - TICK_END_EVENT(lduMatrix_H); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -3075,19 +2938,15 @@ void fvMtx_H(cudaStream_t stream, int num_cells, int num_surfaces, int num_bound // TODO: just non-coupled patch type now if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { - TICK_START_EVENT; addBoundarySrc_unCoupled<<>>(num_cells, patch_size[i], offset, num_boundary_surfaces, boundary_cell_face, boundary_coeffs, H_pEqn); - TICK_END_EVENT(addBoundarySrc_unCoupled); } else { fprintf(stderr, "%s %d, boundaryConditions other than zeroGradient are not support yet!\n", __FILE__, __LINE__); } offset += patch_size[i]; } blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; divideVol_permute_vec<<>>(num_cells, volume, H_pEqn, H_pEqn_perm); - TICK_END_EVENT(divideVol_permute_vec); } void fvMtx_flux(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, @@ -3120,11 +2979,8 @@ void fvMtx_flux(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces void solve_explicit_scalar(cudaStream_t stream, int num_cells, const double *diag, const double *source, double *psi) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; solve_explicit_scalar_kernel<<>>(num_cells, diag, source, psi); - TICK_END_EVENT(solve_explicit_scalar_kernel); } diff --git a/src_gpu/dfRhoEqn.H b/src_gpu/dfRhoEqn.H index fe22f2aa2..f79b0584b 100644 --- a/src_gpu/dfRhoEqn.H +++ b/src_gpu/dfRhoEqn.H @@ -9,6 +9,7 @@ private: dfMatrixDataBase &dataBase_; // cuda 
resource + cudaStream_t stream; // one graph for one eqn before using self-developed solver cudaGraph_t graph; cudaGraphExec_t graph_instance; @@ -32,6 +33,7 @@ public: // member function // initialization + void setConstantValues(); void setConstantFields(const std::vector patch_type); void initNonConstantFields(const double *rho, const double *phi, const double *boundary_rho, const double *boudnary_phi); diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu/dfRhoEqn.cu index 365e102f0..0babea810 100644 --- a/src_gpu/dfRhoEqn.cu +++ b/src_gpu/dfRhoEqn.cu @@ -9,6 +9,10 @@ void dfRhoEqn::createNonConstantLduAndCsrFields() #endif } +void dfRhoEqn::setConstantValues() { + this->stream = dataBase_.stream; +} + void dfRhoEqn::setConstantFields(const std::vector patch_type) { this->patch_type = patch_type; } @@ -34,14 +38,9 @@ void dfRhoEqn::preProcess() void dfRhoEqn::process() { - // 使用event计算时间 - float time_elapsed=0; - cudaEvent_t start,stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - checkCudaErrors(cudaEventRecord(start,0)); - -#ifndef TIME_GPU + TICK_INIT_EVENT; + TICK_START_EVENT; +#ifdef USE_GRAPH if(!graph_created) { DEBUG_TRACE; checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); @@ -68,19 +67,23 @@ void dfRhoEqn::process() correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_face_cell, dataBase_.d_rho, dataBase_.d_boundary_rho); -#ifndef TIME_GPU +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0)); graph_created = true; } checkCudaErrors(cudaGraphLaunch(graph_instance, dataBase_.stream)); #endif + TICK_END_EVENT(rhoEqn process); - checkCudaErrors(cudaEventRecord(stop,0)); - 
checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - fprintf(stderr, "rhoEqn process time:%f(ms)\n\n",time_elapsed); + TICK_START_EVENT; +#ifdef STREAM_ALLOCATOR + checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_diag, dataBase_.stream)); +#endif + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_rho, dataBase_.d_rho, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + TICK_END_EVENT(rhoEqn post process); + sync(); } void dfRhoEqn::sync() @@ -93,16 +96,7 @@ void dfRhoEqn::solve() solve_explicit_scalar(dataBase_.stream, dataBase_.num_cells, d_diag, d_source, dataBase_.d_rho); } -void dfRhoEqn::postProcess(double *h_rho) -{ -#ifdef STREAM_ALLOCATOR - checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_diag, dataBase_.stream)); -#endif - checkCudaErrors(cudaMemcpyAsync(h_rho, dataBase_.d_rho, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - sync(); - // TODO: correct boundary conditions -} +void dfRhoEqn::postProcess(double *h_rho) {} void dfRhoEqn::compareResult(const double *diag, const double *source, bool printFlag) { diff --git a/src_gpu/dfThermo.H b/src_gpu/dfThermo.H index b833f937a..e1f9c0f58 100644 --- a/src_gpu/dfThermo.H +++ b/src_gpu/dfThermo.H @@ -1,6 +1,7 @@ #pragma once #include "dfMatrixDataBase.H" +#include "dfMatrixOpBase.H" void init_const_coeff_ptr(std::vector>& nasa_coeffs, std::vector>& viscosity_coeffs, std::vector>& thermal_conductivity_coeffs, std::vector>& binary_diffusion_coeffs, diff --git a/src_gpu/dfThermo.cu b/src_gpu/dfThermo.cu index 307d6d611..da62828aa 100644 --- a/src_gpu/dfThermo.cu +++ b/src_gpu/dfThermo.cu @@ -6,28 +6,6 @@ #include #include "device_launch_parameters.h" -#ifdef TIME_GPU - #define TICK_INIT_EVENT \ - float time_elapsed_kernel=0;\ - cudaEvent_t start_kernel, stop_kernel;\ - 
checkCudaErrors(cudaEventCreate(&start_kernel));\ - checkCudaErrors(cudaEventCreate(&stop_kernel)); - - #define TICK_START_EVENT \ - checkCudaErrors(cudaEventRecord(start_kernel,0)); - - #define TICK_END_EVENT(prefix) \ - checkCudaErrors(cudaEventRecord(stop_kernel,0));\ - checkCudaErrors(cudaEventSynchronize(start_kernel));\ - checkCudaErrors(cudaEventSynchronize(stop_kernel));\ - checkCudaErrors(cudaEventElapsedTime(&time_elapsed_kernel,start_kernel,stop_kernel));\ - printf("try %s 执行时间:%lf(ms)\n", #prefix, time_elapsed_kernel); -#else - #define TICK_INIT_EVENT - #define TICK_START_EVENT - #define TICK_END_EVENT(prefix) -#endif - #define GAS_CANSTANT 8314.46261815324 #define SQRT8 2.8284271247461903 #define NUM_SPECIES 7 @@ -454,55 +432,40 @@ void dfThermo::calculateTPolyGPU(int threads_per_block, int num_thread, int num_ void dfThermo::calculatePsiGPU(int threads_per_block, int num_thread, const double *T, const double *mean_mole_weight, double *d_psi, int offset) { - TICK_INIT_EVENT; size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; calculate_psi_kernel<<>>(num_thread, offset, T, mean_mole_weight, d_psi); - TICK_END_EVENT(calculate_psi_kernel); } void dfThermo::calculateRhoGPU(int threads_per_block, int num_thread, const double *p, const double *psi, double *rho, int offset) { - TICK_INIT_EVENT; size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; calculate_rho_kernel<<>>(num_cells, offset, p, psi, rho); - TICK_END_EVENT(calculate_rho_kernel); } void dfThermo::calculateViscosityGPU(int threads_per_block, int num_thread, int num_total, const double *T, const double *mole_fraction, const double *T_poly, double *species_viscosities, double *viscosity, int offset) { - TICK_INIT_EVENT; size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; calculate_viscosity_kernel<<>>(num_thread, num_total, num_species, offset, 
T_poly, T, mole_fraction, species_viscosities, viscosity); - TICK_END_EVENT(calculate_viscosity_kernel); } void dfThermo::calculateThermoConductivityGPU(int threads_per_block, int num_thread, int num_total, const double *T, const double *T_poly, const double *d_y, const double *mole_fraction, double *species_thermal_conductivities, double *thermal_conductivity, int offset) { - TICK_INIT_EVENT; size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; calculate_thermoConductivity_kernel<<>>(num_thread, num_total, num_species, offset, d_nasa_coeffs, d_y, T_poly, T, mole_fraction, species_thermal_conductivities, thermal_conductivity); - TICK_END_EVENT(calculate_thermoConductivity_kernel); } void dfThermo::calculateTemperatureGPU(int threads_per_block, int num_thread, int num_total, const double *T_init, const double *target_h, double *T, const double *d_mass_fraction, int offset, double atol, double rtol, int max_iter) { - TICK_INIT_EVENT; size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; calculate_temperature_kernel<<>>(num_thread, num_total, num_species, offset, T_init, target_h, d_mass_fraction, T, atol, rtol, max_iter); - TICK_END_EVENT(calculate_temperature_kernel); } void dfThermo::calculateEnthalpyGPU(int threads_per_block, int num_thread, const double *T, double *enthalpy, const double *d_mass_fraction, int offset) @@ -619,7 +582,6 @@ void dfThermo::calculateEnergyGradient(int num_thread, int num_cells, int num_sp { size_t threads_per_block = 256; size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; - calculate_energy_gradient_kernel<<>>(num_thread, num_cells, num_species, num_boundary_surfaces, bou_offset, gradient_offset, face2Cells, T, p, y, boundary_p, boundary_y, boundary_delta_coeffs, boundary_thermo_gradient); } diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index e7a16f284..99db6ef60 100644 --- a/src_gpu/dfUEqn.H +++ 
b/src_gpu/dfUEqn.H @@ -11,10 +11,12 @@ private: dfMatrixDataBase &dataBase_; // cuda resource + cudaStream_t stream; // one graph for one eqn before using self-developed solver cudaGraph_t graph_pre, graph_post; cudaGraphExec_t graph_instance_pre, graph_instance_post; - bool graph_created=false; + bool pre_graph_created=false; + bool post_graph_created=false; // constant values -- basic std::string mode_string; diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 9dabdb597..61dc11340 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -265,6 +265,7 @@ __global__ void ueqn_divide_cell_volume_vec(int num_cells, const double* volume, } void dfUEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path) { + this->stream = dataBase_.stream; this->mode_string = mode_string; this->setting_path = setting_path; UxSolver = new AmgXSolver(mode_string, setting_path, dataBase_.localRank); @@ -356,10 +357,12 @@ void dfUEqn::initNonConstantFieldsBoundary() { } void dfUEqn::cleanCudaResources() { - if (graph_created) { + if (pre_graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance_pre)); - checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_pre)); + } + if (post_graph_created) { + checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_post)); } } @@ -376,15 +379,10 @@ void dfUEqn::preProcess(const double *h_u, const double *h_boundary_u, const dou } void dfUEqn::process() { - //使用event计算时间 - float time_elapsed=0; - cudaEvent_t start,stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - checkCudaErrors(cudaEventRecord(start,0)); - -#ifndef TIME_GPU - if(!graph_created) { + TICK_INIT_EVENT; + TICK_START_EVENT; +#ifdef USE_GRAPH + if(!pre_graph_created) { DEBUG_TRACE; checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif @@ -487,54 +485,72 @@ void 
dfUEqn::process() { dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), d_ldu, d_extern, d_source, d_internal_coeffs, d_boundary_coeffs, d_A, d_b); #endif -#ifndef TIME_GPU +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_pre)); checkCudaErrors(cudaGraphInstantiate(&graph_instance_pre, graph_pre, NULL, NULL, 0)); + pre_graph_created = true; } DEBUG_TRACE; checkCudaErrors(cudaGraphLaunch(graph_instance_pre, dataBase_.stream)); #endif - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - fprintf(stderr, "ueqn assembly time:%f(ms)\n",time_elapsed); + TICK_END_EVENT(UEqn assembly); - checkCudaErrors(cudaEventRecord(start,0)); + TICK_START_EVENT; #ifndef DEBUG_CHECK_LDU solve(); #endif - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - fprintf(stderr, "ueqn solve time:%f(ms)\n",time_elapsed); - - checkCudaErrors(cudaEventRecord(start,0)); -#ifndef TIME_GPU - if(!graph_created) { + TICK_END_EVENT(UEqn solve); + + TICK_START_EVENT; +#ifdef USE_GRAPH + if(!post_graph_created) { checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif - correct_boundary_conditions_vector(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_boundary_surfaces, + correct_boundary_conditions_vector(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), + dataBase_.num_boundary_surfaces, dataBase_.num_cells, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_face_cell, dataBase_.d_u, dataBase_.d_boundary_u); vector_half_mag_square(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, dataBase_.d_k, 
dataBase_.num_boundary_surfaces, dataBase_.d_boundary_u, dataBase_.d_boundary_k); + + // copy u and boundary_u to host + permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, d_u_host_order); + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_u, d_u_host_order, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); -#ifndef TIME_GPU +#ifdef STREAM_ALLOCATOR + // free + // thermophysical fields + checkCudaErrors(cudaFreeAsync(d_nu_eff, dataBase_.stream)); + // intermediate fields + checkCudaErrors(cudaFreeAsync(d_grad_u, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_rho_nueff, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_fvc_output, dataBase_.stream)); + + // thermophysical fields + checkCudaErrors(cudaFreeAsync(d_boundary_nu_eff, dataBase_.stream)); + // intermediate fields + checkCudaErrors(cudaFreeAsync(d_boundary_grad_u, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_rho_nueff, dataBase_.stream)); + // boundary coeff fields + checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream)); + + checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream)); +#endif + +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_post)); checkCudaErrors(cudaGraphInstantiate(&graph_instance_post, graph_post, NULL, NULL, 0)); - graph_created = true; + post_graph_created = true; } checkCudaErrors(cudaGraphLaunch(graph_instance_post, dataBase_.stream)); #endif - - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - fprintf(stderr, "ueqn post 
process time:%f(ms)\n\n",time_elapsed); + TICK_END_EVENT(UEqn post process); + sync(); } void dfUEqn::sync() @@ -543,7 +559,7 @@ void dfUEqn::sync() } void dfUEqn::solve() { - sync(); + sync(); // TODO need remove if (num_iteration == 0) // first interation { @@ -564,35 +580,7 @@ void dfUEqn::solve() { num_iteration++; } -void dfUEqn::postProcess(double *h_u) { - permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, d_u_host_order); - checkCudaErrors(cudaMemcpyAsync(h_u, d_u_host_order, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); - -#ifdef STREAM_ALLOCATOR - // free - // thermophysical fields - checkCudaErrors(cudaFreeAsync(d_nu_eff, dataBase_.stream)); - // intermediate fields - checkCudaErrors(cudaFreeAsync(d_grad_u, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_rho_nueff, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_fvc_output, dataBase_.stream)); - - // thermophysical fields - checkCudaErrors(cudaFreeAsync(d_boundary_nu_eff, dataBase_.stream)); - // intermediate fields - checkCudaErrors(cudaFreeAsync(d_boundary_grad_u, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_boundary_rho_nueff, dataBase_.stream)); - // boundary coeff fields - checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream)); - - checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream)); -#endif -} +void dfUEqn::postProcess(double *h_u) {} void dfUEqn::A(double *Psi) { fvMtx_A(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, diff --git a/src_gpu/dfYEqn.H b/src_gpu/dfYEqn.H index c34e969bd..10304bd97 100644 --- 
a/src_gpu/dfYEqn.H +++ b/src_gpu/dfYEqn.H @@ -11,6 +11,7 @@ private: dfMatrixDataBase &dataBase_; // cuda resource + cudaStream_t stream; // one graph for one eqn before using self-developed solver cudaGraph_t graph; cudaGraphExec_t graph_instance; diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu index 432ca641b..cc08edbe1 100644 --- a/src_gpu/dfYEqn.cu +++ b/src_gpu/dfYEqn.cu @@ -285,6 +285,7 @@ double* dfYEqn::getFieldPointer(const char* fieldAlias, location loc, position p } void dfYEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path, const int inertIndex) { + this->stream = dataBase_.stream; this->mode_string = mode_string; this->setting_path = setting_path; this->inertIndex = inertIndex; @@ -411,14 +412,9 @@ void dfYEqn::preProcess(const double *h_rhoD, const double *h_boundary_rhoD, } void dfYEqn::process() { - //使用event计算时间 - float time_elapsed=0; - cudaEvent_t start,stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - checkCudaErrors(cudaEventRecord(start,0)); - -#ifndef TIME_GPU + TICK_INIT_EVENT; + TICK_START_EVENT; +#ifdef USE_GRAPH if(!graph_created) { DEBUG_TRACE; checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); @@ -508,7 +504,7 @@ void dfYEqn::process() { // double *d_boundary_DEff = d_boundary_rhoD; yeqn_compute_DEff(dataBase_.stream, dataBase_.num_species, dataBase_.num_cells, dataBase_.num_boundary_surfaces, d_lewis_number, dataBase_.d_thermo_alpha, d_mut_sct, d_DEff, dataBase_.d_boundary_thermo_alpha, d_boundary_mut_sct, d_boundary_DEff); -#ifndef TIME_GPU +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0)); graph_created = true; @@ -582,6 +578,9 @@ void dfYEqn::process() { if (s == dataBase_.num_species - 1) num_iteration++; } + TICK_END_EVENT(YEqn assembly and solve for all species); + + TICK_START_EVENT; // compute 
y_inertIndex yeqn_compute_y_inertIndex(dataBase_.stream, dataBase_.num_species, inertIndex, dataBase_.num_cells, dataBase_.d_y); @@ -593,34 +592,10 @@ void dfYEqn::process() { dataBase_.d_y + dataBase_.num_cells * s, dataBase_.d_boundary_y + dataBase_.num_boundary_surfaces * s); } - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - fprintf(stderr, "yeqn process time: %f(ms)\n\n",time_elapsed); -} - -void dfYEqn::solve(int solverIndex) { - if (num_iteration == 0) // first interation - { - fprintf(stderr, "Initializing AmgX Linear Solver\n"); - DEBUG_TRACE; - YSolverSet[solverIndex]->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A); - DEBUG_TRACE; - } - else - { - DEBUG_TRACE; - YSolverSet[solverIndex]->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A); - DEBUG_TRACE; - } - // use d_source as d_b - DEBUG_TRACE; - YSolverSet[solverIndex]->solve(dataBase_.num_cells, dataBase_.d_y + dataBase_.num_cells * solverIndex, d_source); - DEBUG_TRACE; -} + // copy y and boundary_y to host + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_y, dataBase_.d_y, dataBase_.cell_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_y, dataBase_.d_boundary_y, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); -void dfYEqn::postProcess(double *h_y, double *h_boundary_y) { #ifdef STREAM_ALLOCATOR // thermophysical fields checkCudaErrors(cudaFreeAsync(d_rhoD, dataBase_.stream)); @@ -655,12 +630,35 @@ void dfYEqn::postProcess(double *h_y, double *h_boundary_y) { checkCudaErrors(cudaFreeAsync(d_boundary_coeffs, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); #endif - // copy y to host - 
checkCudaErrors(cudaMemcpyAsync(h_y, dataBase_.d_y, dataBase_.cell_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); - checkCudaErrors(cudaMemcpyAsync(h_boundary_y, dataBase_.d_boundary_y, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); - checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); + TICK_END_EVENT(YEqn post process for all species); + sync(); +} + +void dfYEqn::solve(int solverIndex) { + TICK_INIT_EVENT; + TICK_START_EVENT; + if (num_iteration == 0) // first interation + { + fprintf(stderr, "Initializing AmgX Linear Solver\n"); + DEBUG_TRACE; + YSolverSet[solverIndex]->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A); + DEBUG_TRACE; + } + else + { + DEBUG_TRACE; + YSolverSet[solverIndex]->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A); + DEBUG_TRACE; + } + // use d_source as d_b + DEBUG_TRACE; + YSolverSet[solverIndex]->solve(dataBase_.num_cells, dataBase_.d_y + dataBase_.num_cells * solverIndex, d_source); + DEBUG_TRACE; + TICK_END_EVENT(YEqn solve one specie); } +void dfYEqn::postProcess(double *h_y, double *h_boundary_y) {} + void dfYEqn::sync() { checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); } diff --git a/src_gpu/dfpEqn.H b/src_gpu/dfpEqn.H index fe58e6608..b8d84640b 100644 --- a/src_gpu/dfpEqn.H +++ b/src_gpu/dfpEqn.H @@ -11,10 +11,12 @@ private: dfMatrixDataBase &dataBase_; // cuda resource + cudaStream_t stream; // one graph for one eqn before using self-developed solver cudaGraph_t graph_pre, graph_post; cudaGraphExec_t graph_instance_pre, graph_instance_post; - bool graph_created=false; + bool pre_graph_created=false; + bool post_graph_created=false; // constant values -- basic std::string mode_string; diff --git a/src_gpu/dfpEqn.cu b/src_gpu/dfpEqn.cu index 139fe3a1a..b8940d878 100644 --- a/src_gpu/dfpEqn.cu +++ b/src_gpu/dfpEqn.cu @@ 
-294,6 +294,7 @@ double* dfpEqn::getFieldPointer(const char* fieldAlias, location loc, position p } void dfpEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path) { + this->stream = dataBase_.stream; this->mode_string = mode_string; this->setting_path = setting_path; pSolver = new AmgXSolver(mode_string, setting_path, dataBase_.localRank); @@ -340,10 +341,12 @@ void dfpEqn::initNonConstantFields(const double *p, const double *boundary_p){ } void dfpEqn::cleanCudaResources() { - if (graph_created) { + if (pre_graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance_pre)); - checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_pre)); + } + if (post_graph_created) { + checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_post)); } } @@ -364,15 +367,10 @@ void dfpEqn::correctP(const double *h_p, double *h_boundary_p) { }; void dfpEqn::process() { - - float time_elapsed=0; - cudaEvent_t start,stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - checkCudaErrors(cudaEventRecord(start,0)); - -#ifndef TIME_GPU - if(!graph_created) { + TICK_INIT_EVENT; + TICK_START_EVENT; +#ifdef USE_GRAPH + if(!pre_graph_created) { DEBUG_TRACE; checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif @@ -428,75 +426,66 @@ void dfpEqn::process() { dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), d_ldu, d_source, d_internal_coeffs, d_boundary_coeffs, d_A); -#ifndef TIME_GPU +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_pre)); checkCudaErrors(cudaGraphInstantiate(&graph_instance_pre, graph_pre, NULL, NULL, 0)); + pre_graph_created = true; } DEBUG_TRACE; checkCudaErrors(cudaGraphLaunch(graph_instance_pre, dataBase_.stream)); #endif - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - 
checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - fprintf(stderr, "peqn assembly time:%f(ms)\n",time_elapsed); - - checkCudaErrors(cudaEventRecord(start,0)); + TICK_END_EVENT(pEqn assembly); + + TICK_START_EVENT; solve(); - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - fprintf(stderr, "peqn solve time:%f(ms)\n",time_elapsed); - - checkCudaErrors(cudaEventRecord(start,0)); -#ifndef TIME_GPU - if(!graph_created) { + TICK_END_EVENT(pEqn solve); + + TICK_START_EVENT; +#ifdef USE_GRAPH + if(!post_graph_created) { checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif - correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), - dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(), - patch_type_p.data(), dataBase_.d_boundary_delta_coeffs, - dataBase_.d_boundary_face_cell, dataBase_.d_p, dataBase_.d_boundary_p); - // update phi - fvMtx_flux(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, - d_lower, d_upper, dataBase_.d_p, d_flux, - dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), - dataBase_.d_boundary_face_cell, d_internal_coeffs, d_boundary_coeffs, dataBase_.d_boundary_p, d_boundary_flux); - field_add_scalar(dataBase_.stream, dataBase_.num_surfaces, d_phiHbyA, d_flux, dataBase_.d_phi, - dataBase_.num_boundary_surfaces, d_boundary_phiHbyA, d_boundary_flux, dataBase_.d_boundary_phi); - // correct U - checkCudaErrors(cudaMemsetAsync(dataBase_.d_u, 0., dataBase_.cell_value_vec_bytes, dataBase_.stream)); - // TODO: may do not need to calculate boundary fields - fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, 
dataBase_.num_boundary_surfaces, - dataBase_.d_owner, dataBase_.d_neighbor, - dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, dataBase_.d_u, - dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), dataBase_.d_boundary_weight, - dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume, true); - scalar_field_multiply_vector_field(dataBase_.stream, dataBase_.num_cells, dataBase_.d_rAU, dataBase_.d_u, dataBase_.d_u); - field_add_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.d_HbyA, dataBase_.d_u, dataBase_.d_u, -1.); - correct_boundary_conditions_vector(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_boundary_surfaces, - dataBase_.num_cells, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_U.data(), dataBase_.d_boundary_face_cell, - dataBase_.d_u, dataBase_.d_boundary_u); - vector_half_mag_square(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, dataBase_.d_k, dataBase_.num_boundary_surfaces, - dataBase_.d_boundary_u, dataBase_.d_boundary_k); - // calculate dpdt - fvc_ddt_scalar_field(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, dataBase_.d_p, dataBase_.d_p_old, dataBase_.d_volume, dataBase_.d_dpdt, 1.); - - #ifndef TIME_GPU + correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), + dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(), + patch_type_p.data(), dataBase_.d_boundary_delta_coeffs, + dataBase_.d_boundary_face_cell, dataBase_.d_p, dataBase_.d_boundary_p); + // update phi + fvMtx_flux(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, + d_lower, d_upper, dataBase_.d_p, d_flux, + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), + dataBase_.d_boundary_face_cell, d_internal_coeffs, d_boundary_coeffs, dataBase_.d_boundary_p, d_boundary_flux); + 
field_add_scalar(dataBase_.stream, dataBase_.num_surfaces, d_phiHbyA, d_flux, dataBase_.d_phi, + dataBase_.num_boundary_surfaces, d_boundary_phiHbyA, d_boundary_flux, dataBase_.d_boundary_phi); + // correct U + checkCudaErrors(cudaMemsetAsync(dataBase_.d_u, 0., dataBase_.cell_value_vec_bytes, dataBase_.stream)); + // TODO: may do not need to calculate boundary fields + fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, + dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, dataBase_.d_u, + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), dataBase_.d_boundary_weight, + dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume, true); + scalar_field_multiply_vector_field(dataBase_.stream, dataBase_.num_cells, dataBase_.d_rAU, dataBase_.d_u, dataBase_.d_u); + field_add_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.d_HbyA, dataBase_.d_u, dataBase_.d_u, -1.); + correct_boundary_conditions_vector(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), + dataBase_.num_boundary_surfaces, + dataBase_.num_cells, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_U.data(), dataBase_.d_boundary_face_cell, + dataBase_.d_u, dataBase_.d_boundary_u); + vector_half_mag_square(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, dataBase_.d_k, dataBase_.num_boundary_surfaces, + dataBase_.d_boundary_u, dataBase_.d_boundary_k); + // calculate dpdt + fvc_ddt_scalar_field(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, + dataBase_.d_p, dataBase_.d_p_old, dataBase_.d_volume, dataBase_.d_dpdt, 1.); + +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_post)); checkCudaErrors(cudaGraphInstantiate(&graph_instance_post, graph_post, NULL, NULL, 0)); - graph_created = true; + post_graph_created = true; } 
checkCudaErrors(cudaGraphLaunch(graph_instance_post, dataBase_.stream)); #endif - - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - fprintf(stderr, "peqn post process time:%f(ms)\n\n",time_elapsed); + TICK_END_EVENT(pEqn post process); + sync(); } void dfpEqn::postProcess() {} From 5b8071353e4c969f19330ea8f7541d4867ae176f Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Mon, 23 Oct 2023 15:32:38 +0800 Subject: [PATCH 2/6] refine time statistic --- src_gpu/dfEEqn.cu | 8 ++++++-- src_gpu/dfMatrixDataBase.H | 4 ++-- src_gpu/dfMatrixOpBase.cu | 12 +++++++++++- src_gpu/dfRhoEqn.cu | 4 +++- src_gpu/dfUEqn.cu | 10 +++++++--- src_gpu/dfYEqn.cu | 6 +++++- src_gpu/dfpEqn.cu | 4 ++-- 7 files changed, 36 insertions(+), 12 deletions(-) diff --git a/src_gpu/dfEEqn.cu b/src_gpu/dfEEqn.cu index b9f92d526..c4c1bc549 100644 --- a/src_gpu/dfEEqn.cu +++ b/src_gpu/dfEEqn.cu @@ -204,21 +204,25 @@ void dfEEqn::process() { #endif TICK_END_EVENT(EEqn solve); - TICK_START_EVENT; #ifdef USE_GRAPH if(!post_graph_created) { checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif + TICK_START_EVENT; correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_he.data(), dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_face_cell, dataBase_.d_he, dataBase_.d_boundary_he, d_boundary_heGradient); + TICK_END_EVENT(EEqn post process correctBC); + TICK_START_EVENT; // copy he and boundary_he to host checkCudaErrors(cudaMemcpyAsync(dataBase_.h_he, dataBase_.d_he, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_he, dataBase_.d_boundary_he, dataBase_.boundary_surface_value_bytes, 
cudaMemcpyDeviceToHost, dataBase_.stream)); + TICK_END_EVENT(EEqn post process copy back); + TICK_START_EVENT; #ifdef STREAM_ALLOCATOR // thermophysical fields checkCudaErrors(cudaFreeAsync(d_dpdt, dataBase_.stream)); @@ -238,6 +242,7 @@ void dfEEqn::process() { checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream)); #endif + TICK_END_EVENT(EEqn post process free); #ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_post)); checkCudaErrors(cudaGraphInstantiate(&graph_instance_post, graph_post, NULL, NULL, 0)); @@ -245,7 +250,6 @@ void dfEEqn::process() { } checkCudaErrors(cudaGraphLaunch(graph_instance_post, dataBase_.stream)); #endif - TICK_END_EVENT(EEqn post process); sync(); } diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index def2f4a89..ba3403ffe 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -16,8 +16,8 @@ #include #include -#define DEBUG_ -#define DEBUG_CHECK_LDU +//#define DEBUG_ +//#define DEBUG_CHECK_LDU extern int myRank; diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index 9d9d33566..dda7f9003 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -297,10 +297,13 @@ void correct_boundary_conditions_processor_scalar(cudaStream_t stream, ncclComm_ correct_internal_boundary_field_scalar<<>>(num, offset, vf, boundary_cell_face, vf_boundary); + TICK_INIT_EVENT; + TICK_START_EVENT; checkNcclErrors(ncclGroupStart()); checkNcclErrors(ncclSend(vf_boundary + internal_start_index, num, ncclDouble, peer, comm, stream)); checkNcclErrors(ncclRecv(vf_boundary + neighbor_start_index, num, ncclDouble, peer, comm, stream)); checkNcclErrors(ncclGroupEnd()); + TICK_END_EVENT(nccl scalar); //checkCudaErrors(cudaStreamSynchronize(stream)); } @@ -316,12 +319,15 @@ void correct_boundary_conditions_processor_vector(cudaStream_t stream, ncclComm_ correct_internal_boundary_field_vector<<>>(num, offset, 
num_boundary_surfaces, num_cells, vf, boundary_cell_face, vf_boundary); + TICK_INIT_EVENT; + TICK_START_EVENT; checkNcclErrors(ncclGroupStart()); for (int i = 0; i < 3; i++) { checkNcclErrors(ncclSend(vf_boundary + num_boundary_surfaces * i + internal_start_index, num, ncclDouble, peer, comm, stream)); checkNcclErrors(ncclRecv(vf_boundary + num_boundary_surfaces * i + neighbor_start_index, num, ncclDouble, peer, comm, stream)); } checkNcclErrors(ncclGroupEnd()); + TICK_END_EVENT(nccl vector); //checkCudaErrors(cudaStreamSynchronize(stream)); } @@ -1179,13 +1185,17 @@ void fvc_grad_vector_correctBC_processor(cudaStream_t stream, ncclComm_t comm, correct_internal_boundary_field_tensor<<>>(num, offset, num_boundary_surfaces, num_cells, internal_grad, face2Cells, boundary_grad); + + TICK_INIT_EVENT; + TICK_START_EVENT; checkNcclErrors(ncclGroupStart()); for (int i = 0; i < 9; i++) { checkNcclErrors(ncclSend(boundary_grad + num_boundary_surfaces * i + internal_start_index, num, ncclDouble, peer, comm, stream)); checkNcclErrors(ncclRecv(boundary_grad + num_boundary_surfaces * i + neighbor_start_index, num, ncclDouble, peer, comm, stream)); } checkNcclErrors(ncclGroupEnd()); - checkCudaErrors(cudaStreamSynchronize(stream)); + TICK_END_EVENT(nccl tensor); + //checkCudaErrors(cudaStreamSynchronize(stream)); } __global__ void fvc_grad_cell_scalar_correctBC_zeroGradient(int num_cells, int num_boundary_surfaces, diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu/dfRhoEqn.cu index 0babea810..3b848b6c7 100644 --- a/src_gpu/dfRhoEqn.cu +++ b/src_gpu/dfRhoEqn.cu @@ -81,8 +81,10 @@ void dfRhoEqn::process() checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_diag, dataBase_.stream)); #endif + TICK_END_EVENT(rhoEqn post process free); + TICK_START_EVENT; checkCudaErrors(cudaMemcpyAsync(dataBase_.h_rho, dataBase_.d_rho, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - TICK_END_EVENT(rhoEqn post process); + TICK_END_EVENT(rhoEqn 
post process copyback); sync(); } diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 61dc11340..76bf3f93d 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -501,23 +501,27 @@ void dfUEqn::process() { #endif TICK_END_EVENT(UEqn solve); - TICK_START_EVENT; #ifdef USE_GRAPH if(!post_graph_created) { checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif + TICK_START_EVENT; correct_boundary_conditions_vector(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_boundary_surfaces, dataBase_.num_cells, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_face_cell, dataBase_.d_u, dataBase_.d_boundary_u); vector_half_mag_square(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, dataBase_.d_k, dataBase_.num_boundary_surfaces, dataBase_.d_boundary_u, dataBase_.d_boundary_k); - + TICK_END_EVENT(UEqn post process correctBC); + + TICK_START_EVENT; // copy u and boundary_u to host permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, d_u_host_order); checkCudaErrors(cudaMemcpyAsync(dataBase_.h_u, d_u_host_order, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + TICK_END_EVENT(UEqn post process copyback); + TICK_START_EVENT; #ifdef STREAM_ALLOCATOR // free // thermophysical fields @@ -541,6 +545,7 @@ void dfUEqn::process() { checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream)); #endif + TICK_END_EVENT(UEqn post process free); #ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_post)); @@ -549,7 +554,6 @@ void dfUEqn::process() { } checkCudaErrors(cudaGraphLaunch(graph_instance_post, dataBase_.stream)); #endif - TICK_END_EVENT(UEqn post process); sync(); } diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu index cc08edbe1..b4c499db8 100644 --- a/src_gpu/dfYEqn.cu +++ b/src_gpu/dfYEqn.cu @@ -591,11 +591,15 @@ void 
dfYEqn::process() { patch_type.data(), dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_face_cell, dataBase_.d_y + dataBase_.num_cells * s, dataBase_.d_boundary_y + dataBase_.num_boundary_surfaces * s); } + TICK_END_EVENT(YEqn post process for all species correctBC); + TICK_START_EVENT; // copy y and boundary_y to host checkCudaErrors(cudaMemcpyAsync(dataBase_.h_y, dataBase_.d_y, dataBase_.cell_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_y, dataBase_.d_boundary_y, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); + TICK_END_EVENT(YEqn post process for all species copy back); + TICK_START_EVENT; #ifdef STREAM_ALLOCATOR // thermophysical fields checkCudaErrors(cudaFreeAsync(d_rhoD, dataBase_.stream)); @@ -630,7 +634,7 @@ void dfYEqn::process() { checkCudaErrors(cudaFreeAsync(d_boundary_coeffs, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); #endif - TICK_END_EVENT(YEqn post process for all species); + TICK_END_EVENT(YEqn post process for all species free); sync(); } diff --git a/src_gpu/dfpEqn.cu b/src_gpu/dfpEqn.cu index b8940d878..71dad23d7 100644 --- a/src_gpu/dfpEqn.cu +++ b/src_gpu/dfpEqn.cu @@ -440,12 +440,12 @@ void dfpEqn::process() { solve(); TICK_END_EVENT(pEqn solve); - TICK_START_EVENT; #ifdef USE_GRAPH if(!post_graph_created) { checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif + TICK_START_EVENT; correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), dataBase_.d_boundary_delta_coeffs, @@ -476,6 +476,7 @@ void dfpEqn::process() { // calculate dpdt fvc_ddt_scalar_field(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, dataBase_.d_p, dataBase_.d_p_old, dataBase_.d_volume, 
dataBase_.d_dpdt, 1.); + TICK_END_EVENT(pEqn post process all); #ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_post)); @@ -484,7 +485,6 @@ void dfpEqn::process() { } checkCudaErrors(cudaGraphLaunch(graph_instance_post, dataBase_.stream)); #endif - TICK_END_EVENT(pEqn post process); sync(); } void dfpEqn::postProcess() {} From 74dcf0c622f7fd5cc799def761557f304330eb22 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Tue, 24 Oct 2023 17:09:21 +0800 Subject: [PATCH 3/6] refactor macros; fix invalid config for empty patches; --- src_gpu/dfEEqn.H | 2 ++ src_gpu/dfEEqn.cu | 5 ++++- src_gpu/dfMatrixOpBase.H | 27 ++++++++++++++++++++++++--- src_gpu/dfMatrixOpBase.cu | 25 +++++++++++++++++++++++++ src_gpu/dfRhoEqn.H | 2 ++ src_gpu/dfRhoEqn.cu | 2 ++ src_gpu/dfThermo.cu | 1 + src_gpu/dfUEqn.H | 2 ++ src_gpu/dfUEqn.cu | 6 ++++++ src_gpu/dfYEqn.H | 2 ++ src_gpu/dfYEqn.cu | 24 ++++++++++++++---------- src_gpu/dfpEqn.H | 2 ++ src_gpu/dfpEqn.cu | 6 ++++++ 13 files changed, 92 insertions(+), 14 deletions(-) diff --git a/src_gpu/dfEEqn.H b/src_gpu/dfEEqn.H index 63616b414..6fac22965 100644 --- a/src_gpu/dfEEqn.H +++ b/src_gpu/dfEEqn.H @@ -13,11 +13,13 @@ class dfEEqn // cuda resource cudaStream_t stream; +#ifdef USE_GRAPH // one graph for one eqn before using self-developed solver cudaGraph_t graph_pre, graph_post; cudaGraphExec_t graph_instance_pre, graph_instance_post; bool pre_graph_created=false; bool post_graph_created=false; +#endif // constant values -- basic std::string mode_string; diff --git a/src_gpu/dfEEqn.cu b/src_gpu/dfEEqn.cu index c4c1bc549..e2b993ba7 100644 --- a/src_gpu/dfEEqn.cu +++ b/src_gpu/dfEEqn.cu @@ -89,6 +89,7 @@ void dfEEqn::initNonConstantFields(const double *he, const double *boundary_he) } void dfEEqn::cleanCudaResources() { +#ifdef USE_GRAPH if (pre_graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance_pre)); checkCudaErrors(cudaGraphDestroy(graph_pre)); @@ -97,6 +98,7 @@ void 
dfEEqn::cleanCudaResources() { checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_post)); } +#endif } void dfEEqn::preProcess(const double *h_he, const double *h_k, const double *h_k_old, const double *h_dpdt, const double *h_boundary_k, const double *h_boundary_heGradient) @@ -123,7 +125,7 @@ void dfEEqn::process() { checkCudaErrors(cudaMallocAsync((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMallocAsync((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); - checkCudaErrors(cudaMallocAsync((void**)&d_boundary_heGradient, dataBase_.num_gradientEnergy_boundary_surfaces, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_heGradient, sizeof(double) * num_gradientEnergy_boundary_surfaces, dataBase_.stream)); checkCudaErrors(cudaMallocAsync((void**)&d_source, dataBase_.cell_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMallocAsync((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); @@ -261,6 +263,7 @@ void dfEEqn::eeqn_calculate_energy_gradient(dfThermo& GPUThermo, int num_cells, { int bou_offset = 0, gradient_offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; if (patch_type[i] == boundaryConditions::gradientEnergy) { GPUThermo.calculateEnergyGradient(patch_size[i], num_cells, num_species, num_boundary_surfaces, bou_offset, gradient_offset, face2Cells, T, p, y, boundary_delta_coeffs, boundary_p, boundary_y, boundary_thermo_gradient); diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index 195e86225..ea8b5db99 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -2,18 +2,39 @@ #include #include +// macros need to be in a certain order + +// sequence 0: STREAM_ALLOCATOR can be open or not +#ifndef STREAM_ALLOCATOR + #define STREAM_ALLOCATOR +#endif + +// sequence 0: USE_GRAPH 
can be open or not #ifndef USE_GRAPH #define USE_GRAPH #endif +// sequence 0: TIME_GPU can be open or not #ifndef TIME_GPU #define TIME_GPU - #if (defined TIME_GPU) && (defined USE_GRAPH) - #undef USE_GRAPH - #endif +#endif + +// sequence 1: TIME_GPU and USE_GRAPH can not be open at the same time +#if (defined TIME_GPU) && (defined USE_GRAPH) + #undef USE_GRAPH +#endif + +// sequence 2: STREAM_ALLOCATOR must be open if USE_GRAPH is open +#if (defined USE_GRAPH) && (!defined STREAM_ALLOCATOR) + #define STREAM_ALLOCATOR #endif extern int myRank; + +#define PRINT_PTR(x) { \ + fprintf(stderr, "rank[%d], %s %d, print ptr %s: %p\n", myRank, __FILE__, __LINE__, #x, x); \ +} + extern __global__ void warmup(); #ifdef TIME_GPU diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index dda7f9003..129fe0fcb 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -2034,6 +2034,7 @@ void ldu_to_csr_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int int bou_offset = 0, ext_offset = 0; size_t threads_per_block, blocks_per_grid; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; if (patch_type[i] == boundaryConditions::processor) { threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; @@ -2049,6 +2050,7 @@ void ldu_to_csr_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int // add coeff to source and diagnal bou_offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::processor) { @@ -2096,6 +2098,7 @@ void update_boundary_coeffs_scalar(cudaStream_t stream, int offset = 0; int gradient_offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // 
TODO: just basic patch type now @@ -2135,6 +2138,7 @@ void correct_boundary_conditions_scalar(cudaStream_t stream, ncclComm_t comm, int offset = 0; int gradient_offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::zeroGradient @@ -2170,6 +2174,7 @@ void correct_boundary_conditions_vector(cudaStream_t stream, ncclComm_t comm, int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::zeroGradient @@ -2202,6 +2207,7 @@ void update_boundary_coeffs_vector(cudaStream_t stream, int num_boundary_surface int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2277,6 +2283,7 @@ void fvm_div_scalar(cudaStream_t stream, int num_surfaces, const int *lowerAddr, int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2316,6 +2323,7 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surf int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2354,6 +2362,7 @@ void fvm_laplacian_scalar(cudaStream_t stream, int num_surfaces, int num_boundar weight, mag_sf, delta_coeffs, gamma, lower, upper, diag, sign); int offset = 0; for (int i = 0; i < num_patches; i++) { + if 
(patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2393,6 +2402,7 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundar weight, mag_sf, delta_coeffs, gamma, lower, upper, diag, sign); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2432,6 +2442,7 @@ void fvm_laplacian_surface_scalar_vol_scalar(cudaStream_t stream, int num_surfac int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; fvm_laplacian_surface_scalar_boundary<<>>(patch_size[i], offset, @@ -2472,6 +2483,7 @@ void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, int offset = 0; // finish conctruct grad field except dividing cell volume for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2501,6 +2513,7 @@ void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, // correct boundary conditions offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2550,6 +2563,7 @@ void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; fvc_div_surface_scalar_boundary<<>>(patch_size[i], offset, 
boundary_cell_face, @@ -2576,6 +2590,7 @@ void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, i int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2615,6 +2630,7 @@ void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, i int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2652,6 +2668,7 @@ void fvc_div_surface_scalar_vol_scalar(cudaStream_t stream, int num_surfaces, int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just non-coupled patch type now @@ -2688,6 +2705,7 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just non-coupled patch type now @@ -2723,6 +2741,7 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just non-coupled patch type now @@ -2769,6 +2788,7 @@ void fvc_grad_cell_scalar_withBC(cudaStream_t stream, ncclComm_t comm, const int // correct boundary conditions int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + 
threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2813,6 +2833,7 @@ void fvc_laplacian_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2847,6 +2868,7 @@ void fvc_flux(cudaStream_t stream, int num_cells, int num_surfaces, int num_boun int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: maybe do not need loop boundarys @@ -2876,6 +2898,7 @@ void fvc_interpolate(cudaStream_t stream, int num_cells, int num_surfaces, int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: maybe do not need loop boundarys @@ -2943,6 +2966,7 @@ void fvMtx_H(cudaStream_t stream, int num_cells, int num_surfaces, int num_bound int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just non-coupled patch type now @@ -2972,6 +2996,7 @@ void fvMtx_flux(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::processor) { diff --git a/src_gpu/dfRhoEqn.H b/src_gpu/dfRhoEqn.H index f79b0584b..c38347977 100644 --- a/src_gpu/dfRhoEqn.H +++ b/src_gpu/dfRhoEqn.H @@ -10,10 +10,12 @@ private: // cuda resource cudaStream_t stream; +#ifdef 
USE_GRAPH // one graph for one eqn before using self-developed solver cudaGraph_t graph; cudaGraphExec_t graph_instance; bool graph_created=false; +#endif // constant fields - boundary std::vector patch_type; diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu/dfRhoEqn.cu index 3b848b6c7..627ef26dc 100644 --- a/src_gpu/dfRhoEqn.cu +++ b/src_gpu/dfRhoEqn.cu @@ -26,10 +26,12 @@ void dfRhoEqn::initNonConstantFields(const double *rho, const double *phi, } void dfRhoEqn::cleanCudaResources() { +#ifdef USE_GRAPH if (graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance)); checkCudaErrors(cudaGraphDestroy(graph)); } +#endif } void dfRhoEqn::preProcess() diff --git a/src_gpu/dfThermo.cu b/src_gpu/dfThermo.cu index da62828aa..4d7eec672 100644 --- a/src_gpu/dfThermo.cu +++ b/src_gpu/dfThermo.cu @@ -507,6 +507,7 @@ void dfThermo::correctThermo() // boundary field int offset = 0; for (int i = 0; i < dataBase_.num_patches; i++) { + if (dataBase_.patch_size[i] == 0) continue; if (dataBase_.patch_type_T[i] == boundaryConditions::fixedValue) { calculateTPolyGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.num_boundary_surfaces, dataBase_.d_boundary_T, d_boundary_T_poly, offset); calculateEnthalpyGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.d_boundary_T, dataBase_.d_boundary_he, diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index 99db6ef60..33ca8ede3 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -12,11 +12,13 @@ private: // cuda resource cudaStream_t stream; +#ifdef USE_GRAPH // one graph for one eqn before using self-developed solver cudaGraph_t graph_pre, graph_post; cudaGraphExec_t graph_instance_pre, graph_instance_post; bool pre_graph_created=false; bool post_graph_created=false; +#endif // constant values -- basic std::string mode_string; diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 76bf3f93d..56682491d 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -357,6 +357,7 @@ void dfUEqn::initNonConstantFieldsBoundary() { } 
void dfUEqn::cleanCudaResources() { +#ifdef USE_GRAPH if (pre_graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance_pre)); checkCudaErrors(cudaGraphDestroy(graph_pre)); @@ -365,6 +366,7 @@ void dfUEqn::cleanCudaResources() { checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_post)); } +#endif } void dfUEqn::preProcessForRhoEqn(const double *h_rho, const double *h_phi, const double *h_boundary_phi) { @@ -636,6 +638,7 @@ void dfUEqn::UEqnGetHbyA(cudaStream_t stream, ncclComm_t comm, const int *neighb int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just non-coupled patch type now @@ -666,6 +669,7 @@ void dfUEqn::UEqnGetHbyA(cudaStream_t stream, ncclComm_t comm, const int *neighb // constrainHbyA offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just non-coupled patch type now @@ -698,6 +702,7 @@ void dfUEqn::ueqn_ldu_to_csr(cudaStream_t stream, int num_cells, int num_surface int bou_offset = 0, ext_offset = 0; size_t threads_per_block, blocks_per_grid; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; if (patch_type[i] == boundaryConditions::processor) { threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; @@ -719,6 +724,7 @@ void dfUEqn::ueqn_ldu_to_csr(cudaStream_t stream, int num_cells, int num_surface // add coeff to source and diagnal bou_offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::processor) { diff --git a/src_gpu/dfYEqn.H b/src_gpu/dfYEqn.H 
index 10304bd97..b83445202 100644 --- a/src_gpu/dfYEqn.H +++ b/src_gpu/dfYEqn.H @@ -12,10 +12,12 @@ private: // cuda resource cudaStream_t stream; +#ifdef USE_GRAPH // one graph for one eqn before using self-developed solver cudaGraph_t graph; cudaGraphExec_t graph_instance; bool graph_created=false; +#endif // constant values -- basic std::string mode_string; diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu index b4c499db8..b3eb211ba 100644 --- a/src_gpu/dfYEqn.cu +++ b/src_gpu/dfYEqn.cu @@ -373,13 +373,6 @@ void dfYEqn::createNonConstantLduAndCsrFields() { void dfYEqn::initNonConstantFieldsInternal(const double *y) { checkCudaErrors(cudaMemcpyAsync(dataBase_.d_y, y, dataBase_.cell_value_bytes * dataBase_.num_species, cudaMemcpyHostToDevice, dataBase_.stream)); - - // UnityLewis - checkCudaErrors(cudaMemsetAsync(d_hai, 0, dataBase_.cell_value_bytes * dataBase_.num_species, dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_boundary_hai, 0, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, dataBase_.stream)); - // laminar - checkCudaErrors(cudaMemsetAsync(d_mut_sct, 0, dataBase_.cell_value_bytes, dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_boundary_mut_sct, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); } void dfYEqn::initNonConstantFieldsBoundary(const double *boundary_y) { @@ -397,10 +390,12 @@ void dfYEqn::initNonConstantFieldsBoundary(const double *boundary_y) { } void dfYEqn::cleanCudaResources() { +#ifdef USE_GRAPH if (graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance)); checkCudaErrors(cudaGraphDestroy(graph)); } +#endif } void dfYEqn::preProcess(const double *h_rhoD, const double *h_boundary_rhoD, @@ -449,6 +444,14 @@ void dfYEqn::process() { checkCudaErrors(cudaMallocAsync((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMallocAsync((void**)&d_A, dataBase_.csr_value_bytes, dataBase_.stream)); #endif + + // UnityLewis + 
checkCudaErrors(cudaMemsetAsync(d_hai, 0, dataBase_.cell_value_bytes * dataBase_.num_species, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_boundary_hai, 0, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, dataBase_.stream)); + // laminar + checkCudaErrors(cudaMemsetAsync(d_mut_sct, 0, dataBase_.cell_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_boundary_mut_sct, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(dataBase_.d_diff_alphaD, 0, dataBase_.cell_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(dataBase_.d_boundary_diff_alphaD, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(d_grad_y, 0, dataBase_.cell_value_vec_bytes * dataBase_.num_species, dataBase_.stream)); @@ -583,7 +586,6 @@ void dfYEqn::process() { TICK_START_EVENT; // compute y_inertIndex yeqn_compute_y_inertIndex(dataBase_.stream, dataBase_.num_species, inertIndex, dataBase_.num_cells, dataBase_.d_y); - // correct boundary conditions for (int s = 0; s < dataBase_.num_species; s++) { correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), @@ -602,7 +604,7 @@ void dfYEqn::process() { TICK_START_EVENT; #ifdef STREAM_ALLOCATOR // thermophysical fields - checkCudaErrors(cudaFreeAsync(d_rhoD, dataBase_.stream)); + //checkCudaErrors(cudaFreeAsync(d_rhoD, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_hai, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_mut_sct, dataBase_.stream)); // intermediate fields @@ -613,7 +615,7 @@ void dfYEqn::process() { checkCudaErrors(cudaFreeAsync(d_permute, dataBase_.stream)); // thermophysical fields - checkCudaErrors(cudaFreeAsync(d_boundary_rhoD, dataBase_.stream)); + //checkCudaErrors(cudaFreeAsync(d_boundary_rhoD, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_boundary_hai, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_boundary_mut_sct, 
dataBase_.stream)); // intermediate fields @@ -709,6 +711,7 @@ void dfYEqn::yeqn_fvc_laplacian_scalar(cudaStream_t stream, ncclComm_t comm, con int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -746,6 +749,7 @@ void dfYEqn::yeqn_fvc_laplacian_scalar(cudaStream_t stream, ncclComm_t comm, con boundary_cell_face, output, boundary_output); offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; if (patch_type[i] == boundaryConditions::processor) { correct_boundary_conditions_processor_scalar(stream, comm, neighbor_peer[i], patch_size[i], offset, output, boundary_cell_face, boundary_output); diff --git a/src_gpu/dfpEqn.H b/src_gpu/dfpEqn.H index b8d84640b..ebbb246f4 100644 --- a/src_gpu/dfpEqn.H +++ b/src_gpu/dfpEqn.H @@ -12,11 +12,13 @@ private: // cuda resource cudaStream_t stream; +#ifdef USE_GRAPH // one graph for one eqn before using self-developed solver cudaGraph_t graph_pre, graph_post; cudaGraphExec_t graph_instance_pre, graph_instance_post; bool pre_graph_created=false; bool post_graph_created=false; +#endif // constant values -- basic std::string mode_string; diff --git a/src_gpu/dfpEqn.cu b/src_gpu/dfpEqn.cu index 71dad23d7..d96380fbd 100644 --- a/src_gpu/dfpEqn.cu +++ b/src_gpu/dfpEqn.cu @@ -341,6 +341,7 @@ void dfpEqn::initNonConstantFields(const double *p, const double *boundary_p){ } void dfpEqn::cleanCudaResources() { +#ifdef USE_GRAPH if (pre_graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance_pre)); checkCudaErrors(cudaGraphDestroy(graph_pre)); @@ -349,6 +350,7 @@ void dfpEqn::cleanCudaResources() { checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_post)); } +#endif } // tmp @@ -511,6 +513,7 @@ void dfpEqn::getrhorAUf(cudaStream_t stream, int num_cells, int num_surfaces, int 
offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: maybe do not need loop boundarys @@ -547,6 +550,7 @@ void dfpEqn::getphiHbyA(cudaStream_t stream, int num_cells, int num_surfaces, in int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::processor) { @@ -565,6 +569,7 @@ void dfpEqn::getphiHbyA(cudaStream_t stream, int num_cells, int num_surfaces, in offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::processor) { @@ -584,6 +589,7 @@ void dfpEqn::getphiHbyA(cudaStream_t stream, int num_cells, int num_surfaces, in offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::extrapolated) { From 4496dd573389cf75d49b521eea4f4601b7437413 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Tue, 24 Oct 2023 19:29:36 +0800 Subject: [PATCH 4/6] use stream allocator in pEqn and thermo --- src_gpu/dfThermo.cu | 34 ++++++++++++++++++++-- src_gpu/dfpEqn.H | 2 +- src_gpu/dfpEqn.cu | 71 ++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 95 insertions(+), 12 deletions(-) diff --git a/src_gpu/dfThermo.cu b/src_gpu/dfThermo.cu index 4d7eec672..92a482e5a 100644 --- a/src_gpu/dfThermo.cu +++ b/src_gpu/dfThermo.cu @@ -23,8 +23,8 @@ void init_const_coeff_ptr(std::vector>& nasa_coeffs, std::ve std::vector>& thermal_conductivity_coeffs, std::vector>& binary_diffusion_coeffs, std::vector& 
molecular_weights) { - double *d_tmp; - checkCudaErrors(cudaMalloc((void**)&d_tmp, sizeof(double) * NUM_SPECIES * 15)); + //double *d_tmp; + //checkCudaErrors(cudaMalloc((void**)&d_tmp, sizeof(double) * NUM_SPECIES * 15)); double nasa_coeffs_tmp[NUM_SPECIES*15]; double viscosity_coeffs_tmp[NUM_SPECIES*5]; double thermal_conductivity_coeffs_tmp[NUM_SPECIES*5]; @@ -344,6 +344,7 @@ void dfThermo::setConstantValue(std::string mechanism_file, int num_cells, int n initCoeffsfromBinaryFile(fp); stream = dataBase_.stream; +#ifndef STREAM_ALLOCATOR checkCudaErrors(cudaMalloc((void**)&d_mole_fraction, sizeof(double) * num_species * num_cells)); checkCudaErrors(cudaMalloc((void**)&d_boundary_mole_fraction, sizeof(double) * num_species * dataBase_.num_boundary_surfaces)); checkCudaErrors(cudaMalloc((void**)&d_mean_mole_weight, sizeof(double) * num_cells)); @@ -355,10 +356,10 @@ void dfThermo::setConstantValue(std::string mechanism_file, int num_cells, int n checkCudaErrors(cudaMalloc((void**)&d_boundary_species_viscosities, sizeof(double) * num_species * dataBase_.num_boundary_surfaces)); checkCudaErrors(cudaMalloc((void**)&d_species_thermal_conductivities, sizeof(double) * num_species * num_cells)); checkCudaErrors(cudaMalloc((void**)&d_boundary_species_thermal_conductivities, sizeof(double) * num_species * dataBase_.num_boundary_surfaces)); +#endif checkCudaErrors(cudaMalloc((void**)&d_psip0, sizeof(double) * num_cells)); checkCudaErrors(cudaMalloc((void**)&d_boundary_psip0, sizeof(double) * dataBase_.num_boundary_surfaces)); - std::cout << "dfThermo initialized" << std::endl; } @@ -493,6 +494,20 @@ void dfThermo::updateEnergy() void dfThermo::correctThermo() { +#ifdef STREAM_ALLOCATOR + checkCudaErrors(cudaMallocAsync((void**)&d_mole_fraction, sizeof(double) * num_species * num_cells, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_mole_fraction, sizeof(double) * num_species * dataBase_.num_boundary_surfaces, stream)); + 
checkCudaErrors(cudaMallocAsync((void**)&d_mean_mole_weight, sizeof(double) * num_cells, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_mean_mole_weight, sizeof(double) * dataBase_.num_boundary_surfaces, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_T_poly, sizeof(double) * 5 * num_cells, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_T_poly, sizeof(double) * 5 * dataBase_.num_boundary_surfaces, stream)); + + checkCudaErrors(cudaMallocAsync((void**)&d_species_viscosities, sizeof(double) * num_species * num_cells, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_species_viscosities, sizeof(double) * num_species * dataBase_.num_boundary_surfaces, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_species_thermal_conductivities, sizeof(double) * num_species * num_cells, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_species_thermal_conductivities, sizeof(double) * num_species * dataBase_.num_boundary_surfaces, stream)); +#endif + setMassFraction(dataBase_.d_y, dataBase_.d_boundary_y); // internal field int cell_thread = 512, boundary_thread = 32; @@ -550,6 +565,19 @@ void dfThermo::correctThermo() } else { offset += dataBase_.patch_size[i]; } } +#ifdef STREAM_ALLOCATOR + checkCudaErrors(cudaFreeAsync(d_mole_fraction, stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_mole_fraction, stream)); + checkCudaErrors(cudaFreeAsync(d_mean_mole_weight, stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_mean_mole_weight, stream)); + checkCudaErrors(cudaFreeAsync(d_T_poly, stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_T_poly, stream)); + + checkCudaErrors(cudaFreeAsync(d_species_viscosities, stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_species_viscosities, stream)); + checkCudaErrors(cudaFreeAsync(d_species_thermal_conductivities, stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_species_thermal_conductivities, stream)); +#endif } void dfThermo::updateRho() diff --git 
a/src_gpu/dfpEqn.H b/src_gpu/dfpEqn.H index ebbb246f4..79a414eac 100644 --- a/src_gpu/dfpEqn.H +++ b/src_gpu/dfpEqn.H @@ -96,7 +96,7 @@ public: void preProcess(double *h_phi, double *h_boundary_phi); void correctPsi(const double *h_thermoPsi, double *h_boundary_thermoPsi); // tmp void correctP(const double *h_p, double *h_boundary_p); // tmp - void getFlux(); + //void getFlux(); void process(); void postProcess(); diff --git a/src_gpu/dfpEqn.cu b/src_gpu/dfpEqn.cu index d96380fbd..3b48f55a5 100644 --- a/src_gpu/dfpEqn.cu +++ b/src_gpu/dfpEqn.cu @@ -306,13 +306,17 @@ void dfpEqn::setConstantFields(const std::vector patch_type_U, const std::v } void dfpEqn::createNonConstantFieldsInternal() { +#ifndef STREAM_ALLOCATOR // intermediate fields checkCudaErrors(cudaMalloc((void**)&d_rhorAUf, dataBase_.surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_phiHbyA, dataBase_.surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_flux, dataBase_.surface_value_bytes)); +#endif } void dfpEqn::createNonConstantFieldsBoundary() { +#ifndef STREAM_ALLOCATOR + // boundary coeffs checkCudaErrors(cudaMalloc((void**)&d_value_internal_coeffs, dataBase_.boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_value_boundary_coeffs, dataBase_.boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_bytes)); @@ -321,18 +325,22 @@ void dfpEqn::createNonConstantFieldsBoundary() { checkCudaErrors(cudaMalloc((void**)&d_boundary_rhorAUf, dataBase_.boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_phiHbyA, dataBase_.boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_flux, dataBase_.boundary_surface_value_bytes)); +#endif } void dfpEqn::createNonConstantLduAndCsrFields() { + // ldu and csr checkCudaErrors(cudaMalloc((void**)&d_ldu, dataBase_.csr_value_bytes)); d_lower = d_ldu; d_diag = d_ldu + dataBase_.num_surfaces; d_upper = d_ldu + 
dataBase_.num_cells + dataBase_.num_surfaces; d_extern = d_ldu + dataBase_.num_cells + 2 * dataBase_.num_surfaces; +#ifndef STREAM_ALLOCATOR checkCudaErrors(cudaMalloc((void**)&d_source, dataBase_.cell_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_A, dataBase_.csr_value_bytes)); +#endif } void dfpEqn::initNonConstantFields(const double *p, const double *boundary_p){ @@ -377,6 +385,29 @@ void dfpEqn::process() { checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif +#ifdef STREAM_ALLOCATOR + // intermediate fields + checkCudaErrors(cudaMallocAsync((void**)&d_rhorAUf, dataBase_.surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_phiHbyA, dataBase_.surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_flux, dataBase_.surface_value_bytes, dataBase_.stream)); + + // boundary coeffs + checkCudaErrors(cudaMallocAsync((void**)&d_value_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_value_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + // intermediate boundary fields + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_rhorAUf, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_phiHbyA, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_flux, dataBase_.boundary_surface_value_bytes, 
dataBase_.stream)); + + // ldu and csr + checkCudaErrors(cudaMallocAsync((void**)&d_source, dataBase_.cell_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_A, dataBase_.csr_value_bytes, dataBase_.stream)); +#endif + checkCudaErrors(cudaMemsetAsync(d_ldu, 0, dataBase_.csr_value_bytes, dataBase_.stream)); // d_ldu contains d_lower, d_diag, and d_upper checkCudaErrors(cudaMemsetAsync(d_source, 0, dataBase_.cell_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(d_internal_coeffs, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); @@ -478,6 +509,30 @@ void dfpEqn::process() { // calculate dpdt fvc_ddt_scalar_field(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, dataBase_.d_p, dataBase_.d_p_old, dataBase_.d_volume, dataBase_.d_dpdt, 1.); + +#ifdef STREAM_ALLOCATOR + // intermediate fields + checkCudaErrors(cudaFreeAsync(d_rhorAUf, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_phiHbyA, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_flux, dataBase_.stream)); + + // boundary coeffs + checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream)); + // intermediate boundary fields + checkCudaErrors(cudaFreeAsync(d_boundary_rhorAUf, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_phiHbyA, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_flux, dataBase_.stream)); + + // ldu and csr + checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream)); + 
checkCudaErrors(cudaFreeAsync(d_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); +#endif + TICK_END_EVENT(pEqn post process all); #ifdef USE_GRAPH @@ -491,14 +546,14 @@ void dfpEqn::process() { } void dfpEqn::postProcess() {} -void dfpEqn::getFlux() -{ - fvMtx_flux(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, - d_lower, d_upper, dataBase_.d_p, d_flux, - dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), - dataBase_.d_boundary_face_cell, d_internal_coeffs, d_boundary_coeffs, dataBase_.d_boundary_p, d_boundary_flux); - sync(); -} +//void dfpEqn::getFlux() +//{ +// fvMtx_flux(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, +// d_lower, d_upper, dataBase_.d_p, d_flux, +// dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), +// dataBase_.d_boundary_face_cell, d_internal_coeffs, d_boundary_coeffs, dataBase_.d_boundary_p, d_boundary_flux); +// sync(); +//} void dfpEqn::getrhorAUf(cudaStream_t stream, int num_cells, int num_surfaces, const int *lowerAddr, const int *upperAddr, From 7ef4ce7ee036ed6a906103fc5ca63af5bfa16b52 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Tue, 24 Oct 2023 20:40:57 +0800 Subject: [PATCH 5/6] reuse amgx solvers --- .../solvers/dfLowMachFoam/createGPUSolver.H | 15 ++++++++++- .../solvers/dfLowMachFoam/dfLowMachFoam.C | 4 ++- src_gpu/dfEEqn.cu | 13 +--------- src_gpu/dfMatrixDataBase.H | 14 +++++++++++ src_gpu/dfMatrixDataBase.cu | 19 ++++++++++++++ src_gpu/dfUEqn.cu | 21 +++------------- src_gpu/dfYEqn.cu | 18 +------------ src_gpu/dfpEqn.cu | 25 +------------------ 8 files changed, 56 insertions(+), 73 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/createGPUSolver.H b/applications/solvers/dfLowMachFoam/createGPUSolver.H index 
226b8f3f3..5369ef94a 100644 --- a/applications/solvers/dfLowMachFoam/createGPUSolver.H +++ b/applications/solvers/dfLowMachFoam/createGPUSolver.H @@ -99,7 +99,7 @@ void initNccl() { ncclInit(PstreamGlobals::MPI_COMM_FOAM, nccl_comm, nccl_id, &nRanks, &myRank, &localRank); } -void createGPUBase(fvMesh& mesh, PtrList& Y) { +void createGPUBase(const IOdictionary& CanteraTorchProperties, fvMesh& mesh, PtrList& Y) { // prepare constant values: num_cells, num_surfaces, num_boundary_surfaces, // num_patches, patch_size, num_species, rdelta_t const labelUList& owner = mesh.owner(); @@ -151,6 +151,7 @@ void createGPUBase(fvMesh& mesh, PtrList& Y) { double rDeltaT = 1 / 1e-6; dfDataBase.setConstantValues(num_cells, num_total_cells, num_surfaces, num_boundary_surfaces, num_patches, nProcValues, patch_size, Y.size(), rDeltaT); + // wyr: now there is no other place to prepare nccl info, thus mpi must be initialized at beginning. label flag_mpi_init; MPI_Initialized(&flag_mpi_init); @@ -169,6 +170,14 @@ void createGPUBase(fvMesh& mesh, PtrList& Y) { // prepare cuda resources dfDataBase.prepareCudaResources(); + // setup amgx solvers + string mode_string = "dDDI"; + string u_setting_path; + u_setting_path = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string("")); + string p_setting_path; + p_setting_path = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("pEqnSettingPath", string("")); + dfDataBase.setAmgxSolvers(mode_string, u_setting_path, p_setting_path); + // prepare constant indexes: owner, neighbor, procRows, procCols if (Pstream::parRun()) { @@ -330,6 +339,7 @@ void createGPURhoEqn(const volScalarField& rho, const surfaceScalarField& phi) { } void createGPUUEqn(const IOdictionary& CanteraTorchProperties, const volVectorField& U) { + // TODO need remove amgx solver setting // prepare mode_string and setting_path string mode_string = "dDDI"; string settingPath; @@ -381,6 +391,7 @@ void createGPUUEqn(const IOdictionary& 
CanteraTorchProperties, const volVectorFi void createGPUYEqn(const IOdictionary& CanteraTorchProperties, PtrList& Y, const int inertIndex) { DEBUG_TRACE; + // TODO need remove amgx solver setting // prepare mode_string and setting_path string mode_string = "dDDI"; string settingPath; @@ -430,6 +441,7 @@ void createGPUYEqn(const IOdictionary& CanteraTorchProperties, PtrListsetOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A); - } - else - { - ESolver->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A); - } - ESolver->solve(dataBase_.num_cells, dataBase_.d_he, d_source); + dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A, dataBase_.d_he, d_source); num_iteration++; } diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index ba3403ffe..bc04208ee 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -16,6 +16,9 @@ #include #include +#include "AmgXSolver.H" +#include + //#define DEBUG_ //#define DEBUG_CHECK_LDU @@ -59,6 +62,11 @@ inline void checkVectorEqual(int count, const double* basevec, double* vec, doub } } +enum AMGXSetting { + u_setting, + p_setting +}; + enum location { cpu, gpu @@ -139,6 +147,10 @@ struct dfMatrixDataBase int *d_csr_row_index= nullptr; int *d_csr_col_index= nullptr; + // amgx solvers + AmgXSolver *u_setting_solver = nullptr; + AmgXSolver *p_setting_solver = nullptr; + // constant fields - internal double *d_sf = nullptr; double *d_mag_sf = nullptr; @@ -270,6 +282,8 @@ struct dfMatrixDataBase std::vector patch_size, int num_species, double rdelta_t); void setConstantIndexes(const int *owner, const int *neighbor, const int *procRows, const int *procCols, int globalOffset); + void setAmgxSolvers(const std::string &mode_string, const std::string &u_setting_path, const std::string &p_setting_path); + void solve(int num_iteration, AMGXSetting setting, double *d_A, double *d_x, double *d_b); void 
createConstantFieldsInternal(); void createConstantFieldsBoundary(); diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index 180d90cff..f4badb935 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -135,6 +135,25 @@ void dfMatrixDataBase::setConstantValues(int num_cells, int num_total_cells, int csr_value_vec_bytes = num_Nz * 3 * sizeof(double); } +void dfMatrixDataBase::setAmgxSolvers(const std::string &mode_string, const std::string &u_setting_path, const std::string &p_setting_path) { + // amgx solvers + u_setting_solver = new AmgXSolver(mode_string, u_setting_path, localRank); + p_setting_solver = new AmgXSolver(mode_string, p_setting_path, localRank); +} + +void dfMatrixDataBase::solve(int num_iteration, AMGXSetting setting, double *d_A, double *d_x, double *d_b) { + AmgXSolver *solver = (setting == AMGXSetting::u_setting) ? u_setting_solver : p_setting_solver; + if (num_iteration == 0) // first interation + { + solver->setOperator(num_cells, num_total_cells, num_Nz, d_csr_row_index, d_csr_col_index, d_A); + } + else + { + solver->updateOperator(num_cells, num_Nz, d_A); + } + solver->solve(num_cells, d_x, d_b); +} + void dfMatrixDataBase::setConstantIndexes(const int *owner, const int *neighbor, const int *procRows, const int *procCols, int globalOffset) { // build d_owner, d_neighbor diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 56682491d..19f68c45b 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -565,24 +565,9 @@ void dfUEqn::sync() } void dfUEqn::solve() { - sync(); // TODO need remove - - if (num_iteration == 0) // first interation - { - printf("Initializing AmgX Linear Solver\n"); - UxSolver->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A); - UySolver->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A + 
dataBase_.num_Nz); - UzSolver->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A + 2 * dataBase_.num_Nz); - } - else - { - UxSolver->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A); - UySolver->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A + dataBase_.num_Nz); - UzSolver->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A + 2 * dataBase_.num_Nz); - } - UxSolver->solve(dataBase_.num_cells, dataBase_.d_u, d_b); - UySolver->solve(dataBase_.num_cells, dataBase_.d_u + dataBase_.num_cells, d_b + dataBase_.num_cells); - UzSolver->solve(dataBase_.num_cells, dataBase_.d_u + 2 * dataBase_.num_cells, d_b + 2 * dataBase_.num_cells); + dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A, dataBase_.d_u, d_b); + dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A + dataBase_.num_Nz, dataBase_.d_u + dataBase_.num_cells, d_b + dataBase_.num_cells); + dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A + 2 * dataBase_.num_Nz, dataBase_.d_u + 2 * dataBase_.num_cells, d_b + 2 * dataBase_.num_cells); num_iteration++; } diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu index b3eb211ba..f16446300 100644 --- a/src_gpu/dfYEqn.cu +++ b/src_gpu/dfYEqn.cu @@ -643,23 +643,7 @@ void dfYEqn::process() { void dfYEqn::solve(int solverIndex) { TICK_INIT_EVENT; TICK_START_EVENT; - if (num_iteration == 0) // first interation - { - fprintf(stderr, "Initializing AmgX Linear Solver\n"); - DEBUG_TRACE; - YSolverSet[solverIndex]->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A); - DEBUG_TRACE; - } - else - { - DEBUG_TRACE; - YSolverSet[solverIndex]->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A); - DEBUG_TRACE; - } - // use d_source as d_b - DEBUG_TRACE; - YSolverSet[solverIndex]->solve(dataBase_.num_cells, dataBase_.d_y + dataBase_.num_cells * solverIndex, d_source); - 
DEBUG_TRACE; + dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A, dataBase_.d_y + dataBase_.num_cells * solverIndex, d_source); TICK_END_EVENT(YEqn solve one specie); } diff --git a/src_gpu/dfpEqn.cu b/src_gpu/dfpEqn.cu index 3b48f55a5..bd0c8ac23 100644 --- a/src_gpu/dfpEqn.cu +++ b/src_gpu/dfpEqn.cu @@ -676,30 +676,7 @@ void dfpEqn::sync() void dfpEqn::solve() { - sync(); - - // double *h_A_csr = new double[dataBase_.num_Nz]; - // checkCudaErrors(cudaMemcpy(h_A_csr, d_A, dataBase_.csr_value_bytes, cudaMemcpyDeviceToHost)); - // for (int i = 0; i < dataBase_.num_Nz; i++) - // fprintf(stderr, "h_A_csr[%d]: %.10lf\n", i, h_A_csr[i]); - // delete[] h_A_csr; - - // double *h_b = new double[dataBase_.num_cells]; - // checkCudaErrors(cudaMemcpy(h_b, d_source, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost)); - // for (int i = 0; i < dataBase_.num_cells; i++) - // fprintf(stderr, "h_b[%d]: %.10lf\n", i, h_b[i]); - // delete[] h_b; - - if (num_iteration == 0) // first interation - { - printf("Initializing AmgX Linear Solver\n"); - pSolver->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A); - } - else - { - pSolver->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A); - } - pSolver->solve(dataBase_.num_cells, dataBase_.d_p, d_source); + dataBase_.solve(num_iteration, AMGXSetting::p_setting, d_A, dataBase_.d_p, d_source); num_iteration++; } From 01ac9fea1143a58cdbac655b1f3f913e3ffb4a61 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Wed, 25 Oct 2023 18:27:09 +0800 Subject: [PATCH 6/6] fix several bugs and align multi-card multi-step result --- applications/solvers/dfLowMachFoam/EEqn.H | 2 +- applications/solvers/dfLowMachFoam/UEqn.H | 2 +- .../solvers/dfLowMachFoam/dfLowMachFoam.C | 8 +++---- applications/solvers/dfLowMachFoam/pEqn_GPU.H | 2 +- src_gpu/AmgXSolver.cu | 6 ++--- src_gpu/dfEEqn.cu | 6 +++++ src_gpu/dfRhoEqn.cu | 2 +- src_gpu/dfUEqn.H | 2 +-
src_gpu/dfUEqn.cu | 23 ++++++++----------- 9 files changed, 27 insertions(+), 26 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H index 5554e7128..51277cd73 100644 --- a/applications/solvers/dfLowMachFoam/EEqn.H +++ b/applications/solvers/dfLowMachFoam/EEqn.H @@ -93,7 +93,7 @@ // EEqn_GPU.compareResult(&EEqn.lower()[0], &EEqn.upper()[0], &EEqn.diag()[0], &EEqn.source()[0], // h_internal_coeffs.data(), h_boundary_coeffs.data(), printFlag); // DEBUG_TRACE; - EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag); + //EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag); } delete h_boundary_he_tmp; diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H index 2a059161e..a6244c870 100644 --- a/applications/solvers/dfLowMachFoam/UEqn.H +++ b/applications/solvers/dfLowMachFoam/UEqn.H @@ -118,7 +118,7 @@ // h_internal_coeffs.data(), h_boundary_coeffs.data(), // // &gradU[0][0], h_boundary_gradU, // printFlag); - UEqn_GPU.compareU(&U[0][0], h_boundary_u_tmp, printFlag); + //UEqn_GPU.compareU(&U[0][0], h_boundary_u_tmp, printFlag); } DEBUG_TRACE; #endif diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C index fcbabae90..5f3c7aea1 100644 --- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C +++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C @@ -369,7 +369,7 @@ int main(int argc, char *argv[]) MPI_Comm_rank(MPI_COMM_WORLD, &rank); } if (!mpi_init_flag || rank == 0) { - // thermo_GPU.compareT(&T[0], h_boundary_T_tmp, printFlag); + thermo_GPU.compareT(&T[0], h_boundary_T_tmp, printFlag); // thermo_GPU.compareHe(&thermo.he()[0], h_boundary_he_tmp, printFlag); // thermo_GPU.comparePsi(&psi[0], h_boundary_thermo_psi_tmp, printFlag); // thermo_GPU.compareAlpha(&alpha[0], h_boundary_thermo_alpha_tmp, printFlag); @@ -470,11 +470,9 @@ int main(int argc, char *argv[]) #ifdef GPUSolverNew_ // write U 
for - double *h_U_tmp = new double[dfDataBase.num_cells * 3]; - UEqn_GPU.postProcess(h_U_tmp); - memcpy(&U[0][0], h_U_tmp, dfDataBase.cell_value_vec_bytes); + UEqn_GPU.postProcess(); + memcpy(&U[0][0], dfDataBase.h_u, dfDataBase.cell_value_vec_bytes); U.correctBoundaryConditions(); - delete h_U_tmp; #endif runTime.write(); diff --git a/applications/solvers/dfLowMachFoam/pEqn_GPU.H b/applications/solvers/dfLowMachFoam/pEqn_GPU.H index 82de66ad2..b982d8475 100644 --- a/applications/solvers/dfLowMachFoam/pEqn_GPU.H +++ b/applications/solvers/dfLowMachFoam/pEqn_GPU.H @@ -154,7 +154,7 @@ UEqn_GPU.sync(); } // pEqn_GPU.correctP(&p[0], h_boundary_p); if (!mpi_init_flag || rank == 0) { - pEqn_GPU.comparep(&p[0], h_boundary_p, false); + //pEqn_GPU.comparep(&p[0], h_boundary_p, false); } delete h_boundary_p; diff --git a/src_gpu/AmgXSolver.cu b/src_gpu/AmgXSolver.cu index c4321f868..8658a268b 100644 --- a/src_gpu/AmgXSolver.cu +++ b/src_gpu/AmgXSolver.cu @@ -54,9 +54,6 @@ void AmgXSolver::initialize(const std::string &modeStr, const std::string &cfgFi // get the mode of AmgX solver setMode(modeStr); - // initialize AmgX - initAmgX(cfgFile, devID); - // check if MPI has been initialized MPI_Initialized(&isMPIEnabled); if (isMPIEnabled) { @@ -64,6 +61,9 @@ void AmgXSolver::initialize(const std::string &modeStr, const std::string &cfgFi mpiWorld = MPI_COMM_WORLD; } + // initialize AmgX + initAmgX(cfgFile, devID); + // a bool indicating if this instance is initialized isInitialised = true; diff --git a/src_gpu/dfEEqn.cu b/src_gpu/dfEEqn.cu index a884a42e4..cb43f8c9a 100644 --- a/src_gpu/dfEEqn.cu +++ b/src_gpu/dfEEqn.cu @@ -220,6 +220,12 @@ void dfEEqn::process() { dataBase_.d_boundary_T, dataBase_.d_boundary_y, d_boundary_heGradient, &thermo_); TICK_END_EVENT(EEqn post process correctBC); + TICK_START_EVENT; + // copy he to host + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_he, dataBase_.d_he, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + 
checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_he, dataBase_.d_boundary_he, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + TICK_END_EVENT(EEqn post process copy back); + TICK_START_EVENT; #ifdef STREAM_ALLOCATOR // thermophysical fields diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu/dfRhoEqn.cu index d0942fe2b..57d32c9a4 100644 --- a/src_gpu/dfRhoEqn.cu +++ b/src_gpu/dfRhoEqn.cu @@ -87,7 +87,7 @@ void dfRhoEqn::process() TICK_END_EVENT(rhoEqn post process free); TICK_START_EVENT; checkCudaErrors(cudaMemcpyAsync(dataBase_.h_rho, dataBase_.d_rho, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - TICK_END_EVENT(rhoEqn post process copyback); + TICK_END_EVENT(rhoEqn post process copy back); sync(); } diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index 3b412ecb4..ef3c8133f 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -115,7 +115,7 @@ public: void preProcessForRhoEqn(const double *h_rho, const double *h_phi, const double *h_boundary_phi); void preProcess(const double *h_u, const double *h_boundary_u, const double *h_p, const double *h_boundary_p, const double *h_nu_eff, const double *h_boundary_nu_eff); void process(); - void postProcess(double *h_u); + void postProcess(); void A(double *Psi); void H(double *Psi); diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index c5336bdc2..131c566d8 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -336,8 +336,8 @@ void dfUEqn::createNonConstantFieldsInternal() { checkCudaErrors(cudaMalloc((void**)&d_grad_u, dataBase_.cell_value_tsr_bytes)); checkCudaErrors(cudaMalloc((void**)&d_rho_nueff, dataBase_.cell_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_fvc_output, dataBase_.cell_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_u_host_order, dataBase_.cell_value_vec_bytes)); #endif + checkCudaErrors(cudaMalloc((void**)&d_u_host_order, dataBase_.cell_value_vec_bytes)); // computed on CPU, used on GPU, need memcpyh2d 
checkCudaErrors(cudaMallocHost((void**)&h_nu_eff , dataBase_.cell_value_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_A_pEqn , dataBase_.cell_value_bytes)); @@ -359,9 +359,9 @@ void dfUEqn::createNonConstantFieldsBoundary() { checkCudaErrors(cudaMalloc((void**)&d_value_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes)); +#endif // intermediate boundary checkCudaErrors(cudaMalloc((void**)&d_boundary_u_host_order, dataBase_.boundary_surface_value_vec_bytes)); -#endif // computed on CPU, used on GPU, need memcpyh2d checkCudaErrors(cudaMallocHost((void**)&h_boundary_nu_eff, dataBase_.boundary_surface_value_bytes)); @@ -385,10 +385,6 @@ void dfUEqn::createNonConstantLduAndCsrFields() { checkCudaErrors(cudaMalloc((void**)&d_A_pEqn, dataBase_.cell_value_bytes)); // TODO: delete redundant variables checkCudaErrors(cudaMalloc((void**)&d_H_pEqn, dataBase_.cell_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_H_pEqn_perm, dataBase_.cell_value_vec_bytes)); - - checkCudaErrors(cudaMalloc((void**)&d_u_host_order, dataBase_.cell_value_vec_bytes)); - // intermediate boundary fields - checkCudaErrors(cudaMalloc((void**)&d_boundary_u_host_order, dataBase_.boundary_surface_value_vec_bytes)); } void dfUEqn::initNonConstantFieldsInternal(const double *u, const double *boundary_u) @@ -570,12 +566,6 @@ void dfUEqn::process() { dataBase_.d_boundary_u, dataBase_.d_boundary_k); TICK_END_EVENT(UEqn post process correctBC); - TICK_START_EVENT; - // copy u and boundary_u to host - permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, d_u_host_order); - checkCudaErrors(cudaMemcpyAsync(dataBase_.h_u, d_u_host_order, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - TICK_END_EVENT(UEqn post process copyback); - 
TICK_START_EVENT; #ifdef STREAM_ALLOCATOR // free @@ -624,7 +614,14 @@ void dfUEqn::solve() { num_iteration++; } -void dfUEqn::postProcess(double *h_u) {} +void dfUEqn::postProcess() { + // postProcess of dfUEqn can not be moved to the end of dfUEqn::process(), + // because dataBase_.d_u is modified in dfpEqn and we only need the result of the last change + // copy u and boundary_u to host + permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, d_u_host_order); + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_u, d_u_host_order, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + sync(); +} void dfUEqn::A(double *Psi) { fvMtx_A(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces,