diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H index 5554e7128..51277cd73 100644 --- a/applications/solvers/dfLowMachFoam/EEqn.H +++ b/applications/solvers/dfLowMachFoam/EEqn.H @@ -93,7 +93,7 @@ // EEqn_GPU.compareResult(&EEqn.lower()[0], &EEqn.upper()[0], &EEqn.diag()[0], &EEqn.source()[0], // h_internal_coeffs.data(), h_boundary_coeffs.data(), printFlag); // DEBUG_TRACE; - EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag); + //EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag); } delete h_boundary_he_tmp; diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H index 2a059161e..a6244c870 100644 --- a/applications/solvers/dfLowMachFoam/UEqn.H +++ b/applications/solvers/dfLowMachFoam/UEqn.H @@ -118,7 +118,7 @@ // h_internal_coeffs.data(), h_boundary_coeffs.data(), // // &gradU[0][0], h_boundary_gradU, // printFlag); - UEqn_GPU.compareU(&U[0][0], h_boundary_u_tmp, printFlag); + //UEqn_GPU.compareU(&U[0][0], h_boundary_u_tmp, printFlag); } DEBUG_TRACE; #endif diff --git a/applications/solvers/dfLowMachFoam/createGPUSolver.H b/applications/solvers/dfLowMachFoam/createGPUSolver.H index d72027dc3..7714fcbd1 100644 --- a/applications/solvers/dfLowMachFoam/createGPUSolver.H +++ b/applications/solvers/dfLowMachFoam/createGPUSolver.H @@ -99,7 +99,7 @@ void initNccl() { ncclInit(PstreamGlobals::MPI_COMM_FOAM, nccl_comm, nccl_id, &nRanks, &myRank, &localRank, &mpi_init_flag); } -void createGPUBase(fvMesh& mesh, PtrList& Y) { +void createGPUBase(const IOdictionary& CanteraTorchProperties, fvMesh& mesh, PtrList& Y) { // prepare constant values: num_cells, num_surfaces, num_boundary_surfaces, // num_patches, patch_size, num_species, rdelta_t const labelUList& owner = mesh.owner(); @@ -152,6 +152,7 @@ void createGPUBase(fvMesh& mesh, PtrList& Y) { double rDeltaT = 1 / 1e-6; dfDataBase.setConstantValues(num_cells, num_total_cells, num_surfaces, num_boundary_surfaces, 
num_patches, nProcValues, patch_size, Y.size(), rDeltaT); + // wyr: now there is no other place to prepare nccl info, thus mpi must be initialized at beginning. label flag_mpi_init; MPI_Initialized(&flag_mpi_init); @@ -187,6 +188,14 @@ void createGPUBase(fvMesh& mesh, PtrList& Y) { // prepare cuda resources dfDataBase.prepareCudaResources(); + // setup amgx solvers + string mode_string = "dDDI"; + string u_setting_path; + u_setting_path = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string("")); + string p_setting_path; + p_setting_path = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("pEqnSettingPath", string("")); + dfDataBase.setAmgxSolvers(mode_string, u_setting_path, p_setting_path); + // prepare constant indexes: owner, neighbor, procRows, procCols if (Pstream::parRun()) { @@ -373,12 +382,14 @@ void createGPURhoEqn(const volScalarField& rho, const surfaceScalarField& phi) { offset += patchsize; } } + rhoEqn_GPU.setConstantValues(); rhoEqn_GPU.setConstantFields(patch_type); rhoEqn_GPU.initNonConstantFields(h_rho, h_phi, h_boundary_rho, h_boundary_phi); rhoEqn_GPU.createNonConstantLduAndCsrFields(); } void createGPUUEqn(const IOdictionary& CanteraTorchProperties, const volVectorField& U) { + // TODO need remove amgx solver setting // prepare mode_string and setting_path string mode_string = "dDDI"; string settingPath; @@ -431,6 +442,7 @@ void createGPUUEqn(const IOdictionary& CanteraTorchProperties, const volVectorFi void createGPUYEqn(const IOdictionary& CanteraTorchProperties, PtrList& Y, const int inertIndex) { DEBUG_TRACE; + // TODO need remove amgx solver setting // prepare mode_string and setting_path string mode_string = "dDDI"; string settingPath; @@ -481,6 +493,7 @@ void createGPUYEqn(const IOdictionary& CanteraTorchProperties, PtrListstream = dataBase_.stream; this->mode_string = mode_string; this->setting_path = setting_path; ESolver = new AmgXSolver(mode_string, setting_path, dataBase_.localRank); 
@@ -88,12 +89,16 @@ void dfEEqn::initNonConstantFields(const double *he, const double *boundary_he) } void dfEEqn::cleanCudaResources() { - if (graph_created) { +#ifdef USE_GRAPH + if (pre_graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance_pre)); - checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_pre)); + } + if (post_graph_created) { + checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_post)); } +#endif } void dfEEqn::preProcess(const double *h_he, const double *h_k, const double *h_k_old, const double *h_dpdt, const double *h_boundary_k, const double *h_boundary_heGradient) @@ -101,15 +106,10 @@ void dfEEqn::preProcess(const double *h_he, const double *h_k, const double *h_k } void dfEEqn::process() { - //使用event计算时间 - float time_elapsed=0; - cudaEvent_t start,stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - checkCudaErrors(cudaEventRecord(start,0)); - -#ifndef TIME_GPU - if(!graph_created) { + TICK_INIT_EVENT; + TICK_START_EVENT; +#ifdef USE_GRAPH + if(!pre_graph_created) { DEBUG_TRACE; checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif @@ -125,7 +125,7 @@ void dfEEqn::process() { checkCudaErrors(cudaMallocAsync((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMallocAsync((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); - checkCudaErrors(cudaMallocAsync((void**)&d_boundary_heGradient, dataBase_.num_gradientEnergy_boundary_surfaces, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_heGradient, sizeof(double) * num_gradientEnergy_boundary_surfaces, dataBase_.stream)); checkCudaErrors(cudaMallocAsync((void**)&d_source, dataBase_.cell_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMallocAsync((void**)&d_internal_coeffs, 
dataBase_.boundary_surface_value_bytes, dataBase_.stream)); @@ -190,58 +190,71 @@ void dfEEqn::process() { dataBase_.num_Nz, dataBase_.d_boundary_face_cell, dataBase_.d_ldu_to_csr_index, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_he.data(), d_ldu, d_source, d_internal_coeffs, d_boundary_coeffs, d_A); #endif -#ifndef TIME_GPU +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_pre)); checkCudaErrors(cudaGraphInstantiate(&graph_instance_pre, graph_pre, NULL, NULL, 0)); + pre_graph_created = true; } DEBUG_TRACE; checkCudaErrors(cudaGraphLaunch(graph_instance_pre, dataBase_.stream)); #endif - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - if(!mpi_init_flag || myRank == 0) - fprintf(stderr, "eeqn assembly time:%f(ms)\n",time_elapsed); - - checkCudaErrors(cudaEventRecord(start,0)); + TICK_END_EVENT(EEqn assembly); + + TICK_START_EVENT; #ifndef DEBUG_CHECK_LDU solve(); #endif - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - if(!mpi_init_flag || myRank == 0) - fprintf(stderr, "eeqn solve time:%f(ms)\n",time_elapsed); - - checkCudaErrors(cudaEventRecord(start,0)); -#ifndef TIME_GPU - if(!graph_created) { + TICK_END_EVENT(EEqn solve); + +#ifdef USE_GRAPH + if(!post_graph_created) { checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif + TICK_START_EVENT; correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_he.data(), dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_face_cell, dataBase_.d_he, dataBase_.d_boundary_he, 
dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_weight, dataBase_.d_boundary_T, dataBase_.d_boundary_y, d_boundary_heGradient, &thermo_); + TICK_END_EVENT(EEqn post process correctBC); + + TICK_START_EVENT; + // copy he to host + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_he, dataBase_.d_he, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_he, dataBase_.d_boundary_he, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + TICK_END_EVENT(EEqn post process copy back); -#ifndef TIME_GPU + TICK_START_EVENT; +#ifdef STREAM_ALLOCATOR + // thermophysical fields + checkCudaErrors(cudaFreeAsync(d_dpdt, dataBase_.stream)); + // fiv weight fieldsFree + //checkCudaErrors(cudaFreeAsync(d_phi_special_weight, dataBase_.stream)); + // boundary coeffs + checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream)); + + checkCudaErrors(cudaFreeAsync(d_boundary_heGradient, dataBase_.stream)); + + checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream)); +#endif + TICK_END_EVENT(EEqn post process free); +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_post)); checkCudaErrors(cudaGraphInstantiate(&graph_instance_post, graph_post, NULL, NULL, 0)); - graph_created = true; + post_graph_created = true; } checkCudaErrors(cudaGraphLaunch(graph_instance_post, dataBase_.stream)); #endif - - 
checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - if(!mpi_init_flag || myRank == 0) - fprintf(stderr, "eeqn post process time: %f(ms)\n\n",time_elapsed); + sync(); } void dfEEqn::eeqn_calculate_energy_gradient(dfThermo& GPUThermo, int num_cells, int num_species, @@ -252,6 +265,7 @@ void dfEEqn::eeqn_calculate_energy_gradient(dfThermo& GPUThermo, int num_cells, { int bou_offset = 0, gradient_offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; if (patch_type[i] == boundaryConditions::gradientEnergy) { GPUThermo.calculateEnergyGradient(patch_size[i], num_cells, num_species, num_boundary_surfaces, bou_offset, gradient_offset, face2Cells, T, p, y, boundary_delta_coeffs, boundary_p, boundary_y, boundary_thermo_gradient); @@ -336,45 +350,8 @@ void dfEEqn::sync() void dfEEqn::solve() { - sync(); - - if (num_iteration == 0) // first interation - { - printf("Initializing AmgX Linear Solver\n"); - ESolver->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A); - } - else - { - ESolver->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A); - } - ESolver->solve(dataBase_.num_cells, dataBase_.d_he, d_source); + dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A, dataBase_.d_he, d_source); num_iteration++; } -void dfEEqn::postProcess(double *h_he, double *h_boundary_he) -{ -#ifdef STREAM_ALLOCATOR - // thermophysical fields - checkCudaErrors(cudaFreeAsync(d_dpdt, dataBase_.stream)); - // fiv weight fieldsFree - //checkCudaErrors(cudaFreeAsync(d_phi_special_weight, dataBase_.stream)); - // boundary coeffs - checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream)); - 
checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream)); - - checkCudaErrors(cudaFreeAsync(d_boundary_heGradient, dataBase_.stream)); - - checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_internal_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_boundary_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream)); -#endif - - // copy he to host - checkCudaErrors(cudaMemcpyAsync(h_he, dataBase_.d_he, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - checkCudaErrors(cudaMemcpyAsync(h_boundary_he, dataBase_.d_boundary_he, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - sync(); -} +void dfEEqn::postProcess(double *h_he, double *h_boundary_he) {} diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 286583bc7..4ebecfa6c 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -16,11 +16,15 @@ #include #include +#include "AmgXSolver.H" +#include + +//#define DEBUG_ +//#define DEBUG_CHECK_LDU + extern int myRank; -extern int mpi_init_flag; #define GPU_DEBUG_ - #ifdef GPU_DEBUG_ #define DEBUG_TRACE fprintf(stderr, "myRank[%d] %s %d\n", myRank, __FILE__, __LINE__); #else @@ -59,6 +63,11 @@ inline void checkVectorEqual(int count, const double* basevec, double* vec, doub } } +enum AMGXSetting { + u_setting, + p_setting +}; + enum location { cpu, gpu @@ -146,6 +155,10 @@ struct dfMatrixDataBase int *d_csr_row_index= nullptr; int *d_csr_col_index= nullptr; + // amgx solvers + AmgXSolver *u_setting_solver = nullptr; + AmgXSolver *p_setting_solver = nullptr; + // constant fields - internal double *d_sf = nullptr; double *d_mag_sf = nullptr; @@ -277,6 +290,8 @@ struct dfMatrixDataBase std::vector patch_size, int num_species, double rdelta_t); 
void setConstantIndexes(const int *owner, const int *neighbor, const int *procRows, const int *procCols, int globalOffset); + void setAmgxSolvers(const std::string &mode_string, const std::string &u_setting_path, const std::string &p_setting_path); + void solve(int num_iteration, AMGXSetting setting, double *d_A, double *d_x, double *d_b); void setCyclicInfo(std::vector &cyclicNeighbor); // when use cyclic boundary void createConstantFieldsInternal(); diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index f813ae664..64dfd99fb 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -146,6 +146,25 @@ void dfMatrixDataBase::setConstantValues(int num_cells, int num_total_cells, int csr_value_vec_bytes = num_Nz * 3 * sizeof(double); } +void dfMatrixDataBase::setAmgxSolvers(const std::string &mode_string, const std::string &u_setting_path, const std::string &p_setting_path) { + // amgx solvers + u_setting_solver = new AmgXSolver(mode_string, u_setting_path, localRank); + p_setting_solver = new AmgXSolver(mode_string, p_setting_path, localRank); +} + +void dfMatrixDataBase::solve(int num_iteration, AMGXSetting setting, double *d_A, double *d_x, double *d_b) { + AmgXSolver *solver = (setting == AMGXSetting::u_setting) ? 
u_setting_solver : p_setting_solver; + if (num_iteration == 0) // first interation + { + solver->setOperator(num_cells, num_total_cells, num_Nz, d_csr_row_index, d_csr_col_index, d_A); + } + else + { + solver->updateOperator(num_cells, num_Nz, d_A); + } + solver->solve(num_cells, d_x, d_b); +} + void dfMatrixDataBase::setCyclicInfo(std::vector &cyclicNeighbor) { this->cyclicNeighbor = cyclicNeighbor; diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index 09d5d6281..5a61980f9 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -2,8 +2,84 @@ #include #include #include "dfThermo.H" -// #define TIME_GPU -// #define DEBUG_CHECK_LDU +class dfThermo; + +// macros need to be in a certain order + +// sequence 0: STREAM_ALLOCATOR can be open or not +#ifndef STREAM_ALLOCATOR + #define STREAM_ALLOCATOR +#endif + +// sequence 0: USE_GRAPH can be open or not +#ifndef USE_GRAPH + #define USE_GRAPH +#endif + +// sequence 0: TIME_GPU can be open or not +#ifndef TIME_GPU + #define TIME_GPU +#endif + +// sequence 1: TIME_GPU and USE_GRAPH can not be open at the same time +#if (defined TIME_GPU) && (defined USE_GRAPH) + #undef USE_GRAPH +#endif + +// sequence 2: STREAM_ALLOCATOR must be open if USE_GRAPH is open +#if (defined USE_GRAPH) && (!defined STREAM_ALLOCATOR) + #define STREAM_ALLOCATOR +#endif + +extern int myRank; + +#define PRINT_PTR(x) { \ + fprintf(stderr, "rank[%d], %s %d, print ptr %s: %p\n", myRank, __FILE__, __LINE__, #x, x); \ +} + +extern __global__ void warmup(); + +#ifdef TIME_GPU + #define WARM_UP \ + warmup<<<10, 1024, 0, stream>>>(); + + #define TICK_INIT_EVENT \ + float time_elapsed_kernel=0;\ + cudaEvent_t start_kernel, stop_kernel;\ + checkCudaErrors(cudaEventCreate(&start_kernel));\ + checkCudaErrors(cudaEventCreate(&stop_kernel)); + + #define TICK_START_EVENT \ + checkCudaErrors(cudaEventRecord(start_kernel,stream)); + + #define TICK_END_EVENT(prefix) \ + checkCudaErrors(cudaEventRecord(stop_kernel,stream));\ + 
checkCudaErrors(cudaEventSynchronize(start_kernel));\ + checkCudaErrors(cudaEventSynchronize(stop_kernel));\ + checkCudaErrors(cudaEventElapsedTime(&time_elapsed_kernel,start_kernel,stop_kernel));\ + fprintf(stderr, "rank[%d], name: %s, time: %lf(ms)\n", myRank, #prefix, time_elapsed_kernel); + +/* + // the usage description: + // if you want to profile the first kernel, please use WARM_UP before TICK_INIT_EVENT. + // otherwise there is no need to use WARM_UP + WARM_UP; + // init event + TICK_INIT_EVENT; + // start event + TICK_START_EVENT; + // call your kernel, or kernels, or wrapper functions, e.g.: + my_kernel<<>>(num, input, output); + // end event with your specified name string, e.g.: + TICK_END_EVENT(my_kernel); +*/ + +#else + #define WARM_UP + #define TICK_INIT_EVENT + #define TICK_START_EVENT + #define TICK_END_EVENT(prefix) +#endif // tools void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output); @@ -253,3 +329,4 @@ void fvMtx_flux(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces void solve_explicit_scalar(cudaStream_t stream, int num_cells, const double *diag, const double *source, double *psi); + diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index 98146436c..6e520f7c6 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -5,32 +5,10 @@ #include #include "cuda_profiler_api.h" -#ifdef TIME_GPU - #define TICK_INIT_EVENT \ - float time_elapsed_kernel=0;\ - cudaEvent_t start_kernel, stop_kernel;\ - checkCudaErrors(cudaEventCreate(&start_kernel));\ - checkCudaErrors(cudaEventCreate(&stop_kernel)); - - #define TICK_START_EVENT \ - checkCudaErrors(cudaEventRecord(start_kernel,0)); - - #define TICK_END_EVENT(prefix) \ - checkCudaErrors(cudaEventRecord(stop_kernel,0));\ - checkCudaErrors(cudaEventSynchronize(start_kernel));\ - checkCudaErrors(cudaEventSynchronize(stop_kernel));\ - checkCudaErrors(cudaEventElapsedTime(&time_elapsed_kernel,start_kernel,stop_kernel));\ 
- printf("try %s 执行时间:%lf(ms)\n", #prefix, time_elapsed_kernel); -#else - #define TICK_INIT_EVENT - #define TICK_START_EVENT - #define TICK_END_EVENT(prefix) -#endif - -__global__ void warmup(int num_cells) +__global__ void warmup() { int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) + if (index >= 10240) return; } @@ -363,10 +341,13 @@ void correct_boundary_conditions_processor_scalar(cudaStream_t stream, ncclComm_ correct_internal_boundary_field_scalar<<>>(num, offset, vf, boundary_cell_face, vf_boundary); + TICK_INIT_EVENT; + TICK_START_EVENT; checkNcclErrors(ncclGroupStart()); checkNcclErrors(ncclSend(vf_boundary + internal_start_index, num, ncclDouble, peer, comm, stream)); checkNcclErrors(ncclRecv(vf_boundary + neighbor_start_index, num, ncclDouble, peer, comm, stream)); checkNcclErrors(ncclGroupEnd()); + TICK_END_EVENT(nccl scalar); //checkCudaErrors(cudaStreamSynchronize(stream)); } @@ -382,12 +363,15 @@ void correct_boundary_conditions_processor_vector(cudaStream_t stream, ncclComm_ correct_internal_boundary_field_vector<<>>(num, offset, num_boundary_surfaces, num_cells, vf, boundary_cell_face, vf_boundary); + TICK_INIT_EVENT; + TICK_START_EVENT; checkNcclErrors(ncclGroupStart()); for (int i = 0; i < 3; i++) { checkNcclErrors(ncclSend(vf_boundary + num_boundary_surfaces * i + internal_start_index, num, ncclDouble, peer, comm, stream)); checkNcclErrors(ncclRecv(vf_boundary + num_boundary_surfaces * i + neighbor_start_index, num, ncclDouble, peer, comm, stream)); } checkNcclErrors(ncclGroupEnd()); + TICK_END_EVENT(nccl vector); //checkCudaErrors(cudaStreamSynchronize(stream)); } @@ -1282,13 +1266,17 @@ void fvc_grad_vector_correctBC_processor(cudaStream_t stream, ncclComm_t comm, correct_internal_boundary_field_tensor<<>>(num, offset, num_boundary_surfaces, num_cells, internal_grad, face2Cells, boundary_grad); + + TICK_INIT_EVENT; + TICK_START_EVENT; checkNcclErrors(ncclGroupStart()); for (int i = 0; i < 9; i++) { 
checkNcclErrors(ncclSend(boundary_grad + num_boundary_surfaces * i + internal_start_index, num, ncclDouble, peer, comm, stream)); checkNcclErrors(ncclRecv(boundary_grad + num_boundary_surfaces * i + neighbor_start_index, num, ncclDouble, peer, comm, stream)); } checkNcclErrors(ncclGroupEnd()); - checkCudaErrors(cudaStreamSynchronize(stream)); + TICK_END_EVENT(nccl tensor); + //checkCudaErrors(cudaStreamSynchronize(stream)); } __global__ void fvc_grad_cell_scalar_correctBC_zeroGradient(int num_cells, int num_boundary_surfaces, @@ -2094,13 +2082,10 @@ void field_add_scalar(cudaStream_t stream, int num, const double *input1, const double *input2, double *output, int num_boundary_surfaces, const double *boundary_input1, const double *boundary_input2, double *boundary_output) { - TICK_INIT_EVENT; size_t threads_per_block = 256; size_t blocks_per_grid = (std::max(num, num_boundary_surfaces) + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; field_add_scalar_kernel<<>>(num, num_boundary_surfaces, input1, input2, output, boundary_input1, boundary_input2, boundary_output); - TICK_END_EVENT(field_add_scalar_kernel); } void field_add_vector(cudaStream_t stream, @@ -2109,7 +2094,6 @@ void field_add_vector(cudaStream_t stream, { size_t threads_per_block = 256; size_t blocks_per_grid = (std::max(num_cells, num_boundary_surfaces) + threads_per_block - 1) / threads_per_block; - field_add_vector_kernel<<>>(num_cells, num_boundary_surfaces, input1, input2, output, boundary_input1, boundary_input2, boundary_output, sign); } @@ -2127,13 +2111,10 @@ void field_multiply_scalar(cudaStream_t stream, int num_cells, const double *input1, const double *input2, double *output, int num_boundary_surfaces, const double *boundary_input1, const double *boundary_input2, double *boundary_output) { - TICK_INIT_EVENT; size_t threads_per_block = 256; size_t blocks_per_grid = (std::max(num_cells, num_boundary_surfaces) + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; 
field_multiply_scalar_kernel<<>>(num_cells, num_boundary_surfaces, input1, input2, output, boundary_input1, boundary_input2, boundary_output); - TICK_END_EVENT(field_multiply_scalar_kernel); } void vector_half_mag_square(cudaStream_t stream, int num_cells, const double *vec_input, double *scalar_output, @@ -2177,13 +2158,10 @@ void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volu void fvc_to_source_scalar(cudaStream_t stream, int num_cells, const double *volume, const double *fvc_output, double *source, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 256; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_to_source_scalar_kernel<<>>(num_cells, volume, fvc_output, source, sign); - TICK_END_EVENT(fvc_to_source_scalar_kernel); } void ldu_to_csr_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surface, int num_Nz, @@ -2199,6 +2177,7 @@ void ldu_to_csr_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int int bou_offset = 0, ext_offset = 0; size_t threads_per_block, blocks_per_grid; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; if (patch_type[i] == boundaryConditions::processor || patch_type[i] == boundaryConditions::processorCyclic) { threads_per_block = 64; @@ -2222,6 +2201,7 @@ void ldu_to_csr_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int // add coeff to source and diagnal bou_offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::processor @@ -2250,21 +2230,16 @@ void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int num_bo const int* boundary_cell_face, const int *ldu_to_csr_index, const int *diag_to_csr_index, const double *ldu, const double *internal_coeffs, const double *boundary_coeffs, 
double *source, double *A) { - TICK_INIT_EVENT; // construct diag int nNz = num_cells + 2 * num_surfaces; size_t threads_per_block = 1024; size_t blocks_per_grid = (nNz + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; ldu_to_csr_kernel<<>>(nNz, ldu_to_csr_index, ldu, A); - TICK_END_EVENT(ldu_to_csr_kernel); // add coeff to source and diagnal blocks_per_grid = (num_boundary_surface + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; addBoundaryDiagSrc<<>>(num_cells, num_surfaces, num_boundary_surface, boundary_cell_face, internal_coeffs, boundary_coeffs, diag_to_csr_index, A, source); - TICK_END_EVENT(addBoundaryDiagSrc); } void update_boundary_coeffs_scalar(cudaStream_t stream, @@ -2279,6 +2254,7 @@ void update_boundary_coeffs_scalar(cudaStream_t stream, int offset = 0; int gradient_offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2328,6 +2304,7 @@ void correct_boundary_conditions_scalar(cudaStream_t stream, ncclComm_t comm, int offset = 0; int gradient_offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::zeroGradient @@ -2374,6 +2351,7 @@ void correct_boundary_conditions_vector(cudaStream_t stream, ncclComm_t comm, int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::zeroGradient @@ -2414,6 +2392,7 @@ void update_boundary_coeffs_vector(cudaStream_t stream, int num_boundary_surface int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; 
blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2453,50 +2432,30 @@ void fvm_ddt_vol_scalar_vol_scalar(cudaStream_t stream, int num_cells, double rD const double *rho, const double *rho_old, const double *vf, const double *volume, double *diag, double *source, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvm_ddt_vol_scalar_vol_scalar_kernel<<>>(num_cells, rDeltaT, rho, rho_old, vf, volume, diag, source, sign); - TICK_END_EVENT(fvm_ddt_vol_scalar_vol_scalar_kernel); } void fvm_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT, const double *vf_old, const double *volume, double *diag, double *source, double sign) { -#ifdef TIME_GPU - printf("#############kernel profile#############\n"); -#endif - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; -#ifdef TIME_GPU - printf("warm up ...\n"); - warmup<<>>(num_cells); -#endif - TICK_START_EVENT; fvm_ddt_scalar_kernel<<>>(num_cells, rDeltaT, vf_old, volume, diag, source, sign); - TICK_END_EVENT(fvm_ddt_scalar_kernel); } void fvm_ddt_vector(cudaStream_t stream, int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *volume, double *diag, double *source, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 64; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; -#ifdef TIME_GPU - printf("warm up ...\n"); - warmup<<>>(num_cells); -#endif - TICK_START_EVENT; fvm_ddt_vector_kernel<<>>(num_cells, rDeltaT, rho, rho_old, vf, volume, diag, source, sign); - TICK_END_EVENT(fvm_ddt_vector_kernel); } void fvm_div_scalar(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, @@ -2506,16 +2465,14 @@ void fvm_div_scalar(cudaStream_t stream, 
int num_surfaces, const int *lowerAddr, const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvm_div_scalar_internal<<>>(num_surfaces, lowerAddr, upperAddr, phi, weight, lower, upper, diag, sign); - TICK_END_EVENT(fvm_div_scalar_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2536,20 +2493,14 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surf const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 256; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; -#ifdef TIME_GPU - printf("warm up ...\n"); - warmup<<>>(num_surfaces); -#endif - TICK_START_EVENT; fvm_div_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, phi, weight, lower, upper, diag, sign); - TICK_END_EVENT(fvm_div_vector_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; fvm_div_vector_boundary<<>>(num_boundary_surfaces, patch_size[i], offset, @@ -2576,6 +2527,7 @@ void fvm_laplacian_scalar(cudaStream_t stream, int num_surfaces, int num_boundar weight, mag_sf, delta_coeffs, gamma, lower, upper, diag, sign); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + 
threads_per_block - 1) / threads_per_block; fvm_laplacian_scalar_boundary<<>>(patch_size[i], offset, @@ -2596,15 +2548,13 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundar const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvm_laplacian_internal<<>>(num_surfaces, lowerAddr, upperAddr, weight, mag_sf, delta_coeffs, gamma, lower, upper, diag, sign); - TICK_END_EVENT(fvm_laplacian_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2633,6 +2583,7 @@ void fvm_laplacian_surface_scalar_vol_scalar(cudaStream_t stream, int num_surfac int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; fvm_laplacian_surface_scalar_boundary<<>>(patch_size[i], offset, @@ -2651,13 +2602,10 @@ void fvc_ddt_vol_scalar_vol_scalar(cudaStream_t stream, int num_cells, double rD const double *rho, const double *rho_old, const double *vf, const double *vf_old, const double *volume, double *output, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_ddt_vol_scalar_vol_scalar_kernel<<>>(num_cells, rDeltaT, rho, rho_old, vf, vf_old, volume, output, sign); - TICK_END_EVENT(fvc_ddt_vol_scalar_vol_scalar_kernel); } void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, @@ -2670,17 +2618,15 @@ void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, const int 
*cyclicNeighbor, const int *patchSizeOffset, const double *boundary_deltaCoeffs, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 32; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_grad_vector_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output); - TICK_END_EVENT(fvc_grad_vector_internal); int offset = 0; // finish conctruct grad field except dividing cell volume for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2689,10 +2635,8 @@ void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, || patch_type[i] == boundaryConditions::calculated || patch_type[i] == boundaryConditions::cyclic) { // TODO: just vector version now - TICK_START_EVENT; fvc_grad_vector_boundary_zeroGradient<<>>(num_boundary_surfaces, num_cells, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output); - TICK_END_EVENT(fvc_grad_vector_boundary_zeroGradient); } else if (patch_type[i] == boundaryConditions::processor || patch_type[i] == boundaryConditions::processorCyclic) { fvc_grad_vector_boundary_processor<<>>(num_boundary_surfaces, num_cells, @@ -2709,23 +2653,20 @@ void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, // divide cell volume threads_per_block = 512; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; divide_cell_volume_tsr<<>>(num_cells, volume, output); - TICK_END_EVENT(divide_cell_volume_tsr); // correct boundary conditions offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now if (patch_type[i] == boundaryConditions::zeroGradient) { // TODO: just vector version now - 
TICK_START_EVENT; fvc_grad_vector_correctBC_zeroGradient<<>>(num_cells, num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, output, boundary_vf, boundary_Sf, boundary_mag_Sf, boundary_output); - TICK_END_EVENT(fvc_grad_vector_correctBC_zeroGradient); } else if (patch_type[i] == boundaryConditions::fixedValue) { // TODO: implement fixedValue version fvc_grad_vector_correctBC_fixedValue<<>>( @@ -2757,12 +2698,9 @@ void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, void scale_dev2T_tensor(cudaStream_t stream, int num_cells, const double *vf1, double *vf2, int num_boundary_surfaces, const double *boundary_vf1, double *boundary_vf2) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; scale_dev2t_tensor_kernel<<>>(num_cells, vf1, vf2); - TICK_END_EVENT(scale_dev2t_tensor_kernel); blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; scale_dev2t_tensor_kernel<<>>(num_boundary_surfaces, boundary_vf1, boundary_vf2); @@ -2773,15 +2711,13 @@ void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces int num_patches, const int *patch_size, const int *patch_type, const double *boundary_ssf, const double *volume, double *output, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_div_surface_scalar_internal<<>>(num_surfaces, lowerAddr, upperAddr, ssf, output, sign); - TICK_END_EVENT(fvc_div_surface_scalar_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; fvc_div_surface_scalar_boundary<<>>(patch_size[i], offset, boundary_cell_face, @@ -2802,16 +2738,14 @@ void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int 
num_surfaces, i const double *boundary_weight, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_div_cell_vector_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output, sign); - TICK_END_EVENT(fvc_div_cell_vector_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2837,16 +2771,14 @@ void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, i const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_div_cell_tensor_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output, sign); - TICK_END_EVENT(fvc_div_cell_tensor_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -2855,11 +2787,9 @@ void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, i || patch_type[i] == boundaryConditions::calculated || patch_type[i] == boundaryConditions::cyclic) { // TODO: just vector version now - TICK_START_EVENT; fvc_div_cell_tensor_boundary_zeroGradient<<>>(num_cells, num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output, sign); - TICK_END_EVENT(fvc_div_cell_tensor_boundary_zeroGradient); } else if (patch_type[i] == boundaryConditions::processor 
|| patch_type[i] == boundaryConditions::processorCyclic) { fvc_div_cell_tensor_boundary_processor<<>>(num_cells, num_boundary_surfaces, @@ -2881,16 +2811,14 @@ void fvc_div_surface_scalar_vol_scalar(cudaStream_t stream, int num_surfaces, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_ssf, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_div_surface_scalar_vol_scalar_internal<<>>(num_surfaces, lowerAddr, upperAddr, weight, ssf, vf, output, sign); - TICK_END_EVENT(fvc_div_surface_scalar_vol_scalar_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; fvc_div_surface_scalar_vol_scalar_boundary<<>>( @@ -2908,16 +2836,14 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_patches, const int *patch_size, const int *patch_type, const double *boundary_weight, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_grad_scalar_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output, sign); - TICK_END_EVENT(fvc_grad_scalar_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just non-coupled patch type now @@ -2925,10 +2851,8 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, || patch_type[i] == boundaryConditions::fixedValue || patch_type[i] == boundaryConditions::calculated || 
patch_type[i] == boundaryConditions::cyclic) { - TICK_START_EVENT; fvc_grad_scalar_boundary_zeroGradient<<>>(num_boundary_surfaces, num_cells, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output, sign); - TICK_END_EVENT(fvc_grad_scalar_boundary_zeroGradient); } else if (patch_type[i] == boundaryConditions::processor || patch_type[i] == boundaryConditions::processorCyclic) { fvc_grad_scalar_boundary_processor<<>>(num_boundary_surfaces, num_cells, patch_size[i], offset, boundary_cell_face, @@ -2950,16 +2874,14 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, bool dividVol, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_grad_scalar_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output, sign); - TICK_END_EVENT(fvc_grad_scalar_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just non-coupled patch type now @@ -2967,10 +2889,8 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, || patch_type[i] == boundaryConditions::fixedValue || patch_type[i] == boundaryConditions::calculated || patch_type[i] == boundaryConditions::cyclic) { - TICK_START_EVENT; fvc_grad_scalar_boundary_zeroGradient<<>>(num_boundary_surfaces, num_cells, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output, sign); - TICK_END_EVENT(fvc_grad_scalar_boundary_zeroGradient); } else if (patch_type[i] == boundaryConditions::processor || patch_type[i] == boundaryConditions::processorCyclic) { fvc_grad_scalar_boundary_processor<<>>(num_boundary_surfaces, num_cells, patch_size[i], 
offset, boundary_cell_face, @@ -3011,6 +2931,7 @@ void fvc_grad_cell_scalar_withBC(cudaStream_t stream, ncclComm_t comm, const int // correct boundary conditions int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -3064,6 +2985,7 @@ void fvc_laplacian_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -3091,26 +3013,22 @@ void fvc_flux(cudaStream_t stream, int num_cells, int num_surfaces, int num_boun const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, double *boundary_output, double sign) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; fvc_flux_internal_kernel<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output, sign); - TICK_END_EVENT(fvc_flux_internal_kernel); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: maybe do not need loop boundarys if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue || patch_type[i] == boundaryConditions::gradientEnergy) { - TICK_START_EVENT; fvc_flux_boundary_kernel<<>>(num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, boundary_output, sign); - TICK_END_EVENT(fvc_flux_boundary_kernel); } else { // xxx fprintf(stderr, "%s %d, boundaryConditions other than zeroGradient are not support yet!\n", __FILE__, 
__LINE__); @@ -3132,6 +3050,7 @@ void fvc_interpolate(cudaStream_t stream, int num_cells, int num_surfaces, int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: maybe do not need loop boundarys @@ -3169,19 +3088,14 @@ void fvMtx_A(cudaStream_t stream, int num_cells, int num_surfaces, int num_bound double *A_pEqn) { checkCudaErrors(cudaMemcpyAsync(A_pEqn, diag, num_cells * sizeof(double), cudaMemcpyDeviceToDevice, stream)); - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; addAveInternaltoDiag<<>>(num_cells, num_boundary_surfaces, boundary_cell_face, internal_coeffs, A_pEqn); - TICK_END_EVENT(addAveInternaltoDiag); blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; divide_cell_volume_scalar<<>>(num_cells, volume, A_pEqn); - TICK_END_EVENT(divide_cell_volume_scalar); } void fvMtx_H(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, @@ -3191,42 +3105,34 @@ void fvMtx_H(cudaStream_t stream, int num_cells, int num_surfaces, int num_bound const double *lower, const double *upper, const double *source, const double *psi, double *H_pEqn, double *H_pEqn_perm) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; checkCudaErrors(cudaMemcpyAsync(H_pEqn, source, num_cells * 3 * sizeof(double), cudaMemcpyDeviceToDevice, stream)); addBoundaryDiag<<>>(num_cells, num_boundary_surfaces, boundary_cell_face, internal_coffs, psi, H_pEqn); - TICK_END_EVENT(addBoundaryDiag); blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; lduMatrix_H<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, lower, 
upper, psi, H_pEqn); - TICK_END_EVENT(lduMatrix_H); int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just non-coupled patch type now if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { - TICK_START_EVENT; addBoundarySrc_unCoupled<<>>(num_cells, patch_size[i], offset, num_boundary_surfaces, boundary_cell_face, boundary_coeffs, H_pEqn); - TICK_END_EVENT(addBoundarySrc_unCoupled); } else { fprintf(stderr, "%s %d, boundaryConditions other than zeroGradient are not support yet!\n", __FILE__, __LINE__); } offset += patch_size[i]; } blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; divideVol_permute_vec<<>>(num_cells, volume, H_pEqn, H_pEqn_perm); - TICK_END_EVENT(divideVol_permute_vec); } void fvMtx_flux(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, @@ -3242,6 +3148,7 @@ void fvMtx_flux(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::processor @@ -3264,11 +3171,8 @@ void fvMtx_flux(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces void solve_explicit_scalar(cudaStream_t stream, int num_cells, const double *diag, const double *source, double *psi) { - TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; solve_explicit_scalar_kernel<<>>(num_cells, diag, source, psi); - TICK_END_EVENT(solve_explicit_scalar_kernel); } diff --git a/src_gpu/dfRhoEqn.H b/src_gpu/dfRhoEqn.H index fe22f2aa2..c38347977 100644 --- a/src_gpu/dfRhoEqn.H +++ 
b/src_gpu/dfRhoEqn.H @@ -9,10 +9,13 @@ private: dfMatrixDataBase &dataBase_; // cuda resource + cudaStream_t stream; +#ifdef USE_GRAPH // one graph for one eqn before using self-developed solver cudaGraph_t graph; cudaGraphExec_t graph_instance; bool graph_created=false; +#endif // constant fields - boundary std::vector patch_type; @@ -32,6 +35,7 @@ public: // member function // initialization + void setConstantValues(); void setConstantFields(const std::vector patch_type); void initNonConstantFields(const double *rho, const double *phi, const double *boundary_rho, const double *boudnary_phi); diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu/dfRhoEqn.cu index 8a1d2e36e..57d32c9a4 100644 --- a/src_gpu/dfRhoEqn.cu +++ b/src_gpu/dfRhoEqn.cu @@ -9,6 +9,10 @@ void dfRhoEqn::createNonConstantLduAndCsrFields() #endif } +void dfRhoEqn::setConstantValues() { + this->stream = dataBase_.stream; +} + void dfRhoEqn::setConstantFields(const std::vector patch_type) { this->patch_type = patch_type; } @@ -22,10 +26,12 @@ void dfRhoEqn::initNonConstantFields(const double *rho, const double *phi, } void dfRhoEqn::cleanCudaResources() { +#ifdef USE_GRAPH if (graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance)); checkCudaErrors(cudaGraphDestroy(graph)); } +#endif } void dfRhoEqn::preProcess() @@ -34,18 +40,13 @@ void dfRhoEqn::preProcess() void dfRhoEqn::process() { - // 使用event计算时间 - float time_elapsed=0; - cudaEvent_t start,stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - checkCudaErrors(cudaEventRecord(start,0)); - -// #ifndef TIME_GPU -// if(!graph_created) { -// DEBUG_TRACE; -// checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); -// #endif + TICK_INIT_EVENT; + TICK_START_EVENT; +#ifdef USE_GRAPH + if(!graph_created) { + DEBUG_TRACE; + checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); +#endif #ifdef STREAM_ALLOCATOR 
checkCudaErrors(cudaMallocAsync((void**)&d_source, dataBase_.cell_value_bytes, dataBase_.stream)); @@ -69,21 +70,25 @@ void dfRhoEqn::process() dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_face_cell, dataBase_.d_rho, dataBase_.d_boundary_rho, dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_weight); - -// #ifndef TIME_GPU -// checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); -// checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0)); -// graph_created = true; -// } -// checkCudaErrors(cudaGraphLaunch(graph_instance, dataBase_.stream)); -// #endif - - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - if(!mpi_init_flag || myRank == 0) - fprintf(stderr, "rhoEqn process time:%f(ms)\n\n",time_elapsed); +#ifdef USE_GRAPH + checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); + checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0)); + graph_created = true; + } + checkCudaErrors(cudaGraphLaunch(graph_instance, dataBase_.stream)); +#endif + TICK_END_EVENT(rhoEqn process); + + TICK_START_EVENT; +#ifdef STREAM_ALLOCATOR + checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_diag, dataBase_.stream)); +#endif + TICK_END_EVENT(rhoEqn post process free); + TICK_START_EVENT; + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_rho, dataBase_.d_rho, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + TICK_END_EVENT(rhoEqn post process copy back); + sync(); } void dfRhoEqn::sync() @@ -96,16 +101,7 @@ void dfRhoEqn::solve() solve_explicit_scalar(dataBase_.stream, dataBase_.num_cells, d_diag, d_source, dataBase_.d_rho); } -void dfRhoEqn::postProcess(double 
*h_rho) -{ -#ifdef STREAM_ALLOCATOR - checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_diag, dataBase_.stream)); -#endif - checkCudaErrors(cudaMemcpyAsync(h_rho, dataBase_.d_rho, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - sync(); - // TODO: correct boundary conditions -} +void dfRhoEqn::postProcess(double *h_rho) {} void dfRhoEqn::compareResult(const double *diag, const double *source, bool printFlag) { diff --git a/src_gpu/dfThermo.H b/src_gpu/dfThermo.H index b833f937a..e1f9c0f58 100644 --- a/src_gpu/dfThermo.H +++ b/src_gpu/dfThermo.H @@ -1,6 +1,7 @@ #pragma once #include "dfMatrixDataBase.H" +#include "dfMatrixOpBase.H" void init_const_coeff_ptr(std::vector>& nasa_coeffs, std::vector>& viscosity_coeffs, std::vector>& thermal_conductivity_coeffs, std::vector>& binary_diffusion_coeffs, diff --git a/src_gpu/dfThermo.cu b/src_gpu/dfThermo.cu index 0cd3aa992..9e3aba78d 100644 --- a/src_gpu/dfThermo.cu +++ b/src_gpu/dfThermo.cu @@ -6,28 +6,6 @@ #include #include "device_launch_parameters.h" -#ifdef TIME_GPU - #define TICK_INIT_EVENT \ - float time_elapsed_kernel=0;\ - cudaEvent_t start_kernel, stop_kernel;\ - checkCudaErrors(cudaEventCreate(&start_kernel));\ - checkCudaErrors(cudaEventCreate(&stop_kernel)); - - #define TICK_START_EVENT \ - checkCudaErrors(cudaEventRecord(start_kernel,0)); - - #define TICK_END_EVENT(prefix) \ - checkCudaErrors(cudaEventRecord(stop_kernel,0));\ - checkCudaErrors(cudaEventSynchronize(start_kernel));\ - checkCudaErrors(cudaEventSynchronize(stop_kernel));\ - checkCudaErrors(cudaEventElapsedTime(&time_elapsed_kernel,start_kernel,stop_kernel));\ - printf("try %s 执行时间:%lf(ms)\n", #prefix, time_elapsed_kernel); -#else - #define TICK_INIT_EVENT - #define TICK_START_EVENT - #define TICK_END_EVENT(prefix) -#endif - #define GAS_CANSTANT 8314.46261815324 #define SQRT8 2.8284271247461903 #define NUM_SPECIES 7 @@ -45,8 +23,8 @@ void init_const_coeff_ptr(std::vector>& 
nasa_coeffs, std::ve std::vector>& thermal_conductivity_coeffs, std::vector>& binary_diffusion_coeffs, std::vector& molecular_weights) { - double *d_tmp; - checkCudaErrors(cudaMalloc((void**)&d_tmp, sizeof(double) * NUM_SPECIES * 15)); + //double *d_tmp; + //checkCudaErrors(cudaMalloc((void**)&d_tmp, sizeof(double) * NUM_SPECIES * 15)); double nasa_coeffs_tmp[NUM_SPECIES*15]; double viscosity_coeffs_tmp[NUM_SPECIES*5]; double thermal_conductivity_coeffs_tmp[NUM_SPECIES*5]; @@ -366,6 +344,7 @@ void dfThermo::setConstantValue(std::string mechanism_file, int num_cells, int n initCoeffsfromBinaryFile(fp); stream = dataBase_.stream; +#ifndef STREAM_ALLOCATOR checkCudaErrors(cudaMalloc((void**)&d_mole_fraction, sizeof(double) * num_species * num_cells)); checkCudaErrors(cudaMalloc((void**)&d_boundary_mole_fraction, sizeof(double) * num_species * dataBase_.num_boundary_surfaces)); checkCudaErrors(cudaMalloc((void**)&d_mean_mole_weight, sizeof(double) * num_cells)); @@ -377,10 +356,10 @@ void dfThermo::setConstantValue(std::string mechanism_file, int num_cells, int n checkCudaErrors(cudaMalloc((void**)&d_boundary_species_viscosities, sizeof(double) * num_species * dataBase_.num_boundary_surfaces)); checkCudaErrors(cudaMalloc((void**)&d_species_thermal_conductivities, sizeof(double) * num_species * num_cells)); checkCudaErrors(cudaMalloc((void**)&d_boundary_species_thermal_conductivities, sizeof(double) * num_species * dataBase_.num_boundary_surfaces)); +#endif checkCudaErrors(cudaMalloc((void**)&d_psip0, sizeof(double) * num_cells)); checkCudaErrors(cudaMalloc((void**)&d_boundary_psip0, sizeof(double) * dataBase_.num_boundary_surfaces)); - std::cout << "dfThermo initialized" << std::endl; } @@ -454,55 +433,40 @@ void dfThermo::calculateTPolyGPU(int threads_per_block, int num_thread, int num_ void dfThermo::calculatePsiGPU(int threads_per_block, int num_thread, const double *T, const double *mean_mole_weight, double *d_psi, int offset) { - TICK_INIT_EVENT; size_t 
blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; calculate_psi_kernel<<>>(num_thread, offset, T, mean_mole_weight, d_psi); - TICK_END_EVENT(calculate_psi_kernel); } void dfThermo::calculateRhoGPU(int threads_per_block, int num_thread, const double *p, const double *psi, double *rho, int offset) { - TICK_INIT_EVENT; size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; calculate_rho_kernel<<>>(num_cells, offset, p, psi, rho); - TICK_END_EVENT(calculate_rho_kernel); } void dfThermo::calculateViscosityGPU(int threads_per_block, int num_thread, int num_total, const double *T, const double *mole_fraction, const double *T_poly, double *species_viscosities, double *viscosity, int offset) { - TICK_INIT_EVENT; size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; calculate_viscosity_kernel<<>>(num_thread, num_total, num_species, offset, T_poly, T, mole_fraction, species_viscosities, viscosity); - TICK_END_EVENT(calculate_viscosity_kernel); } void dfThermo::calculateThermoConductivityGPU(int threads_per_block, int num_thread, int num_total, const double *T, const double *T_poly, const double *d_y, const double *mole_fraction, double *species_thermal_conductivities, double *thermal_conductivity, int offset) { - TICK_INIT_EVENT; size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; calculate_thermoConductivity_kernel<<>>(num_thread, num_total, num_species, offset, d_nasa_coeffs, d_y, T_poly, T, mole_fraction, species_thermal_conductivities, thermal_conductivity); - TICK_END_EVENT(calculate_thermoConductivity_kernel); } void dfThermo::calculateTemperatureGPU(int threads_per_block, int num_thread, int num_total, const double *T_init, const double *target_h, double *T, const double *d_mass_fraction, int offset, double atol, double rtol, int max_iter) { - TICK_INIT_EVENT; size_t 
blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; - TICK_START_EVENT; calculate_temperature_kernel<<>>(num_thread, num_total, num_species, offset, T_init, target_h, d_mass_fraction, T, atol, rtol, max_iter); - TICK_END_EVENT(calculate_temperature_kernel); } void dfThermo::calculateEnthalpyGPU(int threads_per_block, int num_thread, const double *T, double *enthalpy, const double *d_mass_fraction, int offset) @@ -530,6 +494,20 @@ void dfThermo::updateEnergy() void dfThermo::correctThermo() { +#ifdef STREAM_ALLOCATOR + checkCudaErrors(cudaMallocAsync((void**)&d_mole_fraction, sizeof(double) * num_species * num_cells, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_mole_fraction, sizeof(double) * num_species * dataBase_.num_boundary_surfaces, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_mean_mole_weight, sizeof(double) * num_cells, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_mean_mole_weight, sizeof(double) * dataBase_.num_boundary_surfaces, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_T_poly, sizeof(double) * 5 * num_cells, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_T_poly, sizeof(double) * 5 * dataBase_.num_boundary_surfaces, stream)); + + checkCudaErrors(cudaMallocAsync((void**)&d_species_viscosities, sizeof(double) * num_species * num_cells, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_species_viscosities, sizeof(double) * num_species * dataBase_.num_boundary_surfaces, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_species_thermal_conductivities, sizeof(double) * num_species * num_cells, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_species_thermal_conductivities, sizeof(double) * num_species * dataBase_.num_boundary_surfaces, stream)); +#endif + setMassFraction(dataBase_.d_y, dataBase_.d_boundary_y); // internal field int cell_thread = 512, boundary_thread = 32; @@ -544,6 +522,7 @@ void dfThermo::correctThermo() // 
boundary field int offset = 0; for (int i = 0; i < dataBase_.num_patches; i++) { + if (dataBase_.patch_size[i] == 0) continue; if (dataBase_.patch_type_T[i] == boundaryConditions::fixedValue) { calculateTPolyGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.num_boundary_surfaces, dataBase_.d_boundary_T, d_boundary_T_poly, offset); calculateEnthalpyGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.d_boundary_T, dataBase_.d_boundary_he, @@ -587,6 +566,19 @@ void dfThermo::correctThermo() } else { offset += dataBase_.patch_size[i]; } } +#ifdef STREAM_ALLOCATOR + checkCudaErrors(cudaFreeAsync(d_mole_fraction, stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_mole_fraction, stream)); + checkCudaErrors(cudaFreeAsync(d_mean_mole_weight, stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_mean_mole_weight, stream)); + checkCudaErrors(cudaFreeAsync(d_T_poly, stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_T_poly, stream)); + + checkCudaErrors(cudaFreeAsync(d_species_viscosities, stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_species_viscosities, stream)); + checkCudaErrors(cudaFreeAsync(d_species_thermal_conductivities, stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_species_thermal_conductivities, stream)); +#endif } void dfThermo::updateRho() @@ -620,7 +612,6 @@ void dfThermo::calculateEnergyGradient(int num_thread, int num_cells, int num_sp { size_t threads_per_block = 256; size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; - calculate_energy_gradient_kernel<<>>(num_thread, num_cells, num_species, num_boundary_surfaces, bou_offset, gradient_offset, face2Cells, T, p, y, boundary_p, boundary_y, boundary_delta_coeffs, boundary_thermo_gradient); } diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index 3525ef8ec..ef3c8133f 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -11,10 +11,14 @@ private: dfMatrixDataBase &dataBase_; // cuda resource + cudaStream_t stream; +#ifdef USE_GRAPH // one graph for one eqn 
before using self-developed solver cudaGraph_t graph_pre, graph_post; cudaGraphExec_t graph_instance_pre, graph_instance_post; - bool graph_created=false; + bool pre_graph_created=false; + bool post_graph_created=false; +#endif // constant values -- basic std::string mode_string; @@ -111,7 +115,7 @@ public: void preProcessForRhoEqn(const double *h_rho, const double *h_phi, const double *h_boundary_phi); void preProcess(const double *h_u, const double *h_boundary_u, const double *h_p, const double *h_boundary_p, const double *h_nu_eff, const double *h_boundary_nu_eff); void process(); - void postProcess(double *h_u); + void postProcess(); void A(double *Psi); void H(double *Psi); diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 26bdfb3d8..131c566d8 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -303,6 +303,7 @@ __global__ void ueqn_divide_cell_volume_vec(int num_cells, const double* volume, } void dfUEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path) { + this->stream = dataBase_.stream; this->mode_string = mode_string; this->setting_path = setting_path; UxSolver = new AmgXSolver(mode_string, setting_path, dataBase_.localRank); @@ -335,8 +336,8 @@ void dfUEqn::createNonConstantFieldsInternal() { checkCudaErrors(cudaMalloc((void**)&d_grad_u, dataBase_.cell_value_tsr_bytes)); checkCudaErrors(cudaMalloc((void**)&d_rho_nueff, dataBase_.cell_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_fvc_output, dataBase_.cell_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_u_host_order, dataBase_.cell_value_vec_bytes)); #endif + checkCudaErrors(cudaMalloc((void**)&d_u_host_order, dataBase_.cell_value_vec_bytes)); // computed on CPU, used on GPU, need memcpyh2d checkCudaErrors(cudaMallocHost((void**)&h_nu_eff , dataBase_.cell_value_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_A_pEqn , dataBase_.cell_value_bytes)); @@ -358,9 +359,9 @@ void dfUEqn::createNonConstantFieldsBoundary() { 
checkCudaErrors(cudaMalloc((void**)&d_value_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes)); +#endif // intermediate boundary checkCudaErrors(cudaMalloc((void**)&d_boundary_u_host_order, dataBase_.boundary_surface_value_vec_bytes)); -#endif // computed on CPU, used on GPU, need memcpyh2d checkCudaErrors(cudaMallocHost((void**)&h_boundary_nu_eff, dataBase_.boundary_surface_value_bytes)); @@ -384,10 +385,6 @@ void dfUEqn::createNonConstantLduAndCsrFields() { checkCudaErrors(cudaMalloc((void**)&d_A_pEqn, dataBase_.cell_value_bytes)); // TODO: delete redundant variables checkCudaErrors(cudaMalloc((void**)&d_H_pEqn, dataBase_.cell_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_H_pEqn_perm, dataBase_.cell_value_vec_bytes)); - - checkCudaErrors(cudaMalloc((void**)&d_u_host_order, dataBase_.cell_value_vec_bytes)); - // intermediate boundary fields - checkCudaErrors(cudaMalloc((void**)&d_boundary_u_host_order, dataBase_.boundary_surface_value_vec_bytes)); } void dfUEqn::initNonConstantFieldsInternal(const double *u, const double *boundary_u) @@ -407,12 +404,16 @@ void dfUEqn::initNonConstantFieldsBoundary() { } void dfUEqn::cleanCudaResources() { - if (graph_created) { +#ifdef USE_GRAPH + if (pre_graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance_pre)); - checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_pre)); + } + if (post_graph_created) { + checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_post)); } +#endif } void dfUEqn::preProcessForRhoEqn(const double *h_rho, const double *h_phi, const double *h_boundary_phi) { @@ -427,18 +428,13 @@ void dfUEqn::preProcess(const double *h_u, const double *h_boundary_u, const dou } 
void dfUEqn::process() { - //使用event计算时间 - float time_elapsed=0; - cudaEvent_t start,stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - checkCudaErrors(cudaEventRecord(start,0)); - -// #ifndef TIME_GPU -// if(!graph_created) { -// DEBUG_TRACE; -// checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); -// #endif + TICK_INIT_EVENT; + TICK_START_EVENT; +#ifdef USE_GRAPH + if(!pre_graph_created) { + DEBUG_TRACE; + checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); +#endif #ifdef STREAM_ALLOCATOR // thermophysical fields @@ -540,58 +536,70 @@ void dfUEqn::process() { d_ldu, d_extern, d_source, d_internal_coeffs, d_boundary_coeffs, dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), d_A, d_b); #endif -// #ifndef TIME_GPU -// checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_pre)); -// checkCudaErrors(cudaGraphInstantiate(&graph_instance_pre, graph_pre, NULL, NULL, 0)); -// } -// DEBUG_TRACE; -// checkCudaErrors(cudaGraphLaunch(graph_instance_pre, dataBase_.stream)); -// #endif - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - if(!mpi_init_flag || myRank == 0) - fprintf(stderr, "ueqn assembly time:%f(ms)\n",time_elapsed); - - checkCudaErrors(cudaEventRecord(start,0)); +#ifdef USE_GRAPH + checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_pre)); + checkCudaErrors(cudaGraphInstantiate(&graph_instance_pre, graph_pre, NULL, NULL, 0)); + pre_graph_created = true; + } + DEBUG_TRACE; + checkCudaErrors(cudaGraphLaunch(graph_instance_pre, dataBase_.stream)); +#endif + TICK_END_EVENT(UEqn assembly); + + TICK_START_EVENT; #ifndef DEBUG_CHECK_LDU solve(); #endif - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - 
checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - if(!mpi_init_flag || myRank == 0) - fprintf(stderr, "ueqn solve time:%f(ms)\n",time_elapsed); - - checkCudaErrors(cudaEventRecord(start,0)); -// #ifndef TIME_GPU -// if(!graph_created) { -// checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); -// #endif + TICK_END_EVENT(UEqn solve); +#ifdef USE_GRAPH + if(!post_graph_created) { + checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); +#endif + + TICK_START_EVENT; correct_boundary_conditions_vector(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_boundary_surfaces, dataBase_.num_cells, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_weight, dataBase_.d_boundary_face_cell, dataBase_.d_u, dataBase_.d_boundary_u, dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data()); vector_half_mag_square(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, dataBase_.d_k, dataBase_.num_boundary_surfaces, dataBase_.d_boundary_u, dataBase_.d_boundary_k); + TICK_END_EVENT(UEqn post process correctBC); -// #ifndef TIME_GPU -// checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_post)); -// checkCudaErrors(cudaGraphInstantiate(&graph_instance_post, graph_post, NULL, NULL, 0)); -// graph_created = true; -// } -// checkCudaErrors(cudaGraphLaunch(graph_instance_post, dataBase_.stream)); -// #endif + TICK_START_EVENT; +#ifdef STREAM_ALLOCATOR + // free + // thermophysical fields + checkCudaErrors(cudaFreeAsync(d_nu_eff, dataBase_.stream)); + // intermediate fields + checkCudaErrors(cudaFreeAsync(d_grad_u, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_rho_nueff, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_fvc_output, dataBase_.stream)); - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - 
checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - if(!mpi_init_flag || myRank == 0) - fprintf(stderr, "ueqn post process time:%f(ms)\n\n",time_elapsed); + // thermophysical fields + checkCudaErrors(cudaFreeAsync(d_boundary_nu_eff, dataBase_.stream)); + // intermediate fields + checkCudaErrors(cudaFreeAsync(d_boundary_grad_u, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_rho_nueff, dataBase_.stream)); + // boundary coeff fields + checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream)); + + checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream)); +#endif + TICK_END_EVENT(UEqn post process free); + +#ifdef USE_GRAPH + checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_post)); + checkCudaErrors(cudaGraphInstantiate(&graph_instance_post, graph_post, NULL, NULL, 0)); + post_graph_created = true; + } + checkCudaErrors(cudaGraphLaunch(graph_instance_post, dataBase_.stream)); +#endif + sync(); } void dfUEqn::sync() @@ -600,114 +608,19 @@ void dfUEqn::sync() } void dfUEqn::solve() { - sync(); - - // double h_A[dataBase_.num_Nz * 3]; - // double h_b[dataBase_.num_cells * 3]; - // checkCudaErrors(cudaMemcpyAsync(h_A, d_A, dataBase_.csr_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - // checkCudaErrors(cudaMemcpyAsync(h_b, d_b, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - // sync(); - - // char *input_file = "of_output.txt"; - // FILE *fp = fopen(input_file, "rb+"); - // if (fp == NULL) - // { - // fprintf(stderr, "Failed to open input file: %s!\n", input_file); - // } - // int readfile = 0; - // double *of_b = new double[3 * 
dataBase_.num_cells]; - // double *of_A = new double[3 * dataBase_.num_Nz]; - // readfile = fread(of_b, dataBase_.cell_value_vec_bytes, 1, fp); - // readfile = fread(of_A, dataBase_.csr_value_vec_bytes, 1, fp); - - // std::vector h_A_of_init_vec(3 * dataBase_.num_Nz); - // std::copy(of_A, of_A + dataBase_.num_Nz * 3, h_A_of_init_vec.begin()); - - // // std::vector h_A_of_vec_perm(3 * dataBase_.num_Nz, 0); - // // for (int i = 0; i < dataBase_.num_Nz; i++) - // // { - // // h_A_of_vec_perm[i] = h_A_of_init_vec[dataBase_.lduCSRIndex[i]]; - // // h_A_of_vec_perm[i + dataBase_.num_Nz] = h_A_of_init_vec[dataBase_.lduCSRIndex[i] + dataBase_.num_Nz]; - // // h_A_of_vec_perm[i + 2 * dataBase_.num_Nz] = h_A_of_init_vec[dataBase_.lduCSRIndex[i] + 2 * dataBase_.num_Nz]; - // // } - // std::vector h_A_of_vec_1mtx(dataBase_.num_Nz, 0); - // for (int i = 0; i < dataBase_.num_Nz; i++) { - // h_A_of_vec_1mtx[i] = h_A_of_init_vec[dataBase_.lduCSRIndex[i]]; - // } - - - // // b - // std::vector h_b_of_init_vec(3 * dataBase_.num_cells); - // std::copy(of_b, of_b + 3 * dataBase_.num_cells, h_b_of_init_vec.begin()); - // std::vector h_b_of_vec; - // for (int i = 0; i < 3 * dataBase_.num_cells; i += 3) - // { - // h_b_of_vec.push_back(h_b_of_init_vec[i]); - // } - // // fill RHS_y - // for (int i = 1; i < 3 * dataBase_.num_cells; i += 3) - // { - // h_b_of_vec.push_back(h_b_of_init_vec[i]); - // } - // // fill RHS_z - // for (int i = 2; i < 3 * dataBase_.num_cells; i += 3) - // { - // h_b_of_vec.push_back(h_b_of_init_vec[i]); - // } - - // fprintf(stderr, "check of h_A_csr\n"); - // checkVectorEqual(dataBase_.num_Nz, h_A_of_vec_1mtx.data(), h_A, 1e-14, false); - // fprintf(stderr, "check of h_b\n"); - // checkVectorEqual(3 * dataBase_.num_cells, h_b_of_vec.data(), h_b, 1e-14, false); - - - if (num_iteration == 0) // first interation - { - printf("Initializing AmgX Linear Solver\n"); - UxSolver->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, 
dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A); - UySolver->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A + dataBase_.num_Nz); - UzSolver->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A + 2 * dataBase_.num_Nz); - } - else - { - UxSolver->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A); - UySolver->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A + dataBase_.num_Nz); - UzSolver->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A + 2 * dataBase_.num_Nz); - } - UxSolver->solve(dataBase_.num_cells, dataBase_.d_u, d_b); - UySolver->solve(dataBase_.num_cells, dataBase_.d_u + dataBase_.num_cells, d_b + dataBase_.num_cells); - UzSolver->solve(dataBase_.num_cells, dataBase_.d_u + 2 * dataBase_.num_cells, d_b + 2 * dataBase_.num_cells); + dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A, dataBase_.d_u, d_b); + dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A + dataBase_.num_Nz, dataBase_.d_u + dataBase_.num_cells, d_b + dataBase_.num_cells); + dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A + 2 * dataBase_.num_Nz, dataBase_.d_u + 2 * dataBase_.num_cells, d_b + 2 * dataBase_.num_cells); num_iteration++; } -void dfUEqn::postProcess(double *h_u) { +void dfUEqn::postProcess() { + // postProcess of dfUEqn can not be moved to the end of dfUEqn::process(), + // because dataBase_.d_u is modified in dfpEqn and we only need the result of the last change + // copy u and boundary_u to host permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, d_u_host_order); - checkCudaErrors(cudaMemcpyAsync(h_u, d_u_host_order, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); - -#ifdef STREAM_ALLOCATOR - // free - // thermophysical fields - 
checkCudaErrors(cudaFreeAsync(d_nu_eff, dataBase_.stream)); - // intermediate fields - checkCudaErrors(cudaFreeAsync(d_grad_u, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_rho_nueff, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_fvc_output, dataBase_.stream)); - - // thermophysical fields - checkCudaErrors(cudaFreeAsync(d_boundary_nu_eff, dataBase_.stream)); - // intermediate fields - checkCudaErrors(cudaFreeAsync(d_boundary_grad_u, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_boundary_rho_nueff, dataBase_.stream)); - // boundary coeff fields - checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream)); - - checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); - checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream)); -#endif + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_u, d_u_host_order, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + sync(); } void dfUEqn::A(double *Psi) { @@ -761,6 +674,7 @@ void dfUEqn::UEqnGetHbyA(cudaStream_t stream, ncclComm_t comm, const int *neighb int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just non-coupled patch type now @@ -796,6 +710,7 @@ void dfUEqn::UEqnGetHbyA(cudaStream_t stream, ncclComm_t comm, const int *neighb // constrainHbyA offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just non-coupled patch type now @@ -829,6 +744,7 @@ void dfUEqn::ueqn_ldu_to_csr(cudaStream_t stream, int num_cells, int num_surface int bou_offset 
= 0, ext_offset = 0; size_t threads_per_block, blocks_per_grid; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; if (patch_type[i] == boundaryConditions::processor || patch_type[i] == boundaryConditions::processorCyclic) { threads_per_block = 64; @@ -858,6 +774,7 @@ void dfUEqn::ueqn_ldu_to_csr(cudaStream_t stream, int num_cells, int num_surface // add coeff to source and diagnal bou_offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::processor diff --git a/src_gpu/dfYEqn.H b/src_gpu/dfYEqn.H index 1064e313c..ab89abde6 100644 --- a/src_gpu/dfYEqn.H +++ b/src_gpu/dfYEqn.H @@ -11,10 +11,13 @@ private: dfMatrixDataBase &dataBase_; // cuda resource + cudaStream_t stream; +#ifdef USE_GRAPH // one graph for one eqn before using self-developed solver cudaGraph_t graph; cudaGraphExec_t graph_instance; bool graph_created=false; +#endif // constant values -- basic std::string mode_string; @@ -117,7 +120,7 @@ public: const double *h_mut_sct, const double *h_boundary_mut_sct); void process(); void postProcess(double *h_y, double *h_boundary_y); - void solve(int solverIndex, int speciesIndex); + void solve(int speciesIndex); void sync(); // 方程特化版离散函数 diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu index ddfb19455..c81977b9c 100644 --- a/src_gpu/dfYEqn.cu +++ b/src_gpu/dfYEqn.cu @@ -318,6 +318,7 @@ double* dfYEqn::getFieldPointer(const char* fieldAlias, location loc, position p } void dfYEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path, const int inertIndex) { + this->stream = dataBase_.stream; this->mode_string = mode_string; this->setting_path = setting_path; this->inertIndex = inertIndex; @@ -405,13 +406,6 @@ void dfYEqn::createNonConstantLduAndCsrFields() { void dfYEqn::initNonConstantFieldsInternal(const double *y) { 
checkCudaErrors(cudaMemcpyAsync(dataBase_.d_y, y, dataBase_.cell_value_bytes * dataBase_.num_species, cudaMemcpyHostToDevice, dataBase_.stream)); - - // UnityLewis - checkCudaErrors(cudaMemsetAsync(d_hai, 0, dataBase_.cell_value_bytes * dataBase_.num_species, dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_boundary_hai, 0, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, dataBase_.stream)); - // laminar - checkCudaErrors(cudaMemsetAsync(d_mut_sct, 0, dataBase_.cell_value_bytes, dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_boundary_mut_sct, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); } void dfYEqn::initNonConstantFieldsBoundary(const double *boundary_y) { @@ -429,10 +423,12 @@ void dfYEqn::initNonConstantFieldsBoundary(const double *boundary_y) { } void dfYEqn::cleanCudaResources() { +#ifdef USE_GRAPH if (graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance)); checkCudaErrors(cudaGraphDestroy(graph)); } +#endif } void dfYEqn::preProcess(const double *h_rhoD, const double *h_boundary_rhoD, @@ -444,14 +440,9 @@ void dfYEqn::preProcess(const double *h_rhoD, const double *h_boundary_rhoD, } void dfYEqn::process() { - //使用event计算时间 - float time_elapsed=0; - cudaEvent_t start,stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - checkCudaErrors(cudaEventRecord(start,0)); - -#ifndef TIME_GPU + TICK_INIT_EVENT; + TICK_START_EVENT; +#ifdef USE_GRAPH if(!graph_created) { DEBUG_TRACE; checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); @@ -486,6 +477,14 @@ void dfYEqn::process() { checkCudaErrors(cudaMallocAsync((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMallocAsync((void**)&d_A, dataBase_.csr_value_bytes, dataBase_.stream)); #endif + + // UnityLewis + checkCudaErrors(cudaMemsetAsync(d_hai, 0, dataBase_.cell_value_bytes * dataBase_.num_species, dataBase_.stream)); + 
checkCudaErrors(cudaMemsetAsync(d_boundary_hai, 0, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, dataBase_.stream)); + // laminar + checkCudaErrors(cudaMemsetAsync(d_mut_sct, 0, dataBase_.cell_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_boundary_mut_sct, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(dataBase_.d_diff_alphaD, 0, dataBase_.cell_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(dataBase_.d_boundary_diff_alphaD, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(d_grad_y, 0, dataBase_.cell_value_vec_bytes * dataBase_.num_species, dataBase_.stream)); @@ -542,7 +541,7 @@ void dfYEqn::process() { // double *d_boundary_DEff = d_boundary_rhoD; yeqn_compute_DEff(dataBase_.stream, dataBase_.num_species, dataBase_.num_cells, dataBase_.num_boundary_surfaces, d_lewis_number, dataBase_.d_thermo_alpha, d_mut_sct, d_DEff, dataBase_.d_boundary_thermo_alpha, d_boundary_mut_sct, d_boundary_DEff); -#ifndef TIME_GPU +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0)); graph_created = true; @@ -611,16 +610,20 @@ void dfYEqn::process() { ldu_to_csr_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.num_Nz, dataBase_.d_boundary_face_cell, dataBase_.d_ldu_to_csr_index, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), d_ldu, d_source, d_internal_coeffs, d_boundary_coeffs, d_A); - solve(solverIndex, s); - solverIndex ++; + // TODO: with the solver provided by dataBase_, solverIndex is no longer needed.
+ //solve(solverIndex, s); + //solverIndex ++; + solve(s); #endif } if (s == dataBase_.num_species - 1) num_iteration++; } + TICK_END_EVENT(YEqn assembly and solve for all species); + + TICK_START_EVENT; // compute y_inertIndex yeqn_compute_y_inertIndex(dataBase_.stream, dataBase_.num_species, inertIndex, dataBase_.num_cells, dataBase_.d_y); - // correct boundary conditions for (int s = 0; s < dataBase_.num_species; s++) { correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), @@ -629,39 +632,18 @@ void dfYEqn::process() { dataBase_.d_y + dataBase_.num_cells * s, dataBase_.d_boundary_y + dataBase_.num_boundary_surfaces * s, dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_weight); } + TICK_END_EVENT(YEqn post process for all species correctBC); - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - if(!mpi_init_flag || myRank == 0) - fprintf(stderr, "yeqn process time: %f(ms)\n\n",time_elapsed); -} - -void dfYEqn::solve(int solverIndex, int speciesIndex) { - if (num_iteration == 0) // first interation - { - fprintf(stderr, "Initializing AmgX Linear Solver\n"); - DEBUG_TRACE; - YSolverSet[solverIndex]->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A); - DEBUG_TRACE; - } - else - { - DEBUG_TRACE; - YSolverSet[solverIndex]->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A); - DEBUG_TRACE; - } - // use d_source as d_b - DEBUG_TRACE; - YSolverSet[solverIndex]->solve(dataBase_.num_cells, dataBase_.d_y + dataBase_.num_cells * speciesIndex, d_source); - DEBUG_TRACE; -} + TICK_START_EVENT; + // copy y and boundary_y to host + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_y, dataBase_.d_y, dataBase_.cell_value_bytes * dataBase_.num_species, 
cudaMemcpyDeviceToHost, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_y, dataBase_.d_boundary_y, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); + TICK_END_EVENT(YEqn post process for all species copy back); -void dfYEqn::postProcess(double *h_y, double *h_boundary_y) { + TICK_START_EVENT; #ifdef STREAM_ALLOCATOR // thermophysical fields - checkCudaErrors(cudaFreeAsync(d_rhoD, dataBase_.stream)); + //checkCudaErrors(cudaFreeAsync(d_rhoD, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_hai, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_mut_sct, dataBase_.stream)); // intermediate fields @@ -672,7 +654,7 @@ void dfYEqn::postProcess(double *h_y, double *h_boundary_y) { checkCudaErrors(cudaFreeAsync(d_permute, dataBase_.stream)); // thermophysical fields - checkCudaErrors(cudaFreeAsync(d_boundary_rhoD, dataBase_.stream)); + //checkCudaErrors(cudaFreeAsync(d_boundary_rhoD, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_boundary_hai, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_boundary_mut_sct, dataBase_.stream)); // intermediate fields @@ -693,12 +675,19 @@ void dfYEqn::postProcess(double *h_y, double *h_boundary_y) { checkCudaErrors(cudaFreeAsync(d_boundary_coeffs, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); #endif - // copy y to host - checkCudaErrors(cudaMemcpyAsync(h_y, dataBase_.d_y, dataBase_.cell_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); - checkCudaErrors(cudaMemcpyAsync(h_boundary_y, dataBase_.d_boundary_y, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); - checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); + TICK_END_EVENT(YEqn post process for all species free); + sync(); } +void dfYEqn::solve(int speciesIndex) { + TICK_INIT_EVENT; + TICK_START_EVENT; + dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A, 
dataBase_.d_y + dataBase_.num_cells * speciesIndex, d_source); + TICK_END_EVENT(YEqn solve one specie); +} + +void dfYEqn::postProcess(double *h_y, double *h_boundary_y) {} + void dfYEqn::sync() { checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); } @@ -745,6 +734,7 @@ void dfYEqn::yeqn_fvc_laplacian_scalar(cudaStream_t stream, ncclComm_t comm, con int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now @@ -788,6 +778,7 @@ void dfYEqn::yeqn_fvc_laplacian_scalar(cudaStream_t stream, ncclComm_t comm, con boundary_cell_face, output, boundary_output); offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; if (patch_type[i] == boundaryConditions::processor) { correct_boundary_conditions_processor_scalar(stream, comm, neighbor_peer[i], patch_size[i], offset, output, boundary_cell_face, boundary_output); diff --git a/src_gpu/dfpEqn.H b/src_gpu/dfpEqn.H index fe58e6608..79a414eac 100644 --- a/src_gpu/dfpEqn.H +++ b/src_gpu/dfpEqn.H @@ -11,10 +11,14 @@ private: dfMatrixDataBase &dataBase_; // cuda resource + cudaStream_t stream; +#ifdef USE_GRAPH // one graph for one eqn before using self-developed solver cudaGraph_t graph_pre, graph_post; cudaGraphExec_t graph_instance_pre, graph_instance_post; - bool graph_created=false; + bool pre_graph_created=false; + bool post_graph_created=false; +#endif // constant values -- basic std::string mode_string; @@ -92,7 +96,7 @@ public: void preProcess(double *h_phi, double *h_boundary_phi); void correctPsi(const double *h_thermoPsi, double *h_boundary_thermoPsi); // tmp void correctP(const double *h_p, double *h_boundary_p); // tmp - void getFlux(); + //void getFlux(); void process(); void postProcess(); diff --git a/src_gpu/dfpEqn.cu b/src_gpu/dfpEqn.cu index 5bfc1c17a..fed764a36 100644 --- a/src_gpu/dfpEqn.cu +++ 
b/src_gpu/dfpEqn.cu @@ -294,6 +294,7 @@ double* dfpEqn::getFieldPointer(const char* fieldAlias, location loc, position p } void dfpEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path) { + this->stream = dataBase_.stream; this->mode_string = mode_string; this->setting_path = setting_path; pSolver = new AmgXSolver(mode_string, setting_path, dataBase_.localRank); @@ -305,13 +306,17 @@ void dfpEqn::setConstantFields(const std::vector patch_type_U, const std::v } void dfpEqn::createNonConstantFieldsInternal() { +#ifndef STREAM_ALLOCATOR // intermediate fields checkCudaErrors(cudaMalloc((void**)&d_rhorAUf, dataBase_.surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_phiHbyA, dataBase_.surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_flux, dataBase_.surface_value_bytes)); +#endif } void dfpEqn::createNonConstantFieldsBoundary() { +#ifndef STREAM_ALLOCATOR + // boundary coeffs checkCudaErrors(cudaMalloc((void**)&d_value_internal_coeffs, dataBase_.boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_value_boundary_coeffs, dataBase_.boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_bytes)); @@ -320,18 +325,22 @@ void dfpEqn::createNonConstantFieldsBoundary() { checkCudaErrors(cudaMalloc((void**)&d_boundary_rhorAUf, dataBase_.boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_phiHbyA, dataBase_.boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_flux, dataBase_.boundary_surface_value_bytes)); +#endif } void dfpEqn::createNonConstantLduAndCsrFields() { + // ldu and csr checkCudaErrors(cudaMalloc((void**)&d_ldu, dataBase_.csr_value_bytes)); d_lower = d_ldu; d_diag = d_ldu + dataBase_.num_surfaces; d_upper = d_ldu + dataBase_.num_cells + dataBase_.num_surfaces; d_extern = d_ldu + dataBase_.num_cells + 2 * dataBase_.num_surfaces; +#ifndef STREAM_ALLOCATOR 
checkCudaErrors(cudaMalloc((void**)&d_source, dataBase_.cell_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_A, dataBase_.csr_value_bytes)); +#endif } void dfpEqn::initNonConstantFields(const double *p, const double *boundary_p){ @@ -340,12 +349,16 @@ void dfpEqn::initNonConstantFields(const double *p, const double *boundary_p){ } void dfpEqn::cleanCudaResources() { - if (graph_created) { +#ifdef USE_GRAPH + if (pre_graph_created) { checkCudaErrors(cudaGraphExecDestroy(graph_instance_pre)); - checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_pre)); + } + if (post_graph_created) { + checkCudaErrors(cudaGraphExecDestroy(graph_instance_post)); checkCudaErrors(cudaGraphDestroy(graph_post)); } +#endif } // tmp @@ -364,19 +377,37 @@ void dfpEqn::correctP(const double *h_p, double *h_boundary_p) { }; void dfpEqn::process() { - - float time_elapsed=0; - cudaEvent_t start,stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - checkCudaErrors(cudaEventRecord(start,0)); - -#ifndef TIME_GPU - if(!graph_created) { + TICK_INIT_EVENT; + TICK_START_EVENT; +#ifdef USE_GRAPH + if(!pre_graph_created) { DEBUG_TRACE; checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif +#ifdef STREAM_ALLOCATOR + // intermediate fields + checkCudaErrors(cudaMallocAsync((void**)&d_rhorAUf, dataBase_.surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_phiHbyA, dataBase_.surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_flux, dataBase_.surface_value_bytes, dataBase_.stream)); + + // boundary coeffs + checkCudaErrors(cudaMallocAsync((void**)&d_value_internal_coeffs, dataBase_.boundary_surface_value_bytes, 
dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_value_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + // intermediate boundary fields + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_rhorAUf, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_phiHbyA, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_flux, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + + // ldu and csr + checkCudaErrors(cudaMallocAsync((void**)&d_source, dataBase_.cell_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_A, dataBase_.csr_value_bytes, dataBase_.stream)); +#endif + checkCudaErrors(cudaMemsetAsync(d_ldu, 0, dataBase_.csr_value_bytes, dataBase_.stream)); // d_ldu contains d_lower, d_diag, and d_upper checkCudaErrors(cudaMemsetAsync(d_source, 0, dataBase_.cell_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(d_internal_coeffs, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); @@ -428,93 +459,101 @@ void dfpEqn::process() { dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), d_ldu, d_source, d_internal_coeffs, d_boundary_coeffs, d_A); -#ifndef TIME_GPU +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_pre)); checkCudaErrors(cudaGraphInstantiate(&graph_instance_pre, graph_pre, NULL, NULL, 0)); + 
pre_graph_created = true; } DEBUG_TRACE; checkCudaErrors(cudaGraphLaunch(graph_instance_pre, dataBase_.stream)); #endif - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - if(!mpi_init_flag || myRank == 0) - fprintf(stderr, "peqn assembly time:%f(ms)\n",time_elapsed); - - checkCudaErrors(cudaEventRecord(start,0)); + TICK_END_EVENT(pEqn assembly); + + TICK_START_EVENT; solve(); - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - if(!mpi_init_flag || myRank == 0) - fprintf(stderr, "peqn solve time:%f(ms)\n",time_elapsed); - - checkCudaErrors(cudaEventRecord(start,0)); -#ifndef TIME_GPU - if(!graph_created) { + TICK_END_EVENT(pEqn solve); + +#ifdef USE_GRAPH + if(!post_graph_created) { checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); #endif - correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), - dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(), - patch_type_p.data(), dataBase_.d_boundary_delta_coeffs, - dataBase_.d_boundary_face_cell, dataBase_.d_p, dataBase_.d_boundary_p, - dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_weight); - // update phi - fvMtx_flux(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, - d_lower, d_upper, dataBase_.d_p, d_flux, - dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), - dataBase_.d_boundary_face_cell, d_internal_coeffs, d_boundary_coeffs, dataBase_.cyclicNeighbor.data(), - dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_p, d_boundary_flux); - 
field_add_scalar(dataBase_.stream, dataBase_.num_surfaces, d_phiHbyA, d_flux, dataBase_.d_phi, - dataBase_.num_boundary_surfaces, d_boundary_phiHbyA, d_boundary_flux, dataBase_.d_boundary_phi); - // correct U - checkCudaErrors(cudaMemsetAsync(dataBase_.d_u, 0., dataBase_.cell_value_vec_bytes, dataBase_.stream)); - // TODO: may do not need to calculate boundary fields - fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, - dataBase_.d_owner, dataBase_.d_neighbor, - dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, dataBase_.d_u, - dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), dataBase_.d_boundary_weight, - dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume, true); - scalar_field_multiply_vector_field(dataBase_.stream, dataBase_.num_cells, dataBase_.d_rAU, dataBase_.d_u, dataBase_.d_u); - field_add_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.d_HbyA, dataBase_.d_u, dataBase_.d_u, -1.); - correct_boundary_conditions_vector(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_boundary_surfaces, - dataBase_.num_cells, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_U.data(), dataBase_.d_boundary_weight, - dataBase_.d_boundary_face_cell, dataBase_.d_u, dataBase_.d_boundary_u, - dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data()); - vector_half_mag_square(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, dataBase_.d_k, dataBase_.num_boundary_surfaces, - dataBase_.d_boundary_u, dataBase_.d_boundary_k); - // calculate dpdt - fvc_ddt_scalar_field(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, dataBase_.d_p, dataBase_.d_p_old, dataBase_.d_volume, dataBase_.d_dpdt, 1.); - - #ifndef TIME_GPU + TICK_START_EVENT; + correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), + dataBase_.num_boundary_surfaces, 
dataBase_.num_patches, dataBase_.patch_size.data(), + patch_type_p.data(), dataBase_.d_boundary_delta_coeffs, + dataBase_.d_boundary_face_cell, dataBase_.d_p, dataBase_.d_boundary_p, + dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_weight); + // update phi + fvMtx_flux(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, + d_lower, d_upper, dataBase_.d_p, d_flux, + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), + dataBase_.d_boundary_face_cell, d_internal_coeffs, d_boundary_coeffs, dataBase_.cyclicNeighbor.data(), + dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_p, d_boundary_flux); + field_add_scalar(dataBase_.stream, dataBase_.num_surfaces, d_phiHbyA, d_flux, dataBase_.d_phi, + dataBase_.num_boundary_surfaces, d_boundary_phiHbyA, d_boundary_flux, dataBase_.d_boundary_phi); + // correct U + checkCudaErrors(cudaMemsetAsync(dataBase_.d_u, 0., dataBase_.cell_value_vec_bytes, dataBase_.stream)); + // TODO: may not need to calculate boundary fields + fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, + dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, dataBase_.d_u, + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), dataBase_.d_boundary_weight, + dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume, true); + scalar_field_multiply_vector_field(dataBase_.stream, dataBase_.num_cells, dataBase_.d_rAU, dataBase_.d_u, dataBase_.d_u); + field_add_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.d_HbyA, dataBase_.d_u, dataBase_.d_u, -1.); + correct_boundary_conditions_vector(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_boundary_surfaces, + dataBase_.num_cells, dataBase_.num_patches, dataBase_.patch_size.data(), 
patch_type_U.data(), dataBase_.d_boundary_weight, + dataBase_.d_boundary_face_cell, dataBase_.d_u, dataBase_.d_boundary_u, + dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data()); + vector_half_mag_square(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, dataBase_.d_k, dataBase_.num_boundary_surfaces, + dataBase_.d_boundary_u, dataBase_.d_boundary_k); + // calculate dpdt + fvc_ddt_scalar_field(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, dataBase_.d_p, dataBase_.d_p_old, dataBase_.d_volume, dataBase_.d_dpdt, 1.); + +#ifdef STREAM_ALLOCATOR + // intermediate fields + checkCudaErrors(cudaFreeAsync(d_rhorAUf, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_phiHbyA, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_flux, dataBase_.stream)); + + // boundary coeffs + checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream)); + // intermediate boundary fields + checkCudaErrors(cudaFreeAsync(d_boundary_rhorAUf, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_phiHbyA, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_flux, dataBase_.stream)); + + // ldu and csr + checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_internal_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_coeffs, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); +#endif + TICK_END_EVENT(pEqn post process all); + +#ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_post)); checkCudaErrors(cudaGraphInstantiate(&graph_instance_post, graph_post, NULL, NULL, 0)); - graph_created = true; + post_graph_created = true; } checkCudaErrors(cudaGraphLaunch(graph_instance_post, 
dataBase_.stream)); #endif - - checkCudaErrors(cudaEventRecord(stop,0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - if(!mpi_init_flag || myRank == 0) - fprintf(stderr, "peqn post process time:%f(ms)\n\n",time_elapsed); + sync(); } void dfpEqn::postProcess() {} -void dfpEqn::getFlux() -{ - fvMtx_flux(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, - d_lower, d_upper, dataBase_.d_p, d_flux, - dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), - dataBase_.d_boundary_face_cell, d_internal_coeffs, d_boundary_coeffs, dataBase_.cyclicNeighbor.data(), - dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_p, d_boundary_flux); - sync(); -} +//void dfpEqn::getFlux() +//{ +// fvMtx_flux(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, +// d_lower, d_upper, dataBase_.d_p, d_flux, +// dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), +// dataBase_.d_boundary_face_cell, d_internal_coeffs, d_boundary_coeffs, dataBase_.d_boundary_p, d_boundary_flux); +// sync(); +//} void dfpEqn::getrhorAUf(cudaStream_t stream, int num_cells, int num_surfaces, const int *lowerAddr, const int *upperAddr, @@ -529,6 +568,7 @@ void dfpEqn::getrhorAUf(cudaStream_t stream, int num_cells, int num_surfaces, int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: maybe do not need loop boundarys @@ -567,6 +607,7 @@ void dfpEqn::getphiHbyA(cudaStream_t stream, int num_cells, int num_surfaces, in int offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 
1) / threads_per_block; if (patch_type[i] == boundaryConditions::processor @@ -586,6 +627,7 @@ void dfpEqn::getphiHbyA(cudaStream_t stream, int num_cells, int num_surfaces, in offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::processor @@ -606,6 +648,7 @@ void dfpEqn::getphiHbyA(cudaStream_t stream, int num_cells, int num_surfaces, in offset = 0; for (int i = 0; i < num_patches; i++) { + if (patch_size[i] == 0) continue; threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; if (patch_type[i] == boundaryConditions::extrapolated @@ -639,30 +682,7 @@ void dfpEqn::sync() void dfpEqn::solve() { - sync(); - - // double *h_A_csr = new double[dataBase_.num_Nz]; - // checkCudaErrors(cudaMemcpy(h_A_csr, d_A, dataBase_.csr_value_bytes, cudaMemcpyDeviceToHost)); - // for (int i = 0; i < dataBase_.num_Nz; i++) - // fprintf(stderr, "h_A_csr[%d]: %.10lf\n", i, h_A_csr[i]); - // delete[] h_A_csr; - - // double *h_b = new double[dataBase_.num_cells]; - // checkCudaErrors(cudaMemcpy(h_b, d_source, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost)); - // for (int i = 0; i < dataBase_.num_cells; i++) - // fprintf(stderr, "h_b[%d]: %.10lf\n", i, h_b[i]); - // delete[] h_b; - - if (num_iteration == 0) // first interation - { - printf("Initializing AmgX Linear Solver\n"); - pSolver->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A); - } - else - { - pSolver->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A); - } - pSolver->solve(dataBase_.num_cells, dataBase_.d_p, d_source); + dataBase_.solve(num_iteration, AMGXSetting::p_setting, d_A, dataBase_.d_p, d_source); num_iteration++; }