Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion applications/solvers/dfLowMachFoam/EEqn.H
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@
// EEqn_GPU.compareResult(&EEqn.lower()[0], &EEqn.upper()[0], &EEqn.diag()[0], &EEqn.source()[0],
// h_internal_coeffs.data(), h_boundary_coeffs.data(), printFlag);
// DEBUG_TRACE;
EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag);
//EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag);
}

delete h_boundary_he_tmp;
Expand Down
2 changes: 1 addition & 1 deletion applications/solvers/dfLowMachFoam/UEqn.H
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@
// h_internal_coeffs.data(), h_boundary_coeffs.data(),
// // &gradU[0][0], h_boundary_gradU,
// printFlag);
UEqn_GPU.compareU(&U[0][0], h_boundary_u_tmp, printFlag);
//UEqn_GPU.compareU(&U[0][0], h_boundary_u_tmp, printFlag);
}
DEBUG_TRACE;
#endif
Expand Down
16 changes: 15 additions & 1 deletion applications/solvers/dfLowMachFoam/createGPUSolver.H
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ void initNccl() {
ncclInit(PstreamGlobals::MPI_COMM_FOAM, nccl_comm, nccl_id, &nRanks, &myRank, &localRank, &mpi_init_flag);
}

void createGPUBase(fvMesh& mesh, PtrList<volScalarField>& Y) {
void createGPUBase(const IOdictionary& CanteraTorchProperties, fvMesh& mesh, PtrList<volScalarField>& Y) {
// prepare constant values: num_cells, num_surfaces, num_boundary_surfaces,
// num_patches, patch_size, num_species, rdelta_t
const labelUList& owner = mesh.owner();
Expand Down Expand Up @@ -152,6 +152,7 @@ void createGPUBase(fvMesh& mesh, PtrList<volScalarField>& Y) {
double rDeltaT = 1 / 1e-6;
dfDataBase.setConstantValues(num_cells, num_total_cells, num_surfaces, num_boundary_surfaces,
num_patches, nProcValues, patch_size, Y.size(), rDeltaT);

// wyr: now there is no other place to prepare nccl info, thus mpi must be initialized at beginning.
label flag_mpi_init;
MPI_Initialized(&flag_mpi_init);
Expand Down Expand Up @@ -187,6 +188,14 @@ void createGPUBase(fvMesh& mesh, PtrList<volScalarField>& Y) {
// prepare cuda resources
dfDataBase.prepareCudaResources();

// setup amgx solvers
string mode_string = "dDDI";
string u_setting_path;
u_setting_path = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string(""));
string p_setting_path;
p_setting_path = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("pEqnSettingPath", string(""));
dfDataBase.setAmgxSolvers(mode_string, u_setting_path, p_setting_path);

// prepare constant indexes: owner, neighbor, procRows, procCols
if (Pstream::parRun())
{
Expand Down Expand Up @@ -373,12 +382,14 @@ void createGPURhoEqn(const volScalarField& rho, const surfaceScalarField& phi) {
offset += patchsize;
}
}
rhoEqn_GPU.setConstantValues();
rhoEqn_GPU.setConstantFields(patch_type);
rhoEqn_GPU.initNonConstantFields(h_rho, h_phi, h_boundary_rho, h_boundary_phi);
rhoEqn_GPU.createNonConstantLduAndCsrFields();
}

void createGPUUEqn(const IOdictionary& CanteraTorchProperties, const volVectorField& U) {
// TODO need remove amgx solver setting
// prepare mode_string and setting_path
string mode_string = "dDDI";
string settingPath;
Expand Down Expand Up @@ -431,6 +442,7 @@ void createGPUUEqn(const IOdictionary& CanteraTorchProperties, const volVectorFi

void createGPUYEqn(const IOdictionary& CanteraTorchProperties, PtrList<volScalarField>& Y, const int inertIndex) {
DEBUG_TRACE;
// TODO need remove amgx solver setting
// prepare mode_string and setting_path
string mode_string = "dDDI";
string settingPath;
Expand Down Expand Up @@ -481,6 +493,7 @@ void createGPUYEqn(const IOdictionary& CanteraTorchProperties, PtrList<volScalar

void createGPUEEqn(const IOdictionary& CanteraTorchProperties, volScalarField& he, volScalarField& K) {
DEBUG_TRACE;
// TODO need remove amgx solver setting
// prepare mode_string and setting_path
string mode_string = "dDDI";
string settingPath;
Expand Down Expand Up @@ -527,6 +540,7 @@ void createGPUEEqn(const IOdictionary& CanteraTorchProperties, volScalarField& h

void createGPUpEqn(const IOdictionary& CanteraTorchProperties, volScalarField& p, const volVectorField& U) {
DEBUG_TRACE;
// TODO need remove amgx solver setting
// prepare mode_string and setting_path
string mode_string = "dDDI";
string settingPath;
Expand Down
12 changes: 6 additions & 6 deletions applications/solvers/dfLowMachFoam/dfLowMachFoam.C
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ int main(int argc, char *argv[])
if(mpi_init_flag) {
initNccl();
}
createGPUBase(mesh, Y);
createGPUBase(CanteraTorchProperties, mesh, Y);
DEBUG_TRACE;
#endif

Expand Down Expand Up @@ -369,7 +369,7 @@ int main(int argc, char *argv[])
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
}
if (!mpi_init_flag || rank == 0) {
// thermo_GPU.compareT(&T[0], h_boundary_T_tmp, printFlag);
thermo_GPU.compareT(&T[0], h_boundary_T_tmp, printFlag);
// thermo_GPU.compareHe(&thermo.he()[0], h_boundary_he_tmp, printFlag);
// thermo_GPU.comparePsi(&psi[0], h_boundary_thermo_psi_tmp, printFlag);
// thermo_GPU.compareAlpha(&alpha[0], h_boundary_thermo_alpha_tmp, printFlag);
Expand Down Expand Up @@ -452,6 +452,8 @@ int main(int argc, char *argv[])
}
end = std::clock();
time_monitor_turbulence_correct += double(end - start) / double(CLOCKS_PER_SEC);
//fprintf(stderr, "sleep for 5s...\n");
//usleep(5 * 1000 * 1000);
}
clock_t loop_end = std::clock();
double loop_time = double(loop_end - loop_start) / double(CLOCKS_PER_SEC);
Expand All @@ -468,11 +470,9 @@ int main(int argc, char *argv[])

#ifdef GPUSolverNew_
// write U for
double *h_U_tmp = new double[dfDataBase.num_cells * 3];
UEqn_GPU.postProcess(h_U_tmp);
memcpy(&U[0][0], h_U_tmp, dfDataBase.cell_value_vec_bytes);
UEqn_GPU.postProcess();
memcpy(&U[0][0], dfDataBase.h_u, dfDataBase.cell_value_vec_bytes);
U.correctBoundaryConditions();
delete h_U_tmp;
#endif

runTime.write();
Expand Down
2 changes: 1 addition & 1 deletion applications/solvers/dfLowMachFoam/pEqn_GPU.H
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ UEqn_GPU.sync();
}
// pEqn_GPU.correctP(&p[0], h_boundary_p);
if (!mpi_init_flag || rank == 0) {
pEqn_GPU.comparep(&p[0], h_boundary_p, false);
//pEqn_GPU.comparep(&p[0], h_boundary_p, false);
}
delete h_boundary_p;

Expand Down
6 changes: 3 additions & 3 deletions src_gpu/AmgXSolver.cu
Original file line number Diff line number Diff line change
Expand Up @@ -54,16 +54,16 @@ void AmgXSolver::initialize(const std::string &modeStr, const std::string &cfgFi
// get the mode of AmgX solver
setMode(modeStr);

// initialize AmgX
initAmgX(cfgFile, devID);

// check if MPI has been initialized
MPI_Initialized(&isMPIEnabled);
if (isMPIEnabled) {
MPI_Comm_size(MPI_COMM_WORLD, &mpiSize);
mpiWorld = MPI_COMM_WORLD;
}

// initialize AmgX
initAmgX(cfgFile, devID);

// a bool indicating if this instance is initialized
isInitialised = true;

Expand Down
6 changes: 5 additions & 1 deletion src_gpu/dfEEqn.H
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,14 @@ class dfEEqn
dfThermo &thermo_;

// cuda resource
cudaStream_t stream;
#ifdef USE_GRAPH
// one graph for one eqn before using self-developed solver
cudaGraph_t graph_pre, graph_post;
cudaGraphExec_t graph_instance_pre, graph_instance_post;
bool graph_created=false;
bool pre_graph_created=false;
bool post_graph_created=false;
#endif

// constant values -- basic
std::string mode_string;
Expand Down
135 changes: 56 additions & 79 deletions src_gpu/dfEEqn.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ double* dfEEqn::getFieldPointer(const char* fieldAlias, location loc, position p
}

void dfEEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path) {
this->stream = dataBase_.stream;
this->mode_string = mode_string;
this->setting_path = setting_path;
ESolver = new AmgXSolver(mode_string, setting_path, dataBase_.localRank);
Expand Down Expand Up @@ -88,28 +89,27 @@ void dfEEqn::initNonConstantFields(const double *he, const double *boundary_he)
}

void dfEEqn::cleanCudaResources() {
if (graph_created) {
#ifdef USE_GRAPH
if (pre_graph_created) {
checkCudaErrors(cudaGraphExecDestroy(graph_instance_pre));
checkCudaErrors(cudaGraphExecDestroy(graph_instance_post));
checkCudaErrors(cudaGraphDestroy(graph_pre));
}
if (post_graph_created) {
checkCudaErrors(cudaGraphExecDestroy(graph_instance_post));
checkCudaErrors(cudaGraphDestroy(graph_post));
}
#endif
}

void dfEEqn::preProcess(const double *h_he, const double *h_k, const double *h_k_old, const double *h_dpdt, const double *h_boundary_k, const double *h_boundary_heGradient)
{
}

void dfEEqn::process() {
//使用event计算时间
float time_elapsed=0;
cudaEvent_t start,stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
checkCudaErrors(cudaEventRecord(start,0));

#ifndef TIME_GPU
if(!graph_created) {
TICK_INIT_EVENT;
TICK_START_EVENT;
#ifdef USE_GRAPH
if(!pre_graph_created) {
DEBUG_TRACE;
checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal));
#endif
Expand All @@ -125,7 +125,7 @@ void dfEEqn::process() {
checkCudaErrors(cudaMallocAsync((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
checkCudaErrors(cudaMallocAsync((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream));

checkCudaErrors(cudaMallocAsync((void**)&d_boundary_heGradient, dataBase_.num_gradientEnergy_boundary_surfaces, dataBase_.stream));
checkCudaErrors(cudaMallocAsync((void**)&d_boundary_heGradient, sizeof(double) * num_gradientEnergy_boundary_surfaces, dataBase_.stream));

checkCudaErrors(cudaMallocAsync((void**)&d_source, dataBase_.cell_value_bytes, dataBase_.stream));
checkCudaErrors(cudaMallocAsync((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
Expand Down Expand Up @@ -190,58 +190,71 @@ void dfEEqn::process() {
dataBase_.num_Nz, dataBase_.d_boundary_face_cell, dataBase_.d_ldu_to_csr_index, dataBase_.num_patches,
dataBase_.patch_size.data(), patch_type_he.data(), d_ldu, d_source, d_internal_coeffs, d_boundary_coeffs, d_A);
#endif
#ifndef TIME_GPU
#ifdef USE_GRAPH
checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_pre));
checkCudaErrors(cudaGraphInstantiate(&graph_instance_pre, graph_pre, NULL, NULL, 0));
pre_graph_created = true;
}
DEBUG_TRACE;
checkCudaErrors(cudaGraphLaunch(graph_instance_pre, dataBase_.stream));
#endif
checkCudaErrors(cudaEventRecord(stop,0));
checkCudaErrors(cudaEventSynchronize(start));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop));
if(!mpi_init_flag || myRank == 0)
fprintf(stderr, "eeqn assembly time:%f(ms)\n",time_elapsed);

checkCudaErrors(cudaEventRecord(start,0));
TICK_END_EVENT(EEqn assembly);

TICK_START_EVENT;
#ifndef DEBUG_CHECK_LDU
solve();
#endif
checkCudaErrors(cudaEventRecord(stop,0));
checkCudaErrors(cudaEventSynchronize(start));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop));
if(!mpi_init_flag || myRank == 0)
fprintf(stderr, "eeqn solve time:%f(ms)\n",time_elapsed);

checkCudaErrors(cudaEventRecord(start,0));
#ifndef TIME_GPU
if(!graph_created) {
TICK_END_EVENT(EEqn solve);

#ifdef USE_GRAPH
if(!post_graph_created) {
checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal));
#endif

TICK_START_EVENT;
correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(),
dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(),
patch_type_he.data(), dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_face_cell,
dataBase_.d_he, dataBase_.d_boundary_he, dataBase_.cyclicNeighbor.data(),
dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_weight,
dataBase_.d_boundary_T, dataBase_.d_boundary_y, d_boundary_heGradient, &thermo_);
TICK_END_EVENT(EEqn post process correctBC);

TICK_START_EVENT;
// copy he to host
checkCudaErrors(cudaMemcpyAsync(dataBase_.h_he, dataBase_.d_he, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream));
checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_he, dataBase_.d_boundary_he, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream));
TICK_END_EVENT(EEqn post process copy back);

#ifndef TIME_GPU
TICK_START_EVENT;
#ifdef STREAM_ALLOCATOR
// thermophysical fields
checkCudaErrors(cudaFreeAsync(d_dpdt, dataBase_.stream));
// fiv weight fieldsFree
//checkCudaErrors(cudaFreeAsync(d_phi_special_weight, dataBase_.stream));
// boundary coeffs
checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream));
checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream));
checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream));
checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream));

checkCudaErrors(cudaFreeAsync(d_boundary_heGradient, dataBase_.stream));

checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream));
checkCudaErrors(cudaFreeAsync(d_internal_coeffs, dataBase_.stream));
checkCudaErrors(cudaFreeAsync(d_boundary_coeffs, dataBase_.stream));
checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream));
checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream));
#endif
TICK_END_EVENT(EEqn post process free);
#ifdef USE_GRAPH
checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_post));
checkCudaErrors(cudaGraphInstantiate(&graph_instance_post, graph_post, NULL, NULL, 0));
graph_created = true;
post_graph_created = true;
}
checkCudaErrors(cudaGraphLaunch(graph_instance_post, dataBase_.stream));
#endif

checkCudaErrors(cudaEventRecord(stop,0));
checkCudaErrors(cudaEventSynchronize(start));
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop));
if(!mpi_init_flag || myRank == 0)
fprintf(stderr, "eeqn post process time: %f(ms)\n\n",time_elapsed);
sync();
}

void dfEEqn::eeqn_calculate_energy_gradient(dfThermo& GPUThermo, int num_cells, int num_species,
Expand All @@ -252,6 +265,7 @@ void dfEEqn::eeqn_calculate_energy_gradient(dfThermo& GPUThermo, int num_cells,
{
int bou_offset = 0, gradient_offset = 0;
for (int i = 0; i < num_patches; i++) {
if (patch_size[i] == 0) continue;
if (patch_type[i] == boundaryConditions::gradientEnergy) {
GPUThermo.calculateEnergyGradient(patch_size[i], num_cells, num_species, num_boundary_surfaces, bou_offset, gradient_offset,
face2Cells, T, p, y, boundary_delta_coeffs, boundary_p, boundary_y, boundary_thermo_gradient);
Expand Down Expand Up @@ -336,45 +350,8 @@ void dfEEqn::sync()

void dfEEqn::solve()
{
sync();

if (num_iteration == 0) // first interation
{
printf("Initializing AmgX Linear Solver\n");
ESolver->setOperator(dataBase_.num_cells, dataBase_.num_total_cells, dataBase_.num_Nz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A);
}
else
{
ESolver->updateOperator(dataBase_.num_cells, dataBase_.num_Nz, d_A);
}
ESolver->solve(dataBase_.num_cells, dataBase_.d_he, d_source);
dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A, dataBase_.d_he, d_source);
num_iteration++;
}

void dfEEqn::postProcess(double *h_he, double *h_boundary_he)
{
#ifdef STREAM_ALLOCATOR
// thermophysical fields
checkCudaErrors(cudaFreeAsync(d_dpdt, dataBase_.stream));
// fiv weight fieldsFree
//checkCudaErrors(cudaFreeAsync(d_phi_special_weight, dataBase_.stream));
// boundary coeffs
checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream));
checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream));
checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream));
checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream));

checkCudaErrors(cudaFreeAsync(d_boundary_heGradient, dataBase_.stream));

checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream));
checkCudaErrors(cudaFreeAsync(d_internal_coeffs, dataBase_.stream));
checkCudaErrors(cudaFreeAsync(d_boundary_coeffs, dataBase_.stream));
checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream));
checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream));
#endif

// copy he to host
checkCudaErrors(cudaMemcpyAsync(h_he, dataBase_.d_he, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream));
checkCudaErrors(cudaMemcpyAsync(h_boundary_he, dataBase_.d_boundary_he, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream));
sync();
}
void dfEEqn::postProcess(double *h_he, double *h_boundary_he) {}
Loading