Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion applications/solvers/dfLowMachFoam/EEqn.H
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@
// EEqn_GPU.compareResult(&EEqn.lower()[0], &EEqn.upper()[0], &EEqn.diag()[0], &EEqn.source()[0],
// h_internal_coeffs.data(), h_boundary_coeffs.data(), printFlag);
// DEBUG_TRACE;
//EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag);
// EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag);
}

delete h_boundary_he_tmp;
Expand Down
7 changes: 3 additions & 4 deletions applications/solvers/dfLowMachFoam/Make/options
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ EXE_INC = -std=c++14 \
$(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include/torch/csrc/api/include,) \
$(PYTHON_INC_DIR) \
$(if $(AMGX_DIR), -I$(DF_ROOT)/src_gpu,) \
$(if $(AMGX_DIR), -I/usr/local/cuda-11.6/include,) \
$(if $(AMGX_DIR), -I/usr/local/cuda/include,) \
$(if $(AMGX_DIR), -I$(AMGX_DIR)/include,) \
-I$(DF_ROOT)/GPUTestRef/lnInclude \

Expand All @@ -43,7 +43,6 @@ EXE_LIBS = \
-ldfCanteraMixture \
-ldfChemistryModel \
-ldfCombustionModels \
-ldfGenMatrix \
$(CANTERA_ROOT)/lib/libcantera.so \
$(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libtorch.so,) \
$(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libc10.so,) \
Expand All @@ -52,8 +51,8 @@ EXE_LIBS = \
$(if $(LIBTORCH_ROOT),$(DF_SRC)/dfChemistryModel/DNNInferencer/build/libDNNInferencer.so,) \
$(if $(PYTHON_LIB_DIR),-L$(PYTHON_LIB_DIR),) \
$(if $(PYTHON_LIB_DIR),-lpython3.8,) \
$(if $(AMGX_DIR), /usr/local/cuda-11.6/lib64/libcudart.so,) \
$(if $(AMGX_DIR), /usr/local/cuda-11.6/lib64/libnccl.so,) \
$(if $(AMGX_DIR), /usr/local/cuda/lib64/libcudart.so,) \
$(if $(AMGX_DIR), /usr/local/cuda/lib64/libnccl.so,) \
$(if $(AMGX_DIR), $(DF_ROOT)/src_gpu/build/libdfMatrix.so,) \
$(if $(AMGX_DIR), $(AMGX_DIR)/build/libamgxsh.so,)

7 changes: 3 additions & 4 deletions applications/solvers/dfLowMachFoam/dfLowMachFoam.C
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,10 @@ Description
#include "basicThermo.H"
#include "CombustionModel.H"

#define GPUSolverNew_
#define TIME
// #define GPUSolverNew_
// #define TIME
// #define DEBUG_
#define SHOW_MEMINFO
// #define SHOW_MEMINFO

#include "dfMatrixDataBase.H"

Expand All @@ -87,7 +87,6 @@ Description
#include "createGPUSolver.H"

#include "upwind.H"
#include "GenFvMatrix.H"
#include "CanteraMixture.H"
#include "multivariateGaussConvectionScheme.H"
#include "limitedSurfaceInterpolationScheme.H"
Expand Down
3 changes: 2 additions & 1 deletion bashrc.in
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@ export DF_LIBBIN=pwd/platforms/$WM_OPTIONS/lib
export PATH=$DF_APPBIN:$PATH
export LD_LIBRARY_PATH=$DF_LIBBIN:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$DF_ROOT/src_gpu/build:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$AMGX_DIR/build:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$AMGX_DIR/build:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$DF_ROOT/src/dfChemistryModel/DNNInferencer/build:$LD_LIBRARY_PATH
2 changes: 1 addition & 1 deletion install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ if [ $USE_GPUSOLVER = true ]; then
mkdir build
cd build
cmake ..
make
make -j
export LD_LIBRARY_PATH=$DF_ROOT/src_gpu/build:$LD_LIBRARY_PATH
fi
cd $DF_ROOT
Expand Down
2 changes: 2 additions & 0 deletions src/dfChemistryModel/dfChemistryModel.H
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,8 @@ public:
}
}

bool ifChemstry() const {return chemistry_;}

// profiling
#if defined USE_LIBTORCH || defined USE_PYTORCH
double time_allsolve() {return time_allsolve_;}
Expand Down
2 changes: 1 addition & 1 deletion src_gpu/AmgXSolver.cu
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,7 @@ void AmgXSolver::solve(
getIters(nIters);
getResidual(nIters, rnorm);
if (!isMPIEnabled || myRank == 0)
fprintf(stderr, "Initial residual = %.10lf, Final residual = %.5e, No Iterations %d\n", irnorm, rnorm, nIters);
printf("Initial residual = %.10lf, Final residual = %.5e, No Iterations %d\n", irnorm, rnorm, nIters);

}

Expand Down
14 changes: 12 additions & 2 deletions src_gpu/dfChemistrySolver.cu
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,15 @@ void dfChemistrySolver::setConstantValue(int num_cells, int num_species, int bat
std::cerr << "error loading the model\n";
exit(-1);
}
modules_[i].to(device_);
// modules_[i].to(device_);
modules_[i].to(device_, torch::kHalf);
}
}

void dfChemistrySolver::Inference(const double *h_T, const double *d_T,const double *p, const double *y,
const double *rho, double *RR) {
// construct input
clock_t start = clock();
inputsize_ = 0;
std::vector<int> reactCellIndex;
for (int i = 0; i < num_cells_; i++) {
Expand All @@ -136,6 +138,10 @@ void dfChemistrySolver::Inference(const double *h_T, const double *d_T,const dou
}
}
inputsize_ = reactCellIndex.size();
clock_t end = clock();
double elapsed_secs = double(end - start) / CLOCKS_PER_SEC;
std::cout << "construct input time: " << elapsed_secs << std::endl;

#ifdef STREAM_ALLOCATOR
checkCudaErrors(cudaMallocAsync((void**)&init_input_, sizeof(double) * inputsize_ * dim_input_, stream));
checkCudaErrors(cudaMallocAsync((void**)&y_input_BCT, sizeof(double) * inputsize_ * num_species_, stream));
Expand All @@ -157,13 +163,16 @@ void dfChemistrySolver::Inference(const double *h_T, const double *d_T,const dou
Xmu_, Xstd_, init_input_);

// inference by torch
TICK_INIT_EVENT;
TICK_START_EVENT;
double *d_output;
for (int sample_start = 0; sample_start < inputsize_; sample_start += batch_size_) {
int sample_end = std::min(sample_start + batch_size_, inputsize_);
int sample_len = sample_end - sample_start;
at::Tensor torch_input = torch::from_blob(init_input_ + sample_start * dim_input_, {sample_len, dim_input_},
torch::TensorOptions().device(device_).dtype(torch::kDouble));
torch_input = torch_input.to(at::kFloat);
// torch_input = torch_input.to(at::kFloat);
torch_input = torch_input.to(at::kHalf);
std::vector<torch::jit::IValue> INPUTS;
INPUTS.push_back(torch_input);
std::vector<at::Tensor> output(num_modules_);
Expand All @@ -175,6 +184,7 @@ void dfChemistrySolver::Inference(const double *h_T, const double *d_T,const dou
cudaMemcpy(NN_output_ + (i * inputsize_ + sample_start), d_output, sizeof(double) * sample_len, cudaMemcpyDeviceToDevice);
}
}
TICK_END_EVENT(Inference);

calculate_y_new<<<blocks_per_grid, threads_per_block, 0, stream>>>(inputsize_, num_modules_, NN_output_,
y_input_BCT, Ymu_, Ystd_, NN_output_);
Expand Down
6 changes: 6 additions & 0 deletions src_gpu/dfEEqn.cu
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,12 @@ void dfEEqn::process() {
dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_he.data(),
dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_p, dataBase_.d_boundary_y,
d_boundary_heGradient);
correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(),
dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(),
patch_type_he.data(), dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_face_cell,
dataBase_.d_he, dataBase_.d_boundary_he, dataBase_.cyclicNeighbor.data(),
dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_weight,
dataBase_.d_boundary_T, dataBase_.d_boundary_y, d_boundary_heGradient, &thermo_);
update_boundary_coeffs_scalar(dataBase_.stream,
dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_he.data(),
dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_he, dataBase_.d_boundary_weight,
Expand Down
4 changes: 2 additions & 2 deletions src_gpu/dfMatrixOpBase.cu
Original file line number Diff line number Diff line change
Expand Up @@ -849,7 +849,7 @@ __global__ void fvm_laplacian_scalar_boundary(int num, int offset,
int start_index = offset + index;
double boundary_value = boundary_gamma[start_index] * boundary_mag_sf[start_index];
internal_coeffs[start_index] += boundary_value * gradient_internal_coeffs[start_index] * sign;
boundary_coeffs[start_index] -= boundary_value * gradient_boundary_coeffs[start_index] * sign;
boundary_coeffs[start_index] -= boundary_value * gradient_boundary_coeffs[start_index] * sign;
}

__global__ void fvm_laplacian_surface_scalar_boundary(int num, int offset,
Expand Down Expand Up @@ -2438,7 +2438,7 @@ void correct_boundary_conditions_scalar(cudaStream_t stream, ncclComm_t comm,
gradient_offset, vf, boundary_cell_face, thermo_gradient, boundary_delta_coeffs, boundary_vf);
gradient_offset += patch_size[i];
} else if (patch_type[i] == boundaryConditions::fixedEnergy) {
GPUThermo->calculateEnthalpyGPU(threads_per_block, patch_size[i], boundary_T, boundary_vf, boundary_y, offset);
GPUThermo->calculateEnthalpyGPU(threads_per_block, patch_size[i], num_boundary_surfaces, boundary_T, boundary_vf, boundary_y, offset);
} else if (patch_type[i] == boundaryConditions::cyclic) {
correct_boundary_conditions_cyclic_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(patch_size[i], offset,
patchSizeOffset[cyclicNeighbor[i]], vf, boundary_cell_face, boundary_weight, boundary_vf);
Expand Down
2 changes: 1 addition & 1 deletion src_gpu/dfThermo.H
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ public:
void calculateRhoDGPU(int threads_per_block, int num_thread, int num_total, const double *T,
const double *T_poly, const double *p, const double *mole_fraction,
const double *mean_mole_weight, const double *rho, double *rhoD, int offset = 0);
void calculateEnthalpyGPU(int thread_per_block, int num_thread, const double *T, double *enthalpy, const double *d_mass_fraction, int offset = 0);
void calculateEnthalpyGPU(int thread_per_block, int num_thread, int num_total, const double *T, double *enthalpy, const double *d_mass_fraction, int offset = 0);
void calculateTemperatureGPU(int thread_per_block, int num_thread, int num_total, const double *T_init, const double *target_h,
double *T, const double *d_mass_fraction, int offset = 0,
double atol = 1e-7, double rtol = 1e-7, int max_iter = 20);
Expand Down
76 changes: 46 additions & 30 deletions src_gpu/dfThermo.cu
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

#define GAS_CANSTANT 8314.46261815324
#define SQRT8 2.8284271247461903
#define NUM_SPECIES 9
#define NUM_SPECIES 7

// constant memory
__constant__ __device__ double d_nasa_coeffs[NUM_SPECIES*15];
Expand Down Expand Up @@ -213,36 +213,44 @@ __global__ void calculate_diffusion_kernel(int num_thread, int num_total, int nu
if (index >= num_thread)
return;

int startIndex = offset + index;
extern __shared__ double shared_data[];
double *mole_fraction_shared = shared_data;

double D[NUM_SPECIES * NUM_SPECIES];
double sum1, sum2;
double tmp;
int startIndex = offset + index;

for (int i = 0; i < num_species; i++) {
for (int j = 0; j < num_species; j++) {
tmp = 0.;
for (int k = 0; k < 5; k++)
tmp += (d_binary_diffusion_coeffs[i * num_species * 5 + j * 5 + k] * T_poly[num_total * k + startIndex]);
D[i * num_species + j] = tmp * pow(T[startIndex], 1.5);
}
mole_fraction_shared[i * blockDim.x + threadIdx.x] = mole_fraction[i * num_total + startIndex];
}

double poly[5];
for (int j = 0; j < 5; j++) {
poly[j] = T_poly[num_total * j + startIndex];
}

double powT = T[startIndex] * sqrt(T[startIndex]);

double local_mean_mole_weight = mean_mole_weight[startIndex];
double local_rho_div_p = rho[startIndex] / p[startIndex];
for (int i = 0; i < num_species; i++) {
if (mole_fraction[num_total * i + startIndex] + 1e-10 > 1.) {
if (mole_fraction_shared[i * blockDim.x + threadIdx.x] + 1e-10 > 1.) {
d[num_total * i + startIndex] = 0.;
continue;
}
sum1 = 0.;
sum2 = 0.;
double sum1 = 0.;
double sum2 = 0.;
for (int j = 0; j < num_species; j++) {
if (i == j) continue;
sum1 += mole_fraction[num_total * j + startIndex] / D[i * num_species + j];
sum2 += mole_fraction[num_total * j + startIndex] * d_molecular_weights[j] / D[i * num_species + j];
// calculate D
double tmp = 0.;
for (int k = 0; k < 5; k++)
tmp += (d_binary_diffusion_coeffs[i * num_species * 5 + j * 5 + k] * poly[k]);
double local_D = tmp * powT;
sum1 += mole_fraction_shared[j * blockDim.x + threadIdx.x] / local_D;
sum2 += mole_fraction_shared[j * blockDim.x + threadIdx.x] * d_molecular_weights[j] / local_D;
}
sum1 *= p[startIndex];
sum2 *= p[startIndex] * mole_fraction[num_total * i + startIndex] /
(mean_mole_weight[startIndex] - mole_fraction[num_total * i + startIndex] * d_molecular_weights[i]);
d[num_total * i + startIndex] = 1 / (sum1 + sum2) * rho[startIndex];
sum2 *= mole_fraction_shared[i * blockDim.x + threadIdx.x] /
(local_mean_mole_weight - mole_fraction_shared[i * blockDim.x + threadIdx.x] * d_molecular_weights[i]);
d[num_total * i + startIndex] = 1 / (sum1 + sum2) * local_rho_div_p;
}
}

Expand Down Expand Up @@ -314,7 +322,7 @@ __global__ void calculate_temperature_kernel(int num_thread, int num_total, int
extern void __global__ correct_internal_boundary_field_scalar(int num, int offset,
const double *vf_internal, const int *face2Cells, double *vf_boundary);

__global__ void calculate_enthalpy_kernel(int num_thread, int offset, int num_cells, int num_species,
__global__ void calculate_enthalpy_kernel(int num_thread, int offset, int num_total, int num_species,
const double *T, const double *mass_fraction, double *enthalpy)
{
int index = blockDim.x * blockIdx.x + threadIdx.x;
Expand All @@ -323,7 +331,7 @@ __global__ void calculate_enthalpy_kernel(int num_thread, int offset, int num_ce

int startIndex = index + offset;

enthalpy[startIndex] = calculate_enthalpy_device_kernel(num_cells, num_species, startIndex, T[startIndex], mass_fraction);
enthalpy[startIndex] = calculate_enthalpy_device_kernel(num_total, num_species, startIndex, T[startIndex], mass_fraction);
}

__global__ void calculate_psip0_kernel(int num_thread, int offset, const double *p, const double *psi, double *psip0)
Expand Down Expand Up @@ -489,7 +497,7 @@ void dfThermo::calculatePsiGPU(int threads_per_block, int num_thread, const doub
void dfThermo::calculateRhoGPU(int threads_per_block, int num_thread, const double *p, const double *psi, double *rho, int offset)
{
size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block;
calculate_rho_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, offset, p, psi, rho);
calculate_rho_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_thread, offset, p, psi, rho);
}

void dfThermo::calculateViscosityGPU(int num_thread, int num_total, const double *T, const double *mole_fraction,
Expand All @@ -514,13 +522,19 @@ void dfThermo::calculateThermoConductivityGPU(int threads_per_block, int num_thr
offset, d_nasa_coeffs, d_y, T_poly, T, mole_fraction, species_thermal_conductivities, thermal_conductivity);
}

void dfThermo::calculateRhoDGPU(int threads_per_block, int num_thread, int num_total, const double *T,
const double *T_poly, const double *p, const double *mole_fraction,
void dfThermo::calculateRhoDGPU(int threads_per_block, int num_thread, int num_total, const double *T,
const double *T_poly, const double *p, const double *mole_fraction,
const double *mean_mole_weight, const double *rho, double *rhoD, int offset)
{
threads_per_block = 32;
size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block;
calculate_diffusion_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_thread, num_total, num_species, offset,
size_t sharedMemSize = sizeof(double) * threads_per_block * num_species;

TICK_INIT_EVENT;
TICK_START_EVENT;
calculate_diffusion_kernel<<<blocks_per_grid, threads_per_block, sharedMemSize, stream>>>(num_thread, num_total, num_species, offset,
T_poly, mole_fraction, p, mean_mole_weight, rho, T, rhoD);
TICK_END_EVENT("calculate_diffusion_kernel");
}

void dfThermo::calculateTemperatureGPU(int threads_per_block, int num_thread, int num_total, const double *T_init, const double *target_h, double *T,
Expand All @@ -532,17 +546,17 @@ void dfThermo::calculateTemperatureGPU(int threads_per_block, int num_thread, in
T_init, target_h, d_mass_fraction, T, atol, rtol, max_iter);
}

void dfThermo::calculateEnthalpyGPU(int threads_per_block, int num_thread, const double *T, double *enthalpy, const double *d_mass_fraction, int offset)
void dfThermo::calculateEnthalpyGPU(int threads_per_block, int num_thread, int num_total, const double *T, double *enthalpy, const double *d_mass_fraction, int offset)
{
size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block;

calculate_enthalpy_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_thread, offset, num_cells, num_species,
calculate_enthalpy_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_thread, offset, num_total, num_species,
T, d_mass_fraction, enthalpy);
}

void dfThermo::updateEnergy()
{
calculateEnthalpyGPU(1024, num_cells, dataBase_.d_T, dataBase_.d_he, dataBase_.d_y);
calculateEnthalpyGPU(1024, num_cells, num_cells, dataBase_.d_T, dataBase_.d_he, dataBase_.d_y);

// int offset = 0;
// for (int i = 0; i < dataBase_.num_patches; i++) {
Expand Down Expand Up @@ -574,6 +588,7 @@ void dfThermo::correctThermo()
setMassFraction(dataBase_.d_y, dataBase_.d_boundary_y);
// internal field
int cell_thread = 512, boundary_thread = 32;
fprintf(stderr, "\n\n");
calculateTemperatureGPU(cell_thread, dataBase_.num_cells, dataBase_.num_cells, dataBase_.d_T, dataBase_.d_he, dataBase_.d_T, dataBase_.d_y); // calculate temperature
calculateTPolyGPU(cell_thread, dataBase_.num_cells, dataBase_.num_cells, dataBase_.d_T, d_T_poly); // calculate T_poly
calculatePsiGPU(cell_thread, dataBase_.num_cells, dataBase_.d_T, d_mean_mole_weight, dataBase_.d_thermo_psi); // calculate psi
Expand All @@ -584,13 +599,14 @@ void dfThermo::correctThermo()
d_species_thermal_conductivities, dataBase_.d_thermo_alpha); // calculate thermal conductivity
calculateRhoDGPU(cell_thread, dataBase_.num_cells, dataBase_.num_cells, dataBase_.d_T, d_T_poly, dataBase_.d_p, d_mole_fraction,
d_mean_mole_weight, dataBase_.d_rho, dataBase_.d_thermo_rhoD);
fprintf(stderr, "\n\n");
// boundary field
int offset = 0;
for (int i = 0; i < dataBase_.num_patches; i++) {
if (dataBase_.patch_size[i] == 0) continue;
if (dataBase_.patch_type_T[i] == boundaryConditions::fixedValue) {
calculateTPolyGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.num_boundary_surfaces, dataBase_.d_boundary_T, d_boundary_T_poly, offset);
calculateEnthalpyGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.d_boundary_T, dataBase_.d_boundary_he,
calculateEnthalpyGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.num_boundary_surfaces, dataBase_.d_boundary_T, dataBase_.d_boundary_he,
dataBase_.d_boundary_y, offset);
calculatePsiGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.d_boundary_T, d_boundary_mean_mole_weight, dataBase_.d_boundary_thermo_psi, offset);
calculateRhoGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.d_boundary_p, dataBase_.d_boundary_thermo_psi, dataBase_.d_boundary_rho, offset);
Expand Down
Loading