Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion src_gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@ cmake_minimum_required(VERSION 3.5)
project(dfMatrix LANGUAGES CXX CUDA)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_PREFIX_PATH /root/libtorch)

find_package(CUDA REQUIRED)
find_package(MPI REQUIRED)
find_package(CUDAToolkit REQUIRED)
find_package(Torch REQUIRED)
find_library(LIBAMGXSH amgxsh PATHS $ENV{AMGX_DIR}/build)

add_compile_options(-arch=sm_70 -fmad=false)
Expand All @@ -31,12 +33,14 @@ add_library(${PROJECT_NAME}
dfEEqn.cu
dfRhoEqn.cu
dfpEqn.cu
dfThermo.cu)
dfThermo.cu
dfChemistrySolver.cu)

target_link_libraries(${PROJECT_NAME}
${MPI_LIBRARIES}
${CUDA_LIBRARIES}
${LIBAMGXSH}
${TORCH_LIBRARIES}
)
target_compile_options(dfMatrix PUBLIC -g)
option(DFMATRIX_ENABLE_DETAILED_DEBUG "Enable detailed debug build" OFF)
Expand Down
25 changes: 25 additions & 0 deletions src_gpu/dfChemistrySolver.H
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#pragma once

#include <stdio.h>
#include <unistd.h>
#include <cuda_runtime.h>
#include <torch/script.h>

// GPU chemistry source-term solver: computes species reaction rates (RR) by
// running one TorchScript neural-network model per species on the GPU,
// instead of integrating a stiff chemistry ODE system. Inputs are Box-Cox
// transformed (BCT) and z-score normalized before inference; outputs are
// de-normalized, inverse-BCT'd and turned into rates in Inference().
class dfChemistrySolver
{
private:
// one TorchScript module per species (filled by loadModels)
std::vector<torch::jit::script::Module> modules_;
// inference device; constructor hard-codes torch::kCUDA
torch::Device device_;
// device buffers holding normalization statistics:
// Xmu_/Xstd_ for the NN input (length dim_input_),
// Ymu_/Ystd_ for the NN output (length num_species_)
double *Xmu_, *Xstd_, *Ymu_, *Ystd_;

// device scratch buffers: assembled NN input (num_cells_ x dim_input_,
// row-major) and the BCT-transformed mass fractions (species-major planar)
double *init_input_, *y_input_BCT;
// dim_input_ = num_species_ + 2 (T, p, then species mass fractions)
int dim_input_, num_cells_, num_species_;
public:
dfChemistrySolver(int num_cells, int num_species);
~dfChemistrySolver();

// NOTE(review): loadModels/loadNormalization are declared but no definition
// is visible in this diff — confirm they are implemented before use
void loadModels(const std::string dir);
void loadNormalization(const std::string dir);
// T, p, rho: per-cell fields; y: species-major mass fractions
// (y[s * num_cells + c]); RR (output): species-major reaction rates
void Inference(const double *T, const double *p, const double *y,
const double *rho, double *RR);
};
128 changes: 128 additions & 0 deletions src_gpu/dfChemistrySolver.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
#include "dfChemistrySolver.H"

// Assemble the per-cell NN input row [T, p, BCT(y_0 .. y_{dim-3})] into
// `output` (row-major, `dim` values per cell) and stash the Box-Cox
// transformed mass fractions into `y_input_BCT` (species-major planar
// layout) so the inverse transform can reuse them later.
// One thread per cell; launch with >= num_cells threads.
__global__ void construct_init_input(int num_cells, int dim, const double *T, const double *p,
    const double *y, double *y_input_BCT, double *output)
{
    int cell = blockDim.x * blockIdx.x + threadIdx.x;
    if (cell >= num_cells)
        return;

    double *row = output + cell * dim;
    row[0] = T[cell];
    row[1] = p[cell];

    int num_species = dim - 2;
    for (int s = 0; s < num_species; ++s) {
        // Box-Cox transform with lambda = 0.1: (y^lambda - 1) / lambda
        double y_BCT = (pow(y[s * num_cells + cell], 0.1) - 1) * 10;
        row[2 + s] = y_BCT;
        y_input_BCT[s * num_cells + cell] = y_BCT;
    }
}

// Z-score normalize each cell's input row: out[c][i] = (in[c][i] - Xmu[i]) / Xstd[i].
// `input` and `output` may alias — the caller passes the same buffer for both,
// and each element is read exactly once before being overwritten.
// One thread per cell; launch with >= num_cells threads.
__global__ void normalize_input(int num_cells, int dim, const double *input,
    const double *Xmu, const double *Xstd, double *output)
{
    int cell = blockDim.x * blockIdx.x + threadIdx.x;
    if (cell >= num_cells)
        return;

    const double *in_row = input + cell * dim;
    double *out_row = output + cell * dim;
    for (int i = 0; i < dim; ++i)
        out_row[i] = (in_row[i] - Xmu[i]) / Xstd[i];
}

// De-normalize the NN output (Ystd/Ymu), add back the stored BCT-space input,
// and apply the inverse Box-Cox transform to recover the new mass fractions.
// All arrays are species-major planar: a[s * num_cells + c].
// `output_init` and `output` may alias (the caller passes RR for both); each
// element is fully read before the same element is written, so this is safe.
__global__ void calculate_y_new(int num_cells, int num_species, const double *output_init,
const double *y_input_BCT, const double *Ymu, const double *Ystd, double *output)
{
int index = blockDim.x * blockIdx.x + threadIdx.x;
if (index >= num_cells)
return;

double RR_tmp;
for (int i = 0; i < num_species; ++i) {
// the network predicts a normalized BCT-space delta: undo normalization, add BCT input
RR_tmp = output_init[i * num_cells + index] * Ystd[i] + Ymu[i] + y_input_BCT[i * num_cells + index];
// inverse Box-Cox: (lambda*x + 1)^(1/lambda)
RR_tmp = pow((RR_tmp * 0.1 + 1), 10); // rev-BCT: lambda = 0.1
output[i * num_cells + index] = RR_tmp;
}
}

// Renormalize the predicted mass fractions so they sum to 1 per cell, then
// convert them into reaction rates: RR = (y_new - y_old) * rho / delta_t.
// All species arrays are species-major planar: a[s * num_cells + c].
// `y_new` and `RR` may alias (the caller passes RR for both); within each
// iteration the element is read before the same element is written, so the
// in-place update is well-defined.
// NOTE(review): no guard against y_ave == 0 (all-zero predictions would
// produce NaN/Inf) — confirm inputs cannot be all zero, or add a floor.
__global__ void calculate_RR(int num_cells, int num_species, double delta_t,
const double *rho, const double *y_old, double *y_new, double *RR)
{
int index = blockDim.x * blockIdx.x + threadIdx.x;
if (index >= num_cells)
return;

// normalize
double y_ave = 0.;
for (int i = 0; i < num_species; ++i) {
y_ave += y_new[i * num_cells + index];
}
for (int i = 0; i < num_species; ++i) {
y_new[i * num_cells + index] = y_new[i * num_cells + index] / y_ave;
RR[i * num_cells + index] = (y_new[i * num_cells + index] - y_old[i * num_cells + index]) * rho[index] / delta_t;
}
}

// Construct the solver: allocate device scratch/statistics buffers and upload
// hard-coded normalization statistics for the NN input (X*) and output (Y*).
// NOTE(review): cudaMalloc/cudaMemcpy return codes are unchecked here, while
// the rest of the project wraps CUDA calls in checkCudaErrors — consider
// aligning once that macro is available in this translation unit.
dfChemistrySolver::dfChemistrySolver(int num_cells, int num_species)
: device_(torch::kCUDA), num_cells_(num_cells), num_species_(num_species)
{
dim_input_ = num_species + 2; // p, T, y
cudaMalloc(&init_input_, sizeof(double) * num_cells * dim_input_);
cudaMalloc(&y_input_BCT, sizeof(double) * num_cells * num_species_);
cudaMalloc(&Xmu_, sizeof(double) * dim_input_);
cudaMalloc(&Xstd_, sizeof(double) * dim_input_);
cudaMalloc(&Ymu_, sizeof(double) * num_species_);
cudaMalloc(&Ystd_, sizeof(double) * num_species_);
modules_.reserve(num_species_);

// now norm paras are set in constructor manually
// NOTE(review): `opts` is unused below — the memcpys upload raw host vectors.
at::TensorOptions opts = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA);
// NOTE(review): Xmu_vec/Xstd_vec have 9 entries but the copies below transfer
// dim_input_ = num_species + 2 elements, while Ymu_vec/Ystd_vec have 6 entries
// and num_species_ elements are copied. These only line up for one specific
// mechanism size — out-of-range reads occur for larger num_species; verify.
std::vector<double> Xmu_vec = {2.1996457626e+03, 1.0281762632e+05, -5.2533840071e+00,
-4.5623405933e+00, -1.5305375824e+00, -3.7453838144e+00,
-3.3097212360e+00, -4.3339657954e+00, -2.9028421546e-01};
std::vector<double> Xstd_vec = {3.8304388695e+02, 7.1905832810e+02, 4.6788389177e-01,
5.3034657693e-01, 5.0325864222e-01, 5.1058993718e-01,
7.9704668543e-01, 5.1327160125e-01, 4.6827591193e-03};
std::vector<double> Ymu_vec = {0.0085609265, -0.0082998877, 0.0030108739,
-0.0067360325, 0.0037464590, 0.0024258509};
// NOTE(review): Ystd_vec is byte-identical to Ymu_vec and contains negative
// values — a standard deviation cannot be negative. This looks like a
// copy-paste error; confirm against the training normalization statistics.
std::vector<double> Ystd_vec = {0.0085609265, -0.0082998877, 0.0030108739,
-0.0067360325, 0.0037464590, 0.0024258509};

cudaMemcpy(Xmu_, Xmu_vec.data(), sizeof(double) * dim_input_, cudaMemcpyHostToDevice);
cudaMemcpy(Xstd_, Xstd_vec.data(), sizeof(double) * dim_input_, cudaMemcpyHostToDevice);
cudaMemcpy(Ymu_, Ymu_vec.data(), sizeof(double) * num_species_, cudaMemcpyHostToDevice);
cudaMemcpy(Ystd_, Ystd_vec.data(), sizeof(double) * num_species_, cudaMemcpyHostToDevice);
}

// Release every device buffer allocated in the constructor. The original
// destructor freed only init_input_, leaking y_input_BCT and the four
// normalization-statistics buffers.
dfChemistrySolver::~dfChemistrySolver() {
    cudaFree(init_input_);
    cudaFree(y_input_BCT);
    cudaFree(Xmu_);
    cudaFree(Xstd_);
    cudaFree(Ymu_);
    cudaFree(Ystd_);
}

// Compute species reaction rates from the current state.
//   T, p, rho : per-cell device fields (length num_cells_)
//   y         : species-major mass fractions, y[s * num_cells_ + c]
//   RR (out)  : species-major reaction rates, same layout
// Pipeline: assemble [T, p, BCT(y)] -> z-score normalize -> one NN forward
// pass per species -> de-normalize + inverse BCT -> renormalize & rate.
void dfChemistrySolver::Inference(const double *T, const double *p, const double *y,
    const double *rho, double *RR) {
    // construct input: [T, p, BCT(y)] per cell; BCT(y) is kept for the inverse transform
    size_t threads_per_block = 1024;
    size_t blocks_per_grid = (num_cells_ + threads_per_block - 1) / threads_per_block;
    construct_init_input<<<blocks_per_grid, threads_per_block>>>(num_cells_, dim_input_, T, p, y, y_input_BCT, init_input_);
    // in-place z-score normalization of the assembled input
    normalize_input<<<blocks_per_grid, threads_per_block>>>(num_cells_, dim_input_, init_input_, Xmu_, Xstd_, init_input_);

    // inference by torch: wrap the device buffer without copying. The buffer
    // holds doubles, so the dtype must be stated explicitly — passing only the
    // device gives the default float32 dtype and the data would be
    // reinterpreted as garbage.
    at::Tensor torch_input = torch::from_blob(init_input_, {num_cells_, dim_input_},
        at::TensorOptions().dtype(at::kDouble).device(device_));
    torch_input = torch_input.to(at::kFloat); // models run in float32
    std::vector<torch::jit::IValue> inputs;
    inputs.push_back(torch_input);

    std::vector<at::Tensor> output(num_species_);
    for (int i = 0; i < num_species_; ++i) {
        output[i] = modules_[i].forward(inputs).toTensor();
        // back to double (and a fresh contiguous tensor) for the kernels below
        output[i] = output[i].to(at::kDouble);
    }

    // post process: gather the per-species predictions into RR (species-major)
    for (int i = 0; i < num_species_; ++i) {
        double *d_output = output[i].data_ptr<double>();
        cudaMemcpy(RR + i * num_cells_, d_output, sizeof(double) * num_cells_, cudaMemcpyDeviceToDevice);
    }
    // de-normalize + inverse BCT, in place (RR is both input and output)
    calculate_y_new<<<blocks_per_grid, threads_per_block>>>(num_cells_, num_species_, RR, y_input_BCT, Ymu_, Ystd_, RR);
    // RR = (y_new - y_old) * rho / dt
    // NOTE(review): dt is hard-coded to 1e-6 s — confirm against the solver time step
    calculate_RR<<<blocks_per_grid, threads_per_block>>>(num_cells_, num_species_, 1e-6, rho, y, RR, RR);
}
3 changes: 3 additions & 0 deletions src_gpu/dfMatrixDataBase.H
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,9 @@ struct dfMatrixDataBase
// internal fields used between eqns
double *d_rAU = nullptr;
double *d_HbyA = nullptr;
// turbulence fields
double *d_turbulence_k = nullptr;
double *d_turbulence_epsilon = nullptr;

// non-constant fields - boundary
// TODO: further estimate
Expand Down
4 changes: 4 additions & 0 deletions src_gpu/dfMatrixDataBase.cu
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,10 @@ void dfMatrixDataBase::createNonConstantFieldsInternal() {
checkCudaErrors(cudaMalloc((void**)&d_T, cell_value_bytes));
checkCudaErrors(cudaMalloc((void**)&d_mu, cell_value_bytes));

// turbulence fields
checkCudaErrors(cudaMalloc((void**)&d_turbulence_k, cell_value_bytes));
checkCudaErrors(cudaMalloc((void**)&d_turbulence_epsilon, cell_value_bytes));

// internal fields used between eqns
checkCudaErrors(cudaMalloc((void**)&d_rAU, cell_value_bytes));
checkCudaErrors(cudaMalloc((void**)&d_HbyA, cell_value_vec_bytes));
Expand Down
4 changes: 4 additions & 0 deletions src_gpu/dfUEqn.H
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ private:
double *h_H_pEqn = nullptr;
// intermediate fields
double *d_grad_u = nullptr;
double *d_delta = nullptr;

double *d_rho_nueff = nullptr;
double *d_u_host_order = nullptr;
double *d_fvc_output = nullptr; // TODO: no need anymore
Expand Down Expand Up @@ -133,6 +135,8 @@ public:
const int *cyclicNeighbor, const int *patchSizeOffset,
double *HbyA, double *boundary_HbyA);
void getHbyA();
void getTurbulenceKEpsilon_Smagorinsky(cudaStream_t stream, int num_cells, int num_boundary_surfaces,
const double *grad_U_tsr, const double *volume, double *delta, double *turbulence_k, double *turbulence_epsilon);
void correctPsi(double *Psi, double *boundary_psi);
void ueqn_ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surface, int num_Nz,
const int* boundary_cell_face, const int *ldu_to_csr_index, const int *diag_to_csr_index,
Expand Down
80 changes: 75 additions & 5 deletions src_gpu/dfUEqn.cu
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,53 @@ __global__ void ueqn_divide_cell_volume_vec(int num_cells, const double* volume,
output[num_cells * 2 + index] = output[num_cells * 2 + index] / vol;
}

// Compute the Smagorinsky SGS turbulent kinetic energy k per cell.
//   grad_U_tsr : velocity-gradient tensor, planar layout (9 components,
//                grad_U_tsr[comp * num_cells + c])
//   volume     : cell volumes, used for the filter width delta = V^(1/3)
//   Ce, Ck     : model coefficients
//   delta (out): filter width per cell; output (out): k per cell
// k is obtained from the equilibrium balance a*k + b*sqrt(k) - c = 0 solved
// for sqrt(k) via the quadratic formula, then squared.
// One thread per cell; launch with >= num_cells threads.
__global__ void ueqn_calculate_turbulence_k_Smagorinsky(int num_cells,
    const double *grad_U_tsr, const double *volume, double Ce, double Ck,
    double *delta, double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    double vol = volume[index];
    // BUG FIX: the original pow(vol, 1/3) used integer division (1/3 == 0),
    // so the filter width was always 1.0; cbrt gives the intended V^(1/3).
    double del = cbrt(vol);

    // D = 0.5*(T+T^T)  (symmetric part of the velocity gradient)
    double D_xx = grad_U_tsr[num_cells * 0 + index];
    double D_xy = 0.5 * (grad_U_tsr[num_cells * 1 + index] + grad_U_tsr[num_cells * 3 + index]);
    double D_xz = 0.5 * (grad_U_tsr[num_cells * 2 + index] + grad_U_tsr[num_cells * 6 + index]);
    double D_yy = grad_U_tsr[num_cells * 4 + index];
    double D_yz = 0.5 * (grad_U_tsr[num_cells * 5 + index] + grad_U_tsr[num_cells * 7 + index]);
    double D_zz = grad_U_tsr[num_cells * 8 + index];

    // dev(D) = D - (1/3) tr(D) I  (deviatoric part; only diagonal changes)
    double trace = D_xx + D_yy + D_zz;
    double dev_D_xx = D_xx - (1. / 3.) * trace;
    double dev_D_yy = D_yy - (1. / 3.) * trace;
    double dev_D_zz = D_zz - (1. / 3.) * trace;

    // scalar a
    double a = Ce * del;
    // scalar b
    double b = 1.5 * trace;
    // scalar c: 2*Ck*delta*(dev(D) && D), exploiting symmetry for off-diagonals
    double c = 2 * Ck * del * (dev_D_xx * D_xx + dev_D_yy * D_yy + dev_D_zz * D_zz
        + D_xy * D_xy * 2 + D_xz * D_xz * 2 + D_yz * D_yz * 2);

    // sqrt(k) = (-b + sqrt(b^2 + 4ac)) / (2a); k = sqrt(k)^2
    output[index] = pow((-b + sqrt(b * b + 4 * a * c)) / (2 * a), 2);
    delta[index] = del;
}

// Compute the SGS dissipation rate per cell: epsilon = Ce * k^1.5 / delta.
// One thread per cell; launch with >= num_cells threads.
__global__ void ueqn_calculate_turbulence_epsilon_Smagorinsky(int num_cells,
    const double *k, const double *delta, double Ce, double *output)
{
    int cell = blockDim.x * blockIdx.x + threadIdx.x;
    if (cell < num_cells) {
        double k_cell = k[cell];
        output[cell] = Ce * pow(k_cell, 1.5) / delta[cell];
    }
}

void dfUEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path) {
this->stream = dataBase_.stream;
this->mode_string = mode_string;
Expand Down Expand Up @@ -334,6 +381,8 @@ void dfUEqn::createNonConstantFieldsInternal() {
checkCudaErrors(cudaMalloc((void**)&d_nu_eff, dataBase_.cell_value_bytes));
// intermediate fields
checkCudaErrors(cudaMalloc((void**)&d_grad_u, dataBase_.cell_value_tsr_bytes));
checkCudaErrors(cudaMalloc((void**)&d_delta, dataBase_.cell_value_bytes));

checkCudaErrors(cudaMalloc((void**)&d_rho_nueff, dataBase_.cell_value_bytes));
checkCudaErrors(cudaMalloc((void**)&d_fvc_output, dataBase_.cell_value_vec_bytes));
#endif
Expand Down Expand Up @@ -441,6 +490,8 @@ void dfUEqn::process() {
checkCudaErrors(cudaMallocAsync((void**)&d_nu_eff, dataBase_.cell_value_bytes, dataBase_.stream));
// intermediate fields
checkCudaErrors(cudaMallocAsync((void**)&d_grad_u, dataBase_.cell_value_tsr_bytes, dataBase_.stream));
checkCudaErrors(cudaMallocAsync((void**)&d_delta, dataBase_.cell_value_bytes, dataBase_.stream));

checkCudaErrors(cudaMallocAsync((void**)&d_rho_nueff, dataBase_.cell_value_bytes, dataBase_.stream));
checkCudaErrors(cudaMallocAsync((void**)&d_fvc_output, dataBase_.cell_value_vec_bytes, dataBase_.stream));

Expand Down Expand Up @@ -478,6 +529,8 @@ void dfUEqn::process() {
checkCudaErrors(cudaMemsetAsync(d_grad_u, 0, dataBase_.cell_value_tsr_bytes, dataBase_.stream));
checkCudaErrors(cudaMemsetAsync(d_boundary_grad_u, 0, dataBase_.boundary_surface_value_tsr_bytes, dataBase_.stream));

checkCudaErrors(cudaMemsetAsync(d_delta, 0, dataBase_.cell_value_bytes, dataBase_.stream));

// permute_vector_h2d(dataBase_.stream, dataBase_.num_cells, d_u_host_order, dataBase_.d_u);
// permute_vector_h2d(dataBase_.stream, dataBase_.num_boundary_surfaces, d_boundary_u_host_order, dataBase_.d_boundary_u);
update_boundary_coeffs_vector(dataBase_.stream, dataBase_.num_boundary_surfaces, dataBase_.num_patches,
Expand Down Expand Up @@ -513,13 +566,16 @@ void dfUEqn::process() {
dataBase_.d_boundary_face_cell, dataBase_.d_boundary_u, dataBase_.d_boundary_sf, dataBase_.d_boundary_weight,
dataBase_.d_volume, dataBase_.d_boundary_mag_sf, d_boundary_grad_u, dataBase_.cyclicNeighbor.data(),
dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_delta_coeffs);
// calculate k & epsilon (if use turbulence model)
getTurbulenceKEpsilon_Smagorinsky(dataBase_.stream, dataBase_.num_cells, dataBase_.num_boundary_surfaces, d_grad_u, dataBase_.d_volume,
d_delta, dataBase_.d_turbulence_k, dataBase_.d_turbulence_epsilon);
scale_dev2T_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.d_mu, d_grad_u, // end for internal
dataBase_.num_boundary_surfaces, dataBase_.d_boundary_mu, d_boundary_grad_u);
dataBase_.num_boundary_surfaces, dataBase_.d_boundary_mu, d_boundary_grad_u);
fvc_div_cell_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces,
dataBase_.d_owner, dataBase_.d_neighbor,
dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_source, // end for internal
dataBase_.num_patches, dataBase_.patch_size.data(), dataBase_.patch_type_calculated.data(), dataBase_.d_boundary_weight,
dataBase_.d_boundary_face_cell, d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume);
dataBase_.d_owner, dataBase_.d_neighbor,
dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_source, // end for internal
dataBase_.num_patches, dataBase_.patch_size.data(), dataBase_.patch_type_calculated.data(), dataBase_.d_boundary_weight,
dataBase_.d_boundary_face_cell, d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume);
fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces,
dataBase_.d_owner, dataBase_.d_neighbor,
dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_source,
Expand Down Expand Up @@ -573,6 +629,7 @@ void dfUEqn::process() {
checkCudaErrors(cudaFreeAsync(d_nu_eff, dataBase_.stream));
// intermediate fields
checkCudaErrors(cudaFreeAsync(d_grad_u, dataBase_.stream));
checkCudaErrors(cudaFreeAsync(d_delta, dataBase_.stream));
checkCudaErrors(cudaFreeAsync(d_rho_nueff, dataBase_.stream));
checkCudaErrors(cudaFreeAsync(d_fvc_output, dataBase_.stream));

Expand Down Expand Up @@ -651,6 +708,19 @@ void dfUEqn::getrAU(cudaStream_t stream, ncclComm_t comm, int num_cells, int num
boundary_cell_face, rAU, boundary_rAU, dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_weight);
}

// Launch the Smagorinsky k and epsilon kernels on `stream`. The two launches
// are ordered by the stream, so the epsilon kernel sees the k and delta
// values written by the first kernel.
// NOTE(review): model coefficients Ce = 1.048 and Ck = 0.094 are hard-coded
// here — presumably the OpenFOAM Smagorinsky defaults; confirm and consider
// making them configurable.
// NOTE(review): num_boundary_surfaces is currently unused.
void dfUEqn::getTurbulenceKEpsilon_Smagorinsky(cudaStream_t stream, int num_cells, int num_boundary_surfaces,
const double *grad_U_tsr, const double *volume,
double *delta, double *turbulence_k, double *turbulence_epsilon)
{
size_t threads_per_block = 1024;
size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
ueqn_calculate_turbulence_k_Smagorinsky<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, grad_U_tsr, volume,
1.048, 0.094, delta, turbulence_k);

ueqn_calculate_turbulence_epsilon_Smagorinsky<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, turbulence_k,
delta, 1.048, turbulence_epsilon);
}

void dfUEqn::UEqnGetHbyA(cudaStream_t stream, ncclComm_t comm, const int *neighbor_peer,
int num_cells, int num_surfaces, int num_boundary_surfaces,
const int *lowerAddr, const int *upperAddr, const double *volume, const double *u,
Expand Down
2 changes: 0 additions & 2 deletions src_gpu/dfYEqn.cu
Original file line number Diff line number Diff line change
Expand Up @@ -555,7 +555,6 @@ void dfYEqn::process() {
// to compare yi, you should only open DEBUG_ in src_gpu.
// Besides, if you compare ldu data, be patient to keep specie_index in YEqn.H and dfYEqn.cu the same.
// #define DEBUG_CHECK_LDU
int solverIndex = 0;
#if defined DEBUG_CHECK_LDU
int specie_index = 0;
for (int s = specie_index; s < specie_index + 1; s++) {
Expand Down Expand Up @@ -611,7 +610,6 @@ void dfYEqn::process() {
dataBase_.num_Nz, dataBase_.d_boundary_face_cell, dataBase_.d_ldu_to_csr_index, dataBase_.num_patches,
dataBase_.patch_size.data(), patch_type.data(), d_ldu, d_source, d_internal_coeffs, d_boundary_coeffs, d_A);
// TODO with solver of database_, solverIndex is no need any more.
//solve(solverIndex, s);
//solverIndex ++;
solve(s);
#endif
Expand Down