diff --git a/src_gpu/CMakeLists.txt b/src_gpu/CMakeLists.txt
index 167055179..57058eeef 100644
--- a/src_gpu/CMakeLists.txt
+++ b/src_gpu/CMakeLists.txt
@@ -6,10 +6,12 @@ cmake_minimum_required(VERSION 3.5)
 project(dfMatrix LANGUAGES CXX CUDA)
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_PREFIX_PATH /root/libtorch)
 find_package(CUDA REQUIRED)
 find_package(MPI REQUIRED)
 find_package(CUDAToolkit REQUIRED)
+find_package(Torch REQUIRED)
 find_library(LIBAMGXSH amgxsh PATHS $ENV{AMGX_DIR}/build)
 add_compile_options(-arch=sm_70 -fmad=false)
@@ -31,12 +33,14 @@ add_library(${PROJECT_NAME}
     dfEEqn.cu
     dfRhoEqn.cu
     dfpEqn.cu
-    dfThermo.cu)
+    dfThermo.cu
+    dfChemistrySolver.cu)
 target_link_libraries(${PROJECT_NAME}
     ${MPI_LIBRARIES}
     ${CUDA_LIBRARIES}
     ${LIBAMGXSH}
+    ${TORCH_LIBRARIES}
     )
 target_compile_options(dfMatrix PUBLIC -g)
 option(DFMATRIX_ENABLE_DETAILED_DEBUG "Enable detailed debug build" OFF)
diff --git a/src_gpu/dfChemistrySolver.H b/src_gpu/dfChemistrySolver.H
new file mode 100644
index 000000000..cbc144a60
--- /dev/null
+++ b/src_gpu/dfChemistrySolver.H
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <torch/script.h>
+#include <cuda_runtime.h>
+#include <string>
+#include <vector>
+
+class dfChemistrySolver
+{
+private:
+    std::vector<torch::jit::script::Module> modules_;
+    torch::Device device_;
+    double *Xmu_, *Xstd_, *Ymu_, *Ystd_;
+
+    double *init_input_, *y_input_BCT;
+    int dim_input_, num_cells_, num_species_;
+public:
+    dfChemistrySolver(int num_cells, int num_species);
+    ~dfChemistrySolver();
+
+    void loadModels(const std::string dir);
+    void loadNormalization(const std::string dir);
+    void Inference(const double *T, const double *p, const double *y,
+                   const double *rho, double *RR);
+};
\ No newline at end of file
diff --git a/src_gpu/dfChemistrySolver.cu b/src_gpu/dfChemistrySolver.cu
new file mode 100644
index 000000000..fd50e811a
--- /dev/null
+++ b/src_gpu/dfChemistrySolver.cu
@@ -0,0 +1,128 @@
+#include "dfChemistrySolver.H"
+
+// Pack [T, p, BCT(y_0..y_{dim-3})] per cell into the row-major network input,
+// and stash the BCT-transformed mass fractions for the rev-BCT step later.
+__global__ void construct_init_input(int num_cells, int dim, const double *T, const double *p,
+                                     const double *y, double *y_input_BCT, double *output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    output[index * dim] = T[index];
+    output[index * dim + 1] = p[index];
+    double y_BCT;
+    for (int i = 0; i < dim - 2; ++i) {
+        y_BCT = (pow(y[i * num_cells + index], 0.1) - 1) * 10; // BCT: lambda = 0.1
+        output[index * dim + 2 + i] = y_BCT;
+        y_input_BCT[i * num_cells + index] = y_BCT;
+    }
+}
+
+// Standardize each input column with the per-feature mean/std (in-place safe).
+__global__ void normalize_input(int num_cells, int dim, const double *input,
+                                const double *Xmu, const double *Xstd, double *output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    for (int i = 0; i < dim; ++i) {
+        output[index * dim + i] = (input[index * dim + i] - Xmu[i]) / Xstd[i];
+    }
+}
+
+// De-normalize the network output, add the BCT input, and invert the BCT.
+__global__ void calculate_y_new(int num_cells, int num_species, const double *output_init,
+                                const double *y_input_BCT, const double *Ymu, const double *Ystd, double *output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    double RR_tmp;
+    for (int i = 0; i < num_species; ++i) {
+        RR_tmp = output_init[i * num_cells + index] * Ystd[i] + Ymu[i] + y_input_BCT[i * num_cells + index];
+        RR_tmp = pow((RR_tmp * 0.1 + 1), 10); // rev-BCT: lambda = 0.1
+        output[i * num_cells + index] = RR_tmp;
+    }
+}
+
+// Renormalize predicted mass fractions to sum to one, then convert the change
+// over delta_t into a reaction rate RR = rho * (y_new - y_old) / delta_t.
+__global__ void calculate_RR(int num_cells, int num_species, double delta_t,
+                             const double *rho, const double *y_old, double *y_new, double *RR)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    // normalize
+    double y_ave = 0.;
+    for (int i = 0; i < num_species; ++i) {
+        y_ave += y_new[i * num_cells + index];
+    }
+    for (int i = 0; i < num_species; ++i) {
+        y_new[i * num_cells + index] = y_new[i * num_cells + index] / y_ave;
+        RR[i * num_cells + index] = (y_new[i * num_cells + index] - y_old[i * num_cells + index]) * rho[index] / delta_t;
+    }
+}
+
+dfChemistrySolver::dfChemistrySolver(int
num_cells, int num_species)
+    : device_(torch::kCUDA), num_cells_(num_cells), num_species_(num_species)
+{
+    dim_input_ = num_species + 2; // p, T, y
+    cudaMalloc(&init_input_, sizeof(double) * num_cells * dim_input_);
+    cudaMalloc(&y_input_BCT, sizeof(double) * num_cells * num_species_);
+    cudaMalloc(&Xmu_, sizeof(double) * dim_input_);
+    cudaMalloc(&Xstd_, sizeof(double) * dim_input_);
+    cudaMalloc(&Ymu_, sizeof(double) * num_species_);
+    cudaMalloc(&Ystd_, sizeof(double) * num_species_);
+    modules_.reserve(num_species_);
+
+    // now norm paras are set in constructor manually
+    at::TensorOptions opts = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA);
+    std::vector<double> Xmu_vec = {2.1996457626e+03, 1.0281762632e+05, -5.2533840071e+00, // NOTE(review): 9 entries assumes dim_input_ == 9 (7 species) — confirm against caller
+                                   -4.5623405933e+00, -1.5305375824e+00, -3.7453838144e+00,
+                                   -3.3097212360e+00, -4.3339657954e+00, -2.9028421546e-01};
+    std::vector<double> Xstd_vec = {3.8304388695e+02, 7.1905832810e+02, 4.6788389177e-01,
+                                    5.3034657693e-01, 5.0325864222e-01, 5.1058993718e-01,
+                                    7.9704668543e-01, 5.1327160125e-01, 4.6827591193e-03};
+    std::vector<double> Ymu_vec = {0.0085609265, -0.0082998877, 0.0030108739,
+                                   -0.0067360325, 0.0037464590, 0.0024258509};
+    std::vector<double> Ystd_vec = {0.0085609265, -0.0082998877, 0.0030108739, // FIXME: identical to Ymu_vec and contains negative values — a std-dev vector cannot be negative; fill in the real Ystd
+                                    -0.0067360325, 0.0037464590, 0.0024258509};
+
+    cudaMemcpy(Xmu_, Xmu_vec.data(), sizeof(double) * dim_input_, cudaMemcpyHostToDevice);
+    cudaMemcpy(Xstd_, Xstd_vec.data(), sizeof(double) * dim_input_, cudaMemcpyHostToDevice);
+    cudaMemcpy(Ymu_, Ymu_vec.data(), sizeof(double) * num_species_, cudaMemcpyHostToDevice);
+    cudaMemcpy(Ystd_, Ystd_vec.data(), sizeof(double) * num_species_, cudaMemcpyHostToDevice);
+}
+
+dfChemistrySolver::~dfChemistrySolver() {
+    cudaFree(init_input_); cudaFree(y_input_BCT); cudaFree(Xmu_); cudaFree(Xstd_); cudaFree(Ymu_); cudaFree(Ystd_); // free ALL buffers allocated in the constructor (was freeing only init_input_, leaking the rest)
+}
+
+void dfChemistrySolver::Inference(const double *T, const double *p, const double *y,
+                                  const double *rho, double *RR) {
+    // construct input
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells_ + threads_per_block - 1) / threads_per_block;
+    construct_init_input<<<blocks_per_grid, threads_per_block>>>(num_cells_, dim_input_, T, p, y, y_input_BCT, init_input_);
+    normalize_input<<<blocks_per_grid, threads_per_block>>>(num_cells_, dim_input_, init_input_, Xmu_, Xstd_, init_input_);
+
+    // inference by torch; dtype must be kDouble — from_blob defaults to float, which
+    // would misinterpret the double buffer (was passing only the device)
+    at::Tensor torch_input = torch::from_blob(init_input_, {num_cells_, dim_input_}, at::TensorOptions().dtype(at::kDouble).device(device_));
+    torch_input = torch_input.to(at::kFloat);
+    std::vector<torch::jit::IValue> INPUTS;
+    INPUTS.push_back(torch_input);
+
+    std::vector<at::Tensor> output(num_species_);
+    for (int i = 0; i < num_species_; ++i) {
+        output[i] = modules_[i].forward(INPUTS).toTensor();
+        output[i] = output[i].to(at::kDouble);
+    }
+
+    // post process
+    double *d_output;
+    for (int i = 0; i < num_species_; ++i) {
+        d_output = output[i].data_ptr<double>();
+        cudaMemcpy(RR + i * num_cells_, d_output, sizeof(double) * num_cells_, cudaMemcpyDeviceToDevice);
+    }
+    calculate_y_new<<<blocks_per_grid, threads_per_block>>>(num_cells_, num_species_, RR, y_input_BCT, Ymu_, Ystd_, RR);
+    calculate_RR<<<blocks_per_grid, threads_per_block>>>(num_cells_, num_species_, 1e-6, rho, y, RR, RR); // FIXME: delta_t hard-coded to 1e-6 — should come from the solver time step
+}
\ No newline at end of file
diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H
index 4ebecfa6c..40cf32bae 100644
--- a/src_gpu/dfMatrixDataBase.H
+++ b/src_gpu/dfMatrixDataBase.H
@@ -226,6 +226,9 @@ struct dfMatrixDataBase
     // internal fields used between eqns
     double *d_rAU = nullptr;
     double *d_HbyA = nullptr;
+    // turbulence fields
+    double *d_turbulence_k = nullptr;
+    double *d_turbulence_epsilon = nullptr;
 
     // non-constant fields - boundary
     // TODO: further estimate
diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu
index 64dfd99fb..fd87cb46a 100644
--- a/src_gpu/dfMatrixDataBase.cu
+++ b/src_gpu/dfMatrixDataBase.cu
@@ -372,6 +372,10 @@ void dfMatrixDataBase::createNonConstantFieldsInternal() {
     checkCudaErrors(cudaMalloc((void**)&d_T, cell_value_bytes));
     checkCudaErrors(cudaMalloc((void**)&d_mu, cell_value_bytes));
 
+    // turbulence fields
+    checkCudaErrors(cudaMalloc((void**)&d_turbulence_k, cell_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_turbulence_epsilon, cell_value_bytes));
+
     //
internal fields used between eqns
     checkCudaErrors(cudaMalloc((void**)&d_rAU, cell_value_bytes));
     checkCudaErrors(cudaMalloc((void**)&d_HbyA, cell_value_vec_bytes));
diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H
index ef3c8133f..a5f30d987 100644
--- a/src_gpu/dfUEqn.H
+++ b/src_gpu/dfUEqn.H
@@ -45,6 +45,8 @@ private:
     double *h_H_pEqn = nullptr;
     // intermediate fields
     double *d_grad_u = nullptr;
+    double *d_delta = nullptr;
+    double *d_rho_nueff = nullptr;
     double *d_u_host_order = nullptr;
     double *d_fvc_output = nullptr; // TODO: no need anymore
@@ -133,6 +135,8 @@ public:
             const int *cyclicNeighbor, const int *patchSizeOffset,
             double *HbyA, double *boundary_HbyA);
     void getHbyA();
+    void getTurbulenceKEpsilon_Smagorinsky(cudaStream_t stream, int num_cells, int num_boundary_surfaces,
+            const double *grad_U_tsr, const double *volume, double *delta, double *turbulence_k, double *turbulence_epsilon);
     void correctPsi(double *Psi, double *boundary_psi);
     void ueqn_ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surface, int num_Nz,
             const int* boundary_cell_face, const int *ldu_to_csr_index, const int *diag_to_csr_index,
diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu
index 131c566d8..325589f1f 100644
--- a/src_gpu/dfUEqn.cu
+++ b/src_gpu/dfUEqn.cu
@@ -302,6 +302,53 @@ __global__ void ueqn_divide_cell_volume_vec(int num_cells, const double* volume,
     output[num_cells * 2 + index] = output[num_cells * 2 + index] / vol;
 }
 
+// Per-cell Smagorinsky SGS kinetic energy k from the velocity gradient tensor;
+// also writes the filter width delta = V^(1/3) for the epsilon kernel.
+__global__ void ueqn_calculate_turbulence_k_Smagorinsky(int num_cells,
+    const double *grad_U_tsr, const double *volume, double Ce, double Ck,
+    double *delta, double *output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    double vol = volume[index];
+    double del = pow(vol, 1.0 / 3.0); // BUG FIX: was pow(vol, 1/3) — integer division made the exponent 0, so delta was always 1.0
+
+    // D = 0.5*(T+T^T)
+    double D_xx = grad_U_tsr[num_cells * 0 + index];
+    double D_xy = 0.5 * (grad_U_tsr[num_cells * 1 + index] + grad_U_tsr[num_cells * 3 + index]);
+    double D_xz = 0.5 * (grad_U_tsr[num_cells * 2 + index] + grad_U_tsr[num_cells * 6 + index]);
+    double D_yy = grad_U_tsr[num_cells * 4 + index];
+    double D_yz = 0.5 * (grad_U_tsr[num_cells * 5 + index] + grad_U_tsr[num_cells * 7 + index]);
+    double D_zz = grad_U_tsr[num_cells * 8 + index];
+
+    // dev(D)
+    double trace = D_xx + D_yy + D_zz;
+    double dev_D_xx = D_xx - (1. / 3.) * trace;
+    double dev_D_yy = D_yy - (1. / 3.) * trace;
+    double dev_D_zz = D_zz - (1. / 3.) * trace;
+
+    // scalar a
+    double a = Ce * del;
+    // scalar b
+    double b = 1.5 * trace;
+    // scalar c
+    double c = 2 * Ck * del * (dev_D_xx * D_xx + dev_D_yy * D_yy + dev_D_zz * D_zz
+        + D_xy * D_xy * 2 + D_xz * D_xz * 2 + D_yz * D_yz * 2);
+
+    // k = ((-b + sqrt(b^2 + 4ac)) / (2a))^2 — positive root of the quadratic
+    output[index] = pow((-b + pow(b * b + 4 * a * c, 0.5)) / (2 * a), 2);
+    delta[index] = del;
+}
+
+// epsilon = Ce * k^(3/2) / delta (Smagorinsky dissipation estimate).
+__global__ void ueqn_calculate_turbulence_epsilon_Smagorinsky(int num_cells,
+    const double *k, const double *delta, double Ce, double *output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    output[index] = Ce * pow(k[index], 1.5) / delta[index];
+}
+
 void dfUEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path) {
     this->stream = dataBase_.stream;
     this->mode_string = mode_string;
@@ -334,6 +381,8 @@ void dfUEqn::createNonConstantFieldsInternal() {
     checkCudaErrors(cudaMalloc((void**)&d_nu_eff, dataBase_.cell_value_bytes));
     // intermediate fields
     checkCudaErrors(cudaMalloc((void**)&d_grad_u, dataBase_.cell_value_tsr_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_delta, dataBase_.cell_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_rho_nueff, dataBase_.cell_value_bytes));
     checkCudaErrors(cudaMalloc((void**)&d_fvc_output, dataBase_.cell_value_vec_bytes));
 #endif
@@ -441,6 +490,8 @@ void dfUEqn::process() {
     checkCudaErrors(cudaMallocAsync((void**)&d_nu_eff, dataBase_.cell_value_bytes, dataBase_.stream));
     // intermediate fields
     checkCudaErrors(cudaMallocAsync((void**)&d_grad_u,
dataBase_.cell_value_tsr_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_delta, dataBase_.cell_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_rho_nueff, dataBase_.cell_value_bytes, dataBase_.stream));
     checkCudaErrors(cudaMallocAsync((void**)&d_fvc_output, dataBase_.cell_value_vec_bytes, dataBase_.stream));
 
@@ -478,6 +529,8 @@ void dfUEqn::process() {
     checkCudaErrors(cudaMemsetAsync(d_grad_u, 0, dataBase_.cell_value_tsr_bytes, dataBase_.stream));
     checkCudaErrors(cudaMemsetAsync(d_boundary_grad_u, 0, dataBase_.boundary_surface_value_tsr_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMemsetAsync(d_delta, 0, dataBase_.cell_value_bytes, dataBase_.stream));
+
     // permute_vector_h2d(dataBase_.stream, dataBase_.num_cells, d_u_host_order, dataBase_.d_u);
     // permute_vector_h2d(dataBase_.stream, dataBase_.num_boundary_surfaces, d_boundary_u_host_order, dataBase_.d_boundary_u);
     update_boundary_coeffs_vector(dataBase_.stream, dataBase_.num_boundary_surfaces, dataBase_.num_patches,
@@ -513,13 +566,16 @@ void dfUEqn::process() {
             dataBase_.d_boundary_face_cell, dataBase_.d_boundary_u, dataBase_.d_boundary_sf,
             dataBase_.d_boundary_weight, dataBase_.d_volume, dataBase_.d_boundary_mag_sf, d_boundary_grad_u,
             dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_delta_coeffs);
+    // calculate k & epsilon (if use turbulence model)
+    getTurbulenceKEpsilon_Smagorinsky(dataBase_.stream, dataBase_.num_cells, dataBase_.num_boundary_surfaces, d_grad_u, dataBase_.d_volume,
+            d_delta, dataBase_.d_turbulence_k, dataBase_.d_turbulence_epsilon);
     scale_dev2T_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.d_mu, d_grad_u, // end for internal
-            dataBase_.num_boundary_surfaces, dataBase_.d_boundary_mu, d_boundary_grad_u);
+            dataBase_.num_boundary_surfaces, dataBase_.d_boundary_mu, d_boundary_grad_u);
     fvc_div_cell_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces,
-            dataBase_.d_owner, dataBase_.d_neighbor,
-            dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_source, // end for internal
-            dataBase_.num_patches, dataBase_.patch_size.data(), dataBase_.patch_type_calculated.data(), dataBase_.d_boundary_weight,
-            dataBase_.d_boundary_face_cell, d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume);
+            dataBase_.d_owner, dataBase_.d_neighbor,
+            dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_source, // end for internal
+            dataBase_.num_patches, dataBase_.patch_size.data(), dataBase_.patch_type_calculated.data(), dataBase_.d_boundary_weight,
+            dataBase_.d_boundary_face_cell, d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume);
     fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces,
             dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_source,
@@ -573,6 +629,7 @@ void dfUEqn::process() {
     checkCudaErrors(cudaFreeAsync(d_nu_eff, dataBase_.stream));
     // intermediate fields
     checkCudaErrors(cudaFreeAsync(d_grad_u, dataBase_.stream));
+    checkCudaErrors(cudaFreeAsync(d_delta, dataBase_.stream));
     checkCudaErrors(cudaFreeAsync(d_rho_nueff, dataBase_.stream));
     checkCudaErrors(cudaFreeAsync(d_fvc_output, dataBase_.stream));
 
@@ -651,6 +708,19 @@ void dfUEqn::getrAU(cudaStream_t stream, ncclComm_t comm, int num_cells, int num
             boundary_cell_face, rAU, boundary_rAU, dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_weight);
 }
 
+// Host launcher: fill turbulence_k, turbulence_epsilon and delta for all cells.
+// Launches on the caller-supplied stream so ordering with the rest of UEqn holds.
+void dfUEqn::getTurbulenceKEpsilon_Smagorinsky(cudaStream_t stream, int num_cells, int num_boundary_surfaces,
+        const double *grad_U_tsr, const double *volume,
+        double *delta, double *turbulence_k, double *turbulence_epsilon)
+{
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    ueqn_calculate_turbulence_k_Smagorinsky<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, grad_U_tsr, volume,
+            1.048, 0.094, delta, turbulence_k); // Ce = 1.048, Ck = 0.094
+    ueqn_calculate_turbulence_epsilon_Smagorinsky<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, turbulence_k,
+            delta, 1.048, turbulence_epsilon);
+}
+
 void dfUEqn::UEqnGetHbyA(cudaStream_t stream, ncclComm_t comm, const int *neighbor_peer,
         int num_cells, int num_surfaces, int num_boundary_surfaces,
         const int *lowerAddr, const int *upperAddr,
         const double *volume, const double *u,
diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu
index c81977b9c..d68ad23ff 100644
--- a/src_gpu/dfYEqn.cu
+++ b/src_gpu/dfYEqn.cu
@@ -555,7 +555,6 @@ void dfYEqn::process() {
     // to compare yi, you should only open DEBUG_ in src_gpu.
     // Besides, if you compare ldu data, be patient to keep specie_index in YEqn.H and dfYEqn.cu the same.
     // #define DEBUG_CHECK_LDU
-    int solverIndex = 0;
 #if defined DEBUG_CHECK_LDU
     int specie_index = 0;
     for (int s = specie_index; s < specie_index + 1; s++) {
@@ -611,7 +610,6 @@ void dfYEqn::process() {
             dataBase_.num_Nz, dataBase_.d_boundary_face_cell, dataBase_.d_ldu_to_csr_index,
             dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), d_ldu, d_source, d_internal_coeffs, d_boundary_coeffs, d_A);
         // TODO with solver of database_, solverIndex is no need any more.
-        //solve(solverIndex, s);
         //solverIndex ++;
         solve(s);
 #endif