From f78f459f4ef06f3e0ee56b5fed8c7dcb5ddad613 Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Mon, 30 Oct 2023 15:44:40 +0800 Subject: [PATCH 1/5] run pass turbulence k & epsilon --- src_gpu/dfUEqn.cu | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 325589f1f..f0253e62a 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -311,7 +311,9 @@ __global__ void ueqn_calculate_turbulence_k_Smagorinsky(int num_cells, return; double vol = volume[index]; - double del = pow(vol, 1/3); + double oneThird = (1. / 3.); + + double del = pow(vol, oneThird); // D = 0.5*(T+T^T) double D_xx = grad_U_tsr[num_cells * 0 + index]; @@ -323,19 +325,20 @@ __global__ void ueqn_calculate_turbulence_k_Smagorinsky(int num_cells, // dev(D) double trace = D_xx + D_yy + D_zz; - double dev_D_xx = D_xx - (1. / 3.) * trace; - double dev_D_yy = D_yy - (1. / 3.) * trace; - double dev_D_zz = D_zz - (1. / 3.) * trace; + double dev_D_xx = D_xx - oneThird * trace; + double dev_D_yy = D_yy - oneThird * trace; + double dev_D_zz = D_zz - oneThird * trace; // scalar a - double a = Ce * del; + double a = Ce / del; // scalar b - double b = 1.5 * trace; + double b = 2 * oneThird * trace; // scalar c double c = 2 * Ck * del * (dev_D_xx * D_xx + dev_D_yy * D_yy + dev_D_zz * D_zz + D_xy * D_xy * 2 + D_xz * D_xz * 2 + D_yz * D_yz * 2); - output[index] = pow((-b + pow(b * b + 4 * a * c, 0.5)) / (2 * a), 2); + double sqrt_result = (-b + pow(b * b + 4 * a * c, 0.5)) / (2 * a); + output[index] = sqrt_result * sqrt_result; delta[index] = del; } @@ -566,9 +569,14 @@ void dfUEqn::process() { dataBase_.d_boundary_face_cell, dataBase_.d_boundary_u, dataBase_.d_boundary_sf, dataBase_.d_boundary_weight, dataBase_.d_volume, dataBase_.d_boundary_mag_sf, d_boundary_grad_u, dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_delta_coeffs); - // calculate k & epsilon (if use turbulence model) + + // **if use 
turbulence model** + // calculate k & epsilon getTurbulenceKEpsilon_Smagorinsky(dataBase_.stream, dataBase_.num_cells, dataBase_.num_boundary_surfaces, d_grad_u, dataBase_.d_volume, d_delta, dataBase_.d_turbulence_k, dataBase_.d_turbulence_epsilon); + // calculate nut + // **end use turbulence model** + scale_dev2T_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.d_mu, d_grad_u, // end for internal dataBase_.num_boundary_surfaces, dataBase_.d_boundary_mu, d_boundary_grad_u); fvc_div_cell_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, From 03c73389d5e94bd24b4120e58c3d33543f12f978 Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Tue, 7 Nov 2023 23:23:40 +0800 Subject: [PATCH 2/5] support dynimic NN input size --- src_gpu/dfChemistrySolver.H | 16 +- src_gpu/dfChemistrySolver.cu | 157 ++++++++----- src_gpu/dfChemistrySolver/Layer.H | 88 ++++++++ src_gpu/dfChemistrySolver/Layer.cu | 212 ++++++++++++++++++ .../dfChemistrySolver_cublas.H | 25 +++ .../dfChemistrySolver_cublas.cu | 147 ++++++++++++ 6 files changed, 591 insertions(+), 54 deletions(-) create mode 100644 src_gpu/dfChemistrySolver/Layer.H create mode 100644 src_gpu/dfChemistrySolver/Layer.cu create mode 100644 src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.H create mode 100644 src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.cu diff --git a/src_gpu/dfChemistrySolver.H b/src_gpu/dfChemistrySolver.H index 96a16f521..5a5201399 100644 --- a/src_gpu/dfChemistrySolver.H +++ b/src_gpu/dfChemistrySolver.H @@ -4,23 +4,33 @@ #include #include #include +#include "dfMatrixDataBase.H" class dfChemistrySolver { private: + dfMatrixDataBase &dataBase_; + cudaStream_t stream; std::vector modules_; torch::Device device_; double *Xmu_, *Xstd_, *Ymu_, *Ystd_; - double *init_input_, *y_input_BCT; + double *init_input_, *y_input_BCT, *NN_output_; + int *d_reactCellIndex; int dim_input_, num_cells_, num_species_, num_modules_; int batch_size_; + double unReactT_; + 
int inputsize_; public: - dfChemistrySolver(int num_cells, int num_species, int batch_size); + dfChemistrySolver(dfMatrixDataBase &dataBase) + : device_(torch::kCUDA), dataBase_(dataBase) {}; ~dfChemistrySolver(); + void setConstantValue(int num_cells, int num_species, int batch_size); void loadModels(const std::string dir); void loadNormalization(const std::string dir); - void Inference(const double *T, const double *p, const double *y, + void Inference(const double *h_T, const double *d_T,const double *p, const double *y, const double *rho, double *RR); + + void sync(); }; \ No newline at end of file diff --git a/src_gpu/dfChemistrySolver.cu b/src_gpu/dfChemistrySolver.cu index 935bbb3f9..9c4291064 100644 --- a/src_gpu/dfChemistrySolver.cu +++ b/src_gpu/dfChemistrySolver.cu @@ -1,27 +1,31 @@ #include "dfChemistrySolver.H" +#include "dfMatrixOpBase.H" -__global__ void construct_init_input(int num_cells, int dim, const double *T, const double *p, - const double *y, double *y_input_BCT, double *output) +__global__ void construct_init_input(int num_thread, int num_cells, int dim, const double *T, const double *p, + const int *reactCellIndex, const double *y, double *y_input_BCT, double *output) { int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) + if (index >= num_thread) return; - output[index * dim] = T[index]; - output[index * dim + 1] = p[index]; + int cellIndex = reactCellIndex[index]; + + output[index * dim] = T[cellIndex]; + // output[index * dim + 1] = p[cellIndex]; + output[index * dim + 1] = 101325.; double y_BCT; for (int i = 0; i < dim - 2; ++i) { - y_BCT = (pow(y[i * num_cells + index], 0.1) - 1) * 10; // BCT: lambda = 0.1 + y_BCT = (pow(y[i * num_cells + cellIndex], 0.1) - 1) * 10; // BCT: lambda = 0.1 output[index * dim + 2 + i] = y_BCT; - y_input_BCT[i * num_cells + index] = y_BCT; + y_input_BCT[i * num_thread + index] = y_BCT; } } -__global__ void normalize_input(int num_cells, int dim, const double *input, +__global__ void 
normalize_input(int num_thread, int num_cells, int dim, const double *input, const double *Xmu, const double *Xstd, double *output) { int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) + if (index >= num_thread) return; for (int i = 0; i < dim; ++i) { @@ -29,46 +33,57 @@ __global__ void normalize_input(int num_cells, int dim, const double *input, } } -__global__ void calculate_y_new(int num_cells, int num_modules, const double *output_init, +__global__ void calculate_y_new(int num_thread, int num_modules, const double *output_init, const double *y_input_BCT, const double *Ymu, const double *Ystd, double *output) { int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) + if (index >= num_thread) return; - + double RR_tmp; for (int i = 0; i < num_modules; ++i) { - RR_tmp = output_init[i * num_cells + index] * Ystd[i] + Ymu[i] + y_input_BCT[i * num_cells + index]; + RR_tmp = output_init[i * num_thread + index] * Ystd[i] + Ymu[i] + y_input_BCT[i * num_thread + index]; RR_tmp = pow((RR_tmp * 0.1 + 1), 10); // rev-BCT: lambda = 0.1 - output[i * num_cells + index] = RR_tmp; + output[i * num_thread + index] = RR_tmp; } } -__global__ void calculate_RR(int num_cells, int num_species, double delta_t, - const double *rho, const double *y_old, double *y_new, double *RR) +__global__ void calculate_RR(int num_thread, int num_cells, int num_species, double delta_t, + const int *reactCellIndex, const double *rho, const double *y_old, const double *p, + double *y_NN, double *RR) { int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) + if (index >= num_thread) return; + + int cellIndex = reactCellIndex[index]; // normalize double y_ave = 0.; - for (int i = 0; i < num_species; ++i) { - y_ave += y_new[i * num_cells + index]; + for (int i = 0; i < num_species - 1; ++i) { + y_ave += y_NN[i * num_thread + index]; } - for (int i = 0; i < num_species; ++i) { - y_new[i * num_cells + index] = y_new[i * num_cells + index] 
/ y_ave; - RR[i * num_cells + index] = (y_new[i * num_cells + index] - y_old[i * num_cells + index]) * rho[index] / delta_t; + y_ave += y_old[(num_species - 1) * num_cells + cellIndex]; + for (int i = 0; i < num_species - 1; ++i) { + y_NN[i * num_thread + index] = y_NN[i * num_thread + index] / y_ave; + RR[i * num_cells + cellIndex] = (y_NN[i * num_thread + index] - y_old[i * num_cells + cellIndex]) * rho[cellIndex] + * (p[cellIndex] / 101325.) / delta_t; // correction } } -dfChemistrySolver::dfChemistrySolver(int num_cells, int num_species, int batch_size) - : device_(torch::kCUDA), num_cells_(num_cells), num_species_(num_species), batch_size_(batch_size) -{ +dfChemistrySolver::~dfChemistrySolver() { + cudaFree(init_input_); +} + +void dfChemistrySolver::setConstantValue(int num_cells, int num_species, int batch_size) { + this->num_cells_ = num_cells; + this->num_species_ = num_species; + this->batch_size_ = batch_size; + this->stream = dataBase_.stream; + dim_input_ = num_species + 2; // p, T, y num_modules_ = num_species_ - 1; - cudaMalloc(&init_input_, sizeof(double) * num_cells * dim_input_); - cudaMalloc(&y_input_BCT, sizeof(double) * num_cells * num_species_); + unReactT_ = 610; cudaMalloc(&Xmu_, sizeof(double) * dim_input_); cudaMalloc(&Xstd_, sizeof(double) * dim_input_); cudaMalloc(&Ymu_, sizeof(double) * num_modules_); @@ -77,16 +92,17 @@ dfChemistrySolver::dfChemistrySolver(int num_cells, int num_species, int batch_s // now norm paras are set in constructor manually at::TensorOptions opts = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA); - std::vector Xmu_vec = {2.1996457626e+03, 1.0281762632e+05, -5.2533840071e+00, - -4.5623405933e+00, -1.5305375824e+00, -3.7453838144e+00, - -3.3097212360e+00, -4.3339657954e+00, -2.9028421546e-01}; - std::vector Xstd_vec = {3.8304388695e+02, 7.1905832810e+02, 4.6788389177e-01, - 5.3034657693e-01, 5.0325864222e-01, 5.1058993718e-01, - 7.9704668543e-01, 5.1327160125e-01, 4.6827591193e-03}; - std::vector Ymu_vec 
= {0.0085609265, -0.0082998877, 0.0030108739, - -0.0067360325, 0.0037464590, 0.0024258509}; - std::vector Ystd_vec = {0.05887569, 0.12253898, 0.01643904, - 0.0963155, 0.07127831, 0.04373392}; + std::vector Xmu_vec = {1.2996375154e+03, 1.4349643303e+05, -4.3678815323e+00, + -5.8949183472e+00, -3.8840763486e+00, -5.5436246211e+00, + -6.0178199636e+00, -2.1469850084e+00, -6.9828365432e+00, + -7.7747568654e+00, -1.8571483828e-01}; + std::vector Xstd_vec = {3.9612732767e+02, 1.8822821412e+04, 1.1226048640e+00, 6.8397462420e-01, + 1.8879462146e+00, 1.2433158499e+00, 1.3169176600e+00, 4.3600457243e-01, + 8.1820904505e-01, 8.0471805333e-01, 6.1020187522e-02}; + std::vector Ymu_vec = {-0.0101101322, -0.0138129078, -0.0146349442, -0.0088870325, + -0.0075195178, 0.0020506931, -0.0103104668, -0.0192603020}; + std::vector Ystd_vec = {0.0297933161, 0.0802139099, 0.0230954310, 0.1541940427, + 0.1316836678, 0.0042975580, 0.1476416977, 0.0860471308}; cudaMemcpy(Xmu_, Xmu_vec.data(), sizeof(double) * dim_input_, cudaMemcpyHostToDevice); cudaMemcpy(Xstd_, Xstd_vec.data(), sizeof(double) * dim_input_, cudaMemcpyHostToDevice); @@ -109,24 +125,44 @@ dfChemistrySolver::dfChemistrySolver(int num_cells, int num_species, int batch_s } } -dfChemistrySolver::~dfChemistrySolver() { - cudaFree(init_input_); -} - -void dfChemistrySolver::Inference(const double *T, const double *p, const double *y, +void dfChemistrySolver::Inference(const double *h_T, const double *d_T,const double *p, const double *y, const double *rho, double *RR) { // construct input + inputsize_ = 0; + std::vector reactCellIndex; + for (int i = 0; i < num_cells_; i++) { + if (h_T[i] >= unReactT_) { + reactCellIndex.push_back(i); + } + } + inputsize_ = reactCellIndex.size(); +#ifdef STREAM_ALLOCATOR + checkCudaErrors(cudaMallocAsync((void**)&init_input_, sizeof(double) * inputsize_ * dim_input_, stream)); + checkCudaErrors(cudaMallocAsync((void**)&y_input_BCT, sizeof(double) * inputsize_ * num_species_, stream)); + 
checkCudaErrors(cudaMallocAsync((void**)&NN_output_, sizeof(double) * inputsize_ * num_species_, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_reactCellIndex, sizeof(int) * inputsize_, stream)); + checkCudaErrors(cudaMemcpyAsync(d_reactCellIndex, reactCellIndex.data(), sizeof(int) * inputsize_, cudaMemcpyHostToDevice, stream)); +#else + cudaMalloc(&init_input_, sizeof(double) * inputsize_ * dim_input_); + cudaMalloc(&y_input_BCT, sizeof(double) * inputsize_ * num_species_); + cudaMalloc(&d_reactCellIndex, sizeof(int) * inputsize_); + cudaMemcpy(d_reactCellIndex, reactCellIndex.data(), sizeof(int) * inputsize_, cudaMemcpyHostToDevice); +#endif + // construct input size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_cells_ + threads_per_block - 1) / threads_per_block; - construct_init_input<<>>(num_cells_, dim_input_, T, p, y, y_input_BCT, init_input_); - normalize_input<<>>(num_cells_, dim_input_, init_input_, Xmu_, Xstd_, init_input_); + size_t blocks_per_grid = (inputsize_ + threads_per_block - 1) / threads_per_block; + construct_init_input<<>>(inputsize_, num_cells_, dim_input_, d_T, p, + d_reactCellIndex, y, y_input_BCT, init_input_); + normalize_input<<>>(inputsize_, num_cells_, dim_input_, init_input_, + Xmu_, Xstd_, init_input_); // inference by torch double *d_output; - for (int sample_start = 0; sample_start < num_cells_; sample_start += batch_size_) { - int sample_end = std::min(sample_start + batch_size_, num_cells_); + for (int sample_start = 0; sample_start < inputsize_; sample_start += batch_size_) { + int sample_end = std::min(sample_start + batch_size_, inputsize_); int sample_len = sample_end - sample_start; - at::Tensor torch_input = torch::from_blob(init_input_ + sample_start * dim_input_, {sample_len, dim_input_}, torch::TensorOptions().device(device_).dtype(torch::kDouble)); + at::Tensor torch_input = torch::from_blob(init_input_ + sample_start * dim_input_, {sample_len, dim_input_}, + 
torch::TensorOptions().device(device_).dtype(torch::kDouble)); torch_input = torch_input.to(at::kFloat); std::vector INPUTS; INPUTS.push_back(torch_input); @@ -136,10 +172,29 @@ void dfChemistrySolver::Inference(const double *T, const double *p, const double output[i] = modules_[i].forward(INPUTS).toTensor(); output[i] = output[i].to(at::kDouble); d_output = output[i].data_ptr(); - cudaMemcpy(RR + (i * num_cells_ + sample_start), d_output, sizeof(double) * sample_len, cudaMemcpyDeviceToDevice); + cudaMemcpy(NN_output_ + (i * inputsize_ + sample_start), d_output, sizeof(double) * sample_len, cudaMemcpyDeviceToDevice); } } - calculate_y_new<<>>(num_cells_, num_modules_, RR, y_input_BCT, Ymu_, Ystd_, RR); - calculate_RR<<>>(num_cells_, num_species_, 1e-6, rho, y, RR, RR); + calculate_y_new<<>>(inputsize_, num_modules_, NN_output_, + y_input_BCT, Ymu_, Ystd_, NN_output_); + calculate_RR<<>>(inputsize_, num_cells_, num_species_, 1e-6, + d_reactCellIndex, rho, y, p, NN_output_, RR); + +#ifdef STREAM_ALLOCATOR + checkCudaErrors(cudaFreeAsync(init_input_, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(y_input_BCT, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(NN_output_, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_reactCellIndex, dataBase_.stream)); +#else + cudaFree(init_input_); + cudaFree(y_input_BCT); + cudaFree(NN_output_); + cudaFree(d_reactCellIndex); +#endif + +} + +void dfChemistrySolver::sync() { + checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); } \ No newline at end of file diff --git a/src_gpu/dfChemistrySolver/Layer.H b/src_gpu/dfChemistrySolver/Layer.H new file mode 100644 index 000000000..e1a90e921 --- /dev/null +++ b/src_gpu/dfChemistrySolver/Layer.H @@ -0,0 +1,88 @@ +#pragma once +#include +#include +#include +#include +#include + +template +class Tensor{ +private: + std::vector shape_; + int64_t element_num_; + DataType* data_; + bool owner_; +public: + Tensor(std::vector shape): + shape_(shape), + 
element_num_(std::accumulate(shape_.begin(),shape_.end(),1,std::multiplies())), + data_(new DataType[element_num_]), + owner_(true){}; + + Tensor(std::vector shape, DataType* data): + shape_(shape), + element_num_(std::accumulate(shape_.begin(),shape_.end(),1,std::multiplies())), + data_(data), + owner_(false){}; + ~Tensor(){ + if(owner_) delete[] data_; + } + int64_t dim_num() const {return shape_.size();}; + int64_t element_num() const {return element_num_;}; + int64_t bytes_num() const {return element_num_ * sizeof(DataType);}; + int64_t dim(int64_t i) const {return shape_[i];}; + const DataType* data() const {return data_;}; + DataType* data() {return data_;}; +}; + +template +class Layer{ +public: + virtual ~Layer() {}; + virtual void forward(const Tensor& input, Tensor& output) = 0; + virtual void load_parameters(const std::string& dir, int64_t layer_id) = 0; +}; + +template +class Linear : public Layer{ +private: + int64_t in_features_; + int64_t out_features_; + // (in_features, out_features) + Tensor weights_; + // (out_features,) + Tensor bias_; + +public: + Linear(int64_t in_features,int64_t out_features): + in_features_(in_features), + out_features_(out_features), + weights_({in_features_,out_features_}), + bias_({out_features_}){}; + virtual ~Linear(){}; + + virtual void forward(const Tensor& input, Tensor& output); + virtual void load_parameters(const std::string& dir, int64_t layer_id); +}; + +template +class LinearGELU : public Layer{ +private: + int64_t in_features_; + int64_t out_features_; + // (in_features, out_features) + Tensor weights_; + // (out_features,) + Tensor bias_; + +public: + LinearGELU(int64_t in_features,int64_t out_features): + in_features_(in_features), + out_features_(out_features), + weights_({in_features_,out_features_}), + bias_({out_features_}){}; + virtual ~LinearGELU(){}; + + virtual void forward(const Tensor& input, Tensor& output); + virtual void load_parameters(const std::string& dir, int64_t layer_id); +}; \ No 
newline at end of file diff --git a/src_gpu/dfChemistrySolver/Layer.cu b/src_gpu/dfChemistrySolver/Layer.cu new file mode 100644 index 000000000..23910b517 --- /dev/null +++ b/src_gpu/dfChemistrySolver/Layer.cu @@ -0,0 +1,212 @@ +#include "Layer.H" +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern void sgemm_(char *transa, char *transb, int *m, int *n, int *k, float *alpha, float *a, int *lda, float *b, int *ldb, float *beta, float *c, int *ldc); + +#ifdef __cplusplus +} +#endif + +template<> +void Linear::forward(const Tensor& input, Tensor& output){ + assert(input.dim_num() == 2); + assert(output.dim_num() == 2); + assert(input.dim(0) == output.dim(0)); + assert(input.dim(1) == in_features_); + assert(out_features_ == output.dim(1)); + + char transA = 'N'; + char transB = 'N'; + float alpha = 1.f; + float beta = 1.f; + int m = out_features_; + int n = input.dim(0); + int k = in_features_; + float* A = weights_.data(); + int lda = out_features_; + float* B = const_cast(input.data()); + int ldb = input.dim(1); + float* C = output.data(); + int ldc = output.dim(1); + + for(int i = 0; i < input.dim(0); ++i) + std::copy(bias_.data(), bias_.data() + out_features_, C + i * out_features_); + + sgemm_(&transA, &transB, &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); +} + +template<> +void Linear::load_parameters(const std::string& dir, int64_t layer_id){ + int flag_mpi_init; + MPI_Initialized(&flag_mpi_init); + if(!flag_mpi_init){ + std::cerr << "DNNInferencer_blas::load_models : MPI is not initialized" << std::endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + int mpirank; + int mpisize; + MPI_Comm_rank(MPI_COMM_WORLD, &mpirank); + MPI_Comm_size(MPI_COMM_WORLD, &mpisize); + + if(mpirank == 0){ + std::stringstream ss1,ss2; + ss1 << dir << "/" << "linear_" << layer_id << "_weights_rowmajor_" << in_features_ << "_" << out_features_ << ".data"; + std::string weights_path = ss1.str(); + ss2 << dir << 
"/" << "linear_" << layer_id << "_bias_" << out_features_ << ".data"; + std::string bias_path = ss2.str(); + // weights + std::ifstream weights_file(weights_path, std::ios::binary); + if(!weights_file.is_open()){ + std::cerr << "open weights file error : " << weights_path << std::endl << std::flush; + abort(); + } + weights_file.read(reinterpret_cast(weights_.data()), weights_.bytes_num()); + weights_file.close(); + // bias + std::ifstream bias_file(bias_path, std::ios::binary); + if(!bias_file.is_open()){ + std::cerr << "open bias file error : " << bias_path << std::endl << std::flush; + abort(); + } + bias_file.read(reinterpret_cast(bias_.data()), bias_.bytes_num()); + bias_file.close(); + } + + MPI_Bcast(weights_.data(), weights_.element_num(), MPI_FLOAT, 0, MPI_COMM_WORLD); + MPI_Bcast(bias_.data(), bias_.element_num(), MPI_FLOAT, 0, MPI_COMM_WORLD); +} + +template<> +void LinearGELU::load_parameters(const std::string& dir, int64_t layer_id){ + int flag_mpi_init; + MPI_Initialized(&flag_mpi_init); + if(!flag_mpi_init){ + std::cerr << "DNNInferencer_blas::load_models : MPI is not initialized" << std::endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + int mpirank; + int mpisize; + MPI_Comm_rank(MPI_COMM_WORLD, &mpirank); + MPI_Comm_size(MPI_COMM_WORLD, &mpisize); + + if(mpirank == 0){ + std::stringstream ss1,ss2; + ss1 << dir << "/" << "linear_" << layer_id << "_weights_rowmajor_" << in_features_ << "_" << out_features_ << ".data"; + std::string weights_path = ss1.str(); + ss2 << dir << "/" << "linear_" << layer_id << "_bias_" << out_features_ << ".data"; + std::string bias_path = ss2.str(); + + // weights + std::ifstream weights_file(weights_path, std::ios::binary); + if(!weights_file.is_open()){ + std::cerr << "open weights file error : " << weights_path << std::endl << std::flush; + abort(); + } + weights_file.read(reinterpret_cast(weights_.data()), weights_.bytes_num()); + weights_file.close(); + + // bias + std::ifstream bias_file(bias_path, std::ios::binary); + 
if(!bias_file.is_open()){ + std::cerr << "open bias file error : " << bias_path << std::endl << std::flush; + abort(); + } + bias_file.read(reinterpret_cast(bias_.data()), bias_.bytes_num()); + bias_file.close(); + } + + MPI_Bcast(weights_.data(), weights_.element_num(), MPI_FLOAT, 0, MPI_COMM_WORLD); + MPI_Bcast(bias_.data(), bias_.element_num(), MPI_FLOAT, 0, MPI_COMM_WORLD); +} + + +void gelu_navie(int64_t len, float* data){ + for(int64_t i = 0; i < len; ++i){ + float x = data[i]; + data[i] = 0.5 * x * (1.f + tanhf(sqrtf(2.f / M_PI) * (x + 0.044715f * powf(x, 3.f)))); + } +} + +inline float tanhf_exp(float x){ + if(x > 8.f) return 1.; + if(x < -8.f) return -1.; + return 1.f - 2.f / (expf(2.f * x) + 1.f); +} + +double tanh_exp(double x){ + return 1. - 2. / (exp(2. * x) + 1.); +} + +void geluf_exp(int64_t len, float* data){ + + const float const_1 = sqrtf(2.f / M_PI); + const float const_2 = 0.044715f; + const float one = 1.f; + const float half = 0.5; + + for(int64_t i = 0; i < len; ++i){ + float x = data[i]; + data[i] = half * x * (one + tanhf_exp(const_1 * (x + const_2 * x * x * x))); + } +} + +void gelud_exp(int64_t len, float* data){ + const double const_1 = sqrtf(2. 
/ M_PI); + const double const_2 = 0.044715; + const double one = 1.; + const double half = 0.5; + + for(int64_t i = 0; i < len; ++i){ + double x = data[i]; + data[i] = half * x * (one + tanh_exp(const_1 * (x + const_2 * x * x * x))); + } +} + +// TODO +void gelu_exp_sve(int64_t len, float* data){ + +} + +template<> +void LinearGELU::forward(const Tensor& input, Tensor& output){ + assert(input.dim_num() == 2); + assert(output.dim_num() == 2); + assert(input.dim(0) == output.dim(0)); + assert(input.dim(1) == in_features_); + assert(out_features_ == output.dim(1)); + + char transA = 'N'; + char transB = 'N'; + float alpha = 1.f; + float beta = 1.f; + int m = out_features_; + int n = input.dim(0); + int k = in_features_; + float* A = weights_.data(); + int lda = out_features_; + float* B = const_cast(input.data()); + int ldb = input.dim(1); + float* C = output.data(); + int ldc = output.dim(1); + + for(int i = 0; i < input.dim(0); ++i) + std::copy(bias_.data(), bias_.data() + out_features_, C + i * out_features_); + + sgemm_(&transA, &transB, &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); + // GELU + geluf_exp(output.element_num(), output.data()); +} + +template class Tensor; +template class Linear; +template class LinearGELU; \ No newline at end of file diff --git a/src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.H b/src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.H new file mode 100644 index 000000000..c7eea6c7f --- /dev/null +++ b/src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.H @@ -0,0 +1,25 @@ +#pragma once + +#include +#include +#include +#include + +class dfChemistrySolver +{ +private: + std::vector modules_; + torch::Device device_; + double *Xmu_, *Xstd_, *Ymu_, *Ystd_; + + double *init_input_, *y_input_BCT; + int dim_input_, num_cells_, num_species_, num_modules_; +public: + dfChemistrySolver(int num_cells, int num_species); + ~dfChemistrySolver(); + + void loadModels(const std::string dir); + void loadNormalization(const std::string dir); + 
void Inference(const double *T, const double *p, const double *y, + const double *rho, double *RR); +}; \ No newline at end of file diff --git a/src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.cu b/src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.cu new file mode 100644 index 000000000..4e2014150 --- /dev/null +++ b/src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.cu @@ -0,0 +1,147 @@ +#include "dfChemistrySolver_cublas.H" + +__global__ void construct_init_input(int num_cells, int dim, const double *T, const double *p, + const double *y, double *y_input_BCT, double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + output[index * dim] = T[index]; + output[index * dim + 1] = p[index]; + double y_BCT; + for (int i = 0; i < dim - 2; ++i) { + y_BCT = (pow(y[i * num_cells + index], 0.1) - 1) * 10; // BCT: lambda = 0.1 + output[index * dim + 2 + i] = y_BCT; + y_input_BCT[i * num_cells + index] = y_BCT; + } +} + +__global__ void normalize_input(int num_cells, int dim, const double *input, + const double *Xmu, const double *Xstd, double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + for (int i = 0; i < dim; ++i) { + output[index * dim + i] = (input[index * dim + i] - Xmu[i]) / Xstd[i]; + } +} + +__global__ void calculate_y_new(int num_cells, int num_modules, const double *output_init, + const double *y_input_BCT, const double *Ymu, const double *Ystd, double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + double RR_tmp; + for (int i = 0; i < num_modules; ++i) { + RR_tmp = output_init[i * num_cells + index] * Ystd[i] + Ymu[i] + y_input_BCT[i * num_cells + index]; + RR_tmp = pow((RR_tmp * 0.1 + 1), 10); // rev-BCT: lambda = 0.1 + output[i * num_cells + index] = RR_tmp; + } +} + +__global__ void calculate_RR(int num_cells, int num_species, double delta_t, + const double *rho, const double *y_old, double 
*y_new, double *RR) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // normalize + double y_ave = 0.; + for (int i = 0; i < num_species; ++i) { + y_ave += y_new[i * num_cells + index]; + } + for (int i = 0; i < num_species; ++i) { + y_new[i * num_cells + index] = y_new[i * num_cells + index] / y_ave; + RR[i * num_cells + index] = (y_new[i * num_cells + index] - y_old[i * num_cells + index]) * rho[index] / delta_t; + } +} + +dfChemistrySolver::dfChemistrySolver(int num_cells, int num_species) + : device_(torch::kCUDA), num_cells_(num_cells), num_species_(num_species) +{ + dim_input_ = num_species + 2; // p, T, y + num_modules_ = num_species_ - 1; + cudaMalloc(&init_input_, sizeof(double) * num_cells * dim_input_); + cudaMalloc(&y_input_BCT, sizeof(double) * num_cells * num_species_); + cudaMalloc(&Xmu_, sizeof(double) * dim_input_); + cudaMalloc(&Xstd_, sizeof(double) * dim_input_); + cudaMalloc(&Ymu_, sizeof(double) * num_modules_); + cudaMalloc(&Ystd_, sizeof(double) * num_modules_); + modules_.reserve(num_modules_); + + // now norm paras are set in constructor manually + at::TensorOptions opts = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA); + std::vector Xmu_vec = {2.1996457626e+03, 1.0281762632e+05, -5.2533840071e+00, + -4.5623405933e+00, -1.5305375824e+00, -3.7453838144e+00, + -3.3097212360e+00, -4.3339657954e+00, -2.9028421546e-01}; + std::vector Xstd_vec = {3.8304388695e+02, 7.1905832810e+02, 4.6788389177e-01, + 5.3034657693e-01, 5.0325864222e-01, 5.1058993718e-01, + 7.9704668543e-01, 5.1327160125e-01, 4.6827591193e-03}; + std::vector Ymu_vec = {0.0085609265, -0.0082998877, 0.0030108739, + -0.0067360325, 0.0037464590, 0.0024258509}; + std::vector Ystd_vec = {0.0085609265, -0.0082998877, 0.0030108739, + -0.0067360325, 0.0037464590, 0.0024258509}; + + cudaMemcpy(Xmu_, Xmu_vec.data(), sizeof(double) * dim_input_, cudaMemcpyHostToDevice); + cudaMemcpy(Xstd_, Xstd_vec.data(), sizeof(double) * dim_input_, 
cudaMemcpyHostToDevice); + cudaMemcpy(Ymu_, Ymu_vec.data(), sizeof(double) * num_modules_, cudaMemcpyHostToDevice); + cudaMemcpy(Ystd_, Ystd_vec.data(), sizeof(double) * num_modules_, cudaMemcpyHostToDevice); + + // input modules + std::string prefix = "new_Temporary_Chemical_"; + std::string suffix = ".pt"; + for (int i = 0; i < num_modules_; ++i) { + std::string model_path = prefix + std::to_string(i) + suffix; + printf("loading model %s\n", model_path.c_str()); + modules_.push_back(torch::jit::load(model_path)); + // try { + // // modules_.push_back(torch::jit::load(model_path)); + // torch::jit::load(model_path); + // } + // catch (const c10::Error& e) { + // std::cerr << "error loading the model\n"; + // exit(-1); + // } + modules_[i].to(device_); + } +} + +dfChemistrySolver::~dfChemistrySolver() { + cudaFree(init_input_); +} + +void dfChemistrySolver::Inference(const double *T, const double *p, const double *y, + const double *rho, double *RR) { + // construct input + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells_ + threads_per_block - 1) / threads_per_block; + construct_init_input<<>>(num_cells_, dim_input_, T, p, y, y_input_BCT, init_input_); + normalize_input<<>>(num_cells_, dim_input_, init_input_, Xmu_, Xstd_, init_input_); + + // inference by torch + at::Tensor torch_input = torch::from_blob(init_input_, {num_cells_, dim_input_}, device_); + torch_input = torch_input.to(at::kFloat); + std::vector INPUTS; + INPUTS.push_back(torch_input); + + std::vector output(num_modules_); + for (int i = 0; i < num_modules_; ++i) { + output[i] = modules_[i].forward(INPUTS).toTensor(); + output[i] = output[i].to(at::kDouble); + } + + // post process + double *d_output; + for (int i = 0; i < num_modules_; ++i) { + d_output = output[i].data_ptr(); + cudaMemcpy(RR + i * num_cells_, d_output, sizeof(double) * num_cells_, cudaMemcpyDeviceToDevice); + } + calculate_y_new<<>>(num_cells_, num_modules_, RR, y_input_BCT, Ymu_, Ystd_, RR); + 
calculate_RR<<>>(num_cells_, num_species_, 1e-6, rho, y, RR, RR); +} \ No newline at end of file From 24d15bb78fa4b7dcfa749f9947c419caba51ec92 Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Tue, 7 Nov 2023 23:29:32 +0800 Subject: [PATCH 3/5] mixture averaged --- src_gpu/dfChemistrySolver/Layer.H | 88 -------- src_gpu/dfChemistrySolver/Layer.cu | 212 ------------------ .../dfChemistrySolver_cublas.H | 25 --- .../dfChemistrySolver_cublas.cu | 147 ------------ src_gpu/dfThermo.H | 9 +- src_gpu/dfThermo.cu | 89 +++++++- src_gpu/dfYEqn.H | 16 +- src_gpu/dfYEqn.cu | 62 +++-- 8 files changed, 149 insertions(+), 499 deletions(-) delete mode 100644 src_gpu/dfChemistrySolver/Layer.H delete mode 100644 src_gpu/dfChemistrySolver/Layer.cu delete mode 100644 src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.H delete mode 100644 src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.cu diff --git a/src_gpu/dfChemistrySolver/Layer.H b/src_gpu/dfChemistrySolver/Layer.H deleted file mode 100644 index e1a90e921..000000000 --- a/src_gpu/dfChemistrySolver/Layer.H +++ /dev/null @@ -1,88 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include - -template -class Tensor{ -private: - std::vector shape_; - int64_t element_num_; - DataType* data_; - bool owner_; -public: - Tensor(std::vector shape): - shape_(shape), - element_num_(std::accumulate(shape_.begin(),shape_.end(),1,std::multiplies())), - data_(new DataType[element_num_]), - owner_(true){}; - - Tensor(std::vector shape, DataType* data): - shape_(shape), - element_num_(std::accumulate(shape_.begin(),shape_.end(),1,std::multiplies())), - data_(data), - owner_(false){}; - ~Tensor(){ - if(owner_) delete[] data_; - } - int64_t dim_num() const {return shape_.size();}; - int64_t element_num() const {return element_num_;}; - int64_t bytes_num() const {return element_num_ * sizeof(DataType);}; - int64_t dim(int64_t i) const {return shape_[i];}; - const DataType* data() const {return data_;}; - DataType* data() {return data_;}; 
-}; - -template -class Layer{ -public: - virtual ~Layer() {}; - virtual void forward(const Tensor& input, Tensor& output) = 0; - virtual void load_parameters(const std::string& dir, int64_t layer_id) = 0; -}; - -template -class Linear : public Layer{ -private: - int64_t in_features_; - int64_t out_features_; - // (in_features, out_features) - Tensor weights_; - // (out_features,) - Tensor bias_; - -public: - Linear(int64_t in_features,int64_t out_features): - in_features_(in_features), - out_features_(out_features), - weights_({in_features_,out_features_}), - bias_({out_features_}){}; - virtual ~Linear(){}; - - virtual void forward(const Tensor& input, Tensor& output); - virtual void load_parameters(const std::string& dir, int64_t layer_id); -}; - -template -class LinearGELU : public Layer{ -private: - int64_t in_features_; - int64_t out_features_; - // (in_features, out_features) - Tensor weights_; - // (out_features,) - Tensor bias_; - -public: - LinearGELU(int64_t in_features,int64_t out_features): - in_features_(in_features), - out_features_(out_features), - weights_({in_features_,out_features_}), - bias_({out_features_}){}; - virtual ~LinearGELU(){}; - - virtual void forward(const Tensor& input, Tensor& output); - virtual void load_parameters(const std::string& dir, int64_t layer_id); -}; \ No newline at end of file diff --git a/src_gpu/dfChemistrySolver/Layer.cu b/src_gpu/dfChemistrySolver/Layer.cu deleted file mode 100644 index 23910b517..000000000 --- a/src_gpu/dfChemistrySolver/Layer.cu +++ /dev/null @@ -1,212 +0,0 @@ -#include "Layer.H" -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -extern void sgemm_(char *transa, char *transb, int *m, int *n, int *k, float *alpha, float *a, int *lda, float *b, int *ldb, float *beta, float *c, int *ldc); - -#ifdef __cplusplus -} -#endif - -template<> -void Linear::forward(const Tensor& input, Tensor& output){ - assert(input.dim_num() == 2); - 
assert(output.dim_num() == 2); - assert(input.dim(0) == output.dim(0)); - assert(input.dim(1) == in_features_); - assert(out_features_ == output.dim(1)); - - char transA = 'N'; - char transB = 'N'; - float alpha = 1.f; - float beta = 1.f; - int m = out_features_; - int n = input.dim(0); - int k = in_features_; - float* A = weights_.data(); - int lda = out_features_; - float* B = const_cast(input.data()); - int ldb = input.dim(1); - float* C = output.data(); - int ldc = output.dim(1); - - for(int i = 0; i < input.dim(0); ++i) - std::copy(bias_.data(), bias_.data() + out_features_, C + i * out_features_); - - sgemm_(&transA, &transB, &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); -} - -template<> -void Linear::load_parameters(const std::string& dir, int64_t layer_id){ - int flag_mpi_init; - MPI_Initialized(&flag_mpi_init); - if(!flag_mpi_init){ - std::cerr << "DNNInferencer_blas::load_models : MPI is not initialized" << std::endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - int mpirank; - int mpisize; - MPI_Comm_rank(MPI_COMM_WORLD, &mpirank); - MPI_Comm_size(MPI_COMM_WORLD, &mpisize); - - if(mpirank == 0){ - std::stringstream ss1,ss2; - ss1 << dir << "/" << "linear_" << layer_id << "_weights_rowmajor_" << in_features_ << "_" << out_features_ << ".data"; - std::string weights_path = ss1.str(); - ss2 << dir << "/" << "linear_" << layer_id << "_bias_" << out_features_ << ".data"; - std::string bias_path = ss2.str(); - // weights - std::ifstream weights_file(weights_path, std::ios::binary); - if(!weights_file.is_open()){ - std::cerr << "open weights file error : " << weights_path << std::endl << std::flush; - abort(); - } - weights_file.read(reinterpret_cast(weights_.data()), weights_.bytes_num()); - weights_file.close(); - // bias - std::ifstream bias_file(bias_path, std::ios::binary); - if(!bias_file.is_open()){ - std::cerr << "open bias file error : " << bias_path << std::endl << std::flush; - abort(); - } - bias_file.read(reinterpret_cast(bias_.data()), 
bias_.bytes_num()); - bias_file.close(); - } - - MPI_Bcast(weights_.data(), weights_.element_num(), MPI_FLOAT, 0, MPI_COMM_WORLD); - MPI_Bcast(bias_.data(), bias_.element_num(), MPI_FLOAT, 0, MPI_COMM_WORLD); -} - -template<> -void LinearGELU::load_parameters(const std::string& dir, int64_t layer_id){ - int flag_mpi_init; - MPI_Initialized(&flag_mpi_init); - if(!flag_mpi_init){ - std::cerr << "DNNInferencer_blas::load_models : MPI is not initialized" << std::endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - int mpirank; - int mpisize; - MPI_Comm_rank(MPI_COMM_WORLD, &mpirank); - MPI_Comm_size(MPI_COMM_WORLD, &mpisize); - - if(mpirank == 0){ - std::stringstream ss1,ss2; - ss1 << dir << "/" << "linear_" << layer_id << "_weights_rowmajor_" << in_features_ << "_" << out_features_ << ".data"; - std::string weights_path = ss1.str(); - ss2 << dir << "/" << "linear_" << layer_id << "_bias_" << out_features_ << ".data"; - std::string bias_path = ss2.str(); - - // weights - std::ifstream weights_file(weights_path, std::ios::binary); - if(!weights_file.is_open()){ - std::cerr << "open weights file error : " << weights_path << std::endl << std::flush; - abort(); - } - weights_file.read(reinterpret_cast(weights_.data()), weights_.bytes_num()); - weights_file.close(); - - // bias - std::ifstream bias_file(bias_path, std::ios::binary); - if(!bias_file.is_open()){ - std::cerr << "open bias file error : " << bias_path << std::endl << std::flush; - abort(); - } - bias_file.read(reinterpret_cast(bias_.data()), bias_.bytes_num()); - bias_file.close(); - } - - MPI_Bcast(weights_.data(), weights_.element_num(), MPI_FLOAT, 0, MPI_COMM_WORLD); - MPI_Bcast(bias_.data(), bias_.element_num(), MPI_FLOAT, 0, MPI_COMM_WORLD); -} - - -void gelu_navie(int64_t len, float* data){ - for(int64_t i = 0; i < len; ++i){ - float x = data[i]; - data[i] = 0.5 * x * (1.f + tanhf(sqrtf(2.f / M_PI) * (x + 0.044715f * powf(x, 3.f)))); - } -} - -inline float tanhf_exp(float x){ - if(x > 8.f) return 1.; - if(x < -8.f) 
return -1.; - return 1.f - 2.f / (expf(2.f * x) + 1.f); -} - -double tanh_exp(double x){ - return 1. - 2. / (exp(2. * x) + 1.); -} - -void geluf_exp(int64_t len, float* data){ - - const float const_1 = sqrtf(2.f / M_PI); - const float const_2 = 0.044715f; - const float one = 1.f; - const float half = 0.5; - - for(int64_t i = 0; i < len; ++i){ - float x = data[i]; - data[i] = half * x * (one + tanhf_exp(const_1 * (x + const_2 * x * x * x))); - } -} - -void gelud_exp(int64_t len, float* data){ - const double const_1 = sqrtf(2. / M_PI); - const double const_2 = 0.044715; - const double one = 1.; - const double half = 0.5; - - for(int64_t i = 0; i < len; ++i){ - double x = data[i]; - data[i] = half * x * (one + tanh_exp(const_1 * (x + const_2 * x * x * x))); - } -} - -// TODO -void gelu_exp_sve(int64_t len, float* data){ - -} - -template<> -void LinearGELU::forward(const Tensor& input, Tensor& output){ - assert(input.dim_num() == 2); - assert(output.dim_num() == 2); - assert(input.dim(0) == output.dim(0)); - assert(input.dim(1) == in_features_); - assert(out_features_ == output.dim(1)); - - char transA = 'N'; - char transB = 'N'; - float alpha = 1.f; - float beta = 1.f; - int m = out_features_; - int n = input.dim(0); - int k = in_features_; - float* A = weights_.data(); - int lda = out_features_; - float* B = const_cast(input.data()); - int ldb = input.dim(1); - float* C = output.data(); - int ldc = output.dim(1); - - for(int i = 0; i < input.dim(0); ++i) - std::copy(bias_.data(), bias_.data() + out_features_, C + i * out_features_); - - sgemm_(&transA, &transB, &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); - // GELU - geluf_exp(output.element_num(), output.data()); -} - -template class Tensor; -template class Linear; -template class LinearGELU; \ No newline at end of file diff --git a/src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.H b/src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.H deleted file mode 100644 index c7eea6c7f..000000000 --- 
a/src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.H +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -class dfChemistrySolver -{ -private: - std::vector modules_; - torch::Device device_; - double *Xmu_, *Xstd_, *Ymu_, *Ystd_; - - double *init_input_, *y_input_BCT; - int dim_input_, num_cells_, num_species_, num_modules_; -public: - dfChemistrySolver(int num_cells, int num_species); - ~dfChemistrySolver(); - - void loadModels(const std::string dir); - void loadNormalization(const std::string dir); - void Inference(const double *T, const double *p, const double *y, - const double *rho, double *RR); -}; \ No newline at end of file diff --git a/src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.cu b/src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.cu deleted file mode 100644 index 4e2014150..000000000 --- a/src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.cu +++ /dev/null @@ -1,147 +0,0 @@ -#include "dfChemistrySolver_cublas.H" - -__global__ void construct_init_input(int num_cells, int dim, const double *T, const double *p, - const double *y, double *y_input_BCT, double *output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - output[index * dim] = T[index]; - output[index * dim + 1] = p[index]; - double y_BCT; - for (int i = 0; i < dim - 2; ++i) { - y_BCT = (pow(y[i * num_cells + index], 0.1) - 1) * 10; // BCT: lambda = 0.1 - output[index * dim + 2 + i] = y_BCT; - y_input_BCT[i * num_cells + index] = y_BCT; - } -} - -__global__ void normalize_input(int num_cells, int dim, const double *input, - const double *Xmu, const double *Xstd, double *output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - for (int i = 0; i < dim; ++i) { - output[index * dim + i] = (input[index * dim + i] - Xmu[i]) / Xstd[i]; - } -} - -__global__ void calculate_y_new(int num_cells, int num_modules, const double *output_init, - const double *y_input_BCT, 
const double *Ymu, const double *Ystd, double *output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - double RR_tmp; - for (int i = 0; i < num_modules; ++i) { - RR_tmp = output_init[i * num_cells + index] * Ystd[i] + Ymu[i] + y_input_BCT[i * num_cells + index]; - RR_tmp = pow((RR_tmp * 0.1 + 1), 10); // rev-BCT: lambda = 0.1 - output[i * num_cells + index] = RR_tmp; - } -} - -__global__ void calculate_RR(int num_cells, int num_species, double delta_t, - const double *rho, const double *y_old, double *y_new, double *RR) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - // normalize - double y_ave = 0.; - for (int i = 0; i < num_species; ++i) { - y_ave += y_new[i * num_cells + index]; - } - for (int i = 0; i < num_species; ++i) { - y_new[i * num_cells + index] = y_new[i * num_cells + index] / y_ave; - RR[i * num_cells + index] = (y_new[i * num_cells + index] - y_old[i * num_cells + index]) * rho[index] / delta_t; - } -} - -dfChemistrySolver::dfChemistrySolver(int num_cells, int num_species) - : device_(torch::kCUDA), num_cells_(num_cells), num_species_(num_species) -{ - dim_input_ = num_species + 2; // p, T, y - num_modules_ = num_species_ - 1; - cudaMalloc(&init_input_, sizeof(double) * num_cells * dim_input_); - cudaMalloc(&y_input_BCT, sizeof(double) * num_cells * num_species_); - cudaMalloc(&Xmu_, sizeof(double) * dim_input_); - cudaMalloc(&Xstd_, sizeof(double) * dim_input_); - cudaMalloc(&Ymu_, sizeof(double) * num_modules_); - cudaMalloc(&Ystd_, sizeof(double) * num_modules_); - modules_.reserve(num_modules_); - - // now norm paras are set in constructor manually - at::TensorOptions opts = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA); - std::vector Xmu_vec = {2.1996457626e+03, 1.0281762632e+05, -5.2533840071e+00, - -4.5623405933e+00, -1.5305375824e+00, -3.7453838144e+00, - -3.3097212360e+00, -4.3339657954e+00, -2.9028421546e-01}; - std::vector Xstd_vec = 
{3.8304388695e+02, 7.1905832810e+02, 4.6788389177e-01, - 5.3034657693e-01, 5.0325864222e-01, 5.1058993718e-01, - 7.9704668543e-01, 5.1327160125e-01, 4.6827591193e-03}; - std::vector Ymu_vec = {0.0085609265, -0.0082998877, 0.0030108739, - -0.0067360325, 0.0037464590, 0.0024258509}; - std::vector Ystd_vec = {0.0085609265, -0.0082998877, 0.0030108739, - -0.0067360325, 0.0037464590, 0.0024258509}; - - cudaMemcpy(Xmu_, Xmu_vec.data(), sizeof(double) * dim_input_, cudaMemcpyHostToDevice); - cudaMemcpy(Xstd_, Xstd_vec.data(), sizeof(double) * dim_input_, cudaMemcpyHostToDevice); - cudaMemcpy(Ymu_, Ymu_vec.data(), sizeof(double) * num_modules_, cudaMemcpyHostToDevice); - cudaMemcpy(Ystd_, Ystd_vec.data(), sizeof(double) * num_modules_, cudaMemcpyHostToDevice); - - // input modules - std::string prefix = "new_Temporary_Chemical_"; - std::string suffix = ".pt"; - for (int i = 0; i < num_modules_; ++i) { - std::string model_path = prefix + std::to_string(i) + suffix; - printf("loading model %s\n", model_path.c_str()); - modules_.push_back(torch::jit::load(model_path)); - // try { - // // modules_.push_back(torch::jit::load(model_path)); - // torch::jit::load(model_path); - // } - // catch (const c10::Error& e) { - // std::cerr << "error loading the model\n"; - // exit(-1); - // } - modules_[i].to(device_); - } -} - -dfChemistrySolver::~dfChemistrySolver() { - cudaFree(init_input_); -} - -void dfChemistrySolver::Inference(const double *T, const double *p, const double *y, - const double *rho, double *RR) { - // construct input - size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_cells_ + threads_per_block - 1) / threads_per_block; - construct_init_input<<>>(num_cells_, dim_input_, T, p, y, y_input_BCT, init_input_); - normalize_input<<>>(num_cells_, dim_input_, init_input_, Xmu_, Xstd_, init_input_); - - // inference by torch - at::Tensor torch_input = torch::from_blob(init_input_, {num_cells_, dim_input_}, device_); - torch_input = torch_input.to(at::kFloat); - 
std::vector INPUTS; - INPUTS.push_back(torch_input); - - std::vector output(num_modules_); - for (int i = 0; i < num_modules_; ++i) { - output[i] = modules_[i].forward(INPUTS).toTensor(); - output[i] = output[i].to(at::kDouble); - } - - // post process - double *d_output; - for (int i = 0; i < num_modules_; ++i) { - d_output = output[i].data_ptr(); - cudaMemcpy(RR + i * num_cells_, d_output, sizeof(double) * num_cells_, cudaMemcpyDeviceToDevice); - } - calculate_y_new<<>>(num_cells_, num_modules_, RR, y_input_BCT, Ymu_, Ystd_, RR); - calculate_RR<<>>(num_cells_, num_species_, 1e-6, rho, y, RR, RR); -} \ No newline at end of file diff --git a/src_gpu/dfThermo.H b/src_gpu/dfThermo.H index e1f9c0f58..fdcf54b78 100644 --- a/src_gpu/dfThermo.H +++ b/src_gpu/dfThermo.H @@ -63,8 +63,9 @@ public: void setConstantValue(std::string mechanism_file, int num_cells, int num_species); void setConstantFields(const std::vector patch_type); void initNonConstantFields(const double *h_T, const double *h_he, const double *h_psi, const double *h_alpha, - const double *h_mu, const double *h_k, const double *h_dpdt, const double *h_boundary_T, const double *h_boundary_he, - const double *h_boundary_psi, const double *h_boundary_alpha, const double *h_boundary_mu, const double *h_boundary_k); + const double *h_mu, const double *h_k, const double *h_dpdt, const double *h_rhoD, const double *h_boundary_T, + const double *h_boundary_he, const double *h_boundary_psi, const double *h_boundary_alpha, const double *h_boundary_mu, + const double *h_boundary_k, const double *h_boundary_rhoD); // set mass fraction void setMassFraction(const double *d_y, const double *d_boundary_y); @@ -79,6 +80,9 @@ public: void calculateThermoConductivityGPU(int thread_per_block, int num_thread, int num_total, const double *T, const double *T_poly, const double *d_y, const double *mole_fraction, double *species_thermal_conductivities, double *thermal_conductivity, int offset = 0); + void calculateRhoDGPU(int 
threads_per_block, int num_thread, int num_total, const double *T, + const double *T_poly, const double *p, const double *mole_fraction, + const double *mean_mole_weight, const double *rho, double *rhoD, int offset = 0); void calculateEnthalpyGPU(int thread_per_block, int num_thread, const double *T, double *enthalpy, const double *d_mass_fraction, int offset = 0); void calculateTemperatureGPU(int thread_per_block, int num_thread, int num_total, const double *T_init, const double *target_h, double *T, const double *d_mass_fraction, int offset = 0, @@ -98,6 +102,7 @@ public: void compareMu(const double *mu, const double *boundary_mu, bool printFlag); void compareAlpha(const double *alpha, const double *boundary_alpha, bool printFlag); void compareHe(const double *he, const double *boundary_he, bool printFlag); + void compareRhoD(const double *rhoD, const double *boundary_rhoD, int species_index, bool printFlag); void correctHe(const double *he, const double *boundary_he); void correctPsi(const double *psi, const double *boundary_psi); diff --git a/src_gpu/dfThermo.cu b/src_gpu/dfThermo.cu index 9e3aba78d..9c46b3d96 100644 --- a/src_gpu/dfThermo.cu +++ b/src_gpu/dfThermo.cu @@ -8,7 +8,7 @@ #define GAS_CANSTANT 8314.46261815324 #define SQRT8 2.8284271247461903 -#define NUM_SPECIES 7 +#define NUM_SPECIES 9 // constant memory __constant__ __device__ double d_nasa_coeffs[NUM_SPECIES*15]; @@ -200,6 +200,47 @@ __global__ void calculate_thermoConductivity_kernel(int num_thread, int num_tota thermal_conductivity[startIndex] = lambda_mix / cp; } +__global__ void calculate_diffusion_kernel(int num_thread, int num_total, int num_species, + int offset, const double *T_poly, const double *mole_fraction, const double *p, + const double *mean_mole_weight, const double *rho, const double *T, double *d) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_thread) + return; + + int startIndex = offset + index; + + double D[NUM_SPECIES * NUM_SPECIES]; + double 
sum1, sum2; + double tmp; + + for (int i = 0; i < num_species; i++) { + for (int j = 0; j < num_species; j++) { + tmp = 0.; + for (int k = 0; k < 5; k++) + tmp += (d_binary_diffusion_coeffs[i * num_species * 5 + j * 5 + k] * T_poly[num_total * k + startIndex]); + D[i * num_species + j] = tmp * pow(T[startIndex], 1.5); + } + } + for (int i = 0; i < num_species; i++) { + if (mole_fraction[num_total * i + startIndex] + 1e-10 > 1.) { + d[num_total * i + startIndex] = 0.; + continue; + } + sum1 = 0.; + sum2 = 0.; + for (int j = 0; j < num_species; j++) { + if (i == j) continue; + sum1 += mole_fraction[num_total * j + startIndex] / D[i * num_species + j]; + sum2 += mole_fraction[num_total * j + startIndex] * d_molecular_weights[j] / D[i * num_species + j]; + } + sum1 *= p[startIndex]; + sum2 *= p[startIndex] * mole_fraction[num_total * i + startIndex] / + (mean_mole_weight[startIndex] - mole_fraction[num_total * i + startIndex] * d_molecular_weights[i]); + d[num_total * i + startIndex] = 1 / (sum1 + sum2) * rho[startIndex]; + } +} + __device__ double calculate_enthalpy_device_kernel(int num_total, int num_species, int index, const double local_T, const double *mass_fraction) { @@ -404,8 +445,9 @@ void dfThermo::setConstantFields(const std::vector patch_type) } void dfThermo::initNonConstantFields(const double *h_T, const double *h_he, const double *h_psi, const double *h_alpha, - const double *h_mu, const double *h_k, const double *h_dpdt, const double *h_boundary_T, const double *h_boundary_he, - const double *h_boundary_psi, const double *h_boundary_alpha, const double *h_boundary_mu, const double *h_boundary_k) + const double *h_mu, const double *h_k, const double *h_dpdt, const double *h_rhoD, const double *h_boundary_T, + const double *h_boundary_he, const double *h_boundary_psi, const double *h_boundary_alpha, const double *h_boundary_mu, + const double *h_boundary_k, const double *h_boundary_rhoD) { checkCudaErrors(cudaMemcpy(dataBase_.d_T, h_T, 
dataBase_.cell_value_bytes, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(dataBase_.d_he, h_he, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice)); @@ -414,6 +456,7 @@ void dfThermo::initNonConstantFields(const double *h_T, const double *h_he, cons checkCudaErrors(cudaMemcpy(dataBase_.d_mu, h_mu, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(dataBase_.d_k, h_k, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(dataBase_.d_dpdt, h_dpdt, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(dataBase_.d_thermo_rhoD, h_rhoD, dataBase_.cell_value_bytes * dataBase_.num_species, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_T, h_boundary_T, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_he, h_boundary_he, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice)); @@ -421,6 +464,7 @@ void dfThermo::initNonConstantFields(const double *h_T, const double *h_he, cons checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_thermo_alpha, h_boundary_alpha, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_mu, h_boundary_mu, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_k, h_boundary_k, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_thermo_rhoD, h_boundary_rhoD, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, cudaMemcpyHostToDevice)); } void dfThermo::calculateTPolyGPU(int threads_per_block, int num_thread, int num_total, const double *T, double *T_poly, int offset) @@ -460,6 +504,15 @@ void dfThermo::calculateThermoConductivityGPU(int threads_per_block, int num_thr offset, d_nasa_coeffs, d_y, T_poly, T, mole_fraction, species_thermal_conductivities, thermal_conductivity); 
} +void dfThermo::calculateRhoDGPU(int threads_per_block, int num_thread, int num_total, const double *T, + const double *T_poly, const double *p, const double *mole_fraction, + const double *mean_mole_weight, const double *rho, double *rhoD, int offset) +{ + size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; + calculate_diffusion_kernel<<>>(num_thread, num_total, num_species, offset, + T_poly, mole_fraction, p, mean_mole_weight, rho, T, rhoD); +} + void dfThermo::calculateTemperatureGPU(int threads_per_block, int num_thread, int num_total, const double *T_init, const double *target_h, double *T, const double *d_mass_fraction, int offset, double atol, double rtol, int max_iter) { @@ -519,6 +572,8 @@ void dfThermo::correctThermo() d_T_poly, d_species_viscosities, dataBase_.d_mu); // calculate viscosity calculateThermoConductivityGPU(cell_thread, dataBase_.num_cells, dataBase_.num_cells, dataBase_.d_T, d_T_poly, dataBase_.d_y, d_mole_fraction, d_species_thermal_conductivities, dataBase_.d_thermo_alpha); // calculate thermal conductivity + calculateRhoDGPU(cell_thread, dataBase_.num_cells, dataBase_.num_cells, dataBase_.d_T, d_T_poly, dataBase_.d_p, d_mole_fraction, + d_mean_mole_weight, dataBase_.d_rho, dataBase_.d_thermo_rhoD); // boundary field int offset = 0; for (int i = 0; i < dataBase_.num_patches; i++) { @@ -533,6 +588,8 @@ void dfThermo::correctThermo() d_boundary_T_poly, d_boundary_species_viscosities, dataBase_.d_boundary_mu, offset); calculateThermoConductivityGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.num_boundary_surfaces, dataBase_.d_boundary_T, d_boundary_T_poly, dataBase_.d_boundary_y, d_boundary_mole_fraction, d_boundary_species_thermal_conductivities, dataBase_.d_boundary_thermo_alpha, offset); + calculateRhoDGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.num_boundary_surfaces, dataBase_.d_boundary_T, d_boundary_T_poly, + dataBase_.d_boundary_p, d_boundary_mole_fraction, 
d_boundary_mean_mole_weight, dataBase_.d_boundary_rho, dataBase_.d_boundary_thermo_rhoD, offset); } else { calculateTemperatureGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.num_boundary_surfaces, dataBase_.d_boundary_T, dataBase_.d_boundary_he, dataBase_.d_boundary_T, dataBase_.d_boundary_y, offset); @@ -543,6 +600,8 @@ void dfThermo::correctThermo() d_boundary_T_poly, d_boundary_species_viscosities, dataBase_.d_boundary_mu, offset); calculateThermoConductivityGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.num_boundary_surfaces, dataBase_.d_boundary_T, d_boundary_T_poly, dataBase_.d_boundary_y, d_boundary_mole_fraction, d_boundary_species_thermal_conductivities, dataBase_.d_boundary_thermo_alpha, offset); + calculateRhoDGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.num_boundary_surfaces, dataBase_.d_boundary_T, d_boundary_T_poly, + dataBase_.d_boundary_p, d_boundary_mole_fraction, d_boundary_mean_mole_weight, dataBase_.d_boundary_rho, dataBase_.d_boundary_thermo_rhoD, offset); } // correct internal field of processor boundary if (dataBase_.patch_type_T[i] == boundaryConditions::processor @@ -561,7 +620,11 @@ void dfThermo::correctThermo() dataBase_.d_mu, dataBase_.d_boundary_face_cell, dataBase_.d_boundary_mu); correct_internal_boundary_field_scalar<<>>(dataBase_.patch_size[i], offset, dataBase_.d_rho, dataBase_.d_boundary_face_cell, dataBase_.d_boundary_rho); - + for (int j = 0; j < num_species; j++) { + correct_internal_boundary_field_scalar<<>>(dataBase_.patch_size[i], offset, + dataBase_.d_thermo_rhoD + j * dataBase_.num_cells, dataBase_.d_boundary_face_cell, + dataBase_.d_boundary_thermo_rhoD + j * dataBase_.num_boundary_surfaces); + } offset += 2 * dataBase_.patch_size[i]; } else { offset += dataBase_.patch_size[i]; } @@ -740,6 +803,24 @@ void dfThermo::compareAlpha(const double *alpha, const double *boundary_alpha, b delete h_boundary_alpha; } +void dfThermo::compareRhoD(const double *rhoD, const double *boundary_rhoD, int 
species_index, bool printFlag) +{ + double *h_rhoD = new double[dataBase_.num_cells]; + double *h_boundary_rhoD = new double[dataBase_.num_boundary_surfaces]; + + checkCudaErrors(cudaMemcpy(h_rhoD, dataBase_.d_thermo_rhoD + species_index * dataBase_.num_cells, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(h_boundary_rhoD, dataBase_.d_boundary_thermo_rhoD + species_index * dataBase_.num_boundary_surfaces, + dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost)); + + fprintf(stderr, "check h_thermo_rhoD\n"); + checkVectorEqual(dataBase_.num_cells, rhoD, h_rhoD, 1e-10, printFlag); + fprintf(stderr, "check h_thermo_boundary_rhoD\n"); + checkVectorEqual(dataBase_.num_boundary_surfaces, boundary_rhoD, h_boundary_rhoD, 1e-10, printFlag); + + delete h_rhoD; + delete h_boundary_rhoD; +} + void dfThermo::correctHe(const double *he, const double *boundary_he) { checkCudaErrors(cudaMemcpy(dataBase_.d_he, he, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice)); diff --git a/src_gpu/dfYEqn.H b/src_gpu/dfYEqn.H index ab89abde6..b3715efc4 100644 --- a/src_gpu/dfYEqn.H +++ b/src_gpu/dfYEqn.H @@ -4,11 +4,13 @@ #include #include "dfMatrixDataBase.H" #include "dfMatrixOpBase.H" +#include "dfChemistrySolver.H" class dfYEqn { private: dfMatrixDataBase &dataBase_; + dfChemistrySolver &chemistrySolver_; // cuda resource cudaStream_t stream; @@ -48,6 +50,8 @@ private: double *d_phiUc = nullptr; double *d_DEff = nullptr; double *d_permute = nullptr; + // combustion fields + double *d_RR = nullptr; // computed on CPU, used on GPU, need memcpyh2d double *h_hai = nullptr; double *h_rhoD = nullptr; @@ -92,8 +96,8 @@ private: public: // 构造函数 - dfYEqn(dfMatrixDataBase &dataBase) - : dataBase_(dataBase) {} + dfYEqn(dfMatrixDataBase &dataBase, dfChemistrySolver &chemistrySolver) + : dataBase_(dataBase), chemistrySolver_(chemistrySolver) {} // 析构函数 ~dfYEqn(){} @@ -127,9 +131,11 @@ public: void yeqn_compute_thermo_alpha(cudaStream_t stream, int 
num_cells, const double *rhoD, double *thermo_alpha, int num_boundary_surfaces, const double *boundary_rhoD, double *boundary_thermo_alpha); - void yeqn_compute_DEff(cudaStream_t stream, int num_species, int num_cells, int num_boundary_surfaces, + void yeqn_compute_DEff_via_lewisNumber(cudaStream_t stream, int num_species, int num_cells, int num_boundary_surfaces, double *lewis_number, const double *alpha, const double *mut_sct, double *DEff, const double *boundary_alpha, const double *boundary_mut_sct, double *boundary_DEff); + void yeqn_compute_RR(dfChemistrySolver& chemistrySolver, cudaStream_t stream, const double *h_T, const double *d_T, + const double *p, const double *y, const double *rho, double *RR); void yeqn_fvc_laplacian_scalar(cudaStream_t stream, ncclComm_t comm, const int *neighbor_peer, int num_species, int num_cells, int num_surfaces, int num_boundary_surfaces, const int *lowerAddr, const int *upperAddr, @@ -140,9 +146,9 @@ public: const double *boundary_thermo_alpha, const double *boundary_hai, const double *boundary_vf, const int *cyclicNeighbor, const int *patchSizeOffset, double *boundary_output); void yeqn_compute_sumYDiffError_and_hDiffCorrFlux(cudaStream_t stream, int num_species, int num_cells, int num_boundary_surfaces, - const double *lewis_number, const double *thermo_alpha, const double *hai, const double *y, const double *grad_y, + const double *rhoD, const double *hai, const double *y, const double *grad_y, double *sumY_diff_error, double *hDiff_corr_flux, - const double *boundary_hai, const double *boundary_y, const double *boundary_grad_y, const double *boundary_thermo_alpha, + const double *boundary_hai, const double *boundary_y, const double *boundary_grad_y, const double *boundary_rhoD, double *boundary_sumY_diff_error, double *boundary_hDiff_corr_flux); void yeqn_compute_phiUc(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, const int *lowerAddr, const int *upperAddr, diff --git 
a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu index d68ad23ff..e6bbc5d4c 100644 --- a/src_gpu/dfYEqn.cu +++ b/src_gpu/dfYEqn.cu @@ -83,7 +83,7 @@ __global__ void yeqn_compute_phiUc_boundary(int num_boundary_surfaces, } __global__ void yeqn_sumError_and_compute_hDiffCorrFlux(int num_species, int num, - const double *lewis_number, const double *thermo_alpha, const double *hai, const double *y, const double *grady, + const double *rhoD, const double *hai, const double *y, const double *grady, double *sum_rhoD_grady, double *hDiffCorrFlux) { int index = blockDim.x * blockIdx.x + threadIdx.x; @@ -99,7 +99,7 @@ __global__ void yeqn_sumError_and_compute_hDiffCorrFlux(int num_species, int num double sum_hai_y = 0; for (int s = 0; s < num_species; s++) { double hai_value = hai[num * s + index]; - double rhoD_value = thermo_alpha[index] / lewis_number[s]; // le = alpha/D + double rhoD_value = rhoD[num * s + index]; // le = alpha/D double y_value = y[num * s + index]; double grady_x = grady[num * s * 3 + num * 0 + index]; double grady_y = grady[num * s * 3 + num * 1 + index]; @@ -346,6 +346,8 @@ void dfYEqn::createNonConstantFieldsInternal() { checkCudaErrors(cudaMalloc((void**)&d_phiUc, dataBase_.surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_DEff, dataBase_.cell_value_bytes * dataBase_.num_species)); checkCudaErrors(cudaMalloc((void**)&d_permute, dataBase_.cell_value_vec_bytes)); + + checkCudaErrors(cudaMalloc((void**)&d_RR, dataBase_.cell_value_bytes * dataBase_.num_species)); #endif // computed on CPU, used on GPU, need memcpyh2d checkCudaErrors(cudaMallocHost((void**)&h_rhoD, dataBase_.cell_value_bytes * dataBase_.num_species)); @@ -458,6 +460,8 @@ void dfYEqn::process() { checkCudaErrors(cudaMallocAsync((void**)&d_phiUc, dataBase_.surface_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMallocAsync((void**)&d_DEff, dataBase_.cell_value_bytes * dataBase_.num_species, dataBase_.stream)); checkCudaErrors(cudaMallocAsync((void**)&d_permute, 
dataBase_.cell_value_vec_bytes, dataBase_.stream)); + // combustion fields + checkCudaErrors(cudaMallocAsync((void**)&d_RR, dataBase_.cell_value_bytes * dataBase_.num_species, dataBase_.stream)); // thermophysical fields checkCudaErrors(cudaMallocAsync((void**)&d_boundary_hai, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, dataBase_.stream)); checkCudaErrors(cudaMallocAsync((void**)&d_boundary_mut_sct, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); @@ -489,6 +493,11 @@ void dfYEqn::process() { checkCudaErrors(cudaMemsetAsync(dataBase_.d_boundary_diff_alphaD, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(d_grad_y, 0, dataBase_.cell_value_vec_bytes * dataBase_.num_species, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(d_boundary_grad_y, 0, dataBase_.boundary_surface_value_vec_bytes * dataBase_.num_species, dataBase_.stream)); + // combustion fields + checkCudaErrors(cudaMemsetAsync(d_RR, 0, dataBase_.cell_value_bytes * dataBase_.num_species, dataBase_.stream)); + // calculate reaction rates + checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); + yeqn_compute_RR(chemistrySolver_, dataBase_.stream, dataBase_.h_T, dataBase_.d_T, dataBase_.d_p, dataBase_.d_y, dataBase_.d_rho_old, d_RR); // compute diffAlphaD yeqn_fvc_laplacian_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_species, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, @@ -522,9 +531,9 @@ void dfYEqn::process() { // compute sumYDiffError and hDiffCorrFlux yeqn_compute_sumYDiffError_and_hDiffCorrFlux(dataBase_.stream, dataBase_.num_species, dataBase_.num_cells, dataBase_.num_boundary_surfaces, - d_lewis_number, dataBase_.d_thermo_alpha, d_hai, dataBase_.d_y, d_grad_y, + dataBase_.d_thermo_rhoD, d_hai, dataBase_.d_y, d_grad_y, d_sumY_diff_error, dataBase_.d_hDiff_corr_flux, - d_boundary_hai, dataBase_.d_boundary_y, d_boundary_grad_y, 
dataBase_.d_boundary_thermo_alpha, + d_boundary_hai, dataBase_.d_boundary_y, d_boundary_grad_y, dataBase_.d_boundary_thermo_rhoD, d_boundary_sumY_diff_error, dataBase_.d_boundary_hDiff_corr_flux); // compute phiUc yeqn_compute_phiUc(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, @@ -539,8 +548,9 @@ void dfYEqn::process() { // turbulence->mut()/Sct = 0 when laminar. // double *d_DEff = d_rhoD; // double *d_boundary_DEff = d_boundary_rhoD; - yeqn_compute_DEff(dataBase_.stream, dataBase_.num_species, dataBase_.num_cells, dataBase_.num_boundary_surfaces, - d_lewis_number, dataBase_.d_thermo_alpha, d_mut_sct, d_DEff, dataBase_.d_boundary_thermo_alpha, d_boundary_mut_sct, d_boundary_DEff); + // TODO: calculate d_DEff in dfThermo + // yeqn_compute_DEff_via_lewisNumber(dataBase_.stream, dataBase_.num_species, dataBase_.num_cells, dataBase_.num_boundary_surfaces, + // d_lewis_number, dataBase_.d_thermo_alpha, d_mut_sct, d_DEff, dataBase_.d_boundary_thermo_alpha, d_boundary_mut_sct, d_boundary_DEff); #ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0)); @@ -574,6 +584,15 @@ void dfYEqn::process() { fvm_ddt_vol_scalar_vol_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, dataBase_.d_rho, dataBase_.d_rho_old, dataBase_.d_y + dataBase_.num_cells * s, dataBase_.d_volume, d_diag, d_source, 1.); + // **calculate div weights with limitedLinear scheme** + // compute_limitedLinear_weight(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_surfaces, + // dataBase_.num_cells, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_mesh_dis, + // dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_y + dataBase_.num_cells * s, dataBase_.d_phi, dataBase_.d_phi_weight, + // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), 
dataBase_.d_boundary_weight, dataBase_.d_boundary_face_cell, + // dataBase_.d_boundary_y + dataBase_.num_boundary_surfaces * s, dataBase_.d_boundary_sf, dataBase_.d_volume, + // dataBase_.d_boundary_mag_sf, dataBase_.d_boundary_phi, dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), + // dataBase_.d_boundary_delta_coeffs); + // fvmDiv(phi, Yi) fvm_div_scalar(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_phi, dataBase_.d_phi_weight, @@ -596,13 +615,14 @@ void dfYEqn::process() { fvm_laplacian_scalar(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_weight, dataBase_.d_mag_sf, dataBase_.d_delta_coeffs, - d_DEff + dataBase_.num_cells * s, + dataBase_.d_thermo_rhoD + dataBase_.num_cells * s, d_lower, d_upper, d_diag, // end for internal dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), - dataBase_.d_boundary_mag_sf, d_boundary_DEff + dataBase_.num_boundary_surfaces * s, + dataBase_.d_boundary_mag_sf, dataBase_.d_boundary_thermo_rhoD + dataBase_.num_boundary_surfaces * s, d_gradient_internal_coeffs + dataBase_.num_boundary_surfaces * s, d_gradient_boundary_coeffs + dataBase_.num_boundary_surfaces * s, d_internal_coeffs, d_boundary_coeffs, -1.); + fvc_to_source_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.d_volume, d_RR + dataBase_.num_cells * s, d_source); #ifndef DEBUG_CHECK_LDU // ldu to csr // use d_source as d_b @@ -634,8 +654,6 @@ void dfYEqn::process() { TICK_START_EVENT; // copy y and boundary_y to host - checkCudaErrors(cudaMemcpyAsync(dataBase_.h_y, dataBase_.d_y, dataBase_.cell_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); - checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_y, dataBase_.d_boundary_y, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); TICK_END_EVENT(YEqn post process for all 
species copy back); TICK_START_EVENT; @@ -651,6 +669,8 @@ void dfYEqn::process() { checkCudaErrors(cudaFreeAsync(d_DEff, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_permute, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_RR, dataBase_.stream)); + // thermophysical fields //checkCudaErrors(cudaFreeAsync(d_boundary_rhoD, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_boundary_hai, dataBase_.stream)); @@ -684,7 +704,11 @@ void dfYEqn::solve(int speciesIndex) { TICK_END_EVENT(YEqn solve one specie); } -void dfYEqn::postProcess(double *h_y, double *h_boundary_y) {} +void dfYEqn::postProcess(double *h_y, double *h_boundary_y) { + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_y, dataBase_.d_y, dataBase_.cell_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_y, dataBase_.d_boundary_y, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); + sync(); +} void dfYEqn::sync() { checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); @@ -702,7 +726,7 @@ void dfYEqn::yeqn_compute_thermo_alpha(cudaStream_t stream, num_boundary_surfaces, boundary_rhoD, boundary_thermo_alpha); } -void dfYEqn::yeqn_compute_DEff(cudaStream_t stream, int num_species, int num_cells, int num_boundary_surfaces, +void dfYEqn::yeqn_compute_DEff_via_lewisNumber(cudaStream_t stream, int num_species, int num_cells, int num_boundary_surfaces, double *lewis_number, const double *alpha, const double *mut_sct, double *DEff, const double *boundary_alpha, const double *boundary_mut_sct, double *boundary_DEff) { @@ -715,6 +739,12 @@ void dfYEqn::yeqn_compute_DEff(cudaStream_t stream, int num_species, int num_cel lewis_number, boundary_alpha, boundary_mut_sct, boundary_DEff); } +void dfYEqn::yeqn_compute_RR(dfChemistrySolver& chemistrySolver, cudaStream_t stream, const double *h_T, const double *d_T, + const double *p, const double *y, const double *rho, double 
*RR) +{ + chemistrySolver.Inference(h_T, d_T, p, y, rho, RR); +} + void dfYEqn::yeqn_fvc_laplacian_scalar(cudaStream_t stream, ncclComm_t comm, const int *neighbor_peer, int num_species, int num_cells, int num_surfaces, int num_boundary_surfaces, const int *lowerAddr, const int *upperAddr, @@ -792,18 +822,18 @@ void dfYEqn::yeqn_fvc_laplacian_scalar(cudaStream_t stream, ncclComm_t comm, con } void dfYEqn::yeqn_compute_sumYDiffError_and_hDiffCorrFlux(cudaStream_t stream, int num_species, int num_cells, int num_boundary_surfaces, - const double *lewis_number, const double *thermo_alpha, const double *hai, const double *y, const double *grad_y, + const double *rhoD, const double *hai, const double *y, const double *grad_y, double *sumY_diff_error, double *hDiff_corr_flux, - const double *boundary_hai, const double *boundary_y, const double *boundary_grad_y, const double *boundary_thermo_alpha, + const double *boundary_hai, const double *boundary_y, const double *boundary_grad_y, const double *boundary_rhoD, double *boundary_sumY_diff_error, double *boundary_hDiff_corr_flux) { size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; yeqn_sumError_and_compute_hDiffCorrFlux<<>>(num_species, num_cells, - lewis_number, thermo_alpha, hai, y, grad_y, sumY_diff_error, hDiff_corr_flux); + rhoD, hai, y, grad_y, sumY_diff_error, hDiff_corr_flux); blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; yeqn_sumError_and_compute_hDiffCorrFlux<<>>(num_species, num_boundary_surfaces, - lewis_number, boundary_thermo_alpha, boundary_hai, boundary_y, boundary_grad_y, boundary_sumY_diff_error, boundary_hDiff_corr_flux); + boundary_rhoD, boundary_hai, boundary_y, boundary_grad_y, boundary_sumY_diff_error, boundary_hDiff_corr_flux); } void dfYEqn::yeqn_compute_phiUc(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, From 9375fb143aa917f738c7d8fc114460a104b8bef7 Mon Sep 17 
00:00:00 2001 From: maorz1998 Date: Tue, 7 Nov 2023 23:56:13 +0800 Subject: [PATCH 4/5] fix bugs in pEqn & UEqn --- applications/solvers/dfLowMachFoam/UEqn.H | 8 ++-- applications/solvers/dfLowMachFoam/YEqn.H | 38 ++++++++++----- .../solvers/dfLowMachFoam/createGPUSolver.H | 47 +++++++++++++++++-- .../solvers/dfLowMachFoam/dfLowMachFoam.C | 33 +++++++++---- applications/solvers/dfLowMachFoam/pEqn_CPU.H | 3 +- applications/solvers/dfLowMachFoam/pEqn_GPU.H | 3 +- src_gpu/dfUEqn.H | 5 ++ src_gpu/dfUEqn.cu | 25 +++++++++- src_gpu/dfpEqn.cu | 6 +-- 9 files changed, 130 insertions(+), 38 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H index a6244c870..a0a9689b1 100644 --- a/applications/solvers/dfLowMachFoam/UEqn.H +++ b/applications/solvers/dfLowMachFoam/UEqn.H @@ -11,7 +11,6 @@ fvm::div(phi, U) + turbulence->divDevRhoReff(U) - == -fvc::grad(p) ); fvVectorMatrix& UEqn = tUEqn.ref(); TICK_STOP(CPU assembly time); @@ -64,7 +63,7 @@ #if defined DEBUG_ // UEqn.relax(); TICK_START; - UEqn.solve(); + solve(UEqn == -fvc::grad(p)); K.oldTime(); K = 0.5*magSqr(U); TICK_STOP(CPU solve time); @@ -118,7 +117,7 @@ // h_internal_coeffs.data(), h_boundary_coeffs.data(), // // &gradU[0][0], h_boundary_gradU, // printFlag); - //UEqn_GPU.compareU(&U[0][0], h_boundary_u_tmp, printFlag); + // UEqn_GPU.compareU(&U[0][0], h_boundary_u_tmp, printFlag); } DEBUG_TRACE; #endif @@ -129,7 +128,6 @@ ( fvm::ddt(rho, U) + fvm::div(phi, U) + turbulence->divDevRhoReff(U) - == -fvc::grad(p) ); fvVectorMatrix& UEqn = tUEqn.ref(); end1 = std::clock(); @@ -140,7 +138,7 @@ start1 = std::clock(); if (pimple.momentumPredictor()) { - solve(UEqn); + solve(UEqn == -fvc::grad(p)); K.oldTime(); K = 0.5*magSqr(U); } diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H index d430ebd73..010f9551c 100644 --- a/applications/solvers/dfLowMachFoam/YEqn.H +++ b/applications/solvers/dfLowMachFoam/YEqn.H @@ -22,6 
+22,15 @@ //randomInitField(const_cast(chemistry->hai(5))); //randomInitField(const_cast(chemistry->hai(6))); + // auto& mgcs = dynamic_cast&>(mvConvection.ref()); + // tmp> tinterpScheme_ = mgcs.interpolationScheme()()(Y[0]); + // tmp tweights = tinterpScheme_().weights(Y[0]); + // const surfaceScalarField& weights = tweights(); + // Info << "CPU weights\n" << weights << endl; + + // auto& limitedScheme_ = dynamic_cast&>(tinterpScheme_()); + // Info << "CPU limiter\n" << limitedScheme_.limiter(Y[0]) << endl; + forAll(Y, i) { sumYDiffError += chemistry->rhoD(i)*fvc::grad(Y[i]); @@ -44,6 +53,12 @@ ) ); + // auto& mgcs = dynamic_cast&>(mvConvection.ref()); + // tmp> tinterpScheme_ = mgcs.interpolationScheme()()(Y[0]); + // tmp tweights = tinterpScheme_().weights(Y[0]); + // const surfaceScalarField& weights = tweights(); + // Info << "CPU weights\n" << weights << endl; + start1 = std::clock(); forAll(Y, i) { @@ -62,17 +77,6 @@ MPI_Initialized(&flag_mpi_init); if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); { - if (!splitting) - { - std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); - combustion->correct(); - //label flag_mpi_init; - //MPI_Initialized(&flag_mpi_init); - if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); - std::chrono::steady_clock::time_point stop = std::chrono::steady_clock::now(); - std::chrono::duration processingTime = std::chrono::duration_cast>(stop - start); - time_monitor_chem += processingTime.count(); - } #ifdef GPUSolverNew_ #if defined DEBUG_ @@ -265,6 +269,18 @@ if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); fflush(stderr); #else + if (!splitting) + { + std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); + combustion->correct(); + //label flag_mpi_init; + //MPI_Initialized(&flag_mpi_init); + if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); + std::chrono::steady_clock::time_point stop = std::chrono::steady_clock::now(); + 
std::chrono::duration processingTime = std::chrono::duration_cast>(stop - start); + time_monitor_chem += processingTime.count(); + } + start2 = std::clock(); volScalarField Yt(0.0*Y[0]); int speciesIndex = 0; diff --git a/applications/solvers/dfLowMachFoam/createGPUSolver.H b/applications/solvers/dfLowMachFoam/createGPUSolver.H index 7714fcbd1..79455fb22 100644 --- a/applications/solvers/dfLowMachFoam/createGPUSolver.H +++ b/applications/solvers/dfLowMachFoam/createGPUSolver.H @@ -6,9 +6,10 @@ int nRanks, myRank, localRank, mpi_init_flag = 0; dfMatrixDataBase dfDataBase; dfThermo thermo_GPU(dfDataBase); +dfChemistrySolver chemistrySolver_GPU(dfDataBase); dfRhoEqn rhoEqn_GPU(dfDataBase); dfUEqn UEqn_GPU(dfDataBase); -dfYEqn YEqn_GPU(dfDataBase); +dfYEqn YEqn_GPU(dfDataBase, chemistrySolver_GPU); dfEEqn EEqn_GPU(dfDataBase, thermo_GPU); dfpEqn pEqn_GPU(dfDataBase); @@ -325,7 +326,17 @@ void createGPUBase(const IOdictionary& CanteraTorchProperties, fvMesh& mesh, Ptr dfDataBase.createConstantFieldsInternal(); dfDataBase.createConstantFieldsBoundary(); - dfDataBase.initConstantFieldsInternal(&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.surfaceInterpolation::weights()[0], &mesh.deltaCoeffs()[0], &mesh.V()[0]); + + // construct mesh distance for limitedLinear scheme + vectorField meshDistance = mesh.Sf(); + forAll(meshDistance, facei) { + label own = owner[facei]; + label nei = neighbour[facei]; + meshDistance[facei] = mesh.C()[nei] - mesh.C()[own]; + } + + dfDataBase.initConstantFieldsInternal(&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.surfaceInterpolation::weights()[0], + &mesh.deltaCoeffs()[0], &mesh.V()[0], &meshDistance[0][0]); dfDataBase.initConstantFieldsBoundary(boundary_sf, boundary_mag_sf, boundary_delta_coeffs, boundary_weights, boundary_face_cell, patch_type_calculated, patch_type_extropolated); @@ -586,7 +597,7 @@ void createGPUpEqn(const IOdictionary& CanteraTorchProperties, volScalarField& p void createGPUThermo(const IOdictionary& CanteraTorchProperties, 
volScalarField& T, volScalarField& he, const volScalarField& psi, const volScalarField& alpha, const volScalarField& mu, - const volScalarField& K, const volScalarField& dpdt) { + const volScalarField& K, const volScalarField& dpdt, dfChemistryModel* chemistry) { DEBUG_TRACE; // initialize dfThermo string mechanismFile; @@ -604,6 +615,8 @@ void createGPUThermo(const IOdictionary& CanteraTorchProperties, volScalarField& double *h_boundary_thermo_alpha = new double[dfDataBase.num_boundary_surfaces]; double *h_boundary_mu = new double[dfDataBase.num_boundary_surfaces]; double *h_boundary_k = new double[dfDataBase.num_boundary_surfaces]; + double *h_boundary_thermo_rhoD = new double[dfDataBase.num_boundary_surfaces * dfDataBase.num_species]; + double *h_thermo_rhoD = new double[dfDataBase.num_cells * dfDataBase.num_species]; // initialize thermo boundary std::vector patch_type_T(dfDataBase.num_patches); @@ -651,6 +664,14 @@ void createGPUThermo(const IOdictionary& CanteraTorchProperties, volScalarField& dynamic_cast&>(patchK).patchInternalField()(); memcpy(h_boundary_k + offset + patchsize, &patchKInternal[0], patchsize * sizeof(double)); + for (int i = 0; i < dfDataBase.num_species; i++) { + const fvPatchScalarField& patchRhoD = chemistry->rhoD(i).boundaryField()[patchi]; + memcpy(h_boundary_thermo_rhoD + i * dfDataBase.num_boundary_surfaces + offset, &patchRhoD[0], patchsize * sizeof(double)); + scalarField patchRhoDInternal = + dynamic_cast&>(patchRhoD).patchInternalField()(); + memcpy(h_boundary_thermo_rhoD + i * dfDataBase.num_boundary_surfaces + offset + patchsize, &patchRhoDInternal[0], patchsize * sizeof(double)); + } + offset += patchsize * 2; } else { memcpy(h_boundary_T + offset, &patchT[0], patchsize * sizeof(double)); @@ -660,13 +681,29 @@ void createGPUThermo(const IOdictionary& CanteraTorchProperties, volScalarField& memcpy(h_boundary_mu + offset, &patchMu[0], patchsize * sizeof(double)); memcpy(h_boundary_k + offset, &patchK[0], patchsize * 
sizeof(double)); + for (int i = 0; i < dfDataBase.num_species; i++) { + const fvPatchScalarField& patchRhoD = chemistry->rhoD(i).boundaryField()[patchi]; + memcpy(h_boundary_thermo_rhoD + i * dfDataBase.num_boundary_surfaces + offset, &patchRhoD[0], patchsize * sizeof(double)); + } offset += patchsize; } } + for (int i = 0; i < dfDataBase.num_species; i++) { + memcpy(h_thermo_rhoD + i * dfDataBase.num_cells, &chemistry->rhoD(i)[0], dfDataBase.num_cells * sizeof(double)); + } + double *h_T = dfDataBase.getFieldPointer("T", location::cpu, position::internal); + memcpy(h_T, &T[0], dfDataBase.cell_value_bytes); + thermo_GPU.setConstantFields(patch_type_T); - thermo_GPU.initNonConstantFields(&T[0], &he[0], &psi[0], &alpha[0], &mu[0], &K[0], &dpdt[0], - h_boundary_T, h_boundary_he, h_boundary_thermo_psi, h_boundary_thermo_alpha, h_boundary_mu, h_boundary_k); + thermo_GPU.initNonConstantFields(h_T, &he[0], &psi[0], &alpha[0], &mu[0], &K[0], &dpdt[0], h_thermo_rhoD, + h_boundary_T, h_boundary_he, h_boundary_thermo_psi, h_boundary_thermo_alpha, h_boundary_mu, h_boundary_k, h_boundary_thermo_rhoD); delete h_boundary_T; delete h_boundary_he; + delete h_boundary_thermo_psi; + delete h_boundary_thermo_alpha; + delete h_boundary_mu; + delete h_boundary_k; + delete h_boundary_thermo_rhoD; + delete h_thermo_rhoD; } diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C index 5f3c7aea1..4f4c771df 100644 --- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C +++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C @@ -76,6 +76,7 @@ Description #include "dfMatrixOpBase.H" #include "dfNcclBase.H" #include "dfThermo.H" + #include "dfChemistrySolver.H" #include #include @@ -87,9 +88,13 @@ Description #include "upwind.H" #include "GenFvMatrix.H" #include "CanteraMixture.H" + #include "multivariateGaussConvectionScheme.H" + #include "limitedSurfaceInterpolationScheme.H" #else #include "processorFvPatchField.H" #include 
"cyclicFvPatchField.H" + #include "multivariateGaussConvectionScheme.H" + #include "limitedSurfaceInterpolationScheme.H" int myRank = -1; int mpi_init_flag = 0; #endif @@ -216,7 +221,11 @@ int main(int argc, char *argv[]) const volScalarField& mu = thermo.mu(); const volScalarField& alpha = thermo.alpha(); - createGPUThermo(CanteraTorchProperties, T, thermo.he(), psi, alpha, mu, K, dpdt); + createGPUThermo(CanteraTorchProperties, T, thermo.he(), psi, alpha, mu, K, dpdt, chemistry); + if (chemistry->ifChemstry()) + { + chemistrySolver_GPU.setConstantValue(dfDataBase.num_cells, dfDataBase.num_species, 4096); + } #endif end1 = std::clock(); @@ -302,6 +311,7 @@ int main(int argc, char *argv[]) thermo_GPU.sync(); #if defined DEBUG_ // check correctThermo + int speciesIndex = 6; chemistry->correctThermo(); // reference double *h_boundary_T_tmp = new double[dfDataBase.num_boundary_surfaces]; double *h_boundary_he_tmp = new double[dfDataBase.num_boundary_surfaces]; @@ -309,6 +319,7 @@ int main(int argc, char *argv[]) double *h_boundary_rho_tmp = new double[dfDataBase.num_boundary_surfaces]; double *h_boundary_thermo_alpha_tmp = new double[dfDataBase.num_boundary_surfaces]; double *h_boundary_thermo_psi_tmp = new double[dfDataBase.num_boundary_surfaces]; + double *h_boundary_thermo_rhoD_tmp = new double[dfDataBase.num_boundary_surfaces]; offset = 0; forAll(T.boundaryField(), patchi) { @@ -318,6 +329,7 @@ int main(int argc, char *argv[]) const fvPatchScalarField& patchAlpha = alpha.boundaryField()[patchi]; const fvPatchScalarField& patchRho = thermo.rho()().boundaryField()[patchi]; const fvPatchScalarField& patchT = T.boundaryField()[patchi]; + const fvPatchScalarField& patchRhoD = chemistry->rhoD(speciesIndex).boundaryField()[patchi]; int patchsize = patchT.size(); if (patchT.type() == "processor") { @@ -351,6 +363,11 @@ int main(int argc, char *argv[]) dynamic_cast&>(patchRho).patchInternalField()(); memcpy(h_boundary_rho_tmp + offset + patchsize, &patchRhoInternal[0], 
patchsize * sizeof(double)); + memcpy(h_boundary_thermo_rhoD_tmp + offset, &patchRhoD[0], patchsize * sizeof(double)); + scalarField patchRhoDInternal = + dynamic_cast&>(patchRhoD).patchInternalField()(); + memcpy(h_boundary_thermo_rhoD_tmp + offset + patchsize, &patchRhoDInternal[0], patchsize * sizeof(double)); + offset += patchsize * 2; } else { memcpy(h_boundary_T_tmp + offset, &patchT[0], patchsize * sizeof(double)); @@ -359,6 +376,7 @@ int main(int argc, char *argv[]) memcpy(h_boundary_thermo_alpha_tmp + offset, &patchAlpha[0], patchsize * sizeof(double)); memcpy(h_boundary_mu_tmp + offset, &patchMu[0], patchsize * sizeof(double)); memcpy(h_boundary_rho_tmp + offset, &patchRho[0], patchsize * sizeof(double)); + memcpy(h_boundary_thermo_rhoD_tmp + offset, &patchRhoD[0], patchsize * sizeof(double)); offset += patchsize; } @@ -369,12 +387,13 @@ int main(int argc, char *argv[]) MPI_Comm_rank(MPI_COMM_WORLD, &rank); } if (!mpi_init_flag || rank == 0) { - thermo_GPU.compareT(&T[0], h_boundary_T_tmp, printFlag); + // thermo_GPU.compareT(&T[0], h_boundary_T_tmp, printFlag); // thermo_GPU.compareHe(&thermo.he()[0], h_boundary_he_tmp, printFlag); // thermo_GPU.comparePsi(&psi[0], h_boundary_thermo_psi_tmp, printFlag); // thermo_GPU.compareAlpha(&alpha[0], h_boundary_thermo_alpha_tmp, printFlag); // thermo_GPU.compareMu(&mu[0], h_boundary_mu_tmp, printFlag); // thermo_GPU.compareRho(&thermo.rho()()[0], h_boundary_rho_tmp, printFlag); + // thermo_GPU.compareRhoD(&chemistry->rhoD(speciesIndex)[0], h_boundary_thermo_rhoD_tmp, speciesIndex, printFlag); } delete h_boundary_T_tmp; @@ -396,11 +415,10 @@ int main(int argc, char *argv[]) } // update T for debug #ifdef GPUSolverNew_ - double *h_T_tmp = new double[dfDataBase.num_cells]; + double *h_T = dfDataBase.getFieldPointer("T", location::cpu, position::internal); double *h_boundary_T_tmp = new double[dfDataBase.num_boundary_surfaces]; - thermo_GPU.updateCPUT(h_T_tmp, h_boundary_T_tmp); - - memcpy(&T[0], h_T_tmp, T.size() * 
sizeof(double)); + thermo_GPU.updateCPUT(h_T, h_boundary_T_tmp); + memcpy(&T[0], h_T, T.size() * sizeof(double)); offset = 0; forAll(T.boundaryField(), patchi) { const fvPatchScalarField& const_patchT = T.boundaryField()[patchi]; @@ -414,7 +432,6 @@ int main(int argc, char *argv[]) offset += patchsize; } } - delete h_T_tmp; delete h_boundary_T_tmp; #endif @@ -469,7 +486,7 @@ int main(int argc, char *argv[]) #endif #ifdef GPUSolverNew_ - // write U for + // write U UEqn_GPU.postProcess(); memcpy(&U[0][0], dfDataBase.h_u, dfDataBase.cell_value_vec_bytes); U.correctBoundaryConditions(); diff --git a/applications/solvers/dfLowMachFoam/pEqn_CPU.H b/applications/solvers/dfLowMachFoam/pEqn_CPU.H index b12fe14fa..aa379dde7 100644 --- a/applications/solvers/dfLowMachFoam/pEqn_CPU.H +++ b/applications/solvers/dfLowMachFoam/pEqn_CPU.H @@ -68,8 +68,7 @@ else fvScalarMatrix pDDtEqn ( fvc::ddt(rho) + - // psi*correction(fvm::ddt(p)) // TODO: bugs in correction fvMatrix - psi*fvm::ddt(p) + psi*correction(fvm::ddt(p)) + fvc::div(phiHbyA) ); diff --git a/applications/solvers/dfLowMachFoam/pEqn_GPU.H b/applications/solvers/dfLowMachFoam/pEqn_GPU.H index b982d8475..3dd060955 100644 --- a/applications/solvers/dfLowMachFoam/pEqn_GPU.H +++ b/applications/solvers/dfLowMachFoam/pEqn_GPU.H @@ -98,8 +98,7 @@ UEqn_GPU.sync(); fvScalarMatrix pDDtEqn ( fvc::ddt(rho) + - // psi*correction(fvm::ddt(p)) // TODO: bugs in correction fvMatrix - psi*fvm::ddt(p) + psi*correction(fvm::ddt(p)) + fvc::div(phiHbyA) ); while (pimple.correctNonOrthogonal()) diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index a5f30d987..71f8cede5 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -84,6 +84,11 @@ private: // non-constant fields - csr double *d_A = nullptr; double *d_b = nullptr; // TODO: needless + double *d_ldu_solve = nullptr; + double *d_extern_solve = nullptr; + double *d_source_solve = nullptr; + double *d_internal_coeffs_solve = nullptr; + double *d_boundary_coeffs_solve = nullptr; // field 
pointer map std::unordered_map fieldPointerMap; diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index f0253e62a..4b828e170 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -433,6 +433,11 @@ void dfUEqn::createNonConstantLduAndCsrFields() { #ifndef STREAM_ALLOCATOR checkCudaErrors(cudaMalloc((void**)&d_A, dataBase_.csr_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_b, dataBase_.cell_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_ldu_solve, dataBase_.csr_value_bytes)); + d_extern_solve = d_ldu_solve + dataBase_.num_cells + 2 * dataBase_.num_surfaces; + checkCudaErrors(cudaMalloc((void**)&d_source_solve, dataBase_.cell_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_solve, dataBase_.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_solve, dataBase_.boundary_surface_value_vec_bytes)); #endif checkCudaErrors(cudaMalloc((void**)&d_A_pEqn, dataBase_.cell_value_bytes)); // TODO: delete redundant variables checkCudaErrors(cudaMalloc((void**)&d_H_pEqn, dataBase_.cell_value_vec_bytes)); @@ -511,6 +516,12 @@ void dfUEqn::process() { checkCudaErrors(cudaMallocAsync((void**)&d_A, dataBase_.csr_value_vec_bytes, dataBase_.stream)); checkCudaErrors(cudaMallocAsync((void**)&d_b, dataBase_.cell_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_ldu_solve, dataBase_.csr_value_bytes, dataBase_.stream)); + d_extern_solve = d_ldu_solve + dataBase_.num_cells + 2 * dataBase_.num_surfaces; + checkCudaErrors(cudaMallocAsync((void**)&d_source_solve, dataBase_.cell_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_internal_coeffs_solve, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_coeffs_solve, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream)); + #endif // checkCudaErrors(cudaMemcpyAsync(d_u_host_order, dataBase_.h_u, 
dataBase_.cell_value_vec_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); @@ -584,9 +595,15 @@ void dfUEqn::process() { dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_source, // end for internal dataBase_.num_patches, dataBase_.patch_size.data(), dataBase_.patch_type_calculated.data(), dataBase_.d_boundary_weight, dataBase_.d_boundary_face_cell, d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume); + + checkCudaErrors(cudaMemcpyAsync(d_ldu_solve, d_ldu, dataBase_.csr_value_bytes, cudaMemcpyDeviceToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(d_source_solve, d_source, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(d_internal_coeffs_solve, d_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_coeffs_solve, d_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToDevice, dataBase_.stream)); + fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, - dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_source, + dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_source_solve, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_weight, dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume, -1.); getrAU(dataBase_.stream, dataBase_.nccl_comm, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, @@ -597,7 +614,7 @@ void dfUEqn::process() { ueqn_ldu_to_csr(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.num_Nz, dataBase_.d_boundary_face_cell, dataBase_.d_ldu_to_csr_index, dataBase_.d_diag_to_csr_index, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_u, 
dataBase_.d_boundary_u, - d_ldu, d_extern, d_source, d_internal_coeffs, d_boundary_coeffs, dataBase_.cyclicNeighbor.data(), + d_ldu_solve, d_extern_solve, d_source_solve, d_internal_coeffs_solve, d_boundary_coeffs_solve, dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), d_A, d_b); #endif #ifdef USE_GRAPH @@ -654,6 +671,10 @@ void dfUEqn::process() { checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_ldu_solve, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_source_solve, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_internal_coeffs_solve, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_coeffs_solve, dataBase_.stream)); #endif TICK_END_EVENT(UEqn post process free); diff --git a/src_gpu/dfpEqn.cu b/src_gpu/dfpEqn.cu index fed764a36..dc7038a6d 100644 --- a/src_gpu/dfpEqn.cu +++ b/src_gpu/dfpEqn.cu @@ -263,9 +263,9 @@ __global__ void correct_diag_mtx_multi_tpsi_kernel(int num_cells, const double * return; // correction: source += (-diag * psi + source) - // double srcVal = source[index]; - // double APsi = - diag[index] * psi[index] + srcVal; - // source[index] += APsi; + double srcVal = source[index]; + double APsi = - diag[index] * psi[index] + srcVal; + source[index] -= APsi; // multi psi double tPsiVal = thermo_psi[index]; From 7a2c9240574fb090ba01ce90c7be81697e4bc0e9 Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Tue, 7 Nov 2023 23:57:41 +0800 Subject: [PATCH 5/5] limitedLinear debuging --- src_gpu/AmgXSolver.H | 26 ------ src_gpu/dfEEqn.cu | 4 +- src_gpu/dfMatrixDataBase.H | 7 +- src_gpu/dfMatrixDataBase.cu | 15 +++- src_gpu/dfMatrixOpBase.H | 11 +++ src_gpu/dfMatrixOpBase.cu | 171 ++++++++++++++++++++++++++++++++++++ src_gpu/dfRhoEqn.cu | 2 +- 7 files changed, 205 insertions(+), 31 deletions(-) diff --git a/src_gpu/AmgXSolver.H b/src_gpu/AmgXSolver.H index 9e6e680aa..8eb194a6f 100644 --- a/src_gpu/AmgXSolver.H +++ 
b/src_gpu/AmgXSolver.H @@ -30,32 +30,6 @@ // mpi #include - -/** \brief A macro to check the returned CUDA error code. - * - * \param call [in] Function call to CUDA API. - */ -# define CHECK(call) \ -do \ -{ \ - const cudaError_t error_code = call; \ - if (error_code != cudaSuccess) \ - { \ - printf("CUDA Error:\n"); \ - printf(" File: %s\n", __FILE__); \ - printf(" Line: %d\n", __LINE__); \ - printf(" Error code: %d\n", error_code); \ - printf(" Error text: %s\n", \ - cudaGetErrorString(error_code)); \ - exit(1); \ - } \ -} while (0) - - - - - - /** \brief A wrapper class for coupling PETSc and AmgX. * * This class is a wrapper of AmgX library for PETSc. PETSc users only need to diff --git a/src_gpu/dfEEqn.cu b/src_gpu/dfEEqn.cu index cb43f8c9a..ad55af27a 100644 --- a/src_gpu/dfEEqn.cu +++ b/src_gpu/dfEEqn.cu @@ -222,8 +222,8 @@ void dfEEqn::process() { TICK_START_EVENT; // copy he to host - checkCudaErrors(cudaMemcpyAsync(dataBase_.h_he, dataBase_.d_he, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_he, dataBase_.d_boundary_he, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + // checkCudaErrors(cudaMemcpyAsync(dataBase_.h_he, dataBase_.d_he, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + // checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_he, dataBase_.d_boundary_he, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); TICK_END_EVENT(EEqn post process copy back); TICK_START_EVENT; diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 40cf32bae..8cf6485dd 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -161,6 +161,7 @@ struct dfMatrixDataBase // constant fields - internal double *d_sf = nullptr; + double *d_mesh_dis = nullptr; double *d_mag_sf = nullptr; double *d_weight = nullptr; double *d_phi_weight = nullptr; // weight for 
mvConvection->fvmDiv @@ -168,6 +169,7 @@ struct dfMatrixDataBase double *d_volume = nullptr; double *h_sf = nullptr; + double *h_mesh_dis = nullptr; // constant fields - boundary double *d_boundary_sf = nullptr; @@ -207,8 +209,10 @@ struct dfMatrixDataBase double *d_dpdt = nullptr; double *d_T = nullptr; + double *h_T = nullptr; double *d_mu = nullptr; double *d_thermo_alpha = nullptr; + double *d_thermo_rhoD = nullptr; // computed on GPU, used on CPU, need memcpyd2h - host double *h_rho = nullptr; @@ -258,6 +262,7 @@ struct dfMatrixDataBase double *d_boundary_T = nullptr; double *d_boundary_mu = nullptr; double *d_boundary_thermo_alpha = nullptr; + double *d_boundary_thermo_rhoD = nullptr; // boundary fields used between eqns double *d_boundary_rAU = nullptr; double *d_boundary_HbyA = nullptr; @@ -300,7 +305,7 @@ struct dfMatrixDataBase void createConstantFieldsInternal(); void createConstantFieldsBoundary(); void initConstantFieldsInternal(const double *sf, const double *mag_sf, - const double *weight, const double *delta_coeffs, const double *volume); + const double *weight, const double *delta_coeffs, const double *volume, const double *mesh_distance); void initConstantFieldsBoundary(const double *boundary_sf, const double *boundary_mag_sf, const double *boundary_delta_coeffs, const double *boundary_weight, const int *boundary_face_cell, std::vector &patch_type_calculated, std::vector &patch_type_extropolated); diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index fd87cb46a..2dd42745d 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -267,12 +267,14 @@ void dfMatrixDataBase::setConstantIndexes(const int *owner, const int *neighbor, void dfMatrixDataBase::createConstantFieldsInternal() { checkCudaErrors(cudaMalloc((void**)&d_sf, surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_mesh_dis, surface_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_mag_sf, surface_value_bytes)); 
checkCudaErrors(cudaMalloc((void**)&d_weight, surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_phi_weight, surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_delta_coeffs, surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_volume, cell_value_bytes)); fieldPointerMap["d_sf"] = d_sf; + fieldPointerMap["d_mesh_dis"] = d_mesh_dis; fieldPointerMap["d_mag_sf"] = d_mag_sf; fieldPointerMap["d_weight"] = d_weight; fieldPointerMap["d_phi_weight"] = d_phi_weight; @@ -280,7 +282,9 @@ void dfMatrixDataBase::createConstantFieldsInternal() { fieldPointerMap["d_volume"] = d_volume; checkCudaErrors(cudaMallocHost((void**)&h_sf, surface_value_vec_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_mesh_dis, surface_value_vec_bytes)); fieldPointerMap["h_sf"] = h_sf; + fieldPointerMap["h_mesh_dis"] = h_mesh_dis; } void dfMatrixDataBase::createConstantFieldsBoundary() { @@ -299,14 +303,18 @@ void dfMatrixDataBase::createConstantFieldsBoundary() { } void dfMatrixDataBase::initConstantFieldsInternal(const double *sf, const double *mag_sf, - const double *weight, const double *delta_coeffs, const double *volume) { + const double *weight, const double *delta_coeffs, const double *volume, const double *mesh_distance) { // permute sf for (int i = 0; i < num_surfaces; i++) { h_sf[num_surfaces * 0 + i] = sf[i * 3 + 0]; h_sf[num_surfaces * 1 + i] = sf[i * 3 + 1]; h_sf[num_surfaces * 2 + i] = sf[i * 3 + 2]; + h_mesh_dis[num_surfaces * 0 + i] = mesh_distance[i * 3 + 0]; + h_mesh_dis[num_surfaces * 1 + i] = mesh_distance[i * 3 + 1]; + h_mesh_dis[num_surfaces * 2 + i] = mesh_distance[i * 3 + 2]; } checkCudaErrors(cudaMemcpyAsync(d_sf, h_sf, surface_value_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_mesh_dis, h_mesh_dis, surface_value_vec_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_mag_sf, mag_sf, surface_value_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_weight, weight, 
surface_value_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_delta_coeffs, delta_coeffs, surface_value_bytes, cudaMemcpyHostToDevice, stream)); @@ -366,6 +374,8 @@ void dfMatrixDataBase::createNonConstantFieldsInternal() { // thermophysical fields checkCudaErrors(cudaMalloc((void**)&d_thermo_psi, cell_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_thermo_alpha, cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_thermo_rhoD, num_species * cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_hDiff_corr_flux, cell_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_diff_alphaD, cell_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_dpdt, cell_value_bytes)); @@ -381,6 +391,7 @@ void dfMatrixDataBase::createNonConstantFieldsInternal() { checkCudaErrors(cudaMalloc((void**)&d_HbyA, cell_value_vec_bytes)); // computed on GPU, used on CPU, need memcpyd2h + checkCudaErrors(cudaMallocHost((void**)&h_T, cell_value_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_rho, cell_value_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_rho_old, cell_value_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_u, cell_value_vec_bytes)); @@ -389,6 +400,7 @@ void dfMatrixDataBase::createNonConstantFieldsInternal() { checkCudaErrors(cudaMallocHost((void**)&h_he, cell_value_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_k, cell_value_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_k_old, cell_value_bytes)); + fieldPointerMap["h_T"] = h_T; fieldPointerMap["h_rho"] = h_rho; fieldPointerMap["h_rho_old"] = h_rho_old; fieldPointerMap["h_u"] = h_u; @@ -440,6 +452,7 @@ void dfMatrixDataBase::createNonConstantFieldsBoundary() { // thermophysical fields checkCudaErrors(cudaMalloc((void**)&d_boundary_thermo_psi, boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_thermo_alpha, boundary_surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_thermo_rhoD, num_species * 
boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_hDiff_corr_flux, boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_diff_alphaD, boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_T, boundary_surface_value_bytes)); diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index 5a61980f9..c27090303 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -156,6 +156,17 @@ void correct_boundary_conditions_vector(cudaStream_t stream, ncclComm_t comm, void compute_upwind_weight(cudaStream_t stream, int num_surfaces, const double *phi, double *weight); +void compute_limitedLinear_weight(cudaStream_t stream, ncclComm_t comm, const int *neighbor_peer, + int num_surfaces, int num_cells, int num_boundary_surfaces, + const int *lowerAddr, const int *upperAddr, const double *mesh_distance, + const double *weight, const double *Sf, const double *vf, const double *phi, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, const double *boundary_weight, + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, + const double *volume, const double *boundary_mag_Sf, const double *boundary_phi, + // const double *boundary_distance, double *boundary_output, + const int *cyclicNeighbor, const int *patchSizeOffset, + const double *boundary_deltaCoeffs); + // fvm ops void fvm_ddt_vol_scalar_vol_scalar(cudaStream_t stream, int num_cells, double rDeltaT, diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index 6e520f7c6..afe3b9cf4 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -5,6 +5,9 @@ #include #include "cuda_profiler_api.h" +using std::min; +using std::max; + __global__ void warmup() { int index = blockDim.x * blockIdx.x + threadIdx.x; @@ -167,6 +170,112 @@ __global__ void compute_upwind_weight_internal(int num_faces, const double *phi, weight[index] = 
0.; } +__device__ int sign(double x) +{ + return (x >= 0) ? 1: -1; +} + +__device__ int pos0(double x) +{ + return (x >= 0) ? 1 : 0; +} + +__global__ void compute_limiter_phi_internal(int num_cells, int num_surfaces, const double *vf, + const int *lower_index, const int *upper_index, const double *mesh_distance, + const double *phi, const double *mesh_weights, const double *gradc, + double *limiter) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_surfaces) + return; + + int owner = lower_index[index]; + int neighbor = upper_index[index]; + double faceFlux = phi[index]; + double gradf = vf[neighbor] - vf[owner]; + double gradcf, r; + + // LimiterFunc::r + if (faceFlux > 0) { + gradcf = mesh_distance[index] * gradc[owner] + + mesh_distance[num_surfaces + index] * gradc[num_cells + owner] + + mesh_distance[num_surfaces * 2 + index] * gradc[num_cells * 2 + owner]; + } else { + gradcf = mesh_distance[index] * gradc[neighbor] + + mesh_distance[num_surfaces + index] * gradc[num_cells + neighbor] + + mesh_distance[num_surfaces * 2 + index] * gradc[num_cells * 2 + neighbor]; + } + if (fabs(gradcf) >= 1000 * fabs(gradf)) { + r = 2*1000*sign(gradcf)*sign(gradf) - 1; + } else { + r = 2 * (gradcf / gradf) - 1; + } + + limiter[index] = max(min(r, 1.), 0.); // now twoByk_ = 1, fvScheme: limitedLinear 1; +} + +__global__ void compute_limiter_phi_boundary(int num, int offset, int num_boundary_surfaces, + const double *boundary_weight, const double *boundary_vf, const double *boundary_gradc, + const double *boundary_distance, const double *boundary_phi, double *boundary_limiter) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num) + return; + + int neighbor_start_index = offset + index; + int internal_start_index = offset + index + num; + + double bouFaceFlux = boundary_phi[neighbor_start_index]; + double bouGradf = boundary_vf[internal_start_index] - boundary_vf[neighbor_start_index]; + double bouGradcf, r; + + + // LimiterFunc::r + 
if (bouFaceFlux > 0) { + bouGradcf = boundary_distance[neighbor_start_index] * boundary_gradc[internal_start_index] + + boundary_distance[num_boundary_surfaces + neighbor_start_index] * boundary_gradc[num_boundary_surfaces + internal_start_index] + + boundary_distance[num_boundary_surfaces * 2 + neighbor_start_index] * boundary_gradc[num_boundary_surfaces * 2 + internal_start_index]; + } else { + bouGradcf = boundary_distance[neighbor_start_index] * boundary_gradc[neighbor_start_index] + + boundary_distance[num_boundary_surfaces + neighbor_start_index] * boundary_gradc[num_boundary_surfaces + neighbor_start_index] + + boundary_distance[num_boundary_surfaces * 2 + neighbor_start_index] * boundary_gradc[num_boundary_surfaces * 2 + neighbor_start_index]; + } + if (fabs(bouGradcf) >= 1000 * fabs(bouGradf)) { + r = 2*1000*sign(bouGradcf)*sign(bouGradf) - 1; + } else { + r = 2 * (bouGradcf / bouGradf) - 1; + } + + boundary_limiter[neighbor_start_index] = max(min(r, 1.), 0.); // now twoByk_ = 1, fvScheme: limitedLinear 1; +} + +__global__ void compute_limiter_weight_internal(int num_cells, int num_surfaces, + const double *phi, const double *mesh_weights, const double *limiter_weights, double *output_weights) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_surfaces) + return; + + double limiterW = limiter_weights[index]; + output_weights[index] = limiterW * mesh_weights[index] + + (1. 
- limiterW) * pos0(phi[index]); +} + +__global__ void compute_limiter_weight_boundary(int num, int offset, int num_boundary_surfaces, + const double *boundary_weight, const double *boundary_phi, + const double *boundary_limiter_weights, double *boundary_output_weights) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num) + return; + + int neighbor_start_index = offset + index; + + double limiterW = boundary_limiter_weights[neighbor_start_index]; + boundary_output_weights[neighbor_start_index] = limiterW * boundary_weight[neighbor_start_index] + + (1. - limiterW) * pos0(boundary_phi[neighbor_start_index]); +} + __global__ void update_boundary_coeffs_zeroGradient_scalar(int num, int offset, double *value_internal_coeffs, double *value_boundary_coeffs, double *gradient_internal_coeffs, double *gradient_boundary_coeffs) @@ -2428,6 +2537,68 @@ void compute_upwind_weight(cudaStream_t stream, int num_surfaces, const double * compute_upwind_weight_internal<<>>(num_surfaces, phi, weight); } +void compute_limitedLinear_weight(cudaStream_t stream, ncclComm_t comm, const int *neighbor_peer, + int num_surfaces, int num_cells, int num_boundary_surfaces, + const int *lowerAddr, const int *upperAddr, const double *mesh_distance, + const double *weight, const double *Sf, const double *vf, const double *phi, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, const double *boundary_weight, + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, + const double *volume, const double *boundary_mag_Sf, const double *boundary_phi, + // const double *boundary_distance, double *boundary_output, + const int *cyclicNeighbor, const int *patchSizeOffset, + const double *boundary_deltaCoeffs) +{ + // calculate fvc::grad(vf) (now output stores fvc::grad(lPhi)) + // fvc_grad_cell_scalar_withBC(stream, comm, neighbor_peer, num_cells, num_surfaces, num_boundary_surfaces, + // lowerAddr, upperAddr, 
weight, Sf, vf, output, num_patches, patch_size, patch_type, boundary_weight, + // boundary_cell_face, boundary_vf, boundary_Sf, volume, boundary_mag_Sf, boundary_output, + // cyclicNeighbor, patchSizeOffset, boundary_deltaCoeffs); + fvc_grad_cell_scalar(stream, num_cells, num_surfaces, num_boundary_surfaces, lowerAddr, upperAddr, + weight, Sf, vf, output, num_patches, patch_size, patch_type, boundary_weight, + boundary_cell_face, boundary_vf, boundary_Sf, volume, true); + // calculate limiter (now output stores this->limiter(phi)) + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + compute_limiter_phi_internal<<>>(num_cells, num_surfaces, vf, + lowerAddr, upperAddr, mesh_distance, phi, weight, output, output); + + // int offset = 0; + // for (int i = 0; i < num_patches; i++) { + // if (patch_size[i] == 0) continue; + // if (patch_type[i] == boundaryConditions::processor + // || patch_type[i] == boundaryConditions::processorCyclic) { + // threads_per_block = 256; + // blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // compute_limiter_phi_boundary<<>>(patch_size[i], offset, + // num_boundary_surfaces, boundary_weight, boundary_vf, boundary_output, boundary_distance, + // boundary_phi, boundary_output); + // offset += 2 * patch_size[i]; + // } else { + // cudaMemset(boundary_output + offset, 1., patch_size[i] * sizeof(double)); + // offset += patch_size[i]; + // } + // } + // calculate weight + // threads_per_block = 1024; + // blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + // compute_limiter_weight_internal<<>>(num_cells, num_surfaces, phi, + // weight, output, output); + // offset = 0; + // for (int i = 0; i < num_patches; i++) { + // if (patch_size[i] == 0) continue; + // threads_per_block = 256; + // blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // 
compute_limiter_weight_boundary<<>>(patch_size[i], offset, + // num_boundary_surfaces, boundary_weight, boundary_phi, boundary_output, boundary_output); + // if (patch_type[i] == boundaryConditions::processor + // || patch_type[i] == boundaryConditions::processorCyclic) { + // offset += 2 * patch_size[i]; + // } else { + // offset += patch_size[i]; + // } + // } +} + void fvm_ddt_vol_scalar_vol_scalar(cudaStream_t stream, int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *volume, double *diag, double *source, double sign) diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu/dfRhoEqn.cu index 57d32c9a4..b9eeef762 100644 --- a/src_gpu/dfRhoEqn.cu +++ b/src_gpu/dfRhoEqn.cu @@ -86,7 +86,7 @@ void dfRhoEqn::process() #endif TICK_END_EVENT(rhoEqn post process free); TICK_START_EVENT; - checkCudaErrors(cudaMemcpyAsync(dataBase_.h_rho, dataBase_.d_rho, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + // checkCudaErrors(cudaMemcpyAsync(dataBase_.h_rho, dataBase_.d_rho, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); TICK_END_EVENT(rhoEqn post process copy back); sync(); }