diff --git a/src_gpu/dfChemistrySolver.H b/src_gpu/dfChemistrySolver.H index cbc144a60..96a16f521 100644 --- a/src_gpu/dfChemistrySolver.H +++ b/src_gpu/dfChemistrySolver.H @@ -13,9 +13,10 @@ private: double *Xmu_, *Xstd_, *Ymu_, *Ystd_; double *init_input_, *y_input_BCT; - int dim_input_, num_cells_, num_species_; + int dim_input_, num_cells_, num_species_, num_modules_; + int batch_size_; public: - dfChemistrySolver(int num_cells, int num_species); + dfChemistrySolver(int num_cells, int num_species, int batch_size); ~dfChemistrySolver(); void loadModels(const std::string dir); diff --git a/src_gpu/dfChemistrySolver.cu b/src_gpu/dfChemistrySolver.cu index fd50e811a..935bbb3f9 100644 --- a/src_gpu/dfChemistrySolver.cu +++ b/src_gpu/dfChemistrySolver.cu @@ -29,7 +29,7 @@ __global__ void normalize_input(int num_cells, int dim, const double *input, } } -__global__ void calculate_y_new(int num_cells, int num_species, const double *output_init, +__global__ void calculate_y_new(int num_cells, int num_modules, const double *output_init, const double *y_input_BCT, const double *Ymu, const double *Ystd, double *output) { int index = blockDim.x * blockIdx.x + threadIdx.x; @@ -37,7 +37,7 @@ __global__ void calculate_y_new(int num_cells, int num_species, const double *ou return; double RR_tmp; - for (int i = 0; i < num_species; ++i) { + for (int i = 0; i < num_modules; ++i) { RR_tmp = output_init[i * num_cells + index] * Ystd[i] + Ymu[i] + y_input_BCT[i * num_cells + index]; RR_tmp = pow((RR_tmp * 0.1 + 1), 10); // rev-BCT: lambda = 0.1 output[i * num_cells + index] = RR_tmp; @@ -62,17 +62,18 @@ __global__ void calculate_RR(int num_cells, int num_species, double delta_t, } } -dfChemistrySolver::dfChemistrySolver(int num_cells, int num_species) - : device_(torch::kCUDA), num_cells_(num_cells), num_species_(num_species) +dfChemistrySolver::dfChemistrySolver(int num_cells, int num_species, int batch_size) + : device_(torch::kCUDA), num_cells_(num_cells), num_species_(num_species), batch_size_(batch_size) { dim_input_ = num_species + 2; // p, T, y + num_modules_ = num_species_ - 1; cudaMalloc(&init_input_, sizeof(double) * num_cells * dim_input_); cudaMalloc(&y_input_BCT, sizeof(double) * num_cells * num_species_); cudaMalloc(&Xmu_, sizeof(double) * dim_input_); cudaMalloc(&Xstd_, sizeof(double) * dim_input_); - cudaMalloc(&Ymu_, sizeof(double) * num_species_); - cudaMalloc(&Ystd_, sizeof(double) * num_species_); - modules_.reserve(num_species_); + cudaMalloc(&Ymu_, sizeof(double) * num_modules_); + cudaMalloc(&Ystd_, sizeof(double) * num_modules_); + modules_.reserve(num_modules_); // now norm paras are set in constructor manually at::TensorOptions opts = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA); @@ -84,13 +85,28 @@ dfChemistrySolver::dfChemistrySolver(int num_cells, int num_species) 7.9704668543e-01, 5.1327160125e-01, 4.6827591193e-03}; std::vector Ymu_vec = {0.0085609265, -0.0082998877, 0.0030108739, -0.0067360325, 0.0037464590, 0.0024258509}; - std::vector Ystd_vec = {0.0085609265, -0.0082998877, 0.0030108739, - -0.0067360325, 0.0037464590, 0.0024258509}; + std::vector Ystd_vec = {0.05887569, 0.12253898, 0.01643904, + 0.0963155, 0.07127831, 0.04373392}; cudaMemcpy(Xmu_, Xmu_vec.data(), sizeof(double) * dim_input_, cudaMemcpyHostToDevice); cudaMemcpy(Xstd_, Xstd_vec.data(), sizeof(double) * dim_input_, cudaMemcpyHostToDevice); - cudaMemcpy(Ymu_, Ymu_vec.data(), sizeof(double) * num_species_, cudaMemcpyHostToDevice); - cudaMemcpy(Ystd_, Ystd_vec.data(), sizeof(double) * num_species_, cudaMemcpyHostToDevice); + cudaMemcpy(Ymu_, Ymu_vec.data(), sizeof(double) * num_modules_, cudaMemcpyHostToDevice); + cudaMemcpy(Ystd_, Ystd_vec.data(), sizeof(double) * num_modules_, cudaMemcpyHostToDevice); + + // input modules + std::string prefix = "new_Temporary_Chemical_"; + std::string suffix = ".pt"; + for (int i = 0; i < num_modules_; ++i) { + std::string model_path = prefix + std::to_string(i) + suffix; + try { + modules_.push_back(torch::jit::load(model_path)); + } + catch (const c10::Error& e) { + std::cerr << "error loading the model\n"; + exit(-1); + } + modules_[i].to(device_); + } } dfChemistrySolver::~dfChemistrySolver() { @@ -106,23 +122,24 @@ void dfChemistrySolver::Inference(const double *T, const double *p, const double normalize_input<<>>(num_cells_, dim_input_, init_input_, Xmu_, Xstd_, init_input_); // inference by torch - at::Tensor torch_input = torch::from_blob(init_input_, {num_cells_, dim_input_}, device_); - torch_input = torch_input.to(at::kFloat); - std::vector INPUTS; - INPUTS.push_back(torch_input); + double *d_output; + for (int sample_start = 0; sample_start < num_cells_; sample_start += batch_size_) { + int sample_end = std::min(sample_start + batch_size_, num_cells_); + int sample_len = sample_end - sample_start; + at::Tensor torch_input = torch::from_blob(init_input_ + sample_start * dim_input_, {sample_len, dim_input_}, torch::TensorOptions().device(device_).dtype(torch::kDouble)); + torch_input = torch_input.to(at::kFloat); + std::vector INPUTS; + INPUTS.push_back(torch_input); + std::vector output(num_modules_); - std::vector output(num_species_); - for (int i = 0; i < num_species_; ++i) { - output[i] = modules_[i].forward(INPUTS).toTensor(); - output[i] = output[i].to(at::kDouble); + for (int i = 0; i < num_modules_; ++i) { + output[i] = modules_[i].forward(INPUTS).toTensor(); + output[i] = output[i].to(at::kDouble); + d_output = output[i].data_ptr(); + cudaMemcpy(RR + (i * num_cells_ + sample_start), d_output, sizeof(double) * sample_len, cudaMemcpyDeviceToDevice); + } } - // post process - double *d_output; - for (int i = 0; i < num_species_; ++i) { - d_output = output[i].data_ptr(); - cudaMemcpy(RR + i * num_cells_, d_output, sizeof(double) * num_cells_, cudaMemcpyDeviceToDevice); - } - calculate_y_new<<>>(num_cells_, num_species_, RR, y_input_BCT, Ymu_, Ystd_, RR); + calculate_y_new<<>>(num_cells_, num_modules_, RR, y_input_BCT, Ymu_, Ystd_, RR); calculate_RR<<>>(num_cells_, num_species_, 1e-6, rho, y, RR, RR); } \ No newline at end of file