Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions src_gpu/dfChemistrySolver.H
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@ private:
double *Xmu_, *Xstd_, *Ymu_, *Ystd_;

double *init_input_, *y_input_BCT;
int dim_input_, num_cells_, num_species_;
int dim_input_, num_cells_, num_species_, num_modules_;
int batch_size_;
public:
dfChemistrySolver(int num_cells, int num_species);
dfChemistrySolver(int num_cells, int num_species, int batch_size);
~dfChemistrySolver();

void loadModels(const std::string dir);
Expand Down
69 changes: 43 additions & 26 deletions src_gpu/dfChemistrySolver.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,15 @@ __global__ void normalize_input(int num_cells, int dim, const double *input,
}
}

__global__ void calculate_y_new(int num_cells, int num_species, const double *output_init,
__global__ void calculate_y_new(int num_cells, int num_modules, const double *output_init,
const double *y_input_BCT, const double *Ymu, const double *Ystd, double *output)
{
int index = blockDim.x * blockIdx.x + threadIdx.x;
if (index >= num_cells)
return;

double RR_tmp;
for (int i = 0; i < num_species; ++i) {
for (int i = 0; i < num_modules; ++i) {
RR_tmp = output_init[i * num_cells + index] * Ystd[i] + Ymu[i] + y_input_BCT[i * num_cells + index];
RR_tmp = pow((RR_tmp * 0.1 + 1), 10); // rev-BCT: lambda = 0.1
output[i * num_cells + index] = RR_tmp;
Expand All @@ -62,17 +62,18 @@ __global__ void calculate_RR(int num_cells, int num_species, double delta_t,
}
}

dfChemistrySolver::dfChemistrySolver(int num_cells, int num_species)
: device_(torch::kCUDA), num_cells_(num_cells), num_species_(num_species)
dfChemistrySolver::dfChemistrySolver(int num_cells, int num_species, int batch_size)
: device_(torch::kCUDA), num_cells_(num_cells), num_species_(num_species), batch_size_(batch_size)
{
dim_input_ = num_species + 2; // p, T, y
num_modules_ = num_species_ - 1;
cudaMalloc(&init_input_, sizeof(double) * num_cells * dim_input_);
cudaMalloc(&y_input_BCT, sizeof(double) * num_cells * num_species_);
cudaMalloc(&Xmu_, sizeof(double) * dim_input_);
cudaMalloc(&Xstd_, sizeof(double) * dim_input_);
cudaMalloc(&Ymu_, sizeof(double) * num_species_);
cudaMalloc(&Ystd_, sizeof(double) * num_species_);
modules_.reserve(num_species_);
cudaMalloc(&Ymu_, sizeof(double) * num_modules_);
cudaMalloc(&Ystd_, sizeof(double) * num_modules_);
modules_.reserve(num_modules_);

// now norm paras are set in constructor manually
at::TensorOptions opts = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA);
Expand All @@ -84,13 +85,28 @@ dfChemistrySolver::dfChemistrySolver(int num_cells, int num_species)
7.9704668543e-01, 5.1327160125e-01, 4.6827591193e-03};
std::vector<double> Ymu_vec = {0.0085609265, -0.0082998877, 0.0030108739,
-0.0067360325, 0.0037464590, 0.0024258509};
std::vector<double> Ystd_vec = {0.0085609265, -0.0082998877, 0.0030108739,
-0.0067360325, 0.0037464590, 0.0024258509};
std::vector<double> Ystd_vec = {0.05887569, 0.12253898, 0.01643904,
0.0963155, 0.07127831, 0.04373392};

cudaMemcpy(Xmu_, Xmu_vec.data(), sizeof(double) * dim_input_, cudaMemcpyHostToDevice);
cudaMemcpy(Xstd_, Xstd_vec.data(), sizeof(double) * dim_input_, cudaMemcpyHostToDevice);
cudaMemcpy(Ymu_, Ymu_vec.data(), sizeof(double) * num_species_, cudaMemcpyHostToDevice);
cudaMemcpy(Ystd_, Ystd_vec.data(), sizeof(double) * num_species_, cudaMemcpyHostToDevice);
cudaMemcpy(Ymu_, Ymu_vec.data(), sizeof(double) * num_modules_, cudaMemcpyHostToDevice);
cudaMemcpy(Ystd_, Ystd_vec.data(), sizeof(double) * num_modules_, cudaMemcpyHostToDevice);

// input modules
std::string prefix = "new_Temporary_Chemical_";
std::string suffix = ".pt";
for (int i = 0; i < num_modules_; ++i) {
std::string model_path = prefix + std::to_string(i) + suffix;
try {
modules_.push_back(torch::jit::load(model_path));
}
catch (const c10::Error& e) {
std::cerr << "error loading the model\n";
exit(-1);
}
modules_[i].to(device_);
}
}

dfChemistrySolver::~dfChemistrySolver() {
Expand All @@ -106,23 +122,24 @@ void dfChemistrySolver::Inference(const double *T, const double *p, const double
normalize_input<<<blocks_per_grid, threads_per_block>>>(num_cells_, dim_input_, init_input_, Xmu_, Xstd_, init_input_);

// inference by torch
at::Tensor torch_input = torch::from_blob(init_input_, {num_cells_, dim_input_}, device_);
torch_input = torch_input.to(at::kFloat);
std::vector<torch::jit::IValue> INPUTS;
INPUTS.push_back(torch_input);
double *d_output;
for (int sample_start = 0; sample_start < num_cells_; sample_start += batch_size_) {
int sample_end = std::min(sample_start + batch_size_, num_cells_);
int sample_len = sample_end - sample_start;
at::Tensor torch_input = torch::from_blob(init_input_ + sample_start * dim_input_, {sample_len, dim_input_}, torch::TensorOptions().device(device_).dtype(torch::kDouble));
torch_input = torch_input.to(at::kFloat);
std::vector<torch::jit::IValue> INPUTS;
INPUTS.push_back(torch_input);
std::vector<at::Tensor> output(num_modules_);

std::vector<at::Tensor> output(num_species_);
for (int i = 0; i < num_species_; ++i) {
output[i] = modules_[i].forward(INPUTS).toTensor();
output[i] = output[i].to(at::kDouble);
for (int i = 0; i < num_modules_; ++i) {
output[i] = modules_[i].forward(INPUTS).toTensor();
output[i] = output[i].to(at::kDouble);
d_output = output[i].data_ptr<double>();
cudaMemcpy(RR + (i * num_cells_ + sample_start), d_output, sizeof(double) * sample_len, cudaMemcpyDeviceToDevice);
}
}

// post process
double *d_output;
for (int i = 0; i < num_species_; ++i) {
d_output = output[i].data_ptr<double>();
cudaMemcpy(RR + i * num_cells_, d_output, sizeof(double) * num_cells_, cudaMemcpyDeviceToDevice);
}
calculate_y_new<<<blocks_per_grid, threads_per_block>>>(num_cells_, num_species_, RR, y_input_BCT, Ymu_, Ystd_, RR);
calculate_y_new<<<blocks_per_grid, threads_per_block>>>(num_cells_, num_modules_, RR, y_input_BCT, Ymu_, Ystd_, RR);
calculate_RR<<<blocks_per_grid, threads_per_block>>>(num_cells_, num_species_, 1e-6, rho, y, RR, RR);
}