From f78f459f4ef06f3e0ee56b5fed8c7dcb5ddad613 Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Mon, 30 Oct 2023 15:44:40 +0800 Subject: [PATCH 1/5] run pass turbulence k & epsilon --- src_gpu/dfUEqn.cu | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 325589f1f..f0253e62a 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -311,7 +311,9 @@ __global__ void ueqn_calculate_turbulence_k_Smagorinsky(int num_cells, return; double vol = volume[index]; - double del = pow(vol, 1/3); + double oneThird = (1. / 3.); + + double del = pow(vol, oneThird); // D = 0.5*(T+T^T) double D_xx = grad_U_tsr[num_cells * 0 + index]; @@ -323,19 +325,20 @@ __global__ void ueqn_calculate_turbulence_k_Smagorinsky(int num_cells, // dev(D) double trace = D_xx + D_yy + D_zz; - double dev_D_xx = D_xx - (1. / 3.) * trace; - double dev_D_yy = D_yy - (1. / 3.) * trace; - double dev_D_zz = D_zz - (1. / 3.) * trace; + double dev_D_xx = D_xx - oneThird * trace; + double dev_D_yy = D_yy - oneThird * trace; + double dev_D_zz = D_zz - oneThird * trace; // scalar a - double a = Ce * del; + double a = Ce / del; // scalar b - double b = 1.5 * trace; + double b = 2 * oneThird * trace; // scalar c double c = 2 * Ck * del * (dev_D_xx * D_xx + dev_D_yy * D_yy + dev_D_zz * D_zz + D_xy * D_xy * 2 + D_xz * D_xz * 2 + D_yz * D_yz * 2); - output[index] = pow((-b + pow(b * b + 4 * a * c, 0.5)) / (2 * a), 2); + double sqrt_result = (-b + pow(b * b + 4 * a * c, 0.5)) / (2 * a); + output[index] = sqrt_result * sqrt_result; delta[index] = del; } @@ -566,9 +569,14 @@ void dfUEqn::process() { dataBase_.d_boundary_face_cell, dataBase_.d_boundary_u, dataBase_.d_boundary_sf, dataBase_.d_boundary_weight, dataBase_.d_volume, dataBase_.d_boundary_mag_sf, d_boundary_grad_u, dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_delta_coeffs); - // calculate k & epsilon (if use turbulence model) + + // **if use 
turbulence model** + // calculate k & epsilon getTurbulenceKEpsilon_Smagorinsky(dataBase_.stream, dataBase_.num_cells, dataBase_.num_boundary_surfaces, d_grad_u, dataBase_.d_volume, d_delta, dataBase_.d_turbulence_k, dataBase_.d_turbulence_epsilon); + // calculate nut + // **end use turbulence model** + scale_dev2T_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.d_mu, d_grad_u, // end for internal dataBase_.num_boundary_surfaces, dataBase_.d_boundary_mu, d_boundary_grad_u); fvc_div_cell_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, From 03c73389d5e94bd24b4120e58c3d33543f12f978 Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Tue, 7 Nov 2023 23:23:40 +0800 Subject: [PATCH 2/5] support dynimic NN input size --- src_gpu/dfChemistrySolver.H | 16 +- src_gpu/dfChemistrySolver.cu | 157 ++++++++----- src_gpu/dfChemistrySolver/Layer.H | 88 ++++++++ src_gpu/dfChemistrySolver/Layer.cu | 212 ++++++++++++++++++ .../dfChemistrySolver_cublas.H | 25 +++ .../dfChemistrySolver_cublas.cu | 147 ++++++++++++ 6 files changed, 591 insertions(+), 54 deletions(-) create mode 100644 src_gpu/dfChemistrySolver/Layer.H create mode 100644 src_gpu/dfChemistrySolver/Layer.cu create mode 100644 src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.H create mode 100644 src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.cu diff --git a/src_gpu/dfChemistrySolver.H b/src_gpu/dfChemistrySolver.H index 96a16f521..5a5201399 100644 --- a/src_gpu/dfChemistrySolver.H +++ b/src_gpu/dfChemistrySolver.H @@ -4,23 +4,33 @@ #include #include #include +#include "dfMatrixDataBase.H" class dfChemistrySolver { private: + dfMatrixDataBase &dataBase_; + cudaStream_t stream; std::vector modules_; torch::Device device_; double *Xmu_, *Xstd_, *Ymu_, *Ystd_; - double *init_input_, *y_input_BCT; + double *init_input_, *y_input_BCT, *NN_output_; + int *d_reactCellIndex; int dim_input_, num_cells_, num_species_, num_modules_; int batch_size_; + double unReactT_; + 
int inputsize_; public: - dfChemistrySolver(int num_cells, int num_species, int batch_size); + dfChemistrySolver(dfMatrixDataBase &dataBase) + : device_(torch::kCUDA), dataBase_(dataBase) {}; ~dfChemistrySolver(); + void setConstantValue(int num_cells, int num_species, int batch_size); void loadModels(const std::string dir); void loadNormalization(const std::string dir); - void Inference(const double *T, const double *p, const double *y, + void Inference(const double *h_T, const double *d_T,const double *p, const double *y, const double *rho, double *RR); + + void sync(); }; \ No newline at end of file diff --git a/src_gpu/dfChemistrySolver.cu b/src_gpu/dfChemistrySolver.cu index 935bbb3f9..9c4291064 100644 --- a/src_gpu/dfChemistrySolver.cu +++ b/src_gpu/dfChemistrySolver.cu @@ -1,27 +1,31 @@ #include "dfChemistrySolver.H" +#include "dfMatrixOpBase.H" -__global__ void construct_init_input(int num_cells, int dim, const double *T, const double *p, - const double *y, double *y_input_BCT, double *output) +__global__ void construct_init_input(int num_thread, int num_cells, int dim, const double *T, const double *p, + const int *reactCellIndex, const double *y, double *y_input_BCT, double *output) { int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) + if (index >= num_thread) return; - output[index * dim] = T[index]; - output[index * dim + 1] = p[index]; + int cellIndex = reactCellIndex[index]; + + output[index * dim] = T[cellIndex]; + // output[index * dim + 1] = p[cellIndex]; + output[index * dim + 1] = 101325.; double y_BCT; for (int i = 0; i < dim - 2; ++i) { - y_BCT = (pow(y[i * num_cells + index], 0.1) - 1) * 10; // BCT: lambda = 0.1 + y_BCT = (pow(y[i * num_cells + cellIndex], 0.1) - 1) * 10; // BCT: lambda = 0.1 output[index * dim + 2 + i] = y_BCT; - y_input_BCT[i * num_cells + index] = y_BCT; + y_input_BCT[i * num_thread + index] = y_BCT; } } -__global__ void normalize_input(int num_cells, int dim, const double *input, +__global__ void 
normalize_input(int num_thread, int num_cells, int dim, const double *input, const double *Xmu, const double *Xstd, double *output) { int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) + if (index >= num_thread) return; for (int i = 0; i < dim; ++i) { @@ -29,46 +33,57 @@ __global__ void normalize_input(int num_cells, int dim, const double *input, } } -__global__ void calculate_y_new(int num_cells, int num_modules, const double *output_init, +__global__ void calculate_y_new(int num_thread, int num_modules, const double *output_init, const double *y_input_BCT, const double *Ymu, const double *Ystd, double *output) { int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) + if (index >= num_thread) return; - + double RR_tmp; for (int i = 0; i < num_modules; ++i) { - RR_tmp = output_init[i * num_cells + index] * Ystd[i] + Ymu[i] + y_input_BCT[i * num_cells + index]; + RR_tmp = output_init[i * num_thread + index] * Ystd[i] + Ymu[i] + y_input_BCT[i * num_thread + index]; RR_tmp = pow((RR_tmp * 0.1 + 1), 10); // rev-BCT: lambda = 0.1 - output[i * num_cells + index] = RR_tmp; + output[i * num_thread + index] = RR_tmp; } } -__global__ void calculate_RR(int num_cells, int num_species, double delta_t, - const double *rho, const double *y_old, double *y_new, double *RR) +__global__ void calculate_RR(int num_thread, int num_cells, int num_species, double delta_t, + const int *reactCellIndex, const double *rho, const double *y_old, const double *p, + double *y_NN, double *RR) { int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) + if (index >= num_thread) return; + + int cellIndex = reactCellIndex[index]; // normalize double y_ave = 0.; - for (int i = 0; i < num_species; ++i) { - y_ave += y_new[i * num_cells + index]; + for (int i = 0; i < num_species - 1; ++i) { + y_ave += y_NN[i * num_thread + index]; } - for (int i = 0; i < num_species; ++i) { - y_new[i * num_cells + index] = y_new[i * num_cells + index] 
/ y_ave; - RR[i * num_cells + index] = (y_new[i * num_cells + index] - y_old[i * num_cells + index]) * rho[index] / delta_t; + y_ave += y_old[(num_species - 1) * num_cells + cellIndex]; + for (int i = 0; i < num_species - 1; ++i) { + y_NN[i * num_thread + index] = y_NN[i * num_thread + index] / y_ave; + RR[i * num_cells + cellIndex] = (y_NN[i * num_thread + index] - y_old[i * num_cells + cellIndex]) * rho[cellIndex] + * (p[cellIndex] / 101325.) / delta_t; // correction } } -dfChemistrySolver::dfChemistrySolver(int num_cells, int num_species, int batch_size) - : device_(torch::kCUDA), num_cells_(num_cells), num_species_(num_species), batch_size_(batch_size) -{ +dfChemistrySolver::~dfChemistrySolver() { + cudaFree(init_input_); +} + +void dfChemistrySolver::setConstantValue(int num_cells, int num_species, int batch_size) { + this->num_cells_ = num_cells; + this->num_species_ = num_species; + this->batch_size_ = batch_size; + this->stream = dataBase_.stream; + dim_input_ = num_species + 2; // p, T, y num_modules_ = num_species_ - 1; - cudaMalloc(&init_input_, sizeof(double) * num_cells * dim_input_); - cudaMalloc(&y_input_BCT, sizeof(double) * num_cells * num_species_); + unReactT_ = 610; cudaMalloc(&Xmu_, sizeof(double) * dim_input_); cudaMalloc(&Xstd_, sizeof(double) * dim_input_); cudaMalloc(&Ymu_, sizeof(double) * num_modules_); @@ -77,16 +92,17 @@ dfChemistrySolver::dfChemistrySolver(int num_cells, int num_species, int batch_s // now norm paras are set in constructor manually at::TensorOptions opts = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA); - std::vector Xmu_vec = {2.1996457626e+03, 1.0281762632e+05, -5.2533840071e+00, - -4.5623405933e+00, -1.5305375824e+00, -3.7453838144e+00, - -3.3097212360e+00, -4.3339657954e+00, -2.9028421546e-01}; - std::vector Xstd_vec = {3.8304388695e+02, 7.1905832810e+02, 4.6788389177e-01, - 5.3034657693e-01, 5.0325864222e-01, 5.1058993718e-01, - 7.9704668543e-01, 5.1327160125e-01, 4.6827591193e-03}; - std::vector Ymu_vec 
= {0.0085609265, -0.0082998877, 0.0030108739, - -0.0067360325, 0.0037464590, 0.0024258509}; - std::vector Ystd_vec = {0.05887569, 0.12253898, 0.01643904, - 0.0963155, 0.07127831, 0.04373392}; + std::vector Xmu_vec = {1.2996375154e+03, 1.4349643303e+05, -4.3678815323e+00, + -5.8949183472e+00, -3.8840763486e+00, -5.5436246211e+00, + -6.0178199636e+00, -2.1469850084e+00, -6.9828365432e+00, + -7.7747568654e+00, -1.8571483828e-01}; + std::vector Xstd_vec = {3.9612732767e+02, 1.8822821412e+04, 1.1226048640e+00, 6.8397462420e-01, + 1.8879462146e+00, 1.2433158499e+00, 1.3169176600e+00, 4.3600457243e-01, + 8.1820904505e-01, 8.0471805333e-01, 6.1020187522e-02}; + std::vector Ymu_vec = {-0.0101101322, -0.0138129078, -0.0146349442, -0.0088870325, + -0.0075195178, 0.0020506931, -0.0103104668, -0.0192603020}; + std::vector Ystd_vec = {0.0297933161, 0.0802139099, 0.0230954310, 0.1541940427, + 0.1316836678, 0.0042975580, 0.1476416977, 0.0860471308}; cudaMemcpy(Xmu_, Xmu_vec.data(), sizeof(double) * dim_input_, cudaMemcpyHostToDevice); cudaMemcpy(Xstd_, Xstd_vec.data(), sizeof(double) * dim_input_, cudaMemcpyHostToDevice); @@ -109,24 +125,44 @@ dfChemistrySolver::dfChemistrySolver(int num_cells, int num_species, int batch_s } } -dfChemistrySolver::~dfChemistrySolver() { - cudaFree(init_input_); -} - -void dfChemistrySolver::Inference(const double *T, const double *p, const double *y, +void dfChemistrySolver::Inference(const double *h_T, const double *d_T,const double *p, const double *y, const double *rho, double *RR) { // construct input + inputsize_ = 0; + std::vector reactCellIndex; + for (int i = 0; i < num_cells_; i++) { + if (h_T[i] >= unReactT_) { + reactCellIndex.push_back(i); + } + } + inputsize_ = reactCellIndex.size(); +#ifdef STREAM_ALLOCATOR + checkCudaErrors(cudaMallocAsync((void**)&init_input_, sizeof(double) * inputsize_ * dim_input_, stream)); + checkCudaErrors(cudaMallocAsync((void**)&y_input_BCT, sizeof(double) * inputsize_ * num_species_, stream)); + 
checkCudaErrors(cudaMallocAsync((void**)&NN_output_, sizeof(double) * inputsize_ * num_species_, stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_reactCellIndex, sizeof(int) * inputsize_, stream)); + checkCudaErrors(cudaMemcpyAsync(d_reactCellIndex, reactCellIndex.data(), sizeof(int) * inputsize_, cudaMemcpyHostToDevice, stream)); +#else + cudaMalloc(&init_input_, sizeof(double) * inputsize_ * dim_input_); + cudaMalloc(&y_input_BCT, sizeof(double) * inputsize_ * num_species_); + cudaMalloc(&d_reactCellIndex, sizeof(int) * inputsize_); + cudaMemcpy(d_reactCellIndex, reactCellIndex.data(), sizeof(int) * inputsize_, cudaMemcpyHostToDevice); +#endif + // construct input size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_cells_ + threads_per_block - 1) / threads_per_block; - construct_init_input<<>>(num_cells_, dim_input_, T, p, y, y_input_BCT, init_input_); - normalize_input<<>>(num_cells_, dim_input_, init_input_, Xmu_, Xstd_, init_input_); + size_t blocks_per_grid = (inputsize_ + threads_per_block - 1) / threads_per_block; + construct_init_input<<>>(inputsize_, num_cells_, dim_input_, d_T, p, + d_reactCellIndex, y, y_input_BCT, init_input_); + normalize_input<<>>(inputsize_, num_cells_, dim_input_, init_input_, + Xmu_, Xstd_, init_input_); // inference by torch double *d_output; - for (int sample_start = 0; sample_start < num_cells_; sample_start += batch_size_) { - int sample_end = std::min(sample_start + batch_size_, num_cells_); + for (int sample_start = 0; sample_start < inputsize_; sample_start += batch_size_) { + int sample_end = std::min(sample_start + batch_size_, inputsize_); int sample_len = sample_end - sample_start; - at::Tensor torch_input = torch::from_blob(init_input_ + sample_start * dim_input_, {sample_len, dim_input_}, torch::TensorOptions().device(device_).dtype(torch::kDouble)); + at::Tensor torch_input = torch::from_blob(init_input_ + sample_start * dim_input_, {sample_len, dim_input_}, + 
torch::TensorOptions().device(device_).dtype(torch::kDouble)); torch_input = torch_input.to(at::kFloat); std::vector INPUTS; INPUTS.push_back(torch_input); @@ -136,10 +172,29 @@ void dfChemistrySolver::Inference(const double *T, const double *p, const double output[i] = modules_[i].forward(INPUTS).toTensor(); output[i] = output[i].to(at::kDouble); d_output = output[i].data_ptr(); - cudaMemcpy(RR + (i * num_cells_ + sample_start), d_output, sizeof(double) * sample_len, cudaMemcpyDeviceToDevice); + cudaMemcpy(NN_output_ + (i * inputsize_ + sample_start), d_output, sizeof(double) * sample_len, cudaMemcpyDeviceToDevice); } } - calculate_y_new<<>>(num_cells_, num_modules_, RR, y_input_BCT, Ymu_, Ystd_, RR); - calculate_RR<<>>(num_cells_, num_species_, 1e-6, rho, y, RR, RR); + calculate_y_new<<>>(inputsize_, num_modules_, NN_output_, + y_input_BCT, Ymu_, Ystd_, NN_output_); + calculate_RR<<>>(inputsize_, num_cells_, num_species_, 1e-6, + d_reactCellIndex, rho, y, p, NN_output_, RR); + +#ifdef STREAM_ALLOCATOR + checkCudaErrors(cudaFreeAsync(init_input_, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(y_input_BCT, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(NN_output_, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_reactCellIndex, dataBase_.stream)); +#else + cudaFree(init_input_); + cudaFree(y_input_BCT); + cudaFree(NN_output_); + cudaFree(d_reactCellIndex); +#endif + +} + +void dfChemistrySolver::sync() { + checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); } \ No newline at end of file diff --git a/src_gpu/dfChemistrySolver/Layer.H b/src_gpu/dfChemistrySolver/Layer.H new file mode 100644 index 000000000..e1a90e921 --- /dev/null +++ b/src_gpu/dfChemistrySolver/Layer.H @@ -0,0 +1,88 @@ +#pragma once +#include +#include +#include +#include +#include + +template +class Tensor{ +private: + std::vector shape_; + int64_t element_num_; + DataType* data_; + bool owner_; +public: + Tensor(std::vector shape): + shape_(shape), + 
element_num_(std::accumulate(shape_.begin(),shape_.end(),1,std::multiplies())), + data_(new DataType[element_num_]), + owner_(true){}; + + Tensor(std::vector shape, DataType* data): + shape_(shape), + element_num_(std::accumulate(shape_.begin(),shape_.end(),1,std::multiplies())), + data_(data), + owner_(false){}; + ~Tensor(){ + if(owner_) delete[] data_; + } + int64_t dim_num() const {return shape_.size();}; + int64_t element_num() const {return element_num_;}; + int64_t bytes_num() const {return element_num_ * sizeof(DataType);}; + int64_t dim(int64_t i) const {return shape_[i];}; + const DataType* data() const {return data_;}; + DataType* data() {return data_;}; +}; + +template +class Layer{ +public: + virtual ~Layer() {}; + virtual void forward(const Tensor& input, Tensor& output) = 0; + virtual void load_parameters(const std::string& dir, int64_t layer_id) = 0; +}; + +template +class Linear : public Layer{ +private: + int64_t in_features_; + int64_t out_features_; + // (in_features, out_features) + Tensor weights_; + // (out_features,) + Tensor bias_; + +public: + Linear(int64_t in_features,int64_t out_features): + in_features_(in_features), + out_features_(out_features), + weights_({in_features_,out_features_}), + bias_({out_features_}){}; + virtual ~Linear(){}; + + virtual void forward(const Tensor& input, Tensor& output); + virtual void load_parameters(const std::string& dir, int64_t layer_id); +}; + +template +class LinearGELU : public Layer{ +private: + int64_t in_features_; + int64_t out_features_; + // (in_features, out_features) + Tensor weights_; + // (out_features,) + Tensor bias_; + +public: + LinearGELU(int64_t in_features,int64_t out_features): + in_features_(in_features), + out_features_(out_features), + weights_({in_features_,out_features_}), + bias_({out_features_}){}; + virtual ~LinearGELU(){}; + + virtual void forward(const Tensor& input, Tensor& output); + virtual void load_parameters(const std::string& dir, int64_t layer_id); +}; \ No 
newline at end of file diff --git a/src_gpu/dfChemistrySolver/Layer.cu b/src_gpu/dfChemistrySolver/Layer.cu new file mode 100644 index 000000000..23910b517 --- /dev/null +++ b/src_gpu/dfChemistrySolver/Layer.cu @@ -0,0 +1,212 @@ +#include "Layer.H" +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern void sgemm_(char *transa, char *transb, int *m, int *n, int *k, float *alpha, float *a, int *lda, float *b, int *ldb, float *beta, float *c, int *ldc); + +#ifdef __cplusplus +} +#endif + +template<> +void Linear::forward(const Tensor& input, Tensor& output){ + assert(input.dim_num() == 2); + assert(output.dim_num() == 2); + assert(input.dim(0) == output.dim(0)); + assert(input.dim(1) == in_features_); + assert(out_features_ == output.dim(1)); + + char transA = 'N'; + char transB = 'N'; + float alpha = 1.f; + float beta = 1.f; + int m = out_features_; + int n = input.dim(0); + int k = in_features_; + float* A = weights_.data(); + int lda = out_features_; + float* B = const_cast(input.data()); + int ldb = input.dim(1); + float* C = output.data(); + int ldc = output.dim(1); + + for(int i = 0; i < input.dim(0); ++i) + std::copy(bias_.data(), bias_.data() + out_features_, C + i * out_features_); + + sgemm_(&transA, &transB, &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); +} + +template<> +void Linear::load_parameters(const std::string& dir, int64_t layer_id){ + int flag_mpi_init; + MPI_Initialized(&flag_mpi_init); + if(!flag_mpi_init){ + std::cerr << "DNNInferencer_blas::load_models : MPI is not initialized" << std::endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + int mpirank; + int mpisize; + MPI_Comm_rank(MPI_COMM_WORLD, &mpirank); + MPI_Comm_size(MPI_COMM_WORLD, &mpisize); + + if(mpirank == 0){ + std::stringstream ss1,ss2; + ss1 << dir << "/" << "linear_" << layer_id << "_weights_rowmajor_" << in_features_ << "_" << out_features_ << ".data"; + std::string weights_path = ss1.str(); + ss2 << dir << 
"/" << "linear_" << layer_id << "_bias_" << out_features_ << ".data"; + std::string bias_path = ss2.str(); + // weights + std::ifstream weights_file(weights_path, std::ios::binary); + if(!weights_file.is_open()){ + std::cerr << "open weights file error : " << weights_path << std::endl << std::flush; + abort(); + } + weights_file.read(reinterpret_cast(weights_.data()), weights_.bytes_num()); + weights_file.close(); + // bias + std::ifstream bias_file(bias_path, std::ios::binary); + if(!bias_file.is_open()){ + std::cerr << "open bias file error : " << bias_path << std::endl << std::flush; + abort(); + } + bias_file.read(reinterpret_cast(bias_.data()), bias_.bytes_num()); + bias_file.close(); + } + + MPI_Bcast(weights_.data(), weights_.element_num(), MPI_FLOAT, 0, MPI_COMM_WORLD); + MPI_Bcast(bias_.data(), bias_.element_num(), MPI_FLOAT, 0, MPI_COMM_WORLD); +} + +template<> +void LinearGELU::load_parameters(const std::string& dir, int64_t layer_id){ + int flag_mpi_init; + MPI_Initialized(&flag_mpi_init); + if(!flag_mpi_init){ + std::cerr << "DNNInferencer_blas::load_models : MPI is not initialized" << std::endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + int mpirank; + int mpisize; + MPI_Comm_rank(MPI_COMM_WORLD, &mpirank); + MPI_Comm_size(MPI_COMM_WORLD, &mpisize); + + if(mpirank == 0){ + std::stringstream ss1,ss2; + ss1 << dir << "/" << "linear_" << layer_id << "_weights_rowmajor_" << in_features_ << "_" << out_features_ << ".data"; + std::string weights_path = ss1.str(); + ss2 << dir << "/" << "linear_" << layer_id << "_bias_" << out_features_ << ".data"; + std::string bias_path = ss2.str(); + + // weights + std::ifstream weights_file(weights_path, std::ios::binary); + if(!weights_file.is_open()){ + std::cerr << "open weights file error : " << weights_path << std::endl << std::flush; + abort(); + } + weights_file.read(reinterpret_cast(weights_.data()), weights_.bytes_num()); + weights_file.close(); + + // bias + std::ifstream bias_file(bias_path, std::ios::binary); + 
if(!bias_file.is_open()){ + std::cerr << "open bias file error : " << bias_path << std::endl << std::flush; + abort(); + } + bias_file.read(reinterpret_cast(bias_.data()), bias_.bytes_num()); + bias_file.close(); + } + + MPI_Bcast(weights_.data(), weights_.element_num(), MPI_FLOAT, 0, MPI_COMM_WORLD); + MPI_Bcast(bias_.data(), bias_.element_num(), MPI_FLOAT, 0, MPI_COMM_WORLD); +} + + +void gelu_navie(int64_t len, float* data){ + for(int64_t i = 0; i < len; ++i){ + float x = data[i]; + data[i] = 0.5 * x * (1.f + tanhf(sqrtf(2.f / M_PI) * (x + 0.044715f * powf(x, 3.f)))); + } +} + +inline float tanhf_exp(float x){ + if(x > 8.f) return 1.; + if(x < -8.f) return -1.; + return 1.f - 2.f / (expf(2.f * x) + 1.f); +} + +double tanh_exp(double x){ + return 1. - 2. / (exp(2. * x) + 1.); +} + +void geluf_exp(int64_t len, float* data){ + + const float const_1 = sqrtf(2.f / M_PI); + const float const_2 = 0.044715f; + const float one = 1.f; + const float half = 0.5; + + for(int64_t i = 0; i < len; ++i){ + float x = data[i]; + data[i] = half * x * (one + tanhf_exp(const_1 * (x + const_2 * x * x * x))); + } +} + +void gelud_exp(int64_t len, float* data){ + const double const_1 = sqrtf(2. 
/ M_PI); + const double const_2 = 0.044715; + const double one = 1.; + const double half = 0.5; + + for(int64_t i = 0; i < len; ++i){ + double x = data[i]; + data[i] = half * x * (one + tanh_exp(const_1 * (x + const_2 * x * x * x))); + } +} + +// TODO +void gelu_exp_sve(int64_t len, float* data){ + +} + +template<> +void LinearGELU::forward(const Tensor& input, Tensor& output){ + assert(input.dim_num() == 2); + assert(output.dim_num() == 2); + assert(input.dim(0) == output.dim(0)); + assert(input.dim(1) == in_features_); + assert(out_features_ == output.dim(1)); + + char transA = 'N'; + char transB = 'N'; + float alpha = 1.f; + float beta = 1.f; + int m = out_features_; + int n = input.dim(0); + int k = in_features_; + float* A = weights_.data(); + int lda = out_features_; + float* B = const_cast(input.data()); + int ldb = input.dim(1); + float* C = output.data(); + int ldc = output.dim(1); + + for(int i = 0; i < input.dim(0); ++i) + std::copy(bias_.data(), bias_.data() + out_features_, C + i * out_features_); + + sgemm_(&transA, &transB, &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); + // GELU + geluf_exp(output.element_num(), output.data()); +} + +template class Tensor; +template class Linear; +template class LinearGELU; \ No newline at end of file diff --git a/src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.H b/src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.H new file mode 100644 index 000000000..c7eea6c7f --- /dev/null +++ b/src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.H @@ -0,0 +1,25 @@ +#pragma once + +#include +#include +#include +#include + +class dfChemistrySolver +{ +private: + std::vector modules_; + torch::Device device_; + double *Xmu_, *Xstd_, *Ymu_, *Ystd_; + + double *init_input_, *y_input_BCT; + int dim_input_, num_cells_, num_species_, num_modules_; +public: + dfChemistrySolver(int num_cells, int num_species); + ~dfChemistrySolver(); + + void loadModels(const std::string dir); + void loadNormalization(const std::string dir); + 
void Inference(const double *T, const double *p, const double *y, + const double *rho, double *RR); +}; \ No newline at end of file diff --git a/src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.cu b/src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.cu new file mode 100644 index 000000000..4e2014150 --- /dev/null +++ b/src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.cu @@ -0,0 +1,147 @@ +#include "dfChemistrySolver_cublas.H" + +__global__ void construct_init_input(int num_cells, int dim, const double *T, const double *p, + const double *y, double *y_input_BCT, double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + output[index * dim] = T[index]; + output[index * dim + 1] = p[index]; + double y_BCT; + for (int i = 0; i < dim - 2; ++i) { + y_BCT = (pow(y[i * num_cells + index], 0.1) - 1) * 10; // BCT: lambda = 0.1 + output[index * dim + 2 + i] = y_BCT; + y_input_BCT[i * num_cells + index] = y_BCT; + } +} + +__global__ void normalize_input(int num_cells, int dim, const double *input, + const double *Xmu, const double *Xstd, double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + for (int i = 0; i < dim; ++i) { + output[index * dim + i] = (input[index * dim + i] - Xmu[i]) / Xstd[i]; + } +} + +__global__ void calculate_y_new(int num_cells, int num_modules, const double *output_init, + const double *y_input_BCT, const double *Ymu, const double *Ystd, double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + double RR_tmp; + for (int i = 0; i < num_modules; ++i) { + RR_tmp = output_init[i * num_cells + index] * Ystd[i] + Ymu[i] + y_input_BCT[i * num_cells + index]; + RR_tmp = pow((RR_tmp * 0.1 + 1), 10); // rev-BCT: lambda = 0.1 + output[i * num_cells + index] = RR_tmp; + } +} + +__global__ void calculate_RR(int num_cells, int num_species, double delta_t, + const double *rho, const double *y_old, double 
*y_new, double *RR) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // normalize + double y_ave = 0.; + for (int i = 0; i < num_species; ++i) { + y_ave += y_new[i * num_cells + index]; + } + for (int i = 0; i < num_species; ++i) { + y_new[i * num_cells + index] = y_new[i * num_cells + index] / y_ave; + RR[i * num_cells + index] = (y_new[i * num_cells + index] - y_old[i * num_cells + index]) * rho[index] / delta_t; + } +} + +dfChemistrySolver::dfChemistrySolver(int num_cells, int num_species) + : device_(torch::kCUDA), num_cells_(num_cells), num_species_(num_species) +{ + dim_input_ = num_species + 2; // p, T, y + num_modules_ = num_species_ - 1; + cudaMalloc(&init_input_, sizeof(double) * num_cells * dim_input_); + cudaMalloc(&y_input_BCT, sizeof(double) * num_cells * num_species_); + cudaMalloc(&Xmu_, sizeof(double) * dim_input_); + cudaMalloc(&Xstd_, sizeof(double) * dim_input_); + cudaMalloc(&Ymu_, sizeof(double) * num_modules_); + cudaMalloc(&Ystd_, sizeof(double) * num_modules_); + modules_.reserve(num_modules_); + + // now norm paras are set in constructor manually + at::TensorOptions opts = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA); + std::vector Xmu_vec = {2.1996457626e+03, 1.0281762632e+05, -5.2533840071e+00, + -4.5623405933e+00, -1.5305375824e+00, -3.7453838144e+00, + -3.3097212360e+00, -4.3339657954e+00, -2.9028421546e-01}; + std::vector Xstd_vec = {3.8304388695e+02, 7.1905832810e+02, 4.6788389177e-01, + 5.3034657693e-01, 5.0325864222e-01, 5.1058993718e-01, + 7.9704668543e-01, 5.1327160125e-01, 4.6827591193e-03}; + std::vector Ymu_vec = {0.0085609265, -0.0082998877, 0.0030108739, + -0.0067360325, 0.0037464590, 0.0024258509}; + std::vector Ystd_vec = {0.0085609265, -0.0082998877, 0.0030108739, + -0.0067360325, 0.0037464590, 0.0024258509}; + + cudaMemcpy(Xmu_, Xmu_vec.data(), sizeof(double) * dim_input_, cudaMemcpyHostToDevice); + cudaMemcpy(Xstd_, Xstd_vec.data(), sizeof(double) * dim_input_, 
cudaMemcpyHostToDevice); + cudaMemcpy(Ymu_, Ymu_vec.data(), sizeof(double) * num_modules_, cudaMemcpyHostToDevice); + cudaMemcpy(Ystd_, Ystd_vec.data(), sizeof(double) * num_modules_, cudaMemcpyHostToDevice); + + // input modules + std::string prefix = "new_Temporary_Chemical_"; + std::string suffix = ".pt"; + for (int i = 0; i < num_modules_; ++i) { + std::string model_path = prefix + std::to_string(i) + suffix; + printf("loading model %s\n", model_path.c_str()); + modules_.push_back(torch::jit::load(model_path)); + // try { + // // modules_.push_back(torch::jit::load(model_path)); + // torch::jit::load(model_path); + // } + // catch (const c10::Error& e) { + // std::cerr << "error loading the model\n"; + // exit(-1); + // } + modules_[i].to(device_); + } +} + +dfChemistrySolver::~dfChemistrySolver() { + cudaFree(init_input_); +} + +void dfChemistrySolver::Inference(const double *T, const double *p, const double *y, + const double *rho, double *RR) { + // construct input + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells_ + threads_per_block - 1) / threads_per_block; + construct_init_input<<>>(num_cells_, dim_input_, T, p, y, y_input_BCT, init_input_); + normalize_input<<>>(num_cells_, dim_input_, init_input_, Xmu_, Xstd_, init_input_); + + // inference by torch + at::Tensor torch_input = torch::from_blob(init_input_, {num_cells_, dim_input_}, device_); + torch_input = torch_input.to(at::kFloat); + std::vector INPUTS; + INPUTS.push_back(torch_input); + + std::vector output(num_modules_); + for (int i = 0; i < num_modules_; ++i) { + output[i] = modules_[i].forward(INPUTS).toTensor(); + output[i] = output[i].to(at::kDouble); + } + + // post process + double *d_output; + for (int i = 0; i < num_modules_; ++i) { + d_output = output[i].data_ptr(); + cudaMemcpy(RR + i * num_cells_, d_output, sizeof(double) * num_cells_, cudaMemcpyDeviceToDevice); + } + calculate_y_new<<>>(num_cells_, num_modules_, RR, y_input_BCT, Ymu_, Ystd_, RR); + 
calculate_RR<<>>(num_cells_, num_species_, 1e-6, rho, y, RR, RR); +} \ No newline at end of file From 24d15bb78fa4b7dcfa749f9947c419caba51ec92 Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Tue, 7 Nov 2023 23:29:32 +0800 Subject: [PATCH 3/5] mixture averaged --- src_gpu/dfChemistrySolver/Layer.H | 88 -------- src_gpu/dfChemistrySolver/Layer.cu | 212 ------------------ .../dfChemistrySolver_cublas.H | 25 --- .../dfChemistrySolver_cublas.cu | 147 ------------ src_gpu/dfThermo.H | 9 +- src_gpu/dfThermo.cu | 89 +++++++- src_gpu/dfYEqn.H | 16 +- src_gpu/dfYEqn.cu | 62 +++-- 8 files changed, 149 insertions(+), 499 deletions(-) delete mode 100644 src_gpu/dfChemistrySolver/Layer.H delete mode 100644 src_gpu/dfChemistrySolver/Layer.cu delete mode 100644 src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.H delete mode 100644 src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.cu diff --git a/src_gpu/dfChemistrySolver/Layer.H b/src_gpu/dfChemistrySolver/Layer.H deleted file mode 100644 index e1a90e921..000000000 --- a/src_gpu/dfChemistrySolver/Layer.H +++ /dev/null @@ -1,88 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include - -template -class Tensor{ -private: - std::vector shape_; - int64_t element_num_; - DataType* data_; - bool owner_; -public: - Tensor(std::vector shape): - shape_(shape), - element_num_(std::accumulate(shape_.begin(),shape_.end(),1,std::multiplies())), - data_(new DataType[element_num_]), - owner_(true){}; - - Tensor(std::vector shape, DataType* data): - shape_(shape), - element_num_(std::accumulate(shape_.begin(),shape_.end(),1,std::multiplies())), - data_(data), - owner_(false){}; - ~Tensor(){ - if(owner_) delete[] data_; - } - int64_t dim_num() const {return shape_.size();}; - int64_t element_num() const {return element_num_;}; - int64_t bytes_num() const {return element_num_ * sizeof(DataType);}; - int64_t dim(int64_t i) const {return shape_[i];}; - const DataType* data() const {return data_;}; - DataType* data() {return data_;}; 
-}; - -template -class Layer{ -public: - virtual ~Layer() {}; - virtual void forward(const Tensor& input, Tensor& output) = 0; - virtual void load_parameters(const std::string& dir, int64_t layer_id) = 0; -}; - -template -class Linear : public Layer{ -private: - int64_t in_features_; - int64_t out_features_; - // (in_features, out_features) - Tensor weights_; - // (out_features,) - Tensor bias_; - -public: - Linear(int64_t in_features,int64_t out_features): - in_features_(in_features), - out_features_(out_features), - weights_({in_features_,out_features_}), - bias_({out_features_}){}; - virtual ~Linear(){}; - - virtual void forward(const Tensor& input, Tensor& output); - virtual void load_parameters(const std::string& dir, int64_t layer_id); -}; - -template -class LinearGELU : public Layer{ -private: - int64_t in_features_; - int64_t out_features_; - // (in_features, out_features) - Tensor weights_; - // (out_features,) - Tensor bias_; - -public: - LinearGELU(int64_t in_features,int64_t out_features): - in_features_(in_features), - out_features_(out_features), - weights_({in_features_,out_features_}), - bias_({out_features_}){}; - virtual ~LinearGELU(){}; - - virtual void forward(const Tensor& input, Tensor& output); - virtual void load_parameters(const std::string& dir, int64_t layer_id); -}; \ No newline at end of file diff --git a/src_gpu/dfChemistrySolver/Layer.cu b/src_gpu/dfChemistrySolver/Layer.cu deleted file mode 100644 index 23910b517..000000000 --- a/src_gpu/dfChemistrySolver/Layer.cu +++ /dev/null @@ -1,212 +0,0 @@ -#include "Layer.H" -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -extern void sgemm_(char *transa, char *transb, int *m, int *n, int *k, float *alpha, float *a, int *lda, float *b, int *ldb, float *beta, float *c, int *ldc); - -#ifdef __cplusplus -} -#endif - -template<> -void Linear::forward(const Tensor& input, Tensor& output){ - assert(input.dim_num() == 2); - 
assert(output.dim_num() == 2); - assert(input.dim(0) == output.dim(0)); - assert(input.dim(1) == in_features_); - assert(out_features_ == output.dim(1)); - - char transA = 'N'; - char transB = 'N'; - float alpha = 1.f; - float beta = 1.f; - int m = out_features_; - int n = input.dim(0); - int k = in_features_; - float* A = weights_.data(); - int lda = out_features_; - float* B = const_cast(input.data()); - int ldb = input.dim(1); - float* C = output.data(); - int ldc = output.dim(1); - - for(int i = 0; i < input.dim(0); ++i) - std::copy(bias_.data(), bias_.data() + out_features_, C + i * out_features_); - - sgemm_(&transA, &transB, &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); -} - -template<> -void Linear::load_parameters(const std::string& dir, int64_t layer_id){ - int flag_mpi_init; - MPI_Initialized(&flag_mpi_init); - if(!flag_mpi_init){ - std::cerr << "DNNInferencer_blas::load_models : MPI is not initialized" << std::endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - int mpirank; - int mpisize; - MPI_Comm_rank(MPI_COMM_WORLD, &mpirank); - MPI_Comm_size(MPI_COMM_WORLD, &mpisize); - - if(mpirank == 0){ - std::stringstream ss1,ss2; - ss1 << dir << "/" << "linear_" << layer_id << "_weights_rowmajor_" << in_features_ << "_" << out_features_ << ".data"; - std::string weights_path = ss1.str(); - ss2 << dir << "/" << "linear_" << layer_id << "_bias_" << out_features_ << ".data"; - std::string bias_path = ss2.str(); - // weights - std::ifstream weights_file(weights_path, std::ios::binary); - if(!weights_file.is_open()){ - std::cerr << "open weights file error : " << weights_path << std::endl << std::flush; - abort(); - } - weights_file.read(reinterpret_cast(weights_.data()), weights_.bytes_num()); - weights_file.close(); - // bias - std::ifstream bias_file(bias_path, std::ios::binary); - if(!bias_file.is_open()){ - std::cerr << "open bias file error : " << bias_path << std::endl << std::flush; - abort(); - } - bias_file.read(reinterpret_cast(bias_.data()), 
bias_.bytes_num()); - bias_file.close(); - } - - MPI_Bcast(weights_.data(), weights_.element_num(), MPI_FLOAT, 0, MPI_COMM_WORLD); - MPI_Bcast(bias_.data(), bias_.element_num(), MPI_FLOAT, 0, MPI_COMM_WORLD); -} - -template<> -void LinearGELU::load_parameters(const std::string& dir, int64_t layer_id){ - int flag_mpi_init; - MPI_Initialized(&flag_mpi_init); - if(!flag_mpi_init){ - std::cerr << "DNNInferencer_blas::load_models : MPI is not initialized" << std::endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - int mpirank; - int mpisize; - MPI_Comm_rank(MPI_COMM_WORLD, &mpirank); - MPI_Comm_size(MPI_COMM_WORLD, &mpisize); - - if(mpirank == 0){ - std::stringstream ss1,ss2; - ss1 << dir << "/" << "linear_" << layer_id << "_weights_rowmajor_" << in_features_ << "_" << out_features_ << ".data"; - std::string weights_path = ss1.str(); - ss2 << dir << "/" << "linear_" << layer_id << "_bias_" << out_features_ << ".data"; - std::string bias_path = ss2.str(); - - // weights - std::ifstream weights_file(weights_path, std::ios::binary); - if(!weights_file.is_open()){ - std::cerr << "open weights file error : " << weights_path << std::endl << std::flush; - abort(); - } - weights_file.read(reinterpret_cast(weights_.data()), weights_.bytes_num()); - weights_file.close(); - - // bias - std::ifstream bias_file(bias_path, std::ios::binary); - if(!bias_file.is_open()){ - std::cerr << "open bias file error : " << bias_path << std::endl << std::flush; - abort(); - } - bias_file.read(reinterpret_cast(bias_.data()), bias_.bytes_num()); - bias_file.close(); - } - - MPI_Bcast(weights_.data(), weights_.element_num(), MPI_FLOAT, 0, MPI_COMM_WORLD); - MPI_Bcast(bias_.data(), bias_.element_num(), MPI_FLOAT, 0, MPI_COMM_WORLD); -} - - -void gelu_navie(int64_t len, float* data){ - for(int64_t i = 0; i < len; ++i){ - float x = data[i]; - data[i] = 0.5 * x * (1.f + tanhf(sqrtf(2.f / M_PI) * (x + 0.044715f * powf(x, 3.f)))); - } -} - -inline float tanhf_exp(float x){ - if(x > 8.f) return 1.; - if(x < -8.f) 
return -1.; - return 1.f - 2.f / (expf(2.f * x) + 1.f); -} - -double tanh_exp(double x){ - return 1. - 2. / (exp(2. * x) + 1.); -} - -void geluf_exp(int64_t len, float* data){ - - const float const_1 = sqrtf(2.f / M_PI); - const float const_2 = 0.044715f; - const float one = 1.f; - const float half = 0.5; - - for(int64_t i = 0; i < len; ++i){ - float x = data[i]; - data[i] = half * x * (one + tanhf_exp(const_1 * (x + const_2 * x * x * x))); - } -} - -void gelud_exp(int64_t len, float* data){ - const double const_1 = sqrtf(2. / M_PI); - const double const_2 = 0.044715; - const double one = 1.; - const double half = 0.5; - - for(int64_t i = 0; i < len; ++i){ - double x = data[i]; - data[i] = half * x * (one + tanh_exp(const_1 * (x + const_2 * x * x * x))); - } -} - -// TODO -void gelu_exp_sve(int64_t len, float* data){ - -} - -template<> -void LinearGELU::forward(const Tensor& input, Tensor& output){ - assert(input.dim_num() == 2); - assert(output.dim_num() == 2); - assert(input.dim(0) == output.dim(0)); - assert(input.dim(1) == in_features_); - assert(out_features_ == output.dim(1)); - - char transA = 'N'; - char transB = 'N'; - float alpha = 1.f; - float beta = 1.f; - int m = out_features_; - int n = input.dim(0); - int k = in_features_; - float* A = weights_.data(); - int lda = out_features_; - float* B = const_cast(input.data()); - int ldb = input.dim(1); - float* C = output.data(); - int ldc = output.dim(1); - - for(int i = 0; i < input.dim(0); ++i) - std::copy(bias_.data(), bias_.data() + out_features_, C + i * out_features_); - - sgemm_(&transA, &transB, &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); - // GELU - geluf_exp(output.element_num(), output.data()); -} - -template class Tensor; -template class Linear; -template class LinearGELU; \ No newline at end of file diff --git a/src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.H b/src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.H deleted file mode 100644 index c7eea6c7f..000000000 --- 
a/src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.H +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -class dfChemistrySolver -{ -private: - std::vector modules_; - torch::Device device_; - double *Xmu_, *Xstd_, *Ymu_, *Ystd_; - - double *init_input_, *y_input_BCT; - int dim_input_, num_cells_, num_species_, num_modules_; -public: - dfChemistrySolver(int num_cells, int num_species); - ~dfChemistrySolver(); - - void loadModels(const std::string dir); - void loadNormalization(const std::string dir); - void Inference(const double *T, const double *p, const double *y, - const double *rho, double *RR); -}; \ No newline at end of file diff --git a/src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.cu b/src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.cu deleted file mode 100644 index 4e2014150..000000000 --- a/src_gpu/dfChemistrySolver/dfChemistrySolver_cublas.cu +++ /dev/null @@ -1,147 +0,0 @@ -#include "dfChemistrySolver_cublas.H" - -__global__ void construct_init_input(int num_cells, int dim, const double *T, const double *p, - const double *y, double *y_input_BCT, double *output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - output[index * dim] = T[index]; - output[index * dim + 1] = p[index]; - double y_BCT; - for (int i = 0; i < dim - 2; ++i) { - y_BCT = (pow(y[i * num_cells + index], 0.1) - 1) * 10; // BCT: lambda = 0.1 - output[index * dim + 2 + i] = y_BCT; - y_input_BCT[i * num_cells + index] = y_BCT; - } -} - -__global__ void normalize_input(int num_cells, int dim, const double *input, - const double *Xmu, const double *Xstd, double *output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - for (int i = 0; i < dim; ++i) { - output[index * dim + i] = (input[index * dim + i] - Xmu[i]) / Xstd[i]; - } -} - -__global__ void calculate_y_new(int num_cells, int num_modules, const double *output_init, - const double *y_input_BCT, 
const double *Ymu, const double *Ystd, double *output) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - double RR_tmp; - for (int i = 0; i < num_modules; ++i) { - RR_tmp = output_init[i * num_cells + index] * Ystd[i] + Ymu[i] + y_input_BCT[i * num_cells + index]; - RR_tmp = pow((RR_tmp * 0.1 + 1), 10); // rev-BCT: lambda = 0.1 - output[i * num_cells + index] = RR_tmp; - } -} - -__global__ void calculate_RR(int num_cells, int num_species, double delta_t, - const double *rho, const double *y_old, double *y_new, double *RR) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - // normalize - double y_ave = 0.; - for (int i = 0; i < num_species; ++i) { - y_ave += y_new[i * num_cells + index]; - } - for (int i = 0; i < num_species; ++i) { - y_new[i * num_cells + index] = y_new[i * num_cells + index] / y_ave; - RR[i * num_cells + index] = (y_new[i * num_cells + index] - y_old[i * num_cells + index]) * rho[index] / delta_t; - } -} - -dfChemistrySolver::dfChemistrySolver(int num_cells, int num_species) - : device_(torch::kCUDA), num_cells_(num_cells), num_species_(num_species) -{ - dim_input_ = num_species + 2; // p, T, y - num_modules_ = num_species_ - 1; - cudaMalloc(&init_input_, sizeof(double) * num_cells * dim_input_); - cudaMalloc(&y_input_BCT, sizeof(double) * num_cells * num_species_); - cudaMalloc(&Xmu_, sizeof(double) * dim_input_); - cudaMalloc(&Xstd_, sizeof(double) * dim_input_); - cudaMalloc(&Ymu_, sizeof(double) * num_modules_); - cudaMalloc(&Ystd_, sizeof(double) * num_modules_); - modules_.reserve(num_modules_); - - // now norm paras are set in constructor manually - at::TensorOptions opts = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA); - std::vector Xmu_vec = {2.1996457626e+03, 1.0281762632e+05, -5.2533840071e+00, - -4.5623405933e+00, -1.5305375824e+00, -3.7453838144e+00, - -3.3097212360e+00, -4.3339657954e+00, -2.9028421546e-01}; - std::vector Xstd_vec = 
{3.8304388695e+02, 7.1905832810e+02, 4.6788389177e-01, - 5.3034657693e-01, 5.0325864222e-01, 5.1058993718e-01, - 7.9704668543e-01, 5.1327160125e-01, 4.6827591193e-03}; - std::vector Ymu_vec = {0.0085609265, -0.0082998877, 0.0030108739, - -0.0067360325, 0.0037464590, 0.0024258509}; - std::vector Ystd_vec = {0.0085609265, -0.0082998877, 0.0030108739, - -0.0067360325, 0.0037464590, 0.0024258509}; - - cudaMemcpy(Xmu_, Xmu_vec.data(), sizeof(double) * dim_input_, cudaMemcpyHostToDevice); - cudaMemcpy(Xstd_, Xstd_vec.data(), sizeof(double) * dim_input_, cudaMemcpyHostToDevice); - cudaMemcpy(Ymu_, Ymu_vec.data(), sizeof(double) * num_modules_, cudaMemcpyHostToDevice); - cudaMemcpy(Ystd_, Ystd_vec.data(), sizeof(double) * num_modules_, cudaMemcpyHostToDevice); - - // input modules - std::string prefix = "new_Temporary_Chemical_"; - std::string suffix = ".pt"; - for (int i = 0; i < num_modules_; ++i) { - std::string model_path = prefix + std::to_string(i) + suffix; - printf("loading model %s\n", model_path.c_str()); - modules_.push_back(torch::jit::load(model_path)); - // try { - // // modules_.push_back(torch::jit::load(model_path)); - // torch::jit::load(model_path); - // } - // catch (const c10::Error& e) { - // std::cerr << "error loading the model\n"; - // exit(-1); - // } - modules_[i].to(device_); - } -} - -dfChemistrySolver::~dfChemistrySolver() { - cudaFree(init_input_); -} - -void dfChemistrySolver::Inference(const double *T, const double *p, const double *y, - const double *rho, double *RR) { - // construct input - size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_cells_ + threads_per_block - 1) / threads_per_block; - construct_init_input<<>>(num_cells_, dim_input_, T, p, y, y_input_BCT, init_input_); - normalize_input<<>>(num_cells_, dim_input_, init_input_, Xmu_, Xstd_, init_input_); - - // inference by torch - at::Tensor torch_input = torch::from_blob(init_input_, {num_cells_, dim_input_}, device_); - torch_input = torch_input.to(at::kFloat); - 
std::vector INPUTS; - INPUTS.push_back(torch_input); - - std::vector output(num_modules_); - for (int i = 0; i < num_modules_; ++i) { - output[i] = modules_[i].forward(INPUTS).toTensor(); - output[i] = output[i].to(at::kDouble); - } - - // post process - double *d_output; - for (int i = 0; i < num_modules_; ++i) { - d_output = output[i].data_ptr(); - cudaMemcpy(RR + i * num_cells_, d_output, sizeof(double) * num_cells_, cudaMemcpyDeviceToDevice); - } - calculate_y_new<<>>(num_cells_, num_modules_, RR, y_input_BCT, Ymu_, Ystd_, RR); - calculate_RR<<>>(num_cells_, num_species_, 1e-6, rho, y, RR, RR); -} \ No newline at end of file diff --git a/src_gpu/dfThermo.H b/src_gpu/dfThermo.H index e1f9c0f58..fdcf54b78 100644 --- a/src_gpu/dfThermo.H +++ b/src_gpu/dfThermo.H @@ -63,8 +63,9 @@ public: void setConstantValue(std::string mechanism_file, int num_cells, int num_species); void setConstantFields(const std::vector patch_type); void initNonConstantFields(const double *h_T, const double *h_he, const double *h_psi, const double *h_alpha, - const double *h_mu, const double *h_k, const double *h_dpdt, const double *h_boundary_T, const double *h_boundary_he, - const double *h_boundary_psi, const double *h_boundary_alpha, const double *h_boundary_mu, const double *h_boundary_k); + const double *h_mu, const double *h_k, const double *h_dpdt, const double *h_rhoD, const double *h_boundary_T, + const double *h_boundary_he, const double *h_boundary_psi, const double *h_boundary_alpha, const double *h_boundary_mu, + const double *h_boundary_k, const double *h_boundary_rhoD); // set mass fraction void setMassFraction(const double *d_y, const double *d_boundary_y); @@ -79,6 +80,9 @@ public: void calculateThermoConductivityGPU(int thread_per_block, int num_thread, int num_total, const double *T, const double *T_poly, const double *d_y, const double *mole_fraction, double *species_thermal_conductivities, double *thermal_conductivity, int offset = 0); + void calculateRhoDGPU(int 
threads_per_block, int num_thread, int num_total, const double *T, + const double *T_poly, const double *p, const double *mole_fraction, + const double *mean_mole_weight, const double *rho, double *rhoD, int offset = 0); void calculateEnthalpyGPU(int thread_per_block, int num_thread, const double *T, double *enthalpy, const double *d_mass_fraction, int offset = 0); void calculateTemperatureGPU(int thread_per_block, int num_thread, int num_total, const double *T_init, const double *target_h, double *T, const double *d_mass_fraction, int offset = 0, @@ -98,6 +102,7 @@ public: void compareMu(const double *mu, const double *boundary_mu, bool printFlag); void compareAlpha(const double *alpha, const double *boundary_alpha, bool printFlag); void compareHe(const double *he, const double *boundary_he, bool printFlag); + void compareRhoD(const double *rhoD, const double *boundary_rhoD, int species_index, bool printFlag); void correctHe(const double *he, const double *boundary_he); void correctPsi(const double *psi, const double *boundary_psi); diff --git a/src_gpu/dfThermo.cu b/src_gpu/dfThermo.cu index 9e3aba78d..9c46b3d96 100644 --- a/src_gpu/dfThermo.cu +++ b/src_gpu/dfThermo.cu @@ -8,7 +8,7 @@ #define GAS_CANSTANT 8314.46261815324 #define SQRT8 2.8284271247461903 -#define NUM_SPECIES 7 +#define NUM_SPECIES 9 // constant memory __constant__ __device__ double d_nasa_coeffs[NUM_SPECIES*15]; @@ -200,6 +200,47 @@ __global__ void calculate_thermoConductivity_kernel(int num_thread, int num_tota thermal_conductivity[startIndex] = lambda_mix / cp; } +__global__ void calculate_diffusion_kernel(int num_thread, int num_total, int num_species, + int offset, const double *T_poly, const double *mole_fraction, const double *p, + const double *mean_mole_weight, const double *rho, const double *T, double *d) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_thread) + return; + + int startIndex = offset + index; + + double D[NUM_SPECIES * NUM_SPECIES]; + double 
sum1, sum2; + double tmp; + + for (int i = 0; i < num_species; i++) { + for (int j = 0; j < num_species; j++) { + tmp = 0.; + for (int k = 0; k < 5; k++) + tmp += (d_binary_diffusion_coeffs[i * num_species * 5 + j * 5 + k] * T_poly[num_total * k + startIndex]); + D[i * num_species + j] = tmp * pow(T[startIndex], 1.5); + } + } + for (int i = 0; i < num_species; i++) { + if (mole_fraction[num_total * i + startIndex] + 1e-10 > 1.) { + d[num_total * i + startIndex] = 0.; + continue; + } + sum1 = 0.; + sum2 = 0.; + for (int j = 0; j < num_species; j++) { + if (i == j) continue; + sum1 += mole_fraction[num_total * j + startIndex] / D[i * num_species + j]; + sum2 += mole_fraction[num_total * j + startIndex] * d_molecular_weights[j] / D[i * num_species + j]; + } + sum1 *= p[startIndex]; + sum2 *= p[startIndex] * mole_fraction[num_total * i + startIndex] / + (mean_mole_weight[startIndex] - mole_fraction[num_total * i + startIndex] * d_molecular_weights[i]); + d[num_total * i + startIndex] = 1 / (sum1 + sum2) * rho[startIndex]; + } +} + __device__ double calculate_enthalpy_device_kernel(int num_total, int num_species, int index, const double local_T, const double *mass_fraction) { @@ -404,8 +445,9 @@ void dfThermo::setConstantFields(const std::vector patch_type) } void dfThermo::initNonConstantFields(const double *h_T, const double *h_he, const double *h_psi, const double *h_alpha, - const double *h_mu, const double *h_k, const double *h_dpdt, const double *h_boundary_T, const double *h_boundary_he, - const double *h_boundary_psi, const double *h_boundary_alpha, const double *h_boundary_mu, const double *h_boundary_k) + const double *h_mu, const double *h_k, const double *h_dpdt, const double *h_rhoD, const double *h_boundary_T, + const double *h_boundary_he, const double *h_boundary_psi, const double *h_boundary_alpha, const double *h_boundary_mu, + const double *h_boundary_k, const double *h_boundary_rhoD) { checkCudaErrors(cudaMemcpy(dataBase_.d_T, h_T, 
dataBase_.cell_value_bytes, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(dataBase_.d_he, h_he, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice)); @@ -414,6 +456,7 @@ void dfThermo::initNonConstantFields(const double *h_T, const double *h_he, cons checkCudaErrors(cudaMemcpy(dataBase_.d_mu, h_mu, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(dataBase_.d_k, h_k, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(dataBase_.d_dpdt, h_dpdt, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(dataBase_.d_thermo_rhoD, h_rhoD, dataBase_.cell_value_bytes * dataBase_.num_species, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_T, h_boundary_T, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_he, h_boundary_he, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice)); @@ -421,6 +464,7 @@ void dfThermo::initNonConstantFields(const double *h_T, const double *h_he, cons checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_thermo_alpha, h_boundary_alpha, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_mu, h_boundary_mu, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_k, h_boundary_k, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_thermo_rhoD, h_boundary_rhoD, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, cudaMemcpyHostToDevice)); } void dfThermo::calculateTPolyGPU(int threads_per_block, int num_thread, int num_total, const double *T, double *T_poly, int offset) @@ -460,6 +504,15 @@ void dfThermo::calculateThermoConductivityGPU(int threads_per_block, int num_thr offset, d_nasa_coeffs, d_y, T_poly, T, mole_fraction, species_thermal_conductivities, thermal_conductivity); 
} +void dfThermo::calculateRhoDGPU(int threads_per_block, int num_thread, int num_total, const double *T, + const double *T_poly, const double *p, const double *mole_fraction, + const double *mean_mole_weight, const double *rho, double *rhoD, int offset) +{ + size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; + calculate_diffusion_kernel<<>>(num_thread, num_total, num_species, offset, + T_poly, mole_fraction, p, mean_mole_weight, rho, T, rhoD); +} + void dfThermo::calculateTemperatureGPU(int threads_per_block, int num_thread, int num_total, const double *T_init, const double *target_h, double *T, const double *d_mass_fraction, int offset, double atol, double rtol, int max_iter) { @@ -519,6 +572,8 @@ void dfThermo::correctThermo() d_T_poly, d_species_viscosities, dataBase_.d_mu); // calculate viscosity calculateThermoConductivityGPU(cell_thread, dataBase_.num_cells, dataBase_.num_cells, dataBase_.d_T, d_T_poly, dataBase_.d_y, d_mole_fraction, d_species_thermal_conductivities, dataBase_.d_thermo_alpha); // calculate thermal conductivity + calculateRhoDGPU(cell_thread, dataBase_.num_cells, dataBase_.num_cells, dataBase_.d_T, d_T_poly, dataBase_.d_p, d_mole_fraction, + d_mean_mole_weight, dataBase_.d_rho, dataBase_.d_thermo_rhoD); // boundary field int offset = 0; for (int i = 0; i < dataBase_.num_patches; i++) { @@ -533,6 +588,8 @@ void dfThermo::correctThermo() d_boundary_T_poly, d_boundary_species_viscosities, dataBase_.d_boundary_mu, offset); calculateThermoConductivityGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.num_boundary_surfaces, dataBase_.d_boundary_T, d_boundary_T_poly, dataBase_.d_boundary_y, d_boundary_mole_fraction, d_boundary_species_thermal_conductivities, dataBase_.d_boundary_thermo_alpha, offset); + calculateRhoDGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.num_boundary_surfaces, dataBase_.d_boundary_T, d_boundary_T_poly, + dataBase_.d_boundary_p, d_boundary_mole_fraction, 
d_boundary_mean_mole_weight, dataBase_.d_boundary_rho, dataBase_.d_boundary_thermo_rhoD, offset); } else { calculateTemperatureGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.num_boundary_surfaces, dataBase_.d_boundary_T, dataBase_.d_boundary_he, dataBase_.d_boundary_T, dataBase_.d_boundary_y, offset); @@ -543,6 +600,8 @@ void dfThermo::correctThermo() d_boundary_T_poly, d_boundary_species_viscosities, dataBase_.d_boundary_mu, offset); calculateThermoConductivityGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.num_boundary_surfaces, dataBase_.d_boundary_T, d_boundary_T_poly, dataBase_.d_boundary_y, d_boundary_mole_fraction, d_boundary_species_thermal_conductivities, dataBase_.d_boundary_thermo_alpha, offset); + calculateRhoDGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.num_boundary_surfaces, dataBase_.d_boundary_T, d_boundary_T_poly, + dataBase_.d_boundary_p, d_boundary_mole_fraction, d_boundary_mean_mole_weight, dataBase_.d_boundary_rho, dataBase_.d_boundary_thermo_rhoD, offset); } // correct internal field of processor boundary if (dataBase_.patch_type_T[i] == boundaryConditions::processor @@ -561,7 +620,11 @@ void dfThermo::correctThermo() dataBase_.d_mu, dataBase_.d_boundary_face_cell, dataBase_.d_boundary_mu); correct_internal_boundary_field_scalar<<>>(dataBase_.patch_size[i], offset, dataBase_.d_rho, dataBase_.d_boundary_face_cell, dataBase_.d_boundary_rho); - + for (int j = 0; j < num_species; j++) { + correct_internal_boundary_field_scalar<<>>(dataBase_.patch_size[i], offset, + dataBase_.d_thermo_rhoD + j * dataBase_.num_cells, dataBase_.d_boundary_face_cell, + dataBase_.d_boundary_thermo_rhoD + j * dataBase_.num_boundary_surfaces); + } offset += 2 * dataBase_.patch_size[i]; } else { offset += dataBase_.patch_size[i]; } @@ -740,6 +803,24 @@ void dfThermo::compareAlpha(const double *alpha, const double *boundary_alpha, b delete h_boundary_alpha; } +void dfThermo::compareRhoD(const double *rhoD, const double *boundary_rhoD, int 
species_index, bool printFlag) +{ + double *h_rhoD = new double[dataBase_.num_cells]; + double *h_boundary_rhoD = new double[dataBase_.num_boundary_surfaces]; + + checkCudaErrors(cudaMemcpy(h_rhoD, dataBase_.d_thermo_rhoD + species_index * dataBase_.num_cells, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(h_boundary_rhoD, dataBase_.d_boundary_thermo_rhoD + species_index * dataBase_.num_boundary_surfaces, + dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost)); + + fprintf(stderr, "check h_thermo_rhoD\n"); + checkVectorEqual(dataBase_.num_cells, rhoD, h_rhoD, 1e-10, printFlag); + fprintf(stderr, "check h_thermo_boundary_rhoD\n"); + checkVectorEqual(dataBase_.num_boundary_surfaces, boundary_rhoD, h_boundary_rhoD, 1e-10, printFlag); + + delete h_rhoD; + delete h_boundary_rhoD; +} + void dfThermo::correctHe(const double *he, const double *boundary_he) { checkCudaErrors(cudaMemcpy(dataBase_.d_he, he, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice)); diff --git a/src_gpu/dfYEqn.H b/src_gpu/dfYEqn.H index ab89abde6..b3715efc4 100644 --- a/src_gpu/dfYEqn.H +++ b/src_gpu/dfYEqn.H @@ -4,11 +4,13 @@ #include #include "dfMatrixDataBase.H" #include "dfMatrixOpBase.H" +#include "dfChemistrySolver.H" class dfYEqn { private: dfMatrixDataBase &dataBase_; + dfChemistrySolver &chemistrySolver_; // cuda resource cudaStream_t stream; @@ -48,6 +50,8 @@ private: double *d_phiUc = nullptr; double *d_DEff = nullptr; double *d_permute = nullptr; + // combustion fields + double *d_RR = nullptr; // computed on CPU, used on GPU, need memcpyh2d double *h_hai = nullptr; double *h_rhoD = nullptr; @@ -92,8 +96,8 @@ private: public: // 构造函数 - dfYEqn(dfMatrixDataBase &dataBase) - : dataBase_(dataBase) {} + dfYEqn(dfMatrixDataBase &dataBase, dfChemistrySolver &chemistrySolver) + : dataBase_(dataBase), chemistrySolver_(chemistrySolver) {} // 析构函数 ~dfYEqn(){} @@ -127,9 +131,11 @@ public: void yeqn_compute_thermo_alpha(cudaStream_t stream, int 
num_cells, const double *rhoD, double *thermo_alpha, int num_boundary_surfaces, const double *boundary_rhoD, double *boundary_thermo_alpha); - void yeqn_compute_DEff(cudaStream_t stream, int num_species, int num_cells, int num_boundary_surfaces, + void yeqn_compute_DEff_via_lewisNumber(cudaStream_t stream, int num_species, int num_cells, int num_boundary_surfaces, double *lewis_number, const double *alpha, const double *mut_sct, double *DEff, const double *boundary_alpha, const double *boundary_mut_sct, double *boundary_DEff); + void yeqn_compute_RR(dfChemistrySolver& chemistrySolver, cudaStream_t stream, const double *h_T, const double *d_T, + const double *p, const double *y, const double *rho, double *RR); void yeqn_fvc_laplacian_scalar(cudaStream_t stream, ncclComm_t comm, const int *neighbor_peer, int num_species, int num_cells, int num_surfaces, int num_boundary_surfaces, const int *lowerAddr, const int *upperAddr, @@ -140,9 +146,9 @@ public: const double *boundary_thermo_alpha, const double *boundary_hai, const double *boundary_vf, const int *cyclicNeighbor, const int *patchSizeOffset, double *boundary_output); void yeqn_compute_sumYDiffError_and_hDiffCorrFlux(cudaStream_t stream, int num_species, int num_cells, int num_boundary_surfaces, - const double *lewis_number, const double *thermo_alpha, const double *hai, const double *y, const double *grad_y, + const double *rhoD, const double *hai, const double *y, const double *grad_y, double *sumY_diff_error, double *hDiff_corr_flux, - const double *boundary_hai, const double *boundary_y, const double *boundary_grad_y, const double *boundary_thermo_alpha, + const double *boundary_hai, const double *boundary_y, const double *boundary_grad_y, const double *boundary_rhoD, double *boundary_sumY_diff_error, double *boundary_hDiff_corr_flux); void yeqn_compute_phiUc(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, const int *lowerAddr, const int *upperAddr, diff --git 
a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu index d68ad23ff..e6bbc5d4c 100644 --- a/src_gpu/dfYEqn.cu +++ b/src_gpu/dfYEqn.cu @@ -83,7 +83,7 @@ __global__ void yeqn_compute_phiUc_boundary(int num_boundary_surfaces, } __global__ void yeqn_sumError_and_compute_hDiffCorrFlux(int num_species, int num, - const double *lewis_number, const double *thermo_alpha, const double *hai, const double *y, const double *grady, + const double *rhoD, const double *hai, const double *y, const double *grady, double *sum_rhoD_grady, double *hDiffCorrFlux) { int index = blockDim.x * blockIdx.x + threadIdx.x; @@ -99,7 +99,7 @@ __global__ void yeqn_sumError_and_compute_hDiffCorrFlux(int num_species, int num double sum_hai_y = 0; for (int s = 0; s < num_species; s++) { double hai_value = hai[num * s + index]; - double rhoD_value = thermo_alpha[index] / lewis_number[s]; // le = alpha/D + double rhoD_value = rhoD[num * s + index]; // le = alpha/D double y_value = y[num * s + index]; double grady_x = grady[num * s * 3 + num * 0 + index]; double grady_y = grady[num * s * 3 + num * 1 + index]; @@ -346,6 +346,8 @@ void dfYEqn::createNonConstantFieldsInternal() { checkCudaErrors(cudaMalloc((void**)&d_phiUc, dataBase_.surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_DEff, dataBase_.cell_value_bytes * dataBase_.num_species)); checkCudaErrors(cudaMalloc((void**)&d_permute, dataBase_.cell_value_vec_bytes)); + + checkCudaErrors(cudaMalloc((void**)&d_RR, dataBase_.cell_value_bytes * dataBase_.num_species)); #endif // computed on CPU, used on GPU, need memcpyh2d checkCudaErrors(cudaMallocHost((void**)&h_rhoD, dataBase_.cell_value_bytes * dataBase_.num_species)); @@ -458,6 +460,8 @@ void dfYEqn::process() { checkCudaErrors(cudaMallocAsync((void**)&d_phiUc, dataBase_.surface_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMallocAsync((void**)&d_DEff, dataBase_.cell_value_bytes * dataBase_.num_species, dataBase_.stream)); checkCudaErrors(cudaMallocAsync((void**)&d_permute, 
dataBase_.cell_value_vec_bytes, dataBase_.stream)); + // combustion fields + checkCudaErrors(cudaMallocAsync((void**)&d_RR, dataBase_.cell_value_bytes * dataBase_.num_species, dataBase_.stream)); // thermophysical fields checkCudaErrors(cudaMallocAsync((void**)&d_boundary_hai, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, dataBase_.stream)); checkCudaErrors(cudaMallocAsync((void**)&d_boundary_mut_sct, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); @@ -489,6 +493,11 @@ void dfYEqn::process() { checkCudaErrors(cudaMemsetAsync(dataBase_.d_boundary_diff_alphaD, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(d_grad_y, 0, dataBase_.cell_value_vec_bytes * dataBase_.num_species, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(d_boundary_grad_y, 0, dataBase_.boundary_surface_value_vec_bytes * dataBase_.num_species, dataBase_.stream)); + // combustion fields + checkCudaErrors(cudaMemsetAsync(d_RR, 0, dataBase_.cell_value_bytes * dataBase_.num_species, dataBase_.stream)); + // calculate reaction rates + checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); + yeqn_compute_RR(chemistrySolver_, dataBase_.stream, dataBase_.h_T, dataBase_.d_T, dataBase_.d_p, dataBase_.d_y, dataBase_.d_rho_old, d_RR); // compute diffAlphaD yeqn_fvc_laplacian_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_species, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, @@ -522,9 +531,9 @@ void dfYEqn::process() { // compute sumYDiffError and hDiffCorrFlux yeqn_compute_sumYDiffError_and_hDiffCorrFlux(dataBase_.stream, dataBase_.num_species, dataBase_.num_cells, dataBase_.num_boundary_surfaces, - d_lewis_number, dataBase_.d_thermo_alpha, d_hai, dataBase_.d_y, d_grad_y, + dataBase_.d_thermo_rhoD, d_hai, dataBase_.d_y, d_grad_y, d_sumY_diff_error, dataBase_.d_hDiff_corr_flux, - d_boundary_hai, dataBase_.d_boundary_y, d_boundary_grad_y, 
dataBase_.d_boundary_thermo_alpha, + d_boundary_hai, dataBase_.d_boundary_y, d_boundary_grad_y, dataBase_.d_boundary_thermo_rhoD, d_boundary_sumY_diff_error, dataBase_.d_boundary_hDiff_corr_flux); // compute phiUc yeqn_compute_phiUc(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, @@ -539,8 +548,9 @@ void dfYEqn::process() { // turbulence->mut()/Sct = 0 when laminar. // double *d_DEff = d_rhoD; // double *d_boundary_DEff = d_boundary_rhoD; - yeqn_compute_DEff(dataBase_.stream, dataBase_.num_species, dataBase_.num_cells, dataBase_.num_boundary_surfaces, - d_lewis_number, dataBase_.d_thermo_alpha, d_mut_sct, d_DEff, dataBase_.d_boundary_thermo_alpha, d_boundary_mut_sct, d_boundary_DEff); + // TODO: calculate d_DEff in dfThermo + // yeqn_compute_DEff_via_lewisNumber(dataBase_.stream, dataBase_.num_species, dataBase_.num_cells, dataBase_.num_boundary_surfaces, + // d_lewis_number, dataBase_.d_thermo_alpha, d_mut_sct, d_DEff, dataBase_.d_boundary_thermo_alpha, d_boundary_mut_sct, d_boundary_DEff); #ifdef USE_GRAPH checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0)); @@ -574,6 +584,15 @@ void dfYEqn::process() { fvm_ddt_vol_scalar_vol_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, dataBase_.d_rho, dataBase_.d_rho_old, dataBase_.d_y + dataBase_.num_cells * s, dataBase_.d_volume, d_diag, d_source, 1.); + // **calculate div weights with limitedLinear scheme** + // compute_limitedLinear_weight(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_surfaces, + // dataBase_.num_cells, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_mesh_dis, + // dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_y + dataBase_.num_cells * s, dataBase_.d_phi, dataBase_.d_phi_weight, + // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), 
dataBase_.d_boundary_weight, dataBase_.d_boundary_face_cell, + // dataBase_.d_boundary_y + dataBase_.num_boundary_surfaces * s, dataBase_.d_boundary_sf, dataBase_.d_volume, + // dataBase_.d_boundary_mag_sf, dataBase_.d_boundary_phi, dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), + // dataBase_.d_boundary_delta_coeffs); + // fvmDiv(phi, Yi) fvm_div_scalar(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_phi, dataBase_.d_phi_weight, @@ -596,13 +615,14 @@ void dfYEqn::process() { fvm_laplacian_scalar(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_weight, dataBase_.d_mag_sf, dataBase_.d_delta_coeffs, - d_DEff + dataBase_.num_cells * s, + dataBase_.d_thermo_rhoD + dataBase_.num_cells * s, d_lower, d_upper, d_diag, // end for internal dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), - dataBase_.d_boundary_mag_sf, d_boundary_DEff + dataBase_.num_boundary_surfaces * s, + dataBase_.d_boundary_mag_sf, dataBase_.d_boundary_thermo_rhoD + dataBase_.num_boundary_surfaces * s, d_gradient_internal_coeffs + dataBase_.num_boundary_surfaces * s, d_gradient_boundary_coeffs + dataBase_.num_boundary_surfaces * s, d_internal_coeffs, d_boundary_coeffs, -1.); + fvc_to_source_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.d_volume, d_RR + dataBase_.num_cells * s, d_source); #ifndef DEBUG_CHECK_LDU // ldu to csr // use d_source as d_b @@ -634,8 +654,6 @@ void dfYEqn::process() { TICK_START_EVENT; // copy y and boundary_y to host - checkCudaErrors(cudaMemcpyAsync(dataBase_.h_y, dataBase_.d_y, dataBase_.cell_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); - checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_y, dataBase_.d_boundary_y, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); TICK_END_EVENT(YEqn post process for all 
species copy back); TICK_START_EVENT; @@ -651,6 +669,8 @@ void dfYEqn::process() { checkCudaErrors(cudaFreeAsync(d_DEff, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_permute, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_RR, dataBase_.stream)); + // thermophysical fields //checkCudaErrors(cudaFreeAsync(d_boundary_rhoD, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_boundary_hai, dataBase_.stream)); @@ -684,7 +704,11 @@ void dfYEqn::solve(int speciesIndex) { TICK_END_EVENT(YEqn solve one specie); } -void dfYEqn::postProcess(double *h_y, double *h_boundary_y) {} +void dfYEqn::postProcess(double *h_y, double *h_boundary_y) { + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_y, dataBase_.d_y, dataBase_.cell_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_y, dataBase_.d_boundary_y, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); + sync(); +} void dfYEqn::sync() { checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); @@ -702,7 +726,7 @@ void dfYEqn::yeqn_compute_thermo_alpha(cudaStream_t stream, num_boundary_surfaces, boundary_rhoD, boundary_thermo_alpha); } -void dfYEqn::yeqn_compute_DEff(cudaStream_t stream, int num_species, int num_cells, int num_boundary_surfaces, +void dfYEqn::yeqn_compute_DEff_via_lewisNumber(cudaStream_t stream, int num_species, int num_cells, int num_boundary_surfaces, double *lewis_number, const double *alpha, const double *mut_sct, double *DEff, const double *boundary_alpha, const double *boundary_mut_sct, double *boundary_DEff) { @@ -715,6 +739,12 @@ void dfYEqn::yeqn_compute_DEff(cudaStream_t stream, int num_species, int num_cel lewis_number, boundary_alpha, boundary_mut_sct, boundary_DEff); } +void dfYEqn::yeqn_compute_RR(dfChemistrySolver& chemistrySolver, cudaStream_t stream, const double *h_T, const double *d_T, + const double *p, const double *y, const double *rho, double 
*RR) +{ + chemistrySolver.Inference(h_T, d_T, p, y, rho, RR); +} + void dfYEqn::yeqn_fvc_laplacian_scalar(cudaStream_t stream, ncclComm_t comm, const int *neighbor_peer, int num_species, int num_cells, int num_surfaces, int num_boundary_surfaces, const int *lowerAddr, const int *upperAddr, @@ -792,18 +822,18 @@ void dfYEqn::yeqn_fvc_laplacian_scalar(cudaStream_t stream, ncclComm_t comm, con } void dfYEqn::yeqn_compute_sumYDiffError_and_hDiffCorrFlux(cudaStream_t stream, int num_species, int num_cells, int num_boundary_surfaces, - const double *lewis_number, const double *thermo_alpha, const double *hai, const double *y, const double *grad_y, + const double *rhoD, const double *hai, const double *y, const double *grad_y, double *sumY_diff_error, double *hDiff_corr_flux, - const double *boundary_hai, const double *boundary_y, const double *boundary_grad_y, const double *boundary_thermo_alpha, + const double *boundary_hai, const double *boundary_y, const double *boundary_grad_y, const double *boundary_rhoD, double *boundary_sumY_diff_error, double *boundary_hDiff_corr_flux) { size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; yeqn_sumError_and_compute_hDiffCorrFlux<<>>(num_species, num_cells, - lewis_number, thermo_alpha, hai, y, grad_y, sumY_diff_error, hDiff_corr_flux); + rhoD, hai, y, grad_y, sumY_diff_error, hDiff_corr_flux); blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; yeqn_sumError_and_compute_hDiffCorrFlux<<>>(num_species, num_boundary_surfaces, - lewis_number, boundary_thermo_alpha, boundary_hai, boundary_y, boundary_grad_y, boundary_sumY_diff_error, boundary_hDiff_corr_flux); + boundary_rhoD, boundary_hai, boundary_y, boundary_grad_y, boundary_sumY_diff_error, boundary_hDiff_corr_flux); } void dfYEqn::yeqn_compute_phiUc(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, From 9375fb143aa917f738c7d8fc114460a104b8bef7 Mon Sep 17 
00:00:00 2001 From: maorz1998 Date: Tue, 7 Nov 2023 23:56:13 +0800 Subject: [PATCH 4/5] fix bugs in pEqn & UEqn --- applications/solvers/dfLowMachFoam/UEqn.H | 8 ++-- applications/solvers/dfLowMachFoam/YEqn.H | 38 ++++++++++----- .../solvers/dfLowMachFoam/createGPUSolver.H | 47 +++++++++++++++++-- .../solvers/dfLowMachFoam/dfLowMachFoam.C | 33 +++++++++---- applications/solvers/dfLowMachFoam/pEqn_CPU.H | 3 +- applications/solvers/dfLowMachFoam/pEqn_GPU.H | 3 +- src_gpu/dfUEqn.H | 5 ++ src_gpu/dfUEqn.cu | 25 +++++++++- src_gpu/dfpEqn.cu | 6 +-- 9 files changed, 130 insertions(+), 38 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H index a6244c870..a0a9689b1 100644 --- a/applications/solvers/dfLowMachFoam/UEqn.H +++ b/applications/solvers/dfLowMachFoam/UEqn.H @@ -11,7 +11,6 @@ fvm::div(phi, U) + turbulence->divDevRhoReff(U) - == -fvc::grad(p) ); fvVectorMatrix& UEqn = tUEqn.ref(); TICK_STOP(CPU assembly time); @@ -64,7 +63,7 @@ #if defined DEBUG_ // UEqn.relax(); TICK_START; - UEqn.solve(); + solve(UEqn == -fvc::grad(p)); K.oldTime(); K = 0.5*magSqr(U); TICK_STOP(CPU solve time); @@ -118,7 +117,7 @@ // h_internal_coeffs.data(), h_boundary_coeffs.data(), // // &gradU[0][0], h_boundary_gradU, // printFlag); - //UEqn_GPU.compareU(&U[0][0], h_boundary_u_tmp, printFlag); + // UEqn_GPU.compareU(&U[0][0], h_boundary_u_tmp, printFlag); } DEBUG_TRACE; #endif @@ -129,7 +128,6 @@ ( fvm::ddt(rho, U) + fvm::div(phi, U) + turbulence->divDevRhoReff(U) - == -fvc::grad(p) ); fvVectorMatrix& UEqn = tUEqn.ref(); end1 = std::clock(); @@ -140,7 +138,7 @@ start1 = std::clock(); if (pimple.momentumPredictor()) { - solve(UEqn); + solve(UEqn == -fvc::grad(p)); K.oldTime(); K = 0.5*magSqr(U); } diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H index d430ebd73..010f9551c 100644 --- a/applications/solvers/dfLowMachFoam/YEqn.H +++ b/applications/solvers/dfLowMachFoam/YEqn.H @@ -22,6 
+22,15 @@ //randomInitField(const_cast(chemistry->hai(5))); //randomInitField(const_cast(chemistry->hai(6))); + // auto& mgcs = dynamic_cast&>(mvConvection.ref()); + // tmp> tinterpScheme_ = mgcs.interpolationScheme()()(Y[0]); + // tmp tweights = tinterpScheme_().weights(Y[0]); + // const surfaceScalarField& weights = tweights(); + // Info << "CPU weights\n" << weights << endl; + + // auto& limitedScheme_ = dynamic_cast&>(tinterpScheme_()); + // Info << "CPU limiter\n" << limitedScheme_.limiter(Y[0]) << endl; + forAll(Y, i) { sumYDiffError += chemistry->rhoD(i)*fvc::grad(Y[i]); @@ -44,6 +53,12 @@ ) ); + // auto& mgcs = dynamic_cast&>(mvConvection.ref()); + // tmp> tinterpScheme_ = mgcs.interpolationScheme()()(Y[0]); + // tmp tweights = tinterpScheme_().weights(Y[0]); + // const surfaceScalarField& weights = tweights(); + // Info << "CPU weights\n" << weights << endl; + start1 = std::clock(); forAll(Y, i) { @@ -62,17 +77,6 @@ MPI_Initialized(&flag_mpi_init); if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); { - if (!splitting) - { - std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); - combustion->correct(); - //label flag_mpi_init; - //MPI_Initialized(&flag_mpi_init); - if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); - std::chrono::steady_clock::time_point stop = std::chrono::steady_clock::now(); - std::chrono::duration processingTime = std::chrono::duration_cast>(stop - start); - time_monitor_chem += processingTime.count(); - } #ifdef GPUSolverNew_ #if defined DEBUG_ @@ -265,6 +269,18 @@ if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); fflush(stderr); #else + if (!splitting) + { + std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); + combustion->correct(); + //label flag_mpi_init; + //MPI_Initialized(&flag_mpi_init); + if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); + std::chrono::steady_clock::time_point stop = std::chrono::steady_clock::now(); + 
std::chrono::duration processingTime = std::chrono::duration_cast>(stop - start); + time_monitor_chem += processingTime.count(); + } + start2 = std::clock(); volScalarField Yt(0.0*Y[0]); int speciesIndex = 0; diff --git a/applications/solvers/dfLowMachFoam/createGPUSolver.H b/applications/solvers/dfLowMachFoam/createGPUSolver.H index 7714fcbd1..79455fb22 100644 --- a/applications/solvers/dfLowMachFoam/createGPUSolver.H +++ b/applications/solvers/dfLowMachFoam/createGPUSolver.H @@ -6,9 +6,10 @@ int nRanks, myRank, localRank, mpi_init_flag = 0; dfMatrixDataBase dfDataBase; dfThermo thermo_GPU(dfDataBase); +dfChemistrySolver chemistrySolver_GPU(dfDataBase); dfRhoEqn rhoEqn_GPU(dfDataBase); dfUEqn UEqn_GPU(dfDataBase); -dfYEqn YEqn_GPU(dfDataBase); +dfYEqn YEqn_GPU(dfDataBase, chemistrySolver_GPU); dfEEqn EEqn_GPU(dfDataBase, thermo_GPU); dfpEqn pEqn_GPU(dfDataBase); @@ -325,7 +326,17 @@ void createGPUBase(const IOdictionary& CanteraTorchProperties, fvMesh& mesh, Ptr dfDataBase.createConstantFieldsInternal(); dfDataBase.createConstantFieldsBoundary(); - dfDataBase.initConstantFieldsInternal(&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.surfaceInterpolation::weights()[0], &mesh.deltaCoeffs()[0], &mesh.V()[0]); + + // construct mesh distance for limitedLinear scheme + vectorField meshDistance = mesh.Sf(); + forAll(meshDistance, facei) { + label own = owner[facei]; + label nei = neighbour[facei]; + meshDistance[facei] = mesh.C()[nei] - mesh.C()[own]; + } + + dfDataBase.initConstantFieldsInternal(&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.surfaceInterpolation::weights()[0], + &mesh.deltaCoeffs()[0], &mesh.V()[0], &meshDistance[0][0]); dfDataBase.initConstantFieldsBoundary(boundary_sf, boundary_mag_sf, boundary_delta_coeffs, boundary_weights, boundary_face_cell, patch_type_calculated, patch_type_extropolated); @@ -586,7 +597,7 @@ void createGPUpEqn(const IOdictionary& CanteraTorchProperties, volScalarField& p void createGPUThermo(const IOdictionary& CanteraTorchProperties, 
volScalarField& T, volScalarField& he, const volScalarField& psi, const volScalarField& alpha, const volScalarField& mu, - const volScalarField& K, const volScalarField& dpdt) { + const volScalarField& K, const volScalarField& dpdt, dfChemistryModel* chemistry) { DEBUG_TRACE; // initialize dfThermo string mechanismFile; @@ -604,6 +615,8 @@ void createGPUThermo(const IOdictionary& CanteraTorchProperties, volScalarField& double *h_boundary_thermo_alpha = new double[dfDataBase.num_boundary_surfaces]; double *h_boundary_mu = new double[dfDataBase.num_boundary_surfaces]; double *h_boundary_k = new double[dfDataBase.num_boundary_surfaces]; + double *h_boundary_thermo_rhoD = new double[dfDataBase.num_boundary_surfaces * dfDataBase.num_species]; + double *h_thermo_rhoD = new double[dfDataBase.num_cells * dfDataBase.num_species]; // initialize thermo boundary std::vector patch_type_T(dfDataBase.num_patches); @@ -651,6 +664,14 @@ void createGPUThermo(const IOdictionary& CanteraTorchProperties, volScalarField& dynamic_cast&>(patchK).patchInternalField()(); memcpy(h_boundary_k + offset + patchsize, &patchKInternal[0], patchsize * sizeof(double)); + for (int i = 0; i < dfDataBase.num_species; i++) { + const fvPatchScalarField& patchRhoD = chemistry->rhoD(i).boundaryField()[patchi]; + memcpy(h_boundary_thermo_rhoD + i * dfDataBase.num_boundary_surfaces + offset, &patchRhoD[0], patchsize * sizeof(double)); + scalarField patchRhoDInternal = + dynamic_cast&>(patchRhoD).patchInternalField()(); + memcpy(h_boundary_thermo_rhoD + i * dfDataBase.num_boundary_surfaces + offset + patchsize, &patchRhoDInternal[0], patchsize * sizeof(double)); + } + offset += patchsize * 2; } else { memcpy(h_boundary_T + offset, &patchT[0], patchsize * sizeof(double)); @@ -660,13 +681,29 @@ void createGPUThermo(const IOdictionary& CanteraTorchProperties, volScalarField& memcpy(h_boundary_mu + offset, &patchMu[0], patchsize * sizeof(double)); memcpy(h_boundary_k + offset, &patchK[0], patchsize * 
sizeof(double)); + for (int i = 0; i < dfDataBase.num_species; i++) { + const fvPatchScalarField& patchRhoD = chemistry->rhoD(i).boundaryField()[patchi]; + memcpy(h_boundary_thermo_rhoD + i * dfDataBase.num_boundary_surfaces + offset, &patchRhoD[0], patchsize * sizeof(double)); + } offset += patchsize; } } + for (int i = 0; i < dfDataBase.num_species; i++) { + memcpy(h_thermo_rhoD + i * dfDataBase.num_cells, &chemistry->rhoD(i)[0], dfDataBase.num_cells * sizeof(double)); + } + double *h_T = dfDataBase.getFieldPointer("T", location::cpu, position::internal); + memcpy(h_T, &T[0], dfDataBase.cell_value_bytes); + thermo_GPU.setConstantFields(patch_type_T); - thermo_GPU.initNonConstantFields(&T[0], &he[0], &psi[0], &alpha[0], &mu[0], &K[0], &dpdt[0], - h_boundary_T, h_boundary_he, h_boundary_thermo_psi, h_boundary_thermo_alpha, h_boundary_mu, h_boundary_k); + thermo_GPU.initNonConstantFields(h_T, &he[0], &psi[0], &alpha[0], &mu[0], &K[0], &dpdt[0], h_thermo_rhoD, + h_boundary_T, h_boundary_he, h_boundary_thermo_psi, h_boundary_thermo_alpha, h_boundary_mu, h_boundary_k, h_boundary_thermo_rhoD); delete h_boundary_T; delete h_boundary_he; + delete h_boundary_thermo_psi; + delete h_boundary_thermo_alpha; + delete h_boundary_mu; + delete h_boundary_k; + delete h_boundary_thermo_rhoD; + delete h_thermo_rhoD; } diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C index 5f3c7aea1..4f4c771df 100644 --- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C +++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C @@ -76,6 +76,7 @@ Description #include "dfMatrixOpBase.H" #include "dfNcclBase.H" #include "dfThermo.H" + #include "dfChemistrySolver.H" #include #include @@ -87,9 +88,13 @@ Description #include "upwind.H" #include "GenFvMatrix.H" #include "CanteraMixture.H" + #include "multivariateGaussConvectionScheme.H" + #include "limitedSurfaceInterpolationScheme.H" #else #include "processorFvPatchField.H" #include 
"cyclicFvPatchField.H" + #include "multivariateGaussConvectionScheme.H" + #include "limitedSurfaceInterpolationScheme.H" int myRank = -1; int mpi_init_flag = 0; #endif @@ -216,7 +221,11 @@ int main(int argc, char *argv[]) const volScalarField& mu = thermo.mu(); const volScalarField& alpha = thermo.alpha(); - createGPUThermo(CanteraTorchProperties, T, thermo.he(), psi, alpha, mu, K, dpdt); + createGPUThermo(CanteraTorchProperties, T, thermo.he(), psi, alpha, mu, K, dpdt, chemistry); + if (chemistry->ifChemstry()) + { + chemistrySolver_GPU.setConstantValue(dfDataBase.num_cells, dfDataBase.num_species, 4096); + } #endif end1 = std::clock(); @@ -302,6 +311,7 @@ int main(int argc, char *argv[]) thermo_GPU.sync(); #if defined DEBUG_ // check correctThermo + int speciesIndex = 6; chemistry->correctThermo(); // reference double *h_boundary_T_tmp = new double[dfDataBase.num_boundary_surfaces]; double *h_boundary_he_tmp = new double[dfDataBase.num_boundary_surfaces]; @@ -309,6 +319,7 @@ int main(int argc, char *argv[]) double *h_boundary_rho_tmp = new double[dfDataBase.num_boundary_surfaces]; double *h_boundary_thermo_alpha_tmp = new double[dfDataBase.num_boundary_surfaces]; double *h_boundary_thermo_psi_tmp = new double[dfDataBase.num_boundary_surfaces]; + double *h_boundary_thermo_rhoD_tmp = new double[dfDataBase.num_boundary_surfaces]; offset = 0; forAll(T.boundaryField(), patchi) { @@ -318,6 +329,7 @@ int main(int argc, char *argv[]) const fvPatchScalarField& patchAlpha = alpha.boundaryField()[patchi]; const fvPatchScalarField& patchRho = thermo.rho()().boundaryField()[patchi]; const fvPatchScalarField& patchT = T.boundaryField()[patchi]; + const fvPatchScalarField& patchRhoD = chemistry->rhoD(speciesIndex).boundaryField()[patchi]; int patchsize = patchT.size(); if (patchT.type() == "processor") { @@ -351,6 +363,11 @@ int main(int argc, char *argv[]) dynamic_cast&>(patchRho).patchInternalField()(); memcpy(h_boundary_rho_tmp + offset + patchsize, &patchRhoInternal[0], 
patchsize * sizeof(double)); + memcpy(h_boundary_thermo_rhoD_tmp + offset, &patchRhoD[0], patchsize * sizeof(double)); + scalarField patchRhoDInternal = + dynamic_cast&>(patchRhoD).patchInternalField()(); + memcpy(h_boundary_thermo_rhoD_tmp + offset + patchsize, &patchRhoDInternal[0], patchsize * sizeof(double)); + offset += patchsize * 2; } else { memcpy(h_boundary_T_tmp + offset, &patchT[0], patchsize * sizeof(double)); @@ -359,6 +376,7 @@ int main(int argc, char *argv[]) memcpy(h_boundary_thermo_alpha_tmp + offset, &patchAlpha[0], patchsize * sizeof(double)); memcpy(h_boundary_mu_tmp + offset, &patchMu[0], patchsize * sizeof(double)); memcpy(h_boundary_rho_tmp + offset, &patchRho[0], patchsize * sizeof(double)); + memcpy(h_boundary_thermo_rhoD_tmp + offset, &patchRhoD[0], patchsize * sizeof(double)); offset += patchsize; } @@ -369,12 +387,13 @@ int main(int argc, char *argv[]) MPI_Comm_rank(MPI_COMM_WORLD, &rank); } if (!mpi_init_flag || rank == 0) { - thermo_GPU.compareT(&T[0], h_boundary_T_tmp, printFlag); + // thermo_GPU.compareT(&T[0], h_boundary_T_tmp, printFlag); // thermo_GPU.compareHe(&thermo.he()[0], h_boundary_he_tmp, printFlag); // thermo_GPU.comparePsi(&psi[0], h_boundary_thermo_psi_tmp, printFlag); // thermo_GPU.compareAlpha(&alpha[0], h_boundary_thermo_alpha_tmp, printFlag); // thermo_GPU.compareMu(&mu[0], h_boundary_mu_tmp, printFlag); // thermo_GPU.compareRho(&thermo.rho()()[0], h_boundary_rho_tmp, printFlag); + // thermo_GPU.compareRhoD(&chemistry->rhoD(speciesIndex)[0], h_boundary_thermo_rhoD_tmp, speciesIndex, printFlag); } delete h_boundary_T_tmp; @@ -396,11 +415,10 @@ int main(int argc, char *argv[]) } // update T for debug #ifdef GPUSolverNew_ - double *h_T_tmp = new double[dfDataBase.num_cells]; + double *h_T = dfDataBase.getFieldPointer("T", location::cpu, position::internal); double *h_boundary_T_tmp = new double[dfDataBase.num_boundary_surfaces]; - thermo_GPU.updateCPUT(h_T_tmp, h_boundary_T_tmp); - - memcpy(&T[0], h_T_tmp, T.size() * 
sizeof(double)); + thermo_GPU.updateCPUT(h_T, h_boundary_T_tmp); + memcpy(&T[0], h_T, T.size() * sizeof(double)); offset = 0; forAll(T.boundaryField(), patchi) { const fvPatchScalarField& const_patchT = T.boundaryField()[patchi]; @@ -414,7 +432,6 @@ int main(int argc, char *argv[]) offset += patchsize; } } - delete h_T_tmp; delete h_boundary_T_tmp; #endif @@ -469,7 +486,7 @@ int main(int argc, char *argv[]) #endif #ifdef GPUSolverNew_ - // write U for + // write U UEqn_GPU.postProcess(); memcpy(&U[0][0], dfDataBase.h_u, dfDataBase.cell_value_vec_bytes); U.correctBoundaryConditions(); diff --git a/applications/solvers/dfLowMachFoam/pEqn_CPU.H b/applications/solvers/dfLowMachFoam/pEqn_CPU.H index b12fe14fa..aa379dde7 100644 --- a/applications/solvers/dfLowMachFoam/pEqn_CPU.H +++ b/applications/solvers/dfLowMachFoam/pEqn_CPU.H @@ -68,8 +68,7 @@ else fvScalarMatrix pDDtEqn ( fvc::ddt(rho) + - // psi*correction(fvm::ddt(p)) // TODO: bugs in correction fvMatrix - psi*fvm::ddt(p) + psi*correction(fvm::ddt(p)) + fvc::div(phiHbyA) ); diff --git a/applications/solvers/dfLowMachFoam/pEqn_GPU.H b/applications/solvers/dfLowMachFoam/pEqn_GPU.H index b982d8475..3dd060955 100644 --- a/applications/solvers/dfLowMachFoam/pEqn_GPU.H +++ b/applications/solvers/dfLowMachFoam/pEqn_GPU.H @@ -98,8 +98,7 @@ UEqn_GPU.sync(); fvScalarMatrix pDDtEqn ( fvc::ddt(rho) + - // psi*correction(fvm::ddt(p)) // TODO: bugs in correction fvMatrix - psi*fvm::ddt(p) + psi*correction(fvm::ddt(p)) + fvc::div(phiHbyA) ); while (pimple.correctNonOrthogonal()) diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index a5f30d987..71f8cede5 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -84,6 +84,11 @@ private: // non-constant fields - csr double *d_A = nullptr; double *d_b = nullptr; // TODO: needless + double *d_ldu_solve = nullptr; + double *d_extern_solve = nullptr; + double *d_source_solve = nullptr; + double *d_internal_coeffs_solve = nullptr; + double *d_boundary_coeffs_solve = nullptr; // field 
pointer map std::unordered_map fieldPointerMap; diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index f0253e62a..4b828e170 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -433,6 +433,11 @@ void dfUEqn::createNonConstantLduAndCsrFields() { #ifndef STREAM_ALLOCATOR checkCudaErrors(cudaMalloc((void**)&d_A, dataBase_.csr_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_b, dataBase_.cell_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_ldu_solve, dataBase_.csr_value_bytes)); + d_extern_solve = d_ldu_solve + dataBase_.num_cells + 2 * dataBase_.num_surfaces; + checkCudaErrors(cudaMalloc((void**)&d_source_solve, dataBase_.cell_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_solve, dataBase_.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_solve, dataBase_.boundary_surface_value_vec_bytes)); #endif checkCudaErrors(cudaMalloc((void**)&d_A_pEqn, dataBase_.cell_value_bytes)); // TODO: delete redundant variables checkCudaErrors(cudaMalloc((void**)&d_H_pEqn, dataBase_.cell_value_vec_bytes)); @@ -511,6 +516,12 @@ void dfUEqn::process() { checkCudaErrors(cudaMallocAsync((void**)&d_A, dataBase_.csr_value_vec_bytes, dataBase_.stream)); checkCudaErrors(cudaMallocAsync((void**)&d_b, dataBase_.cell_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_ldu_solve, dataBase_.csr_value_bytes, dataBase_.stream)); + d_extern_solve = d_ldu_solve + dataBase_.num_cells + 2 * dataBase_.num_surfaces; + checkCudaErrors(cudaMallocAsync((void**)&d_source_solve, dataBase_.cell_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_internal_coeffs_solve, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMallocAsync((void**)&d_boundary_coeffs_solve, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream)); + #endif // checkCudaErrors(cudaMemcpyAsync(d_u_host_order, dataBase_.h_u, 
dataBase_.cell_value_vec_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); @@ -584,9 +595,15 @@ void dfUEqn::process() { dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_source, // end for internal dataBase_.num_patches, dataBase_.patch_size.data(), dataBase_.patch_type_calculated.data(), dataBase_.d_boundary_weight, dataBase_.d_boundary_face_cell, d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume); + + checkCudaErrors(cudaMemcpyAsync(d_ldu_solve, d_ldu, dataBase_.csr_value_bytes, cudaMemcpyDeviceToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(d_source_solve, d_source, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(d_internal_coeffs_solve, d_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_coeffs_solve, d_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToDevice, dataBase_.stream)); + fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, - dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_source, + dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_source_solve, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_weight, dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume, -1.); getrAU(dataBase_.stream, dataBase_.nccl_comm, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, @@ -597,7 +614,7 @@ void dfUEqn::process() { ueqn_ldu_to_csr(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.num_Nz, dataBase_.d_boundary_face_cell, dataBase_.d_ldu_to_csr_index, dataBase_.d_diag_to_csr_index, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_u, 
dataBase_.d_boundary_u, - d_ldu, d_extern, d_source, d_internal_coeffs, d_boundary_coeffs, dataBase_.cyclicNeighbor.data(), + d_ldu_solve, d_extern_solve, d_source_solve, d_internal_coeffs_solve, d_boundary_coeffs_solve, dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), d_A, d_b); #endif #ifdef USE_GRAPH @@ -654,6 +671,10 @@ void dfUEqn::process() { checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream)); checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_ldu_solve, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_source_solve, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_internal_coeffs_solve, dataBase_.stream)); + checkCudaErrors(cudaFreeAsync(d_boundary_coeffs_solve, dataBase_.stream)); #endif TICK_END_EVENT(UEqn post process free); diff --git a/src_gpu/dfpEqn.cu b/src_gpu/dfpEqn.cu index fed764a36..dc7038a6d 100644 --- a/src_gpu/dfpEqn.cu +++ b/src_gpu/dfpEqn.cu @@ -263,9 +263,9 @@ __global__ void correct_diag_mtx_multi_tpsi_kernel(int num_cells, const double * return; // correction: source += (-diag * psi + source) - // double srcVal = source[index]; - // double APsi = - diag[index] * psi[index] + srcVal; - // source[index] += APsi; + double srcVal = source[index]; + double APsi = - diag[index] * psi[index] + srcVal; + source[index] -= APsi; // multi psi double tPsiVal = thermo_psi[index]; From 7a2c9240574fb090ba01ce90c7be81697e4bc0e9 Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Tue, 7 Nov 2023 23:57:41 +0800 Subject: [PATCH 5/5] limitedLinear debuging --- src_gpu/AmgXSolver.H | 26 ------ src_gpu/dfEEqn.cu | 4 +- src_gpu/dfMatrixDataBase.H | 7 +- src_gpu/dfMatrixDataBase.cu | 15 +++- src_gpu/dfMatrixOpBase.H | 11 +++ src_gpu/dfMatrixOpBase.cu | 171 ++++++++++++++++++++++++++++++++++++ src_gpu/dfRhoEqn.cu | 2 +- 7 files changed, 205 insertions(+), 31 deletions(-) diff --git a/src_gpu/AmgXSolver.H b/src_gpu/AmgXSolver.H index 9e6e680aa..8eb194a6f 100644 --- a/src_gpu/AmgXSolver.H +++ 
b/src_gpu/AmgXSolver.H @@ -30,32 +30,6 @@ // mpi #include - -/** \brief A macro to check the returned CUDA error code. - * - * \param call [in] Function call to CUDA API. - */ -# define CHECK(call) \ -do \ -{ \ - const cudaError_t error_code = call; \ - if (error_code != cudaSuccess) \ - { \ - printf("CUDA Error:\n"); \ - printf(" File: %s\n", __FILE__); \ - printf(" Line: %d\n", __LINE__); \ - printf(" Error code: %d\n", error_code); \ - printf(" Error text: %s\n", \ - cudaGetErrorString(error_code)); \ - exit(1); \ - } \ -} while (0) - - - - - - /** \brief A wrapper class for coupling PETSc and AmgX. * * This class is a wrapper of AmgX library for PETSc. PETSc users only need to diff --git a/src_gpu/dfEEqn.cu b/src_gpu/dfEEqn.cu index cb43f8c9a..ad55af27a 100644 --- a/src_gpu/dfEEqn.cu +++ b/src_gpu/dfEEqn.cu @@ -222,8 +222,8 @@ void dfEEqn::process() { TICK_START_EVENT; // copy he to host - checkCudaErrors(cudaMemcpyAsync(dataBase_.h_he, dataBase_.d_he, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); - checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_he, dataBase_.d_boundary_he, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + // checkCudaErrors(cudaMemcpyAsync(dataBase_.h_he, dataBase_.d_he, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + // checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_he, dataBase_.d_boundary_he, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); TICK_END_EVENT(EEqn post process copy back); TICK_START_EVENT; diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 40cf32bae..8cf6485dd 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -161,6 +161,7 @@ struct dfMatrixDataBase // constant fields - internal double *d_sf = nullptr; + double *d_mesh_dis = nullptr; double *d_mag_sf = nullptr; double *d_weight = nullptr; double *d_phi_weight = nullptr; // weight for 
mvConvection->fvmDiv @@ -168,6 +169,7 @@ struct dfMatrixDataBase double *d_volume = nullptr; double *h_sf = nullptr; + double *h_mesh_dis = nullptr; // constant fields - boundary double *d_boundary_sf = nullptr; @@ -207,8 +209,10 @@ struct dfMatrixDataBase double *d_dpdt = nullptr; double *d_T = nullptr; + double *h_T = nullptr; double *d_mu = nullptr; double *d_thermo_alpha = nullptr; + double *d_thermo_rhoD = nullptr; // computed on GPU, used on CPU, need memcpyd2h - host double *h_rho = nullptr; @@ -258,6 +262,7 @@ struct dfMatrixDataBase double *d_boundary_T = nullptr; double *d_boundary_mu = nullptr; double *d_boundary_thermo_alpha = nullptr; + double *d_boundary_thermo_rhoD = nullptr; // boundary fields used between eqns double *d_boundary_rAU = nullptr; double *d_boundary_HbyA = nullptr; @@ -300,7 +305,7 @@ struct dfMatrixDataBase void createConstantFieldsInternal(); void createConstantFieldsBoundary(); void initConstantFieldsInternal(const double *sf, const double *mag_sf, - const double *weight, const double *delta_coeffs, const double *volume); + const double *weight, const double *delta_coeffs, const double *volume, const double *mesh_distance); void initConstantFieldsBoundary(const double *boundary_sf, const double *boundary_mag_sf, const double *boundary_delta_coeffs, const double *boundary_weight, const int *boundary_face_cell, std::vector &patch_type_calculated, std::vector &patch_type_extropolated); diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index fd87cb46a..2dd42745d 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -267,12 +267,14 @@ void dfMatrixDataBase::setConstantIndexes(const int *owner, const int *neighbor, void dfMatrixDataBase::createConstantFieldsInternal() { checkCudaErrors(cudaMalloc((void**)&d_sf, surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_mesh_dis, surface_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_mag_sf, surface_value_bytes)); 
checkCudaErrors(cudaMalloc((void**)&d_weight, surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_phi_weight, surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_delta_coeffs, surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_volume, cell_value_bytes)); fieldPointerMap["d_sf"] = d_sf; + fieldPointerMap["d_mesh_dis"] = d_mesh_dis; fieldPointerMap["d_mag_sf"] = d_mag_sf; fieldPointerMap["d_weight"] = d_weight; fieldPointerMap["d_phi_weight"] = d_phi_weight; @@ -280,7 +282,9 @@ void dfMatrixDataBase::createConstantFieldsInternal() { fieldPointerMap["d_volume"] = d_volume; checkCudaErrors(cudaMallocHost((void**)&h_sf, surface_value_vec_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_mesh_dis, surface_value_vec_bytes)); fieldPointerMap["h_sf"] = h_sf; + fieldPointerMap["h_mesh_dis"] = h_mesh_dis; } void dfMatrixDataBase::createConstantFieldsBoundary() { @@ -299,14 +303,18 @@ void dfMatrixDataBase::createConstantFieldsBoundary() { } void dfMatrixDataBase::initConstantFieldsInternal(const double *sf, const double *mag_sf, - const double *weight, const double *delta_coeffs, const double *volume) { + const double *weight, const double *delta_coeffs, const double *volume, const double *mesh_distance) { // permute sf for (int i = 0; i < num_surfaces; i++) { h_sf[num_surfaces * 0 + i] = sf[i * 3 + 0]; h_sf[num_surfaces * 1 + i] = sf[i * 3 + 1]; h_sf[num_surfaces * 2 + i] = sf[i * 3 + 2]; + h_mesh_dis[num_surfaces * 0 + i] = mesh_distance[i * 3 + 0]; + h_mesh_dis[num_surfaces * 1 + i] = mesh_distance[i * 3 + 1]; + h_mesh_dis[num_surfaces * 2 + i] = mesh_distance[i * 3 + 2]; } checkCudaErrors(cudaMemcpyAsync(d_sf, h_sf, surface_value_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_mesh_dis, h_mesh_dis, surface_value_vec_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_mag_sf, mag_sf, surface_value_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_weight, weight, 
surface_value_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_delta_coeffs, delta_coeffs, surface_value_bytes, cudaMemcpyHostToDevice, stream)); @@ -366,6 +374,8 @@ void dfMatrixDataBase::createNonConstantFieldsInternal() { // thermophysical fields checkCudaErrors(cudaMalloc((void**)&d_thermo_psi, cell_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_thermo_alpha, cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_thermo_rhoD, num_species * cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_hDiff_corr_flux, cell_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_diff_alphaD, cell_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_dpdt, cell_value_bytes)); @@ -381,6 +391,7 @@ void dfMatrixDataBase::createNonConstantFieldsInternal() { checkCudaErrors(cudaMalloc((void**)&d_HbyA, cell_value_vec_bytes)); // computed on GPU, used on CPU, need memcpyd2h + checkCudaErrors(cudaMallocHost((void**)&h_T, cell_value_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_rho, cell_value_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_rho_old, cell_value_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_u, cell_value_vec_bytes)); @@ -389,6 +400,7 @@ void dfMatrixDataBase::createNonConstantFieldsInternal() { checkCudaErrors(cudaMallocHost((void**)&h_he, cell_value_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_k, cell_value_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_k_old, cell_value_bytes)); + fieldPointerMap["h_T"] = h_T; fieldPointerMap["h_rho"] = h_rho; fieldPointerMap["h_rho_old"] = h_rho_old; fieldPointerMap["h_u"] = h_u; @@ -440,6 +452,7 @@ void dfMatrixDataBase::createNonConstantFieldsBoundary() { // thermophysical fields checkCudaErrors(cudaMalloc((void**)&d_boundary_thermo_psi, boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_thermo_alpha, boundary_surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_thermo_rhoD, num_species * 
boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_hDiff_corr_flux, boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_diff_alphaD, boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_T, boundary_surface_value_bytes)); diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index 5a61980f9..c27090303 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -156,6 +156,17 @@ void correct_boundary_conditions_vector(cudaStream_t stream, ncclComm_t comm, void compute_upwind_weight(cudaStream_t stream, int num_surfaces, const double *phi, double *weight); +void compute_limitedLinear_weight(cudaStream_t stream, ncclComm_t comm, const int *neighbor_peer, + int num_surfaces, int num_cells, int num_boundary_surfaces, + const int *lowerAddr, const int *upperAddr, const double *mesh_distance, + const double *weight, const double *Sf, const double *vf, const double *phi, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, const double *boundary_weight, + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, + const double *volume, const double *boundary_mag_Sf, const double *boundary_phi, + // const double *boundary_distance, double *boundary_output, + const int *cyclicNeighbor, const int *patchSizeOffset, + const double *boundary_deltaCoeffs); + // fvm ops void fvm_ddt_vol_scalar_vol_scalar(cudaStream_t stream, int num_cells, double rDeltaT, diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index 6e520f7c6..afe3b9cf4 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -5,6 +5,9 @@ #include #include "cuda_profiler_api.h" +using std::min; +using std::max; + __global__ void warmup() { int index = blockDim.x * blockIdx.x + threadIdx.x; @@ -167,6 +170,112 @@ __global__ void compute_upwind_weight_internal(int num_faces, const double *phi, weight[index] = 
0.; } +__device__ int sign(double x) +{ + return (x >= 0) ? 1: -1; +} + +__device__ int pos0(double x) +{ + return (x >= 0) ? 1 : 0; +} + +__global__ void compute_limiter_phi_internal(int num_cells, int num_surfaces, const double *vf, + const int *lower_index, const int *upper_index, const double *mesh_distance, + const double *phi, const double *mesh_weights, const double *gradc, + double *limiter) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_surfaces) + return; + + int owner = lower_index[index]; + int neighbor = upper_index[index]; + double faceFlux = phi[index]; + double gradf = vf[neighbor] - vf[owner]; + double gradcf, r; + + // LimiterFunc::r + if (faceFlux > 0) { + gradcf = mesh_distance[index] * gradc[owner] + + mesh_distance[num_surfaces + index] * gradc[num_cells + owner] + + mesh_distance[num_surfaces * 2 + index] * gradc[num_cells * 2 + owner]; + } else { + gradcf = mesh_distance[index] * gradc[neighbor] + + mesh_distance[num_surfaces + index] * gradc[num_cells + neighbor] + + mesh_distance[num_surfaces * 2 + index] * gradc[num_cells * 2 + neighbor]; + } + if (fabs(gradcf) >= 1000 * fabs(gradf)) { + r = 2*1000*sign(gradcf)*sign(gradf) - 1; + } else { + r = 2 * (gradcf / gradf) - 1; + } + + limiter[index] = max(min(r, 1.), 0.); // now twoByk_ = 1, fvScheme: limitedLinear 1; +} + +__global__ void compute_limiter_phi_boundary(int num, int offset, int num_boundary_surfaces, + const double *boundary_weight, const double *boundary_vf, const double *boundary_gradc, + const double *boundary_distance, const double *boundary_phi, double *boundary_limiter) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num) + return; + + int neighbor_start_index = offset + index; + int internal_start_index = offset + index + num; + + double bouFaceFlux = boundary_phi[neighbor_start_index]; + double bouGradf = boundary_vf[internal_start_index] - boundary_vf[neighbor_start_index]; + double bouGradcf, r; + + + // LimiterFunc::r + 
if (bouFaceFlux > 0) { + bouGradcf = boundary_distance[neighbor_start_index] * boundary_gradc[internal_start_index] + + boundary_distance[num_boundary_surfaces + neighbor_start_index] * boundary_gradc[num_boundary_surfaces + internal_start_index] + + boundary_distance[num_boundary_surfaces * 2 + neighbor_start_index] * boundary_gradc[num_boundary_surfaces * 2 + internal_start_index]; + } else { + bouGradcf = boundary_distance[neighbor_start_index] * boundary_gradc[neighbor_start_index] + + boundary_distance[num_boundary_surfaces + neighbor_start_index] * boundary_gradc[num_boundary_surfaces + neighbor_start_index] + + boundary_distance[num_boundary_surfaces * 2 + neighbor_start_index] * boundary_gradc[num_boundary_surfaces * 2 + neighbor_start_index]; + } + if (fabs(bouGradcf) >= 1000 * fabs(bouGradf)) { + r = 2*1000*sign(bouGradcf)*sign(bouGradf) - 1; + } else { + r = 2 * (bouGradcf / bouGradf) - 1; + } + + boundary_limiter[neighbor_start_index] = max(min(r, 1.), 0.); // now twoByk_ = 1, fvScheme: limitedLinear 1; +} + +__global__ void compute_limiter_weight_internal(int num_cells, int num_surfaces, + const double *phi, const double *mesh_weights, const double *limiter_weights, double *output_weights) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_surfaces) + return; + + double limiterW = limiter_weights[index]; + output_weights[index] = limiterW * mesh_weights[index] + + (1. 
- limiterW) * pos0(phi[index]); +} + +__global__ void compute_limiter_weight_boundary(int num, int offset, int num_boundary_surfaces, + const double *boundary_weight, const double *boundary_phi, + const double *boundary_limiter_weights, double *boundary_output_weights) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num) + return; + + int neighbor_start_index = offset + index; + + double limiterW = boundary_limiter_weights[neighbor_start_index]; + boundary_output_weights[neighbor_start_index] = limiterW * boundary_weight[neighbor_start_index] + + (1. - limiterW) * pos0(boundary_phi[neighbor_start_index]); +} + __global__ void update_boundary_coeffs_zeroGradient_scalar(int num, int offset, double *value_internal_coeffs, double *value_boundary_coeffs, double *gradient_internal_coeffs, double *gradient_boundary_coeffs) @@ -2428,6 +2537,68 @@ void compute_upwind_weight(cudaStream_t stream, int num_surfaces, const double * compute_upwind_weight_internal<<>>(num_surfaces, phi, weight); } +void compute_limitedLinear_weight(cudaStream_t stream, ncclComm_t comm, const int *neighbor_peer, + int num_surfaces, int num_cells, int num_boundary_surfaces, + const int *lowerAddr, const int *upperAddr, const double *mesh_distance, + const double *weight, const double *Sf, const double *vf, const double *phi, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, const double *boundary_weight, + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, + const double *volume, const double *boundary_mag_Sf, const double *boundary_phi, + // const double *boundary_distance, double *boundary_output, + const int *cyclicNeighbor, const int *patchSizeOffset, + const double *boundary_deltaCoeffs) +{ + // calculate fvc::grad(vf) (now output stores fvc::grad(lPhi)) + // fvc_grad_cell_scalar_withBC(stream, comm, neighbor_peer, num_cells, num_surfaces, num_boundary_surfaces, + // lowerAddr, upperAddr, 
weight, Sf, vf, output, num_patches, patch_size, patch_type, boundary_weight, + // boundary_cell_face, boundary_vf, boundary_Sf, volume, boundary_mag_Sf, boundary_output, + // cyclicNeighbor, patchSizeOffset, boundary_deltaCoeffs); + fvc_grad_cell_scalar(stream, num_cells, num_surfaces, num_boundary_surfaces, lowerAddr, upperAddr, + weight, Sf, vf, output, num_patches, patch_size, patch_type, boundary_weight, + boundary_cell_face, boundary_vf, boundary_Sf, volume, true); + // calculate limiter (now output stores this->limiter(phi)) + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + compute_limiter_phi_internal<<>>(num_cells, num_surfaces, vf, + lowerAddr, upperAddr, mesh_distance, phi, weight, output, output); + + // int offset = 0; + // for (int i = 0; i < num_patches; i++) { + // if (patch_size[i] == 0) continue; + // if (patch_type[i] == boundaryConditions::processor + // || patch_type[i] == boundaryConditions::processorCyclic) { + // threads_per_block = 256; + // blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // compute_limiter_phi_boundary<<>>(patch_size[i], offset, + // num_boundary_surfaces, boundary_weight, boundary_vf, boundary_output, boundary_distance, + // boundary_phi, boundary_output); + // offset += 2 * patch_size[i]; + // } else { + // cudaMemset(boundary_output + offset, 1., patch_size[i] * sizeof(double)); + // offset += patch_size[i]; + // } + // } + // calculate weight + // threads_per_block = 1024; + // blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + // compute_limiter_weight_internal<<>>(num_cells, num_surfaces, phi, + // weight, output, output); + // offset = 0; + // for (int i = 0; i < num_patches; i++) { + // if (patch_size[i] == 0) continue; + // threads_per_block = 256; + // blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // 
compute_limiter_weight_boundary<<>>(patch_size[i], offset, + // num_boundary_surfaces, boundary_weight, boundary_phi, boundary_output, boundary_output); + // if (patch_type[i] == boundaryConditions::processor + // || patch_type[i] == boundaryConditions::processorCyclic) { + // offset += 2 * patch_size[i]; + // } else { + // offset += patch_size[i]; + // } + // } +} + void fvm_ddt_vol_scalar_vol_scalar(cudaStream_t stream, int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *volume, double *diag, double *source, double sign) diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu/dfRhoEqn.cu index 57d32c9a4..b9eeef762 100644 --- a/src_gpu/dfRhoEqn.cu +++ b/src_gpu/dfRhoEqn.cu @@ -86,7 +86,7 @@ void dfRhoEqn::process() #endif TICK_END_EVENT(rhoEqn post process free); TICK_START_EVENT; - checkCudaErrors(cudaMemcpyAsync(dataBase_.h_rho, dataBase_.d_rho, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + // checkCudaErrors(cudaMemcpyAsync(dataBase_.h_rho, dataBase_.d_rho, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); TICK_END_EVENT(rhoEqn post process copy back); sync(); }