diff --git a/src_gpu/CMakeLists.txt b/src_gpu/CMakeLists.txt
index 167055179..57058eeef 100644
--- a/src_gpu/CMakeLists.txt
+++ b/src_gpu/CMakeLists.txt
@@ -6,10 +6,12 @@ cmake_minimum_required(VERSION 3.5)
 project(dfMatrix LANGUAGES CXX CUDA)
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_PREFIX_PATH /root/libtorch)
 find_package(CUDA REQUIRED)
 find_package(MPI REQUIRED)
 find_package(CUDAToolkit REQUIRED)
+find_package(Torch REQUIRED)
 find_library(LIBAMGXSH amgxsh PATHS $ENV{AMGX_DIR}/build)
 add_compile_options(-arch=sm_70 -fmad=false)
@@ -31,12 +33,14 @@ add_library(${PROJECT_NAME}
     dfEEqn.cu
     dfRhoEqn.cu
     dfpEqn.cu
-    dfThermo.cu)
+    dfThermo.cu
+    dfChemistrySolver.cu)
 target_link_libraries(${PROJECT_NAME}
     ${MPI_LIBRARIES}
     ${CUDA_LIBRARIES}
     ${LIBAMGXSH}
+    ${TORCH_LIBRARIES}
     )
 target_compile_options(dfMatrix PUBLIC -g)
 option(DFMATRIX_ENABLE_DETAILED_DEBUG "Enable detailed debug build" OFF)
diff --git a/src_gpu/dfChemistrySolver.H b/src_gpu/dfChemistrySolver.H
new file mode 100644
index 000000000..cbc144a60
--- /dev/null
+++ b/src_gpu/dfChemistrySolver.H
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <torch/script.h>
+#include <cuda_runtime.h>
+#include <string>
+#include <vector>
+
+class dfChemistrySolver
+{
+private:
+    std::vector<torch::jit::script::Module> modules_;
+    torch::Device device_;
+    double *Xmu_, *Xstd_, *Ymu_, *Ystd_;
+
+    double *init_input_, *y_input_BCT;
+    int dim_input_, num_cells_, num_species_;
+public:
+    dfChemistrySolver(int num_cells, int num_species);
+    ~dfChemistrySolver();
+
+    void loadModels(const std::string dir);
+    void loadNormalization(const std::string dir);
+    void Inference(const double *T, const double *p, const double *y,
+                   const double *rho, double *RR);
+};
\ No newline at end of file
diff --git a/src_gpu/dfChemistrySolver.cu b/src_gpu/dfChemistrySolver.cu
new file mode 100644
index 000000000..fd50e811a
--- /dev/null
+++ b/src_gpu/dfChemistrySolver.cu
@@ -0,0 +1,128 @@
+#include "dfChemistrySolver.H"
+
+// Pack [T, p, BCT(y_0..y_{dim-3})] per cell into the row-major network input,
+// and stash the BCT-transformed mass fractions for the rev-BCT step later.
+__global__ void construct_init_input(int num_cells, int dim, const double *T, const double *p,
+                                     const double *y, double *y_input_BCT, double *output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    output[index * dim] = T[index];
+    output[index * dim + 1] = p[index];
+    double y_BCT;
+    for (int i = 0; i < dim - 2; ++i) {
+        y_BCT = (pow(y[i * num_cells + index], 0.1) - 1) * 10; // BCT: lambda = 0.1
+        output[index * dim + 2 + i] = y_BCT;
+        y_input_BCT[i * num_cells + index] = y_BCT;
+    }
+}
+
+// Standardize each input column with the per-feature mean/std (in-place safe).
+__global__ void normalize_input(int num_cells, int dim, const double *input,
+                                const double *Xmu, const double *Xstd, double *output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    for (int i = 0; i < dim; ++i) {
+        output[index * dim + i] = (input[index * dim + i] - Xmu[i]) / Xstd[i];
+    }
+}
+
+// De-normalize the network output, add the BCT input, and invert the BCT.
+__global__ void calculate_y_new(int num_cells, int num_species, const double *output_init,
+                                const double *y_input_BCT, const double *Ymu, const double *Ystd, double *output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    double RR_tmp;
+    for (int i = 0; i < num_species; ++i) {
+        RR_tmp = output_init[i * num_cells + index] * Ystd[i] + Ymu[i] + y_input_BCT[i * num_cells + index];
+        RR_tmp = pow((RR_tmp * 0.1 + 1), 10); // rev-BCT: lambda = 0.1
+        output[i * num_cells + index] = RR_tmp;
+    }
+}
+
+// Renormalize predicted mass fractions to sum to one, then convert the change
+// over delta_t into a reaction rate RR = rho * (y_new - y_old) / delta_t.
+__global__ void calculate_RR(int num_cells, int num_species, double delta_t,
+                             const double *rho, const double *y_old, double *y_new, double *RR)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    // normalize
+    double y_ave = 0.;
+    for (int i = 0; i < num_species; ++i) {
+        y_ave += y_new[i * num_cells + index];
+    }
+    for (int i = 0; i < num_species; ++i) {
+        y_new[i * num_cells + index] = y_new[i * num_cells + index] / y_ave;
+        RR[i * num_cells + index] = (y_new[i * num_cells + index] - y_old[i * num_cells + index]) * rho[index] / delta_t;
+    }
+}
+
+dfChemistrySolver::dfChemistrySolver(int
num_cells, int num_species)
+    : device_(torch::kCUDA), num_cells_(num_cells), num_species_(num_species)
+{
+    dim_input_ = num_species + 2; // p, T, y
+    cudaMalloc(&init_input_, sizeof(double) * num_cells * dim_input_);
+    cudaMalloc(&y_input_BCT, sizeof(double) * num_cells * num_species_);
+    cudaMalloc(&Xmu_, sizeof(double) * dim_input_);
+    cudaMalloc(&Xstd_, sizeof(double) * dim_input_);
+    cudaMalloc(&Ymu_, sizeof(double) * num_species_);
+    cudaMalloc(&Ystd_, sizeof(double) * num_species_);
+    modules_.reserve(num_species_);
+
+    // now norm paras are set in constructor manually
+    at::TensorOptions opts = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA);
+    std::vector<double> Xmu_vec = {2.1996457626e+03, 1.0281762632e+05, -5.2533840071e+00, // NOTE(review): 9 entries assumes dim_input_ == 9 (7 species) — confirm against caller
+                                   -4.5623405933e+00, -1.5305375824e+00, -3.7453838144e+00,
+                                   -3.3097212360e+00, -4.3339657954e+00, -2.9028421546e-01};
+    std::vector<double> Xstd_vec = {3.8304388695e+02, 7.1905832810e+02, 4.6788389177e-01,
+                                    5.3034657693e-01, 5.0325864222e-01, 5.1058993718e-01,
+                                    7.9704668543e-01, 5.1327160125e-01, 4.6827591193e-03};
+    std::vector<double> Ymu_vec = {0.0085609265, -0.0082998877, 0.0030108739,
+                                   -0.0067360325, 0.0037464590, 0.0024258509};
+    std::vector<double> Ystd_vec = {0.0085609265, -0.0082998877, 0.0030108739, // FIXME: identical to Ymu_vec and contains negative values — a std-dev vector cannot be negative; fill in the real Ystd
+                                    -0.0067360325, 0.0037464590, 0.0024258509};
+
+    cudaMemcpy(Xmu_, Xmu_vec.data(), sizeof(double) * dim_input_, cudaMemcpyHostToDevice);
+    cudaMemcpy(Xstd_, Xstd_vec.data(), sizeof(double) * dim_input_, cudaMemcpyHostToDevice);
+    cudaMemcpy(Ymu_, Ymu_vec.data(), sizeof(double) * num_species_, cudaMemcpyHostToDevice);
+    cudaMemcpy(Ystd_, Ystd_vec.data(), sizeof(double) * num_species_, cudaMemcpyHostToDevice);
+}
+
+dfChemistrySolver::~dfChemistrySolver() {
+    cudaFree(init_input_); cudaFree(y_input_BCT); cudaFree(Xmu_); cudaFree(Xstd_); cudaFree(Ymu_); cudaFree(Ystd_); // free ALL buffers allocated in the constructor (was freeing only init_input_, leaking the rest)
+}
+
+void dfChemistrySolver::Inference(const double *T, const double *p, const double *y,
+                                  const double *rho, double *RR) {
+    // construct input
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells_ + threads_per_block - 1) / threads_per_block;
+    construct_init_input<<<blocks_per_grid, threads_per_block>>>(num_cells_, dim_input_, T, p, y, y_input_BCT, init_input_);
+    normalize_input<<<blocks_per_grid, threads_per_block>>>(num_cells_, dim_input_, init_input_, Xmu_, Xstd_, init_input_);
+
+    // inference by torch; dtype must be kDouble — from_blob defaults to float, which
+    // would misinterpret the double buffer (was passing only the device)
+    at::Tensor torch_input = torch::from_blob(init_input_, {num_cells_, dim_input_}, at::TensorOptions().dtype(at::kDouble).device(device_));
+    torch_input = torch_input.to(at::kFloat);
+    std::vector<torch::jit::IValue> INPUTS;
+    INPUTS.push_back(torch_input);
+
+    std::vector<at::Tensor> output(num_species_);
+    for (int i = 0; i < num_species_; ++i) {
+        output[i] = modules_[i].forward(INPUTS).toTensor();
+        output[i] = output[i].to(at::kDouble);
+    }
+
+    // post process
+    double *d_output;
+    for (int i = 0; i < num_species_; ++i) {
+        d_output = output[i].data_ptr<double>();
+        cudaMemcpy(RR + i * num_cells_, d_output, sizeof(double) * num_cells_, cudaMemcpyDeviceToDevice);
+    }
+    calculate_y_new<<<blocks_per_grid, threads_per_block>>>(num_cells_, num_species_, RR, y_input_BCT, Ymu_, Ystd_, RR);
+    calculate_RR<<<blocks_per_grid, threads_per_block>>>(num_cells_, num_species_, 1e-6, rho, y, RR, RR); // FIXME: delta_t hard-coded to 1e-6 — should come from the solver time step
+}
\ No newline at end of file
diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H
index 4ebecfa6c..40cf32bae 100644
--- a/src_gpu/dfMatrixDataBase.H
+++ b/src_gpu/dfMatrixDataBase.H
@@ -226,6 +226,9 @@ struct dfMatrixDataBase
     // internal fields used between eqns
     double *d_rAU = nullptr;
     double *d_HbyA = nullptr;
+    // turbulence fields
+    double *d_turbulence_k = nullptr;
+    double *d_turbulence_epsilon = nullptr;
 
     // non-constant fields - boundary
     // TODO: further estimate
diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu
index 64dfd99fb..fd87cb46a 100644
--- a/src_gpu/dfMatrixDataBase.cu
+++ b/src_gpu/dfMatrixDataBase.cu
@@ -372,6 +372,10 @@ void dfMatrixDataBase::createNonConstantFieldsInternal() {
     checkCudaErrors(cudaMalloc((void**)&d_T, cell_value_bytes));
     checkCudaErrors(cudaMalloc((void**)&d_mu, cell_value_bytes));
 
+    // turbulence fields
+    checkCudaErrors(cudaMalloc((void**)&d_turbulence_k, cell_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_turbulence_epsilon, cell_value_bytes));
+
     //
internal fields used between eqns
     checkCudaErrors(cudaMalloc((void**)&d_rAU, cell_value_bytes));
     checkCudaErrors(cudaMalloc((void**)&d_HbyA, cell_value_vec_bytes));
diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H
index ef3c8133f..a5f30d987 100644
--- a/src_gpu/dfUEqn.H
+++ b/src_gpu/dfUEqn.H
@@ -45,6 +45,8 @@ private:
     double *h_H_pEqn = nullptr;
     // intermediate fields
     double *d_grad_u = nullptr;
+    double *d_delta = nullptr;
+    double *d_rho_nueff = nullptr;
     double *d_u_host_order = nullptr;
     double *d_fvc_output = nullptr; // TODO: no need anymore
@@ -133,6 +135,8 @@ public:
             const int *cyclicNeighbor, const int *patchSizeOffset,
             double *HbyA, double *boundary_HbyA);
     void getHbyA();
+    void getTurbulenceKEpsilon_Smagorinsky(cudaStream_t stream, int num_cells, int num_boundary_surfaces,
+            const double *grad_U_tsr, const double *volume, double *delta, double *turbulence_k, double *turbulence_epsilon);
     void correctPsi(double *Psi, double *boundary_psi);
     void ueqn_ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surface, int num_Nz,
             const int* boundary_cell_face, const int *ldu_to_csr_index, const int *diag_to_csr_index,
diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu
index 131c566d8..325589f1f 100644
--- a/src_gpu/dfUEqn.cu
+++ b/src_gpu/dfUEqn.cu
@@ -302,6 +302,53 @@ __global__ void ueqn_divide_cell_volume_vec(int num_cells, const double* volume,
     output[num_cells * 2 + index] = output[num_cells * 2 + index] / vol;
 }
 
+// Per-cell Smagorinsky SGS kinetic energy k from the velocity gradient tensor;
+// also writes the filter width delta = V^(1/3) for the epsilon kernel.
+__global__ void ueqn_calculate_turbulence_k_Smagorinsky(int num_cells,
+    const double *grad_U_tsr, const double *volume, double Ce, double Ck,
+    double *delta, double *output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    double vol = volume[index];
+    double del = pow(vol, 1.0 / 3.0); // BUG FIX: was pow(vol, 1/3) — integer division made the exponent 0, so delta was always 1.0
+
+    // D = 0.5*(T+T^T)
+    double D_xx = grad_U_tsr[num_cells * 0 + index];
+    double D_xy = 0.5 * (grad_U_tsr[num_cells * 1 + index] + grad_U_tsr[num_cells * 3 + index]);
+    double D_xz = 0.5 * (grad_U_tsr[num_cells * 2 + index] + grad_U_tsr[num_cells * 6 + index]);
+    double D_yy = grad_U_tsr[num_cells * 4 + index];
+    double D_yz = 0.5 * (grad_U_tsr[num_cells * 5 + index] + grad_U_tsr[num_cells * 7 + index]);
+    double D_zz = grad_U_tsr[num_cells * 8 + index];
+
+    // dev(D)
+    double trace = D_xx + D_yy + D_zz;
+    double dev_D_xx = D_xx - (1. / 3.) * trace;
+    double dev_D_yy = D_yy - (1. / 3.) * trace;
+    double dev_D_zz = D_zz - (1. / 3.) * trace;
+
+    // scalar a
+    double a = Ce * del;
+    // scalar b
+    double b = 1.5 * trace;
+    // scalar c
+    double c = 2 * Ck * del * (dev_D_xx * D_xx + dev_D_yy * D_yy + dev_D_zz * D_zz
+        + D_xy * D_xy * 2 + D_xz * D_xz * 2 + D_yz * D_yz * 2);
+
+    // k = ((-b + sqrt(b^2 + 4ac)) / (2a))^2 — positive root of the quadratic
+    output[index] = pow((-b + pow(b * b + 4 * a * c, 0.5)) / (2 * a), 2);
+    delta[index] = del;
+}
+
+// epsilon = Ce * k^(3/2) / delta (Smagorinsky dissipation estimate).
+__global__ void ueqn_calculate_turbulence_epsilon_Smagorinsky(int num_cells,
+    const double *k, const double *delta, double Ce, double *output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+
+    output[index] = Ce * pow(k[index], 1.5) / delta[index];
+}
+
 void dfUEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path) {
     this->stream = dataBase_.stream;
     this->mode_string = mode_string;
@@ -334,6 +381,8 @@ void dfUEqn::createNonConstantFieldsInternal() {
     checkCudaErrors(cudaMalloc((void**)&d_nu_eff, dataBase_.cell_value_bytes));
     // intermediate fields
     checkCudaErrors(cudaMalloc((void**)&d_grad_u, dataBase_.cell_value_tsr_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_delta, dataBase_.cell_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_rho_nueff, dataBase_.cell_value_bytes));
     checkCudaErrors(cudaMalloc((void**)&d_fvc_output, dataBase_.cell_value_vec_bytes));
 #endif
@@ -441,6 +490,8 @@ void dfUEqn::process() {
     checkCudaErrors(cudaMallocAsync((void**)&d_nu_eff, dataBase_.cell_value_bytes, dataBase_.stream));
     // intermediate fields
     checkCudaErrors(cudaMallocAsync((void**)&d_grad_u,
dataBase_.cell_value_tsr_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_delta, dataBase_.cell_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_rho_nueff, dataBase_.cell_value_bytes, dataBase_.stream));
     checkCudaErrors(cudaMallocAsync((void**)&d_fvc_output, dataBase_.cell_value_vec_bytes, dataBase_.stream));
 
@@ -478,6 +529,8 @@ void dfUEqn::process() {
     checkCudaErrors(cudaMemsetAsync(d_grad_u, 0, dataBase_.cell_value_tsr_bytes, dataBase_.stream));
     checkCudaErrors(cudaMemsetAsync(d_boundary_grad_u, 0, dataBase_.boundary_surface_value_tsr_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMemsetAsync(d_delta, 0, dataBase_.cell_value_bytes, dataBase_.stream));
+
     // permute_vector_h2d(dataBase_.stream, dataBase_.num_cells, d_u_host_order, dataBase_.d_u);
     // permute_vector_h2d(dataBase_.stream, dataBase_.num_boundary_surfaces, d_boundary_u_host_order, dataBase_.d_boundary_u);
     update_boundary_coeffs_vector(dataBase_.stream, dataBase_.num_boundary_surfaces, dataBase_.num_patches,
@@ -513,13 +566,16 @@ void dfUEqn::process() {
             dataBase_.d_boundary_face_cell, dataBase_.d_boundary_u, dataBase_.d_boundary_sf,
             dataBase_.d_boundary_weight, dataBase_.d_volume, dataBase_.d_boundary_mag_sf, d_boundary_grad_u,
             dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_delta_coeffs);
+    // calculate k & epsilon (if use turbulence model)
+    getTurbulenceKEpsilon_Smagorinsky(dataBase_.stream, dataBase_.num_cells, dataBase_.num_boundary_surfaces, d_grad_u, dataBase_.d_volume,
+            d_delta, dataBase_.d_turbulence_k, dataBase_.d_turbulence_epsilon);
     scale_dev2T_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.d_mu, d_grad_u, // end for internal
-            dataBase_.num_boundary_surfaces, dataBase_.d_boundary_mu, d_boundary_grad_u);
+            dataBase_.num_boundary_surfaces, dataBase_.d_boundary_mu, d_boundary_grad_u);
     fvc_div_cell_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces,
-            dataBase_.d_owner, dataBase_.d_neighbor,
-            dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_source, // end for internal
-            dataBase_.num_patches, dataBase_.patch_size.data(), dataBase_.patch_type_calculated.data(), dataBase_.d_boundary_weight,
-            dataBase_.d_boundary_face_cell, d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume);
+            dataBase_.d_owner, dataBase_.d_neighbor,
+            dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_source, // end for internal
+            dataBase_.num_patches, dataBase_.patch_size.data(), dataBase_.patch_type_calculated.data(), dataBase_.d_boundary_weight,
+            dataBase_.d_boundary_face_cell, d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume);
     fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces,
             dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_source,
@@ -573,6 +629,7 @@ void dfUEqn::process() {
     checkCudaErrors(cudaFreeAsync(d_nu_eff, dataBase_.stream));
     // intermediate fields
     checkCudaErrors(cudaFreeAsync(d_grad_u, dataBase_.stream));
+    checkCudaErrors(cudaFreeAsync(d_delta, dataBase_.stream));
     checkCudaErrors(cudaFreeAsync(d_rho_nueff, dataBase_.stream));
     checkCudaErrors(cudaFreeAsync(d_fvc_output, dataBase_.stream));
 
@@ -651,6 +708,19 @@ void dfUEqn::getrAU(cudaStream_t stream, ncclComm_t comm, int num_cells, int num
             boundary_cell_face, rAU, boundary_rAU, dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_weight);
 }
 
+// Host launcher: fill turbulence_k, turbulence_epsilon and delta for all cells.
+// Launches on the caller-supplied stream so ordering with the rest of UEqn holds.
+void dfUEqn::getTurbulenceKEpsilon_Smagorinsky(cudaStream_t stream, int num_cells, int num_boundary_surfaces,
+        const double *grad_U_tsr, const double *volume,
+        double *delta, double *turbulence_k, double *turbulence_epsilon)
+{
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    ueqn_calculate_turbulence_k_Smagorinsky<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, grad_U_tsr, volume,
+            1.048, 0.094, delta, turbulence_k); // Ce = 1.048, Ck = 0.094
+    ueqn_calculate_turbulence_epsilon_Smagorinsky<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, turbulence_k,
+            delta, 1.048, turbulence_epsilon);
+}
+
 void dfUEqn::UEqnGetHbyA(cudaStream_t stream, ncclComm_t comm, const int *neighbor_peer,
         int num_cells, int num_surfaces, int num_boundary_surfaces,
         const int *lowerAddr, const int *upperAddr,
         const double *volume, const double *u,
diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu
index c81977b9c..d68ad23ff 100644
--- a/src_gpu/dfYEqn.cu
+++ b/src_gpu/dfYEqn.cu
@@ -555,7 +555,6 @@ void dfYEqn::process() {
     // to compare yi, you should only open DEBUG_ in src_gpu.
     // Besides, if you compare ldu data, be patient to keep specie_index in YEqn.H and dfYEqn.cu the same.
     // #define DEBUG_CHECK_LDU
-    int solverIndex = 0;
 #if defined DEBUG_CHECK_LDU
     int specie_index = 0;
     for (int s = specie_index; s < specie_index + 1; s++) {
@@ -611,7 +610,6 @@ void dfYEqn::process() {
             dataBase_.num_Nz, dataBase_.d_boundary_face_cell, dataBase_.d_ldu_to_csr_index,
             dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), d_ldu, d_source, d_internal_coeffs, d_boundary_coeffs, d_A);
         // TODO with solver of database_, solverIndex is no need any more.
-        //solve(solverIndex, s);
         //solverIndex ++;
         solve(s);
 #endif