deepmodeling · maorz1998 · Dec 23, 2023 · Dec 23, 2023
diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H
@@ -93,7 +93,7 @@
 		// EEqn_GPU.compareResult(&EEqn.lower()[0], &EEqn.upper()[0], &EEqn.diag()[0], &EEqn.source()[0],
 		// 		h_internal_coeffs.data(), h_boundary_coeffs.data(), printFlag);
 		// DEBUG_TRACE;
-        //EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag);
+        // EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag);
     }
 
     delete h_boundary_he_tmp;

diff --git a/applications/solvers/dfLowMachFoam/Make/options b/applications/solvers/dfLowMachFoam/Make/options
@@ -27,7 +27,7 @@ EXE_INC = -std=c++14 \
     $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include/torch/csrc/api/include,) \
     $(PYTHON_INC_DIR) \
     $(if $(AMGX_DIR), -I$(DF_ROOT)/src_gpu,) \
-    $(if $(AMGX_DIR), -I/usr/local/cuda-11.6/include,) \
+    $(if $(AMGX_DIR), -I/usr/local/cuda/include,) \
     $(if $(AMGX_DIR), -I$(AMGX_DIR)/include,) \
 	-I$(DF_ROOT)/GPUTestRef/lnInclude \
 
@@ -43,7 +43,6 @@ EXE_LIBS = \
     -ldfCanteraMixture \
     -ldfChemistryModel \
     -ldfCombustionModels  \
-	-ldfGenMatrix \
     $(CANTERA_ROOT)/lib/libcantera.so \
     $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libtorch.so,) \
     $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libc10.so,) \
@@ -52,8 +51,8 @@ EXE_LIBS = \
     $(if $(LIBTORCH_ROOT),$(DF_SRC)/dfChemistryModel/DNNInferencer/build/libDNNInferencer.so,) \
     $(if $(PYTHON_LIB_DIR),-L$(PYTHON_LIB_DIR),) \
     $(if $(PYTHON_LIB_DIR),-lpython3.8,) \
-    $(if $(AMGX_DIR), /usr/local/cuda-11.6/lib64/libcudart.so,) \
-	$(if $(AMGX_DIR), /usr/local/cuda-11.6/lib64/libnccl.so,) \
+    $(if $(AMGX_DIR), /usr/local/cuda/lib64/libcudart.so,) \
+	$(if $(AMGX_DIR), /usr/local/cuda/lib64/libnccl.so,) \
     $(if $(AMGX_DIR), $(DF_ROOT)/src_gpu/build/libdfMatrix.so,) \
     $(if $(AMGX_DIR), $(AMGX_DIR)/build/libamgxsh.so,)
 
diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
@@ -60,10 +60,10 @@ Description
 #include "basicThermo.H"
 #include "CombustionModel.H"
 
-#define GPUSolverNew_
-#define TIME
+// #define GPUSolverNew_
+// #define TIME
 // #define DEBUG_
-#define SHOW_MEMINFO
+// #define SHOW_MEMINFO
 
 #include "dfMatrixDataBase.H"
 
@@ -87,7 +87,6 @@ Description
     #include "createGPUSolver.H"
 
     #include "upwind.H"
-    #include "GenFvMatrix.H"
     #include "CanteraMixture.H"
     #include "multivariateGaussConvectionScheme.H"
     #include "limitedSurfaceInterpolationScheme.H"

diff --git a/bashrc.in b/bashrc.in
@@ -15,4 +15,5 @@ export DF_LIBBIN=pwd/platforms/$WM_OPTIONS/lib
 export PATH=$DF_APPBIN:$PATH
 export LD_LIBRARY_PATH=$DF_LIBBIN:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=$DF_ROOT/src_gpu/build:$LD_LIBRARY_PATH
-export LD_LIBRARY_PATH=$AMGX_DIR/build:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=$AMGX_DIR/build:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=$DF_ROOT/src/dfChemistryModel/DNNInferencer/build:$LD_LIBRARY_PATH
diff --git a/install.sh b/install.sh
@@ -29,7 +29,7 @@ if [ $USE_GPUSOLVER = true ]; then
     mkdir build
     cd build
     cmake ..
-    make 
+    make -j
     export LD_LIBRARY_PATH=$DF_ROOT/src_gpu/build:$LD_LIBRARY_PATH
 fi
 cd $DF_ROOT

diff --git a/src/dfChemistryModel/dfChemistryModel.H b/src/dfChemistryModel/dfChemistryModel.H
@@ -343,6 +343,8 @@ public:
             }
         }
 
+        bool ifChemstry() const {return chemistry_;}
+
     // profiling
 #if defined USE_LIBTORCH || defined USE_PYTORCH
     double time_allsolve() {return time_allsolve_;}

diff --git a/src_gpu/AmgXSolver.cu b/src_gpu/AmgXSolver.cu
@@ -335,7 +335,7 @@ void AmgXSolver::solve(
     getIters(nIters);
     getResidual(nIters, rnorm);
     if (!isMPIEnabled || myRank == 0)
-        fprintf(stderr, "Initial residual = %.10lf, Final residual = %.5e, No Iterations %d\n", irnorm, rnorm, nIters);
+        printf("Initial residual = %.10lf, Final residual = %.5e, No Iterations %d\n", irnorm, rnorm, nIters);
 
 }
 

diff --git a/src_gpu/dfChemistrySolver.cu b/src_gpu/dfChemistrySolver.cu
@@ -121,13 +121,15 @@ void dfChemistrySolver::setConstantValue(int num_cells, int num_species, int bat
             std::cerr << "error loading the model\n";
             exit(-1);
         }
-        modules_[i].to(device_);
+        // modules_[i].to(device_);
+        modules_[i].to(device_, torch::kHalf);
     }
 }
 
 void dfChemistrySolver::Inference(const double *h_T, const double *d_T,const double *p, const double *y,
         const double *rho, double *RR) {
     // construct input
+    clock_t start = clock();
     inputsize_ = 0;
     std::vector<int> reactCellIndex;
     for (int i = 0; i < num_cells_; i++) {
@@ -136,6 +138,10 @@ void dfChemistrySolver::Inference(const double *h_T, const double *d_T,const dou
         }
     }
     inputsize_ = reactCellIndex.size();
+    clock_t end = clock();
+    double elapsed_secs = double(end - start) / CLOCKS_PER_SEC;
+    std::cout << "construct input time: " << elapsed_secs << std::endl;
+
 #ifdef STREAM_ALLOCATOR
     checkCudaErrors(cudaMallocAsync((void**)&init_input_, sizeof(double) * inputsize_ * dim_input_, stream));
     checkCudaErrors(cudaMallocAsync((void**)&y_input_BCT, sizeof(double) * inputsize_ * num_species_, stream));
@@ -157,13 +163,16 @@ void dfChemistrySolver::Inference(const double *h_T, const double *d_T,const dou
             Xmu_, Xstd_, init_input_);
 
     // inference by torch
+    TICK_INIT_EVENT;
+    TICK_START_EVENT;
     double *d_output;
     for (int sample_start = 0; sample_start < inputsize_; sample_start += batch_size_) {
         int sample_end = std::min(sample_start + batch_size_, inputsize_);
         int sample_len = sample_end - sample_start;
         at::Tensor torch_input = torch::from_blob(init_input_ + sample_start * dim_input_, {sample_len, dim_input_}, 
                 torch::TensorOptions().device(device_).dtype(torch::kDouble));
-        torch_input = torch_input.to(at::kFloat);
+        // torch_input = torch_input.to(at::kFloat);
+        torch_input = torch_input.to(at::kHalf);
         std::vector<torch::jit::IValue> INPUTS;
         INPUTS.push_back(torch_input);
         std::vector<at::Tensor> output(num_modules_);
@@ -175,6 +184,7 @@ void dfChemistrySolver::Inference(const double *h_T, const double *d_T,const dou
             cudaMemcpy(NN_output_ + (i * inputsize_ + sample_start), d_output, sizeof(double) * sample_len, cudaMemcpyDeviceToDevice);
         }
     }
+    TICK_END_EVENT(Inference);
 
     calculate_y_new<<<blocks_per_grid, threads_per_block, 0, stream>>>(inputsize_, num_modules_, NN_output_, 
             y_input_BCT, Ymu_, Ystd_, NN_output_);

diff --git a/src_gpu/dfEEqn.cu b/src_gpu/dfEEqn.cu
@@ -150,6 +150,12 @@ void dfEEqn::process() {
             dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_he.data(),
             dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_p, dataBase_.d_boundary_y,
             d_boundary_heGradient);
+    correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(),
+            dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(),
+            patch_type_he.data(), dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_face_cell,
+            dataBase_.d_he, dataBase_.d_boundary_he, dataBase_.cyclicNeighbor.data(), 
+            dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_weight,
+            dataBase_.d_boundary_T, dataBase_.d_boundary_y, d_boundary_heGradient, &thermo_);
     update_boundary_coeffs_scalar(dataBase_.stream,
             dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_he.data(),
             dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_he, dataBase_.d_boundary_weight,

diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu
@@ -849,7 +849,7 @@ __global__ void fvm_laplacian_scalar_boundary(int num, int offset,
     int start_index = offset + index;
     double boundary_value = boundary_gamma[start_index] * boundary_mag_sf[start_index];
     internal_coeffs[start_index] += boundary_value * gradient_internal_coeffs[start_index] * sign;
-    boundary_coeffs[start_index] -= boundary_value * gradient_boundary_coeffs[start_index] * sign;
+    boundary_coeffs[start_index] -= boundary_value * gradient_boundary_coeffs[start_index] * sign; 
 }
 
 __global__ void fvm_laplacian_surface_scalar_boundary(int num, int offset,
@@ -2438,7 +2438,7 @@ void correct_boundary_conditions_scalar(cudaStream_t stream, ncclComm_t comm,
                     gradient_offset, vf, boundary_cell_face, thermo_gradient, boundary_delta_coeffs, boundary_vf);
             gradient_offset += patch_size[i];
         } else if (patch_type[i] == boundaryConditions::fixedEnergy) {
-            GPUThermo->calculateEnthalpyGPU(threads_per_block, patch_size[i], boundary_T, boundary_vf, boundary_y, offset);
+            GPUThermo->calculateEnthalpyGPU(threads_per_block, patch_size[i], num_boundary_surfaces, boundary_T, boundary_vf, boundary_y, offset);
         } else if (patch_type[i] == boundaryConditions::cyclic) {
             correct_boundary_conditions_cyclic_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(patch_size[i], offset,
                     patchSizeOffset[cyclicNeighbor[i]], vf, boundary_cell_face, boundary_weight, boundary_vf);

diff --git a/src_gpu/dfThermo.H b/src_gpu/dfThermo.H
@@ -83,7 +83,7 @@ public:
     void calculateRhoDGPU(int threads_per_block, int num_thread, int num_total, const double *T, 
             const double *T_poly, const double *p, const double *mole_fraction, 
             const double *mean_mole_weight, const double *rho, double *rhoD, int offset = 0);
-    void calculateEnthalpyGPU(int thread_per_block, int num_thread, const double *T, double *enthalpy, const double *d_mass_fraction, int offset = 0);
+    void calculateEnthalpyGPU(int thread_per_block, int num_thread, int num_total, const double *T, double *enthalpy, const double *d_mass_fraction, int offset = 0);
     void calculateTemperatureGPU(int thread_per_block, int num_thread, int num_total, const double *T_init, const double *target_h, 
             double *T, const double *d_mass_fraction, int offset = 0,
             double atol = 1e-7, double rtol = 1e-7, int max_iter = 20);

diff --git a/src_gpu/dfThermo.cu b/src_gpu/dfThermo.cu
@@ -8,7 +8,7 @@
 
 #define GAS_CANSTANT 8314.46261815324
 #define SQRT8 2.8284271247461903
-#define NUM_SPECIES 9
+#define NUM_SPECIES 7
 
 // constant memory
 __constant__ __device__ double d_nasa_coeffs[NUM_SPECIES*15];
@@ -213,36 +213,44 @@ __global__ void calculate_diffusion_kernel(int num_thread, int num_total, int nu
     if (index >= num_thread)
         return;
 
-    int startIndex = offset + index;
+    extern __shared__ double shared_data[];
+    double *mole_fraction_shared = shared_data;
 
-    double D[NUM_SPECIES * NUM_SPECIES];
-    double sum1, sum2;
-    double tmp;
+    int startIndex = offset + index;
 
     for (int i = 0; i < num_species; i++) {
-        for (int j = 0; j < num_species; j++) {
-            tmp = 0.;
-            for (int k = 0; k < 5; k++)
-                tmp += (d_binary_diffusion_coeffs[i * num_species * 5 + j * 5 + k] * T_poly[num_total * k + startIndex]);
-            D[i * num_species + j] = tmp * pow(T[startIndex], 1.5);
-        }
+        mole_fraction_shared[i * blockDim.x + threadIdx.x] = mole_fraction[i * num_total + startIndex];
+    }
+
+    double poly[5];
+    for (int j = 0; j < 5; j++) {
+        poly[j] = T_poly[num_total * j + startIndex];
     }
+
+    double powT = T[startIndex] * sqrt(T[startIndex]);
+
+    double local_mean_mole_weight = mean_mole_weight[startIndex];
+    double local_rho_div_p = rho[startIndex] / p[startIndex];
     for (int i = 0; i < num_species; i++) {
-        if (mole_fraction[num_total * i + startIndex] + 1e-10 > 1.) {
+        if (mole_fraction_shared[i * blockDim.x + threadIdx.x] + 1e-10 > 1.) {
             d[num_total * i + startIndex] = 0.;
             continue;
         }
-        sum1 = 0.;
-        sum2 = 0.;
+        double sum1 = 0.;
+        double sum2 = 0.;
         for (int j = 0; j < num_species; j++) {
             if (i == j) continue;
-            sum1 += mole_fraction[num_total * j + startIndex] / D[i * num_species + j];
-            sum2 += mole_fraction[num_total * j + startIndex] * d_molecular_weights[j] / D[i * num_species + j];
+            // calculate D
+            double tmp = 0.;
+            for (int k = 0; k < 5; k++)
+                tmp += (d_binary_diffusion_coeffs[i * num_species * 5 + j * 5 + k] * poly[k]);
+            double local_D = tmp * powT;
+            sum1 += mole_fraction_shared[j * blockDim.x + threadIdx.x] / local_D;
+            sum2 += mole_fraction_shared[j * blockDim.x + threadIdx.x] * d_molecular_weights[j] / local_D;
         }
-        sum1 *= p[startIndex];
-        sum2 *= p[startIndex] * mole_fraction[num_total * i + startIndex] / 
-                (mean_mole_weight[startIndex] - mole_fraction[num_total * i + startIndex] * d_molecular_weights[i]);
-        d[num_total * i + startIndex] = 1 / (sum1 + sum2) * rho[startIndex];
+        sum2 *= mole_fraction_shared[i * blockDim.x + threadIdx.x] / 
+                (local_mean_mole_weight - mole_fraction_shared[i * blockDim.x + threadIdx.x] * d_molecular_weights[i]);
+        d[num_total * i + startIndex] = 1 / (sum1 + sum2) * local_rho_div_p;
     }
 }
 
@@ -314,7 +322,7 @@ __global__ void calculate_temperature_kernel(int num_thread, int num_total, int
 extern void __global__ correct_internal_boundary_field_scalar(int num, int offset,
         const double *vf_internal, const int *face2Cells, double *vf_boundary);
 
-__global__ void calculate_enthalpy_kernel(int num_thread, int offset, int num_cells, int num_species, 
+__global__ void calculate_enthalpy_kernel(int num_thread, int offset, int num_total, int num_species, 
         const double *T, const double *mass_fraction, double *enthalpy)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
@@ -323,7 +331,7 @@ __global__ void calculate_enthalpy_kernel(int num_thread, int offset, int num_ce
 
     int startIndex = index + offset;
 
-    enthalpy[startIndex] = calculate_enthalpy_device_kernel(num_cells, num_species, startIndex, T[startIndex], mass_fraction);
+    enthalpy[startIndex] = calculate_enthalpy_device_kernel(num_total, num_species, startIndex, T[startIndex], mass_fraction);
 }
 
 __global__ void calculate_psip0_kernel(int num_thread, int offset, const double *p, const double *psi, double *psip0)
@@ -489,7 +497,7 @@ void dfThermo::calculatePsiGPU(int threads_per_block, int num_thread, const doub
 void dfThermo::calculateRhoGPU(int threads_per_block, int num_thread, const double *p, const double *psi, double *rho, int offset)
 {
     size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block;
-    calculate_rho_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, offset, p, psi, rho);
+    calculate_rho_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_thread, offset, p, psi, rho);
 }
 
 void dfThermo::calculateViscosityGPU(int num_thread, int num_total, const double *T, const double *mole_fraction,
@@ -514,13 +522,19 @@ void dfThermo::calculateThermoConductivityGPU(int threads_per_block, int num_thr
             offset, d_nasa_coeffs, d_y, T_poly, T, mole_fraction, species_thermal_conductivities, thermal_conductivity);
 }
 
-void dfThermo::calculateRhoDGPU(int threads_per_block, int num_thread, int num_total, const double *T, 
-        const double *T_poly, const double *p, const double *mole_fraction, 
+void dfThermo::calculateRhoDGPU(int threads_per_block, int num_thread, int num_total, const double *T,
+        const double *T_poly, const double *p, const double *mole_fraction,
         const double *mean_mole_weight, const double *rho, double *rhoD, int offset)
 {
+    threads_per_block = 32;
     size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block;
-    calculate_diffusion_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_thread, num_total, num_species, offset, 
+    size_t sharedMemSize = sizeof(double) * threads_per_block * num_species;
+
+    TICK_INIT_EVENT;
+    TICK_START_EVENT;
+    calculate_diffusion_kernel<<<blocks_per_grid, threads_per_block, sharedMemSize, stream>>>(num_thread, num_total, num_species, offset,
             T_poly, mole_fraction, p, mean_mole_weight, rho, T, rhoD);
+    TICK_END_EVENT("calculate_diffusion_kernel");
 }
 
 void dfThermo::calculateTemperatureGPU(int threads_per_block, int num_thread, int num_total, const double *T_init, const double *target_h, double *T, 
@@ -532,17 +546,17 @@ void dfThermo::calculateTemperatureGPU(int threads_per_block, int num_thread, in
             T_init, target_h, d_mass_fraction, T, atol, rtol, max_iter);
 }
 
-void dfThermo::calculateEnthalpyGPU(int threads_per_block, int num_thread, const double *T, double *enthalpy, const double *d_mass_fraction, int offset)
+void dfThermo::calculateEnthalpyGPU(int threads_per_block, int num_thread, int num_total, const double *T, double *enthalpy, const double *d_mass_fraction, int offset)
 {
     size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block;
 
-    calculate_enthalpy_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_thread, offset, num_cells, num_species, 
+    calculate_enthalpy_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_thread, offset, num_total, num_species, 
             T, d_mass_fraction, enthalpy);
 }
 
 void dfThermo::updateEnergy()
 {
-    calculateEnthalpyGPU(1024, num_cells, dataBase_.d_T, dataBase_.d_he, dataBase_.d_y);
+    calculateEnthalpyGPU(1024, num_cells, num_cells, dataBase_.d_T, dataBase_.d_he, dataBase_.d_y);
 
     // int offset = 0;
     // for (int i = 0; i < dataBase_.num_patches; i++) {
@@ -574,6 +588,7 @@ void dfThermo::correctThermo()
     setMassFraction(dataBase_.d_y, dataBase_.d_boundary_y);
     // internal field
     int cell_thread = 512, boundary_thread = 32;
+    fprintf(stderr, "\n\n");
     calculateTemperatureGPU(cell_thread, dataBase_.num_cells, dataBase_.num_cells, dataBase_.d_T, dataBase_.d_he, dataBase_.d_T, dataBase_.d_y); // calculate temperature
     calculateTPolyGPU(cell_thread, dataBase_.num_cells, dataBase_.num_cells, dataBase_.d_T, d_T_poly); // calculate T_poly
     calculatePsiGPU(cell_thread, dataBase_.num_cells, dataBase_.d_T, d_mean_mole_weight, dataBase_.d_thermo_psi); // calculate psi
@@ -584,13 +599,14 @@ void dfThermo::correctThermo()
             d_species_thermal_conductivities, dataBase_.d_thermo_alpha); // calculate thermal conductivity
     calculateRhoDGPU(cell_thread, dataBase_.num_cells, dataBase_.num_cells, dataBase_.d_T, d_T_poly, dataBase_.d_p, d_mole_fraction, 
             d_mean_mole_weight, dataBase_.d_rho, dataBase_.d_thermo_rhoD); 
+    fprintf(stderr, "\n\n");
     // boundary field
     int offset = 0;
     for (int i = 0; i < dataBase_.num_patches; i++) {
         if (dataBase_.patch_size[i] == 0) continue;
         if (dataBase_.patch_type_T[i] == boundaryConditions::fixedValue) {
             calculateTPolyGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.num_boundary_surfaces, dataBase_.d_boundary_T, d_boundary_T_poly, offset);
-            calculateEnthalpyGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.d_boundary_T, dataBase_.d_boundary_he, 
+            calculateEnthalpyGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.num_boundary_surfaces, dataBase_.d_boundary_T, dataBase_.d_boundary_he, 
                     dataBase_.d_boundary_y, offset);
             calculatePsiGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.d_boundary_T, d_boundary_mean_mole_weight, dataBase_.d_boundary_thermo_psi, offset);
             calculateRhoGPU(boundary_thread, dataBase_.patch_size[i], dataBase_.d_boundary_p, dataBase_.d_boundary_thermo_psi, dataBase_.d_boundary_rho, offset);