From d125f380ff8c1c05c21a2ca7015f442240715940 Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Mon, 29 May 2023 17:27:50 +0800 Subject: [PATCH] rhoEqn & fix bugs in UEqn --- applications/solvers/dfLowMachFoam/UEqn.H | 24 +- .../solvers/dfLowMachFoam/createdfSolver.H | 6 +- .../solvers/dfLowMachFoam/dfLowMachFoam.C | 1 + applications/solvers/dfLowMachFoam/rhoEqn.H | 20 + src_gpu/CMakeLists.txt | 2 +- src_gpu/dfMatrixDataBase.H | 13 +- src_gpu/dfRhoEqn.H | 43 ++ src_gpu/dfRhoEqn.cu | 141 ++++ src_gpu/dfUEqn.H | 24 +- src_gpu/dfUEqn.cu | 674 ++++++++++-------- 10 files changed, 591 insertions(+), 357 deletions(-) create mode 100644 src_gpu/dfRhoEqn.H create mode 100644 src_gpu/dfRhoEqn.cu diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H index f7547af48..3ceca690b 100644 --- a/applications/solvers/dfLowMachFoam/UEqn.H +++ b/applications/solvers/dfLowMachFoam/UEqn.H @@ -30,7 +30,7 @@ end1 = std::clock(); #ifdef GPUSolver_ start1 = std::clock(); - UEqn_GPU.fvm_ddt(&rho.oldTime()[0], &rho[0], &U.oldTime()[0][0]); + UEqn_GPU.fvm_ddt(&U.oldTime()[0][0]); start2 = std::clock(); int offset = 0; const tmp nuEff_tmp(turbulence->nuEff()); @@ -69,24 +69,6 @@ end1 = std::clock(); memcpy(boundary_nuEff_init+offset, &patchNuEff[0], patchSize*sizeof(double)); // boundary rho memcpy(boundary_rho_init+offset, &patchRho[0], patchSize*sizeof(double)); - - // // only need to construct once - // std::copy(&ueqn_internalCoeffs_vec[0][0], &ueqn_internalCoeffs_vec[0][0]+3*patchSize, ueqn_internalCoeffs_init + 3*offset); - // // need to construct every time step - // std::copy(&ueqn_boundaryCoeffs_vec[0][0], &ueqn_boundaryCoeffs_vec[0][0]+3*patchSize, ueqn_boundaryCoeffs_init + 3*offset); - // // laplacian internalCoeffs - // std::copy(&ueqn_laplac_internalCoeffs_vec[0][0], &ueqn_laplac_internalCoeffs_vec[0][0]+3*patchSize, ueqn_laplac_internalCoeffs_init + 3*offset); - // // laplacian boundaryCoeffs - // std::copy(&ueqn_laplac_boundaryCoeffs_vec[0][0], &ueqn_laplac_boundaryCoeffs_vec[0][0]+3*patchSize, ueqn_laplac_boundaryCoeffs_init + 3*offset); - // // boundary pressure - // std::copy(&patchP[0], &patchP[0]+patchSize, boundary_pressure_init+offset); - // // boundary velocity - // std::copy(&patchU[0][0], &patchU[0][0]+3*patchSize, boundary_velocity_init+3*offset); - // // boundary nuEff - // std::copy(&patchNuEff[0], &patchNuEff[0]+patchSize, boundary_nuEff_init+offset); - // // boundary rho - // std::copy(&patchRho[0], &patchRho[0]+patchSize, boundary_rho_init+offset); - offset += patchSize; } end1 = std::clock(); @@ -97,8 +79,8 @@ end1 = std::clock(); start1 = std::clock(); UEqn_GPU.fvm_div(&phi[0], ueqn_internalCoeffs_init, ueqn_boundaryCoeffs_init, - ueqn_laplac_internalCoeffs_init, ueqn_laplac_boundaryCoeffs_init, - boundary_pressure_init, boundary_velocity_init, boundary_nuEff_init, boundary_rho_init); + ueqn_laplac_internalCoeffs_init, ueqn_laplac_boundaryCoeffs_init, + boundary_pressure_init, boundary_velocity_init, boundary_nuEff_init, boundary_rho_init); UEqn_GPU.fvc_grad(&p[0]); UEqn_GPU.fvc_grad_vector(); UEqn_GPU.dev2T(); diff --git a/applications/solvers/dfLowMachFoam/createdfSolver.H b/applications/solvers/dfLowMachFoam/createdfSolver.H index 79d1f2e78..ed87aceb5 100644 --- a/applications/solvers/dfLowMachFoam/createdfSolver.H +++ b/applications/solvers/dfLowMachFoam/createdfSolver.H @@ -32,9 +32,10 @@ dfMatrixDataBase dfDataBase(num_surfaces, num_cells, num_boundary_faces, num_bou &mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.nonOrthDeltaCoeffs()[0], boundary_face_vector_init, boundary_face_init, boundary_deltaCoeffs_init, boundaryCellIndex); dfUEqn UEqn_GPU(dfDataBase, "dDDI", settingPath); +dfRhoEqn rhoEqn_GPU(dfDataBase); double *ueqn_internalCoeffs_init, *ueqn_boundaryCoeffs_init, *boundary_pressure_init, *boundary_velocity_init, - *boundary_nuEff_init, *boundary_rho_init, *ueqn_laplac_internalCoeffs_init, *ueqn_laplac_boundaryCoeffs_init; + *boundary_nuEff_init, *boundary_rho_init, *ueqn_laplac_internalCoeffs_init, *ueqn_laplac_boundaryCoeffs_init, *boundary_phi_init; cudaMallocHost(&ueqn_internalCoeffs_init, 3*num_boundary_faces*sizeof(double)); cudaMallocHost(&ueqn_boundaryCoeffs_init, 3*num_boundary_faces*sizeof(double)); cudaMallocHost(&ueqn_laplac_internalCoeffs_init, 3*num_boundary_faces*sizeof(double)); @@ -42,4 +43,5 @@ cudaMallocHost(&ueqn_laplac_boundaryCoeffs_init, 3*num_boundary_faces*sizeof(dou cudaMallocHost(&boundary_velocity_init, 3*num_boundary_faces*sizeof(double)); cudaMallocHost(&boundary_pressure_init, num_boundary_faces*sizeof(double)); cudaMallocHost(&boundary_nuEff_init, num_boundary_faces*sizeof(double)); -cudaMallocHost(&boundary_rho_init, num_boundary_faces*sizeof(double)); \ No newline at end of file +cudaMallocHost(&boundary_rho_init, num_boundary_faces*sizeof(double)); +cudaMallocHost(&boundary_phi_init, num_boundary_faces*sizeof(double)); \ No newline at end of file diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C index 2a0a43826..8012df850 100644 --- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C +++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C @@ -60,6 +60,7 @@ Description #include "CombustionModel.H" #include "dfUEqn.H" +#include "dfRhoEqn.H" #include #include diff --git a/applications/solvers/dfLowMachFoam/rhoEqn.H b/applications/solvers/dfLowMachFoam/rhoEqn.H index bb8ceeca6..e49ef9e5b 100644 --- a/applications/solvers/dfLowMachFoam/rhoEqn.H +++ b/applications/solvers/dfLowMachFoam/rhoEqn.H @@ -29,6 +29,7 @@ Description \*---------------------------------------------------------------------------*/ +#ifdef CPUSolver_ { fvScalarMatrix rhoEqn ( @@ -38,5 +39,24 @@ Description rhoEqn.solve(); } +#endif +#ifdef GPUSolver_ +{ + rho.oldTime(); + + int offset = 0; + forAll(U.boundaryField(), patchi) + { + const fvsPatchScalarField& patchFlux = phi.boundaryField()[patchi]; + int patchSize = patchFlux.size(); + memcpy(boundary_phi_init+offset, &patchFlux[0], patchSize*sizeof(double)); + offset += patchSize; + } + rhoEqn_GPU.fvc_div(&phi[0], boundary_phi_init); + rhoEqn_GPU.fvm_ddt(&rho.oldTime()[0], &rho[0]); + rhoEqn_GPU.updatePsi(&rho.primitiveFieldRef()[0]); + rho.correctBoundaryConditions(); +} +#endif // ************************************************************************* // diff --git a/src_gpu/CMakeLists.txt b/src_gpu/CMakeLists.txt index 7c48e5bb2..2c17c56ed 100644 --- a/src_gpu/CMakeLists.txt +++ b/src_gpu/CMakeLists.txt @@ -18,7 +18,7 @@ include_directories( $ENV{AMGX_DIR}/include ) -add_library(${PROJECT_NAME} SHARED dfUEqn.cu AmgXSolver.cu) +add_library(${PROJECT_NAME} SHARED dfUEqn.cu dfRhoEqn.cu AmgXSolver.cu) target_link_libraries(${PROJECT_NAME} ${MPI_LIBRARIES} diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 35f3fa21e..971fdc6bb 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -29,7 +29,7 @@ void check(T result, char const *const func, const char *const file, #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) -void checkVectorEqual(int count, const double* basevec, double* vec, double max_relative_error) { +inline void checkVectorEqual(int count, const double* basevec, double* vec, double max_relative_error) { for (size_t i = 0; i < count; ++i) { double abs_diff = fabs(basevec[i] - vec[i]); @@ -110,6 +110,7 @@ struct dfMatrixDataBase *d_laplac_internal_coeffs_init = nullptr, *d_laplac_boundary_coeffs_init = nullptr, *d_boundary_pressure = nullptr, *d_boundary_face_vector = nullptr, *d_boundary_pressure_init = nullptr, + *d_boundary_phi = nullptr, *d_boundary_phi_init = nullptr, *d_boundary_velocity = nullptr, *d_boundary_velocity_init = nullptr, *d_boundary_nuEff = nullptr, *d_boundary_nuEff_init = nullptr, *d_boundary_rho = nullptr, *d_boundary_rho_init = nullptr; @@ -167,10 +168,7 @@ struct dfMatrixDataBase * @brief cuda functions */ cudaStream_t stream; - /** - * @brief AmgX functions - */ - AmgXSolver* UxSolver = nullptr, *UySolver = nullptr, *UzSolver = nullptr; + int num_iteration; double time_monitor_CPU; @@ -191,6 +189,7 @@ struct dfMatrixDataBase { // allocate field pointer in pin memory cudaMallocHost(&h_phi_init, num_faces * sizeof(double)); + cudaMallocHost(&h_rho_old, num_cells * sizeof(double)); h_weight_vec_init.resize(num_faces); h_weight_vec.resize(num_faces); @@ -478,6 +477,8 @@ struct dfMatrixDataBase checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure_init, boundary_face_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_phi_init, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_phi, boundary_face_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity_init, boundary_face_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity, boundary_face_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_face_vector, boundary_face_vec_bytes)); @@ -495,7 +496,7 @@ struct dfMatrixDataBase checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff_init, boundary_face_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_rho, boundary_face_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_init, boundary_face_bytes)); - total_bytes += (boundary_face_bytes*8 + boundary_face_vec_bytes * 11); + total_bytes += (boundary_face_bytes*10 + boundary_face_vec_bytes * 11); // checkCudaErrors(cudaMalloc((void**)&d_A_csr, csr_value_vec_bytes)); // checkCudaErrors(cudaMalloc((void**)&d_b, cell_vec_bytes)); diff --git a/src_gpu/dfRhoEqn.H b/src_gpu/dfRhoEqn.H new file mode 100644 index 000000000..24bb834d2 --- /dev/null +++ b/src_gpu/dfRhoEqn.H @@ -0,0 +1,43 @@ +#pragma once +#include "dfMatrixDataBase.H" + +/* + fvScalarMatrix rhoEqn + ( + fvm::ddt(rho) + + fvc::div(phi) + ); + + rhoEqn.solve(); +*/ + +class dfRhoEqn +{ +private: + dfMatrixDataBase& dataBase_; + cudaStream_t stream; + int num_iteration; + double time_monitor_CPU; + double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test; + + // common variables + int num_cells, cell_bytes, num_surfaces, num_boundary_cells; + int *d_A_csr_row_index, *d_A_csr_diag_index; + + // Matrix variables + double *d_b, *d_psi = nullptr; + double *h_b, *h_psi = nullptr; + +public: + dfRhoEqn(); + dfRhoEqn(dfMatrixDataBase& dataBase); + ~dfRhoEqn(); + + void checkValue(bool print); + + void fvc_div(double *phi, double *boundary_phi_init); + + void fvm_ddt(double *rho_old, double *rho_new); + + void updatePsi(double* Psi); +}; \ No newline at end of file diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu/dfRhoEqn.cu new file mode 100644 index 000000000..89aa4ae56 --- /dev/null +++ b/src_gpu/dfRhoEqn.cu @@ -0,0 +1,141 @@ +#include "dfRhoEqn.H" + +// kernel functions +__global__ void fvc_div_internal_rho(int num_cells, const int *csr_row_index, + const int *csr_diag_index, const int *permedIndex, const double *phi_init, + double *phi_out, const double sign, const double *b_input, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int row_elements = csr_row_index[index + 1] - row_index; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double sum = 0; + + // lower + for (int i = 0; i < diag_index; i++) + { + int neighbor_index = neighbor_offset + i; + int permute_index = permedIndex[neighbor_index]; + double phi = phi_init[permute_index]; + phi_out[neighbor_index] = phi; + sum -= phi; + } + // upper + for (int i = diag_index + 1; i < row_elements; i++) + { + int neighbor_index = neighbor_offset + i - 1; + int permute_index = permedIndex[neighbor_index]; + double phi = phi_init[permute_index]; + phi_out[neighbor_index] = phi; + sum += phi; + } + + b_output[index] = b_input[index] + sum * sign; +} + +__global__ void fvc_div_boundary_rho(int num_cells, int num_boundary_cells, const int *boundary_cell_offset, + const int *boundary_cell_id, const int *bouPermedIndex, const double *boundary_phi_init, + double *boundary_phi, const double sign, const double *b_input, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + double sum = 0; + + for (int i = cell_offset; i < next_cell_offset; i++) + { + int permute_index = bouPermedIndex[i]; + double phi = boundary_phi_init[permute_index]; + boundary_phi[i] = phi; + sum += phi; + } + + b_output[cell_index] = b_input[cell_index] + sum * sign; +} + +__global__ void fvm_ddt_rho(int num_cells, const double rdelta_t, + const double *rho_old, double *rho_new, const double *volume, const double *b) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + double ddt_diag = rdelta_t * volume[index]; + double ddt_source = rdelta_t * rho_old[index] * volume[index]; + double source_sum = ddt_source - b[index]; + + rho_new[index] = source_sum / ddt_diag; +} + +// constructor +dfRhoEqn::dfRhoEqn(dfMatrixDataBase &dataBase) + : dataBase_(dataBase) +{ + num_cells = dataBase_.num_cells; + cell_bytes = dataBase_.cell_bytes; + num_surfaces = dataBase_.num_surfaces; + num_boundary_cells = dataBase_.num_boundary_cells; + + d_A_csr_row_index = dataBase_.d_A_csr_row_index; + d_A_csr_diag_index = dataBase_.d_A_csr_diag_index; + + cudaMallocHost(&h_psi, cell_bytes); + + checkCudaErrors(cudaMalloc((void **)&d_b, cell_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_psi, cell_bytes)); + + checkCudaErrors(cudaStreamCreate(&stream)); +} + +void dfRhoEqn::fvc_div(double *phi, double *boundary_phi_init) +{ + checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_bytes, stream)); + clock_t start = std::clock(); + memcpy(dataBase_.h_phi_init, phi, num_surfaces * sizeof(double)); + + clock_t end = std::clock(); + time_monitor_CPU += double(end - start) / double(CLOCKS_PER_SEC); + + start = std::clock(); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi_init, dataBase_.h_phi_init, num_surfaces * sizeof(double), cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi_init + num_surfaces, dataBase_.d_phi_init, num_surfaces * sizeof(double), cudaMemcpyDeviceToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_phi_init, boundary_phi_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + end = std::clock(); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvc_div_internal_rho<<>>(num_cells, d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_permedIndex, + dataBase_.d_phi_init, dataBase_.d_phi, 1., d_b, d_b); + + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvc_div_boundary_rho<<>>(num_cells, num_boundary_cells, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_bouPermedIndex, dataBase_.d_boundary_phi_init, dataBase_.d_boundary_phi, 1., d_b, d_b); +} + +void dfRhoEqn::fvm_ddt(double *rho_old, double *rho_new) +{ + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_rho_old, rho_old, cell_bytes, cudaMemcpyHostToDevice, stream)); + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvm_ddt_rho<<>>(num_cells, dataBase_.rdelta_t, dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, d_b); + checkCudaErrors(cudaMemcpyAsync(h_psi, dataBase_.d_rho_new, cell_bytes, cudaMemcpyDeviceToHost, stream)); +} + +void dfRhoEqn::updatePsi(double *Psi) +{ + checkCudaErrors(cudaStreamSynchronize(stream)); + for (size_t i = 0; i < num_cells; i++) + Psi[i] = h_psi[i]; +} +dfRhoEqn::~dfRhoEqn(){} \ No newline at end of file diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index c7d2c7914..fb79a1dcf 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -7,16 +7,16 @@ class dfUEqn { private: - dfMatrixDataBase& dataBase_; + dfMatrixDataBase &dataBase_; cudaStream_t stream; - AmgXSolver* UxSolver, *UySolver, *UzSolver = nullptr; + AmgXSolver *UxSolver, *UySolver, *UzSolver = nullptr; int num_iteration; double time_monitor_CPU; double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test; // common variables int num_cells, cell_bytes, num_faces, num_surfaces, cell_vec_bytes, csr_value_vec_bytes, num_boundary_cells; - int *d_A_csr_row_index, *d_A_csr_diag_index, *d_A_csr_col_index; + int *d_A_csr_row_index, *d_A_csr_diag_index, *d_A_csr_col_index; // Matrix variables double *d_A_csr, *d_b, *d_psi = nullptr; @@ -24,19 +24,19 @@ private: public: dfUEqn(); - dfUEqn(dfMatrixDataBase& dataBase, const std::string &modeStr, const std::string &cfgFile); + dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile); ~dfUEqn(); void checkValue(bool print); - void fvm_ddt(double *rho_old, double *rho_new, double* vector_old); + void fvm_ddt(double *vector_old); - void fvm_div(double* phi, double* ueqn_internalCoeffs_init, double* ueqn_boundaryCoeffs_init, - double* ueqn_laplac_internalCoeffs_init, double* ueqn_laplac_boundaryCoeffs_init, - double* boundary_pressure_init, double* boundary_velocity_init, - double* boundary_nuEff_init, double* boundary_rho_init); + void fvm_div(double *phi, double *ueqn_internalCoeffs_init, double *ueqn_boundaryCoeffs_init, + double *ueqn_laplac_internalCoeffs_init, double *ueqn_laplac_boundaryCoeffs_init, + double *boundary_pressure_init, double *boundary_velocity_init, + double *boundary_nuEff_init, double *boundary_rho_init); - void fvc_grad(double* pressure); + void fvc_grad(double *pressure); void fvc_grad_vector(); @@ -46,8 +46,8 @@ public: void fvm_laplacian(); - void add_fvMatrix(double* turbSrc_low, double* turbSrc_diag, double* turbSrc_upp, double* turbSrc_source); + void add_fvMatrix(double *turbSrc_low, double *turbSrc_diag, double *turbSrc_upp, double *turbSrc_source); void solve(); - void updatePsi(double* Psi); + void updatePsi(double *Psi); }; \ No newline at end of file diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 7a2297b4e..4afebd8c1 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -2,11 +2,13 @@ // kernel functions __global__ void fvm_ddt_kernel(int num_cells, int num_faces, const double rdelta_t, - const int* csr_row_index, const int* csr_diag_index, - const double* rho_old, const double* rho_new, const double* volume, const double* velocity_old, - const double* A_csr_input, const double* b_input, double* A_csr_output, double* b_output, double* psi) { + const int *csr_row_index, const int *csr_diag_index, + const double *rho_old, const double *rho_new, const double *volume, const double *velocity_old, + const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output, double *psi) +{ int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) return; + if (index >= num_cells) + return; // A_csr has one more element in each row: itself int row_index = csr_row_index[index]; @@ -30,11 +32,13 @@ __global__ void fvm_ddt_kernel(int num_cells, int num_faces, const double rdelta } __global__ void fvm_div_internal(int num_cells, int num_faces, - const int* csr_row_index, const int* csr_diag_index, - const double* weight, const double* phi, - const double* A_csr_input, const double* b_input, double* A_csr_output, double* b_output) { + const int *csr_row_index, const int *csr_diag_index, + const double *weight, const double *phi, + const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output) +{ int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) return; + if (index >= num_cells) + return; // A_csr has one more element in each row: itself int row_index = csr_row_index[index]; @@ -44,31 +48,34 @@ __global__ void fvm_div_internal(int num_cells, int num_faces, int csr_dim = num_cells + num_faces; double div_diag = 0; - for (int i = row_index; i < next_row_index; i++) { - int inner_index = i - row_index; - // lower - if (inner_index < diag_index) { - int neighbor_index = neighbor_offset + inner_index; - double w = weight[neighbor_index]; - double f = phi[neighbor_index]; - A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + (-w) * f; - A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + (-w) * f; - A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + (-w) * f; - // lower neighbors contribute to sum of -1 - div_diag += (w - 1) * f; - } - // upper - if (inner_index > diag_index) { - // upper, index - 1, consider of diag - int neighbor_index = neighbor_offset + inner_index - 1; - double w = weight[neighbor_index]; - double f = phi[neighbor_index]; - A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + (1 - w) * f; - A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + (1 - w) * f; - A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + (1 - w) * f; - // upper neighbors contribute to sum of 1 - div_diag += w * f; - } + for (int i = row_index; i < next_row_index; i++) + { + int inner_index = i - row_index; + // lower + if (inner_index < diag_index) + { + int neighbor_index = neighbor_offset + inner_index; + double w = weight[neighbor_index]; + double f = phi[neighbor_index]; + A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + (-w) * f; + A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + (-w) * f; + A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + (-w) * f; + // lower neighbors contribute to sum of -1 + div_diag += (w - 1) * f; + } + // upper + if (inner_index > diag_index) + { + // upper, index - 1, consider of diag + int neighbor_index = neighbor_offset + inner_index - 1; + double w = weight[neighbor_index]; + double f = phi[neighbor_index]; + A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + (1 - w) * f; + A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + (1 - w) * f; + A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + (1 - w) * f; + // upper neighbors contribute to sum of 1 + div_diag += w * f; + } } A_csr_output[csr_dim * 0 + row_index + diag_index] = A_csr_input[csr_dim * 0 + row_index + diag_index] + div_diag; // diag A_csr_output[csr_dim * 1 + row_index + diag_index] = A_csr_input[csr_dim * 1 + row_index + diag_index] + div_diag; // diag @@ -76,12 +83,14 @@ __global__ void fvm_div_internal(int num_cells, int num_faces, } __global__ void fvm_div_boundary(int num_cells, int num_faces, int num_boundary_cells, - const int* csr_row_index, const int* csr_diag_index, - const int* boundary_cell_offset, const int* boundary_cell_id, - const double* internal_coeffs, const double* boundary_coeffs, - const double* A_csr_input, const double* b_input, double* A_csr_output, double* b_output) { + const int *csr_row_index, const int *csr_diag_index, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *internal_coeffs, const double *boundary_coeffs, + const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output) +{ int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) return; + if (index >= num_boundary_cells) + return; int cell_offset = boundary_cell_offset[index]; int cell_index = boundary_cell_id[cell_offset]; @@ -102,7 +111,8 @@ __global__ void fvm_div_boundary(int num_cells, int num_faces, int num_boundary_ double boundary_coeffs_x = 0; double boundary_coeffs_y = 0; double boundary_coeffs_z = 0; - for (int i = 0; i < loop_size; i++) { + for (int i = 0; i < loop_size; i++) + { internal_coeffs_x += internal_coeffs[(cell_offset + i) * 3 + 0]; internal_coeffs_y += internal_coeffs[(cell_offset + i) * 3 + 1]; internal_coeffs_z += internal_coeffs[(cell_offset + i) * 3 + 2]; @@ -119,11 +129,13 @@ __global__ void fvm_div_boundary(int num_cells, int num_faces, int num_boundary_ } __global__ void fvc_grad_internal_face(int num_cells, - const int* csr_row_index, const int* csr_col_index, const int* csr_diag_index, - const double* face_vector, const double* weight, const double* pressure, - const double* b_input, double* b_output) { + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *face_vector, const double *weight, const double *pressure, + const double *b_input, double *b_output) +{ int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) return; + if (index >= num_cells) + return; // A_csr has one more element in each row: itself int row_index = csr_row_index[index]; @@ -141,36 +153,39 @@ __global__ void fvc_grad_internal_face(int num_cells, double grad_by_upp = 0; double grad_bz_low = 0; double grad_bz_upp = 0; - for (int i = row_index; i < next_row_index; i++) { - int inner_index = i - row_index; - // lower - if (inner_index < diag_index) { - int neighbor_index = neighbor_offset + inner_index; - double w = weight[neighbor_index]; - double sfx = face_vector[neighbor_index * 3 + 0]; - double sfy = face_vector[neighbor_index * 3 + 1]; - double sfz = face_vector[neighbor_index * 3 + 2]; - int neighbor_cell_id = csr_col_index[row_index + inner_index]; - double neighbor_cell_p = pressure[neighbor_cell_id]; - double face_p = (1 - w) * own_cell_p + w * neighbor_cell_p; - grad_bx_low -= face_p * sfx; - grad_by_low -= face_p * sfy; - grad_bz_low -= face_p * sfz; - } - // upper - if (inner_index > diag_index) { - int neighbor_index = neighbor_offset + inner_index - 1; - double w = weight[neighbor_index]; - double sfx = face_vector[neighbor_index * 3 + 0]; - double sfy = face_vector[neighbor_index * 3 + 1]; - double sfz = face_vector[neighbor_index * 3 + 2]; - int neighbor_cell_id = csr_col_index[row_index + inner_index]; - double neighbor_cell_p = pressure[neighbor_cell_id]; - double face_p = (1 - w) * own_cell_p + w * neighbor_cell_p; - grad_bx_upp += face_p * sfx; - grad_by_upp += face_p * sfy; - grad_bz_upp += face_p * sfz; - } + for (int i = row_index; i < next_row_index; i++) + { + int inner_index = i - row_index; + // lower + if (inner_index < diag_index) + { + int neighbor_index = neighbor_offset + inner_index; + double w = weight[neighbor_index]; + double sfx = face_vector[neighbor_index * 3 + 0]; + double sfy = face_vector[neighbor_index * 3 + 1]; + double sfz = face_vector[neighbor_index * 3 + 2]; + int neighbor_cell_id = csr_col_index[row_index + inner_index]; + double neighbor_cell_p = pressure[neighbor_cell_id]; + double face_p = (1 - w) * own_cell_p + w * neighbor_cell_p; + grad_bx_low -= face_p * sfx; + grad_by_low -= face_p * sfy; + grad_bz_low -= face_p * sfz; + } + // upper + if (inner_index > diag_index) + { + int neighbor_index = neighbor_offset + inner_index - 1; + double w = weight[neighbor_index]; + double sfx = face_vector[neighbor_index * 3 + 0]; + double sfy = face_vector[neighbor_index * 3 + 1]; + double sfz = face_vector[neighbor_index * 3 + 2]; + int neighbor_cell_id = csr_col_index[row_index + inner_index]; + double neighbor_cell_p = pressure[neighbor_cell_id]; + double face_p = (1 - w) * own_cell_p + w * neighbor_cell_p; + grad_bx_upp += face_p * sfx; + grad_by_upp += face_p * sfy; + grad_bz_upp += face_p * sfz; + } } b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] - grad_bx_low - grad_bx_upp; b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] - grad_by_low - grad_by_upp; @@ -178,39 +193,42 @@ __global__ void fvc_grad_internal_face(int num_cells, } __global__ void fvc_grad_boundary_face(int num_cells, int num_boundary_cells, - const int* boundary_cell_offset, const int* boundary_cell_id, - const double* boundary_face_vector, const double* boundary_pressure, - const double* b_input, double* b_output) { + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_face_vector, const double *boundary_pressure, + const double *b_input, double *b_output) +{ int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) return; + if (index >= num_boundary_cells) + return; int cell_offset = boundary_cell_offset[index]; int next_cell_offset = boundary_cell_offset[index + 1]; int cell_index = boundary_cell_id[cell_offset]; // compute boundary gradient - double grad_bx = 0; - double grad_by = 0; - double grad_bz = 0; - for (int i = cell_offset; i < next_cell_offset; i++) { - double sfx = boundary_face_vector[i * 3 + 0]; - double sfy = boundary_face_vector[i * 3 + 1]; - double sfz = boundary_face_vector[i * 3 + 2]; - double face_p = boundary_pressure[i]; - grad_bx += face_p * sfx; - grad_by += face_p * sfy; - grad_bz += face_p * sfz; + double grad_bx = 0; + double grad_by = 0; + double grad_bz = 0; + for (int i = cell_offset; i < next_cell_offset; i++) + { + double sfx = boundary_face_vector[i * 3 + 0]; + double sfy = boundary_face_vector[i * 3 + 1]; + double sfz = boundary_face_vector[i * 3 + 2]; + double face_p = boundary_pressure[i]; + grad_bx += face_p * sfx; + grad_by += face_p * sfy; + grad_bz += face_p * sfz; } //// correct the boundary gradient - //double nx = boundary_face_vector[face_index * 3 + 0] / magSf[face_index]; - //double ny = boundary_face_vector[face_index * 3 + 1] / magSf[face_index]; - //double nz = boundary_face_vector[face_index * 3 + 2] / magSf[face_index]; - //double sn_grad = 0; - //double grad_correction = sn_grad * volume[cell_index] - (nx * grad_bx + ny * grad_by + nz * grad_bz); - //grad_bx += nx * grad_correction; - //grad_by += ny * grad_correction; - //grad_bz += nz * grad_correction; + // double nx = boundary_face_vector[face_index * 3 + 0] / magSf[face_index]; + // double ny = boundary_face_vector[face_index * 3 + 1] / magSf[face_index]; + // double nz = boundary_face_vector[face_index * 3 + 2] / magSf[face_index]; + // double sn_grad = 0; + // double grad_correction = sn_grad * volume[cell_index] - (nx * grad_bx + ny * grad_by + nz * grad_bz); + // grad_bx += nx * grad_correction; + // grad_by += ny * grad_correction; + // grad_bz += nz * grad_correction; b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] - grad_bx; b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] - grad_by; @@ -218,11 +236,13 @@ __global__ void fvc_grad_boundary_face(int num_cells, int num_boundary_cells, } __global__ void add_fvMatrix_kernel(int num_cells, int num_faces, - const int* csr_row_index, - const double* turbSrc_A, const double* turbSrc_b, - const double* A_csr_input, const double* b_input, double* A_csr_output, double* b_output) { + const int *csr_row_index, + const double *turbSrc_A, const double *turbSrc_b, + const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output) +{ int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) return; + if (index >= num_cells) + return; int row_index = csr_row_index[index]; int next_row_index = csr_row_index[index + 1]; @@ -231,10 +251,10 @@ __global__ void add_fvMatrix_kernel(int num_cells, int num_faces, for (int i = row_index; i < next_row_index; i++) { - A_entry = turbSrc_A[i]; - A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + A_entry; - A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + A_entry; - A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + A_entry; + A_entry = turbSrc_A[i]; + A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + A_entry; + A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + A_entry; + A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + A_entry; } b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + turbSrc_b[index * 3 + 0]; b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + turbSrc_b[index * 3 + 1]; @@ -242,67 +262,72 @@ __global__ void add_fvMatrix_kernel(int num_cells, int num_faces, } __global__ void offdiagPermutation(const int num_faces, const int *permedIndex, - const double *d_phi_init, double *d_phi) + const double *d_phi_init, double *d_phi) { int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_faces) return; + if (index >= num_faces) + return; int p = permedIndex[index]; d_phi[index] = d_phi_init[p]; } __global__ void boundaryPermutation(const int num_boundary_faces, const int *bouPermedIndex, - const double *ueqn_internalCoeffs_init, const double *ueqn_boundaryCoeffs_init, - const double *ueqn_laplac_internalCoeffs_init, const double *ueqn_laplac_boundaryCoeffs_init, - const double *boundary_pressure_init, const double *boundary_velocity_init, - double *ueqn_internalCoeffs, double *ueqn_boundaryCoeffs, - double *ueqn_laplac_internalCoeffs, double *ueqn_laplac_boundaryCoeffs, - double *boundary_pressure, double *boundary_velocity, - double *boundary_nuEff_init, double *boundary_nuEff, - double *boundary_rho_init, double *boundary_rho) + const double *ueqn_internalCoeffs_init, const double *ueqn_boundaryCoeffs_init, + const double *ueqn_laplac_internalCoeffs_init, const double *ueqn_laplac_boundaryCoeffs_init, + const double *boundary_pressure_init, const double *boundary_velocity_init, + double *ueqn_internalCoeffs, double *ueqn_boundaryCoeffs, + double *ueqn_laplac_internalCoeffs, double *ueqn_laplac_boundaryCoeffs, + double *boundary_pressure, double *boundary_velocity, + double *boundary_nuEff_init, double *boundary_nuEff, + double *boundary_rho_init, double *boundary_rho) { int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_faces) return; + if (index >= num_boundary_faces) + return; int p = bouPermedIndex[index]; - ueqn_internalCoeffs[3*index] = ueqn_internalCoeffs_init[3*p]; - ueqn_internalCoeffs[3*index + 1] = ueqn_internalCoeffs_init[3*p + 1]; - ueqn_internalCoeffs[3*index + 2] = ueqn_internalCoeffs_init[3*p + 2]; - ueqn_boundaryCoeffs[3*index] = ueqn_boundaryCoeffs_init[3*p]; - ueqn_boundaryCoeffs[3*index + 1] = ueqn_boundaryCoeffs_init[3*p + 1]; - ueqn_boundaryCoeffs[3*index + 2] = ueqn_boundaryCoeffs_init[3*p + 2]; - - ueqn_laplac_internalCoeffs[3*index] = ueqn_laplac_internalCoeffs_init[3*p]; - ueqn_laplac_internalCoeffs[3*index + 1] = ueqn_laplac_internalCoeffs_init[3*p + 1]; - ueqn_laplac_internalCoeffs[3*index + 2] = ueqn_laplac_internalCoeffs_init[3*p + 2]; - ueqn_laplac_boundaryCoeffs[3*index] = ueqn_laplac_boundaryCoeffs_init[3*p]; - ueqn_laplac_boundaryCoeffs[3*index + 1] = ueqn_laplac_boundaryCoeffs_init[3*p + 1]; - ueqn_laplac_boundaryCoeffs[3*index + 2] = ueqn_laplac_boundaryCoeffs_init[3*p + 2]; - - boundary_velocity[3*index] = boundary_velocity_init[3*p]; - boundary_velocity[3*index + 1] = boundary_velocity_init[3*p + 1]; - boundary_velocity[3*index + 2] = boundary_velocity_init[3*p + 2]; + ueqn_internalCoeffs[3 * index] = ueqn_internalCoeffs_init[3 * p]; + ueqn_internalCoeffs[3 * index + 1] = ueqn_internalCoeffs_init[3 * p + 1]; + ueqn_internalCoeffs[3 * index + 2] = ueqn_internalCoeffs_init[3 * p + 2]; + ueqn_boundaryCoeffs[3 * index] = ueqn_boundaryCoeffs_init[3 * p]; + ueqn_boundaryCoeffs[3 * index + 1] = ueqn_boundaryCoeffs_init[3 * p + 1]; + ueqn_boundaryCoeffs[3 * index + 2] = ueqn_boundaryCoeffs_init[3 * p + 2]; + + ueqn_laplac_internalCoeffs[3 * index] = ueqn_laplac_internalCoeffs_init[3 * p]; + ueqn_laplac_internalCoeffs[3 * index + 1] = ueqn_laplac_internalCoeffs_init[3 * p + 1]; + ueqn_laplac_internalCoeffs[3 * index + 2] = ueqn_laplac_internalCoeffs_init[3 * p + 2]; + ueqn_laplac_boundaryCoeffs[3 * index] = ueqn_laplac_boundaryCoeffs_init[3 * p]; + ueqn_laplac_boundaryCoeffs[3 * index + 1] = ueqn_laplac_boundaryCoeffs_init[3 * p + 1]; + ueqn_laplac_boundaryCoeffs[3 * index + 2] = ueqn_laplac_boundaryCoeffs_init[3 * p + 2]; + + boundary_velocity[3 * index] = boundary_velocity_init[3 * p]; + boundary_velocity[3 * index + 1] = boundary_velocity_init[3 * p + 1]; + boundary_velocity[3 * index + 2] = boundary_velocity_init[3 * p + 2]; boundary_pressure[index] = boundary_pressure_init[p]; boundary_rho[index] = boundary_rho_init[p]; boundary_nuEff[index] = boundary_nuEff_init[p]; } -__global__ void csrPermutation(const int num_Nz, const int *csrPermedIndex, - const double *d_turbSrc_A_init, double *d_turbSrc_A) +__global__ void csrPermutation(const int num_Nz, const int *csrPermedIndex, + const double *d_turbSrc_A_init, double *d_turbSrc_A) { int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_Nz) return; + if (index >= num_Nz) + return; int p = csrPermedIndex[index]; d_turbSrc_A[index] = d_turbSrc_A_init[p]; } __global__ void fvc_grad_vector_internal(int num_cells, - const int* csr_row_index, const int* csr_col_index, const int* csr_diag_index, - const double* sf, const double* vf, const double* tlambdas, const double* volume, - double* grad) { + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *sf, const double *vf, const double *tlambdas, const double *volume, + double *grad) +{ int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) return; + if (index >= num_cells) + return; // A_csr has one more element in each row: itself int row_index = csr_row_index[index]; @@ -310,7 +335,6 @@ __global__ void fvc_grad_vector_internal(int num_cells, int diag_index = csr_diag_index[index]; int neighbor_offset = csr_row_index[index] - index; - double own_vf_x = vf[index * 3 + 0]; double own_vf_y = vf[index * 3 + 1]; double own_vf_z = vf[index * 3 + 2]; @@ -324,7 +348,8 @@ __global__ void fvc_grad_vector_internal(int num_cells, double grad_zy = 0; double grad_zz = 0; // lower - for (int i = 0; i < diag_index; i++) { + for (int i = 0; i < diag_index; i++) + { int neighbor_index = neighbor_offset + i; int neighbor_cell_id = csr_col_index[row_index + i]; double w = tlambdas[neighbor_index]; @@ -348,7 +373,8 @@ __global__ void fvc_grad_vector_internal(int num_cells, grad_zz -= sf_z * face_z; } // upper - for (int i = diag_index + 1; i < row_elements; i++) { + for (int i = diag_index + 1; i < row_elements; i++) + { int neighbor_index = neighbor_offset + i - 1; int neighbor_cell_id = csr_col_index[row_index + i]; double w = tlambdas[neighbor_index]; @@ -376,7 +402,7 @@ __global__ void fvc_grad_vector_internal(int num_cells, // // printf("sf_x = %.20lf\n", sf_x); // // printf("face_x = %.20lf\n", face_x); // } - } + } double vol = volume[index]; grad[index * 9 + 0] = grad_xx / vol; grad[index * 9 + 1] = grad_xy / vol; @@ -390,11 +416,13 @@ __global__ void fvc_grad_vector_internal(int num_cells, } __global__ void fvc_grad_vector_boundary(int num_cells, int num_boundary_cells, - const int* boundary_cell_offset, const int* boundary_cell_id, - const double* boundary_sf, const double* boundary_vf, const double* volume, - double* grad, double* grad_boundary_init) { + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_sf, const double *boundary_vf, const double *volume, + double *grad, double *grad_boundary_init) +{ int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) return; + if (index >= num_boundary_cells) + return; int cell_offset = boundary_cell_offset[index]; int next_cell_offset = boundary_cell_offset[index + 1]; @@ -409,26 +437,27 @@ __global__ void fvc_grad_vector_boundary(int num_cells, int num_boundary_cells, double grad_zx = 0; double grad_zy = 0; double grad_zz = 0; - for (int i = cell_offset; i < next_cell_offset; i++) { - double sf_x = boundary_sf[i * 3 + 0]; - double sf_y = boundary_sf[i * 3 + 1]; - double sf_z = boundary_sf[i * 3 + 2]; - double vf_x = boundary_vf[i * 3 + 0]; - double vf_y = boundary_vf[i * 3 + 1]; - double vf_z = boundary_vf[i * 3 + 2]; - grad_xx += sf_x * vf_x; - grad_xy += sf_x * vf_y; - grad_xz += sf_x * vf_z; - grad_yx += sf_y * vf_x; - grad_yy += sf_y * vf_y; - grad_yz += sf_y * vf_z; - grad_zx += sf_z * vf_x; - grad_zy += sf_z * vf_y; - grad_zz += sf_z * vf_z; + for (int i = cell_offset; i < next_cell_offset; i++) + { + double sf_x = boundary_sf[i * 3 + 0]; + double sf_y = boundary_sf[i * 3 + 1]; + double sf_z = boundary_sf[i * 3 + 2]; + double vf_x = boundary_vf[i * 3 + 0]; + double vf_y = boundary_vf[i * 3 + 1]; + double vf_z = boundary_vf[i * 3 + 2]; + grad_xx += sf_x * vf_x; + grad_xy += sf_x * vf_y; + grad_xz += sf_x * vf_z; + grad_yx += sf_y * vf_x; + grad_yy += sf_y * vf_y; + grad_yz += sf_y * vf_z; + grad_zx += sf_z * vf_x; + grad_zy += sf_z * vf_y; + grad_zz += sf_z * vf_z; } double vol = volume[cell_index]; - + grad[cell_index * 9 + 0] += grad_xx / vol; grad[cell_index * 9 + 1] += grad_xy / vol; grad[cell_index * 9 + 2] += grad_xz / vol; @@ -448,15 +477,16 @@ __global__ void fvc_grad_vector_boundary(int num_cells, int num_boundary_cells, grad_boundary_init[index * 9 + 6] = grad[cell_index * 9 + 6]; grad_boundary_init[index * 9 + 7] = grad[cell_index * 9 + 7]; grad_boundary_init[index * 9 + 8] = grad[cell_index * 9 + 8]; - } __global__ void correct_boundary_conditions(int num_boundary_cells, - const int* boundary_cell_offset, const int* boundary_cell_id, - const double* boundary_sf, const double* mag_sf, - double* boundary_grad_init, double* boundary_grad) { + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_sf, const double *mag_sf, + double *boundary_grad_init, double *boundary_grad) +{ int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) return; + if (index >= num_boundary_cells) + return; int cell_offset = boundary_cell_offset[index]; int next_cell_offset = boundary_cell_offset[index + 1]; @@ -472,7 +502,8 @@ __global__ void correct_boundary_conditions(int num_boundary_cells, double grad_zy = boundary_grad_init[index * 9 + 7]; double grad_zz = boundary_grad_init[index * 9 + 8]; - for (int i = cell_offset; i < next_cell_offset; i++) { + for (int i = cell_offset; i < next_cell_offset; i++) + { // OpenFoam code // const vectorField n // ( @@ -503,12 +534,14 @@ __global__ void correct_boundary_conditions(int num_boundary_cells, boundary_grad[i * 9 + 6] = grad_zx + n_z * grad_correction_x; boundary_grad[i * 9 + 7] = grad_zy + n_z * grad_correction_y; boundary_grad[i * 9 + 8] = grad_zz + n_z * grad_correction_z; - } + } } -__global__ void dev2_t_tensor(int num, double* tensor) { +__global__ void dev2_t_tensor(int num, double *tensor) +{ int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num) return; + if (index >= num) + return; double t_xx = tensor[index * 9 + 0]; double t_xy = tensor[index * 9 + 1]; @@ -529,16 +562,17 @@ __global__ void dev2_t_tensor(int num, double* tensor) { tensor[index * 9 + 6] = t_xz; tensor[index * 9 + 7] = t_yz; tensor[index * 9 + 8] = t_zz - trace_coeff; - } __global__ void fvc_div_tensor_internal(int num_cells, - const int* csr_row_index, const int* csr_col_index, const int* csr_diag_index, - const double* scalar0, const double* scalar1, - const double* sf, const double* vf, const double* tlambdas, const double* volume, - const double sign, const double* b_input, double* b_output) { + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *scalar0, const double *scalar1, + const double *sf, const double *vf, const double *tlambdas, const double *volume, + const double sign, const double *b_input, double *b_output) +{ int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) return; + if (index >= num_cells) + return; // A_csr has one more element in each row: itself int row_index = csr_row_index[index]; @@ -560,9 +594,10 @@ __global__ void fvc_div_tensor_internal(int num_cells, double sum_x = 0; double sum_y = 0; double sum_z = 0; - + // lower - for (int i = 0; i < diag_index; i++) { + for (int i = 0; i < diag_index; i++) + { int neighbor_index = neighbor_offset + i; int neighbor_cell_id = csr_col_index[row_index + i]; double coeff_nei = scalar0[neighbor_cell_id] * scalar1[neighbor_cell_id]; @@ -593,7 +628,8 @@ __global__ void fvc_div_tensor_internal(int num_cells, sum_z -= sf_x * face_xz + sf_y * face_yz + sf_z * face_zz; } // upper - for (int i = diag_index + 1; i < row_elements; i++) { + for (int i = diag_index + 1; i < row_elements; i++) + { int neighbor_index = neighbor_offset + i - 1; int neighbor_cell_id = csr_col_index[row_index + i]; double coeff_nei = scalar0[neighbor_cell_id] * scalar1[neighbor_cell_id]; @@ -624,19 +660,21 @@ __global__ void fvc_div_tensor_internal(int num_cells, sum_z += sf_x * face_xz + sf_y * face_yz + sf_z * face_zz; } double vol = volume[index]; - - b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + sum_x * sign ; - b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + sum_y * sign ; - b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + sum_z * sign ; + + b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + sum_x * sign; + b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + sum_y * sign; + b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + sum_z * sign; } __global__ void fvc_div_tensor_boundary(int num_cells, int num_boundary_cells, - const int* boundary_cell_offset, const int* boundary_cell_id, - const double* boundary_scalar0, const double* boundary_scalar1, - const double* boundary_sf, const double* boundary_vf, const double* volume, - const double sign, const double* b_input, double* b_output) { + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_scalar0, const double *boundary_scalar1, + const double *boundary_sf, const double *boundary_vf, const double *volume, + const double sign, const double *b_input, double *b_output) +{ int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) return; + if (index >= num_boundary_cells) + return; int cell_offset = boundary_cell_offset[index]; int next_cell_offset = boundary_cell_offset[index + 1]; @@ -662,42 +700,45 @@ __global__ void fvc_div_tensor_boundary(int num_cells, int num_boundary_cells, // { // ivf[pFaceCells[facei]] += pssf[facei]; // } - double sum_x = 0; - double sum_y = 0; - double sum_z = 0; - for (int i = cell_offset; i < next_cell_offset; i++) { - double sf_x = boundary_sf[i * 3 + 0]; - double sf_y = boundary_sf[i * 3 + 1]; - double sf_z = boundary_sf[i * 3 + 2]; - double face_xx = boundary_vf[i * 9 + 0]; - double face_xy = boundary_vf[i * 9 + 1]; - double face_xz = boundary_vf[i * 9 + 2]; - double face_yx = boundary_vf[i * 9 + 3]; - double face_yy = boundary_vf[i * 9 + 4]; - double face_yz = boundary_vf[i * 9 + 5]; - double face_zx = boundary_vf[i * 9 + 6]; - double face_zy = boundary_vf[i * 9 + 7]; - double face_zz = boundary_vf[i * 9 + 8]; - - // if not coupled - double coeff = boundary_scalar0[i] * boundary_scalar1[i]; - sum_x += (sf_x * face_xx + sf_y * face_yx + sf_z * face_zx) * coeff; - sum_y += (sf_x * face_xy + sf_y * face_yy + sf_z * face_zy) * coeff; - sum_z += (sf_x * face_xz + sf_y * face_yz + sf_z * face_zz) * coeff; + double sum_x = 0; + double sum_y = 0; + double sum_z = 0; + for (int i = cell_offset; i < next_cell_offset; i++) + { + double sf_x = boundary_sf[i * 3 + 0]; + double sf_y = boundary_sf[i * 3 + 1]; + double sf_z = boundary_sf[i * 3 + 2]; + double face_xx = boundary_vf[i * 9 + 0]; + double face_xy = boundary_vf[i * 9 + 1]; + double face_xz = boundary_vf[i * 9 + 2]; + double face_yx = boundary_vf[i * 9 + 3]; + double face_yy = boundary_vf[i * 9 + 4]; + double face_yz = boundary_vf[i * 9 + 5]; + double face_zx = boundary_vf[i * 9 + 6]; + double face_zy = boundary_vf[i * 9 + 7]; + double face_zz = boundary_vf[i * 9 + 8]; + + // if not coupled + double coeff = boundary_scalar0[i] * boundary_scalar1[i]; + sum_x += (sf_x * face_xx + sf_y * face_yx + sf_z * face_zx) * coeff; + sum_y += (sf_x * face_xy + sf_y * face_yy + sf_z * face_zy) * coeff; + sum_z += (sf_x * face_xz + sf_y * face_yz + sf_z * face_zz) * coeff; } double vol = volume[cell_index]; - b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] + sum_x * sign ; - b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] + sum_y * sign ; - b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + sum_z * sign ; + b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] + sum_x * sign; + b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] + sum_y * sign; + b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + sum_z * sign; } __global__ void fvm_laplacian_uncorrected_vector_internal(int num_cells, int num_faces, - const int* csr_row_index, const int* csr_col_index, const int* csr_diag_index, - const double* scalar0, const double* scalar1, const double* weight, - const double* magsf, const double* distance, - const double sign, const double* A_csr_input, double* A_csr_output) { + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *scalar0, const double *scalar1, const double *weight, + const double *magsf, const double *distance, + const double sign, const double *A_csr_input, double *A_csr_output) +{ int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) return; + if (index >= num_cells) + return; // A_csr has one more element in each row: itself int row_index = csr_row_index[index]; @@ -714,7 +755,8 @@ __global__ void fvm_laplacian_uncorrected_vector_internal(int num_cells, int num // fvm.negSumDiag(); double sum_diag = 0; // lower - for (int i = 0; i < diag_index; i++) { + for (int i = 0; i < diag_index; i++) + { int neighbor_index = neighbor_offset + i; int neighbor_cell_id = csr_col_index[i + row_index]; double w = weight[neighbor_index]; @@ -727,11 +769,12 @@ __global__ void fvm_laplacian_uncorrected_vector_internal(int num_cells, int num A_csr_output[csr_dim * 0 + row_index + i] = A_csr_input[csr_dim * 0 + row_index + i] + coeff * sign; A_csr_output[csr_dim * 1 + row_index + i] = A_csr_input[csr_dim * 1 + row_index + i] + coeff * sign; A_csr_output[csr_dim * 2 + row_index + i] = A_csr_input[csr_dim * 2 + row_index + i] + coeff * sign; - + sum_diag += (-coeff); } // upper - for (int i = diag_index + 1; i < row_elements; i++) { + for (int i = diag_index + 1; i < row_elements; i++) + { int neighbor_index = neighbor_offset + i - 1; int neighbor_cell_id = csr_col_index[i + row_index]; double w = weight[neighbor_index]; @@ -752,13 +795,15 @@ __global__ void fvm_laplacian_uncorrected_vector_internal(int num_cells, int num } __global__ void fvm_laplacian_uncorrected_vector_boundary(int num_cells, int num_faces, int num_boundary_cells, - const int* csr_row_index, const int* csr_diag_index, - const int* boundary_cell_offset, const int* boundary_cell_id, - const double* boundary_scalar0, const double* boundary_scalar1, - const double* boundary_magsf, const double* gradient_internal_coeffs, const double* gradient_boundary_coeffs, - const double sign, const double* A_csr_input, const double* b_input, double* A_csr_output, double* b_output) { + const int *csr_row_index, const int *csr_diag_index, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_scalar0, const double *boundary_scalar1, + const double *boundary_magsf, const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, + const double sign, const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output) +{ int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) return; + if (index >= num_boundary_cells) + return; int cell_offset = boundary_cell_offset[index]; int next_cell_offset = boundary_cell_offset[index + 1]; @@ -789,7 +834,8 @@ __global__ void fvm_laplacian_uncorrected_vector_boundary(int num_cells, int num double boundary_coeffs_x = 0; double boundary_coeffs_y = 0; double boundary_coeffs_z = 0; - for (int i = cell_offset; i < next_cell_offset; i++) { + for (int i = cell_offset; i < next_cell_offset; i++) + { double gamma = boundary_scalar0[i] * boundary_scalar1[i]; double gamma_magsf = gamma * boundary_magsf[i]; internal_coeffs_x += gamma_magsf * gradient_internal_coeffs[i * 3 + 0]; @@ -809,8 +855,8 @@ __global__ void fvm_laplacian_uncorrected_vector_boundary(int num_cells, int num } // constructor -dfUEqn::dfUEqn(dfMatrixDataBase& dataBase, const std::string &modeStr, const std::string &cfgFile) -: dataBase_(dataBase) +dfUEqn::dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile) + : dataBase_(dataBase) { UxSolver = new AmgXSolver(modeStr, cfgFile); UySolver = new AmgXSolver(modeStr, cfgFile); @@ -832,22 +878,20 @@ dfUEqn::dfUEqn(dfMatrixDataBase& dataBase, const std::string &modeStr, const std h_b = new double[num_cells * 3]; cudaMallocHost(&h_psi, cell_vec_bytes); - checkCudaErrors(cudaMalloc((void**)&d_A_csr, csr_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_b, cell_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_psi, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_A_csr, csr_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_b, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_psi, cell_vec_bytes)); checkCudaErrors(cudaStreamCreate(&stream)); +} +void dfUEqn::fvm_ddt(double *vector_old) +{ + // initialize matrix value checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, csr_value_vec_bytes, stream)); checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream)); -} - -void dfUEqn::fvm_ddt(double *rho_old, double *rho_new, double* vector_old) -{ // Copy the host input array in host memory to the device input array in device memory clock_t start = std::clock(); - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_rho_old, rho_old, cell_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_rho_new, rho_new, cell_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(dataBase_.d_velocity_old, vector_old, cell_vec_bytes, cudaMemcpyHostToDevice, stream)); clock_t end = std::clock(); time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC); @@ -857,32 +901,32 @@ void dfUEqn::fvm_ddt(double *rho_old, double *rho_new, double* vector_old) size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvm_ddt_kernel<<>>(num_cells, num_faces, dataBase_.rdelta_t, - d_A_csr_row_index, d_A_csr_diag_index, - dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, dataBase_.d_velocity_old, d_A_csr, d_b, d_A_csr, d_b, d_psi); + d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, dataBase_.d_velocity_old, d_A_csr, d_b, d_A_csr, d_b, d_psi); end = std::clock(); time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } -void dfUEqn::fvm_div(double* phi, double* ueqn_internalCoeffs_init, double* ueqn_boundaryCoeffs_init, - double* ueqn_laplac_internalCoeffs_init, double* ueqn_laplac_boundaryCoeffs_init, - double* boundary_pressure_init, double* boundary_velocity_init, - double *boundary_nuEff_init, double *boundary_rho_init) +void dfUEqn::fvm_div(double *phi, double *ueqn_internalCoeffs_init, double *ueqn_boundaryCoeffs_init, + double *ueqn_laplac_internalCoeffs_init, double *ueqn_laplac_boundaryCoeffs_init, + double *boundary_pressure_init, double *boundary_velocity_init, + double *boundary_nuEff_init, double *boundary_rho_init) { // copy and permutate face variables clock_t start = std::clock(); // std::copy(phi, phi + num_surfaces, h_phi_init); // std::copy(phi, phi + num_surfaces, h_phi_init + num_surfaces); - // memcpy(h_phi_init, phi, num_surfaces*sizeof(double)); + memcpy(dataBase_.h_phi_init, phi, num_surfaces * sizeof(double)); // memcpy(h_phi_init + num_surfaces, phi, num_surfaces*sizeof(double)); - + clock_t end = std::clock(); time_monitor_CPU += double(end - start) / double(CLOCKS_PER_SEC); start = std::clock(); - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi_init, phi, num_surfaces*sizeof(double), cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi_init + num_surfaces, dataBase_.d_phi_init, num_surfaces*sizeof(double), cudaMemcpyDeviceToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi_init, dataBase_.h_phi_init, num_surfaces * sizeof(double), cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi_init + num_surfaces, dataBase_.d_phi_init, num_surfaces * sizeof(double), cudaMemcpyDeviceToDevice, stream)); end = std::clock(); - time_monitor_GPU_memcpy_test += double(end - start) / double(CLOCKS_PER_SEC); + time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC); start = std::clock(); size_t threads_per_block = 1024; @@ -903,47 +947,47 @@ void dfUEqn::fvm_div(double* phi, double* ueqn_internalCoeffs_init, double* ueqn checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_rho_init, boundary_rho_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); end = std::clock(); time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC); - + start = std::clock(); blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block; - boundaryPermutation<<>>(dataBase_.num_boundary_faces, dataBase_.d_bouPermedIndex, dataBase_.d_internal_coeffs_init, - dataBase_.d_boundary_coeffs_init, dataBase_.d_laplac_internal_coeffs_init, dataBase_.d_laplac_boundary_coeffs_init, dataBase_.d_boundary_pressure_init, - dataBase_.d_boundary_velocity_init, dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs, dataBase_.d_laplac_internal_coeffs, dataBase_.d_laplac_boundary_coeffs, - dataBase_.d_boundary_pressure, dataBase_.d_boundary_velocity, dataBase_.d_boundary_nuEff_init, dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho_init, dataBase_.d_boundary_rho); + boundaryPermutation<<>>(dataBase_.num_boundary_faces, dataBase_.d_bouPermedIndex, dataBase_.d_internal_coeffs_init, + dataBase_.d_boundary_coeffs_init, dataBase_.d_laplac_internal_coeffs_init, dataBase_.d_laplac_boundary_coeffs_init, dataBase_.d_boundary_pressure_init, + dataBase_.d_boundary_velocity_init, dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs, dataBase_.d_laplac_internal_coeffs, dataBase_.d_laplac_boundary_coeffs, + dataBase_.d_boundary_pressure, dataBase_.d_boundary_velocity, dataBase_.d_boundary_nuEff_init, dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho_init, dataBase_.d_boundary_rho); // launch cuda kernel blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvm_div_internal<<>>(num_cells, num_faces, - d_A_csr_row_index, d_A_csr_diag_index, - dataBase_.d_weight, dataBase_.d_phi, d_A_csr, d_b, d_A_csr, d_b); + d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_weight, dataBase_.d_phi, d_A_csr, d_b, d_A_csr, d_b); blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; fvm_div_boundary<<>>(num_cells, num_faces, num_boundary_cells, - d_A_csr_row_index, d_A_csr_diag_index, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs, d_A_csr, d_b, d_A_csr, d_b); + d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs, d_A_csr, d_b, d_A_csr, d_b); end = std::clock(); time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } -void dfUEqn::fvc_grad(double* pressure) +void dfUEqn::fvc_grad(double *pressure) { // Copy the host input array in host memory to the device input array in device memory clock_t start = std::clock(); checkCudaErrors(cudaMemcpyAsync(dataBase_.d_pressure, pressure, cell_bytes, cudaMemcpyHostToDevice, stream)); clock_t end = std::clock(); time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC); - + // launch cuda kernel start = std::clock(); size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvc_grad_internal_face<<>>(num_cells, - d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - dataBase_.d_face_vector, dataBase_.d_weight, dataBase_.d_pressure, d_b, d_b); + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + dataBase_.d_face_vector, dataBase_.d_weight, dataBase_.d_pressure, d_b, d_b); blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; fvc_grad_boundary_face<<>>(num_cells, num_boundary_cells, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - dataBase_.d_boundary_face_vector, dataBase_.d_boundary_pressure, d_b, d_b); + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_boundary_face_vector, dataBase_.d_boundary_pressure, d_b, d_b); end = std::clock(); time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } @@ -955,17 +999,17 @@ void dfUEqn::fvc_grad_vector() size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvc_grad_vector_internal<<>>(num_cells, - d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - dataBase_.d_face_vector, dataBase_.d_velocity_old, dataBase_.d_weight, dataBase_.d_volume, dataBase_.d_grad); + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + dataBase_.d_face_vector, dataBase_.d_velocity_old, dataBase_.d_weight, dataBase_.d_volume, dataBase_.d_grad); blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; fvc_grad_vector_boundary<<>>(num_cells, num_boundary_cells, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_face_vector, dataBase_.d_boundary_velocity, - dataBase_.d_volume, dataBase_.d_grad, dataBase_.d_grad_boundary_init); + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_face_vector, dataBase_.d_boundary_velocity, + dataBase_.d_volume, dataBase_.d_grad, dataBase_.d_grad_boundary_init); correct_boundary_conditions<<>>(num_boundary_cells, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_face_vector, dataBase_.d_boundary_face, - dataBase_.d_grad_boundary_init, dataBase_.d_grad_boundary); + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_face_vector, dataBase_.d_boundary_face, + dataBase_.d_grad_boundary_init, dataBase_.d_grad_boundary); clock_t end = std::clock(); time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } @@ -977,7 +1021,7 @@ void dfUEqn::dev2T() size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; dev2_t_tensor<<>>(num_cells, dataBase_.d_grad); - + blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block; dev2_t_tensor<<>>(dataBase_.num_boundary_faces, dataBase_.d_grad_boundary); clock_t end = std::clock(); @@ -996,15 +1040,15 @@ void dfUEqn::fvc_div_tensor(const double *nuEff) size_t threads_per_block = 512; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvc_div_tensor_internal<<>>(num_cells, - d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - dataBase_.d_nuEff, dataBase_.d_rho_new, dataBase_.d_face_vector, dataBase_.d_grad, dataBase_.d_weight, - dataBase_.d_volume, 1., d_b, d_b); + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + dataBase_.d_nuEff, dataBase_.d_rho_new, dataBase_.d_face_vector, dataBase_.d_grad, dataBase_.d_weight, + dataBase_.d_volume, 1., d_b, d_b); blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; fvc_div_tensor_boundary<<>>(num_cells, num_boundary_cells, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_face_vector, dataBase_.d_grad_boundary, - dataBase_.d_volume, 1., d_b, d_b); + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_face_vector, dataBase_.d_grad_boundary, + dataBase_.d_volume, 1., d_b, d_b); end = std::clock(); time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } @@ -1016,14 +1060,14 @@ void dfUEqn::fvm_laplacian() size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvm_laplacian_uncorrected_vector_internal<<>>(num_cells, num_faces, - d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, dataBase_.d_rho_new, dataBase_.d_nuEff, dataBase_.d_weight, - dataBase_.d_face, dataBase_.d_deltaCoeffs, -1., d_A_csr, d_A_csr); + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, dataBase_.d_rho_new, dataBase_.d_nuEff, dataBase_.d_weight, + dataBase_.d_face, dataBase_.d_deltaCoeffs, -1., d_A_csr, d_A_csr); blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; fvm_laplacian_uncorrected_vector_boundary<<>>(num_cells, num_faces, num_boundary_cells, - d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_face, dataBase_.d_laplac_internal_coeffs, - dataBase_.d_laplac_boundary_coeffs, -1., d_A_csr, d_b, d_A_csr, d_b); + d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_face, dataBase_.d_laplac_internal_coeffs, + dataBase_.d_laplac_boundary_coeffs, -1., d_A_csr, d_b, d_A_csr, d_b); clock_t end = std::clock(); time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } @@ -1045,17 +1089,18 @@ void dfUEqn::checkValue(bool print) char *input_file = "of_output.txt"; FILE *fp = fopen(input_file, "rb+"); - if (fp == NULL) { + if (fp == NULL) + { fprintf(stderr, "Failed to open input file: %s!\n", input_file); } int readfile = 0; - double *of_b = new double[3*num_cells]; + double *of_b = new double[3 * num_cells]; double *of_A = new double[num_faces + num_cells]; readfile = fread(of_b, num_cells * 3 * sizeof(double), 1, fp); readfile = fread(of_A, (num_faces + num_cells) * sizeof(double), 1, fp); - std::vector h_A_of_init_vec(num_cells+num_faces); - std::copy(of_A, of_A + num_cells+num_faces, h_A_of_init_vec.begin()); + std::vector h_A_of_init_vec(num_cells + num_faces); + std::copy(of_A, of_A + num_cells + num_faces, h_A_of_init_vec.begin()); std::vector h_A_of_vec_1mtx(num_faces + num_cells, 0); for (int i = 0; i < num_faces + num_cells; i++) @@ -1063,27 +1108,27 @@ void dfUEqn::checkValue(bool print) h_A_of_vec_1mtx[i] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i]]; } - std::vector h_A_of_vec((num_faces + num_cells)*3); - for (int i =0; i < 3; i ++) + std::vector h_A_of_vec((num_faces + num_cells) * 3); + for (int i = 0; i < 3; i++) { - std::copy(h_A_of_vec_1mtx.begin(), h_A_of_vec_1mtx.end(), h_A_of_vec.begin()+i*(num_faces + num_cells)); + std::copy(h_A_of_vec_1mtx.begin(), h_A_of_vec_1mtx.end(), h_A_of_vec.begin() + i * (num_faces + num_cells)); } // b - std::vector h_b_of_init_vec(3*num_cells); - std::copy(of_b, of_b + 3*num_cells, h_b_of_init_vec.begin()); + std::vector h_b_of_init_vec(3 * num_cells); + std::copy(of_b, of_b + 3 * num_cells, h_b_of_init_vec.begin()); std::vector h_b_of_vec; - for (int i = 0; i < 3*num_cells; i+=3) + for (int i = 0; i < 3 * num_cells; i += 3) { h_b_of_vec.push_back(h_b_of_init_vec[i]); } // fill RHS_y - for (int i = 1; i < 3*num_cells; i+=3) + for (int i = 1; i < 3 * num_cells; i += 3) { h_b_of_vec.push_back(h_b_of_init_vec[i]); } // fill RHS_z - for (int i = 2; i < 3*num_cells; i+=3) + for (int i = 2; i < 3 * num_cells; i += 3) { h_b_of_vec.push_back(h_b_of_init_vec[i]); } @@ -1092,7 +1137,7 @@ void dfUEqn::checkValue(bool print) { for (int i = 0; i < (num_faces + num_cells); i++) fprintf(stderr, "h_A_of_vec_1mtx[%d]: %.15lf\n", i, h_A_of_vec_1mtx[i]); - for (int i = 0; i < 3*num_cells; i++) + for (int i = 0; i < 3 * num_cells; i++) fprintf(stderr, "h_b_of_vec[%d]: %.15lf\n", i, h_b_of_vec[i]); } @@ -1100,13 +1145,13 @@ void dfUEqn::checkValue(bool print) fprintf(stderr, "check of h_A_csr\n"); checkVectorEqual(num_faces + num_cells, h_A_of_vec_1mtx.data(), h_A_csr, 1e-5); fprintf(stderr, "check of h_b\n"); - checkVectorEqual(3*num_cells, h_b_of_vec.data(), h_b, 1e-5); + checkVectorEqual(3 * num_cells, h_b_of_vec.data(), h_b, 1e-5); } void dfUEqn::solve() { // for (size_t i = 0; i < num_cells; i++) - // fprintf(stderr, "h_velocity_old[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_velocity_old[3*i], + // fprintf(stderr, "h_velocity_old[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_velocity_old[3*i], // h_velocity_old[3*i + 1], h_velocity_old[3*i + 2]); // constructor AmgXSolver at first interation // Synchronize stream @@ -1117,7 +1162,6 @@ void dfUEqn::solve() printf("CPU Time (copy&permutate) = %.6lf s\n", time_monitor_CPU); printf("GPU Time (kernel launch) = %.6lf s\n", time_monitor_GPU_kernel); printf("GPU Time (memcpy) = %.6lf s\n", time_monitor_GPU_memcpy); - printf("GPU Time (memcpy half) = %.6lf s\n", time_monitor_GPU_memcpy_test); time_monitor_CPU = 0; time_monitor_GPU_kernel = 0; time_monitor_GPU_memcpy = 0; @@ -1125,41 +1169,41 @@ void dfUEqn::solve() // nvtxRangePush("solve"); int nNz = num_cells + num_faces; // matrix entries - if (num_iteration == 0) // first interation + if (num_iteration == 0) // first interation { printf("Initializing AmgX Linear Solver\n"); UxSolver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr); UySolver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr + nNz); - UzSolver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr + 2*nNz); + UzSolver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr + 2 * nNz); } else { UxSolver->updateOperator(num_cells, nNz, d_A_csr); UySolver->updateOperator(num_cells, nNz, d_A_csr + nNz); - UzSolver->updateOperator(num_cells, nNz, d_A_csr + 2*nNz); + UzSolver->updateOperator(num_cells, nNz, d_A_csr + 2 * nNz); } UxSolver->solve(num_cells, d_psi, d_b); UySolver->solve(num_cells, d_psi + num_cells, d_b + num_cells); - UzSolver->solve(num_cells, d_psi + 2*num_cells, d_b + 2*num_cells); - num_iteration ++; + UzSolver->solve(num_cells, d_psi + 2 * num_cells, d_b + 2 * num_cells); + num_iteration++; checkCudaErrors(cudaMemcpyAsync(h_psi, d_psi, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); // for (size_t i = 0; i < num_cells; i++) - // fprintf(stderr, "h_velocity_after[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_psi[i], + // fprintf(stderr, "h_velocity_after[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_psi[i], // h_psi[num_cells + i], h_psi[num_cells*2 + i]); } -void dfUEqn::updatePsi(double* Psi) +void dfUEqn::updatePsi(double *Psi) { checkCudaErrors(cudaStreamSynchronize(stream)); for (size_t i = 0; i < num_cells; i++) { - Psi[i*3] = h_psi[i]; - Psi[i*3 + 1] = h_psi[num_cells + i]; - Psi[i*3 + 2] = h_psi[num_cells*2 + i]; + Psi[i * 3] = h_psi[i]; + Psi[i * 3 + 1] = h_psi[num_cells + i]; + Psi[i * 3 + 2] = h_psi[num_cells * 2 + i]; } } -dfUEqn::~dfUEqn(){ - +dfUEqn::~dfUEqn() +{ } \ No newline at end of file