From f5243bf960895f4404c856d67c0085974db7f77b Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Wed, 24 May 2023 21:51:08 +0800 Subject: [PATCH] preliminary refactory --- applications/solvers/dfLowMachFoam/EEqn.H | 36 +- applications/solvers/dfLowMachFoam/UEqn.H | 18 +- applications/solvers/dfLowMachFoam/YEqn.H | 14 +- .../solvers/dfLowMachFoam/createdfSolver.H | 6 +- .../solvers/dfLowMachFoam/dfLowMachFoam.C | 2 +- src_gpu/CMakeLists.txt | 2 +- src_gpu/GPUMesh.H | 23 + src_gpu/GPUfield.H | 57 + src_gpu/GPUfield.cpp | 18 + src_gpu/dfMatrix.H | 217 ---- src_gpu/dfMatrixDataBase.H | 550 +++++++++ src_gpu/dfUEqn.H | 53 + src_gpu/{dfMatrix.cu => dfUEqn.cu} | 861 +++---------- src_gpu/ueqn.cu | 1097 ----------------- 14 files changed, 896 insertions(+), 2058 deletions(-) create mode 100644 src_gpu/GPUMesh.H create mode 100644 src_gpu/GPUfield.H create mode 100644 src_gpu/GPUfield.cpp delete mode 100644 src_gpu/dfMatrix.H create mode 100644 src_gpu/dfMatrixDataBase.H create mode 100644 src_gpu/dfUEqn.H rename src_gpu/{dfMatrix.cu => dfUEqn.cu} (57%) delete mode 100644 src_gpu/ueqn.cu diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H index 9da3120c8..32d94d8d1 100644 --- a/applications/solvers/dfLowMachFoam/EEqn.H +++ b/applications/solvers/dfLowMachFoam/EEqn.H @@ -1,22 +1,24 @@ { volScalarField& he = thermo.he(); - // start1 = std::clock(); - // t.join(); - // UEqn_GPU.updatePsi(&U[0][0]); - // K = 0.5*magSqr(U); - // end1 = std::clock(); - // time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - // time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - // time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC); - // time_monitor_UinE += double(end1 - start1) / double(CLOCKS_PER_SEC); - start1 = std::clock(); - // // t.join(); - UEqn_GPU.updatePsi(&U[0][0]); - K = 0.5*magSqr(U); - end1 = std::clock(); - time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC); + #ifdef GPUSolver_ + // start1 = std::clock(); + // t.join(); + // UEqn_GPU.updatePsi(&U[0][0]); + // K = 0.5*magSqr(U); + // end1 = std::clock(); + // time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + // time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + // time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC); + // time_monitor_UinE += double(end1 - start1) / double(CLOCKS_PER_SEC); + start1 = std::clock(); + // // t.join(); + UEqn_GPU.updatePsi(&U[0][0]); + K = 0.5*magSqr(U); + end1 = std::clock(); + time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC); + #endif fvScalarMatrix EEqn ( diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H index d81f3948c..f7547af48 100644 --- a/applications/solvers/dfLowMachFoam/UEqn.H +++ b/applications/solvers/dfLowMachFoam/UEqn.H @@ -15,15 +15,15 @@ end1 = std::clock(); #endif #ifdef CPUSolver_ -// UEqn.relax(); -start1 = std::clock(); -if (pimple.momentumPredictor()) -{ - solve(UEqn); + // UEqn.relax(); + start1 = std::clock(); + if (pimple.momentumPredictor()) + { + solve(UEqn); - K = 0.5*magSqr(U); -} -end1 = std::clock(); + K = 0.5*magSqr(U); + } + end1 = std::clock(); time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_UEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); #endif @@ -101,7 +101,7 @@ end1 = std::clock(); boundary_pressure_init, boundary_velocity_init, boundary_nuEff_init, boundary_rho_init); UEqn_GPU.fvc_grad(&p[0]); UEqn_GPU.fvc_grad_vector(); - UEqn_GPU.divDevRhoReff(); + UEqn_GPU.dev2T(); UEqn_GPU.fvc_div_tensor(&nuEff[0]); UEqn_GPU.fvm_laplacian(); end1 = std::clock(); diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H index b0187baf2..8831b67ac 100644 --- a/applications/solvers/dfLowMachFoam/YEqn.H +++ b/applications/solvers/dfLowMachFoam/YEqn.H @@ -26,12 +26,14 @@ forAll(Y, i) } const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf(); -start1 = std::clock(); -// // std::thread t(&dfMatrix::solve, &UEqn_GPU); -UEqn_GPU.solve(); -end1 = std::clock(); -time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); -time_monitor_UEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); +#ifdef GPUSolver_ + start1 = std::clock(); + // // std::thread t(&dfMatrix::solve, &UEqn_GPU); + UEqn_GPU.solve(); + end1 = std::clock(); + time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); +#endif //MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); label flag_mpi_init; diff --git a/applications/solvers/dfLowMachFoam/createdfSolver.H b/applications/solvers/dfLowMachFoam/createdfSolver.H index e5946776c..79d1f2e78 100644 --- a/applications/solvers/dfLowMachFoam/createdfSolver.H +++ b/applications/solvers/dfLowMachFoam/createdfSolver.H @@ -28,8 +28,10 @@ int num_boundary_cells; string settingPath; settingPath = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string("")); -dfMatrix UEqn_GPU(num_surfaces, num_cells, num_boundary_faces, num_boundary_cells, &neighbour[0], &owner[0], &mesh.V()[0], &mesh.surfaceInterpolation::weights()[0], -&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.nonOrthDeltaCoeffs()[0], boundary_face_vector_init, boundary_face_init, boundary_deltaCoeffs_init, boundaryCellIndex, "dDDI", settingPath); +dfMatrixDataBase dfDataBase(num_surfaces, num_cells, num_boundary_faces, num_boundary_cells, &neighbour[0], &owner[0], &mesh.V()[0], &mesh.surfaceInterpolation::weights()[0], +&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.nonOrthDeltaCoeffs()[0], boundary_face_vector_init, boundary_face_init, boundary_deltaCoeffs_init, boundaryCellIndex); + +dfUEqn UEqn_GPU(dfDataBase, "dDDI", settingPath); double *ueqn_internalCoeffs_init, *ueqn_boundaryCoeffs_init, *boundary_pressure_init, *boundary_velocity_init, *boundary_nuEff_init, *boundary_rho_init, *ueqn_laplac_internalCoeffs_init, *ueqn_laplac_boundaryCoeffs_init; diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C index bcd56d242..2a0a43826 100644 --- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C +++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C @@ -59,7 +59,7 @@ Description #include "basicThermo.H" #include "CombustionModel.H" -#include "dfMatrix.H" +#include "dfUEqn.H" #include #include diff --git a/src_gpu/CMakeLists.txt b/src_gpu/CMakeLists.txt index 5bcf34327..7c48e5bb2 100644 --- a/src_gpu/CMakeLists.txt +++ b/src_gpu/CMakeLists.txt @@ -18,7 +18,7 @@ include_directories( $ENV{AMGX_DIR}/include ) -add_library(${PROJECT_NAME} SHARED dfMatrix.cu AmgXSolver.cu) +add_library(${PROJECT_NAME} SHARED dfUEqn.cu AmgXSolver.cu) target_link_libraries(${PROJECT_NAME} ${MPI_LIBRARIES} diff --git a/src_gpu/GPUMesh.H b/src_gpu/GPUMesh.H new file mode 100644 index 000000000..22cc05ed8 --- /dev/null +++ b/src_gpu/GPUMesh.H @@ -0,0 +1,23 @@ +#include + +class GPUMesh +{ +public: + int num_cells; + int num_faces; + int boundary_cells; + //... all variables needed to upload once + +public: + GPUMesh(); + ~GPUMesh(); +}; + +GPUMesh::GPUMesh() +{ + // same to the constructor of dfMatrix.C +} + +GPUMesh::~GPUMesh() +{ +} diff --git a/src_gpu/GPUfield.H b/src_gpu/GPUfield.H new file mode 100644 index 000000000..d81afbb01 --- /dev/null +++ b/src_gpu/GPUfield.H @@ -0,0 +1,57 @@ +/* +GPU field +UEqn +GPUField +rho_old, rho_new, vector_old, phi, p, nuEff, +p_bou, vector_old_bou, nuEff_bou, rho_new_bou, +ueqn_internalCoeffs, ueqn_boundaryCoeffs_init, ueqn_laplac_internalCoeffs_init, ueqn_laplac_boundaryCoeffs_init +1. initialize a map +2. implement methods to construct GPU pointer and return GPU pointer + +fvm__ddt(UEqn, rho, U) +{ + rho.oldtime() + rho. +} +*/ +#include +#include + +#define TOSTRING(x) #x + +struct GPUField { + double* cur_internal; + double* cur_boundary; + double* old_internal; + double* old_boundary; +}; +std::unordered_map GPUFields; + +// initialize: cudaMalloc, conducted at begining +void initialize(std::string U, std::string p, std::string phi); +// Q: +// 1. consider the different sizes bettween face values and cell values +// 2. not all variables need all these four terms + +// update at the end of this time step +// move current pointer as oldTime pointer +void update(); + +template class PatchField, class GeoMesh> +double* cur_internal(Foam::GeometryField var) +{ + if (!GPUFields[TOSTRING(var)].cur_internal) { + // 1. cudaMemcopy current internal field + } + return GPUFields[TOSTRING(var)].cur_internal; +} + +template class PatchField, class GeoMesh> +double* cur_boundary(Foam::GeometryField var); + +template class PatchField, class GeoMesh> +double* old_internal(Foam::GeometryField var); + +template class PatchField, class GeoMesh> +double* old_boundary(Foam::GeometryField var); + diff --git a/src_gpu/GPUfield.cpp b/src_gpu/GPUfield.cpp new file mode 100644 index 000000000..e79dbdf07 --- /dev/null +++ b/src_gpu/GPUfield.cpp @@ -0,0 +1,18 @@ +/* +GPU field +UEqn +GPUField +rho_old, rho_new, vector_old, phi, p, nuEff, +p_bou, vector_old_bou, nuEff_bou, rho_new_bou, +ueqn_internalCoeffs, ueqn_boundaryCoeffs_init, ueqn_laplac_internalCoeffs_init, ueqn_laplac_boundaryCoeffs_init +1. initialize a map +2. implement methods to construct GPU pointer and return GPU pointer + +fvm__ddt(UEqn, rho, U) +{ + rho.oldtime() + rho. +} +*/ +#include + diff --git a/src_gpu/dfMatrix.H b/src_gpu/dfMatrix.H deleted file mode 100644 index 396d5a29d..000000000 --- a/src_gpu/dfMatrix.H +++ /dev/null @@ -1,217 +0,0 @@ -#ifndef dfMatrix_H -#define dfMatrix_H - -#include -#include -#include "cuda_profiler_api.h" -#include -#include "nvtx3/nvToolsExt.h" -#include -#include -#include -#include -#include -#include - -#include "AmgXSolver.H" -# include - -static const char *_cudaGetErrorEnum(cudaError_t error) { - return cudaGetErrorName(error); -} - -template -void check(T result, char const *const func, const char *const file, - int const line) { - if (result) { - fprintf(stderr, "cuda error at %s:%d code=%d(%s) \"%s\" \n", file, line, - static_cast(result), _cudaGetErrorEnum(result), func); - exit(EXIT_FAILURE); - } -} - -#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) - -class dfMatrix -{ -private: - /** - * @brief data size - */ - // - number of cell size - int num_cells; - // - number of face size - int num_surfaces; - // - number of offdiagnal entry size (2*num_surfaces) - int num_faces; - // - number of boundary cells - int num_boundary_cells; - // - number of boundary faces - int num_boundary_faces; - - /** - * @brief input data (now these values are named according to UEqn) - */ - // - mesh variables - // - csr_row_index - int *h_A_csr_row_index=nullptr, *d_A_csr_row_index=nullptr; - // - csr_col_index - int *h_A_csr_col_index=nullptr, *d_A_csr_col_index=nullptr; - // - csr_diag_index - int *h_A_csr_diag_index=nullptr, *d_A_csr_diag_index=nullptr; - - // - the pre-permutated and post-permutated interpolation weight list - std::vector h_weight_vec_init, h_weight_vec; - // - the pre-permutated and post-permutated flux (phi) list - std::vector h_phi_vec_init, h_phi_vec; - // - the pre-permutated and post-permutated cell face vector list - std::vector h_face_vector_vec_init, h_face_vector_vec; - std::vector h_face_vec_init, h_face_vec; - std::vector h_deltaCoeffs_vec_init, h_deltaCoeffs_vec; - // - the host pointer to rho_new, rho_old, velocity_old, pressure and volume list - double *h_rho_new = nullptr, *h_rho_old = nullptr, *h_velocity_old = nullptr, - *h_pressure = nullptr; - const double *h_volume = nullptr; - // - the host pointer to the pre-permutated and post-permutated interpolation weight list - double *h_weight_init = nullptr, *h_weight = nullptr; - // - the host pointer to the pre-permutated and post-permutated flux (phi) list - double *h_phi_init = nullptr, *h_phi = nullptr; - // - the host pointer to the pre-permutated and post-permutated cell face vector list - double *h_face_vector_init = nullptr, *h_face_vector = nullptr; - double *h_face_init = nullptr, *h_face = nullptr; - double *h_deltaCoeffs_init = nullptr, *h_deltaCoeffs = nullptr; - // - the device pointer to rho_new, rho_old, velocity_old, pressure and volume list - double *d_rho_new = nullptr, *d_rho_old = nullptr, *d_velocity_old = nullptr, - *d_pressure = nullptr, *d_volume = nullptr; - // - the device pointer to the pre-permutated and post-permutated interpolation weight list - double *d_weight_init = nullptr, *d_weight = nullptr; - // - the device pointer to the pre-permutated and post-permutated flux (phi) list - double *d_phi_init = nullptr, *d_phi = nullptr; - // - the device pointer to the pre-permutated and post-permutated cell face vector list - double *d_face_vector_init = nullptr, *d_face_vector = nullptr; - double *d_face_init = nullptr, *d_face = nullptr; - double *d_deltaCoeffs_init = nullptr, *d_deltaCoeffs = nullptr; - - double rdelta_t = 1/1e-6; - - /** - * @brief boundary related variables - */ - int *h_boundary_cell_offset = nullptr, *d_boundary_cell_offset=nullptr; - int *h_boundary_cell_id = nullptr, *d_boundary_cell_id = nullptr; - double *h_internal_coeffs = nullptr, *h_boundary_coeffs = nullptr, - *h_boundary_pressure = nullptr, *h_boundary_face_vector = nullptr, - *h_boundary_face = nullptr, *d_boundary_face = nullptr, - *h_boundary_deltaCoeffs = nullptr, *d_boundary_deltaCoeffs = nullptr, - *d_internal_coeffs = nullptr, *d_boundary_coeffs = nullptr, - *d_internal_coeffs_init = nullptr, *d_boundary_coeffs_init = nullptr, - *d_laplac_internal_coeffs = nullptr, *d_laplac_boundary_coeffs = nullptr, - *d_laplac_internal_coeffs_init = nullptr, *d_laplac_boundary_coeffs_init = nullptr, - *d_boundary_pressure = nullptr, *d_boundary_face_vector = nullptr, - *d_boundary_pressure_init = nullptr, - *d_boundary_velocity = nullptr, *d_boundary_velocity_init = nullptr, - *d_boundary_nuEff = nullptr, *d_boundary_nuEff_init = nullptr, - *d_boundary_rho = nullptr, *d_boundary_rho_init = nullptr; - - std::vector boundPermutationList; - std::vector ueqn_internalCoeffs, ueqn_boundaryCoeffs; - std::vector boundary_face_vector; - std::vector boundary_pressure; - std::vector boundary_face; - std::vector boundary_deltaCoeffs; - - // - the device pointer to the permutated index list - std::vector permedIndex; - int *d_permedIndex=nullptr; - int *d_bouPermedIndex = nullptr; - - - // bytesize - // - bytes of diagnal entries - size_t cell_bytes; - // - bytes of diagnal entries (vector) - size_t cell_vec_bytes; - // - bytes of diagnal index - size_t cell_index_bytes; - // - bytes of diagnal index - size_t face_bytes; - size_t face_vec_bytes; - size_t face_index_bytes; - - size_t boundary_cell_bytes; - size_t boundary_cell_vec_bytes; - size_t boundary_cell_index_bytes; - - size_t boundary_face_bytes; - size_t boundary_face_vec_bytes; - size_t boundary_face_index_bytes; - - // A_csr has one more element in each row: itself - size_t csr_row_index_bytes; - size_t csr_col_index_bytes; - size_t csr_value_bytes; - size_t csr_value_vec_bytes; - - // extra matrix information - double *d_turbSrc_A = nullptr, *d_turbSrc_b = nullptr, *d_turbSrc_A_init = nullptr; - std::vector h_turbSrc_init_mtx_vec, h_turbSrc_init_1mtx; - std::vector h_turbSrc_init_src_vec, h_turbSrc_src_vec; - std::vector tmpPermutatedList; - int * d_tmpPermutatedList = nullptr; - - double *h_A_csr = nullptr, *h_b = nullptr, *h_psi = nullptr; - double *d_A_csr = nullptr, *d_b = nullptr, *d_psi = nullptr; - - /** - * @brief cuda functions - */ - cudaStream_t stream; - /** - * @brief AmgX functions - */ - AmgXSolver* UxSolver = nullptr, *UySolver = nullptr, *UzSolver = nullptr; - int num_iteration; - - double time_monitor_CPU; - double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test; - - double* d_grad = nullptr; - double* d_grad_boundary = nullptr, *d_grad_boundary_init = nullptr; - double* d_nuEff = nullptr; - - - -public: - dfMatrix(); - dfMatrix(int num_surfaces, int num_cells, int num_boundary_faces, int &num_boundary_cells_output, - const int *neighbour, const int *owner, const double* volume, const double* weight, const double* face_vector, const double* face, const double* deltaCoeffs, - std::vector boundary_face_vector_init, std::vector boundary_face_init, std::vector boundary_deltaCoeffs_init, - std::vector boundary_cell_id_init, const std::string &modeStr, const std::string &cfgFile); - ~dfMatrix(); - - void checkValue(bool print); - - void fvm_ddt(double *rho_old, double *rho_new, double* vector_old); - - void fvm_div(double* phi, double* ueqn_internalCoeffs_init, double* ueqn_boundaryCoeffs_init, - double* ueqn_laplac_internalCoeffs_init, double* ueqn_laplac_boundaryCoeffs_init, - double* boundary_pressure_init, double* boundary_velocity_init, - double* boundary_nuEff_init, double* boundary_rho_init); - - void fvc_grad(double* pressure); - - void fvc_grad_vector(); - - void divDevRhoReff(); - - void fvc_div_tensor(const double *nuEff); - - void fvm_laplacian(); - - void add_fvMatrix(double* turbSrc_low, double* turbSrc_diag, double* turbSrc_upp, double* turbSrc_source); - - void solve(); - void updatePsi(double* Psi); -}; - -#endif \ No newline at end of file diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H new file mode 100644 index 000000000..35f3fa21e --- /dev/null +++ b/src_gpu/dfMatrixDataBase.H @@ -0,0 +1,550 @@ +#pragma once + +#include +#include +#include "cuda_profiler_api.h" +#include +#include "nvtx3/nvToolsExt.h" +#include +#include +#include +#include +#include +#include + + +static const char *_cudaGetErrorEnum(cudaError_t error) { + return cudaGetErrorName(error); +} + +template +void check(T result, char const *const func, const char *const file, + int const line) { + if (result) { + fprintf(stderr, "cuda error at %s:%d code=%d(%s) \"%s\" \n", file, line, + static_cast(result), _cudaGetErrorEnum(result), func); + exit(EXIT_FAILURE); + } +} + +#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) + +void checkVectorEqual(int count, const double* basevec, double* vec, double max_relative_error) { + for (size_t i = 0; i < count; ++i) + { + double abs_diff = fabs(basevec[i] - vec[i]); + double rel_diff = fabs(basevec[i] - vec[i]) / fabs(basevec[i]); + if (abs_diff > 1e-12 && rel_diff > max_relative_error && !std::isinf(rel_diff)) + fprintf(stderr, "mismatch index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff); + } +} + + +struct dfMatrixDataBase +{ + // - number of cell size + int num_cells; + // - number of face size + int num_surfaces; + // - number of offdiagnal entry size (2*num_surfaces) + int num_faces; + // - number of boundary cells + int num_boundary_cells; + // - number of boundary faces + int num_boundary_faces; + + // - mesh variables + // - csr_row_index + int *h_A_csr_row_index=nullptr, *d_A_csr_row_index=nullptr; + // - csr_col_index + int *h_A_csr_col_index=nullptr, *d_A_csr_col_index=nullptr; + // - csr_diag_index + int *h_A_csr_diag_index=nullptr, *d_A_csr_diag_index=nullptr; + + // - the pre-permutated and post-permutated interpolation weight list + std::vector h_weight_vec_init, h_weight_vec; + // - the pre-permutated and post-permutated flux (phi) list + std::vector h_phi_vec_init, h_phi_vec; + // - the pre-permutated and post-permutated cell face vector list + std::vector h_face_vector_vec_init, h_face_vector_vec; + std::vector h_face_vec_init, h_face_vec; + std::vector h_deltaCoeffs_vec_init, h_deltaCoeffs_vec; + // - the host pointer to rho_new, rho_old, velocity_old, pressure and volume list + double *h_rho_new = nullptr, *h_rho_old = nullptr, *h_velocity_old = nullptr, + *h_pressure = nullptr; + const double *h_volume = nullptr; + // - the host pointer to the pre-permutated and post-permutated interpolation weight list + double *h_weight_init = nullptr, *h_weight = nullptr; + // - the host pointer to the pre-permutated and post-permutated flux (phi) list + double *h_phi_init = nullptr, *h_phi = nullptr; + // - the host pointer to the pre-permutated and post-permutated cell face vector list + double *h_face_vector_init = nullptr, *h_face_vector = nullptr; + double *h_face_init = nullptr, *h_face = nullptr; + double *h_deltaCoeffs_init = nullptr, *h_deltaCoeffs = nullptr; + // - the device pointer to rho_new, rho_old, velocity_old, pressure and volume list + double *d_rho_new = nullptr, *d_rho_old = nullptr, *d_velocity_old = nullptr, + *d_pressure = nullptr, *d_volume = nullptr; + // - the device pointer to the pre-permutated and post-permutated interpolation weight list + double *d_weight_init = nullptr, *d_weight = nullptr; + // - the device pointer to the pre-permutated and post-permutated flux (phi) list + double *d_phi_init = nullptr, *d_phi = nullptr; + // - the device pointer to the pre-permutated and post-permutated cell face vector list + double *d_face_vector_init = nullptr, *d_face_vector = nullptr; + double *d_face_init = nullptr, *d_face = nullptr; + double *d_deltaCoeffs_init = nullptr, *d_deltaCoeffs = nullptr; + + double rdelta_t = 1/1e-6; + + /** + * @brief boundary related variables + */ + int *h_boundary_cell_offset = nullptr, *d_boundary_cell_offset=nullptr; + int *h_boundary_cell_id = nullptr, *d_boundary_cell_id = nullptr; + double *h_internal_coeffs = nullptr, *h_boundary_coeffs = nullptr, + *h_boundary_pressure = nullptr, *h_boundary_face_vector = nullptr, + *h_boundary_face = nullptr, *d_boundary_face = nullptr, + *h_boundary_deltaCoeffs = nullptr, *d_boundary_deltaCoeffs = nullptr, + *d_internal_coeffs = nullptr, *d_boundary_coeffs = nullptr, + *d_internal_coeffs_init = nullptr, *d_boundary_coeffs_init = nullptr, + *d_laplac_internal_coeffs = nullptr, *d_laplac_boundary_coeffs = nullptr, + *d_laplac_internal_coeffs_init = nullptr, *d_laplac_boundary_coeffs_init = nullptr, + *d_boundary_pressure = nullptr, *d_boundary_face_vector = nullptr, + *d_boundary_pressure_init = nullptr, + *d_boundary_velocity = nullptr, *d_boundary_velocity_init = nullptr, + *d_boundary_nuEff = nullptr, *d_boundary_nuEff_init = nullptr, + *d_boundary_rho = nullptr, *d_boundary_rho_init = nullptr; + + std::vector boundPermutationList; + std::vector ueqn_internalCoeffs, ueqn_boundaryCoeffs; + std::vector boundary_face_vector; + std::vector boundary_pressure; + std::vector boundary_face; + std::vector boundary_deltaCoeffs; + + // - the device pointer to the permutated index list + std::vector permedIndex; + int *d_permedIndex=nullptr; + int *d_bouPermedIndex = nullptr; + + + // bytesize + // - bytes of diagnal entries + size_t cell_bytes; + // - bytes of diagnal entries (vector) + size_t cell_vec_bytes; + // - bytes of diagnal index + size_t cell_index_bytes; + // - bytes of diagnal index + size_t face_bytes; + size_t face_vec_bytes; + size_t face_index_bytes; + + size_t boundary_cell_bytes; + size_t boundary_cell_vec_bytes; + size_t boundary_cell_index_bytes; + + size_t boundary_face_bytes; + size_t boundary_face_vec_bytes; + size_t boundary_face_index_bytes; + + // A_csr has one more element in each row: itself + size_t csr_row_index_bytes; + size_t csr_col_index_bytes; + size_t csr_value_bytes; + size_t csr_value_vec_bytes; + + // extra matrix information + double *d_turbSrc_A = nullptr, *d_turbSrc_b = nullptr, *d_turbSrc_A_init = nullptr; + std::vector h_turbSrc_init_mtx_vec, h_turbSrc_init_1mtx; + std::vector h_turbSrc_init_src_vec, h_turbSrc_src_vec; + std::vector tmpPermutatedList; + int * d_tmpPermutatedList = nullptr; + + // double *h_A_csr = nullptr, *h_b = nullptr, *h_psi = nullptr; + // double *d_A_csr = nullptr, *d_b = nullptr, *d_psi = nullptr; + + /** + * @brief cuda functions + */ + cudaStream_t stream; + /** + * @brief AmgX functions + */ + AmgXSolver* UxSolver = nullptr, *UySolver = nullptr, *UzSolver = nullptr; + int num_iteration; + + double time_monitor_CPU; + double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test; + + double* d_grad = nullptr; + double* d_grad_boundary = nullptr, *d_grad_boundary_init = nullptr; + double* d_nuEff = nullptr; + + // constructor + dfMatrixDataBase(); + dfMatrixDataBase(int num_surfaces, int num_cells, int num_boundary_faces, int & num_boundary_cells_output, + const int *neighbour, const int *owner, const double* volume, const double* weight, const double* face_vector, const double* face, + const double* deltaCoeffs, std::vector boundary_face_vector_init, std::vector boundary_face_init, + std::vector boundary_deltaCoeffs_init, std::vector boundary_cell_id_init) + : num_cells(num_cells), num_faces(num_surfaces*2), num_surfaces(num_surfaces), num_iteration(0), + num_boundary_faces(num_boundary_faces), h_volume(volume) + { + // allocate field pointer in pin memory + cudaMallocHost(&h_phi_init, num_faces * sizeof(double)); + + h_weight_vec_init.resize(num_faces); + h_weight_vec.resize(num_faces); + h_face_vector_vec_init.resize(num_faces*3); + h_face_vector_vec.resize(num_faces*3); + h_face_vec_init.resize(num_faces); + h_face_vec.resize(num_faces); + h_deltaCoeffs_vec_init.resize(num_faces); + h_deltaCoeffs_vec.resize(num_faces); + h_turbSrc_init_mtx_vec.resize(num_faces + num_cells); + h_turbSrc_init_1mtx.resize(num_faces + num_cells); + h_turbSrc_init_src_vec.resize(3*num_cells); + h_turbSrc_src_vec.resize(3*num_cells); + + // byte sizes + cell_bytes = num_cells * sizeof(double); + cell_vec_bytes = num_cells * 3 * sizeof(double); + cell_index_bytes = num_cells * sizeof(int); + + face_bytes = num_faces * sizeof(double); + face_vec_bytes = num_faces * 3 * sizeof(double); + face_index_bytes = num_faces * sizeof(int); + + // A_csr has one more element in each row: itself + csr_row_index_bytes = (num_cells + 1) * sizeof(int); + csr_col_index_bytes = (num_cells + num_faces) * sizeof(int); + csr_value_bytes = (num_cells + num_faces) * sizeof(double); + csr_value_vec_bytes = (num_cells + num_faces) * 3 * sizeof(double); + + /************************construct mesh variables****************************/ + /** + * 1. h_csr_row_index & h_csr_diag_index + */ + std::vector h_mtxEntry_perRow_vec(num_cells); + std::vector h_csr_diag_index_vec(num_cells); + std::vector h_csr_row_index_vec(num_cells + 1, 0); + + for (int faceI = 0; faceI < num_surfaces; faceI++) + { + h_csr_diag_index_vec[neighbour[faceI]]++; + h_mtxEntry_perRow_vec[neighbour[faceI]]++; + h_mtxEntry_perRow_vec[owner[faceI]]++; + } + + // - consider diagnal element in each row + std::transform(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_mtxEntry_perRow_vec.begin(), [](int n) + {return n + 1;}); + // - construct h_csr_row_index & h_csr_diag_index + std::partial_sum(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_csr_row_index_vec.begin()+1); + // - assign h_csr_row_index & h_csr_diag_index + h_A_csr_row_index = h_csr_row_index_vec.data(); + h_A_csr_diag_index = h_csr_diag_index_vec.data(); + + /** + * 2. h_csr_col_index + */ + std::vector rowIndex(num_faces + num_cells), colIndex(num_faces + num_cells), diagIndex(num_cells); + std::iota(diagIndex.begin(), diagIndex.end(), 0); + + // initialize the RowIndex (rowIndex of lower + upper + diagnal) + std::copy(neighbour, neighbour + num_surfaces, rowIndex.begin()); + std::copy(owner, owner + num_surfaces, rowIndex.begin() + num_surfaces); + std::copy(diagIndex.begin(), diagIndex.end(), rowIndex.begin() + num_faces); + // initialize the ColIndex (colIndex of lower + upper + diagnal) + std::copy(owner, owner + num_surfaces, colIndex.begin()); + std::copy(neighbour, neighbour + num_surfaces, colIndex.begin() + num_surfaces); + std::copy(diagIndex.begin(), diagIndex.end(), colIndex.begin() + num_faces); + + // - construct hashTable for sorting + std::multimap rowColPair; + for (int i = 0; i < 2*num_surfaces+num_cells; i++) + { + rowColPair.insert(std::make_pair(rowIndex[i], colIndex[i])); + } + // - sort + std::vector> globalPerm(rowColPair.begin(), rowColPair.end()); + std::sort(globalPerm.begin(), globalPerm.end(), [] + (const std::pair& pair1, const std::pair& pair2){ + if (pair1.first != pair2.first) { + return pair1.first < pair2.first; + } else { + return pair1.second < pair2.second; + } + }); + + std::vector h_csr_col_index_vec; + std::transform(globalPerm.begin(), globalPerm.end(), std::back_inserter(h_csr_col_index_vec), [] + (const std::pair& pair) { + return pair.second; + }); + h_A_csr_col_index = h_csr_col_index_vec.data(); + + // construct a tmp permutated List for add fvMatrix + std::vector tmp_permutation(2*num_surfaces + num_cells); + std::vector tmp_rowIndex(2*num_surfaces + num_cells); + std::iota(tmp_permutation.begin(), tmp_permutation.end(), 0); + std::copy(neighbour, neighbour + num_surfaces, tmp_rowIndex.begin()); + std::copy(diagIndex.begin(), diagIndex.end(), tmp_rowIndex.begin() + num_surfaces); + std::copy(owner, owner + num_surfaces, tmp_rowIndex.begin() + num_surfaces + num_cells); + std::multimap tmpPair; + for (int i = 0; i < 2*num_surfaces+num_cells; i++) + { + tmpPair.insert(std::make_pair(tmp_rowIndex[i], tmp_permutation[i])); + } + std::vector> tmpPerm(tmpPair.begin(), tmpPair.end()); + std::sort(tmpPerm.begin(), tmpPerm.end(), [] + (const std::pair& pair1, const std::pair& pair2){ + if (pair1.first != pair2.first) { + return pair1.first < pair2.first; + } else { + return pair1.second < pair2.second; + } + }); + std::transform(tmpPerm.begin(), tmpPerm.end(), std::back_inserter(tmpPermutatedList), [] + (const std::pair& pair) { + return pair.second; + }); + + /** + * 3. boundary imformations + */ + // get boundPermutation and offset lists + std::vector boundPermutationListInit(num_boundary_faces); + std::vector boundOffsetList; + std::iota(boundPermutationListInit.begin(), boundPermutationListInit.end(), 0); + + // - construct hashTable for sorting + std::multimap boundPermutation; + for (int i = 0; i < num_boundary_faces; i++) + { + boundPermutation.insert(std::make_pair(boundary_cell_id_init[i], boundPermutationListInit[i])); + } + + // - sort + std::vector> boundPermPair(boundPermutation.begin(), boundPermutation.end()); + std::sort(boundPermPair.begin(), boundPermPair.end(), [] + (const std::pair& pair1, const std::pair& pair2){ + if (pair1.first != pair2.first) { + return pair1.first < pair2.first; + } else { + return pair1.second < pair2.second; + } + }); + + // - construct boundPermedIndex and boundary_cell_id + std::vector boundary_cell_id; + boundPermutationList.clear(); + std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundary_cell_id), [] + (const std::pair& pair) { + return pair.first; + }); + std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundPermutationList), [] + (const std::pair& pair) { + return pair.second; + }); + + // construct boundary_cell_offset + std::map countMap; + std::vector boundaryCellcount; + for (const auto& cellIndex : boundary_cell_id) + ++ countMap[cellIndex]; + for (const auto& [cellIndex, count] : countMap) + boundaryCellcount.push_back(count); + + num_boundary_cells = boundaryCellcount.size(); + num_boundary_cells_output = num_boundary_cells; + + std::vector boundary_cell_offset(boundaryCellcount.size() + 1, 0); + std::partial_sum(boundaryCellcount.begin(), boundaryCellcount.end(), boundary_cell_offset.begin()+1); + + // assign h_boundary_cell_offset & h_boundary_cell_id + h_boundary_cell_offset = boundary_cell_offset.data(); + h_boundary_cell_id = boundary_cell_id.data(); + + // + boundary_cell_bytes = num_boundary_cells * sizeof(double); + boundary_cell_vec_bytes = num_boundary_cells * 3 * sizeof(double); + boundary_cell_index_bytes = num_boundary_cells * sizeof(int); + + boundary_face_bytes = num_boundary_faces * sizeof(double); + boundary_face_vec_bytes = num_boundary_faces * 3 * sizeof(double); + boundary_face_index_bytes = num_boundary_faces * sizeof(int); + + ueqn_internalCoeffs.resize(3*num_boundary_faces); + ueqn_boundaryCoeffs.resize(3*num_boundary_faces); + + boundary_face_vector.resize(3*num_boundary_faces); + boundary_pressure.resize(num_boundary_faces); + boundary_face.resize(num_boundary_faces); + boundary_deltaCoeffs.resize(num_boundary_faces); + + /** + * 4. permutation list for field variables + */ + std::vector offdiagRowIndex(2*num_surfaces), permIndex(2*num_surfaces); + // - initialize the offdiagRowIndex (rowIndex of lower + rowIndex of upper) + std::copy(neighbour, neighbour + num_surfaces, offdiagRowIndex.begin()); + std::copy(owner, owner + num_surfaces, offdiagRowIndex.begin() + num_surfaces); + + // - initialize the permIndex (0, 1, ..., 2*num_surfaces) + std::iota(permIndex.begin(), permIndex.end(), 0); + + // - construct hashTable for sorting + std::multimap permutation; + for (int i = 0; i < 2*num_surfaces; i++) + { + permutation.insert(std::make_pair(offdiagRowIndex[i], permIndex[i])); + } + // - sort + std::vector> permPair(permutation.begin(), permutation.end()); + std::sort(permPair.begin(), permPair.end(), [] + (const std::pair& pair1, const std::pair& pair2){ + if (pair1.first != pair2.first) { + return pair1.first < pair2.first; + } else { + return pair1.second < pair2.second; + } + }); + // - form permedIndex list + std::transform(permPair.begin(), permPair.end(), std::back_inserter(permedIndex), [] + (const std::pair& pair) { + return pair.second; + }); + + // copy and permutate cell variables + std::copy(weight, weight + num_surfaces, h_weight_vec_init.begin()); + std::copy(weight, weight + num_surfaces, h_weight_vec_init.begin() + num_surfaces); + std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin()); + std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin() + 3*num_surfaces); + std::copy(face, face + num_surfaces, h_face_vec_init.begin()); + std::copy(face, face + num_surfaces, h_face_vec_init.begin() + num_surfaces); + std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin()); + std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin() + num_surfaces); + for (int i = 0; i < num_faces; i++) + { + h_weight_vec[i] = h_weight_vec_init[permedIndex[i]]; + h_face_vec[i] = h_face_vec_init[permedIndex[i]]; + h_deltaCoeffs_vec[i] = h_deltaCoeffs_vec_init[permedIndex[i]]; + h_face_vector_vec[i*3] = h_face_vector_vec_init[3*permedIndex[i]]; + h_face_vector_vec[i*3+1] = h_face_vector_vec_init[3*permedIndex[i]+1]; + h_face_vector_vec[i*3+2] = h_face_vector_vec_init[3*permedIndex[i]+2]; + } + h_weight = h_weight_vec.data(); + h_face_vector = h_face_vector_vec.data(); + h_face = h_face_vec.data(); + h_deltaCoeffs = h_deltaCoeffs_vec.data(); + + for (int i = 0; i < num_boundary_faces; i++) + { + boundary_face_vector[3*i] = boundary_face_vector_init[3*boundPermutationList[i]]; + boundary_face_vector[3*i+1] = boundary_face_vector_init[3*boundPermutationList[i]+1]; + boundary_face_vector[3*i+2] = boundary_face_vector_init[3*boundPermutationList[i]+2]; + boundary_face[i] = boundary_face_init[boundPermutationList[i]]; + boundary_deltaCoeffs[i] = boundary_deltaCoeffs_init[boundPermutationList[i]]; + } + h_boundary_face_vector = boundary_face_vector.data(); + h_boundary_face = boundary_face.data(); + h_boundary_deltaCoeffs = boundary_deltaCoeffs.data(); + + /************************allocate memory on device****************************/ + int total_bytes = 0; + + checkCudaErrors(cudaMalloc((void**)&d_A_csr_row_index, csr_row_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_A_csr_col_index, csr_col_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_A_csr_diag_index, cell_index_bytes)); + total_bytes += (csr_row_index_bytes + csr_col_index_bytes + cell_index_bytes); + + checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_rho_new, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_volume, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_pressure, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_velocity_old, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_weight, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_face, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_deltaCoeffs, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_phi, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_phi_init, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_face_vector, face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_nuEff, cell_bytes)); + total_bytes += (cell_bytes * 5 + face_bytes * 5 + cell_vec_bytes + face_vec_bytes); + + checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int))); + checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_id, boundary_face_index_bytes)); + total_bytes += (boundary_face_index_bytes + (num_boundary_cells+1) * sizeof(int)); + + checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure_init, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity_init, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_face_vector, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_face, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_deltaCoeffs, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_init, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_init, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_init, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_init, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff_init, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_rho, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_init, boundary_face_bytes)); + total_bytes += (boundary_face_bytes*8 + boundary_face_vec_bytes * 11); + + // checkCudaErrors(cudaMalloc((void**)&d_A_csr, csr_value_vec_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_b, cell_vec_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_psi, cell_vec_bytes)); + total_bytes += (boundary_face_bytes + boundary_face_vec_bytes * 3); + + checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A, csr_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A_init, csr_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_turbSrc_b, cell_vec_bytes)); + total_bytes += (2*csr_value_bytes + cell_vec_bytes); + + checkCudaErrors(cudaMalloc((void**)&d_permedIndex, face_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_bouPermedIndex, boundary_face_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_tmpPermutatedList, csr_col_index_bytes)); + total_bytes += (face_index_bytes + boundary_face_index_bytes + csr_col_index_bytes); + + checkCudaErrors(cudaMalloc((void**)&d_grad, num_cells * 9 * sizeof(double))); + checkCudaErrors(cudaMalloc((void**)&d_grad_boundary, boundary_face_bytes * 9)); + checkCudaErrors(cudaMalloc((void**)&d_grad_boundary_init, boundary_cell_bytes * 9)); + total_bytes += (num_cells * 9 * sizeof(double) + boundary_face_bytes * 9 + boundary_cell_bytes * 9); // FIXME: rename + + + fprintf(stderr, "Total bytes malloc on GPU: %.2fMB\n", total_bytes * 1.0 / 1024 / 1024); + + checkCudaErrors(cudaStreamCreate(&stream)); + + checkCudaErrors(cudaMemcpyAsync(d_A_csr_row_index, h_A_csr_row_index, csr_row_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_A_csr_col_index, h_A_csr_col_index, csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_A_csr_diag_index, h_A_csr_diag_index, cell_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_offset, h_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int), cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_id, h_boundary_cell_id, boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + + checkCudaErrors(cudaMemcpyAsync(d_volume, h_volume, cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_weight, h_weight, face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_face, h_face, face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_deltaCoeffs, h_deltaCoeffs, face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_face_vector, h_face_vector, face_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_face_vector, h_boundary_face_vector, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_face, h_boundary_face, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_deltaCoeffs, h_boundary_deltaCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + + checkCudaErrors(cudaMemcpyAsync(d_permedIndex, permedIndex.data(), face_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_bouPermedIndex, boundPermutationList.data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_tmpPermutatedList, tmpPermutatedList.data(), csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); + }; + ~dfMatrixDataBase(){ + std::cout << "Destructor called." << std::endl; + // TODO: free pointers + + }; +}; + diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H new file mode 100644 index 000000000..c7d2c7914 --- /dev/null +++ b/src_gpu/dfUEqn.H @@ -0,0 +1,53 @@ +#pragma once + +#include "AmgXSolver.H" +#include +#include "dfMatrixDataBase.H" + +class dfUEqn +{ +private: + dfMatrixDataBase& dataBase_; + cudaStream_t stream; + AmgXSolver* UxSolver, *UySolver, *UzSolver = nullptr; + int num_iteration; + double time_monitor_CPU; + double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test; + + // common variables + int num_cells, cell_bytes, num_faces, num_surfaces, cell_vec_bytes, csr_value_vec_bytes, num_boundary_cells; + int *d_A_csr_row_index, *d_A_csr_diag_index, *d_A_csr_col_index; + + // Matrix variables + double *d_A_csr, *d_b, *d_psi = nullptr; + double *h_A_csr, *h_b, *h_psi = nullptr; + +public: + dfUEqn(); + dfUEqn(dfMatrixDataBase& dataBase, const std::string &modeStr, const std::string &cfgFile); + ~dfUEqn(); + + void checkValue(bool print); + + void fvm_ddt(double *rho_old, double *rho_new, double* vector_old); + + void fvm_div(double* phi, double* ueqn_internalCoeffs_init, double* ueqn_boundaryCoeffs_init, + double* ueqn_laplac_internalCoeffs_init, double* ueqn_laplac_boundaryCoeffs_init, + double* boundary_pressure_init, double* boundary_velocity_init, + double* boundary_nuEff_init, double* boundary_rho_init); + + void fvc_grad(double* pressure); + + void fvc_grad_vector(); + + void dev2T(); + + void fvc_div_tensor(const double *nuEff); + + void fvm_laplacian(); + + void add_fvMatrix(double* turbSrc_low, double* turbSrc_diag, double* turbSrc_upp, double* turbSrc_source); + + void solve(); + void updatePsi(double* Psi); +}; \ No newline at end of file diff --git a/src_gpu/dfMatrix.cu b/src_gpu/dfUEqn.cu similarity index 57% rename from src_gpu/dfMatrix.cu rename to src_gpu/dfUEqn.cu index c06b09780..7a2297b4e 100644 --- a/src_gpu/dfMatrix.cu +++ b/src_gpu/dfUEqn.cu @@ -1,14 +1,4 @@ -#include "dfMatrix.H" - -void checkVectorEqual(int count, const double* basevec, double* vec, double max_relative_error) { - for (size_t i = 0; i < count; ++i) - { - double abs_diff = fabs(basevec[i] - vec[i]); - double rel_diff = fabs(basevec[i] - vec[i]) / fabs(basevec[i]); - if (abs_diff > 1e-12 && rel_diff > max_relative_error && !std::isinf(rel_diff)) - fprintf(stderr, "mismatch index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff); - } -} +#include "dfUEqn.H" // kernel functions __global__ void fvm_ddt_kernel(int num_cells, int num_faces, const double rdelta_t, @@ -182,9 +172,9 @@ __global__ void fvc_grad_internal_face(int num_cells, grad_bz_upp += face_p * sfz; } } - b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + grad_bx_low + grad_bx_upp; - b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + grad_by_low + grad_by_upp; - b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + grad_bz_low + grad_bz_upp; + b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] - grad_bx_low - grad_bx_upp; + b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] - grad_by_low - grad_by_upp; + b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] - grad_bz_low - grad_bz_upp; } __global__ void fvc_grad_boundary_face(int num_cells, int num_boundary_cells, @@ -222,9 +212,9 @@ __global__ void fvc_grad_boundary_face(int num_cells, int num_boundary_cells, //grad_by += ny * grad_correction; //grad_bz += nz * grad_correction; - b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] + grad_bx; - b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] + grad_by; - b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + grad_bz; + b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] - grad_bx; + b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] - grad_by; + b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] - grad_bz; } __global__ void add_fvMatrix_kernel(int num_cells, int num_faces, @@ -818,409 +808,47 @@ __global__ void fvm_laplacian_uncorrected_vector_boundary(int num_cells, int num b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + boundary_coeffs_z * sign; } -// constructor (construct mesh variable) -dfMatrix::dfMatrix(){} -dfMatrix::dfMatrix(int num_surfaces, int num_cells, int num_boundary_faces, int & num_boundary_cells_output, - const int *neighbour, const int *owner, const double* volume, const double* weight, const double* face_vector, const double* face, const double* deltaCoeffs, - std::vector boundary_face_vector_init, std::vector boundary_face_init, std::vector boundary_deltaCoeffs_init, - std::vector boundary_cell_id_init, const std::string &modeStr, const std::string &cfgFile) -: num_cells(num_cells), num_faces(num_surfaces*2), num_surfaces(num_surfaces), num_iteration(0), - num_boundary_faces(num_boundary_faces), h_volume(volume), time_monitor_CPU(0.), time_monitor_GPU_kernel(0.), time_monitor_GPU_memcpy(0.), - time_monitor_GPU_memcpy_test(0.) -{ - // - unsigned int flags = 0; - checkCudaErrors(cudaGetDeviceFlags(&flags)); - flags |= cudaDeviceScheduleYield; - checkCudaErrors(cudaSetDeviceFlags(flags)); - - // initialize vectors - // allocate field vector in pin memory - cudaMallocHost(&h_phi_init, num_faces * sizeof(double)); - // h_phi_vec_init.assign(h_phi_init, h_phi_init + num_faces); - - h_weight_vec_init.resize(num_faces); - h_weight_vec.resize(num_faces); - h_face_vector_vec_init.resize(num_faces*3); - h_face_vector_vec.resize(num_faces*3); - h_face_vec_init.resize(num_faces); - h_face_vec.resize(num_faces); - h_deltaCoeffs_vec_init.resize(num_faces); - h_deltaCoeffs_vec.resize(num_faces); - h_turbSrc_init_mtx_vec.resize(num_faces + num_cells); - h_turbSrc_init_1mtx.resize(num_faces + num_cells); - h_turbSrc_init_src_vec.resize(3*num_cells); - h_turbSrc_src_vec.resize(3*num_cells); - - // byte sizes - cell_bytes = num_cells * sizeof(double); - cell_vec_bytes = num_cells * 3 * sizeof(double); - cell_index_bytes = num_cells * sizeof(int); - - face_bytes = num_faces * sizeof(double); - face_vec_bytes = num_faces * 3 * sizeof(double); - face_index_bytes = num_faces * sizeof(int); +// constructor +dfUEqn::dfUEqn(dfMatrixDataBase& dataBase, const std::string &modeStr, const std::string &cfgFile) +: dataBase_(dataBase) +{ + UxSolver = new AmgXSolver(modeStr, cfgFile); + UySolver = new AmgXSolver(modeStr, cfgFile); + UzSolver = new AmgXSolver(modeStr, cfgFile); - // A_csr has one more element in each row: itself - csr_row_index_bytes = (num_cells + 1) * sizeof(int); - csr_col_index_bytes = (num_cells + num_faces) * sizeof(int); - csr_value_bytes = (num_cells + num_faces) * sizeof(double); - csr_value_vec_bytes = (num_cells + num_faces) * 3 * sizeof(double); + num_cells = dataBase_.num_cells; + cell_bytes = dataBase_.cell_bytes; + num_faces = dataBase_.num_faces; + cell_vec_bytes = dataBase_.cell_vec_bytes; + csr_value_vec_bytes = dataBase_.csr_value_vec_bytes; + num_boundary_cells = dataBase_.num_boundary_cells; + num_surfaces = dataBase_.num_surfaces; + + d_A_csr_row_index = dataBase_.d_A_csr_row_index; + d_A_csr_diag_index = dataBase_.d_A_csr_diag_index; + d_A_csr_col_index = dataBase_.d_A_csr_col_index; h_A_csr = new double[(num_cells + num_faces) * 3]; - // h_b = new double[num_cells * 3]; - // h_psi = new double[num_cells * 3]; - cudaMallocHost(&h_b, cell_vec_bytes); + h_b = new double[num_cells * 3]; cudaMallocHost(&h_psi, cell_vec_bytes); - /************************construct mesh variables****************************/ - /** - * 1. h_csr_row_index & h_csr_diag_index - */ - std::vector h_mtxEntry_perRow_vec(num_cells); - std::vector h_csr_diag_index_vec(num_cells); - std::vector h_csr_row_index_vec(num_cells + 1, 0); - - for (int faceI = 0; faceI < num_surfaces; faceI++) - { - h_csr_diag_index_vec[neighbour[faceI]]++; - h_mtxEntry_perRow_vec[neighbour[faceI]]++; - h_mtxEntry_perRow_vec[owner[faceI]]++; - } - - // - consider diagnal element in each row - std::transform(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_mtxEntry_perRow_vec.begin(), [](int n) - {return n + 1;}); - // - construct h_csr_row_index & h_csr_diag_index - std::partial_sum(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_csr_row_index_vec.begin()+1); - // - assign h_csr_row_index & h_csr_diag_index - h_A_csr_row_index = h_csr_row_index_vec.data(); - h_A_csr_diag_index = h_csr_diag_index_vec.data(); - - /** - * 2. h_csr_col_index - */ - std::vector rowIndex(num_faces + num_cells), colIndex(num_faces + num_cells), diagIndex(num_cells); - std::iota(diagIndex.begin(), diagIndex.end(), 0); - - // initialize the RowIndex (rowIndex of lower + upper + diagnal) - std::copy(neighbour, neighbour + num_surfaces, rowIndex.begin()); - std::copy(owner, owner + num_surfaces, rowIndex.begin() + num_surfaces); - std::copy(diagIndex.begin(), diagIndex.end(), rowIndex.begin() + num_faces); - // initialize the ColIndex (colIndex of lower + upper + diagnal) - std::copy(owner, owner + num_surfaces, colIndex.begin()); - std::copy(neighbour, neighbour + num_surfaces, colIndex.begin() + num_surfaces); - std::copy(diagIndex.begin(), diagIndex.end(), colIndex.begin() + num_faces); - - // - construct hashTable for sorting - std::multimap rowColPair; - for (int i = 0; i < 2*num_surfaces+num_cells; i++) - { - rowColPair.insert(std::make_pair(rowIndex[i], colIndex[i])); - } - // - sort - std::vector> globalPerm(rowColPair.begin(), rowColPair.end()); - std::sort(globalPerm.begin(), globalPerm.end(), [] - (const std::pair& pair1, const std::pair& pair2){ - if (pair1.first != pair2.first) { - return pair1.first < pair2.first; - } else { - return pair1.second < pair2.second; - } - }); - - std::vector h_csr_col_index_vec; - std::transform(globalPerm.begin(), globalPerm.end(), std::back_inserter(h_csr_col_index_vec), [] - (const std::pair& pair) { - return pair.second; - }); - h_A_csr_col_index = h_csr_col_index_vec.data(); - - // construct a tmp permutated List for add fvMatrix - std::vector tmp_permutation(2*num_surfaces + num_cells); - std::vector tmp_rowIndex(2*num_surfaces + num_cells); - std::iota(tmp_permutation.begin(), tmp_permutation.end(), 0); - std::copy(neighbour, neighbour + num_surfaces, tmp_rowIndex.begin()); - std::copy(diagIndex.begin(), diagIndex.end(), tmp_rowIndex.begin() + num_surfaces); - std::copy(owner, owner + num_surfaces, tmp_rowIndex.begin() + num_surfaces + num_cells); - std::multimap tmpPair; - for (int i = 0; i < 2*num_surfaces+num_cells; i++) - { - tmpPair.insert(std::make_pair(tmp_rowIndex[i], tmp_permutation[i])); - } - std::vector> tmpPerm(tmpPair.begin(), tmpPair.end()); - std::sort(tmpPerm.begin(), tmpPerm.end(), [] - (const std::pair& pair1, const std::pair& pair2){ - if (pair1.first != pair2.first) { - return pair1.first < pair2.first; - } else { - return pair1.second < pair2.second; - } - }); - std::transform(tmpPerm.begin(), tmpPerm.end(), std::back_inserter(tmpPermutatedList), [] - (const std::pair& pair) { - return pair.second; - }); - - - /** - * 3. boundary imformations - */ - // get boundPermutation and offset lists - std::vector boundPermutationListInit(num_boundary_faces); - std::vector boundOffsetList; - std::iota(boundPermutationListInit.begin(), boundPermutationListInit.end(), 0); - - // - construct hashTable for sorting - std::multimap boundPermutation; - for (int i = 0; i < num_boundary_faces; i++) - { - boundPermutation.insert(std::make_pair(boundary_cell_id_init[i], boundPermutationListInit[i])); - } - - // - sort - std::vector> boundPermPair(boundPermutation.begin(), boundPermutation.end()); - std::sort(boundPermPair.begin(), boundPermPair.end(), [] - (const std::pair& pair1, const std::pair& pair2){ - if (pair1.first != pair2.first) { - return pair1.first < pair2.first; - } else { - return pair1.second < pair2.second; - } - }); - - // - construct boundPermedIndex and boundary_cell_id - std::vector boundary_cell_id; - boundPermutationList.clear(); - std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundary_cell_id), [] - (const std::pair& pair) { - return pair.first; - }); - std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundPermutationList), [] - (const std::pair& pair) { - return pair.second; - }); - - // construct boundary_cell_offset - std::map countMap; - std::vector boundaryCellcount; - for (const auto& cellIndex : boundary_cell_id) - ++ countMap[cellIndex]; - for (const auto& [cellIndex, count] : countMap) - boundaryCellcount.push_back(count); - - num_boundary_cells = boundaryCellcount.size(); - num_boundary_cells_output = num_boundary_cells; - - std::vector boundary_cell_offset(boundaryCellcount.size() + 1, 0); - std::partial_sum(boundaryCellcount.begin(), boundaryCellcount.end(), boundary_cell_offset.begin()+1); - - // assign h_boundary_cell_offset & h_boundary_cell_id - h_boundary_cell_offset = boundary_cell_offset.data(); - h_boundary_cell_id = boundary_cell_id.data(); - - // - boundary_cell_bytes = num_boundary_cells * sizeof(double); - boundary_cell_vec_bytes = num_boundary_cells * 3 * sizeof(double); - boundary_cell_index_bytes = num_boundary_cells * sizeof(int); - - boundary_face_bytes = num_boundary_faces * sizeof(double); - boundary_face_vec_bytes = num_boundary_faces * 3 * sizeof(double); - boundary_face_index_bytes = num_boundary_faces * sizeof(int); - - ueqn_internalCoeffs.resize(3*num_boundary_faces); - ueqn_boundaryCoeffs.resize(3*num_boundary_faces); - - boundary_face_vector.resize(3*num_boundary_faces); - boundary_pressure.resize(num_boundary_faces); - boundary_face.resize(num_boundary_faces); - boundary_deltaCoeffs.resize(num_boundary_faces); - - - /** - * 4. permutation list for field variables - */ - std::vector offdiagRowIndex(2*num_surfaces), permIndex(2*num_surfaces); - // - initialize the offdiagRowIndex (rowIndex of lower + rowIndex of upper) - std::copy(neighbour, neighbour + num_surfaces, offdiagRowIndex.begin()); - std::copy(owner, owner + num_surfaces, offdiagRowIndex.begin() + num_surfaces); - - // - initialize the permIndex (0, 1, ..., 2*num_surfaces) - std::iota(permIndex.begin(), permIndex.end(), 0); - - // - construct hashTable for sorting - std::multimap permutation; - for (int i = 0; i < 2*num_surfaces; i++) - { - permutation.insert(std::make_pair(offdiagRowIndex[i], permIndex[i])); - } - // - sort - std::vector> permPair(permutation.begin(), permutation.end()); - std::sort(permPair.begin(), permPair.end(), [] - (const std::pair& pair1, const std::pair& pair2){ - if (pair1.first != pair2.first) { - return pair1.first < pair2.first; - } else { - return pair1.second < pair2.second; - } - }); - // - form permedIndex list - std::transform(permPair.begin(), permPair.end(), std::back_inserter(permedIndex), [] - (const std::pair& pair) { - return pair.second; - }); - - // copy and permutate cell variables - std::copy(weight, weight + num_surfaces, h_weight_vec_init.begin()); - std::copy(weight, weight + num_surfaces, h_weight_vec_init.begin() + num_surfaces); - std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin()); - std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin() + 3*num_surfaces); - std::copy(face, face + num_surfaces, h_face_vec_init.begin()); - std::copy(face, face + num_surfaces, h_face_vec_init.begin() + num_surfaces); - std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin()); - std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin() + num_surfaces); - for (int i = 0; i < num_faces; i++) - { - h_weight_vec[i] = h_weight_vec_init[permedIndex[i]]; - h_face_vec[i] = h_face_vec_init[permedIndex[i]]; - h_deltaCoeffs_vec[i] = h_deltaCoeffs_vec_init[permedIndex[i]]; - h_face_vector_vec[i*3] = h_face_vector_vec_init[3*permedIndex[i]]; - h_face_vector_vec[i*3+1] = h_face_vector_vec_init[3*permedIndex[i]+1]; - h_face_vector_vec[i*3+2] = h_face_vector_vec_init[3*permedIndex[i]+2]; - } - h_weight = h_weight_vec.data(); - h_face_vector = h_face_vector_vec.data(); - h_face = h_face_vec.data(); - h_deltaCoeffs = h_deltaCoeffs_vec.data(); - - for (int i = 0; i < num_boundary_faces; i++) - { - boundary_face_vector[3*i] = boundary_face_vector_init[3*boundPermutationList[i]]; - boundary_face_vector[3*i+1] = boundary_face_vector_init[3*boundPermutationList[i]+1]; - boundary_face_vector[3*i+2] = boundary_face_vector_init[3*boundPermutationList[i]+2]; - boundary_face[i] = boundary_face_init[boundPermutationList[i]]; - boundary_deltaCoeffs[i] = boundary_deltaCoeffs_init[boundPermutationList[i]]; - } - h_boundary_face_vector = boundary_face_vector.data(); - h_boundary_face = boundary_face.data(); - h_boundary_deltaCoeffs = boundary_deltaCoeffs.data(); - - /************************allocate memory on device****************************/ - int total_bytes = 0; - - checkCudaErrors(cudaMalloc((void**)&d_A_csr_row_index, csr_row_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_A_csr_col_index, csr_col_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_A_csr_diag_index, cell_index_bytes)); - total_bytes += (csr_row_index_bytes + csr_col_index_bytes + cell_index_bytes); - - checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_rho_new, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_volume, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_pressure, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_velocity_old, cell_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_weight, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_face, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_deltaCoeffs, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_phi, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_phi_init, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_face_vector, face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_nuEff, cell_bytes)); - total_bytes += (cell_bytes * 5 + face_bytes * 5 + cell_vec_bytes + face_vec_bytes); - - checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int))); - checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_id, boundary_face_index_bytes)); - total_bytes += (boundary_face_index_bytes + (num_boundary_cells+1) * sizeof(int)); - - checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure_init, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity_init, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_face_vector, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_face, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_deltaCoeffs, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_init, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_init, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_init, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_init, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff_init, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_rho, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_init, boundary_face_bytes)); - total_bytes += (boundary_face_bytes*8 + boundary_face_vec_bytes * 11); - checkCudaErrors(cudaMalloc((void**)&d_A_csr, csr_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_b, cell_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_psi, cell_vec_bytes)); - total_bytes += (boundary_face_bytes + boundary_face_vec_bytes * 3); - - checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A, csr_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A_init, csr_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_turbSrc_b, cell_vec_bytes)); - total_bytes += (2*csr_value_bytes + cell_vec_bytes); - - checkCudaErrors(cudaMalloc((void**)&d_permedIndex, face_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_bouPermedIndex, boundary_face_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_tmpPermutatedList, csr_col_index_bytes)); - total_bytes += (face_index_bytes + boundary_face_index_bytes + csr_col_index_bytes); - - checkCudaErrors(cudaMalloc((void**)&d_grad, num_cells * 9 * sizeof(double))); - checkCudaErrors(cudaMalloc((void**)&d_grad_boundary, boundary_face_bytes * 9)); - checkCudaErrors(cudaMalloc((void**)&d_grad_boundary_init, boundary_cell_bytes * 9)); - total_bytes += (num_cells * 9 * sizeof(double) + boundary_face_bytes * 9 + boundary_cell_bytes * 9); // FIXME: rename - - - fprintf(stderr, "Total bytes malloc on GPU: %.2fMB\n", total_bytes * 1.0 / 1024 / 1024); checkCudaErrors(cudaStreamCreate(&stream)); checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, csr_value_vec_bytes, stream)); checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream)); - - checkCudaErrors(cudaMemcpyAsync(d_A_csr_row_index, h_A_csr_row_index, csr_row_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_A_csr_col_index, h_A_csr_col_index, csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_A_csr_diag_index, h_A_csr_diag_index, cell_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_offset, h_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int), cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_id, h_boundary_cell_id, boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); - - checkCudaErrors(cudaMemcpyAsync(d_volume, h_volume, cell_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_weight, h_weight, face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_face, h_face, face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_deltaCoeffs, h_deltaCoeffs, face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_face_vector, h_face_vector, face_vec_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_face_vector, h_boundary_face_vector, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_face, h_boundary_face, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_deltaCoeffs, h_boundary_deltaCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - - checkCudaErrors(cudaMemcpyAsync(d_permedIndex, permedIndex.data(), face_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_bouPermedIndex, boundPermutationList.data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_tmpPermutatedList, tmpPermutatedList.data(), csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); - - // construct AmgXSolver - UxSolver = new AmgXSolver(modeStr, cfgFile); - UySolver = new AmgXSolver(modeStr, cfgFile); - UzSolver = new AmgXSolver(modeStr, cfgFile); } -dfMatrix::~dfMatrix() -{ -} - -void dfMatrix::fvm_ddt(double *rho_old, double *rho_new, double* vector_old) -{ - // copy cell variables directly - h_rho_new = rho_new; - h_rho_old = rho_old; - h_velocity_old = vector_old; - +void dfUEqn::fvm_ddt(double *rho_old, double *rho_new, double* vector_old) +{ // Copy the host input array in host memory to the device input array in device memory clock_t start = std::clock(); - checkCudaErrors(cudaMemcpyAsync(d_rho_old, h_rho_old, cell_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_rho_new, h_rho_new, cell_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_velocity_old, h_velocity_old, cell_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_rho_old, rho_old, cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_rho_new, rho_new, cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_velocity_old, vector_old, cell_vec_bytes, cudaMemcpyHostToDevice, stream)); clock_t end = std::clock(); time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC); @@ -1228,14 +856,14 @@ void dfMatrix::fvm_ddt(double *rho_old, double *rho_new, double* vector_old) start = std::clock(); size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - fvm_ddt_kernel<<>>(num_cells, num_faces, rdelta_t, + fvm_ddt_kernel<<>>(num_cells, num_faces, dataBase_.rdelta_t, d_A_csr_row_index, d_A_csr_diag_index, - d_rho_old, d_rho_new, d_volume, d_velocity_old, d_A_csr, d_b, d_A_csr, d_b, d_psi); + dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, dataBase_.d_velocity_old, d_A_csr, d_b, d_A_csr, d_b, d_psi); end = std::clock(); time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } -void dfMatrix::fvm_div(double* phi, double* ueqn_internalCoeffs_init, double* ueqn_boundaryCoeffs_init, +void dfUEqn::fvm_div(double* phi, double* ueqn_internalCoeffs_init, double* ueqn_boundaryCoeffs_init, double* ueqn_laplac_internalCoeffs_init, double* ueqn_laplac_boundaryCoeffs_init, double* boundary_pressure_init, double* boundary_velocity_init, double *boundary_nuEff_init, double *boundary_rho_init) @@ -1244,70 +872,65 @@ void dfMatrix::fvm_div(double* phi, double* ueqn_internalCoeffs_init, double* ue clock_t start = std::clock(); // std::copy(phi, phi + num_surfaces, h_phi_init); // std::copy(phi, phi + num_surfaces, h_phi_init + num_surfaces); - memcpy(h_phi_init, phi, num_surfaces*sizeof(double)); + // memcpy(h_phi_init, phi, num_surfaces*sizeof(double)); // memcpy(h_phi_init + num_surfaces, phi, num_surfaces*sizeof(double)); clock_t end = std::clock(); time_monitor_CPU += double(end - start) / double(CLOCKS_PER_SEC); start = std::clock(); - checkCudaErrors(cudaMemcpyAsync(d_phi_init, h_phi_init, num_surfaces*sizeof(double), cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_phi_init + num_surfaces, d_phi_init, num_surfaces*sizeof(double), cudaMemcpyDeviceToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi_init, phi, num_surfaces*sizeof(double), cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi_init + num_surfaces, dataBase_.d_phi_init, num_surfaces*sizeof(double), cudaMemcpyDeviceToDevice, stream)); end = std::clock(); time_monitor_GPU_memcpy_test += double(end - start) / double(CLOCKS_PER_SEC); start = std::clock(); size_t threads_per_block = 1024; size_t blocks_per_grid = (num_faces + threads_per_block - 1) / threads_per_block; - offdiagPermutation<<>>(num_faces, d_permedIndex, d_phi_init, d_phi); + offdiagPermutation<<>>(num_faces, dataBase_.d_permedIndex, dataBase_.d_phi_init, dataBase_.d_phi); end = std::clock(); time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); // copy and permutate boundary variable start = std::clock(); - checkCudaErrors(cudaMemcpyAsync(d_internal_coeffs_init, ueqn_internalCoeffs_init, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_coeffs_init, ueqn_boundaryCoeffs_init, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_laplac_internal_coeffs_init, ueqn_laplac_internalCoeffs_init, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_laplac_boundary_coeffs_init, ueqn_laplac_boundaryCoeffs_init, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_velocity_init, boundary_velocity_init, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_pressure_init, boundary_pressure_init, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_nuEff_init, boundary_nuEff_init, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_rho_init, boundary_rho_init, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_internal_coeffs_init, ueqn_internalCoeffs_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_coeffs_init, ueqn_boundaryCoeffs_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_laplac_internal_coeffs_init, ueqn_laplac_internalCoeffs_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_laplac_boundary_coeffs_init, ueqn_laplac_boundaryCoeffs_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_velocity_init, boundary_velocity_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_pressure_init, boundary_pressure_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_nuEff_init, boundary_nuEff_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_rho_init, boundary_rho_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); end = std::clock(); time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC); start = std::clock(); - blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block; - boundaryPermutation<<>>(num_boundary_faces, d_bouPermedIndex, d_internal_coeffs_init, d_boundary_coeffs_init, - d_laplac_internal_coeffs_init, d_laplac_boundary_coeffs_init, d_boundary_pressure_init, d_boundary_velocity_init, d_internal_coeffs, d_boundary_coeffs, - d_laplac_internal_coeffs, d_laplac_boundary_coeffs, d_boundary_pressure, d_boundary_velocity, d_boundary_nuEff_init, d_boundary_nuEff, d_boundary_rho_init, d_boundary_rho); + blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block; + boundaryPermutation<<>>(dataBase_.num_boundary_faces, dataBase_.d_bouPermedIndex, dataBase_.d_internal_coeffs_init, + dataBase_.d_boundary_coeffs_init, dataBase_.d_laplac_internal_coeffs_init, dataBase_.d_laplac_boundary_coeffs_init, dataBase_.d_boundary_pressure_init, + dataBase_.d_boundary_velocity_init, dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs, dataBase_.d_laplac_internal_coeffs, dataBase_.d_laplac_boundary_coeffs, + dataBase_.d_boundary_pressure, dataBase_.d_boundary_velocity, dataBase_.d_boundary_nuEff_init, dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho_init, dataBase_.d_boundary_rho); // launch cuda kernel blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvm_div_internal<<>>(num_cells, num_faces, d_A_csr_row_index, d_A_csr_diag_index, - d_weight, d_phi, d_A_csr, d_b, d_A_csr, d_b); + dataBase_.d_weight, dataBase_.d_phi, d_A_csr, d_b, d_A_csr, d_b); blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; fvm_div_boundary<<>>(num_cells, num_faces, num_boundary_cells, d_A_csr_row_index, d_A_csr_diag_index, - d_boundary_cell_offset, d_boundary_cell_id, - d_internal_coeffs, d_boundary_coeffs, d_A_csr, d_b, d_A_csr, d_b); + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs, d_A_csr, d_b, d_A_csr, d_b); end = std::clock(); time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } -void dfMatrix::fvc_grad(double* pressure) +void dfUEqn::fvc_grad(double* pressure) { - // copy cell variables directly + // Copy the host input array in host memory to the device input array in device memory clock_t start = std::clock(); - h_pressure = pressure; + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_pressure, pressure, cell_bytes, cudaMemcpyHostToDevice, stream)); clock_t end = std::clock(); - time_monitor_CPU += double(end - start) / double(CLOCKS_PER_SEC); - - // Copy the host input array in host memory to the device input array in device memory - start = std::clock(); - checkCudaErrors(cudaMemcpyAsync(d_pressure, h_pressure, cell_bytes, cudaMemcpyHostToDevice, stream)); - end = std::clock(); time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC); // launch cuda kernel @@ -1316,39 +939,96 @@ void dfMatrix::fvc_grad(double* pressure) size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvc_grad_internal_face<<>>(num_cells, d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - d_face_vector, d_weight, d_pressure, d_b, d_b); + dataBase_.d_face_vector, dataBase_.d_weight, dataBase_.d_pressure, d_b, d_b); blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; fvc_grad_boundary_face<<>>(num_cells, num_boundary_cells, - d_boundary_cell_offset, d_boundary_cell_id, - d_boundary_face_vector, d_boundary_pressure, d_b, d_b); + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_boundary_face_vector, dataBase_.d_boundary_pressure, d_b, d_b); end = std::clock(); time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } -void dfMatrix::add_fvMatrix(double* turbSrc_low, double* turbSrc_diag, double* turbSrc_upp, double* turbSrc_source) +void dfUEqn::fvc_grad_vector() { - // copy and permutate matrix variables - std::copy(turbSrc_low, turbSrc_low + num_surfaces, h_turbSrc_init_mtx_vec.begin()); - std::copy(turbSrc_diag, turbSrc_diag + num_cells, h_turbSrc_init_mtx_vec.begin() + num_surfaces); - std::copy(turbSrc_upp, turbSrc_upp + num_surfaces, h_turbSrc_init_mtx_vec.begin() + num_surfaces + num_cells); - std::copy(turbSrc_source, turbSrc_source + 3*num_cells, h_turbSrc_init_src_vec.begin()); - - // permutate - checkCudaErrors(cudaMemcpyAsync(d_turbSrc_A_init, h_turbSrc_init_mtx_vec.data(), csr_value_bytes, cudaMemcpyHostToDevice, stream)); + clock_t start = std::clock(); + // launch CUDA kernal size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_cells + num_faces + threads_per_block - 1) / threads_per_block; - // for (int i = 0; i < num_cells+2*num_surfaces; i++) - // h_turbSrc_init_1mtx[i] = h_turbSrc_init_mtx_vec[tmpPermutatedList[i]]; - csrPermutation<<>>(num_cells + num_faces, d_tmpPermutatedList, d_turbSrc_A_init, d_turbSrc_A); + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_vector_internal<<>>(num_cells, + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + dataBase_.d_face_vector, dataBase_.d_velocity_old, dataBase_.d_weight, dataBase_.d_volume, dataBase_.d_grad); - // Copy the host input array in host memory to the device input array in device memory - checkCudaErrors(cudaMemcpyAsync(d_turbSrc_b, h_turbSrc_init_src_vec.data(), cell_vec_bytes, cudaMemcpyHostToDevice, stream)); - blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - add_fvMatrix_kernel<<>>(num_cells, num_faces, - d_A_csr_row_index, d_turbSrc_A, d_turbSrc_b, d_A_csr, d_b, d_A_csr, d_b); + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_vector_boundary<<>>(num_cells, num_boundary_cells, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_face_vector, dataBase_.d_boundary_velocity, + dataBase_.d_volume, dataBase_.d_grad, dataBase_.d_grad_boundary_init); + + correct_boundary_conditions<<>>(num_boundary_cells, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_face_vector, dataBase_.d_boundary_face, + dataBase_.d_grad_boundary_init, dataBase_.d_grad_boundary); + clock_t end = std::clock(); + time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); +} + +void dfUEqn::dev2T() +{ + clock_t start = std::clock(); + // launch CUDA kernal + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + dev2_t_tensor<<>>(num_cells, dataBase_.d_grad); + + blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block; + dev2_t_tensor<<>>(dataBase_.num_boundary_faces, dataBase_.d_grad_boundary); + clock_t end = std::clock(); + time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); +} + +void dfUEqn::fvc_div_tensor(const double *nuEff) +{ + clock_t start = std::clock(); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_nuEff, nuEff, cell_bytes, cudaMemcpyHostToDevice, stream)); + clock_t end = std::clock(); + time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC); + + // launch cuda kernel + start = std::clock(); + size_t threads_per_block = 512; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvc_div_tensor_internal<<>>(num_cells, + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + dataBase_.d_nuEff, dataBase_.d_rho_new, dataBase_.d_face_vector, dataBase_.d_grad, dataBase_.d_weight, + dataBase_.d_volume, 1., d_b, d_b); + + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvc_div_tensor_boundary<<>>(num_cells, num_boundary_cells, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_face_vector, dataBase_.d_grad_boundary, + dataBase_.d_volume, 1., d_b, d_b); + end = std::clock(); + time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); +} + +void dfUEqn::fvm_laplacian() +{ + clock_t start = std::clock(); + // launch CUDA kernels + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvm_laplacian_uncorrected_vector_internal<<>>(num_cells, num_faces, + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, dataBase_.d_rho_new, dataBase_.d_nuEff, dataBase_.d_weight, + dataBase_.d_face, dataBase_.d_deltaCoeffs, -1., d_A_csr, d_A_csr); + + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvm_laplacian_uncorrected_vector_boundary<<>>(num_cells, num_faces, num_boundary_cells, + d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_face, dataBase_.d_laplac_internal_coeffs, + dataBase_.d_laplac_boundary_coeffs, -1., d_A_csr, d_b, d_A_csr, d_b); + clock_t end = std::clock(); + time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } -void dfMatrix::checkValue(bool print) +void dfUEqn::checkValue(bool print) { checkCudaErrors(cudaMemcpyAsync(h_A_csr, d_A_csr, csr_value_vec_bytes, cudaMemcpyDeviceToHost, stream)); checkCudaErrors(cudaMemcpyAsync(h_b, d_b, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); @@ -1357,7 +1037,7 @@ void dfMatrix::checkValue(bool print) checkCudaErrors(cudaStreamSynchronize(stream)); if (print) { - for (int i = 0; i < (2*num_surfaces + num_cells); i++) + for (int i = 0; i < (num_faces + num_cells); i++) fprintf(stderr, "h_A_csr[%d]: %.15lf\n", i, h_A_csr[i]); for (int i = 0; i < num_cells * 3; i++) fprintf(stderr, "h_b[%d]: %.15lf\n", i, h_b[i]); @@ -1370,23 +1050,23 @@ void dfMatrix::checkValue(bool print) } int readfile = 0; double *of_b = new double[3*num_cells]; - double *of_A = new double[2*num_surfaces + num_cells]; + double *of_A = new double[num_faces + num_cells]; readfile = fread(of_b, num_cells * 3 * sizeof(double), 1, fp); - readfile = fread(of_A, (2*num_surfaces + num_cells) * sizeof(double), 1, fp); + readfile = fread(of_A, (num_faces + num_cells) * sizeof(double), 1, fp); - std::vector h_A_of_init_vec(num_cells+2*num_surfaces); - std::copy(of_A, of_A + num_cells+2*num_surfaces, h_A_of_init_vec.begin()); + std::vector h_A_of_init_vec(num_cells+num_faces); + std::copy(of_A, of_A + num_cells+num_faces, h_A_of_init_vec.begin()); - std::vector h_A_of_vec_1mtx(2*num_surfaces + num_cells, 0); - for (int i = 0; i < 2*num_surfaces + num_cells; i++) + std::vector h_A_of_vec_1mtx(num_faces + num_cells, 0); + for (int i = 0; i < num_faces + num_cells; i++) { - h_A_of_vec_1mtx[i] = h_A_of_init_vec[tmpPermutatedList[i]]; + h_A_of_vec_1mtx[i] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i]]; } - std::vector h_A_of_vec((2*num_surfaces + num_cells)*3); + std::vector h_A_of_vec((num_faces + num_cells)*3); for (int i =0; i < 3; i ++) { - std::copy(h_A_of_vec_1mtx.begin(), h_A_of_vec_1mtx.end(), h_A_of_vec.begin()+i*(2*num_surfaces + num_cells)); + std::copy(h_A_of_vec_1mtx.begin(), h_A_of_vec_1mtx.end(), h_A_of_vec.begin()+i*(num_faces + num_cells)); } // b @@ -1410,7 +1090,7 @@ void dfMatrix::checkValue(bool print) if (print) { - for (int i = 0; i < (2*num_surfaces + num_cells); i++) + for (int i = 0; i < (num_faces + num_cells); i++) fprintf(stderr, "h_A_of_vec_1mtx[%d]: %.15lf\n", i, h_A_of_vec_1mtx[i]); for (int i = 0; i < 3*num_cells; i++) fprintf(stderr, "h_b_of_vec[%d]: %.15lf\n", i, h_b_of_vec[i]); @@ -1418,12 +1098,12 @@ void dfMatrix::checkValue(bool print) // check fprintf(stderr, "check of h_A_csr\n"); - checkVectorEqual(2*num_surfaces + num_cells, h_A_of_vec_1mtx.data(), h_A_csr, 1e-5); + checkVectorEqual(num_faces + num_cells, h_A_of_vec_1mtx.data(), h_A_csr, 1e-5); fprintf(stderr, "check of h_b\n"); checkVectorEqual(3*num_cells, h_b_of_vec.data(), h_b, 1e-5); } -void dfMatrix::solve() +void dfUEqn::solve() { // for (size_t i = 0; i < num_cells; i++) // fprintf(stderr, "h_velocity_old[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_velocity_old[3*i], @@ -1468,7 +1148,8 @@ void dfMatrix::solve() // fprintf(stderr, "h_velocity_after[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_psi[i], // h_psi[num_cells + i], h_psi[num_cells*2 + i]); } -void dfMatrix::updatePsi(double* Psi) + +void dfUEqn::updatePsi(double* Psi) { checkCudaErrors(cudaStreamSynchronize(stream)); for (size_t i = 0; i < num_cells; i++) @@ -1478,243 +1159,7 @@ void dfMatrix::updatePsi(double* Psi) Psi[i*3 + 2] = h_psi[num_cells*2 + i]; } } -void dfMatrix::fvc_grad_vector() -{ - clock_t start = std::clock(); - // launch CUDA kernal - size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - fvc_grad_vector_internal<<>>(num_cells, - d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - d_face_vector, d_velocity_old, d_weight, d_volume, d_grad); - - blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; - fvc_grad_vector_boundary<<>>(num_cells, num_boundary_cells, - d_boundary_cell_offset, d_boundary_cell_id, d_boundary_face_vector, d_boundary_velocity, d_volume, - d_grad, d_grad_boundary_init); - - correct_boundary_conditions<<>>(num_boundary_cells, - d_boundary_cell_offset, d_boundary_cell_id, d_boundary_face_vector, d_boundary_face, - d_grad_boundary_init, d_grad_boundary); - clock_t end = std::clock(); - time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); -} -void dfMatrix::divDevRhoReff() -{ - clock_t start = std::clock(); - // launch CUDA kernal - size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - dev2_t_tensor<<>>(num_cells, d_grad); - - blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block; - dev2_t_tensor<<>>(num_boundary_faces, d_grad_boundary); - clock_t end = std::clock(); - time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); - - #ifdef DEBUG - double* h_grad = new double[num_cells * 9]; - double* h_grad_boundary = new double[num_boundary_faces * 9]; - - // check value of internal field - checkCudaErrors(cudaMemcpy(h_grad, d_grad, num_cells * 9 * sizeof(double), cudaMemcpyDeviceToHost)); - for (int i = 0; i < num_cells; i++) - fprintf(stderr, "fvc_grad_tensor_T[%d]: (%.2lf, %.2lf, %.2lf, %.2lf, %.2lf, %.2lf, %.2lf, %.2lf, %.2lf)\n", i, - h_grad[i * 9 + 0], h_grad[i * 9 + 1], h_grad[i * 9 + 2], - h_grad[i * 9 + 3], h_grad[i * 9 + 4], h_grad[i * 9 + 5], - h_grad[i * 9 + 6], h_grad[i * 9 + 7], h_grad[i * 9 + 8]); - fprintf(stderr, "error of dev2(T(fvc::grad(U)))\n"); - checkVectorEqual(num_cells * 9, ref_grad_T, h_grad, 1); - - // check value of boundary field - checkCudaErrors(cudaMemcpy(h_grad_boundary, d_grad_boundary, num_boundary_faces * 9 * sizeof(double), cudaMemcpyDeviceToHost)); - for (int i = 0; i < num_boundary_faces; i++) - fprintf(stderr, "df_boundary_gradU_T[%d]: (%.2lf, %.2lf, %.2lf, %.2lf, %.2lf, %.2lf, %.2lf, %.2lf, %.2lf)\n", i, - h_grad_boundary[i * 9 + 0], h_grad_boundary[i * 9 + 1], h_grad_boundary[i * 9 + 2], - h_grad_boundary[i * 9 + 3], h_grad_boundary[i * 9 + 4], h_grad_boundary[i * 9 + 5], - h_grad_boundary[i * 9 + 6], h_grad_boundary[i * 9 + 7], h_grad_boundary[i * 9 + 8]); - - std::vector boundary_gradU_T(num_boundary_faces*9); - for (int i = 0; i < num_boundary_faces; i++) - { - boundary_gradU_T[9*i] = ref_grad_T_boundary[9*boundPermutationList[i]]; - boundary_gradU_T[9*i+1] = ref_grad_T_boundary[9*boundPermutationList[i]+1]; - boundary_gradU_T[9*i+2] = ref_grad_T_boundary[9*boundPermutationList[i]+2]; - boundary_gradU_T[9*i+3] = ref_grad_T_boundary[9*boundPermutationList[i]+3]; - boundary_gradU_T[9*i+4] = ref_grad_T_boundary[9*boundPermutationList[i]+4]; - boundary_gradU_T[9*i+5] = ref_grad_T_boundary[9*boundPermutationList[i]+5]; - boundary_gradU_T[9*i+6] = ref_grad_T_boundary[9*boundPermutationList[i]+6]; - boundary_gradU_T[9*i+7] = ref_grad_T_boundary[9*boundPermutationList[i]+7]; - boundary_gradU_T[9*i+8] = ref_grad_T_boundary[9*boundPermutationList[i]+8]; - } - for (int i = 0; i < num_boundary_faces; i++) - fprintf(stderr, "of_boundary_gradU_T[%d]: (%.2lf, %.2lf, %.2lf, %.2lf, %.2lf, %.2lf, %.2lf, %.2lf, %.2lf)\n", i, - boundary_gradU_T[i * 9 + 0], boundary_gradU_T[i * 9 + 1], boundary_gradU_T[i * 9 + 2], - boundary_gradU_T[i * 9 + 3], boundary_gradU_T[i * 9 + 4], boundary_gradU_T[i * 9 + 5], - boundary_gradU_T[i * 9 + 6], boundary_gradU_T[i * 9 + 7], boundary_gradU_T[i * 9 + 8]); - - fprintf(stderr, "error of dev2(T(fvc::grad(U))) boundary\n"); - checkVectorEqual(num_boundary_faces * 9, boundary_gradU_T.data(), h_grad_boundary, 1); - #endif -} -void dfMatrix::fvc_div_tensor(const double *nuEff) -{ - #ifdef DEBUG - double* d_b_test = nullptr, *h_b_test = new double[3*num_cells]; - double* d_grad_test = nullptr, *h_grad_test = new double[9*num_cells]; - double* d_grad_boundary_test = nullptr,*h_grad_boundary_test = new double[num_boundary_faces * 9]; - double* h_weight_test = new double[num_faces]; - - checkCudaErrors(cudaMalloc((void**)&d_b_test, cell_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_grad_test, 9*num_cells*sizeof(double))); - checkCudaErrors(cudaMalloc((void**)&d_grad_boundary_test, 9*num_boundary_faces*sizeof(double))); - - checkCudaErrors(cudaMemsetAsync(d_b_test, 0, cell_vec_bytes, stream)); - #endif - clock_t start = std::clock(); - checkCudaErrors(cudaMemcpyAsync(d_nuEff, nuEff, cell_bytes, cudaMemcpyHostToDevice, stream)); - clock_t end = std::clock(); - time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC); - - // launch cuda kernel - start = std::clock(); - size_t threads_per_block = 512; - size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - fvc_div_tensor_internal<<>>(num_cells, - d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - d_nuEff, d_rho_new, d_face_vector, d_grad, d_weight, d_volume, 1., d_b, d_b); - - blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block; - fvc_div_tensor_boundary<<>>(num_cells, num_boundary_cells, - d_boundary_cell_offset, d_boundary_cell_id, - d_boundary_nuEff, d_boundary_rho, d_boundary_face_vector, d_grad_boundary, d_volume, 1., d_b, d_b); - end = std::clock(); - time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); - - #ifdef DEBUG - checkCudaErrors(cudaMemcpyAsync(h_b_test, d_b_test, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaMemcpyAsync(h_grad_test, d_grad_test, 9*num_cells*sizeof(double), cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaMemcpyAsync(h_grad_boundary_test, d_grad_boundary_test, 9*num_boundary_faces*sizeof(double), cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaMemcpyAsync(h_weight_test, d_weight, num_faces*sizeof(double), cudaMemcpyDeviceToHost, stream)); - - for (int i = 0; i < num_cells; i++) - fprintf(stderr, "h_b[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_b_test[i], h_b_test[num_cells + i], - h_b_test[2*num_cells + i]); - - fprintf(stderr,"error of fvc_div\n"); - std::vector of_ref_div_init_vec(3*num_cells); - std::copy(of_ref_div, of_ref_div + 3*num_cells, of_ref_div_init_vec.begin()); - std::vector of_ref_div_vec; - for (int i = 0; i < 3*num_cells; i+=3) - { - of_ref_div_vec.push_back(of_ref_div_init_vec[i]); - } - // fill RHS_y - for (int i = 1; i < 3*num_cells; i+=3) - { - of_ref_div_vec.push_back(of_ref_div_init_vec[i]); - } - // fill RHS_z - for (int i = 2; i < 3*num_cells; i+=3) - { - of_ref_div_vec.push_back(of_ref_div_init_vec[i]); - } - checkVectorEqual(3*num_cells, of_ref_div_vec.data(), h_b_test, 0.1); - - for (int i = 0; i < num_cells; i++) - fprintf(stderr, "fvc_grad_test[%d]: (%.7lf, %.7lf, %.7lf, %.7lf, %.7lf, %.7lf, %.7lf, %.7lf, %.7lf)\n", i, - h_grad_test[i * 9 + 0], h_grad_test[i * 9 + 1], h_grad_test[i * 9 + 2], - h_grad_test[i * 9 + 3], h_grad_test[i * 9 + 4], h_grad_test[i * 9 + 5], - h_grad_test[i * 9 + 6], h_grad_test[i * 9 + 7], h_grad_test[i * 9 + 8]); - fprintf(stderr,"error of 'this->alpha_*this->rho_*this->nuEff())*dev2(T(fvc::grad(U))'\n"); - checkVectorEqual(9*num_cells, of_ref_grad, h_grad_test, 0.1); - - for (int i = 0; i < num_boundary_faces; i++) - fprintf(stderr, "fvc_grad_boundary_test[%d]: (%.9lf, %.9lf, %.9lf, %.9lf, %.9lf, %.9lf, %.9lf, %.9lf, %.9lf)\n", i, - h_grad_boundary_test[i * 9 + 0], h_grad_boundary_test[i * 9 + 1], h_grad_boundary_test[i * 9 + 2], - h_grad_boundary_test[i * 9 + 3], h_grad_boundary_test[i * 9 + 4], h_grad_boundary_test[i * 9 + 5], - h_grad_boundary_test[i * 9 + 6], h_grad_boundary_test[i * 9 + 7], h_grad_boundary_test[i * 9 + 8]); - - std::vector boundary_coeff_gradU(num_boundary_faces*9); - for (int i = 0; i < num_boundary_faces; i++) - { - boundary_coeff_gradU[9*i] = of_ref_boundary[9*boundPermutationList[i]]; - boundary_coeff_gradU[9*i+1] = of_ref_boundary[9*boundPermutationList[i]+1]; - boundary_coeff_gradU[9*i+2] = of_ref_boundary[9*boundPermutationList[i]+2]; - boundary_coeff_gradU[9*i+3] = of_ref_boundary[9*boundPermutationList[i]+3]; - boundary_coeff_gradU[9*i+4] = of_ref_boundary[9*boundPermutationList[i]+4]; - boundary_coeff_gradU[9*i+5] = of_ref_boundary[9*boundPermutationList[i]+5]; - boundary_coeff_gradU[9*i+6] = of_ref_boundary[9*boundPermutationList[i]+6]; - boundary_coeff_gradU[9*i+7] = of_ref_boundary[9*boundPermutationList[i]+7]; - boundary_coeff_gradU[9*i+8] = of_ref_boundary[9*boundPermutationList[i]+8]; - } - for (int i = 0; i < num_boundary_faces; i++) - printf("of_boundary_gradU[%d]: (%.9lf, %.9lf, %.9lf, %.9lf, %.9lf, %.9lf, %.9lf, %.9lf, %.9lf)\n", i, - boundary_coeff_gradU[i * 9 + 0], boundary_coeff_gradU[i * 9 + 1], boundary_coeff_gradU[i * 9 + 2], - boundary_coeff_gradU[i * 9 + 3], boundary_coeff_gradU[i * 9 + 4], boundary_coeff_gradU[i * 9 + 5], - boundary_coeff_gradU[i * 9 + 6], boundary_coeff_gradU[i * 9 + 7], boundary_coeff_gradU[i * 9 + 8]); - - fprintf(stderr,"error of boundary of fvc::div\n"); - checkVectorEqual(9*num_boundary_faces, boundary_coeff_gradU.data(), h_grad_boundary_test, 1e-5); - #endif -} - -void dfMatrix::fvm_laplacian() -{ - #ifdef DEBUG - double* d_b_test = nullptr, *h_b_test = new double[3*num_cells]; - double* d_A_csr_test = nullptr, *h_A_csr_test = new double[(2*num_surfaces + num_cells)*3]; - - checkCudaErrors(cudaMalloc((void**)&d_b_test, cell_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_A_csr_test, csr_value_vec_bytes)); - - checkCudaErrors(cudaMemsetAsync(d_A_csr_test, 0, csr_value_vec_bytes, stream)); - checkCudaErrors(cudaMemsetAsync(d_b_test, 0, cell_vec_bytes, stream)); - #endif +dfUEqn::~dfUEqn(){ - clock_t start = std::clock(); - // launch CUDA kernels - size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - fvm_laplacian_uncorrected_vector_internal<<>>(num_cells, num_faces, - d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, d_rho_new, d_nuEff, d_weight, d_face, d_deltaCoeffs, -1., d_A_csr, d_A_csr); - - blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; - fvm_laplacian_uncorrected_vector_boundary<<>>(num_cells, num_faces, num_boundary_cells, - d_A_csr_row_index, d_A_csr_diag_index, d_boundary_cell_offset, d_boundary_cell_id, - d_boundary_nuEff, d_boundary_rho, d_boundary_face, d_laplac_internal_coeffs, d_laplac_boundary_coeffs, - -1., d_A_csr, d_b, d_A_csr, d_b); - clock_t end = std::clock(); - time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); - - #ifdef DEBUG - checkCudaErrors(cudaMemcpy(h_A_csr_test, d_A_csr_test, csr_value_vec_bytes, cudaMemcpyDeviceToHost)); - - char *input_file = "of_ref_laplac.txt"; - FILE *fp = fopen(input_file, "rb+"); - if (fp == NULL) { - fprintf(stderr, "Failed to open input file: %s!\n", input_file); - } - int readfile = 0; - double *of_A_laplac = new double[2*num_surfaces + num_cells]; - readfile = fread(of_A_laplac, (2*num_surfaces + num_cells) * sizeof(double), 1, fp); - - std::vector h_A_of_laplac_init_vec(num_cells+2*num_surfaces); - std::copy(of_A_laplac, of_A_laplac + num_cells+2*num_surfaces, h_A_of_laplac_init_vec.begin()); - - std::vector h_A_of_vec_1mtx(2*num_surfaces + num_cells, 0); - for (int i = 0; i < 2*num_surfaces + num_cells; i++) - { - h_A_of_vec_1mtx[i] = h_A_of_laplac_init_vec[tmpPermutatedList[i]]; - } - - std::vector h_A_of_vec((2*num_surfaces + num_cells)*3); - for (int i =0; i < 3; i ++) - { - std::copy(h_A_of_vec_1mtx.begin(), h_A_of_vec_1mtx.end(), h_A_of_vec.begin()+i*(2*num_surfaces + num_cells)); - } - fprintf(stderr,"error of fvm::laplacian\n"); - checkVectorEqual((2*num_surfaces + num_cells)*3, h_A_of_vec.data(), h_A_csr_test, 1e-5); - #endif -} +} \ No newline at end of file diff --git a/src_gpu/ueqn.cu b/src_gpu/ueqn.cu deleted file mode 100644 index 23401eef5..000000000 --- a/src_gpu/ueqn.cu +++ /dev/null @@ -1,1097 +0,0 @@ -#include -#include -#include -#include -#include - -static const char *_cudaGetErrorEnum(cudaError_t error) { - return cudaGetErrorName(error); -} - -template -void check(T result, char const *const func, const char *const file, - int const line) { - if (result) { - fprintf(stderr, "cuda error at %s:%d code=%d(%s) \"%s\" \n", file, line, - static_cast(result), _cudaGetErrorEnum(result), func); - exit(EXIT_FAILURE); - } -} - -#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) - -static void checkVectorEqual(int count, double* basevec, double* vec, double max_relative_error) { - for (size_t i = 0; i < count; ++i) - { - double abs_diff = fabs(basevec[i] - vec[i]); - double rel_diff = fabs(basevec[i] - vec[i]) / fabs(basevec[i]); - if (abs_diff > 1e-16 && rel_diff > max_relative_error) { - fprintf(stderr, "mismatch index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff); - } - } -} - -__global__ void fvm_ddt(int num_cells, int num_faces, const double rdelta_t, - const int* csr_row_index, const int* csr_diag_index, - const double* rho_old, const double* rho_new, const double* volume, const double* velocity_old, - const double* A_csr_input, const double* b_input, double* A_csr_output, double* b_output) { - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) return; - - // A_csr has one more element in each row: itself - int row_index = csr_row_index[index]; - int diag_index = csr_diag_index[index]; - - int csr_dim = num_cells + num_faces; - int csr_index = row_index + diag_index; - double ddt_diag = rdelta_t * rho_new[index] * volume[index]; - A_csr_output[csr_dim * 0 + csr_index] = A_csr_input[csr_dim * 0 + csr_index] + ddt_diag; - A_csr_output[csr_dim * 1 + csr_index] = A_csr_input[csr_dim * 1 + csr_index] + ddt_diag; - A_csr_output[csr_dim * 2 + csr_index] = A_csr_input[csr_dim * 2 + csr_index] + ddt_diag; - - double ddt_part_term = rdelta_t * rho_old[index] * volume[index]; - b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + ddt_part_term * velocity_old[index * 3 + 0]; - b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + ddt_part_term * velocity_old[index * 3 + 1]; - b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + ddt_part_term * velocity_old[index * 3 + 2]; -} - -__global__ void fvm_div_internal(int num_cells, int num_faces, - const int* csr_row_index, const int* csr_diag_index, - const double* weight, const double* phi, - const double* A_csr_input, const double* b_input, double* A_csr_output, double* b_output) { - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) return; - - // A_csr has one more element in each row: itself - int row_index = csr_row_index[index]; - int next_row_index = csr_row_index[index + 1]; - int diag_index = csr_diag_index[index]; - int neighbor_offset = csr_row_index[index] - index; - int csr_dim = num_cells + num_faces; - - double div_diag = 0; - for (int i = row_index; i < next_row_index; i++) { - int inner_index = i - row_index; - // lower - if (inner_index < diag_index) { - int neighbor_index = neighbor_offset + inner_index; - double w = weight[neighbor_index]; - double f = phi[neighbor_index]; - A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + (-w) * f; - A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + (-w) * f; - A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + (-w) * f; - // lower neighbors contribute to sum of -1 - div_diag += (w - 1) * f; - } - // upper - if (inner_index > diag_index) { - // upper, index - 1, consider of diag - int neighbor_index = neighbor_offset + inner_index - 1; - double w = weight[neighbor_index]; - double f = phi[neighbor_index]; - A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + (1 - w) * f; - A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + (1 - w) * f; - A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + (1 - w) * f; - // upper neighbors contribute to sum of 1 - div_diag += w * f; - } - } - A_csr_output[csr_dim * 0 + row_index + diag_index] = A_csr_input[csr_dim * 0 + row_index + diag_index] + div_diag; // diag - A_csr_output[csr_dim * 1 + row_index + diag_index] = A_csr_input[csr_dim * 1 + row_index + diag_index] + div_diag; // diag - A_csr_output[csr_dim * 2 + row_index + diag_index] = A_csr_input[csr_dim * 2 + row_index + diag_index] + div_diag; // diag -} - -__global__ void fvm_div_boundary(int num_cells, int num_faces, int num_boundary_cells, - const int* csr_row_index, const int* csr_diag_index, - const int* boundary_cell_offset, const int* boundary_cell_id, - const double* internal_coeffs, const double* boundary_coeffs, - const double* A_csr_input, const double* b_input, double* A_csr_output, double* b_output) { - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) return; - - int cell_offset = boundary_cell_offset[index]; // face index - int cell_index = boundary_cell_id[cell_offset]; - int loop_size = boundary_cell_offset[index + 1] - cell_offset; - - int row_index = csr_row_index[cell_index]; - int diag_index = csr_diag_index[cell_index]; - int csr_dim = num_cells + num_faces; - int csr_index = row_index + diag_index; - - // construct internalCoeffs & boundaryCoeffs - double internal_coeffs_x = 0; - double internal_coeffs_y = 0; - double internal_coeffs_z = 0; - double boundary_coeffs_x = 0; - double boundary_coeffs_y = 0; - double boundary_coeffs_z = 0; - for (int i = 0; i < loop_size; i++) { - internal_coeffs_x += internal_coeffs[(cell_offset + i) * 3 + 0]; - internal_coeffs_y += internal_coeffs[(cell_offset + i) * 3 + 1]; - internal_coeffs_z += internal_coeffs[(cell_offset + i) * 3 + 2]; - boundary_coeffs_x += boundary_coeffs[(cell_offset + i) * 3 + 0]; - boundary_coeffs_y += boundary_coeffs[(cell_offset + i) * 3 + 1]; - boundary_coeffs_z += boundary_coeffs[(cell_offset + i) * 3 + 2]; - } - - A_csr_output[csr_dim * 0 + csr_index] = A_csr_input[csr_dim * 0 + csr_index] + internal_coeffs_x; - A_csr_output[csr_dim * 1 + csr_index] = A_csr_input[csr_dim * 1 + csr_index] + internal_coeffs_y; - A_csr_output[csr_dim * 2 + csr_index] = A_csr_input[csr_dim * 2 + csr_index] + internal_coeffs_z; - b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] + boundary_coeffs_x; - b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] + boundary_coeffs_y; - b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + boundary_coeffs_z; -} - -__global__ void fvc_grad_internal_face(int num_cells, - const int* csr_row_index, const int* csr_col_index, const int* csr_diag_index, - const double* face_vector, const double* weight, const double* pressure, - const double* b_input, double* b_output) { - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) return; - - // A_csr has one more element in each row: itself - int row_index = csr_row_index[index]; - int next_row_index = csr_row_index[index + 1]; - int diag_index = csr_diag_index[index]; - int neighbor_offset = csr_row_index[index] - index; - - double own_cell_p = pressure[index]; - double grad_bx = 0; - double grad_by = 0; - double grad_bz = 0; - double grad_bx_low = 0; - double grad_bx_upp = 0; - double grad_by_low = 0; - double grad_by_upp = 0; - double grad_bz_low = 0; - double grad_bz_upp = 0; - for (int i = row_index; i < next_row_index; i++) { - int inner_index = i - row_index; - // lower - if (inner_index < diag_index) { - int neighbor_index = neighbor_offset + inner_index; - double w = weight[neighbor_index]; - double sfx = face_vector[neighbor_index * 3 + 0]; - double sfy = face_vector[neighbor_index * 3 + 1]; - double sfz = face_vector[neighbor_index * 3 + 2]; - int neighbor_cell_id = csr_col_index[row_index + inner_index]; - double neighbor_cell_p = pressure[neighbor_cell_id]; - double face_p = (1 - w) * own_cell_p + w * neighbor_cell_p; - grad_bx_low -= face_p * sfx; - grad_by_low -= face_p * sfy; - grad_bz_low -= face_p * sfz; - } - // upper - if (inner_index > diag_index) { - int neighbor_index = neighbor_offset + inner_index - 1; - double w = weight[neighbor_index]; - double sfx = face_vector[neighbor_index * 3 + 0]; - double sfy = face_vector[neighbor_index * 3 + 1]; - double sfz = face_vector[neighbor_index * 3 + 2]; - int neighbor_cell_id = csr_col_index[row_index + inner_index + 1]; - double neighbor_cell_p = pressure[neighbor_cell_id]; - double face_p = (1 - w) * own_cell_p + w * neighbor_cell_p; - grad_bx_upp += face_p * sfx; - grad_by_upp += face_p * sfy; - grad_bz_upp += face_p * sfz; - } - } - b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + grad_bx_low + grad_bx_upp; - b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + grad_by_low + grad_by_upp; - b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + grad_bz_low + grad_bz_upp; -} - -__global__ void fvc_grad_boundary_face(int num_cells, int num_boundary_cells, - const int* boundary_cell_offset, const int* boundary_cell_id, - const double* boundary_face_vector, const double* boundary_pressure, - const double* b_input, double* b_output) { - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_boundary_cells) return; - - int cell_offset = boundary_cell_offset[index]; - int next_cell_offset = boundary_cell_offset[index + 1]; - int cell_index = boundary_cell_id[cell_offset]; - - // compute boundary gradient - double grad_bx = 0; - double grad_by = 0; - double grad_bz = 0; - for (int i = cell_offset; i < next_cell_offset; i++) { - double sfx = boundary_face_vector[i * 3 + 0]; - double sfy = boundary_face_vector[i * 3 + 1]; - double sfz = boundary_face_vector[i * 3 + 2]; - double face_p = boundary_pressure[i]; - grad_bx += face_p * sfx; - grad_by += face_p * sfy; - grad_bz += face_p * sfz; - } - - //// correct the boundary gradient - //double nx = boundary_face_vector[face_index * 3 + 0] / magSf[face_index]; - //double ny = boundary_face_vector[face_index * 3 + 1] / magSf[face_index]; - //double nz = boundary_face_vector[face_index * 3 + 2] / magSf[face_index]; - //double sn_grad = 0; - //double grad_correction = sn_grad * volume[cell_index] - (nx * grad_bx + ny * grad_by + nz * grad_bz); - //grad_bx += nx * grad_correction; - //grad_by += ny * grad_correction; - //grad_bz += nz * grad_correction; - - b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] + grad_bx; - b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] + grad_by; - b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + grad_bz; -} - -__global__ void add_fvMatrix(int num_cells, int num_faces, - const int* csr_row_index, - const double* turbSrc_A, const double* turbSrc_b, - const double* A_csr_input, const double* b_input, double* A_csr_output, double* b_output) { - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) return; - - int row_index = csr_row_index[index]; - int next_row_index = csr_row_index[index + 1]; - int csr_dim = num_cells + num_faces; - double A_entry; - - for (int i = row_index; i < next_row_index; i++) - { - A_entry = turbSrc_A[i]; - A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + A_entry; - A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + A_entry; - A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + A_entry; - } - b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + turbSrc_b[num_cells * 0 + index]; - b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + turbSrc_b[num_cells * 1 + index]; - b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + turbSrc_b[num_cells * 2 + index]; -} - -/* -// the implement fo assemble kernel is outdated. -__global__ void assemble(int num_cells, int num_faces, const double rdelta_t, const int* csr_row_index, const int* csr_diag_index, - const double* rho_old, const double* rho_new, const double* volume, const double* velocity_old, - const double* weight, const double* phi, const double* face_vector, const double* pressure, - double* A_csr, double* b) { - - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) return; - - // A_csr has one more element in each row: itself - int row_index = csr_row_index[index]; - int next_row_index = csr_row_index[index + 1]; - int diag_index = csr_diag_index[index]; - int neighbor_offset = csr_row_index[index] - index; - int csr_dim = num_cells + num_faces; - - double div_diag = 0; - double grad_bx = 0; - double grad_by = 0; - double grad_bz = 0; - for (int i = row_index; i < next_row_index; i++) { - int inner_index = i - row_index; - // lower - if (inner_index < diag_index) { - int neighbor_index = neighbor_offset + inner_index; - double w = weight[neighbor_index]; - double f = phi[neighbor_index]; - double sfx = face_vector[neighbor_index * 3 + 0]; - double sfy = face_vector[neighbor_index * 3 + 1]; - double sfz = face_vector[neighbor_index * 3 + 2]; - double p = pressure[neighbor_index]; - A_csr[csr_dim * 0 + i] += (-w) * f; - A_csr[csr_dim * 1 + i] += (-w) * f; - A_csr[csr_dim * 2 + i] += (-w) * f; - // lower neighbors contribute to sum of -1 - div_diag += (w - 1) * f; - grad_bx -= p * sfx; - grad_by -= p * sfy; - grad_bz -= p * sfz; - } - // upper - if (inner_index > diag_index) { - // upper, index - 1, consider of diag - int neighbor_index = neighbor_offset + inner_index - 1; - double w = weight[neighbor_index]; - double f = phi[neighbor_index]; - double sfx = face_vector[neighbor_index * 3 + 0]; - double sfy = face_vector[neighbor_index * 3 + 1]; - double sfz = face_vector[neighbor_index * 3 + 2]; - double p = pressure[neighbor_index]; - A_csr[csr_dim * 0 + i] += (1 - w) * f; - A_csr[csr_dim * 1 + i] += (1 - w) * f; - A_csr[csr_dim * 2 + i] += (1 - w) * f; - // upper neighbors contribute to sum of 1 - div_diag += w * f; - grad_bx += p * sfx; - grad_by += p * sfy; - grad_bz += p * sfz; - } - } - - double ddt_diag = rdelta_t * rho_new[index] * volume[index]; - A_csr[csr_dim * 0 + row_index + diag_index] = ddt_diag + div_diag; // diag of A - A_csr[csr_dim * 1 + row_index + diag_index] = ddt_diag + div_diag; // diag of A - A_csr[csr_dim * 2 + row_index + diag_index] = ddt_diag + div_diag; // diag of A - - double ddt_bx = rdelta_t * rho_old[index] * velocity_old[index * 3 + 0] * volume[index]; - double ddt_by = rdelta_t * rho_old[index] * velocity_old[index * 3 + 1] * volume[index]; - double ddt_bz = rdelta_t * rho_old[index] * velocity_old[index * 3 + 2] * volume[index]; - b[num_cells * 0 + index] = ddt_bx + grad_bx; - b[num_cells * 1 + index] = ddt_by + grad_by; - b[num_cells * 2 + index] = ddt_bz + grad_bz; -} -*/ -int main(int argc, char* argv[]) { - // Print the array length to be used, and compute its size - bool validation_mode = false; - int w = 128; - int h = 128; - int d = 128; - char* input_file = NULL; - - if (argc > 1) { - validation_mode = true; - w = atoi(argv[1]); - h = atoi(argv[2]); - d = atoi(argv[3]); - input_file = argv[4]; - } - - fprintf(stderr, "mesh(w, h, d) is (%d, %d, %d)\n", w, h, d); - - // Get num_cells, num_faces, num_boundary_cells, num_boundary_faces - int num_cells = w * h * d; - int num_faces = num_cells * 6; - int num_boundary_cells = 0; - int num_boundary_faces = 0; - for (int i = 0; i < num_cells; ++i) { - int ww = i % w; - int hh = (i % (w * h)) / w; - int dd = i / (w * h); - - bool boundary_flag = false; - if (ww == 0) { - num_boundary_faces++; - boundary_flag = true; - } - if (ww == (w - 1)) { - num_boundary_faces++; - boundary_flag = true; - } - if (hh == 0) { - num_boundary_faces++; - boundary_flag = true; - } - if (hh == (h - 1)) { - num_boundary_faces++; - boundary_flag = true; - } - if (dd == 0) { - num_boundary_faces++; - boundary_flag = true; - } - if (dd == (d - 1)) { - num_boundary_faces++; - boundary_flag = true; - } - - if (boundary_flag) - num_boundary_cells++; - } - num_faces -= num_boundary_faces; - - // Load num_cells, num_faces, num_boundary_cells and num_boundary_faces - FILE *fp = NULL; - if (validation_mode) { - fp = fopen(input_file, "rb+"); - if (fp == NULL) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - - int readfile = 0; - int input_num_cells = 0; - int input_num_faces = 0; - int input_num_boundary_cells = 0; - int input_num_boundary_faces = 0; - readfile = fread(&input_num_cells, sizeof(int), 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - readfile = fread(&input_num_faces, sizeof(int), 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - readfile = fread(&input_num_boundary_cells, sizeof(int), 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - readfile = fread(&input_num_boundary_faces, sizeof(int), 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - //if ((input_num_cells != num_cells) || (input_num_faces != num_faces) || (input_num_boundary_cells != num_boundary_cells)) { - // temp - if ((input_num_cells != num_cells) || (input_num_faces * 2 != num_faces - || (input_num_boundary_cells != num_boundary_cells) || (input_num_boundary_faces != num_boundary_faces))) { - fprintf(stderr, "Input info mismatched:\n"); - fprintf(stderr, "input_num_cells: %d, num_cells: %d\n", input_num_cells, num_cells); - fprintf(stderr, "input_num_faces: %d, num_faces: %d\n", input_num_faces, num_faces); - fprintf(stderr, "input_num_boundary_cells: %d, num_boundary_cells: %d\n", input_num_boundary_cells, num_boundary_cells); - fprintf(stderr, "input_num_boundary_faces: %d, num_boundary_faces: %d\n", input_num_boundary_faces, num_boundary_faces); - exit(EXIT_FAILURE); - } - num_faces = input_num_faces * 2; - } - - size_t cell_bytes = num_cells * sizeof(double); - size_t cell_vec_bytes = num_cells * 3 * sizeof(double); - size_t cell_index_bytes = num_cells * sizeof(int); - - size_t face_bytes = num_faces * sizeof(double); - size_t face_vec_bytes = num_faces * 3 * sizeof(double); - - size_t boundary_cell_bytes = num_boundary_cells * sizeof(double); - size_t boundary_cell_vec_bytes = num_boundary_cells * 3 * sizeof(double); - size_t boundary_cell_index_bytes = num_boundary_cells * sizeof(int); - - size_t boundary_face_bytes = num_boundary_faces * sizeof(double); - size_t boundary_face_vec_bytes = num_boundary_faces * 3 * sizeof(double); - size_t boundary_face_index_bytes = num_boundary_faces * sizeof(int); - - // A_csr has one more element in each row: itself - size_t csr_row_index_bytes = (num_cells + 1) * sizeof(int); - size_t csr_col_index_bytes = (num_cells + num_faces) * sizeof(int); - size_t csr_value_bytes = (num_cells + num_faces) * sizeof(double); - size_t csr_value_vec_bytes = (num_cells + num_faces) * 3 * sizeof(double); - - // csr_row_index, csr_col_index, csr_diag_index are pre-computed on cpu, not computed on gpu. - // values of each row are stored continued. - // row_index stores the start offset of each row. - // col_index stores the column number of each row - // diag_index stores the serial number(in a row) of the diag element. - int* h_A_csr_row_index = (int*)malloc(csr_row_index_bytes); - int* h_A_csr_col_index = (int*)malloc(csr_col_index_bytes); - int* h_A_csr_diag_index = (int*)malloc(cell_index_bytes); - - double* h_rho_old = (double*)malloc(cell_bytes); - double* h_rho_new = (double*)malloc(cell_bytes); - double* h_volume = (double*)malloc(cell_bytes); - double* h_pressure = (double*)malloc(cell_bytes); - double* h_velocity_old = (double*)malloc(cell_vec_bytes); - double* h_weight = (double*)malloc(face_bytes); - double* h_phi = (double*)malloc(face_bytes); - double* h_face_vector = (double*)malloc(face_vec_bytes); - - int* h_boundary_cell_offset = (int*)malloc((num_boundary_cells+1) * sizeof(int)); - int* h_boundary_cell_id = (int*)malloc(boundary_face_bytes); - - double* h_boundary_pressure = (double*)malloc(boundary_face_bytes); - double* h_boundary_face_vector = (double*)malloc(boundary_face_vec_bytes); - double* h_internal_coeffs = (double*)malloc(boundary_face_vec_bytes); - double* h_boundary_coeffs = (double*)malloc(boundary_face_vec_bytes); - - double* h_A_csr = (double*)malloc(csr_value_vec_bytes); - double* h_b = (double*)malloc(cell_vec_bytes); - double* h_A_csr_validation = (double*)malloc(csr_value_vec_bytes); - double* h_b_validation = (double*)malloc(cell_vec_bytes); - double* h_grad_validation = (double*)malloc(cell_vec_bytes); - - double* h_turbSrc_A = (double*)malloc(csr_value_bytes); - double* h_turbSrc_b = (double*)malloc(cell_vec_bytes); - - double *b_openfoam = (double*)malloc(cell_vec_bytes); - double *A_openfoam = (double*)malloc(csr_value_vec_bytes); - - // Verify that allocations succeeded - if (h_A_csr_row_index == NULL || h_A_csr_col_index == NULL || h_A_csr_diag_index == NULL - || h_rho_old == NULL || h_rho_new == NULL || h_volume == NULL || h_pressure == NULL || h_velocity_old == NULL - || h_weight == NULL || h_phi == NULL || h_face_vector == NULL - || h_boundary_cell_offset == NULL || h_boundary_cell_id == NULL - || h_boundary_pressure == NULL || h_boundary_face_vector == NULL - || h_internal_coeffs == NULL || h_boundary_coeffs == NULL - || h_A_csr == NULL || h_b == NULL || h_A_csr_validation == NULL || h_b_validation == NULL) { - fprintf(stderr, "Failed to allocate host arrays!\n"); - exit(EXIT_FAILURE); - } - - // Load input data - double rdelta_t = 1.0 / 0.0001; - if (validation_mode) { - int readfile = 0; - readfile = fread((void*)&rdelta_t, sizeof(double), 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - fprintf(stderr, "rdelta_t: %lf\n", rdelta_t); - - readfile = fread(h_A_csr_row_index, csr_row_index_bytes, 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - // for (int i = 0; i < num_cells + 1; i++) - // fprintf(stderr, "h_A_csr_row_index[%d]: %d\n", i, h_A_csr_row_index[i]); - - readfile = fread(h_A_csr_col_index, csr_col_index_bytes, 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - // for (int i = 0; i < num_cells + num_faces; i++) - // fprintf(stderr, "h_A_csr_col_index[%d]: %d\n", i, h_A_csr_col_index[i]); - - readfile = fread(h_A_csr_diag_index, cell_index_bytes, 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - // for (int i = 0; i < num_cells; i++) - // fprintf(stderr, "h_A_csr_diag_index[%d]: %d\n", i, h_A_csr_diag_index[i]); - - readfile = fread(h_rho_old, cell_bytes, 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - //for (int i = 0; i < num_cells; i++) - // fprintf(stderr, "h_rho_old[%d]: %lf\n", i, h_rho_old[i]); - - readfile = fread(h_rho_new, cell_bytes, 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - //for (int i = 0; i < num_cells; i++) - // fprintf(stderr, "h_rho_new[%d]: %lf\n", i, h_rho_new[i]); - - readfile = fread(h_volume, cell_bytes, 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - //for (int i = 0; i < num_cells; i++) - // fprintf(stderr, "h_volume[%d]: %.16lf\n", i, h_volume[i]); - - readfile = fread(h_pressure, cell_bytes, 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - // for (int i = 0; i < num_cells; i++) - // fprintf(stderr, "h_pressure[%d]: %.30lf\n", i, h_pressure[i]); - - readfile = fread(h_velocity_old, cell_vec_bytes, 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - //for (int i = 0; i < num_cells; i++) - // fprintf(stderr, "h_velocity[%d]: (%lf, %lf, %lf)\n", i, h_volume[i * 3 + 0], h_volume[i * 3 + 1], h_volume[i * 3 + 2]); - - readfile = fread(h_weight, face_bytes, 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - // for (int i = 0; i < num_faces; i++) - // fprintf(stderr, "h_weight[%d]: %.20lf\n", i, h_weight[i]); - - readfile = fread(h_phi, face_bytes, 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - //for (int i = 0; i < num_faces; i++) - // fprintf(stderr, "h_phi[%d]: %lf\n", i, h_phi[i]); - - readfile = fread(h_face_vector, face_vec_bytes, 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - // for (int i = 0; i < num_faces; i++) - // fprintf(stderr, "h_face_vector[%d]: (%.20lf, %.20lf, %.20lf)\n", i, h_face_vector[i * 3 + 0], h_face_vector[i * 3 + 1], h_face_vector[i * 3 + 2]); - - readfile = fread(h_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int), 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - // for (int i = 0; i < num_boundary_cells; i++) - // fprintf(stderr, "h_boundary_cell_offset[%d]: %d\n", i, h_boundary_cell_offset[i]); - - readfile = fread(h_boundary_cell_id, boundary_face_index_bytes, 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - // for (int i = 0; i < num_boundary_faces; i++) - // fprintf(stderr, "h_boundary_cell_id[%d]: %d\n", i, h_boundary_cell_id[i]); - - readfile = fread(h_boundary_pressure, boundary_face_bytes, 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - // for (int i = 0; i < num_boundary_faces; i++) - // fprintf(stderr, "h_boundary_pressure[%d]: %lf\n", i, h_boundary_pressure[i]); - - readfile = fread(h_boundary_face_vector, boundary_face_vec_bytes, 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - // for (int i = 0; i < num_boundary_faces; i++) - // fprintf(stderr, "h_boundary_face_vector[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_boundary_face_vector[i * 3 + 0], h_boundary_face_vector[i * 3 + 1], h_boundary_face_vector[i * 3 + 2]); - - readfile = fread(h_internal_coeffs, boundary_face_vec_bytes, 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - // for (int i = 0; i < num_boundary_faces; i++) - // fprintf(stderr, "h_internal_coeffs[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_internal_coeffs[i * 3 + 0], h_internal_coeffs[i * 3 + 1], h_internal_coeffs[i * 3 + 2]); - - readfile = fread(h_boundary_coeffs, boundary_face_vec_bytes, 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - //for (int i = 0; i < num_boundary_faces; i++) - // fprintf(stderr, "h_boundary_coeffs[%d]: (%lf, %lf, %lf)\n", i, h_boundary_coeffs[i * 3 + 0], h_boundary_coeffs[i * 3 + 1], h_boundary_coeffs[i * 3 + 2]); - - readfile = fread(h_A_csr_validation, csr_value_vec_bytes, 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - //for (int i = 0; i < (num_cells + num_faces) * 3; i++) - // fprintf(stderr, "h_A_csr_validation[%d]: %.30lf\n", i, h_A_csr_validation[i]); - - readfile = fread(h_b_validation, cell_vec_bytes, 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - // for (int i = 0; i < num_cells * 3; i++) - // fprintf(stderr, "h_b_validation[%d]: %lf\n", i, h_b_validation[i]); - - readfile = fread(h_grad_validation, cell_vec_bytes, 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - // for (int i = 0; i < num_cells * 3; i++) - // fprintf(stderr, "h_grad_validation[%d]: %lf\n", i, h_grad_validation[i]); - - readfile = fread(h_turbSrc_A, csr_value_bytes, 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - // for (int i = 0; i < (num_cells + num_faces); i++) - // fprintf(stderr, "h_turbSrc_A[%d]: %.30lf\n", i, h_turbSrc_A[i]); - - readfile = fread(h_turbSrc_b, cell_vec_bytes, 1, fp); - if(readfile == 0) { - fprintf(stderr, "%s %d, Failed to open input file: %s!\n", __FILE__, __LINE__, input_file); - exit(EXIT_FAILURE); - } - // for (int i = 0; i < num_cells * 3; i++) - // fprintf(stderr, "h_turbSrc_b[%d]: %lf\n", i, h_turbSrc_b[i]); - - // compare with of results - FILE *fp1 = NULL; - fp1 = fopen("of_finalresult.txt", "rb+"); - readfile = fread(A_openfoam, csr_value_vec_bytes, 1, fp1); - readfile = fread(b_openfoam, cell_vec_bytes, 1, fp1); - - } else { - // Initialize row index - h_A_csr_row_index[0] = 0; - for (int i = 0; i < num_cells; ++i) { - int ww = i % w; - int hh = (i % (w * h)) / w; - int dd = i / (w * h); - int neighbor_number = 6; - - if (ww == 0) { - neighbor_number -= 1; - } - if (ww == (w - 1)) { - neighbor_number -= 1; - } - if (hh == 0) { - neighbor_number -= 1; - } - if (hh == (h - 1)) { - neighbor_number -= 1; - } - if (dd == 0) { - neighbor_number -= 1; - } - if (dd == (d - 1)) { - neighbor_number -= 1; - } - - h_A_csr_row_index[i + 1] = h_A_csr_row_index[i] + neighbor_number; - } - - // Initialize cell fields - for (int i = 0; i < num_cells; ++i) { - h_rho_old[i] = rand() / (double)RAND_MAX; - h_rho_new[i] = rand() / (double)RAND_MAX; - h_volume[i] = rand() / (double)RAND_MAX; - h_pressure[i] = rand() / (double)RAND_MAX; - h_velocity_old[i * 3 + 0] = rand() / (double)RAND_MAX; - h_velocity_old[i * 3 + 1] = rand() / (double)RAND_MAX; - h_velocity_old[i * 3 + 2] = rand() / (double)RAND_MAX; - } - - // Initialize A_csr_col_index, A_csr_diag_index, and the face fields. - for (int i = 0; i < num_cells; ++i) { - int row_offset = h_A_csr_row_index[i]; - int neighbor_offset = row_offset - i; - - double weight = 0.5; - - // Make sure that the values are placed in each row with the following order. - // (x, y, z - 1)--> column: (i - w * h) - // (x, y - 1, z)--> column: (i - w) - // (x - 1, y, z)--> column: (i - 1) - // (x, y, z) --> column: (i) //only in A_csr_index and A_csr_diag_index - // (x + 1, y, z)--> column: (i + 1) - // (x, y + 1, z)--> column: (i + w) - // (x, y, z + 1)--> column: (i + w * h) - - // (x, y, z - 1) - // (x, y - 1, z) - // (x - 1, y, z) - int num_lower = 0; - if (i - w * h >= 0) { - num_lower++; - h_A_csr_col_index[row_offset++] = i - w * h; - - int face_index = neighbor_offset++; - h_weight[face_index] = weight; - h_phi[face_index] = rand() / (double)RAND_MAX; - h_face_vector[face_index * 3 + 0] = rand() / (double)RAND_MAX; - h_face_vector[face_index * 3 + 1] = rand() / (double)RAND_MAX; - h_face_vector[face_index * 3 + 2] = rand() / (double)RAND_MAX; - } - if (i - w >= 0) { - num_lower++; - h_A_csr_col_index[row_offset++] = i - w; - - int face_index = neighbor_offset++; - h_weight[face_index] = weight; - h_phi[face_index] = rand() / (double)RAND_MAX; - h_face_vector[face_index * 3 + 0] = rand() / (double)RAND_MAX; - h_face_vector[face_index * 3 + 1] = rand() / (double)RAND_MAX; - h_face_vector[face_index * 3 + 2] = rand() / (double)RAND_MAX; - } - if (i - 1 >= 0) { - num_lower++; - h_A_csr_col_index[row_offset++] = i - 1; - - int face_index = neighbor_offset++; - h_weight[face_index] = weight; - h_phi[face_index] = rand() / (double)RAND_MAX; - h_face_vector[face_index * 3 + 0] = rand() / (double)RAND_MAX; - h_face_vector[face_index * 3 + 1] = rand() / (double)RAND_MAX; - h_face_vector[face_index * 3 + 2] = rand() / (double)RAND_MAX; - } - - // (x, y, z) - h_A_csr_col_index[row_offset++] = i; - h_A_csr_diag_index[i] = num_lower; - - // (x + 1, y, z) - // (x, y + 1, z) - // (x, y, z + 1) - if (i + 1 < num_cells) { - h_A_csr_col_index[row_offset++] = i + 1; - - int face_index = neighbor_offset++; - h_weight[face_index] = weight; - h_face_vector[face_index * 3 + 0] = rand() / (double)RAND_MAX; - h_face_vector[face_index * 3 + 1] = rand() / (double)RAND_MAX; - h_face_vector[face_index * 3 + 2] = rand() / (double)RAND_MAX; - h_face_vector[face_index] = rand() / (double)RAND_MAX; - } - if (i + w < num_cells) { - h_A_csr_col_index[row_offset++] = i + w; - - int face_index = neighbor_offset++; - h_weight[face_index] = weight; - h_phi[face_index] = rand() / (double)RAND_MAX; - h_face_vector[face_index * 3 + 0] = rand() / (double)RAND_MAX; - h_face_vector[face_index * 3 + 1] = rand() / (double)RAND_MAX; - h_face_vector[face_index * 3 + 2] = rand() / (double)RAND_MAX; - } - if (i + w * h < num_cells) { - h_A_csr_col_index[row_offset++] = i + w * h; - - int face_index = neighbor_offset++; - h_weight[face_index] = weight; - h_phi[face_index] = rand() / (double)RAND_MAX; - h_face_vector[face_index * 3 + 0] = rand() / (double)RAND_MAX; - h_face_vector[face_index * 3 + 1] = rand() / (double)RAND_MAX; - h_face_vector[face_index * 3 + 2] = rand() / (double)RAND_MAX; - } - } - } - - if (fp) { - fclose(fp); - } - - int total_bytes = 0; - int copy_bytes = 0; - - // Allocate the device arrays - int* d_A_csr_row_index = NULL; - int* d_A_csr_col_index = NULL; - int* d_A_csr_diag_index = NULL; - checkCudaErrors(cudaMalloc((void**)&d_A_csr_row_index, csr_row_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_A_csr_col_index, csr_col_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_A_csr_diag_index, cell_index_bytes)); - total_bytes += (csr_row_index_bytes + csr_col_index_bytes + cell_index_bytes); - - double* d_rho_old = NULL; - double* d_rho_new = NULL; - double* d_volume = NULL; - double* d_pressure = NULL; - double* d_velocity_old = NULL; - double* d_weight = NULL; - double* d_phi = NULL; - double* d_face_vector = NULL; - checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_rho_new, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_volume, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_pressure, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_velocity_old, cell_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_weight, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_phi, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_face_vector, face_vec_bytes)); - total_bytes += (cell_bytes * 4 + face_bytes * 2 + cell_vec_bytes + face_vec_bytes); - - int* d_boundary_cell_offset = NULL; - int* d_boundary_cell_id = NULL; - checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int))); - checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_id, boundary_face_bytes)); - total_bytes += (boundary_cell_index_bytes + (num_boundary_cells+1) * sizeof(int)); - - double* d_boundary_pressure = NULL; - double* d_boundary_face_vector = NULL; - double* d_internal_coeffs = NULL; - double* d_boundary_coeffs = NULL; - checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_face_vector, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, boundary_face_vec_bytes)); - total_bytes += (boundary_face_bytes + boundary_face_vec_bytes * 3); - - double* d_A_csr = NULL; - double* d_b = NULL; - checkCudaErrors(cudaMalloc((void**)&d_A_csr, csr_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_b, cell_vec_bytes)); - total_bytes += (csr_value_vec_bytes + cell_vec_bytes); - - double* d_turbSrc_A = NULL; - double* d_turbSrc_b = NULL; - checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A, csr_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_turbSrc_b, cell_vec_bytes)); - total_bytes += (csr_value_bytes + cell_vec_bytes); - - fprintf(stderr, "Total bytes malloc on GPU: %.2fMB\n", total_bytes * 1.0 / 1024 / 1024); - - // Create cuda stream - cudaStream_t stream; - checkCudaErrors(cudaStreamCreate(&stream)); - - struct timeval start, end; - float duration = 0; - gettimeofday(&start, NULL); - - // Copy the host input array in host memory to the device input array in device memory - printf("Copy input arrays from the host memory to the CUDA device\n"); - checkCudaErrors(cudaMemcpyAsync(d_A_csr_row_index, h_A_csr_row_index, csr_row_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_A_csr_col_index, h_A_csr_col_index, csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_A_csr_diag_index, h_A_csr_diag_index, cell_index_bytes, cudaMemcpyHostToDevice, stream)); - copy_bytes += (csr_row_index_bytes + csr_col_index_bytes + cell_index_bytes); - - checkCudaErrors(cudaMemcpyAsync(d_rho_old, h_rho_old, cell_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_rho_new, h_rho_new, cell_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_volume, h_volume, cell_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_pressure, h_pressure, cell_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_velocity_old, h_velocity_old, cell_vec_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_weight, h_weight, face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_phi, h_phi, face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_face_vector, h_face_vector, face_vec_bytes, cudaMemcpyHostToDevice, stream)); - copy_bytes += (cell_bytes * 4 + cell_vec_bytes + face_bytes * 2 + face_vec_bytes); - - checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_offset, h_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int), cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_id, h_boundary_cell_id, boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); - copy_bytes += ((num_boundary_cells+1) * sizeof(int) + boundary_face_index_bytes); - - checkCudaErrors(cudaMemcpyAsync(d_boundary_pressure, h_boundary_pressure, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_face_vector, h_boundary_face_vector, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_internal_coeffs, h_internal_coeffs, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_coeffs, h_boundary_coeffs, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); - copy_bytes += (boundary_face_bytes + boundary_face_vec_bytes * 3); - - checkCudaErrors(cudaMemcpyAsync(d_turbSrc_A, h_turbSrc_A, csr_value_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_turbSrc_b, h_turbSrc_b, cell_vec_bytes, cudaMemcpyHostToDevice, stream)); - copy_bytes += (csr_value_bytes + cell_vec_bytes); - - // Synchronize stream - checkCudaErrors(cudaStreamSynchronize(stream)); - gettimeofday(&end, NULL); - duration = (end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001; - fprintf(stderr, "memcpy time cost %lf ms.\n", duration); - fprintf(stderr, "bandwidth %lf GB/s.\n", (copy_bytes * 1.0 / 1024 / 1024 / 1024) / (duration / 1000)); - - size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - printf("CUDA kernel launch with %d blocks of %d threads\n", blocks_per_grid, threads_per_block); - - /** - * start term-level - */ - gettimeofday(&start, NULL); - - // Memset A and b to zero - checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, csr_value_vec_bytes, stream)); - checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream)); - - // Launch the fvm_ddt Kernel - fvm_ddt<<>>(num_cells, num_faces, rdelta_t, - d_A_csr_row_index, d_A_csr_diag_index, - d_rho_old, d_rho_new, d_volume, d_velocity_old, d_A_csr, d_b, d_A_csr, d_b); - // Launch the fvm_div Kernel - fvm_div_internal<<>>(num_cells, num_faces, - d_A_csr_row_index, d_A_csr_diag_index, - d_weight, d_phi, d_A_csr, d_b, d_A_csr, d_b); - blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; - fvm_div_boundary<<>>(num_cells, num_faces, num_boundary_cells, - d_A_csr_row_index, d_A_csr_diag_index, - d_boundary_cell_offset, d_boundary_cell_id, - d_internal_coeffs, d_boundary_coeffs, d_A_csr, d_b, d_A_csr, d_b); - // Launch the fvc_grad Kernel - blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - fvc_grad_internal_face<<>>(num_cells, - d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - d_face_vector, d_weight, d_pressure, d_b, d_b); - blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; - fvc_grad_boundary_face<<>>(num_cells, num_boundary_cells, - d_boundary_cell_offset, d_boundary_cell_id, - d_boundary_face_vector, d_boundary_pressure, d_b, d_b); - blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - add_fvMatrix<<>>(num_cells, num_faces, - d_A_csr_row_index, d_turbSrc_A, d_turbSrc_b, d_A_csr, d_b, d_A_csr, d_b); - - // Synchronize stream - checkCudaErrors(cudaStreamSynchronize(stream)); - gettimeofday(&end, NULL); - duration = (end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001; - fprintf(stderr, "assemble on term level, time cost %lf ms.\n", duration); - /** - * end term-level - */ - - /** - * start equation-level - */ - //gettimeofday(&start, NULL); - - //// Memset A and b to zero - //checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, csr_value_vec_bytes, stream)); - //checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream)); - - //// Note: the implement fo assemble kernel is outdated. - //// Launch the assemble Kernel - //assemble<<>>(num_cells, num_faces, rdelta_t, d_A_csr_row_index, d_A_csr_diag_index, - // d_rho_old, d_rho_new, d_volume, d_velocity_old, d_weight, d_phi, d_face_vector, d_pressure, d_A_csr, d_b); - - //// Synchronize stream - //checkCudaErrors(cudaStreamSynchronize(stream)); - //gettimeofday(&end, NULL); - //duration = (end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001; - //fprintf(stderr, "assemble on equation level, time cost %lf ms.\n", duration); - /** - * end equation-level - */ - - // Sleep 10 seconds, so that user can check gpu memory usage through nvidia-smi. - //sleep(10); - - // Check result - if (validation_mode) { - checkCudaErrors(cudaMemcpyAsync(h_A_csr, d_A_csr, csr_value_vec_bytes, cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaMemcpyAsync(h_b, d_b, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); - // fprintf(stderr, "check of h_A_csr\n"); - // checkVectorEqual((num_cells + num_faces) * 3, h_A_csr_validation, h_A_csr, 1e-5); - // fprintf(stderr, "check of h_b\n"); - // checkVectorEqual(num_cells * 3, h_b_validation, h_b, 1e-5); - fprintf(stderr, "check of h_A_csr\n"); - checkVectorEqual((num_cells + num_faces) * 3, A_openfoam, h_A_csr, 1e-5); - fprintf(stderr, "check of h_b\n"); - checkVectorEqual(num_cells * 3, b_openfoam, h_b, 1e-5); - } - - // Free device global memory - checkCudaErrors(cudaFree(d_A_csr_row_index)); - checkCudaErrors(cudaFree(d_A_csr_col_index)); - checkCudaErrors(cudaFree(d_A_csr_diag_index)); - checkCudaErrors(cudaFree(d_rho_old)); - checkCudaErrors(cudaFree(d_rho_new)); - checkCudaErrors(cudaFree(d_volume)); - checkCudaErrors(cudaFree(d_pressure)); - checkCudaErrors(cudaFree(d_velocity_old)); - checkCudaErrors(cudaFree(d_weight)); - checkCudaErrors(cudaFree(d_phi)); - checkCudaErrors(cudaFree(d_face_vector)); - checkCudaErrors(cudaFree(d_boundary_cell_offset)); - checkCudaErrors(cudaFree(d_boundary_cell_id)); - checkCudaErrors(cudaFree(d_boundary_pressure)); - checkCudaErrors(cudaFree(d_boundary_face_vector)); - checkCudaErrors(cudaFree(d_internal_coeffs)); - checkCudaErrors(cudaFree(d_boundary_coeffs)); - checkCudaErrors(cudaFree(d_A_csr)); - checkCudaErrors(cudaFree(d_b)); - checkCudaErrors(cudaStreamDestroy(stream)); - - // Free host memory - free(h_A_csr_row_index); - free(h_A_csr_col_index); - free(h_A_csr_diag_index); - free(h_rho_old); - free(h_rho_new); - free(h_volume); - free(h_pressure); - free(h_velocity_old); - free(h_weight); - free(h_phi); - free(h_face_vector); - free(h_boundary_cell_offset); - free(h_boundary_cell_id); - free(h_boundary_pressure); - free(h_boundary_face_vector); - free(h_internal_coeffs); - free(h_boundary_coeffs); - free(h_A_csr); - free(h_b); - free(h_A_csr_validation); - free(h_b_validation); - - printf("Done\n"); - return 0; -} -