From 7ee80e549d5f5dfad6dfc97dc85515ddfea6e38a Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Fri, 2 Jun 2023 14:01:48 +0800 Subject: [PATCH 1/3] implement eeqn, fvm_div debugging --- applications/solvers/dfLowMachFoam/EEqn.H | 128 ++- .../solvers/dfLowMachFoam/createdfSolver.H | 14 +- .../solvers/dfLowMachFoam/dfLowMachFoam.C | 6 + src_gpu/CMakeLists.txt | 1 + src_gpu/dfEEqn.H | 75 ++ src_gpu/dfEEqn.cu | 772 ++++++++++++++++++ 6 files changed, 986 insertions(+), 10 deletions(-) create mode 100644 src_gpu/dfEEqn.H create mode 100644 src_gpu/dfEEqn.cu diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H index 6a49d9ced..3b8d38c8e 100644 --- a/applications/solvers/dfLowMachFoam/EEqn.H +++ b/applications/solvers/dfLowMachFoam/EEqn.H @@ -1,4 +1,5 @@ { + fprintf(stderr, "\n\n=====================enter EEqn=====================\n\n\n"); volScalarField& he = thermo.he(); #ifdef GPUSolver_ // start1 = std::clock(); @@ -14,23 +15,132 @@ // // t.join(); UEqn_GPU.updatePsi(&U[0][0]); K = 0.5*magSqr(U); + // K = 0.5*magSqr(U) * 10000000000; end1 = std::clock(); time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC); #endif +#ifdef CPUSolver_ + //fvScalarMatrix EEqn + //( + // fvm::ddt(rho, he) + fvm::div(phi, he) + // + fvc::ddt(rho, K) + fvc::div(phi, K) + // - dpdt + // - fvm::laplacian(turbulence->alphaEff(), he) + // + diffAlphaD + // == + // fvc::div(hDiffCorrFlux) + //); + + //EEqn.relax(); + + //EEqn.solve(); + fvScalarMatrix EEqn ( - fvm::ddt(rho, he) + fvm::div(phi, he) - + fvc::ddt(rho, K) + fvc::div(phi, K) - - dpdt - - fvm::laplacian(turbulence->alphaEff(), he) - + diffAlphaD - == - fvc::div(hDiffCorrFlux) + //fvm::ddt(rho, he) + fvm::div(phi, he) + //fvm::ddt(rho, he) + fvc::ddt(rho, K) + //fvm::ddt(rho, he) + fvc::div(phi, K) * 10000000000 + //fvm::ddt(rho, he) + fvc::div(phi, K) + //fvm::laplacian(turbulence->alphaEff(), he) + fvm::div(phi, he) + //fvm::laplacian(turbulence->alphaEff(), he) - dpdt + diffAlphaD + //fvm::laplacian(turbulence->alphaEff(), he) - fvc::div(hDiffCorrFlux) + //fvm::laplacian(turbulence->alphaEff(), he) + fvc::div(phi, K) * 10000000000 + //== + //fvc::div(hDiffCorrFlux) ); - EEqn.relax(); - EEqn.solve(); +#endif + +#ifdef GPUSolver_ + start2 = std::clock(); + // prepare data on CPU + const tmp alphaEff_tmp(turbulence->alphaEff()); + const volScalarField& alphaEff = alphaEff_tmp(); + + int idx = 0; + offset = 0; + forAll(he.boundaryField(), patchi) + { + const fvsPatchScalarField& patchFlux = phi.boundaryField()[patchi]; + const fvsPatchScalarField& pw = mesh.surfaceInterpolation::weights().boundaryField()[patchi]; + int patchSize = pw.size(); + + const scalarField& patchK = K.boundaryField()[patchi]; + const scalarField& patchAlphaEff = alphaEff.boundaryField()[patchi]; + memcpy(boundary_K + offset, &patchK[0], patchSize*sizeof(double)); + memcpy(boundary_alphaEff + offset, &patchAlphaEff[0], patchSize*sizeof(double)); + + Field valueInternalCoeffs = patchFlux*he.boundaryField()[patchi].valueInternalCoeffs(pw); + Field valueBoundaryCoeffs = -patchFlux*he.boundaryField()[patchi].valueBoundaryCoeffs(pw); + Field gradientInternalCoeffs = he.boundaryField()[patchi].gradientInternalCoeffs(); + Field gradientBoundaryCoeffs = he.boundaryField()[patchi].gradientBoundaryCoeffs(); + memcpy(eeqn_valueInternalCoeffs + offset, &valueInternalCoeffs[0], patchSize*sizeof(double)); + memcpy(eeqn_valueBoundaryCoeffs + offset, &valueBoundaryCoeffs[0], patchSize*sizeof(double)); + memcpy(eeqn_gradientInternalCoeffs + offset, &gradientInternalCoeffs[0], patchSize*sizeof(double)); + memcpy(eeqn_gradientBoundaryCoeffs + offset, &gradientBoundaryCoeffs[0], patchSize*sizeof(double)); + + //for (int t = 0; t < patchSize; t++) { + // fprintf(stderr, "gpu valueInternalCoeffs[%d][%d]: %.10lf\n", patchi, t, eeqn_valueInternalCoeffs[offset + t]); + // if (idx == 64 || idx == 4160 || idx == 8385) + // fprintf(stderr, "eeqn_value_internal_coeffs[%d](other index: %d %d): %.10lf(other value: %.10lf)\n", idx, offset, t, eeqn_valueInternalCoeffs[idx], eeqn_valueInternalCoeffs[offset + t]); + // idx++; + //} + offset += patchSize; + } + offset = 0; + end2 = std::clock(); + time_monitor_CPU += double(end2 - start2) / double(CLOCKS_PER_SEC); + start1 = std::clock(); + // prepare data on GPU + EEqn_GPU.prepare_data(&he.oldTime()[0], &K[0], &K.oldTime()[0], &alphaEff[0], + &dpdt[0], &diffAlphaD[0], &hDiffCorrFlux[0][0], + boundary_K, boundary_alphaEff, + eeqn_valueInternalCoeffs, eeqn_valueBoundaryCoeffs, + eeqn_gradientInternalCoeffs, eeqn_gradientBoundaryCoeffs); + end1 = std::clock(); + time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + fprintf(stderr, "\n\n=====================EEqn GPUSolver_ part2=====================\n\n\n"); + + start1 = std::clock(); + EEqn_GPU.resetAb(); + //EEqn_GPU.fvm_ddt(); + EEqn_GPU.fvm_div(); + //EEqn_GPU.fvm_laplacian(); + //EEqn_GPU.fvc_ddt(); + //EEqn_GPU.fvc_div_phi_scalar(); + //EEqn_GPU.fvc_div_vector(); + //EEqn_GPU.add_to_source(); + EEqn_GPU.sync(); + end1 = std::clock(); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + fprintf(stderr, "\n\n=====================EEqn GPUSolver_ part3=====================\n\n\n"); + + // check value of mtxAssembly + EEqn_GPU.checkValue(false); + //EEqn_GPU.checkValue(true); + fprintf(stderr, "\n\n=====================EEqn GPUSolver_ part4=====================\n\n\n"); + + start1 = std::clock(); + EEqn_GPU.solve(); + end1 = std::clock(); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); + fprintf(stderr, "\n\n=====================EEqn GPUSolver_ part5=====================\n\n\n"); + + start1 = std::clock(); + EEqn_GPU.updatePsi(&he[0]); + end1 = std::clock(); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC); + fprintf(stderr, "\n\n=====================EEqn GPUSolver_ part6=====================\n\n\n"); +#endif + fprintf(stderr, "\n\n=====================leave EEqn=====================\n\n\n"); } diff --git a/applications/solvers/dfLowMachFoam/createdfSolver.H b/applications/solvers/dfLowMachFoam/createdfSolver.H index b60307463..e56376b28 100644 --- a/applications/solvers/dfLowMachFoam/createdfSolver.H +++ b/applications/solvers/dfLowMachFoam/createdfSolver.H @@ -31,9 +31,11 @@ settingPath = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UE dfMatrixDataBase dfDataBase(num_surfaces, num_cells, num_boundary_faces, Y.size(), num_boundary_cells, &neighbour[0], &owner[0], &mesh.V()[0], &mesh.surfaceInterpolation::weights()[0], &mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.nonOrthDeltaCoeffs()[0], boundary_face_vector_init, boundary_face_init, boundary_deltaCoeffs_init, boundaryCellIndex); +#ifdef GPUSolver_ dfRhoEqn rhoEqn_GPU(dfDataBase); dfUEqn UEqn_GPU(dfDataBase, "dDDI", settingPath); dfYEqn YEqn_GPU(dfDataBase, "dDDI", settingPath, inertIndex); +dfEEqn EEqn_GPU(dfDataBase, "dDDI", settingPath); double *ueqn_internalCoeffs_init, *ueqn_boundaryCoeffs_init, *boundary_pressure_init, *boundary_velocity_init, *boundary_nuEff_init, *boundary_rho_init, *ueqn_laplac_internalCoeffs_init, *ueqn_laplac_boundaryCoeffs_init, *boundary_phi_init; @@ -45,4 +47,14 @@ cudaMallocHost(&boundary_velocity_init, 3*num_boundary_faces*sizeof(double)); cudaMallocHost(&boundary_pressure_init, num_boundary_faces*sizeof(double)); cudaMallocHost(&boundary_nuEff_init, num_boundary_faces*sizeof(double)); cudaMallocHost(&boundary_rho_init, num_boundary_faces*sizeof(double)); -cudaMallocHost(&boundary_phi_init, num_boundary_faces*sizeof(double)); \ No newline at end of file +cudaMallocHost(&boundary_phi_init, num_boundary_faces*sizeof(double)); + +double *boundary_alphaEff, *boundary_K; +double *eeqn_valueInternalCoeffs, *eeqn_valueBoundaryCoeffs, *eeqn_gradientInternalCoeffs, *eeqn_gradientBoundaryCoeffs; +cudaMallocHost(&boundary_K, num_boundary_faces*sizeof(double)); +cudaMallocHost(&boundary_alphaEff, num_boundary_faces*sizeof(double)); +cudaMallocHost(&eeqn_valueInternalCoeffs, num_boundary_faces*sizeof(double)); +cudaMallocHost(&eeqn_valueBoundaryCoeffs, num_boundary_faces*sizeof(double)); +cudaMallocHost(&eeqn_gradientInternalCoeffs, num_boundary_faces*sizeof(double)); +cudaMallocHost(&eeqn_gradientBoundaryCoeffs, num_boundary_faces*sizeof(double)); +#endif diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C index d875b607d..7b6ba15ee 100644 --- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C +++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C @@ -62,11 +62,13 @@ Description #include "dfUEqn.H" #include "dfYEqn.H" #include "dfRhoEqn.H" +#include "dfEEqn.H" #include #include #include "upwind.H" #define GPUSolver_ +//#define CPUSolver_ // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // @@ -99,6 +101,10 @@ int main(int argc, char *argv[]) double time_monitor_UEqn_mtxAssembly=0; double time_monitor_UEqn_Solve=0; double time_monitor_UEqn_sum=0; + double time_monitor_EEqn=0; + double time_monitor_EEqn_mtxAssembly=0; + double time_monitor_EEqn_Solve=0; + double time_monitor_EEqn_sum=0; double time_monitor_chem=0; double time_monitor_Y=0; double time_monitor_E=0; diff --git a/src_gpu/CMakeLists.txt b/src_gpu/CMakeLists.txt index c749eaa42..1040cee14 100644 --- a/src_gpu/CMakeLists.txt +++ b/src_gpu/CMakeLists.txt @@ -23,6 +23,7 @@ add_library(${PROJECT_NAME} dfUEqn.cu dfRhoEqn.cu dfYEqn.cu + dfEEqn.cu AmgXSolver.cu) target_link_libraries(${PROJECT_NAME} diff --git a/src_gpu/dfEEqn.H b/src_gpu/dfEEqn.H new file mode 100644 index 000000000..10fc3569a --- /dev/null +++ b/src_gpu/dfEEqn.H @@ -0,0 +1,75 @@ +#pragma once + +#include "AmgXSolver.H" +#include +#include "dfMatrixDataBase.H" + +class dfEEqn +{ +private: + dfMatrixDataBase &dataBase_; + cudaStream_t stream; + AmgXSolver *ESolver = nullptr; + int num_iteration; + double time_monitor_CPU; + double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test; + + // common variables + int num_cells, cell_bytes, num_faces, num_surfaces, cell_vec_bytes, csr_value_vec_bytes, num_boundary_cells; + int num_boundary_faces, boundary_face_bytes; + int *d_A_csr_row_index, *d_A_csr_diag_index, *d_A_csr_col_index; + + // Matrix variables + double *d_A_csr, *d_b = nullptr; + double *h_A_csr, *h_b = nullptr; + double *d_he_old = nullptr; + double *h_he_new = nullptr; + + // fields used by EEqn + double *d_alphaEff = nullptr; + double *d_K = nullptr; + double *d_K_old = nullptr; + double *d_dpdt = nullptr; + double *d_diffAlphaD = nullptr; + double *d_hDiffCorrFlux = nullptr; + double *d_boundary_K_init = nullptr; + double *d_boundary_K = nullptr; + double *d_boundary_alphaEff_init = nullptr; + double *d_boundary_alphaEff = nullptr; + double *d_value_internal_coeffs_init = nullptr; + double *d_value_boundary_coeffs_init = nullptr; + double *d_gradient_internal_coeffs_init = nullptr; + double *d_gradient_boundary_coeffs_init = nullptr; + double *d_value_internal_coeffs = nullptr; + double *d_value_boundary_coeffs = nullptr; + double *d_gradient_internal_coeffs = nullptr; + double *d_gradient_boundary_coeffs = nullptr; + +public: + dfEEqn(); + dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile); + ~dfEEqn(); + + void prepare_data(const double *he_old, const double *K, const double *K_old, const double *alphaEff, + const double *hDiffCorrFlux, const double *dpdt, const double *diffAlphaD, + const double *boundary_K, const double *boundary_alphaEff, + const double *valueInternalCoeffs, const double *valueBoundaryCoeffs, + const double *gradientInternalCoeffs, const double *gradientBoundaryCoeffs); + + void resetAb(); + + void fvm_ddt(); + void fvm_div(); + void fvm_laplacian(); + + void fvc_ddt(); + void fvc_div_phi_scalar(); + void fvc_div_vector(); + void add_to_source(); + + void solve(); + void checkValue(bool print); + void updatePsi(double *Psi); + + void sync(); +}; diff --git a/src_gpu/dfEEqn.cu b/src_gpu/dfEEqn.cu new file mode 100644 index 000000000..6f3d58acb --- /dev/null +++ b/src_gpu/dfEEqn.cu @@ -0,0 +1,772 @@ +#include "dfEEqn.H" + +// kernel functions + +__global__ void eeqn_fvm_ddt_kernel(int num_cells, const double rdelta_t, + const int *csr_row_index, const int *csr_diag_index, + const double *rho_old, const double *rho_new, const double *volume, const double *he_old, + const double sign, const double *A_csr_input, const double *b_input, + double *A_csr_output, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int diag_index = csr_diag_index[index]; + int csr_index = row_index + diag_index; + double ddt_diag = rdelta_t * rho_new[index] * volume[index]; + A_csr_output[csr_index] = A_csr_input[csr_index] + ddt_diag * sign; + + double ddt_part_term = rdelta_t * rho_old[index] * volume[index]; + b_output[index] = b_input[index] + ddt_part_term * he_old[index] * sign; +} + +__global__ void eeqn_fvm_div_internal(int num_cells, + const int *csr_row_index, const int *csr_diag_index, + const double *weight, const double *phi, + const double sign, const double *A_csr_input, const double *b_input, + double *A_csr_output, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int next_row_index = csr_row_index[index + 1]; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double div_diag = 0; + for (int i = row_index; i < next_row_index; i++) + { + int inner_index = i - row_index; + // lower + if (inner_index < diag_index) + { + int neighbor_index = neighbor_offset + inner_index; + double w = weight[neighbor_index]; + double f = phi[neighbor_index]; + //if (i == 267) printf("lower A[%d], %.10lf + %.10lf = %.10lf\n", i, A_csr_input[i], (-w) * f * sign, A_csr_input[i] + (1 - w) * f * sign); + A_csr_output[i] = A_csr_input[i] + (-w) * f * sign; + // lower neighbors contribute to sum of -1 + div_diag += (w - 1) * f; + } + // upper + if (inner_index > diag_index) + { + // upper, index - 1, consider of diag + int neighbor_index = neighbor_offset + inner_index - 1; + double w = weight[neighbor_index]; + double f = phi[neighbor_index]; + //if (i == 267) printf("upper A[%d], %.10lf + %.10lf = %.10lf\n", i, A_csr_input[i], (-w) * f * sign, A_csr_input[i] + (1 - w) * f * sign); + A_csr_output[i] = A_csr_input[i] + (1 - w) * f * sign; + // upper neighbors contribute to sum of 1 + div_diag += w * f; + } + } + //if (row_index + diag_index == 267) printf("diag A[%d], %.10lf + %.10lf = %.10lf\n", row_index + diag_index, A_csr_input[row_index + diag_index], div_diag * sign, A_csr_input[row_index + diag_index] + div_diag * sign); + A_csr_output[row_index + diag_index] = A_csr_input[row_index + diag_index] + div_diag * sign; // diag +} + +__global__ void eeqn_fvm_div_boundary(int num_boundary_cells, + const int *csr_row_index, const int *csr_diag_index, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *value_internal_coeffs, const double *value_boundary_coeffs, + const double sign, const double *A_csr_input, const double *b_input, + double *A_csr_output, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int cell_index = boundary_cell_id[cell_offset]; + int loop_size = boundary_cell_offset[index + 1] - cell_offset; + + int row_index = csr_row_index[cell_index]; + int diag_index = csr_diag_index[cell_index]; + int csr_index = row_index + diag_index; + + // construct internalCoeffs & boundaryCoeffs + double internal_coeffs = 0; + double boundary_coeffs = 0; + for (int i = 0; i < loop_size; i++) + { + //if (cell_index == 1) printf("term[%d], src index: %d, src value: %.10lf\n", i, cell_offset + i, value_internal_coeffs[cell_offset + i]); + internal_coeffs += value_internal_coeffs[cell_offset + i]; + boundary_coeffs += value_boundary_coeffs[cell_offset + i]; + } + //if (cell_index == 1) printf("boundary A[%d], cell_index: %d, loop_size: %d, %.10lf + %.10lf = %.10lf, \n", csr_index, cell_index, loop_size, A_csr_input[csr_index], internal_coeffs * sign, A_csr_input[csr_index] + internal_coeffs * sign); + A_csr_output[csr_index] = A_csr_input[csr_index] + internal_coeffs * sign; + b_output[cell_index] = b_input[cell_index] + boundary_coeffs * sign; +} + +__global__ void eeqn_fvm_laplacian_uncorrected_internal(int num_cells, + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *alphaEff, const double *weight, + const double *magsf, const double *distance, + const double sign, const double *A_csr_input, double *A_csr_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int row_elements = csr_row_index[index + 1] - row_index; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double own_alphaEff = alphaEff[index]; + // fvm.upper() = deltaCoeffs.primitiveField()*gammaMagSf.primitiveField(); + // fvm.negSumDiag(); + double sum_diag = 0; + // lower + for (int i = 0; i < diag_index; i++) + { + int neighbor_index = neighbor_offset + i; + int neighbor_cell_id = csr_col_index[i + row_index]; + double w = weight[neighbor_index]; + double nei_alphaEff = alphaEff[neighbor_cell_id]; + double gamma = w * (nei_alphaEff - own_alphaEff) + own_alphaEff; + double gamma_magsf = gamma * magsf[neighbor_index]; + double coeff = gamma_magsf * distance[neighbor_index]; + A_csr_output[row_index + i] = A_csr_input[row_index + i] + coeff * sign; + sum_diag += (-coeff); + } + // upper + for (int i = diag_index + 1; i < row_elements; i++) + { + int neighbor_index = neighbor_offset + i - 1; + int neighbor_cell_id = csr_col_index[i + row_index]; + double w = weight[neighbor_index]; + double nei_alphaEff = alphaEff[neighbor_cell_id]; + double gamma = w * (nei_alphaEff - own_alphaEff) + own_alphaEff; + double gamma_magsf = gamma * magsf[neighbor_index]; + double coeff = gamma_magsf * distance[neighbor_index]; + A_csr_output[row_index + i] = A_csr_input[row_index + i] + coeff * sign; + sum_diag += (-coeff); + } + A_csr_output[row_index + diag_index] = A_csr_input[row_index + diag_index] + sum_diag * sign; // diag +} + +__global__ void eeqn_fvm_laplacian_uncorrected_boundary(int num_boundary_cells, + const int *csr_row_index, const int *csr_diag_index, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_alphaEff, const double *boundary_magsf, + const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, + const double sign, const double *A_csr_input, const double *b_input, + double *A_csr_output, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + int row_index = csr_row_index[cell_index]; + int diag_index = csr_diag_index[cell_index]; + int csr_index = row_index + diag_index; + + // OpenFoam code + // if (pvf.coupled()) + // { + // fvm.internalCoeffs()[patchi] = + // pGamma*pvf.gradientInternalCoeffs(pDeltaCoeffs); + // fvm.boundaryCoeffs()[patchi] = + // -pGamma*pvf.gradientBoundaryCoeffs(pDeltaCoeffs); + // } + // else + // { + // fvm.internalCoeffs()[patchi] = pGamma*pvf.gradientInternalCoeffs(); + // fvm.boundaryCoeffs()[patchi] = - + // pGamma*pvf.gradientBoundaryCoeffs(); + // } + double internal_coeffs = 0; + double boundary_coeffs = 0; + for (int i = cell_offset; i < next_cell_offset; i++) + { + double gamma = boundary_alphaEff[i]; + double gamma_magsf = gamma * boundary_magsf[i]; + internal_coeffs += gamma_magsf * gradient_internal_coeffs[i]; + boundary_coeffs += gamma_magsf * gradient_boundary_coeffs[i]; + } + + A_csr_output[csr_index] = A_csr_input[csr_index] + internal_coeffs * sign; + b_output[cell_index] = b_input[cell_index] + boundary_coeffs * sign; +} + +__global__ void eeqn_fvc_ddt_kernel(int num_cells, const double rdelta_t, + const double *rho_old, const double *rho_new, + const double *K_old, const double *K, + const double sign, const double *b_input, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + double fvc_ddt_term = rdelta_t * (rho_new[index] * K[index] - rho_old[index] * K_old[index]); + b_output[index] = b_input[index] + fvc_ddt_term * sign; +} + +__global__ void eeqn_fvc_div_vector_internal(int num_cells, + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *sf, const double *vf, const double *tlambdas, + const double sign, const double *b_input, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int row_elements = csr_row_index[index + 1] - row_index; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double own_vf_x = vf[index * 3 + 0]; + double own_vf_y = vf[index * 3 + 1]; + double own_vf_z = vf[index * 3 + 2]; + double sum = 0; + // lower + for (int i = 0; i < diag_index; i++) + { + int neighbor_index = neighbor_offset + i; + int neighbor_cell_id = csr_col_index[row_index + i]; + double w = tlambdas[neighbor_index]; + double sf_x = sf[neighbor_index * 3 + 0]; + double sf_y = sf[neighbor_index * 3 + 1]; + double sf_z = sf[neighbor_index * 3 + 2]; + double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0]; + double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1]; + double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2]; + double face_x = (1 - w) * own_vf_x + w * neighbor_vf_x; + double face_y = (1 - w) * own_vf_y + w * neighbor_vf_y; + double face_z = (1 - w) * own_vf_z + w * neighbor_vf_z; + sum -= sf_x * face_x + sf_y * face_y + sf_z * face_z; + } + // upper + for (int i = diag_index + 1; i < row_elements; i++) + { + int neighbor_index = neighbor_offset + i - 1; + int neighbor_cell_id = csr_col_index[row_index + i]; + double w = tlambdas[neighbor_index]; + double sf_x = sf[neighbor_index * 3 + 0]; + double sf_y = sf[neighbor_index * 3 + 1]; + double sf_z = sf[neighbor_index * 3 + 2]; + double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0]; + double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1]; + double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2]; + double face_x = w * own_vf_x + (1 - w) * neighbor_vf_x; + double face_y = w * own_vf_y + (1 - w) * neighbor_vf_y; + double face_z = w * own_vf_z + (1 - w) * neighbor_vf_z; + sum += sf_x * face_x + sf_y * face_y + sf_z * face_z; + } + b_output[index] = b_input[index] + sum * sign; +} + +__global__ void eeqn_fvc_div_vector_boundary(int num_boundary_cells, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_sf, const double *boundary_vf, + const double sign, const double *b_input, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + // OpenFoam code + // Foam::surfaceInterpolationScheme::dotInterpolate + // if (vf.boundaryField()[pi].coupled()) + // { + // psf = + // pSf + // & ( + // pLambda*vf.boundaryField()[pi].patchInternalField() + // + (1.0 - pLambda)*vf.boundaryField()[pi].patchNeighbourField() + // ); + // } + // else + // { + // psf = pSf & vf.boundaryField()[pi]; + // } + // tmp> surfaceIntegrate + // forAll(mesh.boundary()[patchi], facei) + // { + // ivf[pFaceCells[facei]] += pssf[facei]; + // } + double sum = 0; + for (int i = cell_offset; i < next_cell_offset; i++) + { + double sf_x = boundary_sf[i * 3 + 0]; + double sf_y = boundary_sf[i * 3 + 1]; + double sf_z = boundary_sf[i * 3 + 2]; + double face_x = boundary_vf[i * 3 + 0]; + double face_y = boundary_vf[i * 3 + 1]; + double face_z = boundary_vf[i * 3 + 2]; + + // if not coupled + sum += (sf_x * face_x + sf_y * face_y + sf_z * face_z); + } + b_output[cell_index] = b_input[cell_index] + sum * sign; +} + +__global__ void eeqn_fvc_div_phi_scalar_internal(int num_cells, + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *weight, const double *phi, const double *K, + const double sign, const double *b_input, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int next_row_index = csr_row_index[index + 1]; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double own_cell_k = K[index]; + double interp = 0; + for (int i = row_index; i < next_row_index; i++) + { + int inner_index = i - row_index; + // lower + if (inner_index < diag_index) + { + int neighbor_index = neighbor_offset + inner_index; + double w = weight[neighbor_index]; + int neighbor_cell_id = csr_col_index[row_index + inner_index]; + double neighbor_cell_k = K[neighbor_cell_id]; + double face_k = (1 - w) * own_cell_k + w * neighbor_cell_k; + interp -= phi[i] * face_k; + } + // upper + if (inner_index > diag_index) + { + int neighbor_index = neighbor_offset + inner_index - 1; + double w = weight[neighbor_index]; + int neighbor_cell_id = csr_col_index[row_index + inner_index]; + double neighbor_cell_k = K[neighbor_cell_id]; + double face_k = (1 - w) * own_cell_k + w * neighbor_cell_k; + interp += phi[i] * face_k; + } + } + b_output[index] = b_input[index] + interp * sign; +} + +__global__ void eeqn_fvc_div_phi_scalar_boundary(int num_boundary_cells, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_phi, const double* boundary_K, + const double sign, const double *b_input, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + // boundary interplate + double interp = 0; + for (int i = cell_offset; i < next_cell_offset; i++) + { + interp += boundary_phi[i] * boundary_K[i]; + } + + b_output[cell_index] = b_input[cell_index] + interp * sign; +} + + +__global__ void eeqn_add_to_source_kernel(int num_cells, + const double sign_dpdt, const double *dpdt, + const double sign_diffAlphaD, const double *diffAlphaD, + const double *volume, + const double *b_input, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + b_output[index] = b_input[index] + sign_dpdt * dpdt[index] * volume[index] + sign_diffAlphaD * diffAlphaD[index] * volume[index]; +} + +__global__ void eeqn_boundaryPermutation(const int num_boundary_faces, const int *bouPermedIndex, + const double *boundary_K_init, const double *boundary_alphaEff_init, + const double *value_internal_coeffs_init, const double *value_boundary_coeffs_init, + const double *gradient_internal_coeffs_init, const double *gradient_boundary_coeffs_init, + double *boundary_K, double *boundary_alphaEff, + double *value_internal_coeffs, double *value_boundary_coeffs, + double *gradient_internal_coeffs, double *gradient_boundary_coeffs) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_faces) + return; + + int p = bouPermedIndex[index]; + + //if (index == 4 || index == 5 || index == 6) { + // printf("dst index: %d, src index: %d, value: %.10lf\n", index, p, value_internal_coeffs_init[p]); + //} + boundary_K[index] = boundary_K_init[p]; + boundary_alphaEff[index] = boundary_alphaEff_init[p]; + value_internal_coeffs[index] = value_internal_coeffs_init[p]; + value_boundary_coeffs[index] = value_boundary_coeffs_init[p]; + gradient_internal_coeffs[index] = gradient_internal_coeffs_init[p]; + gradient_boundary_coeffs[index] = gradient_boundary_coeffs_init[p]; +} + +// constructor +dfEEqn::dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile) + : dataBase_(dataBase) +{ + ESolver = new AmgXSolver(modeStr, cfgFile); + + num_cells = dataBase_.num_cells; + cell_bytes = dataBase_.cell_bytes; + num_faces = dataBase_.num_faces; + cell_vec_bytes = dataBase_.cell_vec_bytes; + csr_value_vec_bytes = dataBase_.csr_value_vec_bytes; + num_boundary_cells = dataBase_.num_boundary_cells; + num_surfaces = dataBase_.num_surfaces; + num_boundary_faces = dataBase_.num_boundary_faces; + boundary_face_bytes = dataBase_.boundary_face_bytes; + + d_A_csr_row_index = dataBase_.d_A_csr_row_index; + d_A_csr_diag_index = dataBase_.d_A_csr_diag_index; + d_A_csr_col_index = dataBase_.d_A_csr_col_index; + + h_A_csr = new double[(num_cells + num_faces) * 3]; + h_b = new double[num_cells * 3]; + cudaMallocHost(&h_he_new, cell_bytes); + + checkCudaErrors(cudaMalloc((void **)&d_A_csr, csr_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_b, cell_vec_bytes)); + + checkCudaErrors(cudaMalloc((void **)&d_he_old, cell_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_K, cell_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_K_old, cell_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_alphaEff, cell_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_dpdt, cell_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_diffAlphaD, cell_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_hDiffCorrFlux, cell_bytes * 3)); + + checkCudaErrors(cudaMalloc((void **)&d_boundary_K_init, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_boundary_K, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_boundary_alphaEff_init, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_boundary_alphaEff, boundary_face_bytes)); + + checkCudaErrors(cudaMalloc((void **)&d_value_internal_coeffs_init, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_value_boundary_coeffs_init, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_gradient_internal_coeffs_init, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_gradient_boundary_coeffs_init, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_value_internal_coeffs, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_value_boundary_coeffs, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_gradient_internal_coeffs, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_gradient_boundary_coeffs, boundary_face_bytes)); + + checkCudaErrors(cudaStreamCreate(&stream)); +} + +void dfEEqn::prepare_data(const double *he_old, const double *K, const double *K_old, const double *alphaEff, + const double *dpdt, const double *diffAlphaD, const double *hDiffCorrFlux, + const double *boundary_K, const double *boundary_alphaEff, + const double *valueInternalCoeffs, const double *valueBoundaryCoeffs, + const double *gradientInternalCoeffs, const double *gradientBoundaryCoeffs) { + // TODO not real async copy now, because some host array are not in pinned memory. + + // copy the host input array in host memory to the device input array in device memory + clock_t start = std::clock(); + + checkCudaErrors(cudaMemcpyAsync(d_he_old, he_old, cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_K, K, cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_K_old, K_old, cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_alphaEff, alphaEff, cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_dpdt, dpdt, cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_diffAlphaD, diffAlphaD, cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_hDiffCorrFlux, hDiffCorrFlux, cell_bytes * 3, cudaMemcpyHostToDevice, stream)); + + // copy and permutate boundary variable + checkCudaErrors(cudaMemcpyAsync(d_boundary_K_init, boundary_K, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_alphaEff_init, boundary_alphaEff, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_value_internal_coeffs_init, valueInternalCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_value_boundary_coeffs_init, valueBoundaryCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_gradient_internal_coeffs_init, gradientInternalCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_gradient_boundary_coeffs_init, gradientBoundaryCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block; + eeqn_boundaryPermutation<<>>(num_boundary_faces, dataBase_.d_bouPermedIndex, + d_boundary_K_init, d_boundary_alphaEff_init, + d_value_internal_coeffs_init, d_value_boundary_coeffs_init, + d_gradient_internal_coeffs_init, d_gradient_boundary_coeffs_init, + d_boundary_K, d_boundary_alphaEff, + d_value_internal_coeffs, d_value_boundary_coeffs, + d_gradient_internal_coeffs, d_gradient_boundary_coeffs); + clock_t end = std::clock(); + time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC); +} + +void dfEEqn::resetAb() { + // initialize matrix value + checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, csr_value_vec_bytes, stream)); + checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream)); +} + +#define TRACE fprintf(stderr, "%s %d\n", __FILE__, __LINE__); + +void dfEEqn::fvm_ddt() +{ + clock_t start = std::clock(); + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + eeqn_fvm_ddt_kernel<<>>(num_cells, dataBase_.rdelta_t, + d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, d_he_old, + 1., d_A_csr, d_b, d_A_csr, d_b); + clock_t end = std::clock(); + time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); +} + +void dfEEqn::fvm_div() +{ + clock_t start = std::clock(); + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + eeqn_fvm_div_internal<<>>(num_cells, + d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_weight, dataBase_.d_phi, + 1., d_A_csr, d_b, d_A_csr, d_b); + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + eeqn_fvm_div_boundary<<>>(num_boundary_cells, + d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + d_value_internal_coeffs, d_value_boundary_coeffs, + 1., d_A_csr, d_b, d_A_csr, d_b); + clock_t end = std::clock(); + time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); +} + +void dfEEqn::fvm_laplacian() +{ + clock_t start = std::clock(); + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + eeqn_fvm_laplacian_uncorrected_internal<<>>(num_cells, + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, d_alphaEff, dataBase_.d_weight, + dataBase_.d_face, dataBase_.d_deltaCoeffs, + 1., d_A_csr, d_A_csr); + + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + eeqn_fvm_laplacian_uncorrected_boundary<<>>(num_boundary_cells, + d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + d_boundary_alphaEff, dataBase_.d_boundary_face, d_gradient_internal_coeffs, d_gradient_boundary_coeffs, + 1., d_A_csr, d_b, d_A_csr, d_b); + clock_t end = std::clock(); + time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); +} + +void dfEEqn::fvc_ddt() +{ + // " + fvc::ddt(rho,K)" is on the left side of "==", thus should minus from source. + clock_t start = std::clock(); + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + eeqn_fvc_ddt_kernel<<>>(num_cells, dataBase_.rdelta_t, + dataBase_.d_rho_old, dataBase_.d_rho_new, d_K_old, d_K, + -1., d_b, d_b); + clock_t end = std::clock(); + time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); +} + +void dfEEqn::fvc_div_vector() +{ + // " + fvc::div(hDiffCorrFlux)" is on the right side of "==", thus should add to source. + clock_t start = std::clock(); + size_t threads_per_block = 512; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + eeqn_fvc_div_vector_internal<<>>(num_cells, + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + dataBase_.d_face_vector, dataBase_.d_grad, dataBase_.d_weight, + 1., d_b, d_b); + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + eeqn_fvc_div_vector_boundary<<>>(num_boundary_cells, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_boundary_face_vector, dataBase_.d_grad_boundary, + 1., d_b, d_b); + clock_t end = std::clock(); + time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); +} + +void dfEEqn::fvc_div_phi_scalar() +{ + // " + fvc::div(phi,K)" is on the left side of "==", thus should minus from source. + clock_t start = std::clock(); + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + eeqn_fvc_div_phi_scalar_internal<<>>(num_cells, + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + dataBase_.d_weight, dataBase_.d_phi, d_K, + -1., d_b, d_b); + blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; + eeqn_fvc_div_phi_scalar_boundary<<>>(num_boundary_cells, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_boundary_phi, d_boundary_K, + -1., d_b, d_b); + clock_t end = std::clock(); + time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); +} + +void dfEEqn::add_to_source() +{ + // " - dpdt" is on the left side of "==", thus should add to source. + // "+ diffAlphaD" is on the left side of "==", thus should minus from source. + clock_t start = std::clock(); + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + // " + fvc::ddt(rho,K)" is on the left side of "==", thus should minus from source. + eeqn_add_to_source_kernel<<>>(num_cells, + 1., d_dpdt, -1., d_diffAlphaD, dataBase_.d_volume, d_b, d_b); + clock_t end = std::clock(); + time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); +} + +void dfEEqn::checkValue(bool print) +{ + checkCudaErrors(cudaMemcpyAsync(h_A_csr, d_A_csr, csr_value_vec_bytes, cudaMemcpyDeviceToHost, stream)); + checkCudaErrors(cudaMemcpyAsync(h_b, d_b, cell_vec_bytes, cudaMemcpyDeviceToHost, stream)); + + // Synchronize stream + checkCudaErrors(cudaStreamSynchronize(stream)); + if (print) + { + for (int i = 0; i < (num_faces + num_cells); i++) + fprintf(stderr, "h_A_csr[%d]: %.15lf\n", i, h_A_csr[i]); + for (int i = 0; i < num_cells * 3; i++) + fprintf(stderr, "h_b[%d]: %.15lf\n", i, h_b[i]); + } + + char *input_file = "of_output.txt"; + FILE *fp = fopen(input_file, "rb+"); + if (fp == NULL) + { + fprintf(stderr, "Failed to open input file: %s!\n", input_file); + } + int readfile = 0; + double *of_b = new double[num_cells]; + double *of_A = new double[num_faces + num_cells]; + readfile = fread(of_b, num_cells * sizeof(double), 1, fp); + readfile = fread(of_A, (num_faces + num_cells) * sizeof(double), 1, fp); + + std::vector h_A_of_init_vec(num_cells + num_faces); + std::copy(of_A, of_A + num_cells + num_faces, h_A_of_init_vec.begin()); + + std::vector h_A_of_vec_1mtx(num_faces + num_cells, 0); + for (int i = 0; i < num_faces + num_cells; i++) + { + h_A_of_vec_1mtx[i] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i]]; + } + + // b + std::vector h_b_of_vec(num_cells); + std::copy(of_b, of_b + num_cells, h_b_of_vec.begin()); + + if (print) + { + for (int i = 0; i < (num_faces + num_cells); i++) + fprintf(stderr, "h_A_of_vec_1mtx[%d]: %.15lf\n", i, h_A_of_vec_1mtx[i]); + for (int i = 0; i < num_cells; i++) + fprintf(stderr, "h_b_of_vec[%d]: %.15lf\n", i, h_b_of_vec[i]); + } + + // check + fprintf(stderr, "check of h_A_csr\n"); + checkVectorEqual(num_faces + num_cells, h_A_of_vec_1mtx.data(), h_A_csr, 1e-5); + fprintf(stderr, "check of h_b\n"); + checkVectorEqual(num_cells, h_b_of_vec.data(), h_b, 1e-5); +} + +void dfEEqn::solve() +{ + checkCudaErrors(cudaStreamSynchronize(stream)); + printf("CPU Time (copy&permutate) = %.6lf s\n", time_monitor_CPU); + printf("GPU Time (kernel launch) = %.6lf s\n", time_monitor_GPU_kernel); + printf("GPU Time (memcpy) = %.6lf s\n", time_monitor_GPU_memcpy); + time_monitor_CPU = 0; + time_monitor_GPU_kernel = 0; + time_monitor_GPU_memcpy = 0; + + // nvtxRangePush("solve"); + + int nNz = num_cells + num_faces; // matrix entries + if (num_iteration == 0) // first interation + { + printf("Initializing AmgX Linear Solver\n"); + ESolver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr); + } + else + { + ESolver->updateOperator(num_cells, nNz, d_A_csr); + } + ESolver->solve(num_cells, d_he_old, d_b); + num_iteration++; + + checkCudaErrors(cudaMemcpyAsync(h_he_new, d_he_old, cell_bytes, cudaMemcpyDeviceToHost, stream)); + // for (size_t i = 0; i < num_cells; i++) + // fprintf(stderr, "h_he_after[%d]: %.15lf\n", i, h_he_new[i]); +} + +void dfEEqn::sync() +{ + checkCudaErrors(cudaStreamSynchronize(stream)); +} + +void dfEEqn::updatePsi(double *Psi) +{ + for (size_t i = 0; i < num_cells; i++) + { + Psi[i] = h_he_new[i]; + } +} + +dfEEqn::~dfEEqn() +{ + delete h_A_csr; + delete h_b; + + checkCudaErrors(cudaFree(d_A_csr)); + checkCudaErrors(cudaFree(d_b)); + + checkCudaErrors(cudaFree(d_he_old)); + checkCudaErrors(cudaFree(h_he_new)); + + checkCudaErrors(cudaFree(d_K)); + checkCudaErrors(cudaFree(d_K_old)); + checkCudaErrors(cudaFree(d_alphaEff)); + checkCudaErrors(cudaFree(d_dpdt)); + checkCudaErrors(cudaFree(d_diffAlphaD)); + checkCudaErrors(cudaFree(d_hDiffCorrFlux)); + + checkCudaErrors(cudaFree(d_boundary_K_init)); + checkCudaErrors(cudaFree(d_boundary_K)); + checkCudaErrors(cudaFree(d_boundary_alphaEff_init)); + checkCudaErrors(cudaFree(d_boundary_alphaEff)); + + checkCudaErrors(cudaFree(d_value_internal_coeffs_init)); + checkCudaErrors(cudaFree(d_value_boundary_coeffs_init)); + checkCudaErrors(cudaFree(d_gradient_internal_coeffs_init)); + checkCudaErrors(cudaFree(d_gradient_boundary_coeffs_init)); + checkCudaErrors(cudaFree(d_value_internal_coeffs)); + checkCudaErrors(cudaFree(d_value_boundary_coeffs)); + checkCudaErrors(cudaFree(d_gradient_internal_coeffs)); + checkCudaErrors(cudaFree(d_gradient_boundary_coeffs)); +} From e847450106ca42e99b4ca432eb1c64851cdcc512 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Mon, 5 Jun 2023 14:14:33 +0800 Subject: [PATCH 2/3] fix bug in fvc_grad_internal_face --- .../solvers/dfLowMachFoam/createdfSolver.H | 3 ++- .../H2/cvodeIntegrator_gpu/system/fvSchemes | 18 +++++++++--------- src_gpu/dfUEqn.cu | 4 ++-- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/createdfSolver.H b/applications/solvers/dfLowMachFoam/createdfSolver.H index e56376b28..880ae674c 100644 --- a/applications/solvers/dfLowMachFoam/createdfSolver.H +++ b/applications/solvers/dfLowMachFoam/createdfSolver.H @@ -49,9 +49,10 @@ cudaMallocHost(&boundary_nuEff_init, num_boundary_faces*sizeof(double)); cudaMallocHost(&boundary_rho_init, num_boundary_faces*sizeof(double)); cudaMallocHost(&boundary_phi_init, num_boundary_faces*sizeof(double)); -double *boundary_alphaEff, *boundary_K; +double *boundary_alphaEff, *boundary_K, *boundary_hDiffCorrFlux; double *eeqn_valueInternalCoeffs, *eeqn_valueBoundaryCoeffs, *eeqn_gradientInternalCoeffs, *eeqn_gradientBoundaryCoeffs; cudaMallocHost(&boundary_K, num_boundary_faces*sizeof(double)); +cudaMallocHost(&boundary_hDiffCorrFlux, 3 * num_boundary_faces*sizeof(double)); cudaMallocHost(&boundary_alphaEff, num_boundary_faces*sizeof(double)); cudaMallocHost(&eeqn_valueInternalCoeffs, num_boundary_faces*sizeof(double)); cudaMallocHost(&eeqn_valueBoundaryCoeffs, num_boundary_faces*sizeof(double)); diff --git a/examples/dfLowMachFoam/threeD_reactingTGV/H2/cvodeIntegrator_gpu/system/fvSchemes b/examples/dfLowMachFoam/threeD_reactingTGV/H2/cvodeIntegrator_gpu/system/fvSchemes index c5c9136e5..e2c0058cd 100644 --- a/examples/dfLowMachFoam/threeD_reactingTGV/H2/cvodeIntegrator_gpu/system/fvSchemes +++ b/examples/dfLowMachFoam/threeD_reactingTGV/H2/cvodeIntegrator_gpu/system/fvSchemes @@ -30,15 +30,15 @@ divSchemes default none; div(phi,U) Gauss linear; - div(phi,Yi) Gauss limitedLinear01 1; - div(phi,h) Gauss limitedLinear 1; - div(phi,ha) Gauss limitedLinear 1; - div(phi,K) Gauss limitedLinear 1; - div(phid,p) Gauss limitedLinear 1; - div(phi,epsilon) Gauss limitedLinear 1; - div(phi,Yi_h) Gauss limitedLinear01 1; - div(phi,k) Gauss limitedLinear 1; - div(hDiffCorrFlux) Gauss cubic; + div(phi,Yi) Gauss linear; + div(phi,h) Gauss linear; + div(phi,ha) Gauss linear; + div(phi,K) Gauss linear; + div(phid,p) Gauss linear; + div(phi,epsilon) Gauss linear; + div(phi,Yi_h) Gauss upwind; + div(phi,k) Gauss linear; + div(hDiffCorrFlux) Gauss linear; div(((rho*nuEff)*dev2(T(grad(U))))) Gauss linear; } diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 4afebd8c1..350befb50 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -181,7 +181,7 @@ __global__ void fvc_grad_internal_face(int num_cells, double sfz = face_vector[neighbor_index * 3 + 2]; int neighbor_cell_id = csr_col_index[row_index + inner_index]; double neighbor_cell_p = pressure[neighbor_cell_id]; - double face_p = (1 - w) * own_cell_p + w * neighbor_cell_p; + double face_p = w * own_cell_p + (1 - w) * neighbor_cell_p; grad_bx_upp += face_p * sfx; grad_by_upp += face_p * sfy; grad_bz_upp += face_p * sfz; @@ -1206,4 +1206,4 @@ void dfUEqn::updatePsi(double *Psi) dfUEqn::~dfUEqn() { -} \ No newline at end of file +} From 56f23a335014c6aadd62672e956210aae8c07772 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Mon, 5 Jun 2023 17:13:26 +0800 Subject: [PATCH 3/3] fix EEqn_GPU --- applications/solvers/dfLowMachFoam/EEqn.H | 137 +++++++----------- .../solvers/dfLowMachFoam/dfLowMachFoam.C | 17 +++ src_gpu/dfEEqn.H | 6 +- src_gpu/dfEEqn.cu | 105 +++++--------- 4 files changed, 112 insertions(+), 153 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H index 3b8d38c8e..bdef9043f 100644 --- a/applications/solvers/dfLowMachFoam/EEqn.H +++ b/applications/solvers/dfLowMachFoam/EEqn.H @@ -1,68 +1,44 @@ { - fprintf(stderr, "\n\n=====================enter EEqn=====================\n\n\n"); volScalarField& he = thermo.he(); - #ifdef GPUSolver_ - // start1 = std::clock(); - // t.join(); - // UEqn_GPU.updatePsi(&U[0][0]); - // K = 0.5*magSqr(U); - // end1 = std::clock(); - // time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - // time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - // time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC); - // time_monitor_UinE += double(end1 - start1) / double(CLOCKS_PER_SEC); + +#ifdef GPUSolver_ start1 = std::clock(); - // // t.join(); UEqn_GPU.updatePsi(&U[0][0]); K = 0.5*magSqr(U); - // K = 0.5*magSqr(U) * 10000000000; end1 = std::clock(); time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC); - #endif -#ifdef CPUSolver_ - //fvScalarMatrix EEqn - //( - // fvm::ddt(rho, he) + fvm::div(phi, he) - // + fvc::ddt(rho, K) + fvc::div(phi, K) - // - dpdt - // - fvm::laplacian(turbulence->alphaEff(), he) - // + diffAlphaD - // == - // fvc::div(hDiffCorrFlux) - //); - - //EEqn.relax(); - - //EEqn.solve(); +#endif +#ifdef CPUSolver_ + start1 = std::clock(); fvScalarMatrix EEqn ( - //fvm::ddt(rho, he) + fvm::div(phi, he) - //fvm::ddt(rho, he) + fvc::ddt(rho, K) - //fvm::ddt(rho, he) + fvc::div(phi, K) * 10000000000 - //fvm::ddt(rho, he) + fvc::div(phi, K) - //fvm::laplacian(turbulence->alphaEff(), he) - fvm::div(phi, he) - //fvm::laplacian(turbulence->alphaEff(), he) - dpdt + diffAlphaD - //fvm::laplacian(turbulence->alphaEff(), he) - fvc::div(hDiffCorrFlux) - //fvm::laplacian(turbulence->alphaEff(), he) + fvc::div(phi, K) * 10000000000 - //== - //fvc::div(hDiffCorrFlux) + fvm::ddt(rho, he) + fvm::div(phi, he) + + fvc::ddt(rho, K) + fvc::div(phi, K) + - dpdt + - fvm::laplacian(turbulence->alphaEff(), he) + + diffAlphaD + == + fvc::div(hDiffCorrFlux) ); + EEqn.relax(); + EEqn.solve(); + + end1 = std::clock(); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); #endif #ifdef GPUSolver_ - start2 = std::clock(); // prepare data on CPU + start1 = std::clock(); + start2 = std::clock(); const tmp alphaEff_tmp(turbulence->alphaEff()); const volScalarField& alphaEff = alphaEff_tmp(); - - int idx = 0; - offset = 0; + int eeqn_offset = 0; forAll(he.boundaryField(), patchi) { const fvsPatchScalarField& patchFlux = phi.boundaryField()[patchi]; @@ -70,77 +46,72 @@ int patchSize = pw.size(); const scalarField& patchK = K.boundaryField()[patchi]; + const vectorField& patchhDiffCorrFlux = hDiffCorrFlux.boundaryField()[patchi]; const scalarField& patchAlphaEff = alphaEff.boundaryField()[patchi]; - memcpy(boundary_K + offset, &patchK[0], patchSize*sizeof(double)); - memcpy(boundary_alphaEff + offset, &patchAlphaEff[0], patchSize*sizeof(double)); + memcpy(boundary_K + eeqn_offset, &patchK[0], patchSize*sizeof(double)); + memcpy(boundary_hDiffCorrFlux + eeqn_offset * 3, &patchhDiffCorrFlux[0][0], 3 * patchSize*sizeof(double)); + memcpy(boundary_alphaEff + eeqn_offset, &patchAlphaEff[0], patchSize*sizeof(double)); Field valueInternalCoeffs = patchFlux*he.boundaryField()[patchi].valueInternalCoeffs(pw); Field valueBoundaryCoeffs = -patchFlux*he.boundaryField()[patchi].valueBoundaryCoeffs(pw); Field gradientInternalCoeffs = he.boundaryField()[patchi].gradientInternalCoeffs(); Field gradientBoundaryCoeffs = he.boundaryField()[patchi].gradientBoundaryCoeffs(); - memcpy(eeqn_valueInternalCoeffs + offset, &valueInternalCoeffs[0], patchSize*sizeof(double)); - memcpy(eeqn_valueBoundaryCoeffs + offset, &valueBoundaryCoeffs[0], patchSize*sizeof(double)); - memcpy(eeqn_gradientInternalCoeffs + offset, &gradientInternalCoeffs[0], patchSize*sizeof(double)); - memcpy(eeqn_gradientBoundaryCoeffs + offset, &gradientBoundaryCoeffs[0], patchSize*sizeof(double)); + memcpy(eeqn_valueInternalCoeffs + eeqn_offset, &valueInternalCoeffs[0], patchSize*sizeof(double)); + memcpy(eeqn_valueBoundaryCoeffs + eeqn_offset, &valueBoundaryCoeffs[0], patchSize*sizeof(double)); + memcpy(eeqn_gradientInternalCoeffs + eeqn_offset, &gradientInternalCoeffs[0], patchSize*sizeof(double)); + memcpy(eeqn_gradientBoundaryCoeffs + eeqn_offset, &gradientBoundaryCoeffs[0], patchSize*sizeof(double)); - //for (int t = 0; t < patchSize; t++) { - // fprintf(stderr, "gpu valueInternalCoeffs[%d][%d]: %.10lf\n", patchi, t, eeqn_valueInternalCoeffs[offset + t]); - // if (idx == 64 || idx == 4160 || idx == 8385) - // fprintf(stderr, "eeqn_value_internal_coeffs[%d](other index: %d %d): %.10lf(other value: %.10lf)\n", idx, offset, t, eeqn_valueInternalCoeffs[idx], eeqn_valueInternalCoeffs[offset + t]); - // idx++; - //} - offset += patchSize; + eeqn_offset += patchSize; } - offset = 0; - end2 = std::clock(); - time_monitor_CPU += double(end2 - start2) / double(CLOCKS_PER_SEC); - start1 = std::clock(); + eeqn_offset = 0; + end1 = std::clock(); + time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly_CPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); + // prepare data on GPU + start1 = std::clock(); EEqn_GPU.prepare_data(&he.oldTime()[0], &K[0], &K.oldTime()[0], &alphaEff[0], &dpdt[0], &diffAlphaD[0], &hDiffCorrFlux[0][0], - boundary_K, boundary_alphaEff, + boundary_K, boundary_hDiffCorrFlux, boundary_alphaEff, eeqn_valueInternalCoeffs, eeqn_valueBoundaryCoeffs, eeqn_gradientInternalCoeffs, eeqn_gradientBoundaryCoeffs); + if (doSync) EEqn_GPU.sync(); end1 = std::clock(); - time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - fprintf(stderr, "\n\n=====================EEqn GPUSolver_ part2=====================\n\n\n"); + time_monitor_EEqn_mtxAssembly_GPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); start1 = std::clock(); EEqn_GPU.resetAb(); - //EEqn_GPU.fvm_ddt(); + EEqn_GPU.fvm_ddt(); EEqn_GPU.fvm_div(); - //EEqn_GPU.fvm_laplacian(); - //EEqn_GPU.fvc_ddt(); - //EEqn_GPU.fvc_div_phi_scalar(); - //EEqn_GPU.fvc_div_vector(); - //EEqn_GPU.add_to_source(); - EEqn_GPU.sync(); + EEqn_GPU.fvm_laplacian(); + EEqn_GPU.fvc_ddt(); + EEqn_GPU.fvc_div_phi_scalar(); + EEqn_GPU.fvc_div_vector(); + EEqn_GPU.add_to_source(); + if (doSync) EEqn_GPU.sync(); end1 = std::clock(); - time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - fprintf(stderr, "\n\n=====================EEqn GPUSolver_ part3=====================\n\n\n"); + time_monitor_EEqn_mtxAssembly_GPU_Run += double(end1 - start1) / double(CLOCKS_PER_SEC); + + EEqn_GPU.sync(); + end2 = std::clock(); + time_monitor_EEqn += double(end2 - start2) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly += double(end2 - start2) / double(CLOCKS_PER_SEC); - // check value of mtxAssembly + // check value of mtxAssembly, no time monitor EEqn_GPU.checkValue(false); - //EEqn_GPU.checkValue(true); - fprintf(stderr, "\n\n=====================EEqn GPUSolver_ part4=====================\n\n\n"); start1 = std::clock(); EEqn_GPU.solve(); + if (doSync) EEqn_GPU.sync(); end1 = std::clock(); time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_EEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); - fprintf(stderr, "\n\n=====================EEqn GPUSolver_ part5=====================\n\n\n"); start1 = std::clock(); EEqn_GPU.updatePsi(&he[0]); end1 = std::clock(); + time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC); - fprintf(stderr, "\n\n=====================EEqn GPUSolver_ part6=====================\n\n\n"); #endif - fprintf(stderr, "\n\n=====================leave EEqn=====================\n\n\n"); } diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C index 7b6ba15ee..80bf0bcd8 100644 --- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C +++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C @@ -96,6 +96,7 @@ int main(int argc, char *argv[]) #include "createFields.H" #include "createRhoUfIfPresent.H" + bool doSync = true; double time_monitor_flow=0; double time_monitor_UEqn=0; double time_monitor_UEqn_mtxAssembly=0; @@ -103,6 +104,9 @@ int main(int argc, char *argv[]) double time_monitor_UEqn_sum=0; double time_monitor_EEqn=0; double time_monitor_EEqn_mtxAssembly=0; + double time_monitor_EEqn_mtxAssembly_CPU_Prepare; + double time_monitor_EEqn_mtxAssembly_GPU_Prepare; + double time_monitor_EEqn_mtxAssembly_GPU_Run; double time_monitor_EEqn_Solve=0; double time_monitor_EEqn_sum=0; double time_monitor_chem=0; @@ -229,6 +233,7 @@ int main(int argc, char *argv[]) runTime.write(); time_monitor_UEqn_sum += time_monitor_UEqn; + time_monitor_EEqn_sum += time_monitor_EEqn; Info << "output time index " << runTime.timeIndex() << endl; @@ -244,6 +249,12 @@ int main(int argc, char *argv[]) Info<< "UEqn Time solve = " << time_monitor_UEqn_Solve << " s" << endl; // Info<< "UEqn sum Time = " << time_monitor_UEqn_sum << " s" << endl; // Info<< "UEqn sum Time - overhead = " << time_monitor_UEqn_sum - time_UEqn_initial << " s" << endl; + Info<< "EEqn Time = " << time_monitor_EEqn << " s" << endl; + Info<< "EEqn Time assamble Mtx = " << time_monitor_EEqn_mtxAssembly << " s" << endl; + Info<< "EEqn assamble(CPU prepare) = " << time_monitor_EEqn_mtxAssembly_CPU_Prepare << " s" << endl; + Info<< "EEqn assamble(GPU prepare) = " << time_monitor_EEqn_mtxAssembly_GPU_Prepare << " s" << endl; + Info<< "EEqn assamble(GPU run) = " << time_monitor_EEqn_mtxAssembly_GPU_Run << " s" << endl; + Info<< "EEqn Time solve = " << time_monitor_EEqn_Solve << " s" << endl; Info<< "sum Time = " << (time_monitor_chem + time_monitor_Y + time_monitor_flow + time_monitor_E + time_monitor_corrThermo + time_monitor_corrDiff) << " s" << endl; Info<< "CPU Time (get turb souce) = " << time_monitor_CPU << " s" << endl; Info<< "UEqn time in EEqn = " << time_monitor_UinE << " s" << endl; @@ -254,6 +265,12 @@ int main(int argc, char *argv[]) time_monitor_UEqn = 0; time_monitor_UEqn_mtxAssembly = 0; time_monitor_UEqn_Solve = 0; + time_monitor_EEqn = 0; + time_monitor_EEqn_mtxAssembly = 0; + time_monitor_EEqn_mtxAssembly_CPU_Prepare = 0; + time_monitor_EEqn_mtxAssembly_GPU_Prepare = 0; + time_monitor_EEqn_mtxAssembly_GPU_Run = 0; + time_monitor_EEqn_Solve = 0; time_monitor_chem = 0; time_monitor_Y = 0; time_monitor_E = 0; diff --git a/src_gpu/dfEEqn.H b/src_gpu/dfEEqn.H index 10fc3569a..b606e5f28 100644 --- a/src_gpu/dfEEqn.H +++ b/src_gpu/dfEEqn.H @@ -11,8 +11,6 @@ private: cudaStream_t stream; AmgXSolver *ESolver = nullptr; int num_iteration; - double time_monitor_CPU; - double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test; // common variables int num_cells, cell_bytes, num_faces, num_surfaces, cell_vec_bytes, csr_value_vec_bytes, num_boundary_cells; @@ -34,6 +32,8 @@ private: double *d_hDiffCorrFlux = nullptr; double *d_boundary_K_init = nullptr; double *d_boundary_K = nullptr; + double *d_boundary_hDiffCorrFlux_init = nullptr; + double *d_boundary_hDiffCorrFlux = nullptr; double *d_boundary_alphaEff_init = nullptr; double *d_boundary_alphaEff = nullptr; double *d_value_internal_coeffs_init = nullptr; @@ -52,7 +52,7 @@ public: void prepare_data(const double *he_old, const double *K, const double *K_old, const double *alphaEff, const double *hDiffCorrFlux, const double *dpdt, const double *diffAlphaD, - const double *boundary_K, const double *boundary_alphaEff, + const double *boundary_K, const double *boundary_hDiffCorrFlux, const double *boundary_alphaEff, const double *valueInternalCoeffs, const double *valueBoundaryCoeffs, const double *gradientInternalCoeffs, const double *gradientBoundaryCoeffs); diff --git a/src_gpu/dfEEqn.cu b/src_gpu/dfEEqn.cu index 6f3d58acb..6907ed4cd 100644 --- a/src_gpu/dfEEqn.cu +++ b/src_gpu/dfEEqn.cu @@ -49,7 +49,6 @@ __global__ void eeqn_fvm_div_internal(int num_cells, int neighbor_index = neighbor_offset + inner_index; double w = weight[neighbor_index]; double f = phi[neighbor_index]; - //if (i == 267) printf("lower A[%d], %.10lf + %.10lf = %.10lf\n", i, A_csr_input[i], (-w) * f * sign, A_csr_input[i] + (1 - w) * f * sign); A_csr_output[i] = A_csr_input[i] + (-w) * f * sign; // lower neighbors contribute to sum of -1 div_diag += (w - 1) * f; @@ -61,13 +60,11 @@ __global__ void eeqn_fvm_div_internal(int num_cells, int neighbor_index = neighbor_offset + inner_index - 1; double w = weight[neighbor_index]; double f = phi[neighbor_index]; - //if (i == 267) printf("upper A[%d], %.10lf + %.10lf = %.10lf\n", i, A_csr_input[i], (-w) * f * sign, A_csr_input[i] + (1 - w) * f * sign); A_csr_output[i] = A_csr_input[i] + (1 - w) * f * sign; // upper neighbors contribute to sum of 1 div_diag += w * f; } } - //if (row_index + diag_index == 267) printf("diag A[%d], %.10lf + %.10lf = %.10lf\n", row_index + diag_index, A_csr_input[row_index + diag_index], div_diag * sign, A_csr_input[row_index + diag_index] + div_diag * sign); A_csr_output[row_index + diag_index] = A_csr_input[row_index + diag_index] + div_diag * sign; // diag } @@ -95,11 +92,9 @@ __global__ void eeqn_fvm_div_boundary(int num_boundary_cells, double boundary_coeffs = 0; for (int i = 0; i < loop_size; i++) { - //if (cell_index == 1) printf("term[%d], src index: %d, src value: %.10lf\n", i, cell_offset + i, value_internal_coeffs[cell_offset + i]); internal_coeffs += value_internal_coeffs[cell_offset + i]; boundary_coeffs += value_boundary_coeffs[cell_offset + i]; } - //if (cell_index == 1) printf("boundary A[%d], cell_index: %d, loop_size: %d, %.10lf + %.10lf = %.10lf, \n", csr_index, cell_index, loop_size, A_csr_input[csr_index], internal_coeffs * sign, A_csr_input[csr_index] + internal_coeffs * sign); A_csr_output[csr_index] = A_csr_input[csr_index] + internal_coeffs * sign; b_output[cell_index] = b_input[cell_index] + boundary_coeffs * sign; } @@ -204,13 +199,14 @@ __global__ void eeqn_fvm_laplacian_uncorrected_boundary(int num_boundary_cells, __global__ void eeqn_fvc_ddt_kernel(int num_cells, const double rdelta_t, const double *rho_old, const double *rho_new, const double *K_old, const double *K, + const double *volume, const double sign, const double *b_input, double *b_output) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_cells) return; - double fvc_ddt_term = rdelta_t * (rho_new[index] * K[index] - rho_old[index] * K_old[index]); + double fvc_ddt_term = rdelta_t * (rho_new[index] * K[index] - rho_old[index] * K_old[index]) * volume[index]; b_output[index] = b_input[index] + fvc_ddt_term * sign; } @@ -344,20 +340,22 @@ __global__ void eeqn_fvc_div_phi_scalar_internal(int num_cells, { int neighbor_index = neighbor_offset + inner_index; double w = weight[neighbor_index]; + double p = phi[neighbor_index]; int neighbor_cell_id = csr_col_index[row_index + inner_index]; double neighbor_cell_k = K[neighbor_cell_id]; double face_k = (1 - w) * own_cell_k + w * neighbor_cell_k; - interp -= phi[i] * face_k; + interp -= p * face_k; } // upper if (inner_index > diag_index) { int neighbor_index = neighbor_offset + inner_index - 1; double w = weight[neighbor_index]; + double p = phi[neighbor_index]; int neighbor_cell_id = csr_col_index[row_index + inner_index]; double neighbor_cell_k = K[neighbor_cell_id]; - double face_k = (1 - w) * own_cell_k + w * neighbor_cell_k; - interp += phi[i] * face_k; + double face_k = w * own_cell_k + (1 - w) * neighbor_cell_k; + interp += p * face_k; } } b_output[index] = b_input[index] + interp * sign; @@ -401,10 +399,12 @@ __global__ void eeqn_add_to_source_kernel(int num_cells, } __global__ void eeqn_boundaryPermutation(const int num_boundary_faces, const int *bouPermedIndex, - const double *boundary_K_init, const double *boundary_alphaEff_init, + const double *boundary_K_init, const double *boundary_hDiffCorrFlux_init, + const double *boundary_alphaEff_init, const double *value_internal_coeffs_init, const double *value_boundary_coeffs_init, const double *gradient_internal_coeffs_init, const double *gradient_boundary_coeffs_init, - double *boundary_K, double *boundary_alphaEff, + double *boundary_K, double *boundary_hDiffCorrFlux, + double *boundary_alphaEff, double *value_internal_coeffs, double *value_boundary_coeffs, double *gradient_internal_coeffs, double *gradient_boundary_coeffs) { @@ -414,10 +414,10 @@ __global__ void eeqn_boundaryPermutation(const int num_boundary_faces, const int int p = bouPermedIndex[index]; - //if (index == 4 || index == 5 || index == 6) { - // printf("dst index: %d, src index: %d, value: %.10lf\n", index, p, value_internal_coeffs_init[p]); - //} boundary_K[index] = boundary_K_init[p]; + boundary_hDiffCorrFlux[index * 3 + 0] = boundary_hDiffCorrFlux_init[p * 3 + 0]; + boundary_hDiffCorrFlux[index * 3 + 1] = boundary_hDiffCorrFlux_init[p * 3 + 1]; + boundary_hDiffCorrFlux[index * 3 + 2] = boundary_hDiffCorrFlux_init[p * 3 + 2]; boundary_alphaEff[index] = boundary_alphaEff_init[p]; value_internal_coeffs[index] = value_internal_coeffs_init[p]; value_boundary_coeffs[index] = value_boundary_coeffs_init[p]; @@ -462,6 +462,8 @@ dfEEqn::dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std checkCudaErrors(cudaMalloc((void **)&d_boundary_K_init, boundary_face_bytes)); checkCudaErrors(cudaMalloc((void **)&d_boundary_K, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void **)&d_boundary_hDiffCorrFlux_init, boundary_face_bytes * 3)); + checkCudaErrors(cudaMalloc((void **)&d_boundary_hDiffCorrFlux, boundary_face_bytes * 3)); checkCudaErrors(cudaMalloc((void **)&d_boundary_alphaEff_init, boundary_face_bytes)); checkCudaErrors(cudaMalloc((void **)&d_boundary_alphaEff, boundary_face_bytes)); @@ -479,14 +481,12 @@ dfEEqn::dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std void dfEEqn::prepare_data(const double *he_old, const double *K, const double *K_old, const double *alphaEff, const double *dpdt, const double *diffAlphaD, const double *hDiffCorrFlux, - const double *boundary_K, const double *boundary_alphaEff, + const double *boundary_K, const double *boundary_hDiffCorrFlux, const double *boundary_alphaEff, const double *valueInternalCoeffs, const double *valueBoundaryCoeffs, const double *gradientInternalCoeffs, const double *gradientBoundaryCoeffs) { // TODO not real async copy now, because some host array are not in pinned memory. // copy the host input array in host memory to the device input array in device memory - clock_t start = std::clock(); - checkCudaErrors(cudaMemcpyAsync(d_he_old, he_old, cell_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_K, K, cell_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_K_old, K_old, cell_bytes, cudaMemcpyHostToDevice, stream)); @@ -497,6 +497,7 @@ void dfEEqn::prepare_data(const double *he_old, const double *K, const double *K // copy and permutate boundary variable checkCudaErrors(cudaMemcpyAsync(d_boundary_K_init, boundary_K, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_hDiffCorrFlux_init, boundary_hDiffCorrFlux, boundary_face_bytes * 3, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_boundary_alphaEff_init, boundary_alphaEff, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_value_internal_coeffs_init, valueInternalCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_value_boundary_coeffs_init, valueBoundaryCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); @@ -506,14 +507,12 @@ void dfEEqn::prepare_data(const double *he_old, const double *K, const double *K size_t threads_per_block = 1024; size_t blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block; eeqn_boundaryPermutation<<>>(num_boundary_faces, dataBase_.d_bouPermedIndex, - d_boundary_K_init, d_boundary_alphaEff_init, + d_boundary_K_init, d_boundary_hDiffCorrFlux_init, d_boundary_alphaEff_init, d_value_internal_coeffs_init, d_value_boundary_coeffs_init, d_gradient_internal_coeffs_init, d_gradient_boundary_coeffs_init, - d_boundary_K, d_boundary_alphaEff, + d_boundary_K, d_boundary_hDiffCorrFlux, d_boundary_alphaEff, d_value_internal_coeffs, d_value_boundary_coeffs, d_gradient_internal_coeffs, d_gradient_boundary_coeffs); - clock_t end = std::clock(); - time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC); } void dfEEqn::resetAb() { @@ -522,24 +521,18 @@ void dfEEqn::resetAb() { checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream)); } -#define TRACE fprintf(stderr, "%s %d\n", __FILE__, __LINE__); - void dfEEqn::fvm_ddt() { - clock_t start = std::clock(); size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; eeqn_fvm_ddt_kernel<<>>(num_cells, dataBase_.rdelta_t, d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, d_he_old, 1., d_A_csr, d_b, d_A_csr, d_b); - clock_t end = std::clock(); - time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } void dfEEqn::fvm_div() { - clock_t start = std::clock(); size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; eeqn_fvm_div_internal<<>>(num_cells, @@ -552,65 +545,52 @@ void dfEEqn::fvm_div() dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, d_value_internal_coeffs, d_value_boundary_coeffs, 1., d_A_csr, d_b, d_A_csr, d_b); - clock_t end = std::clock(); - time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } void dfEEqn::fvm_laplacian() { - clock_t start = std::clock(); size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; eeqn_fvm_laplacian_uncorrected_internal<<>>(num_cells, d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, d_alphaEff, dataBase_.d_weight, dataBase_.d_face, dataBase_.d_deltaCoeffs, - 1., d_A_csr, d_A_csr); - + -1., d_A_csr, d_A_csr); blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; eeqn_fvm_laplacian_uncorrected_boundary<<>>(num_boundary_cells, d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, d_boundary_alphaEff, dataBase_.d_boundary_face, d_gradient_internal_coeffs, d_gradient_boundary_coeffs, - 1., d_A_csr, d_b, d_A_csr, d_b); - clock_t end = std::clock(); - time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); + -1., d_A_csr, d_b, d_A_csr, d_b); } void dfEEqn::fvc_ddt() { // " + fvc::ddt(rho,K)" is on the left side of "==", thus should minus from source. - clock_t start = std::clock(); size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; eeqn_fvc_ddt_kernel<<>>(num_cells, dataBase_.rdelta_t, - dataBase_.d_rho_old, dataBase_.d_rho_new, d_K_old, d_K, + dataBase_.d_rho_old, dataBase_.d_rho_new, d_K_old, d_K, dataBase_.d_volume, -1., d_b, d_b); - clock_t end = std::clock(); - time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } void dfEEqn::fvc_div_vector() { // " + fvc::div(hDiffCorrFlux)" is on the right side of "==", thus should add to source. - clock_t start = std::clock(); size_t threads_per_block = 512; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; eeqn_fvc_div_vector_internal<<>>(num_cells, d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - dataBase_.d_face_vector, dataBase_.d_grad, dataBase_.d_weight, + dataBase_.d_face_vector, d_hDiffCorrFlux, dataBase_.d_weight, 1., d_b, d_b); blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; eeqn_fvc_div_vector_boundary<<>>(num_boundary_cells, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - dataBase_.d_boundary_face_vector, dataBase_.d_grad_boundary, + dataBase_.d_boundary_face_vector, d_boundary_hDiffCorrFlux, 1., d_b, d_b); - clock_t end = std::clock(); - time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } void dfEEqn::fvc_div_phi_scalar() { // " + fvc::div(phi,K)" is on the left side of "==", thus should minus from source. - clock_t start = std::clock(); size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; eeqn_fvc_div_phi_scalar_internal<<>>(num_cells, @@ -622,22 +602,17 @@ void dfEEqn::fvc_div_phi_scalar() dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_phi, d_boundary_K, -1., d_b, d_b); - clock_t end = std::clock(); - time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } void dfEEqn::add_to_source() { // " - dpdt" is on the left side of "==", thus should add to source. // "+ diffAlphaD" is on the left side of "==", thus should minus from source. - clock_t start = std::clock(); size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; // " + fvc::ddt(rho,K)" is on the left side of "==", thus should minus from source. eeqn_add_to_source_kernel<<>>(num_cells, 1., d_dpdt, -1., d_diffAlphaD, dataBase_.d_volume, d_b, d_b); - clock_t end = std::clock(); - time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } void dfEEqn::checkValue(bool print) @@ -650,9 +625,9 @@ void dfEEqn::checkValue(bool print) if (print) { for (int i = 0; i < (num_faces + num_cells); i++) - fprintf(stderr, "h_A_csr[%d]: %.15lf\n", i, h_A_csr[i]); - for (int i = 0; i < num_cells * 3; i++) - fprintf(stderr, "h_b[%d]: %.15lf\n", i, h_b[i]); + fprintf(stderr, "h_A_csr[%d]: %.16lf\n", i, h_A_csr[i]); + for (int i = 0; i < num_cells; i++) + fprintf(stderr, "h_b[%d]: %.16lf\n", i, h_b[i]); } char *input_file = "of_output.txt"; @@ -683,28 +658,21 @@ void dfEEqn::checkValue(bool print) if (print) { for (int i = 0; i < (num_faces + num_cells); i++) - fprintf(stderr, "h_A_of_vec_1mtx[%d]: %.15lf\n", i, h_A_of_vec_1mtx[i]); + fprintf(stderr, "h_A_of_vec_1mtx[%d]: %.16lf\n", i, h_A_of_vec_1mtx[i]); for (int i = 0; i < num_cells; i++) - fprintf(stderr, "h_b_of_vec[%d]: %.15lf\n", i, h_b_of_vec[i]); + fprintf(stderr, "h_b_of_vec[%d]: %.16lf\n", i, h_b_of_vec[i]); } // check fprintf(stderr, "check of h_A_csr\n"); - checkVectorEqual(num_faces + num_cells, h_A_of_vec_1mtx.data(), h_A_csr, 1e-5); + checkVectorEqual(num_faces + num_cells, h_A_of_vec_1mtx.data(), h_A_csr, 1e-6); fprintf(stderr, "check of h_b\n"); - checkVectorEqual(num_cells, h_b_of_vec.data(), h_b, 1e-5); + checkVectorEqual(num_cells, h_b_of_vec.data(), h_b, 1e-6); } void dfEEqn::solve() { checkCudaErrors(cudaStreamSynchronize(stream)); - printf("CPU Time (copy&permutate) = %.6lf s\n", time_monitor_CPU); - printf("GPU Time (kernel launch) = %.6lf s\n", time_monitor_GPU_kernel); - printf("GPU Time (memcpy) = %.6lf s\n", time_monitor_GPU_memcpy); - time_monitor_CPU = 0; - time_monitor_GPU_kernel = 0; - time_monitor_GPU_memcpy = 0; - // nvtxRangePush("solve"); int nNz = num_cells + num_faces; // matrix entries @@ -721,8 +689,9 @@ void dfEEqn::solve() num_iteration++; checkCudaErrors(cudaMemcpyAsync(h_he_new, d_he_old, cell_bytes, cudaMemcpyDeviceToHost, stream)); + // checkCudaErrors(cudaStreamSynchronize(stream)); // for (size_t i = 0; i < num_cells; i++) - // fprintf(stderr, "h_he_after[%d]: %.15lf\n", i, h_he_new[i]); + // fprintf(stderr, "h_he_after[%d]: %.16lf\n", i, h_he_new[i]); } void dfEEqn::sync() @@ -743,12 +712,12 @@ dfEEqn::~dfEEqn() delete h_A_csr; delete h_b; + checkCudaErrors(cudaFreeHost(h_he_new)); + checkCudaErrors(cudaFree(d_A_csr)); checkCudaErrors(cudaFree(d_b)); checkCudaErrors(cudaFree(d_he_old)); - checkCudaErrors(cudaFree(h_he_new)); - checkCudaErrors(cudaFree(d_K)); checkCudaErrors(cudaFree(d_K_old)); checkCudaErrors(cudaFree(d_alphaEff)); @@ -758,6 +727,8 @@ dfEEqn::~dfEEqn() checkCudaErrors(cudaFree(d_boundary_K_init)); checkCudaErrors(cudaFree(d_boundary_K)); + checkCudaErrors(cudaFree(d_boundary_hDiffCorrFlux_init)); + checkCudaErrors(cudaFree(d_boundary_hDiffCorrFlux)); checkCudaErrors(cudaFree(d_boundary_alphaEff_init)); checkCudaErrors(cudaFree(d_boundary_alphaEff));