From 03744a4d059d0264fc71eaf78a9960d0ac5d6dce Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Tue, 22 Aug 2023 18:07:59 +0800 Subject: [PATCH 1/3] fix bugs in UEqn --- applications/solvers/dfLowMachFoam/UEqn.H | 40 +++--- .../solvers/dfLowMachFoam/dfLowMachFoam.C | 2 +- applications/solvers/dfLowMachFoam/new_UEqn.H | 116 ------------------ .../solvers/dfLowMachFoam/new_dfLowMachFoam.C | 113 ----------------- src_gpu/dfUEqn.cu | 2 +- 5 files changed, 24 insertions(+), 249 deletions(-) delete mode 100644 applications/solvers/dfLowMachFoam/new_UEqn.H delete mode 100644 applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H index 38934abdb..4e8c090ad 100644 --- a/applications/solvers/dfLowMachFoam/UEqn.H +++ b/applications/solvers/dfLowMachFoam/UEqn.H @@ -177,30 +177,34 @@ // postProcess TICK_START; UEqn_GPU.postProcess(h_u); + memcpy(&U[0][0], h_u, dfDataBase.cell_value_vec_bytes); U.correctBoundaryConditions(); + K = 0.5*magSqr(U); DEBUG_TRACE; TICK_STOP(post process time); // checkResult // TODO: for temp, now we compare ldu, finally we compare csr - std::vector h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); - std::vector h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); - offset = 0; - for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) - { - int patchsize = dfDataBase.patch_size[patchi]; - const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0]; - const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0]; - memcpy(h_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double)); - memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double)); - offset += patchsize; - } - bool printFlag = false; - UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0], - h_internal_coeffs.data(), h_boundary_coeffs.data(), - // &DivTensor[0][0], - printFlag); - DEBUG_TRACE; + // std::vector h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); + // std::vector h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); + // Info << "location 0 " << endl; + // offset = 0; + // for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) + // { + // int patchsize = dfDataBase.patch_size[patchi]; + // const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0]; + // const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0]; + // memcpy(h_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double)); + // memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double)); + // offset += patchsize; + // } + // bool printFlag = false; + // UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0], + // h_internal_coeffs.data(), h_boundary_coeffs.data(), + // // &DivTensor[0][0], + // printFlag); + // DEBUG_TRACE; + #else start1 = std::clock(); tmp tUEqn diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C index 6ea4251af..4a17ab215 100644 --- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C +++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C @@ -60,7 +60,7 @@ Description #include "basicThermo.H" #include "CombustionModel.H" -#define GPUSolverNew_ +// #define GPUSolverNew_ #define TIME #ifdef GPUSolverNew_ diff --git a/applications/solvers/dfLowMachFoam/new_UEqn.H b/applications/solvers/dfLowMachFoam/new_UEqn.H deleted file mode 100644 index 41b804a4b..000000000 --- a/applications/solvers/dfLowMachFoam/new_UEqn.H +++ /dev/null @@ -1,116 +0,0 @@ -#ifdef GPUSolver_ -const tmp nuEff_tmp(turbulence->nuEff()); -const volScalarField& nuEff = nuEff_tmp(); - -// run CPU, for temp -tmp tUEqn -( - fvm::ddt(rho, U) + fvm::div(phi, U) - + turbulence->divDevRhoReff(U) - == -fvc::grad(p) -); -// tmp tUEqn_ref // test turbulence->divDevRhoReff(U) -// ( -// - fvc::div((turbulence->rho()*turbulence->nuEff())*dev2(Foam::T(fvc::grad(U)))) -// - fvm::laplacian(turbulence->rho()*turbulence->nuEff(), U) -// ); - -fvVectorMatrix& UEqn = tUEqn.ref(); - -// run GPU -// preProcess -// TODO: preProcessForRhoEqn for temp, now we only transfer phi(instead of rhoEqn) used by fvm::div(phi, U) -UEqn_GPU.sync(); -double *h_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::internal); -double *h_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::internal); -double *h_boundary_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::boundary); -memcpy(h_rho, &rho[0], dfDataBase.cell_value_bytes); -memcpy(h_phi, &phi[0], dfDataBase.surface_value_bytes); -int offset = 0; -forAll(phi.boundaryField(), patchi) -{ - const fvsPatchScalarField& patchPhi = phi.boundaryField()[patchi]; - int patchsize = patchPhi.size(); - memcpy(h_boundary_phi + offset, &patchPhi[0], patchsize * sizeof(double)); - offset += patchsize; -} -UEqn_GPU.preProcessForRhoEqn(h_rho, h_phi, h_boundary_phi); -DEBUG_TRACE; -clock_t start = std::clock(); -// preparing u, p, nu_eff, and rho.boundary used by UEqn_GPU.preProcess() -double *h_u = dfDataBase.getFieldPointer("u", location::cpu, position::internal); -double *h_boundary_u = dfDataBase.getFieldPointer("u", location::cpu, position::boundary); -double *h_p = dfDataBase.getFieldPointer("p", location::cpu, position::internal); -double *h_boundary_p = dfDataBase.getFieldPointer("p", location::cpu, position::boundary); -double *h_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::internal); -double *h_boundary_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::boundary); -double *h_boundary_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::boundary); -double end = std::clock(); -Info << "get pointer" << double(end - start) / double(CLOCKS_PER_SEC) << endl; - -start = std::clock(); -memcpy(h_u, &U.oldTime()[0][0], dfDataBase.cell_value_vec_bytes); -memcpy(h_p, &p[0], dfDataBase.cell_value_bytes); -memcpy(h_nu_eff, &nuEff[0], dfDataBase.cell_value_bytes); -end = std::clock(); -Info << "copy to pinned memory" << double(end - start) / double(CLOCKS_PER_SEC) << endl; - -start = std::clock(); -offset = 0; -forAll(U.boundaryField(), patchi) -{ - const fvPatchVectorField& patchU = U.boundaryField()[patchi]; - const fvPatchScalarField& patchP = p.boundaryField()[patchi]; - const fvPatchScalarField& patchNuEff = nuEff.boundaryField()[patchi]; - const fvPatchScalarField& patchRho = rho.boundaryField()[patchi]; - int patchsize = patchU.size(); - memcpy(h_boundary_u + offset * 3, &patchU[0][0], patchsize * 3 * sizeof(double)); - memcpy(h_boundary_p + offset, &patchP[0], patchsize * sizeof(double)); - memcpy(h_boundary_nu_eff + offset, &patchNuEff[0], patchsize * sizeof(double)); - memcpy(h_boundary_rho + offset, &patchRho[0], patchsize * sizeof(double)); - offset += patchsize; -} -end = std::clock(); -Info << "CPU prepare boundary time" << double(end - start) / double(CLOCKS_PER_SEC) << endl; - -start = std::clock(); -UEqn_GPU.preProcess(h_u, h_boundary_u, h_p, h_boundary_p, h_nu_eff, h_boundary_nu_eff, h_boundary_rho); -DEBUG_TRACE; -UEqn_GPU.sync(); -end = std::clock(); -Info << "GPU preProcess time" << double(end - start) / double(CLOCKS_PER_SEC) << endl; - -// process -start = std::clock(); -UEqn_GPU.process(); -end = std::clock(); -DEBUG_TRACE; -UEqn_GPU.sync(); -// end = std::clock(); -Info << "GPU process time" << double(end - start) / double(CLOCKS_PER_SEC) << endl; - -// postProcess -UEqn_GPU.postProcess(h_u); -DEBUG_TRACE; - -// checkResult -// TODO: for temp, now we compare ldu, finally we compare csr -std::vector h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); -std::vector h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); -offset = 0; -for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) -{ - int patchsize = dfDataBase.patch_size[patchi]; - const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0]; - const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0]; - memcpy(h_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double)); - memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double)); - offset += patchsize; -} -bool printFlag = true; -UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0], - h_internal_coeffs.data(), h_boundary_coeffs.data(), - // &DivTensor[0][0], - printFlag); -DEBUG_TRACE; -#endif diff --git a/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C deleted file mode 100644 index 7d867687f..000000000 --- a/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C +++ /dev/null @@ -1,113 +0,0 @@ -/*---------------------------------------------------------------------------*\ - ========= | - \\ / F ield | OpenFOAM: The Open Source CFD Toolbox - \\ / O peration | Website: https://openfoam.org - \\ / A nd | Copyright (C) 2011-2019 OpenFOAM Foundation - \\/ M anipulation | -------------------------------------------------------------------------------- -License - This file is part of OpenFOAM. - - OpenFOAM is free software: you can redistribute it and/or modify it - under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - OpenFOAM is distributed in the hope that it will be useful, but WITHOUT - ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - for more details. - - You should have received a copy of the GNU General Public License - along with OpenFOAM. If not, see . - -Application - unittest - -Description - GPU unittest - -\*---------------------------------------------------------------------------*/ - -#include "dfChemistryModel.H" -#include "CanteraMixture.H" -// #include "hePsiThermo.H" -#include "heRhoThermo.H" - -#include "fvCFD.H" -#include "fluidThermo.H" -#include "turbulentFluidThermoModel.H" -#include "pimpleControl.H" -#include "pressureControl.H" -#include "localEulerDdtScheme.H" -#include "fvcSmooth.H" -#include "PstreamGlobals.H" -#include "basicThermo.H" -#include "CombustionModel.H" - -#include -#include -#include "upwind.H" - -#include "dfMatrixDataBase.H" -#include "dfMatrixOpBase.H" -#include "GenFvMatrix.H" -#include "dfUEqn.H" -#include "createGPUSolver.H" - -#define GPUSolver_ - -int main(int argc, char *argv[]) -{ -#ifdef USE_PYTORCH - pybind11::scoped_interpreter guard{};//start python interpreter -#endif - #include "postProcess.H" - - // #include "setRootCaseLists.H" - #include "listOptions.H" - #include "setRootCase2.H" - #include "listOutput.H" - - #include "createTime.H" - #include "createMesh.H" - #include "createDyMControls.H" - #include "initContinuityErrs.H" - #include "createFields.H" - #include "createRhoUfIfPresent.H" - - turbulence->validate(); - - if (!LTS) - { - #include "compressibleCourantNo.H" - #include "setInitialDeltaT.H" - } - - // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // - { - #include "readDyMControls.H" - - if (LTS) - { - #include "setRDeltaT.H" - } - else - { - #include "compressibleCourantNo.H" - #include "setDeltaT.H" - } - - createGPUBase(mesh, Y); - createGPUUEqn(CanteraTorchProperties, U); - - // for (int timestep = 0; timestep < 10; timestep++) { - dfDataBase.preTimeStep(&rho.oldTime()[0]); - #include "new_UEqn.H" - dfDataBase.postTimeStep(); - // } - } - return 0; -} - - diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index d30c06131..5b97d259d 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -203,7 +203,7 @@ void dfUEqn::solve() { void dfUEqn::postProcess(double *h_u) { // TODO: Here may be a bug permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, d_permute, dataBase_.d_u); - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_u, d_permute, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(h_u, dataBase_.d_u, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); // some boundary conditions may also need vf.boundary, deltaCoeffs.boundary, and weight.boundary From 5c7daa58cc2198962077f8a8ec8cd7d8c111fe7e Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Tue, 22 Aug 2023 21:11:04 +0800 Subject: [PATCH 2/3] opt ldu2csr --- src_gpu/dfMatrixOpBase.H | 2 +- src_gpu/dfMatrixOpBase.cu | 49 +++++++++++++++++++++------------------ src_gpu/dfUEqn.cu | 13 +++++------ 3 files changed, 33 insertions(+), 31 deletions(-) diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index 71dd82c38..7a98a685d 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -15,7 +15,7 @@ void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int num_bo const int* boundary_cell_face, const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs, - double *A, double *b, double *diag_vec); + double *A, double *b); void update_boundary_coeffs_vector(cudaStream_t stream, int num_boundary_surfaces, int num_patches, const int *patch_size, const int *patch_type, diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index e3616fac3..54a9b6893 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -805,14 +805,17 @@ __global__ void constructVecDiag(int num_cells, const double *diag, double *diag b[num_cells * 2 + index] = source[num_cells * 2 + index]; } -__global__ void addBoundaryDiagSrc(int num_cells, int num_boundary_surfaces, const int *face2Cells, - const double *internal_coeffs, const double *boundary_coeffs, double *diag_vec, double *b) +__global__ void addBoundaryDiagSrc(int num_cells, int num_surfaces, int num_boundary_surfaces, const int *face2Cells, + const double *internal_coeffs, const double *boundary_coeffs, const int *diagCSRIndex, + double *A_csr, double *b) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_boundary_surfaces) return; int cellIndex = face2Cells[index]; + int diagIndex = diagCSRIndex[cellIndex]; + int nNz = num_cells + 2 * num_surfaces; double internalCoeffx = internal_coeffs[num_boundary_surfaces * 0 + index]; double internalCoeffy = internal_coeffs[num_boundary_surfaces * 1 + index]; @@ -822,9 +825,9 @@ __global__ void addBoundaryDiagSrc(int num_cells, int num_boundary_surfaces, con double boundaryCoeffy = boundary_coeffs[num_boundary_surfaces * 1 + index]; double boundaryCoeffz = boundary_coeffs[num_boundary_surfaces * 2 + index]; - atomicAdd(&diag_vec[num_cells * 0 + cellIndex], internalCoeffx); - atomicAdd(&diag_vec[num_cells * 1 + cellIndex], internalCoeffy); - atomicAdd(&diag_vec[num_cells * 2 + cellIndex], internalCoeffz); + atomicAdd(&A_csr[nNz * 0 + diagIndex], internalCoeffx); + atomicAdd(&A_csr[nNz * 1 + diagIndex], internalCoeffy); + atomicAdd(&A_csr[nNz * 2 + diagIndex], internalCoeffz); atomicAdd(&b[num_cells * 0 + cellIndex], boundaryCoeffx); atomicAdd(&b[num_cells * 1 + cellIndex], boundaryCoeffy); @@ -842,8 +845,8 @@ __global__ void ldu_to_csr_offDiag(int num_cells, int num_surfaces, int uppIndex = uppCSRIndex[index]; int lowIndex = lowCSRIndex[index]; - int upp = upper[index]; - int low = lower[index]; + double upp = upper[index]; + double low = lower[index]; A_csr[(num_cells + 2 * num_surfaces) * 0 + uppIndex] = upper[index]; A_csr[(num_cells + 2 * num_surfaces) * 1 + uppIndex] = upper[index]; A_csr[(num_cells + 2 * num_surfaces) * 2 + uppIndex] = upper[index]; @@ -854,17 +857,22 @@ __global__ void ldu_to_csr_offDiag(int num_cells, int num_surfaces, } __global__ void ldu_to_csr_Diag(int num_cells, int num_surfaces, - const int *diagCSRIndex, const double *diag_vec, - double *A_csr) + const int *diagCSRIndex, const double *diag, const double *source, + double *A_csr, double *b) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_cells) return; int diagIndex = diagCSRIndex[index]; - A_csr[(num_cells + 2 * num_surfaces) * 0 + diagIndex] = diag_vec[num_cells * 0 + index]; - A_csr[(num_cells + 2 * num_surfaces) * 1 + diagIndex] = diag_vec[num_cells * 1 + index]; - A_csr[(num_cells + 2 * num_surfaces) * 2 + diagIndex] = diag_vec[num_cells * 2 + index]; + double diagVal = diag[index]; + A_csr[(num_cells + 2 * num_surfaces) * 0 + diagIndex] = diagVal; + A_csr[(num_cells + 2 * num_surfaces) * 1 + diagIndex] = diagVal; + A_csr[(num_cells + 2 * num_surfaces) * 2 + diagIndex] = diagVal; + + b[num_cells * 0 + index] = source[num_cells * 0 + index]; + b[num_cells * 1 + index] = source[num_cells * 1 + index]; + b[num_cells * 2 + index] = source[num_cells * 2 + index]; } @@ -907,28 +915,23 @@ void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int num_bo const int* boundary_cell_face, const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs, - double *A, double *b, double *diag_vec) + double *A, double *b) { - // construct new diag with size of 3*num_cells + // construct diag size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - constructVecDiag<<>>(num_cells, diag, diag_vec, source, b); + ldu_to_csr_Diag<<>>(num_cells, num_surfaces, + diag_to_csr_index, diag, source, A, b); // add coeff to source and diagnal blocks_per_grid = (num_boundary_surface + threads_per_block - 1) / threads_per_block; - addBoundaryDiagSrc<<>>(num_cells, num_boundary_surface, - boundary_cell_face, internal_coeffs, boundary_coeffs, diag_vec, b); + addBoundaryDiagSrc<<>>(num_cells, num_surfaces, num_boundary_surface, + boundary_cell_face, internal_coeffs, boundary_coeffs, diag_to_csr_index, A, b); // convert offdiag blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; ldu_to_csr_offDiag<<>>(num_cells, num_surfaces, lower_to_csr_index, upper_to_csr_index, lower, upper, A); - - // convert diag - blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - ldu_to_csr_Diag<<>>(num_cells, num_surfaces, - diag_to_csr_index, diag_vec, A); - } void update_boundary_coeffs_vector(cudaStream_t stream, int num_boundary_surfaces, int num_patches, diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 5b97d259d..837a7587a 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -50,7 +50,6 @@ void dfUEqn::createNonConstantLduAndCsrFields() { checkCudaErrors(cudaMalloc((void**)&d_upper, dataBase_.surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_diag, dataBase_.cell_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_source, dataBase_.cell_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_diag_vector, dataBase_.cell_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_A, dataBase_.csr_value_vec_bytes)); @@ -88,7 +87,6 @@ void dfUEqn::preProcess(const double *h_u, const double *h_boundary_u, const dou checkCudaErrors(cudaMemsetAsync(d_boundary_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(d_A, 0, dataBase_.csr_value_vec_bytes, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(d_b, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_diag_vector, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); // TODO: maybe a better way } void dfUEqn::process() { @@ -140,15 +138,11 @@ void dfUEqn::process() { dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_source, // end for internal dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_face_cell, d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume); - // fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, - // dataBase_.d_volume, d_fvc_output, d_source); fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_source, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume, -1.); - // fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, - // dataBase_.d_volume, d_fvc_output, d_source); #ifndef TIME_GPU checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); @@ -177,11 +171,16 @@ void dfUEqn::solve() { ldu_to_csr(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_boundary_face_cell, dataBase_.d_lower_to_csr_index, dataBase_.d_upper_to_csr_index, dataBase_.d_diag_to_csr_index, - d_lower, d_upper, d_diag, d_source, d_internal_coeffs, d_boundary_coeffs, d_A, d_b, d_diag_vector); + d_lower, d_upper, d_diag, d_source, d_internal_coeffs, d_boundary_coeffs, d_A, d_b); int nNz = dataBase_.num_cells + dataBase_.num_surfaces * 2; // matrix entries sync(); + // double *h_A_csr = new double[nNz * 3]; + // checkCudaErrors(cudaMemcpy(h_A_csr, d_A, dataBase_.csr_value_vec_bytes, cudaMemcpyDeviceToHost)); + // for (int i = 0; i < nNz; i++) + // fprintf(stderr, "h_A_csr[%d]: (%.10lf, %.10lf, %.10lf)\n", i, h_A_csr[i], h_A_csr[i + nNz], h_A_csr[i + 2 * nNz]); + if (num_iteration == 0) // first interation { printf("Initializing AmgX Linear Solver\n"); From 920fc84d8402a21c3c9338684be91549fdb11e9f Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Sun, 27 Aug 2023 16:44:38 +0800 Subject: [PATCH 3/3] fvMatrix.A(), fvMatrix.H(), opt ldu2csr --- applications/solvers/dfLowMachFoam/UEqn.H | 2 +- .../solvers/dfLowMachFoam/dfLowMachFoam.C | 2 +- applications/solvers/dfLowMachFoam/pEqn.H | 43 ++++ src_gpu/dfMatrixDataBase.H | 1 + src_gpu/dfMatrixDataBase.cu | 36 ++- src_gpu/dfMatrixOpBase.H | 19 +- src_gpu/dfMatrixOpBase.cu | 241 ++++++++++++++---- src_gpu/dfUEqn.H | 10 + src_gpu/dfUEqn.cu | 54 +++- 9 files changed, 335 insertions(+), 73 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H index 4e8c090ad..ea15c8b43 100644 --- a/applications/solvers/dfLowMachFoam/UEqn.H +++ b/applications/solvers/dfLowMachFoam/UEqn.H @@ -135,7 +135,7 @@ TICK_START; U.oldTime(); - memcpy(h_u, &U.oldTime()[0][0], dfDataBase.cell_value_vec_bytes); + memcpy(h_u, &U.oldTime()[0][0], dfDataBase.cell_value_vec_bytes); memcpy(h_p, &p[0], dfDataBase.cell_value_bytes); memcpy(h_nu_eff, &nuEff[0], dfDataBase.cell_value_bytes); TICK_STOP(copy to pinned memory); diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C index 4a17ab215..6ea4251af 100644 --- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C +++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C @@ -60,7 +60,7 @@ Description #include "basicThermo.H" #include "CombustionModel.H" -// #define GPUSolverNew_ +#define GPUSolverNew_ #define TIME #ifdef GPUSolverNew_ diff --git a/applications/solvers/dfLowMachFoam/pEqn.H b/applications/solvers/dfLowMachFoam/pEqn.H index 34925327f..cc35d93fe 100644 --- a/applications/solvers/dfLowMachFoam/pEqn.H +++ b/applications/solvers/dfLowMachFoam/pEqn.H @@ -67,11 +67,52 @@ const volScalarField psip0(psi*p); time_monitor_UEqn_A_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); #endif +#ifdef GPUSolverNew_ +volVectorField UEqn_H +( + IOobject + ( + "H("+U.name()+')', + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedVector(dimensionSet(1,-2,-2,0,0,0,0), Zero), + extrapolatedCalculatedFvPatchScalarField::typeName +); +UEqn_GPU.H(&UEqn_H[0][0]); +UEqn_H.correctBoundaryConditions(); + +volScalarField UEqn_A +( + IOobject + ( + "A("+U.name()+')', + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedScalar(dimensionSet(1,-3,-1,0,0,0,0), Zero), + extrapolatedCalculatedFvPatchScalarField::typeName +); +UEqn_GPU.A(&UEqn_A[0]); +UEqn_A.correctBoundaryConditions(); +#endif + + start2 = std::clock(); #ifdef GPUSolver_ volScalarField rAU(1.0/UEqn_A); surfaceScalarField rhorAUf("rhorAUf", fvc::interpolate(rho*rAU)); volVectorField HbyA(constrainHbyA(rAU*UEqn_H, U, p)); +#elif defined GPUSolverNew_ + volScalarField rAU(1.0/UEqn_A); + surfaceScalarField rhorAUf("rhorAUf", fvc::interpolate(rho*rAU)); + volVectorField HbyA(constrainHbyA(rAU*UEqn_H, U, p)); #else volScalarField rAU(1.0/UEqn.A()); surfaceScalarField rhorAUf("rhorAUf", fvc::interpolate(rho*rAU)); @@ -179,6 +220,8 @@ K = 0.5*magSqr(U); end1 = std::clock(); time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_UEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); +#elif defined GPUSolverNew_ + UEqn_GPU.correctPsi(&U[0][0]); #endif if (pimple.simpleRho()) diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index cac7264a8..a4eeee70a 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -103,6 +103,7 @@ struct dfMatrixDataBase int *d_lower_to_csr_index = nullptr; int *d_diag_to_csr_index= nullptr; int *d_upper_to_csr_index= nullptr; + int *d_ldu_to_csr_index = nullptr; int *d_csr_row_index= nullptr; int *d_csr_col_index= nullptr; diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index 4e49faf99..1c1ee5ce0 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -160,14 +160,48 @@ void dfMatrixDataBase::setConstantIndexes(const int *owner, const int *neighbor) diagCSRIndex.push_back(diagIndexInCSR); CSRColIndex[diagIndexInCSR] = i; // fill diag entry in CSRColIndex } - CSRRowIndex.push_back(2 * num_surfaces + num_cells); + int nNz = 2 * num_surfaces + num_cells; + CSRRowIndex.push_back(nNz); + + // get reverseIndex from csr to ldu (low + diag + upp) + std::vector CSRIndex; + CSRIndex.insert(CSRIndex.end(), lowCSRIndex.begin(), lowCSRIndex.end()); + CSRIndex.insert(CSRIndex.end(), diagCSRIndex.begin(), diagCSRIndex.end()); + CSRIndex.insert(CSRIndex.end(), uppCSRIndex.begin(), uppCSRIndex.end()); + + std::vector lduCSRIndexPermInit(nNz); + std::iota(lduCSRIndexPermInit.begin(), lduCSRIndexPermInit.end(), 0); + + std::multimap IndexPermutation; + for (int i = 0; i < nNz; ++i){ + IndexPermutation.insert(std::make_pair(CSRIndex[i], lduCSRIndexPermInit[i])); + } + std::vector> IndexPermPair(IndexPermutation.begin(), IndexPermutation.end()); + std::sort(IndexPermPair.begin(), IndexPermPair.end(), [] + (const std::pair& pair1, const std::pair& pair2){ + return pair1.first < pair2.first;}); + + std::vector lduCSRIndex; + std::transform(IndexPermPair.begin(), IndexPermPair.end(), std::back_inserter(lduCSRIndex), [] + (const std::pair& pair) { + return pair.second; + }); + + std::vector test; + std::transform(IndexPermPair.begin(), IndexPermPair.end(), std::back_inserter(test), [] + (const std::pair& pair) { + return pair.first; + }); + checkCudaErrors(cudaMalloc((void**)&d_lower_to_csr_index, surface_index_bytes)); checkCudaErrors(cudaMalloc((void**)&d_diag_to_csr_index, cell_index_bytes)); checkCudaErrors(cudaMalloc((void**)&d_upper_to_csr_index, surface_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_ldu_to_csr_index, csr_col_index_bytes)); checkCudaErrors(cudaMemcpyAsync(d_lower_to_csr_index, lowCSRIndex.data(), surface_index_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_diag_to_csr_index, diagCSRIndex.data(), cell_index_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_upper_to_csr_index, uppCSRIndex.data(), surface_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_ldu_to_csr_index, lduCSRIndex.data(), csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); // build d_csr_row_index, d_csr_col_index diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index 7a98a685d..c22fd5c4f 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -1,5 +1,6 @@ #pragma once -// #define TIME_GPU +#include +#define TIME_GPU // tools void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output); @@ -12,9 +13,8 @@ void field_multiply_scalar(cudaStream_t stream, void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volume, const double *fvc_output, double *source); void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surface, - const int* boundary_cell_face, const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, - const double *lower, const double *upper, const double *diag, const double *source, - const double *internal_coeffs, const double *boundary_coeffs, + const int* boundary_cell_face, const int *ldu_to_csr_index, const int *diag_to_csr_index, + const double *ldu, const double *source, const double *internal_coeffs, const double *boundary_coeffs, double *A, double *b); void update_boundary_coeffs_vector(cudaStream_t stream, int num_boundary_surfaces, int num_patches, @@ -86,3 +86,14 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, // others void scale_dev2T_tensor(cudaStream_t stream, int num_cells, const double *vf1, double *vf2, int num_boundary_surfaces, const double *boundary_vf1, double *boundary_vf2); + +void fvMtx_A(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, + const int *boundary_cell_face, const double *internal_coeffs, const double *volume, const double *diag, + double *A_pEqn, double sign = 1.); + +void fvMtx_H(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, + const int *lowerAddr, const int *upperAddr, const double *volume, + int num_patches, const int *patch_size, const int *patch_type, + const int *boundary_cell_face, const double *internal_coffs, const double *boundary_coeffs, + const double *lower, const double *upper, const double *source, const double *psi, + double *H_pEqn, double *H_pEqn_perm, double sign = 1.); diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index 54a9b6893..08ff4aefc 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -789,22 +789,6 @@ __global__ void fvc_div_cell_tensor_boundary(int num_cells, int num_boundary_fac // } } -__global__ void constructVecDiag(int num_cells, const double *diag, double *diag_vec, - const double *source, double *b) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) - return; - - diag_vec[num_cells * 0 + index] = diag[index]; - diag_vec[num_cells * 1 + index] = diag[index]; - diag_vec[num_cells * 2 + index] = diag[index]; - - b[num_cells * 0 + index] = source[num_cells * 0 + index]; - b[num_cells * 1 + index] = source[num_cells * 1 + index]; - b[num_cells * 2 + index] = source[num_cells * 2 + index]; -} - __global__ void addBoundaryDiagSrc(int num_cells, int num_surfaces, int num_boundary_surfaces, const int *face2Cells, const double *internal_coeffs, const double *boundary_coeffs, const int *diagCSRIndex, double *A_csr, double *b) @@ -834,47 +818,128 @@ __global__ void addBoundaryDiagSrc(int num_cells, int num_surfaces, int num_boun atomicAdd(&b[num_cells * 2 + cellIndex], boundaryCoeffz); } -__global__ void ldu_to_csr_offDiag(int num_cells, int num_surfaces, - const int *lowCSRIndex, const int *uppCSRIndex, - const double *lower, const double *upper, - double *A_csr) +__global__ void ldu_to_csr_kernel(int nNz, const int *ldu_to_csr_index, + const double *ldu, double *A_csr) { int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_surfaces) + if (index >= nNz) return; + + int lduIndex = ldu_to_csr_index[index]; + double csrVal = ldu[lduIndex]; + A_csr[nNz * 0 + index] = csrVal; + A_csr[nNz * 1 + index] = csrVal; + A_csr[nNz * 2 + index] = csrVal; +} - int uppIndex = uppCSRIndex[index]; - int lowIndex = lowCSRIndex[index]; - double upp = upper[index]; - double low = lower[index]; - A_csr[(num_cells + 2 * num_surfaces) * 0 + uppIndex] = upper[index]; - A_csr[(num_cells + 2 * num_surfaces) * 1 + uppIndex] = upper[index]; - A_csr[(num_cells + 2 * num_surfaces) * 2 + uppIndex] = upper[index]; - - A_csr[(num_cells + 2 * num_surfaces) * 0 + lowIndex] = lower[index]; - A_csr[(num_cells + 2 * num_surfaces) * 1 + lowIndex] = lower[index]; - A_csr[(num_cells + 2 * num_surfaces) * 2 + lowIndex] = lower[index]; +__global__ void addAveInternaltoDiag(int num_cells, int num_boundary_surfaces, const int *face2Cells, + const double *internal_coeffs, double *A_pEqn) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_surfaces) + return; + + int cellIndex = face2Cells[index]; + + double internal_x = internal_coeffs[num_boundary_surfaces * 0 + index]; + double internal_y = internal_coeffs[num_boundary_surfaces * 1 + index]; + double internal_z = internal_coeffs[num_boundary_surfaces * 2 + index]; + + double ave_internal = (internal_x + internal_y + internal_z) / 3; + + atomicAdd(&A_pEqn[cellIndex], ave_internal); } -__global__ void ldu_to_csr_Diag(int num_cells, int num_surfaces, - const int *diagCSRIndex, const double *diag, const double *source, - double *A_csr, double *b) +__global__ void addBoundaryDiag(int num_cells, int num_boundary_surfaces, const int *face2Cells, + const double *internal_coeffs, const double *psi, double *H_pEqn) { int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_cells) + if (index >= num_boundary_surfaces) return; - int diagIndex = diagCSRIndex[index]; - double diagVal = diag[index]; - A_csr[(num_cells + 2 * num_surfaces) * 0 + diagIndex] = diagVal; - A_csr[(num_cells + 2 * num_surfaces) * 1 + diagIndex] = diagVal; - A_csr[(num_cells + 2 * num_surfaces) * 2 + diagIndex] = diagVal; + // addBoundaryDiag(boundaryDiagCmpt, cmpt); // add internal coeffs + // boundaryDiagCmpt.negate(); + double internal_x = internal_coeffs[num_boundary_surfaces * 0 + index]; + double internal_y = internal_coeffs[num_boundary_surfaces * 1 + index]; + double internal_z = internal_coeffs[num_boundary_surfaces * 2 + index]; + + // addCmptAvBoundaryDiag(boundaryDiagCmpt); + double ave_internal = (internal_x + internal_y + internal_z) / 3; + + int cellIndex = face2Cells[index]; - b[num_cells * 0 + index] = source[num_cells * 0 + index]; - b[num_cells * 1 + index] = source[num_cells * 1 + index]; - b[num_cells * 2 + index] = source[num_cells * 2 + index]; + // do not permute H anymore + atomicAdd(&H_pEqn[num_cells * 0 + cellIndex], (-internal_x + ave_internal) * psi[num_cells * 0 + cellIndex]); + atomicAdd(&H_pEqn[num_cells * 1 + cellIndex], (-internal_y + ave_internal) * psi[num_cells * 1 + cellIndex]); + atomicAdd(&H_pEqn[num_cells * 2 + cellIndex], (-internal_z + ave_internal) * psi[num_cells * 2 + cellIndex]); } +__global__ void lduMatrix_H(int num_cells, int num_surfaces, + const int *lower_index, const int *upper_index, const double *lower, const double *upper, + const double *psi, double *H_pEqn) +{ + /* + for (label face=0; face= num_surfaces) + return; + + int l = lower_index[index]; + int u = upper_index[index]; + + atomicAdd(&H_pEqn[num_cells * 0 + u], -lower[index] * psi[num_cells * 0 + l]); + atomicAdd(&H_pEqn[num_cells * 1 + u], -lower[index] * psi[num_cells * 1 + l]); + atomicAdd(&H_pEqn[num_cells * 2 + u], -lower[index] * psi[num_cells * 2 + l]); + atomicAdd(&H_pEqn[num_cells * 0 + l], -upper[index] * psi[num_cells * 0 + u]); + atomicAdd(&H_pEqn[num_cells * 1 + l], -upper[index] * psi[num_cells * 1 + u]); + atomicAdd(&H_pEqn[num_cells * 2 + l], -upper[index] * psi[num_cells * 2 + u]); +} + +__global__ void addBoundarySrc_unCoupled(int num_cells, int num, int offset, + int num_boundary_surfaces, const int *face2Cells, const double *boundary_coeffs, double *H_pEqn) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num) + return; + + int start_index = offset + index; + + // addBoundaryDiag(boundaryDiagCmpt, cmpt); // add internal coeffs + // boundaryDiagCmpt.negate(); + double boundary_x = boundary_coeffs[num_boundary_surfaces * 0 + start_index]; + double boundary_y = boundary_coeffs[num_boundary_surfaces * 1 + start_index]; + double boundary_z = boundary_coeffs[num_boundary_surfaces * 2 + start_index]; + + + int cellIndex = face2Cells[start_index]; + + // do not permute H anymore + atomicAdd(&H_pEqn[num_cells * 0 + cellIndex], boundary_x); + atomicAdd(&H_pEqn[num_cells * 1 + cellIndex], boundary_y); + atomicAdd(&H_pEqn[num_cells * 2 + cellIndex], boundary_z); +} + +__global__ void divideVol_permute_vec(int num_cells, const double *volume, double *H_pEqn, double *H_pEqn_perm) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // divide volume + double vol = volume[index]; + double H_pEqn_x = H_pEqn[num_cells * 0 + index] / vol; + double H_pEqn_y = H_pEqn[num_cells * 1 + index] / vol; + double H_pEqn_z = H_pEqn[num_cells * 2 + index] / vol; + + // permute + H_pEqn_perm[index * 3 + 0] = H_pEqn_x; + H_pEqn_perm[index * 3 + 1] = H_pEqn_y; + H_pEqn_perm[index * 3 + 2] = H_pEqn_z; +} void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output) { @@ -912,26 +977,26 @@ void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volu } void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surface, - const int* boundary_cell_face, const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, - const double *lower, const double *upper, const double *diag, const double *source, - const double *internal_coeffs, const double *boundary_coeffs, + const int* boundary_cell_face, const int *ldu_to_csr_index, const int *diag_to_csr_index, + const double *ldu, const double *source, const double *internal_coeffs, const double *boundary_coeffs, double *A, double *b) { + TICK_INIT_EVENT; // construct diag + int nNz = num_cells + 2 * num_surfaces; size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - ldu_to_csr_Diag<<>>(num_cells, num_surfaces, - diag_to_csr_index, diag, source, A, b); + size_t blocks_per_grid = (nNz + threads_per_block - 1) / threads_per_block; + TICK_START_EVENT; + checkCudaErrors(cudaMemcpyAsync(b, source, num_cells * 3 * sizeof(double), cudaMemcpyDeviceToDevice, stream)); + ldu_to_csr_kernel<<>>(nNz, ldu_to_csr_index, ldu, A); + TICK_END_EVENT(ldu_to_csr_kernel); // add coeff to source and diagnal blocks_per_grid = (num_boundary_surface + threads_per_block - 1) / threads_per_block; + TICK_START_EVENT; addBoundaryDiagSrc<<>>(num_cells, num_surfaces, num_boundary_surface, boundary_cell_face, internal_coeffs, boundary_coeffs, diag_to_csr_index, A, b); - - // convert offdiag - blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - ldu_to_csr_offDiag<<>>(num_cells, num_surfaces, - lower_to_csr_index, upper_to_csr_index, lower, upper, A); + TICK_END_EVENT(addBoundaryDiagSrc); } void update_boundary_coeffs_vector(cudaStream_t stream, int num_boundary_surfaces, int num_patches, @@ -1287,3 +1352,69 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, // blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; // divide_cell_volume_vec<<>>(num_cells, volume, output, sign); } + +void fvMtx_A(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, + const int *boundary_cell_face, const double *internal_coeffs, const double *volume, const double *diag, + double *A_pEqn, double sign) +{ + checkCudaErrors(cudaMemcpyAsync(A_pEqn, diag, num_cells * sizeof(double), cudaMemcpyDeviceToDevice, stream)); + TICK_INIT_EVENT; + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; + TICK_START_EVENT; + addAveInternaltoDiag<<>>(num_cells, num_boundary_surfaces, boundary_cell_face, + internal_coeffs, A_pEqn); + TICK_END_EVENT(addAveInternaltoDiag); + + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + TICK_START_EVENT; + divide_cell_volume_scalar<<>>(num_cells, volume, A_pEqn, sign); + TICK_END_EVENT(divide_cell_volume_scalar); +} + +void fvMtx_H(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, + const int *lowerAddr, const int *upperAddr, const double *volume, + int num_patches, const int *patch_size, const int *patch_type, + const int *boundary_cell_face, const double *internal_coffs, const double *boundary_coeffs, + const double *lower, const double *upper, const double *source, const double *psi, + double *H_pEqn, double *H_pEqn_perm, double sign) +{ + TICK_INIT_EVENT; + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; + + TICK_START_EVENT; + checkCudaErrors(cudaMemcpyAsync(H_pEqn, source, num_cells * 3 * sizeof(double), cudaMemcpyDeviceToDevice, stream)); + addBoundaryDiag<<>>(num_cells, num_boundary_surfaces, boundary_cell_face, + internal_coffs, psi, H_pEqn); + TICK_END_EVENT(addBoundaryDiag); + + blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + TICK_START_EVENT; + lduMatrix_H<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, + lower, upper, psi, H_pEqn); + TICK_END_EVENT(lduMatrix_H); + + int offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 64; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just non-coupled patch type now + if (patch_type[i] == boundaryConditions::zeroGradient + || patch_type[i] == boundaryConditions::fixedValue) { + TICK_START_EVENT; + addBoundarySrc_unCoupled<<>>(num_cells, patch_size[i], offset, + num_boundary_surfaces, boundary_cell_face, boundary_coeffs, H_pEqn); + TICK_END_EVENT(addBoundarySrc_unCoupled); + } else if (0) { + // xxx + fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); + } + offset += patch_size[i]; + } + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + TICK_START_EVENT; + divideVol_permute_vec<<>>(num_cells, volume, H_pEqn, H_pEqn_perm); + TICK_END_EVENT(divideVol_permute_vec); +} diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index 49edc1b7a..43ee3402b 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -37,6 +37,8 @@ private: double *d_nu_eff = nullptr; // computed on CPU, used on GPU, need memcpyh2d - host double *h_nu_eff = nullptr; + double *h_A_pEqn = nullptr; + double *h_H_pEqn = nullptr; // intermediate fields double *d_grad_u = nullptr; double *d_rho_nueff = nullptr; @@ -58,6 +60,7 @@ private: double *d_gradient_boundary_coeffs= nullptr; // non-constant fields - ldu + double *d_ldu = nullptr; double *d_lower = nullptr; double *d_upper = nullptr; double *d_diag = nullptr; @@ -69,6 +72,9 @@ private: // non-constant fields - csr double *d_A = nullptr; double *d_b = nullptr; // TODO: needless + double *d_A_pEqn = nullptr; + double *d_H_pEqn = nullptr; + double *d_H_pEqn_perm = nullptr; // field pointer map std::unordered_map fieldPointerMap; @@ -107,6 +113,10 @@ public: void process(); void postProcess(double *h_u); + void A(double *Psi); + void H(double *Psi); + void correctPsi(double *Psi); + void solve(); void compareResult(const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs, // const double *tmpVal, diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 837a7587a..2dc569915 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -17,6 +17,8 @@ void dfUEqn::createNonConstantFieldsInternal() { checkCudaErrors(cudaMalloc((void**)&d_nu_eff, dataBase_.cell_value_bytes)); // computed on CPU, used on GPU, need memcpyh2d checkCudaErrors(cudaMallocHost((void**)&h_nu_eff , dataBase_.cell_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_A_pEqn , dataBase_.cell_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_H_pEqn , dataBase_.cell_value_vec_bytes)); // intermediate fields checkCudaErrors(cudaMalloc((void**)&d_grad_u, dataBase_.cell_value_tsr_bytes)); checkCudaErrors(cudaMalloc((void**)&d_rho_nueff, dataBase_.cell_value_bytes)); @@ -46,14 +48,18 @@ void dfUEqn::createNonConstantFieldsBoundary() { } void dfUEqn::createNonConstantLduAndCsrFields() { - checkCudaErrors(cudaMalloc((void**)&d_lower, dataBase_.surface_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_upper, dataBase_.surface_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_diag, dataBase_.cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_ldu, dataBase_.csr_value_bytes)); + d_lower = d_ldu; + d_diag = d_ldu + dataBase_.num_surfaces; + d_upper = d_ldu + dataBase_.num_cells + dataBase_.num_surfaces; checkCudaErrors(cudaMalloc((void**)&d_source, dataBase_.cell_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_A, dataBase_.csr_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_b, dataBase_.cell_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_A_pEqn, dataBase_.cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_H_pEqn, dataBase_.cell_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_H_pEqn_perm, dataBase_.cell_value_vec_bytes)); } void dfUEqn::initNonConstantFieldsBoundary() { @@ -79,14 +85,14 @@ void dfUEqn::preProcess(const double *h_u, const double *h_boundary_u, const dou checkCudaErrors(cudaMemcpyAsync(d_boundary_nu_eff, h_boundary_nu_eff, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_rho, h_boundary_rho, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_lower, 0, dataBase_.surface_value_bytes, dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_upper, 0, dataBase_.surface_value_bytes, dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_diag, 0, dataBase_.cell_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_ldu, 0, dataBase_.csr_value_bytes, dataBase_.stream)); // d_ldu contains d_lower, d_diag, and d_upper checkCudaErrors(cudaMemsetAsync(d_source, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(d_internal_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(d_boundary_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(d_A, 0, dataBase_.csr_value_vec_bytes, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(d_b, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_A_pEqn, 0, dataBase_.cell_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_H_pEqn, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); } void dfUEqn::process() { @@ -143,6 +149,9 @@ void dfUEqn::process() { dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_source, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume, -1.); + ldu_to_csr(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, + dataBase_.d_boundary_face_cell, dataBase_.d_ldu_to_csr_index, dataBase_.d_diag_to_csr_index, + d_ldu, d_source, d_internal_coeffs, d_boundary_coeffs, d_A, d_b); #ifndef TIME_GPU checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); @@ -168,10 +177,6 @@ void dfUEqn::sync() } void dfUEqn::solve() { - ldu_to_csr(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, - dataBase_.d_boundary_face_cell, - dataBase_.d_lower_to_csr_index, dataBase_.d_upper_to_csr_index, dataBase_.d_diag_to_csr_index, - d_lower, d_upper, d_diag, d_source, d_internal_coeffs, d_boundary_coeffs, d_A, d_b); int nNz = dataBase_.num_cells + dataBase_.num_surfaces * 2; // matrix entries sync(); @@ -200,7 +205,7 @@ void dfUEqn::solve() { num_iteration++; } -void dfUEqn::postProcess(double *h_u) { // TODO: Here may be a bug +void dfUEqn::postProcess(double *h_u) { permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, d_permute, dataBase_.d_u); checkCudaErrors(cudaMemcpyAsync(h_u, dataBase_.d_u, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); @@ -212,6 +217,33 @@ void dfUEqn::postProcess(double *h_u) { // TODO: Here may be a bug d_gradient_internal_coeffs, d_gradient_boundary_coeffs); } +void dfUEqn::A(double *Psi) { + fvMtx_A(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, + dataBase_.d_boundary_face_cell, d_internal_coeffs, dataBase_.d_volume, d_diag, d_A_pEqn); + checkCudaErrors(cudaMemcpyAsync(h_A_pEqn, d_A_pEqn, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + sync(); + // TODO: correct Boundary conditions + memcpy(Psi, h_A_pEqn, dataBase_.cell_value_bytes); +} + +void dfUEqn::H(double *Psi) { + fvMtx_H(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, + dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_volume, + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_face_cell, d_internal_coeffs, d_boundary_coeffs, + d_lower, d_upper, d_source, d_permute, d_H_pEqn, d_H_pEqn_perm); + checkCudaErrors(cudaMemcpyAsync(h_H_pEqn, d_H_pEqn_perm, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + sync(); + // TODO: correct Boundary conditions + memcpy(Psi, h_H_pEqn, dataBase_.cell_value_vec_bytes); +} + +void dfUEqn::correctPsi(double *Psi) { + checkCudaErrors(cudaMemcpy(dataBase_.d_u, Psi, dataBase_.cell_value_vec_bytes, cudaMemcpyHostToDevice)); + permute_vector_h2d(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, d_permute); +} + + double* dfUEqn::getFieldPointer(const char* fieldAlias, location loc, position pos) { char mergedName[256]; if (pos == position::internal) {