From 3f5e136e0d601809d586a925d9532bf88a5e8457 Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Tue, 1 Aug 2023 22:32:34 +0800 Subject: [PATCH 01/25] dfMatrixDataBase --- GPUTest/Make/files | 4 + GPUTest/Make/options | 46 ++ GPUTest/correctPhi.H | 12 + GPUTest/createFields.H | 176 ++++++ GPUTest/createGPUSolver.H | 71 +++ GPUTest/setRDeltaT.H | 85 +++ GPUTest/setRootCase2.H | 5 + GPUTest/unittest.C | 100 ++++ src_gpu/CMakeLists.txt | 7 +- src_gpu/dfMatrixDataBase.H | 700 +++++------------------- src_gpu/dfMatrixDataBase.cu | 261 +++++++-- {src_gpu => src_gpu_orig}/AmgXSolver.H | 0 {src_gpu => src_gpu_orig}/AmgXSolver.cu | 0 src_gpu_orig/CMakeLists.txt | 39 ++ {src_gpu => src_gpu_orig}/GPUMesh.H | 0 {src_gpu => src_gpu_orig}/GPUfield.H | 0 {src_gpu => src_gpu_orig}/GPUfield.cpp | 0 {src_gpu => src_gpu_orig}/dfEEqn.H | 0 {src_gpu => src_gpu_orig}/dfEEqn.cu | 0 src_gpu_orig/dfMatrixDataBase.H | 641 ++++++++++++++++++++++ src_gpu_orig/dfMatrixDataBase.cu | 48 ++ {src_gpu => src_gpu_orig}/dfRhoEqn.H | 0 {src_gpu => src_gpu_orig}/dfRhoEqn.cu | 0 {src_gpu => src_gpu_orig}/dfUEqn.H | 0 {src_gpu => src_gpu_orig}/dfUEqn.cu | 0 {src_gpu => src_gpu_orig}/dfYEqn.H | 0 {src_gpu => src_gpu_orig}/dfYEqn.cu | 0 27 files changed, 1575 insertions(+), 620 deletions(-) create mode 100644 GPUTest/Make/files create mode 100644 GPUTest/Make/options create mode 100644 GPUTest/correctPhi.H create mode 100644 GPUTest/createFields.H create mode 100644 GPUTest/createGPUSolver.H create mode 100644 GPUTest/setRDeltaT.H create mode 100644 GPUTest/setRootCase2.H create mode 100644 GPUTest/unittest.C rename {src_gpu => src_gpu_orig}/AmgXSolver.H (100%) rename {src_gpu => src_gpu_orig}/AmgXSolver.cu (100%) create mode 100644 src_gpu_orig/CMakeLists.txt rename {src_gpu => src_gpu_orig}/GPUMesh.H (100%) rename {src_gpu => src_gpu_orig}/GPUfield.H (100%) rename {src_gpu => src_gpu_orig}/GPUfield.cpp (100%) rename {src_gpu => src_gpu_orig}/dfEEqn.H (100%) rename {src_gpu => src_gpu_orig}/dfEEqn.cu (100%) 
create mode 100644 src_gpu_orig/dfMatrixDataBase.H create mode 100644 src_gpu_orig/dfMatrixDataBase.cu rename {src_gpu => src_gpu_orig}/dfRhoEqn.H (100%) rename {src_gpu => src_gpu_orig}/dfRhoEqn.cu (100%) rename {src_gpu => src_gpu_orig}/dfUEqn.H (100%) rename {src_gpu => src_gpu_orig}/dfUEqn.cu (100%) rename {src_gpu => src_gpu_orig}/dfYEqn.H (100%) rename {src_gpu => src_gpu_orig}/dfYEqn.cu (100%) diff --git a/GPUTest/Make/files b/GPUTest/Make/files new file mode 100644 index 000000000..d78085ff8 --- /dev/null +++ b/GPUTest/Make/files @@ -0,0 +1,4 @@ +unittest.C + +EXE = $(DF_APPBIN)/unitTest + diff --git a/GPUTest/Make/options b/GPUTest/Make/options new file mode 100644 index 000000000..637eb0e9b --- /dev/null +++ b/GPUTest/Make/options @@ -0,0 +1,46 @@ +-include $(GENERAL_RULES)/mplibType + +EXE_INC = -std=c++14 \ + -g \ + -fopenmp \ + -Wno-unused-variable \ + -Wno-unused-but-set-variable \ + -Wno-old-style-cast \ + $(PFLAGS) $(PINC) \ + $(if $(LIBTORCH_ROOT),-DUSE_LIBTORCH,) \ + $(if $(PYTHON_INC_DIR),-DUSE_PYTORCH,) \ + -I$(LIB_SRC)/transportModels/compressible/lnInclude \ + -I$(LIB_SRC)/thermophysicalModels/basic/lnInclude \ + -I$(LIB_SRC)/TurbulenceModels/turbulenceModels/lnInclude \ + -I$(LIB_SRC)/TurbulenceModels/compressible/lnInclude \ + -I$(LIB_SRC)/finiteVolume/cfdTools \ + -I$(LIB_SRC)/finiteVolume/lnInclude \ + -I$(LIB_SRC)/meshTools/lnInclude \ + -I$(LIB_SRC)/sampling/lnInclude \ + -I$(LIB_SRC)/dynamicFvMesh/lnInclude \ + -I$(LIB_SRC)/Pstream/mpi \ + -I$(DF_SRC)/dfCanteraMixture/lnInclude \ + -I$(DF_SRC)/dfChemistryModel/lnInclude \ + -I$(DF_SRC)/dfCombustionModels/lnInclude \ + -I$(CANTERA_ROOT)/include \ + -I$(DF_ROOT)/src_gpu \ + -I/usr/local/cuda-11.6/include \ + -I$(AMGX_DIR)/include + +EXE_LIBS = \ + -lcompressibleTransportModels \ + -lturbulenceModels \ + -lfiniteVolume \ + -lmeshTools \ + -lsampling \ + -L$(DF_LIBBIN) \ + -ldfFluidThermophysicalModels \ + -ldfCompressibleTurbulenceModels \ + -ldfCanteraMixture \ + -ldfChemistryModel \ + 
-ldfCombustionModels \ + $(CANTERA_ROOT)/lib/libcantera.so \ + /usr/local/cuda-11.6/lib64/libcudart.so \ + $(AMGX_DIR)/build/libamgxsh.so \ + $(DF_ROOT)/src_gpu/build/libdfMatrix.so + diff --git a/GPUTest/correctPhi.H b/GPUTest/correctPhi.H new file mode 100644 index 000000000..3cd82d29e --- /dev/null +++ b/GPUTest/correctPhi.H @@ -0,0 +1,12 @@ +CorrectPhi +( + U, + phi, + p, + rho, + psi, + dimensionedScalar("rAUf", dimTime, 1), + divrhoU(), + pimple, + true +); diff --git a/GPUTest/createFields.H b/GPUTest/createFields.H new file mode 100644 index 000000000..9e750c334 --- /dev/null +++ b/GPUTest/createFields.H @@ -0,0 +1,176 @@ +#include "createRDeltaT.H" + +Info<< "Reading thermophysical properties\n" << endl; + +// fluidThermo* pThermo = new hePsiThermo(mesh, word::null); +fluidThermo* pThermo = new heRhoThermo(mesh, word::null); +fluidThermo& thermo = *pThermo; +// thermo.validate(args.executable(), "ha"); + +const volScalarField& psi = thermo.psi(); +volScalarField& p = thermo.p(); +volScalarField& T = thermo.T(); +volScalarField rho +( + IOobject + ( + "rho", + runTime.timeName(), + mesh, + IOobject::READ_IF_PRESENT, + IOobject::AUTO_WRITE + ), + thermo.rho() +); + + +Info<< "Reading field U\n" << endl; +volVectorField U +( + IOobject + ( + "U", + runTime.timeName(), + mesh, + IOobject::MUST_READ, + IOobject::AUTO_WRITE + ), + mesh +); + +#include "compressibleCreatePhi.H" + +pressureControl pressureControl(p, rho, pimple.dict(), false); + +mesh.setFluxRequired(p.name()); + +Info<< "Creating turbulence model\n" << endl; +autoPtr turbulence +( + compressible::turbulenceModel::New + ( + rho, + U, + phi, + thermo + ) +); + +Info<< "Creating field dpdt\n" << endl; +volScalarField dpdt +( + IOobject + ( + "dpdt", + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedScalar("dpdt",p.dimensions()/dimTime, 0) +); + + +Info<< "Creating reaction model\n" << endl; +autoPtr> combustion +( + CombustionModel::New(thermo, 
turbulence()) +); +Info<< "end Creating reaction model\n" << endl; + + +const word combModelName(mesh.objectRegistry::lookupObject("combustionProperties").lookup("combustionModel")); +Info << "Combustion Model Name is confirmed as "<< combModelName << endl; + +const word turbName(mesh.objectRegistry::lookupObject("turbulenceProperties").lookup("simulationType")); + +dfChemistryModel* chemistry = combustion->chemistry(); +PtrList& Y = chemistry->Y(); +const word inertSpecie(chemistry->lookup("inertSpecie")); +const label inertIndex(chemistry->species()[inertSpecie]); +chemistry->setEnergyName("ha"); +chemistry->updateEnergy(); + + +chemistry->correctThermo(); +Info<< "At initial time, min/max(T) = " << min(T).value() << ", " << max(T).value() << endl; + +//for dpdt + +Info<< "Creating field kinetic energy K\n" << endl; +volScalarField K("K", 0.5*magSqr(U)); + +multivariateSurfaceInterpolationScheme::fieldTable fields; + +if(combModelName!="flareFGM") +{ +forAll(Y, i) +{ + fields.add(Y[i]); +} +fields.add(thermo.he()); +} + + +const scalar Sct = chemistry->lookupOrDefault("Sct", 1.); +volScalarField diffAlphaD +( + IOobject + ( + "diffAlphaD", + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedScalar(dimEnergy/dimTime/dimVolume, 0) +); +volVectorField hDiffCorrFlux +( + IOobject + ( + "hDiffCorrFlux", + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedVector(dimensionSet(1,0,-3,0,0,0,0), Zero) +); +volVectorField sumYDiffError +( + IOobject + ( + "sumYDiffError", + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedVector("sumYDiffError", dimDynamicViscosity/dimLength, Zero) +); + +IOdictionary CanteraTorchProperties +( + IOobject + ( + "CanteraTorchProperties", + runTime.constant(), + mesh, + IOobject::MUST_READ, + IOobject::NO_WRITE + ) +); +const Switch splitting = CanteraTorchProperties.lookupOrDefault("splittingStrategy", 
false); +#ifdef USE_PYTORCH + const Switch log_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("log", false); + const Switch torch_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("torch", false); +#endif +#ifdef USE_LIBTORCH + const Switch log_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("log", false); + const Switch torch_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("torch", false); +#endif diff --git a/GPUTest/createGPUSolver.H b/GPUTest/createGPUSolver.H new file mode 100644 index 000000000..9a6c289ab --- /dev/null +++ b/GPUTest/createGPUSolver.H @@ -0,0 +1,71 @@ +dfMatrixDataBase dfDataBase; + +void createGPUBase(fvMesh& mesh, PtrList& Y) { + // obtain variables from fvMesh + const labelUList& owner = mesh.owner(); + const labelUList& neighbour = mesh.neighbour(); + int num_cells = mesh.nCells(); + int num_surfaces = neighbour.size(); + + + // prepare num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_sizes, num_species, rdelta_t + // - obtain boundary size info from mesh + int patchSize = 0, num_patches = 0, num_boundary_surfaces = 0; + std::vector patch_sizes; + forAll(mesh.boundary(), patchi) { + labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); + patchSize = sub_boundary.size(); + + patch_sizes.push_back(patchSize); + num_boundary_surfaces += patchSize; + num_patches ++; + } + dfDataBase.setConstantValues(num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_sizes, Y.size(), 1e-6); // TODO: get deltaT fomr time API + + // prepare owner, neighbor + dfDataBase.setConstantIndexes(&owner[0], &neighbour[0]); + + // prepare internal and boundary of sf, mag_sf, weights, delta_coeffs, volume + // - obtain boundary field info from mesh + double *boundary_sf = new double[3 * num_boundary_surfaces]; + double *boundary_mag_sf = new double[num_boundary_surfaces]; + double *boundary_delta_coeffs = new double[num_boundary_surfaces]; + int offset = 0; 
+ forAll(mesh.boundary(), patchi) { + const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; + const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; + const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; + + patchSize = pMagSf.size(); + + memcpy(boundary_sf + 3*offset, &pSf[0][0], 3*patchSize*sizeof(double)); + memcpy(boundary_mag_sf + offset, &pMagSf[0], patchSize*sizeof(double)); + memcpy(boundary_delta_coeffs + offset, &pDeltaCoeffs[0], patchSize*sizeof(double)); + offset += patchSize; + } + + dfDataBase.createConstantFieldsInternal(); + dfDataBase.createConstantFieldsBoundary(); + dfDataBase.initConstantFieldsInternal(&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.surfaceInterpolation::weights()[0], &mesh.nonOrthDeltaCoeffs()[0], &mesh.V()[0]); + dfDataBase.initConstantFieldsBoundary(boundary_sf, boundary_mag_sf, boundary_delta_coeffs); + + // prepare internal and boundary of xxx + // - obtain init_Y + double *h_Y = new double[Y.size() * num_cells]; + double *boundary_Y = new double[Y.size() * num_boundary_surfaces]; + forAll(Y, speciesI) { + volScalarField& Yi = Y[speciesI]; + memcpy(h_Y + speciesI * num_cells, &Yi[0], num_cells * sizeof(double)); + offset = 0; + forAll(Yi.boundaryField(), patchi) { + const scalarField& patchYi = Yi.boundaryField()[patchi]; + patchSize = patchYi.size(); + memcpy(boundary_Y + speciesI * num_boundary_surfaces + offset, &patchYi[0], patchSize*sizeof(double)); + offset += patchSize; + } + } + dfDataBase.createNonConstantFieldsInternal(); + dfDataBase.createNonConstantFieldsBoundary(); + dfDataBase.initNonConstantFieldsInternal(h_Y); + dfDataBase.initNonConstantFieldsBoundary(boundary_Y); +}; \ No newline at end of file diff --git a/GPUTest/setRDeltaT.H b/GPUTest/setRDeltaT.H new file mode 100644 index 000000000..074d05e3d --- /dev/null +++ b/GPUTest/setRDeltaT.H @@ -0,0 +1,85 @@ +{ + volScalarField& rDeltaT = trDeltaT.ref(); + + const dictionary& pimpleDict = pimple.dict(); + + scalar 
maxCo + ( + pimpleDict.lookupOrDefault("maxCo", 0.8) + ); + + scalar rDeltaTSmoothingCoeff + ( + pimpleDict.lookupOrDefault("rDeltaTSmoothingCoeff", 0.02) + ); + + scalar rDeltaTDampingCoeff + ( + pimpleDict.lookupOrDefault("rDeltaTDampingCoeff", 1.0) + ); + + scalar maxDeltaT + ( + pimpleDict.lookupOrDefault("maxDeltaT", great) + ); + + volScalarField rDeltaT0("rDeltaT0", rDeltaT); + + // Set the reciprocal time-step from the local Courant number + rDeltaT.ref() = max + ( + 1/dimensionedScalar(dimTime, maxDeltaT), + fvc::surfaceSum(mag(phi))()() + /((2*maxCo)*mesh.V()*rho()) + ); + + if (pimple.transonic()) + { + surfaceScalarField phid + ( + "phid", + fvc::interpolate(psi)*fvc::flux(U) + ); + + rDeltaT.ref() = max + ( + rDeltaT(), + fvc::surfaceSum(mag(phid))()() + /((2*maxCo)*mesh.V()*psi()) + ); + } + + // Update tho boundary values of the reciprocal time-step + rDeltaT.correctBoundaryConditions(); + + Info<< "Flow time scale min/max = " + << gMin(1/rDeltaT.primitiveField()) + << ", " << gMax(1/rDeltaT.primitiveField()) << endl; + + if (rDeltaTSmoothingCoeff < 1.0) + { + fvc::smooth(rDeltaT, rDeltaTSmoothingCoeff); + } + + Info<< "Smoothed flow time scale min/max = " + << gMin(1/rDeltaT.primitiveField()) + << ", " << gMax(1/rDeltaT.primitiveField()) << endl; + + // Limit rate of change of time scale + // - reduce as much as required + // - only increase at a fraction of old time scale + if + ( + rDeltaTDampingCoeff < 1.0 + && runTime.timeIndex() > runTime.startTimeIndex() + 1 + ) + { + rDeltaT = + rDeltaT0 + *max(rDeltaT/rDeltaT0, scalar(1) - rDeltaTDampingCoeff); + + Info<< "Damped flow time scale min/max = " + << gMin(1/rDeltaT.primitiveField()) + << ", " << gMax(1/rDeltaT.primitiveField()) << endl; + } +} diff --git a/GPUTest/setRootCase2.H b/GPUTest/setRootCase2.H new file mode 100644 index 000000000..45d966e63 --- /dev/null +++ b/GPUTest/setRootCase2.H @@ -0,0 +1,5 @@ +Foam::argList args(argc,argv,true,true,/*initialise=*/false); +if 
(!args.checkRootCase()) +{ + Foam::FatalError.exit(); +} \ No newline at end of file diff --git a/GPUTest/unittest.C b/GPUTest/unittest.C new file mode 100644 index 000000000..2e3d55ce5 --- /dev/null +++ b/GPUTest/unittest.C @@ -0,0 +1,100 @@ +/*---------------------------------------------------------------------------*\ + ========= | + \\ / F ield | OpenFOAM: The Open Source CFD Toolbox + \\ / O peration | Website: https://openfoam.org + \\ / A nd | Copyright (C) 2011-2019 OpenFOAM Foundation + \\/ M anipulation | +------------------------------------------------------------------------------- +License + This file is part of OpenFOAM. + + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . 
+ +Application + unittest + +Description + GPU unittest + +\*---------------------------------------------------------------------------*/ + +#include "dfChemistryModel.H" +#include "CanteraMixture.H" +// #include "hePsiThermo.H" +#include "heRhoThermo.H" + +#include "fvCFD.H" +#include "fluidThermo.H" +#include "turbulentFluidThermoModel.H" +#include "pimpleControl.H" +#include "pressureControl.H" +#include "localEulerDdtScheme.H" +#include "fvcSmooth.H" +#include "PstreamGlobals.H" +#include "basicThermo.H" +#include "CombustionModel.H" + +#include "dfMatrixDataBase.H" +#include +#include +#include "upwind.H" +#include "createGPUSolver.H" + +int main(int argc, char *argv[]) +{ +#ifdef USE_PYTORCH + pybind11::scoped_interpreter guard{};//start python interpreter +#endif + #include "postProcess.H" + + // #include "setRootCaseLists.H" + #include "listOptions.H" + #include "setRootCase2.H" + #include "listOutput.H" + + #include "createTime.H" + #include "createMesh.H" + #include "createDyMControls.H" + #include "initContinuityErrs.H" + #include "createFields.H" + #include "createRhoUfIfPresent.H" + + turbulence->validate(); + + if (!LTS) + { + #include "compressibleCourantNo.H" + #include "setInitialDeltaT.H" + } + + // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + { + #include "readDyMControls.H" + + if (LTS) + { + #include "setRDeltaT.H" + } + else + { + #include "compressibleCourantNo.H" + #include "setDeltaT.H" + } + + createGPUBase(mesh, Y); + } + return 0; +} + + diff --git a/src_gpu/CMakeLists.txt b/src_gpu/CMakeLists.txt index 6e4a7efef..015a1d11b 100644 --- a/src_gpu/CMakeLists.txt +++ b/src_gpu/CMakeLists.txt @@ -12,6 +12,8 @@ find_package(MPI REQUIRED) find_package(CUDAToolkit REQUIRED) find_library(LIBAMGXSH amgxsh PATHS $ENV{AMGX_DIR}/build) +add_compile_options(-arch=sm_70) + include_directories( ${MPI_INCLUDE_PATH} ${CUDA_INCLUDE_DIRS} @@ -20,11 +22,6 @@ include_directories( add_library(${PROJECT_NAME} SHARED - dfUEqn.cu - 
dfRhoEqn.cu - dfYEqn.cu - dfEEqn.cu - AmgXSolver.cu dfMatrixDataBase.cu) target_link_libraries(${PROJECT_NAME} diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 8efb4bf62..c2e1446ec 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -52,590 +52,138 @@ void constructBoundarySelector(std::vector& patchTypeSelector, const std::s struct dfMatrixDataBase { - // - cuda resource + // cuda resource cudaStream_t stream; - - // - number of cell size - int num_cells; - // - number of face size - int num_surfaces; - // - number of offdiagnal entry size (2*num_surfaces) - int num_faces; - // - number of boundary cells - int num_boundary_cells; - // - number of boundary faces - int num_boundary_faces; - - int num_species; - - // - mesh variables - // - csr_row_index - int *h_A_csr_row_index=nullptr, *d_A_csr_row_index=nullptr; - // - csr_col_index - int *h_A_csr_col_index=nullptr, *d_A_csr_col_index=nullptr; - // - csr_diag_index - int *h_A_csr_diag_index=nullptr, *d_A_csr_diag_index=nullptr; - - // - the pre-permutated and post-permutated interpolation weight list - std::vector h_weight_vec_init, h_weight_vec; - // - the pre-permutated and post-permutated flux (phi) list - std::vector h_phi_vec_init, h_phi_vec; - // - the pre-permutated and post-permutated cell face vector list - std::vector h_face_vector_vec_init, h_face_vector_vec; - std::vector h_face_vec_init, h_face_vec; - std::vector h_deltaCoeffs_vec_init, h_deltaCoeffs_vec; - // - the host pointer to rho_new, rho_old, velocity_old, pressure and volume list - double *h_rho_new = nullptr, *h_rho_old = nullptr, *h_velocity_old = nullptr, - *h_pressure = nullptr; - const double *h_volume = nullptr; - // - the host pointer to the pre-permutated and post-permutated interpolation weight list - double *h_weight_init = nullptr, *h_weight = nullptr; - // - the host pointer to the pre-permutated and post-permutated flux (phi) list - double *h_phi_init = nullptr, *h_phi = nullptr; - 
// - the host pointer to the pre-permutated and post-permutated cell face vector list - double *h_face_vector_init = nullptr, *h_face_vector = nullptr; - double *h_face_init = nullptr, *h_face = nullptr; - double *h_deltaCoeffs_init = nullptr, *h_deltaCoeffs = nullptr; - // - the device pointer to rho_new, rho_old, velocity_old, pressure and volume list - double *d_rho_new = nullptr, *d_rho_old = nullptr, *d_velocity_old = nullptr, - *d_pressure = nullptr, *d_volume = nullptr; - // - the device pointer to Y(vector Yi) - //std::vector d_Y; - double *d_Y = nullptr; - // - the device pointer to the pre-permutated and post-permutated interpolation weight list - double *d_weight_init = nullptr, *d_weight = nullptr; - double *d_weight_upwind = nullptr; - // - the device pointer to the pre-permutated and post-permutated flux (phi) list - double *d_phi_init = nullptr, *d_phi = nullptr; - // - the device pointer to the pre-permutated and post-permutated cell face vector list - double *d_face_vector_init = nullptr, *d_face_vector = nullptr; - double *d_face_init = nullptr, *d_face = nullptr; - double *d_deltaCoeffs_init = nullptr, *d_deltaCoeffs = nullptr; - std::vector d_rhoD_vector; - - double *d_hDiffCorrFlux = nullptr; - double *d_diffAlphaD = nullptr; - double *d_rhoD = nullptr; - double *d_alpha = nullptr; - - double rdelta_t = 1/1e-6; - - /** - * @brief boundary related variables - */ - int *h_boundary_cell_offset = nullptr, *d_boundary_cell_offset=nullptr; - int *h_boundary_cell_id = nullptr, *d_boundary_cell_id = nullptr; - double *h_internal_coeffs = nullptr, *h_boundary_coeffs = nullptr, - *h_boundary_pressure = nullptr, *h_boundary_face_vector = nullptr, - *h_boundary_face = nullptr, *d_boundary_face = nullptr, - *h_boundary_deltaCoeffs = nullptr, *d_boundary_deltaCoeffs = nullptr, - *d_internal_coeffs = nullptr, *d_boundary_coeffs = nullptr, - *d_internal_coeffs_init = nullptr, *d_boundary_coeffs_init = nullptr, - *d_laplac_internal_coeffs = nullptr, 
*d_laplac_boundary_coeffs = nullptr, - *d_laplac_internal_coeffs_init = nullptr, *d_laplac_boundary_coeffs_init = nullptr, - *d_boundary_pressure = nullptr, *d_boundary_face_vector = nullptr, - *d_boundary_pressure_init = nullptr, - *d_boundary_phi = nullptr, *d_boundary_phi_init = nullptr, - *d_boundary_velocity = nullptr, *d_boundary_velocity_init = nullptr, - *d_boundary_nuEff = nullptr, *d_boundary_nuEff_init = nullptr, - *d_boundary_rho = nullptr, *d_boundary_rho_init = nullptr; - std::vector d_boundary_Y_vector; - std::vector d_boundary_Y_init_vector; - std::vector d_internal_coeffs_Y_vector; - std::vector d_boundary_coeffs_Y_vector; - std::vector d_laplac_internal_coeffs_Y_vector; - std::vector d_laplac_boundary_coeffs_Y_vector; - double *d_internal_coeffs_Y = nullptr; - double *d_boundary_coeffs_Y = nullptr; - double *d_laplac_internal_coeffs_Y = nullptr; - double *d_laplac_boundary_coeffs_Y = nullptr; - std::vector d_boundary_rhoD_vector; - double *d_boundary_mut_sct = nullptr; - double *d_boundary_rhoD = nullptr; - double *d_boundary_alpha = nullptr; - - double *d_boundary_hDiffCorrFlux = nullptr; - int *d_boundary_UpatchType = nullptr; - int *d_boundary_YpatchType = nullptr; - - std::vector boundPermutationList; - std::vector ueqn_internalCoeffs, ueqn_boundaryCoeffs; - std::vector boundary_face_vector; - std::vector boundary_pressure; - std::vector boundary_face; - std::vector boundary_deltaCoeffs; - std::vector> patch_type_init; - std::vector> patch_type; - - // - the device pointer to the permutated index list - std::vector permedIndex; - int *d_permedIndex=nullptr; - int *d_bouPermedIndex = nullptr; - - - // bytesize - // - bytes of diagnal entries - size_t cell_bytes; - // - bytes of diagnal entries (vector) - size_t cell_vec_bytes; - // - bytes of diagnal index - size_t cell_index_bytes; - // - bytes of diagnal index - size_t face_bytes; - size_t face_vec_bytes; - size_t face_index_bytes; - - size_t boundary_cell_bytes; - size_t 
boundary_cell_vec_bytes; - size_t boundary_cell_index_bytes; - - size_t boundary_face_bytes; - size_t boundary_face_vec_bytes; - size_t boundary_face_index_bytes; - - // A_csr has one more element in each row: itself - size_t csr_row_index_bytes; - size_t csr_col_index_bytes; - size_t csr_value_bytes; - size_t csr_value_vec_bytes; - - // extra matrix information - double *d_turbSrc_A = nullptr, *d_turbSrc_b = nullptr, *d_turbSrc_A_init = nullptr; - std::vector h_turbSrc_init_mtx_vec, h_turbSrc_init_1mtx; - std::vector h_turbSrc_init_src_vec, h_turbSrc_src_vec; - std::vector tmpPermutatedList; - int * d_tmpPermutatedList = nullptr; - - // double *h_A_csr = nullptr, *h_b = nullptr, *h_psi = nullptr; - // double *d_A_csr = nullptr, *d_b = nullptr, *d_psi = nullptr; - - int num_iteration; - - double time_monitor_CPU; - double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test; - - double* d_grad = nullptr; - double* d_grad_boundary = nullptr, *d_grad_boundary_init = nullptr; - double* d_nuEff = nullptr; + // maybe one graph for one eqn before using self-developed solver + // and should be located in each eqn. 
+ cudaGraph_t graph; + cudaGraphExec_t graph_instance; + bool graph_created=false; + + // constant values -- basic + int num_cells = 0; + int num_surfaces = 0; + int num_boundary_surfaces = 0; + int num_patches = 0; + int num_species = 0; + std::vector patch_sizes; + double rdelta_t = 0; + + // constant values -- ldu bytesize + size_t cell_value_bytes = 0; + size_t cell_value_vec_bytes = 0; + size_t cell_value_tsr_bytes = 0; + size_t cell_index_bytes = 0; + size_t surface_value_bytes = 0; + size_t surface_index_bytes = 0; + size_t surface_value_vec_bytes = 0; + size_t boundary_surface_value_bytes = 0; + size_t boundary_surface_value_vec_bytes = 0; + size_t boundary_surface_value_tsr_bytes = 0; + size_t boundary_surface_index_bytes = 0; + + // constant values -- csr bytesize + size_t csr_row_index_bytes = 0; + size_t csr_col_index_bytes = 0; + size_t csr_value_bytes = 0; + size_t csr_value_vec_bytes = 0; + + // constant indexes + int *d_owner = nullptr; + int *d_neighbor = nullptr; + int *d_lower_to_csr_index = nullptr; + int *d_diag_to_csr_index= nullptr; + int *d_upper_to_csr_index= nullptr; + int *d_csr_row_index= nullptr; + int *d_csr_col_index= nullptr; + + // constant fields - internal + double *d_sf = nullptr; + double *d_mag_sf = nullptr; + double *d_weight = nullptr; + double *d_delta_coeffs = nullptr; + double *d_volume = nullptr; + + // constant fields - boundary + double *d_boundary_sf = nullptr; + double *d_boundary_mag_sf = nullptr; + double *d_boundary_weight = nullptr; + double *d_boundary_delta_coeffs = nullptr; + + // non-constant fields - internal + // TODO: further estimate + // fields solved by eqns - new + double *d_rho = nullptr; + double *d_u = nullptr; + double *d_y = nullptr; + double *d_he = nullptr; + double *d_p = nullptr; + // fields solved by eqns - old + // TODO: not all fields need to store oldTime + double *d_rho_old = nullptr; + double *d_u_old = nullptr; + double *d_y_old = nullptr; + double *d_he_old = nullptr; + double *d_p_old 
= nullptr; + // other shared fields between eqns + double *d_phi = nullptr; + // computed on GPU, used on CPU, need memcpyd2h - host + double *h_rho = nullptr; + double *h_u= nullptr; + double *h_y= nullptr; + double *h_he= nullptr; + // computed on CPU, used on GPU, need memcpyh2d - host + double *h_p= nullptr; + double *h_phi= nullptr; + + // non-constant fields - boundary + // TODO: further estimate + // fields solved by eqns - new + double *d_boundary_rho = nullptr; + double *d_boundary_u = nullptr; + double *d_boundary_y = nullptr; + double *d_boundary_he = nullptr; + double *d_boundary_p = nullptr; + // fields solved by eqns - old + double *d_boundary_rho_old = nullptr; + double *d_boundary_u_old = nullptr; + double *d_boundary_y_old = nullptr; + double *d_boundary_he_old = nullptr; + double *d_boundary_p_old = nullptr; + // other shared fields between eqns + double *d_boundary_phi = nullptr; + // computed on GPU, used on CPU, need memcpyd2h - host + double *h_boundary_rho = nullptr; + double *h_boundary_u= nullptr; + double *h_boundary_y= nullptr; + double *h_boundary_he= nullptr; + // computed on CPU, used on GPU, need memcpyh2d - host + double *h_boundary_p= nullptr; + double *h_boundary_phi= nullptr; // constructor dfMatrixDataBase(); - dfMatrixDataBase(int num_surfaces, int num_cells, int num_boundary_faces, int num_species, int & num_boundary_cells_output, - const int *neighbour, const int *owner, const double* volume, const double* weight, const double* face_vector, const double* face, - const double* deltaCoeffs, std::vector boundary_face_vector_init, std::vector boundary_face_init, - std::vector boundary_deltaCoeffs_init, std::vector boundary_cell_id_init, std::vector> patch_type_init) - : num_cells(num_cells), num_faces(num_surfaces*2), num_surfaces(num_surfaces), num_species(num_species), num_iteration(0), - num_boundary_faces(num_boundary_faces), h_volume(volume), patch_type_init(patch_type_init) - { - // create cuda stream - 
checkCudaErrors(cudaStreamCreate(&stream)); - - // allocate field pointer in pin memory - cudaMallocHost(&h_phi_init, num_faces * sizeof(double)); - cudaMallocHost(&h_rho_old, num_cells * sizeof(double)); - - h_weight_vec_init.resize(num_faces); - h_weight_vec.resize(num_faces); - h_face_vector_vec_init.resize(num_faces*3); - h_face_vector_vec.resize(num_faces*3); - h_face_vec_init.resize(num_faces); - h_face_vec.resize(num_faces); - h_deltaCoeffs_vec_init.resize(num_faces); - h_deltaCoeffs_vec.resize(num_faces); - h_turbSrc_init_mtx_vec.resize(num_faces + num_cells); - h_turbSrc_init_1mtx.resize(num_faces + num_cells); - h_turbSrc_init_src_vec.resize(3*num_cells); - h_turbSrc_src_vec.resize(3*num_cells); - - // byte sizes - cell_bytes = num_cells * sizeof(double); - cell_vec_bytes = num_cells * 3 * sizeof(double); - cell_index_bytes = num_cells * sizeof(int); - - face_bytes = num_faces * sizeof(double); - face_vec_bytes = num_faces * 3 * sizeof(double); - face_index_bytes = num_faces * sizeof(int); - - // A_csr has one more element in each row: itself - csr_row_index_bytes = (num_cells + 1) * sizeof(int); - csr_col_index_bytes = (num_cells + num_faces) * sizeof(int); - csr_value_bytes = (num_cells + num_faces) * sizeof(double); - csr_value_vec_bytes = (num_cells + num_faces) * 3 * sizeof(double); - - /************************construct mesh variables****************************/ - /** - * 1. 
h_csr_row_index & h_csr_diag_index - */ - std::vector h_mtxEntry_perRow_vec(num_cells); - std::vector h_csr_diag_index_vec(num_cells); - std::vector h_csr_row_index_vec(num_cells + 1, 0); - - for (int faceI = 0; faceI < num_surfaces; faceI++) - { - h_csr_diag_index_vec[neighbour[faceI]]++; - h_mtxEntry_perRow_vec[neighbour[faceI]]++; - h_mtxEntry_perRow_vec[owner[faceI]]++; - } - - // - consider diagnal element in each row - std::transform(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_mtxEntry_perRow_vec.begin(), [](int n) - {return n + 1;}); - // - construct h_csr_row_index & h_csr_diag_index - std::partial_sum(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_csr_row_index_vec.begin()+1); - // - assign h_csr_row_index & h_csr_diag_index - h_A_csr_row_index = h_csr_row_index_vec.data(); - h_A_csr_diag_index = h_csr_diag_index_vec.data(); - - /** - * 2. h_csr_col_index - */ - std::vector rowIndex(num_faces + num_cells), colIndex(num_faces + num_cells), diagIndex(num_cells); - std::iota(diagIndex.begin(), diagIndex.end(), 0); - - // initialize the RowIndex (rowIndex of lower + upper + diagnal) - std::copy(neighbour, neighbour + num_surfaces, rowIndex.begin()); - std::copy(owner, owner + num_surfaces, rowIndex.begin() + num_surfaces); - std::copy(diagIndex.begin(), diagIndex.end(), rowIndex.begin() + num_faces); - // initialize the ColIndex (colIndex of lower + upper + diagnal) - std::copy(owner, owner + num_surfaces, colIndex.begin()); - std::copy(neighbour, neighbour + num_surfaces, colIndex.begin() + num_surfaces); - std::copy(diagIndex.begin(), diagIndex.end(), colIndex.begin() + num_faces); - - // - construct hashTable for sorting - std::multimap rowColPair; - for (int i = 0; i < 2*num_surfaces+num_cells; i++) - { - rowColPair.insert(std::make_pair(rowIndex[i], colIndex[i])); - } - // - sort - std::vector> globalPerm(rowColPair.begin(), rowColPair.end()); - std::sort(globalPerm.begin(), globalPerm.end(), [] - (const std::pair& pair1, 
const std::pair& pair2){ - if (pair1.first != pair2.first) { - return pair1.first < pair2.first; - } else { - return pair1.second < pair2.second; - } - }); - - std::vector h_csr_col_index_vec; - std::transform(globalPerm.begin(), globalPerm.end(), std::back_inserter(h_csr_col_index_vec), [] - (const std::pair& pair) { - return pair.second; - }); - h_A_csr_col_index = h_csr_col_index_vec.data(); - - // construct a tmp permutated List for add fvMatrix - std::vector tmp_permutation(2*num_surfaces + num_cells); - std::vector tmp_rowIndex(2*num_surfaces + num_cells); - std::iota(tmp_permutation.begin(), tmp_permutation.end(), 0); - std::copy(neighbour, neighbour + num_surfaces, tmp_rowIndex.begin()); - std::copy(diagIndex.begin(), diagIndex.end(), tmp_rowIndex.begin() + num_surfaces); - std::copy(owner, owner + num_surfaces, tmp_rowIndex.begin() + num_surfaces + num_cells); - std::multimap tmpPair; - for (int i = 0; i < 2*num_surfaces+num_cells; i++) - { - tmpPair.insert(std::make_pair(tmp_rowIndex[i], tmp_permutation[i])); - } - std::vector> tmpPerm(tmpPair.begin(), tmpPair.end()); - std::sort(tmpPerm.begin(), tmpPerm.end(), [] - (const std::pair& pair1, const std::pair& pair2){ - if (pair1.first != pair2.first) { - return pair1.first < pair2.first; - } else { - return pair1.second < pair2.second; - } - }); - std::transform(tmpPerm.begin(), tmpPerm.end(), std::back_inserter(tmpPermutatedList), [] - (const std::pair& pair) { - return pair.second; - }); - - /** - * 3. 
boundary imformations - */ - // get boundPermutation and offset lists - std::vector boundPermutationListInit(num_boundary_faces); - std::vector boundOffsetList; - std::iota(boundPermutationListInit.begin(), boundPermutationListInit.end(), 0); - - // - construct hashTable for sorting - std::multimap boundPermutation; - for (int i = 0; i < num_boundary_faces; i++) - { - boundPermutation.insert(std::make_pair(boundary_cell_id_init[i], boundPermutationListInit[i])); - } - - // - sort - std::vector> boundPermPair(boundPermutation.begin(), boundPermutation.end()); - std::sort(boundPermPair.begin(), boundPermPair.end(), [] - (const std::pair& pair1, const std::pair& pair2){ - if (pair1.first != pair2.first) { - return pair1.first < pair2.first; - } else { - return pair1.second < pair2.second; - } - }); - - // - construct boundPermedIndex and boundary_cell_id - std::vector boundary_cell_id; - boundPermutationList.clear(); - std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundary_cell_id), [] - (const std::pair& pair) { - return pair.first; - }); - std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundPermutationList), [] - (const std::pair& pair) { - return pair.second; - }); - - // construct boundary_cell_offset - std::map countMap; - std::vector boundaryCellcount; - for (const auto& cellIndex : boundary_cell_id) - ++ countMap[cellIndex]; - for (const auto& [cellIndex, count] : countMap) - boundaryCellcount.push_back(count); - - num_boundary_cells = boundaryCellcount.size(); - num_boundary_cells_output = num_boundary_cells; - - std::vector boundary_cell_offset(boundaryCellcount.size() + 1, 0); - std::partial_sum(boundaryCellcount.begin(), boundaryCellcount.end(), boundary_cell_offset.begin()+1); - - // assign h_boundary_cell_offset & h_boundary_cell_id - h_boundary_cell_offset = boundary_cell_offset.data(); - h_boundary_cell_id = boundary_cell_id.data(); - - // - boundary_cell_bytes = num_boundary_cells * 
sizeof(double); - boundary_cell_vec_bytes = num_boundary_cells * 3 * sizeof(double); - boundary_cell_index_bytes = num_boundary_cells * sizeof(int); - - boundary_face_bytes = num_boundary_faces * sizeof(double); - boundary_face_vec_bytes = num_boundary_faces * 3 * sizeof(double); - boundary_face_index_bytes = num_boundary_faces * sizeof(int); - - ueqn_internalCoeffs.resize(3*num_boundary_faces); - ueqn_boundaryCoeffs.resize(3*num_boundary_faces); - - boundary_face_vector.resize(3*num_boundary_faces); - boundary_pressure.resize(num_boundary_faces); - boundary_face.resize(num_boundary_faces); - boundary_deltaCoeffs.resize(num_boundary_faces); - - patch_type.resize(2); - patch_type[0].resize(num_boundary_faces); - patch_type[1].resize(num_boundary_faces); - - /** - * 4. permutation list for field variables - */ - std::vector offdiagRowIndex(2*num_surfaces), permIndex(2*num_surfaces); - // - initialize the offdiagRowIndex (rowIndex of lower + rowIndex of upper) - std::copy(neighbour, neighbour + num_surfaces, offdiagRowIndex.begin()); - std::copy(owner, owner + num_surfaces, offdiagRowIndex.begin() + num_surfaces); - - // - initialize the permIndex (0, 1, ..., 2*num_surfaces) - std::iota(permIndex.begin(), permIndex.end(), 0); - - // - construct hashTable for sorting - std::multimap permutation; - for (int i = 0; i < 2*num_surfaces; i++) - { - permutation.insert(std::make_pair(offdiagRowIndex[i], permIndex[i])); - } - // - sort - std::vector> permPair(permutation.begin(), permutation.end()); - std::sort(permPair.begin(), permPair.end(), [] - (const std::pair& pair1, const std::pair& pair2){ - if (pair1.first != pair2.first) { - return pair1.first < pair2.first; - } else { - return pair1.second < pair2.second; - } - }); - // - form permedIndex list - std::transform(permPair.begin(), permPair.end(), std::back_inserter(permedIndex), [] - (const std::pair& pair) { - return pair.second; - }); - - // copy and permutate cell variables - std::copy(weight, weight + 
num_surfaces, h_weight_vec_init.begin()); - std::copy(weight, weight + num_surfaces, h_weight_vec_init.begin() + num_surfaces); - std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin()); - std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin() + 3*num_surfaces); - std::copy(face, face + num_surfaces, h_face_vec_init.begin()); - std::copy(face, face + num_surfaces, h_face_vec_init.begin() + num_surfaces); - std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin()); - std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin() + num_surfaces); - for (int i = 0; i < num_faces; i++) - { - h_weight_vec[i] = h_weight_vec_init[permedIndex[i]]; - h_face_vec[i] = h_face_vec_init[permedIndex[i]]; - h_deltaCoeffs_vec[i] = h_deltaCoeffs_vec_init[permedIndex[i]]; - h_face_vector_vec[i*3] = h_face_vector_vec_init[3*permedIndex[i]]; - h_face_vector_vec[i*3+1] = h_face_vector_vec_init[3*permedIndex[i]+1]; - h_face_vector_vec[i*3+2] = h_face_vector_vec_init[3*permedIndex[i]+2]; - } - h_weight = h_weight_vec.data(); - h_face_vector = h_face_vector_vec.data(); - h_face = h_face_vec.data(); - h_deltaCoeffs = h_deltaCoeffs_vec.data(); - - for (int i = 0; i < num_boundary_faces; i++) - { - boundary_face_vector[3*i] = boundary_face_vector_init[3*boundPermutationList[i]]; - boundary_face_vector[3*i+1] = boundary_face_vector_init[3*boundPermutationList[i]+1]; - boundary_face_vector[3*i+2] = boundary_face_vector_init[3*boundPermutationList[i]+2]; - boundary_face[i] = boundary_face_init[boundPermutationList[i]]; - boundary_deltaCoeffs[i] = boundary_deltaCoeffs_init[boundPermutationList[i]]; - patch_type[0][i] = patch_type_init[0][boundPermutationList[i]]; - patch_type[1][i] = patch_type_init[1][boundPermutationList[i]]; - } - h_boundary_face_vector = boundary_face_vector.data(); - h_boundary_face = boundary_face.data(); - h_boundary_deltaCoeffs = boundary_deltaCoeffs.data(); - - 
/************************allocate memory on device****************************/ - int total_bytes = 0; - - checkCudaErrors(cudaMalloc((void**)&d_A_csr_row_index, csr_row_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_A_csr_col_index, csr_col_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_A_csr_diag_index, cell_index_bytes)); - total_bytes += (csr_row_index_bytes + csr_col_index_bytes + cell_index_bytes); - - //d_Y.resize(num_species); - d_rhoD_vector.resize(num_species); - d_boundary_Y_vector.resize(num_species); - d_boundary_Y_init_vector.resize(num_species); - d_internal_coeffs_Y_vector.resize(num_species); - d_boundary_coeffs_Y_vector.resize(num_species); - d_laplac_internal_coeffs_Y_vector.resize(num_species); - d_laplac_boundary_coeffs_Y_vector.resize(num_species); - d_boundary_rhoD_vector.resize(num_species); - - for (size_t i = 0; i < num_species; ++i){ - //checkCudaErrors(cudaMalloc((void**)&d_Y[i], cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_rhoD_vector[i], cell_bytes)); - } - checkCudaErrors(cudaMalloc((void**)&d_Y, cell_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_rho_new, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_volume, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_pressure, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_velocity_old, cell_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_weight, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_weight_upwind, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_face, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_deltaCoeffs, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_phi, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_phi_init, face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_face_vector, face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_nuEff, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_rhoD, cell_bytes * 
num_species)); - checkCudaErrors(cudaMalloc((void**)&d_alpha, cell_bytes)); - total_bytes += (cell_bytes * (5 + 2*num_species) + face_bytes * 6 + cell_vec_bytes + face_vec_bytes); - - checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int))); - checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_id, boundary_face_index_bytes)); - total_bytes += (boundary_face_index_bytes + (num_boundary_cells+1) * sizeof(int)); - - checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure_init, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_phi_init, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_phi, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity_init, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_face_vector, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_face, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_deltaCoeffs, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_init, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_init, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_init, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_init, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs, boundary_face_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff, 
boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff_init, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_rho, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_init, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_mut_sct, boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_UpatchType, boundary_face_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_YpatchType, boundary_face_index_bytes)); - for (size_t i = 0; i < num_species; ++i){ - checkCudaErrors(cudaMalloc((void**)&d_boundary_Y_vector[i], boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_Y_init_vector[i], boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_Y_vector[i], boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_Y_vector[i], boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_Y_vector[i], boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_Y_vector[i], boundary_face_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_rhoD_vector[i], boundary_face_bytes)); - } - checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_Y, boundary_face_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_Y, boundary_face_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_Y, boundary_face_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_Y, boundary_face_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_rhoD, boundary_face_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_alpha, boundary_face_bytes)); - - total_bytes += (boundary_face_bytes*10 + boundary_face_vec_bytes * 11); - - // checkCudaErrors(cudaMalloc((void**)&d_A_csr, csr_value_vec_bytes)); - // checkCudaErrors(cudaMalloc((void**)&d_b, 
cell_vec_bytes)); - // checkCudaErrors(cudaMalloc((void**)&d_psi, cell_vec_bytes)); - total_bytes += (boundary_face_bytes + boundary_face_vec_bytes * 3); - - checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A, csr_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A_init, csr_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_turbSrc_b, cell_vec_bytes)); - total_bytes += (2*csr_value_bytes + cell_vec_bytes); - - checkCudaErrors(cudaMalloc((void**)&d_permedIndex, face_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_bouPermedIndex, boundary_face_index_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_tmpPermutatedList, csr_col_index_bytes)); - total_bytes += (face_index_bytes + boundary_face_index_bytes + csr_col_index_bytes); - - checkCudaErrors(cudaMalloc((void**)&d_grad, num_cells * 9 * sizeof(double))); - checkCudaErrors(cudaMalloc((void**)&d_grad_boundary, boundary_face_bytes * 9)); - checkCudaErrors(cudaMalloc((void**)&d_grad_boundary_init, boundary_cell_bytes * 9)); - total_bytes += (num_cells * 9 * sizeof(double) + boundary_face_bytes * 9 + boundary_cell_bytes * 9); // FIXME: rename - - checkCudaErrors(cudaMalloc((void**)&d_hDiffCorrFlux, 3 * cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_diffAlphaD, cell_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_hDiffCorrFlux, 3 * boundary_face_bytes)); - fprintf(stderr, "Total bytes malloc on GPU: %.2fMB\n", total_bytes * 1.0 / 1024 / 1024); + // deconstructor + ~dfMatrixDataBase(); - checkCudaErrors(cudaMemcpyAsync(d_A_csr_row_index, h_A_csr_row_index, csr_row_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_A_csr_col_index, h_A_csr_col_index, csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_A_csr_diag_index, h_A_csr_diag_index, cell_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_offset, h_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int), 
cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_id, h_boundary_cell_id, boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + // member function + void setConstantValues(int num_cells, int num_surfaces, int num_boundary_surfaces, + int num_patches, std::vector patch_sizes, + int num_species, double rdelta_t); + void setConstantIndexes(const int *owner, const int *neighbor); - checkCudaErrors(cudaMemcpyAsync(d_volume, h_volume, cell_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_weight, h_weight, face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_face, h_face, face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_deltaCoeffs, h_deltaCoeffs, face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_face_vector, h_face_vector, face_vec_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_face_vector, h_boundary_face_vector, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_face, h_boundary_face, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_deltaCoeffs, h_boundary_deltaCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_UpatchType, patch_type[0].data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_YpatchType, patch_type[1].data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + void createConstantFieldsInternal(); + void createConstantFieldsBoundary(); + void initConstantFieldsInternal(const double *sf, const double *mag_sf, + const double *weight, const double *delta_coeffs, const double *volume); + void initConstantFieldsBoundary(const double *boundary_sf, const double *boundary_mag_sf, + const double *boundary_delta_coeffs); - 
checkCudaErrors(cudaMemcpyAsync(d_permedIndex, permedIndex.data(), face_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_bouPermedIndex, boundPermutationList.data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_tmpPermutatedList, tmpPermutatedList.data(), csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); - }; + void createNonConstantFieldsInternal(); + void createNonConstantFieldsBoundary(); + void initNonConstantFieldsInternal(const double *y); + void initNonConstantFieldsBoundary(const double *boundary_y); - ~dfMatrixDataBase(){ - std::cout << "Destructor called." << std::endl; - // TODO: free pointers - - }; }; diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index d4f5a7ab0..4ecbc25c8 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -1,48 +1,231 @@ #include "dfMatrixDataBase.H" +dfMatrixDataBase::dfMatrixDataBase() { + checkCudaErrors(cudaStreamCreate(&stream)); +} -void constructBoundarySelector(std::vector& patchTypeSelector, const std::string& patchTypeStr, - const int patchSize) -{ - boundaryConditions patchCondition; - std::vector tmpSelector; - static std::map BCMap = { - {"zeroGradient", zeroGradient}, - {"fixedValue", fixedValue}, - {"empty", empty}, - {"coupled", coupled} - }; - auto iter = BCMap.find(patchTypeStr); - if (iter != BCMap.end()) { - patchCondition = iter->second; - } else { - throw std::runtime_error("Unknown boundary condition: " + patchTypeStr); +dfMatrixDataBase::~dfMatrixDataBase() { + // destroy cuda resources + checkCudaErrors(cudaStreamDestroy(stream)); + if (graph_created) { + checkCudaErrors(cudaGraphExecDestroy(graph_instance)); + checkCudaErrors(cudaGraphDestroy(graph)); } - // zeroGradient labeled as 0, fixedValue labeled as 1, coupled labeled as 2 - switch (patchCondition){ - case zeroGradient: - { - tmpSelector.resize(patchSize, 0); - patchTypeSelector.insert(patchTypeSelector.end(), 
tmpSelector.begin(), tmpSelector.end()); - break; - } - case fixedValue: - { - tmpSelector.resize(patchSize, 1); - patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); - break; + // TODO: free pointers +} + +void dfMatrixDataBase::setConstantValues(int num_cells, int num_surfaces, int num_boundary_surfaces, + int num_patches, std::vector patch_sizes, + int num_species, double rdelta_t) { + // constant values -- basic + this->num_cells = num_cells; + this->num_surfaces = num_surfaces; + this->num_boundary_surfaces = num_boundary_surfaces; + this->num_patches = num_patches; + this->patch_sizes = patch_sizes; + this->num_species = num_species; + this->rdelta_t = rdelta_t; + + // constant values -- ldu bytesize + cell_value_bytes = num_cells * sizeof(double); + cell_value_vec_bytes = num_cells * 3 * sizeof(double); + cell_value_tsr_bytes = num_cells * 9 * sizeof(double); + cell_index_bytes = num_cells * sizeof(int); + surface_value_bytes = num_surfaces * sizeof(double); + surface_index_bytes = num_surfaces * sizeof(int); + surface_value_vec_bytes = num_surfaces * 3 * sizeof(double); + boundary_surface_value_bytes = num_boundary_surfaces * sizeof(double); + boundary_surface_value_vec_bytes = num_boundary_surfaces * 3 * sizeof(double); + boundary_surface_value_tsr_bytes = num_boundary_surfaces * 9 * sizeof(double); + boundary_surface_index_bytes = num_boundary_surfaces * sizeof(int); + + // constant values -- csr bytesize + csr_row_index_bytes = (num_cells + 1) * sizeof(int); + csr_col_index_bytes = (num_cells + num_surfaces * 2) * sizeof(int); + csr_value_bytes = (num_cells + num_surfaces * 2) * sizeof(double); + csr_value_vec_bytes = (num_cells + num_surfaces * 2) * 3 * sizeof(double); +} + +void dfMatrixDataBase::setConstantIndexes(const int *owner, const int *neighbor) { + // build d_owner, d_neighbor + checkCudaErrors(cudaMalloc((void**)&d_owner, surface_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_neighbor, 
surface_index_bytes)); + checkCudaErrors(cudaMemcpyAsync(d_owner, owner, surface_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_neighbor, neighbor, surface_index_bytes, cudaMemcpyHostToDevice, stream)); + + + // build d_lower_to_csr_index, d_diag_to_csr_index, d_upper_to_csr_index + std::vector upperNum(num_cells, 0); + std::vector lowerNum(num_cells, 0); + std::vector lowerPermListInit(num_surfaces); + + int *upperOffset = (int*)calloc(num_cells + 1, sizeof(int)); + int *lowerOffset = (int*)calloc(num_cells + 1, sizeof(int)); + + for(int faceI = 0; faceI < num_surfaces; ++faceI){ + upperNum[owner[faceI]] ++; + lowerNum[neighbor[faceI]] ++; + } + std::partial_sum(upperNum.begin(), upperNum.end(), + upperOffset+1); + std::partial_sum(lowerNum.begin(), lowerNum.end(), + lowerOffset+1); + + std::iota(lowerPermListInit.begin(), lowerPermListInit.end(), 0); + + std::multimap permutation; + for (int i = 0; i < num_surfaces; ++i){ + permutation.insert(std::make_pair(neighbor[i], lowerPermListInit[i])); + } + std::vector> permPair(permutation.begin(), permutation.end()); + std::sort(permPair.begin(), permPair.end(), [] + (const std::pair& pair1, const std::pair& pair2){ + if (pair1.first != pair2.first) { + return pair1.first < pair2.first; + } else { + return pair1.second < pair2.second; } - case empty: - { - tmpSelector.resize(patchSize, 2); - patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); - break; + }); + + std::vector lowerPermList; + std::transform(permPair.begin(), permPair.end(), std::back_inserter(lowerPermList), [] + (const std::pair& pair) { + return pair.second; + }); + + std::vector lowCSRIndex, uppCSRIndex, diagCSRIndex, CSRRowIndex, CSRColIndex; + int uppIndexInCSR = 0, uppIndexInLdu = 0, lowIndexInCSR = 0, lowIndexInLdu = 0, lowNumInLdu = 0; + CSRRowIndex.push_back(0); + CSRColIndex.resize(2 * num_surfaces + num_cells); + lowCSRIndex.resize(num_surfaces); + for (int i = 0; i < 
num_cells; ++i) { + int numUppPerRow = upperOffset[i + 1] - upperOffset[i]; + int numLowPerRow = lowerOffset[i + 1] - lowerOffset[i]; + int numNZBefore = upperOffset[i] + lowerOffset[i] + i; // add diag + // csr row index + CSRRowIndex.push_back(numNZBefore); + // upper + for (int j = 0; j < numUppPerRow; ++j) { + uppIndexInCSR = numNZBefore + numLowPerRow + 1 + j; // 1 means diag + uppCSRIndex.push_back(uppIndexInCSR); + CSRColIndex[uppIndexInCSR] = neighbor[uppIndexInLdu]; // fill upper entry in CSRColIndex + uppIndexInLdu ++; } - case coupled: - { - tmpSelector.resize(patchSize, 3); - patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); - break; + // lower + for (int j = 0; j < numLowPerRow; ++j) { + lowIndexInCSR = numNZBefore + j; + lowIndexInLdu = lowerPermList[lowNumInLdu]; + lowCSRIndex[lowIndexInLdu] = lowIndexInCSR; + CSRColIndex[lowIndexInCSR] = owner[lowIndexInLdu]; // fill lower entry in CSRColIndex + lowNumInLdu ++; } + // diag + int diagIndexInCSR = numNZBefore + numLowPerRow; + diagCSRIndex.push_back(diagIndexInCSR); + CSRColIndex[diagIndexInCSR] = i; // fill diag entry in CSRColIndex } + + checkCudaErrors(cudaMalloc((void**)&d_lower_to_csr_index, surface_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_diag_to_csr_index, cell_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_upper_to_csr_index, surface_index_bytes)); + checkCudaErrors(cudaMemcpyAsync(d_lower_to_csr_index, lowCSRIndex.data(), surface_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_diag_to_csr_index, diagCSRIndex.data(), cell_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_upper_to_csr_index, uppCSRIndex.data(), surface_index_bytes, cudaMemcpyHostToDevice, stream)); + + + // build d_csr_row_index, d_csr_col_index + checkCudaErrors(cudaMalloc((void**)&d_csr_row_index, csr_row_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_csr_col_index, csr_col_index_bytes)); 
+ checkCudaErrors(cudaMemcpyAsync(d_csr_row_index, CSRRowIndex.data(), csr_row_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_csr_col_index, CSRColIndex.data(), csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); +} + +void dfMatrixDataBase::createConstantFieldsInternal() { + checkCudaErrors(cudaMalloc((void**)&d_sf, surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_mag_sf, surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_weight, surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_delta_coeffs, surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_volume, cell_value_bytes)); +} + +void dfMatrixDataBase::createConstantFieldsBoundary() { + checkCudaErrors(cudaMalloc((void**)&d_boundary_sf, boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_mag_sf, boundary_surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_delta_coeffs, boundary_surface_value_bytes)); +} + +void dfMatrixDataBase::initConstantFieldsInternal(const double *sf, const double *mag_sf, + const double *weight, const double *delta_coeffs, const double *volume) { + checkCudaErrors(cudaMemcpyAsync(d_sf, sf, surface_value_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_mag_sf, mag_sf, surface_value_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_weight, weight, surface_value_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_delta_coeffs, delta_coeffs, surface_value_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_volume, volume, cell_value_bytes, cudaMemcpyHostToDevice, stream)); +} + +void dfMatrixDataBase::initConstantFieldsBoundary(const double *boundary_sf, const double *boundary_mag_sf, + const double *boundary_delta_coeffs) { + checkCudaErrors(cudaMemcpyAsync(d_boundary_sf, boundary_sf, boundary_surface_value_vec_bytes, cudaMemcpyHostToDevice, 
stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_mag_sf, boundary_mag_sf, boundary_surface_value_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_delta_coeffs, boundary_delta_coeffs, boundary_surface_value_bytes, cudaMemcpyHostToDevice, stream)); +} + +void dfMatrixDataBase::createNonConstantFieldsInternal() { + checkCudaErrors(cudaMalloc((void**)&d_rho, cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_u, cell_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_y, cell_value_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_he, cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_p, cell_value_bytes)); + + checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_u_old, cell_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_y_old, cell_value_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_he_old, cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_p_old, cell_value_bytes)); + + checkCudaErrors(cudaMalloc((void**)&d_phi, surface_value_bytes)); + + // computed on GPU, used on CPU, need memcpyd2h + checkCudaErrors(cudaMallocHost((void**)&h_rho, cell_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_u, cell_value_vec_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_y, cell_value_bytes * num_species)); + checkCudaErrors(cudaMallocHost((void**)&h_he, cell_value_bytes)); + + // computed on CPU, used on GPU, need memcpyh2d + checkCudaErrors(cudaMallocHost((void**)&h_p, cell_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_phi, surface_value_bytes)); +} + +void dfMatrixDataBase::createNonConstantFieldsBoundary() { + checkCudaErrors(cudaMalloc((void**)&d_boundary_rho, boundary_surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_u, boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_y, boundary_surface_value_bytes * num_species)); + 
checkCudaErrors(cudaMalloc((void**)&d_boundary_he, boundary_surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_p, boundary_surface_value_bytes)); + + checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_old, boundary_surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_u_old, boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_y_old, boundary_surface_value_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_he_old, boundary_surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_p_old, boundary_surface_value_bytes)); + + checkCudaErrors(cudaMalloc((void**)&d_boundary_phi, boundary_surface_value_bytes)); + + // computed on GPU, used on CPU, need memcpyd2h + checkCudaErrors(cudaMallocHost((void**)&h_boundary_rho, boundary_surface_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_boundary_u, boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_boundary_y, boundary_surface_value_bytes * num_species)); + checkCudaErrors(cudaMallocHost((void**)&h_boundary_he, boundary_surface_value_bytes)); + + // computed on CPU, used on GPU, need memcpyh2d + checkCudaErrors(cudaMallocHost((void**)&h_boundary_p, boundary_surface_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_boundary_phi, boundary_surface_value_bytes)); +} + +void dfMatrixDataBase::initNonConstantFieldsInternal(const double *y) { + checkCudaErrors(cudaMemcpyAsync(d_y, y, cell_value_bytes * num_species, cudaMemcpyHostToDevice, stream)); +} + +void dfMatrixDataBase::initNonConstantFieldsBoundary(const double *boundary_y) { + checkCudaErrors(cudaMemcpyAsync(d_boundary_y, boundary_y, boundary_surface_value_bytes* num_species, cudaMemcpyHostToDevice, stream)); } diff --git a/src_gpu/AmgXSolver.H b/src_gpu_orig/AmgXSolver.H similarity index 100% rename from src_gpu/AmgXSolver.H rename to src_gpu_orig/AmgXSolver.H diff --git a/src_gpu/AmgXSolver.cu 
b/src_gpu_orig/AmgXSolver.cu similarity index 100% rename from src_gpu/AmgXSolver.cu rename to src_gpu_orig/AmgXSolver.cu diff --git a/src_gpu_orig/CMakeLists.txt b/src_gpu_orig/CMakeLists.txt new file mode 100644 index 000000000..6e4a7efef --- /dev/null +++ b/src_gpu_orig/CMakeLists.txt @@ -0,0 +1,39 @@ +# +# dfMatrix CMake configuration +# +cmake_minimum_required(VERSION 3.5) + +project(dfMatrix LANGUAGES CXX CUDA) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +find_package(CUDA REQUIRED) +find_package(MPI REQUIRED) +find_package(CUDAToolkit REQUIRED) +find_library(LIBAMGXSH amgxsh PATHS $ENV{AMGX_DIR}/build) + +include_directories( + ${MPI_INCLUDE_PATH} + ${CUDA_INCLUDE_DIRS} + $ENV{AMGX_DIR}/include +) + +add_library(${PROJECT_NAME} + SHARED + dfUEqn.cu + dfRhoEqn.cu + dfYEqn.cu + dfEEqn.cu + AmgXSolver.cu + dfMatrixDataBase.cu) + +target_link_libraries(${PROJECT_NAME} + ${MPI_LIBRARIES} + ${CUDA_LIBRARIES} + ${LIBAMGXSH} +) +target_compile_options(dfMatrix PUBLIC -g) +option(DFMATRIX_ENABLE_DETAILED_DEBUG "Enable detailed debug build" OFF) +if (DFMATRIX_ENABLE_DETAILED_DEBUG) + target_compile_definitions(${PROJECT_NAME} PRIVATE DEBUG) +endif() diff --git a/src_gpu/GPUMesh.H b/src_gpu_orig/GPUMesh.H similarity index 100% rename from src_gpu/GPUMesh.H rename to src_gpu_orig/GPUMesh.H diff --git a/src_gpu/GPUfield.H b/src_gpu_orig/GPUfield.H similarity index 100% rename from src_gpu/GPUfield.H rename to src_gpu_orig/GPUfield.H diff --git a/src_gpu/GPUfield.cpp b/src_gpu_orig/GPUfield.cpp similarity index 100% rename from src_gpu/GPUfield.cpp rename to src_gpu_orig/GPUfield.cpp diff --git a/src_gpu/dfEEqn.H b/src_gpu_orig/dfEEqn.H similarity index 100% rename from src_gpu/dfEEqn.H rename to src_gpu_orig/dfEEqn.H diff --git a/src_gpu/dfEEqn.cu b/src_gpu_orig/dfEEqn.cu similarity index 100% rename from src_gpu/dfEEqn.cu rename to src_gpu_orig/dfEEqn.cu diff --git a/src_gpu_orig/dfMatrixDataBase.H b/src_gpu_orig/dfMatrixDataBase.H new file mode 
100644 index 000000000..8efb4bf62 --- /dev/null +++ b/src_gpu_orig/dfMatrixDataBase.H @@ -0,0 +1,641 @@ +#pragma once + +#include +#include +#include "cuda_profiler_api.h" +#include +#include "nvtx3/nvToolsExt.h" +#include +#include +#include +#include +#include +#include +#include + + +static const char *_cudaGetErrorEnum(cudaError_t error) { + return cudaGetErrorName(error); +} + +template +void check(T result, char const *const func, const char *const file, + int const line) { + if (result) { + fprintf(stderr, "cuda error at %s:%d code=%d(%s) \"%s\" \n", file, line, + static_cast(result), _cudaGetErrorEnum(result), func); + exit(EXIT_FAILURE); + } +} + +#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) + +inline void checkVectorEqual(int count, const double* basevec, double* vec, double max_relative_error) { + for (size_t i = 0; i < count; ++i) + { + double abs_diff = fabs(basevec[i] - vec[i]); + double rel_diff = fabs(basevec[i] - vec[i]) / fabs(basevec[i]); + // if (abs_diff > 1e-12 && rel_diff > max_relative_error && !std::isinf(rel_diff)) + if (abs_diff > 1e-15 && rel_diff > max_relative_error) + fprintf(stderr, "mismatch index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff); + } +} + +enum boundaryConditions{ + zeroGradient, + fixedValue, + coupled, + empty +}; + +void constructBoundarySelector(std::vector& patchTypeSelector, const std::string& patchTypeStr, const int patchSize); + +struct dfMatrixDataBase +{ + // - cuda resource + cudaStream_t stream; + + // - number of cell size + int num_cells; + // - number of face size + int num_surfaces; + // - number of offdiagnal entry size (2*num_surfaces) + int num_faces; + // - number of boundary cells + int num_boundary_cells; + // - number of boundary faces + int num_boundary_faces; + + int num_species; + + // - mesh variables + // - csr_row_index + int *h_A_csr_row_index=nullptr, *d_A_csr_row_index=nullptr; + // - csr_col_index + int 
*h_A_csr_col_index=nullptr, *d_A_csr_col_index=nullptr; + // - csr_diag_index + int *h_A_csr_diag_index=nullptr, *d_A_csr_diag_index=nullptr; + + // - the pre-permutated and post-permutated interpolation weight list + std::vector h_weight_vec_init, h_weight_vec; + // - the pre-permutated and post-permutated flux (phi) list + std::vector h_phi_vec_init, h_phi_vec; + // - the pre-permutated and post-permutated cell face vector list + std::vector h_face_vector_vec_init, h_face_vector_vec; + std::vector h_face_vec_init, h_face_vec; + std::vector h_deltaCoeffs_vec_init, h_deltaCoeffs_vec; + // - the host pointer to rho_new, rho_old, velocity_old, pressure and volume list + double *h_rho_new = nullptr, *h_rho_old = nullptr, *h_velocity_old = nullptr, + *h_pressure = nullptr; + const double *h_volume = nullptr; + // - the host pointer to the pre-permutated and post-permutated interpolation weight list + double *h_weight_init = nullptr, *h_weight = nullptr; + // - the host pointer to the pre-permutated and post-permutated flux (phi) list + double *h_phi_init = nullptr, *h_phi = nullptr; + // - the host pointer to the pre-permutated and post-permutated cell face vector list + double *h_face_vector_init = nullptr, *h_face_vector = nullptr; + double *h_face_init = nullptr, *h_face = nullptr; + double *h_deltaCoeffs_init = nullptr, *h_deltaCoeffs = nullptr; + // - the device pointer to rho_new, rho_old, velocity_old, pressure and volume list + double *d_rho_new = nullptr, *d_rho_old = nullptr, *d_velocity_old = nullptr, + *d_pressure = nullptr, *d_volume = nullptr; + // - the device pointer to Y(vector Yi) + //std::vector d_Y; + double *d_Y = nullptr; + // - the device pointer to the pre-permutated and post-permutated interpolation weight list + double *d_weight_init = nullptr, *d_weight = nullptr; + double *d_weight_upwind = nullptr; + // - the device pointer to the pre-permutated and post-permutated flux (phi) list + double *d_phi_init = nullptr, *d_phi = nullptr; + // - the 
device pointer to the pre-permutated and post-permutated cell face vector list + double *d_face_vector_init = nullptr, *d_face_vector = nullptr; + double *d_face_init = nullptr, *d_face = nullptr; + double *d_deltaCoeffs_init = nullptr, *d_deltaCoeffs = nullptr; + std::vector d_rhoD_vector; + + double *d_hDiffCorrFlux = nullptr; + double *d_diffAlphaD = nullptr; + double *d_rhoD = nullptr; + double *d_alpha = nullptr; + + double rdelta_t = 1/1e-6; + + /** + * @brief boundary related variables + */ + int *h_boundary_cell_offset = nullptr, *d_boundary_cell_offset=nullptr; + int *h_boundary_cell_id = nullptr, *d_boundary_cell_id = nullptr; + double *h_internal_coeffs = nullptr, *h_boundary_coeffs = nullptr, + *h_boundary_pressure = nullptr, *h_boundary_face_vector = nullptr, + *h_boundary_face = nullptr, *d_boundary_face = nullptr, + *h_boundary_deltaCoeffs = nullptr, *d_boundary_deltaCoeffs = nullptr, + *d_internal_coeffs = nullptr, *d_boundary_coeffs = nullptr, + *d_internal_coeffs_init = nullptr, *d_boundary_coeffs_init = nullptr, + *d_laplac_internal_coeffs = nullptr, *d_laplac_boundary_coeffs = nullptr, + *d_laplac_internal_coeffs_init = nullptr, *d_laplac_boundary_coeffs_init = nullptr, + *d_boundary_pressure = nullptr, *d_boundary_face_vector = nullptr, + *d_boundary_pressure_init = nullptr, + *d_boundary_phi = nullptr, *d_boundary_phi_init = nullptr, + *d_boundary_velocity = nullptr, *d_boundary_velocity_init = nullptr, + *d_boundary_nuEff = nullptr, *d_boundary_nuEff_init = nullptr, + *d_boundary_rho = nullptr, *d_boundary_rho_init = nullptr; + std::vector d_boundary_Y_vector; + std::vector d_boundary_Y_init_vector; + std::vector d_internal_coeffs_Y_vector; + std::vector d_boundary_coeffs_Y_vector; + std::vector d_laplac_internal_coeffs_Y_vector; + std::vector d_laplac_boundary_coeffs_Y_vector; + double *d_internal_coeffs_Y = nullptr; + double *d_boundary_coeffs_Y = nullptr; + double *d_laplac_internal_coeffs_Y = nullptr; + double *d_laplac_boundary_coeffs_Y 
= nullptr; + std::vector d_boundary_rhoD_vector; + double *d_boundary_mut_sct = nullptr; + double *d_boundary_rhoD = nullptr; + double *d_boundary_alpha = nullptr; + + double *d_boundary_hDiffCorrFlux = nullptr; + int *d_boundary_UpatchType = nullptr; + int *d_boundary_YpatchType = nullptr; + + std::vector boundPermutationList; + std::vector ueqn_internalCoeffs, ueqn_boundaryCoeffs; + std::vector boundary_face_vector; + std::vector boundary_pressure; + std::vector boundary_face; + std::vector boundary_deltaCoeffs; + std::vector> patch_type_init; + std::vector> patch_type; + + // - the device pointer to the permutated index list + std::vector permedIndex; + int *d_permedIndex=nullptr; + int *d_bouPermedIndex = nullptr; + + + // bytesize + // - bytes of diagnal entries + size_t cell_bytes; + // - bytes of diagnal entries (vector) + size_t cell_vec_bytes; + // - bytes of diagnal index + size_t cell_index_bytes; + // - bytes of diagnal index + size_t face_bytes; + size_t face_vec_bytes; + size_t face_index_bytes; + + size_t boundary_cell_bytes; + size_t boundary_cell_vec_bytes; + size_t boundary_cell_index_bytes; + + size_t boundary_face_bytes; + size_t boundary_face_vec_bytes; + size_t boundary_face_index_bytes; + + // A_csr has one more element in each row: itself + size_t csr_row_index_bytes; + size_t csr_col_index_bytes; + size_t csr_value_bytes; + size_t csr_value_vec_bytes; + + // extra matrix information + double *d_turbSrc_A = nullptr, *d_turbSrc_b = nullptr, *d_turbSrc_A_init = nullptr; + std::vector h_turbSrc_init_mtx_vec, h_turbSrc_init_1mtx; + std::vector h_turbSrc_init_src_vec, h_turbSrc_src_vec; + std::vector tmpPermutatedList; + int * d_tmpPermutatedList = nullptr; + + // double *h_A_csr = nullptr, *h_b = nullptr, *h_psi = nullptr; + // double *d_A_csr = nullptr, *d_b = nullptr, *d_psi = nullptr; + + int num_iteration; + + double time_monitor_CPU; + double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test; + + double* d_grad 
= nullptr; + double* d_grad_boundary = nullptr, *d_grad_boundary_init = nullptr; + double* d_nuEff = nullptr; + + // constructor + dfMatrixDataBase(); + dfMatrixDataBase(int num_surfaces, int num_cells, int num_boundary_faces, int num_species, int & num_boundary_cells_output, + const int *neighbour, const int *owner, const double* volume, const double* weight, const double* face_vector, const double* face, + const double* deltaCoeffs, std::vector boundary_face_vector_init, std::vector boundary_face_init, + std::vector boundary_deltaCoeffs_init, std::vector boundary_cell_id_init, std::vector> patch_type_init) + : num_cells(num_cells), num_faces(num_surfaces*2), num_surfaces(num_surfaces), num_species(num_species), num_iteration(0), + num_boundary_faces(num_boundary_faces), h_volume(volume), patch_type_init(patch_type_init) + { + // create cuda stream + checkCudaErrors(cudaStreamCreate(&stream)); + + // allocate field pointer in pin memory + cudaMallocHost(&h_phi_init, num_faces * sizeof(double)); + cudaMallocHost(&h_rho_old, num_cells * sizeof(double)); + + h_weight_vec_init.resize(num_faces); + h_weight_vec.resize(num_faces); + h_face_vector_vec_init.resize(num_faces*3); + h_face_vector_vec.resize(num_faces*3); + h_face_vec_init.resize(num_faces); + h_face_vec.resize(num_faces); + h_deltaCoeffs_vec_init.resize(num_faces); + h_deltaCoeffs_vec.resize(num_faces); + h_turbSrc_init_mtx_vec.resize(num_faces + num_cells); + h_turbSrc_init_1mtx.resize(num_faces + num_cells); + h_turbSrc_init_src_vec.resize(3*num_cells); + h_turbSrc_src_vec.resize(3*num_cells); + + // byte sizes + cell_bytes = num_cells * sizeof(double); + cell_vec_bytes = num_cells * 3 * sizeof(double); + cell_index_bytes = num_cells * sizeof(int); + + face_bytes = num_faces * sizeof(double); + face_vec_bytes = num_faces * 3 * sizeof(double); + face_index_bytes = num_faces * sizeof(int); + + // A_csr has one more element in each row: itself + csr_row_index_bytes = (num_cells + 1) * sizeof(int); + 
csr_col_index_bytes = (num_cells + num_faces) * sizeof(int); + csr_value_bytes = (num_cells + num_faces) * sizeof(double); + csr_value_vec_bytes = (num_cells + num_faces) * 3 * sizeof(double); + + /************************construct mesh variables****************************/ + /** + * 1. h_csr_row_index & h_csr_diag_index + */ + std::vector h_mtxEntry_perRow_vec(num_cells); + std::vector h_csr_diag_index_vec(num_cells); + std::vector h_csr_row_index_vec(num_cells + 1, 0); + + for (int faceI = 0; faceI < num_surfaces; faceI++) + { + h_csr_diag_index_vec[neighbour[faceI]]++; + h_mtxEntry_perRow_vec[neighbour[faceI]]++; + h_mtxEntry_perRow_vec[owner[faceI]]++; + } + + // - consider diagnal element in each row + std::transform(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_mtxEntry_perRow_vec.begin(), [](int n) + {return n + 1;}); + // - construct h_csr_row_index & h_csr_diag_index + std::partial_sum(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_csr_row_index_vec.begin()+1); + // - assign h_csr_row_index & h_csr_diag_index + h_A_csr_row_index = h_csr_row_index_vec.data(); + h_A_csr_diag_index = h_csr_diag_index_vec.data(); + + /** + * 2. 
h_csr_col_index + */ + std::vector rowIndex(num_faces + num_cells), colIndex(num_faces + num_cells), diagIndex(num_cells); + std::iota(diagIndex.begin(), diagIndex.end(), 0); + + // initialize the RowIndex (rowIndex of lower + upper + diagnal) + std::copy(neighbour, neighbour + num_surfaces, rowIndex.begin()); + std::copy(owner, owner + num_surfaces, rowIndex.begin() + num_surfaces); + std::copy(diagIndex.begin(), diagIndex.end(), rowIndex.begin() + num_faces); + // initialize the ColIndex (colIndex of lower + upper + diagnal) + std::copy(owner, owner + num_surfaces, colIndex.begin()); + std::copy(neighbour, neighbour + num_surfaces, colIndex.begin() + num_surfaces); + std::copy(diagIndex.begin(), diagIndex.end(), colIndex.begin() + num_faces); + + // - construct hashTable for sorting + std::multimap rowColPair; + for (int i = 0; i < 2*num_surfaces+num_cells; i++) + { + rowColPair.insert(std::make_pair(rowIndex[i], colIndex[i])); + } + // - sort + std::vector> globalPerm(rowColPair.begin(), rowColPair.end()); + std::sort(globalPerm.begin(), globalPerm.end(), [] + (const std::pair& pair1, const std::pair& pair2){ + if (pair1.first != pair2.first) { + return pair1.first < pair2.first; + } else { + return pair1.second < pair2.second; + } + }); + + std::vector h_csr_col_index_vec; + std::transform(globalPerm.begin(), globalPerm.end(), std::back_inserter(h_csr_col_index_vec), [] + (const std::pair& pair) { + return pair.second; + }); + h_A_csr_col_index = h_csr_col_index_vec.data(); + + // construct a tmp permutated List for add fvMatrix + std::vector tmp_permutation(2*num_surfaces + num_cells); + std::vector tmp_rowIndex(2*num_surfaces + num_cells); + std::iota(tmp_permutation.begin(), tmp_permutation.end(), 0); + std::copy(neighbour, neighbour + num_surfaces, tmp_rowIndex.begin()); + std::copy(diagIndex.begin(), diagIndex.end(), tmp_rowIndex.begin() + num_surfaces); + std::copy(owner, owner + num_surfaces, tmp_rowIndex.begin() + num_surfaces + num_cells); + 
std::multimap tmpPair; + for (int i = 0; i < 2*num_surfaces+num_cells; i++) + { + tmpPair.insert(std::make_pair(tmp_rowIndex[i], tmp_permutation[i])); + } + std::vector> tmpPerm(tmpPair.begin(), tmpPair.end()); + std::sort(tmpPerm.begin(), tmpPerm.end(), [] + (const std::pair& pair1, const std::pair& pair2){ + if (pair1.first != pair2.first) { + return pair1.first < pair2.first; + } else { + return pair1.second < pair2.second; + } + }); + std::transform(tmpPerm.begin(), tmpPerm.end(), std::back_inserter(tmpPermutatedList), [] + (const std::pair& pair) { + return pair.second; + }); + + /** + * 3. boundary imformations + */ + // get boundPermutation and offset lists + std::vector boundPermutationListInit(num_boundary_faces); + std::vector boundOffsetList; + std::iota(boundPermutationListInit.begin(), boundPermutationListInit.end(), 0); + + // - construct hashTable for sorting + std::multimap boundPermutation; + for (int i = 0; i < num_boundary_faces; i++) + { + boundPermutation.insert(std::make_pair(boundary_cell_id_init[i], boundPermutationListInit[i])); + } + + // - sort + std::vector> boundPermPair(boundPermutation.begin(), boundPermutation.end()); + std::sort(boundPermPair.begin(), boundPermPair.end(), [] + (const std::pair& pair1, const std::pair& pair2){ + if (pair1.first != pair2.first) { + return pair1.first < pair2.first; + } else { + return pair1.second < pair2.second; + } + }); + + // - construct boundPermedIndex and boundary_cell_id + std::vector boundary_cell_id; + boundPermutationList.clear(); + std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundary_cell_id), [] + (const std::pair& pair) { + return pair.first; + }); + std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundPermutationList), [] + (const std::pair& pair) { + return pair.second; + }); + + // construct boundary_cell_offset + std::map countMap; + std::vector boundaryCellcount; + for (const auto& cellIndex : boundary_cell_id) + ++ 
countMap[cellIndex]; + for (const auto& [cellIndex, count] : countMap) + boundaryCellcount.push_back(count); + + num_boundary_cells = boundaryCellcount.size(); + num_boundary_cells_output = num_boundary_cells; + + std::vector boundary_cell_offset(boundaryCellcount.size() + 1, 0); + std::partial_sum(boundaryCellcount.begin(), boundaryCellcount.end(), boundary_cell_offset.begin()+1); + + // assign h_boundary_cell_offset & h_boundary_cell_id + h_boundary_cell_offset = boundary_cell_offset.data(); + h_boundary_cell_id = boundary_cell_id.data(); + + // + boundary_cell_bytes = num_boundary_cells * sizeof(double); + boundary_cell_vec_bytes = num_boundary_cells * 3 * sizeof(double); + boundary_cell_index_bytes = num_boundary_cells * sizeof(int); + + boundary_face_bytes = num_boundary_faces * sizeof(double); + boundary_face_vec_bytes = num_boundary_faces * 3 * sizeof(double); + boundary_face_index_bytes = num_boundary_faces * sizeof(int); + + ueqn_internalCoeffs.resize(3*num_boundary_faces); + ueqn_boundaryCoeffs.resize(3*num_boundary_faces); + + boundary_face_vector.resize(3*num_boundary_faces); + boundary_pressure.resize(num_boundary_faces); + boundary_face.resize(num_boundary_faces); + boundary_deltaCoeffs.resize(num_boundary_faces); + + patch_type.resize(2); + patch_type[0].resize(num_boundary_faces); + patch_type[1].resize(num_boundary_faces); + + /** + * 4. 
permutation list for field variables + */ + std::vector offdiagRowIndex(2*num_surfaces), permIndex(2*num_surfaces); + // - initialize the offdiagRowIndex (rowIndex of lower + rowIndex of upper) + std::copy(neighbour, neighbour + num_surfaces, offdiagRowIndex.begin()); + std::copy(owner, owner + num_surfaces, offdiagRowIndex.begin() + num_surfaces); + + // - initialize the permIndex (0, 1, ..., 2*num_surfaces) + std::iota(permIndex.begin(), permIndex.end(), 0); + + // - construct hashTable for sorting + std::multimap permutation; + for (int i = 0; i < 2*num_surfaces; i++) + { + permutation.insert(std::make_pair(offdiagRowIndex[i], permIndex[i])); + } + // - sort + std::vector> permPair(permutation.begin(), permutation.end()); + std::sort(permPair.begin(), permPair.end(), [] + (const std::pair& pair1, const std::pair& pair2){ + if (pair1.first != pair2.first) { + return pair1.first < pair2.first; + } else { + return pair1.second < pair2.second; + } + }); + // - form permedIndex list + std::transform(permPair.begin(), permPair.end(), std::back_inserter(permedIndex), [] + (const std::pair& pair) { + return pair.second; + }); + + // copy and permutate cell variables + std::copy(weight, weight + num_surfaces, h_weight_vec_init.begin()); + std::copy(weight, weight + num_surfaces, h_weight_vec_init.begin() + num_surfaces); + std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin()); + std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin() + 3*num_surfaces); + std::copy(face, face + num_surfaces, h_face_vec_init.begin()); + std::copy(face, face + num_surfaces, h_face_vec_init.begin() + num_surfaces); + std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin()); + std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin() + num_surfaces); + for (int i = 0; i < num_faces; i++) + { + h_weight_vec[i] = h_weight_vec_init[permedIndex[i]]; + h_face_vec[i] = 
h_face_vec_init[permedIndex[i]]; + h_deltaCoeffs_vec[i] = h_deltaCoeffs_vec_init[permedIndex[i]]; + h_face_vector_vec[i*3] = h_face_vector_vec_init[3*permedIndex[i]]; + h_face_vector_vec[i*3+1] = h_face_vector_vec_init[3*permedIndex[i]+1]; + h_face_vector_vec[i*3+2] = h_face_vector_vec_init[3*permedIndex[i]+2]; + } + h_weight = h_weight_vec.data(); + h_face_vector = h_face_vector_vec.data(); + h_face = h_face_vec.data(); + h_deltaCoeffs = h_deltaCoeffs_vec.data(); + + for (int i = 0; i < num_boundary_faces; i++) + { + boundary_face_vector[3*i] = boundary_face_vector_init[3*boundPermutationList[i]]; + boundary_face_vector[3*i+1] = boundary_face_vector_init[3*boundPermutationList[i]+1]; + boundary_face_vector[3*i+2] = boundary_face_vector_init[3*boundPermutationList[i]+2]; + boundary_face[i] = boundary_face_init[boundPermutationList[i]]; + boundary_deltaCoeffs[i] = boundary_deltaCoeffs_init[boundPermutationList[i]]; + patch_type[0][i] = patch_type_init[0][boundPermutationList[i]]; + patch_type[1][i] = patch_type_init[1][boundPermutationList[i]]; + } + h_boundary_face_vector = boundary_face_vector.data(); + h_boundary_face = boundary_face.data(); + h_boundary_deltaCoeffs = boundary_deltaCoeffs.data(); + + /************************allocate memory on device****************************/ + int total_bytes = 0; + + checkCudaErrors(cudaMalloc((void**)&d_A_csr_row_index, csr_row_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_A_csr_col_index, csr_col_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_A_csr_diag_index, cell_index_bytes)); + total_bytes += (csr_row_index_bytes + csr_col_index_bytes + cell_index_bytes); + + //d_Y.resize(num_species); + d_rhoD_vector.resize(num_species); + d_boundary_Y_vector.resize(num_species); + d_boundary_Y_init_vector.resize(num_species); + d_internal_coeffs_Y_vector.resize(num_species); + d_boundary_coeffs_Y_vector.resize(num_species); + d_laplac_internal_coeffs_Y_vector.resize(num_species); + 
d_laplac_boundary_coeffs_Y_vector.resize(num_species); + d_boundary_rhoD_vector.resize(num_species); + + for (size_t i = 0; i < num_species; ++i){ + //checkCudaErrors(cudaMalloc((void**)&d_Y[i], cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_rhoD_vector[i], cell_bytes)); + } + checkCudaErrors(cudaMalloc((void**)&d_Y, cell_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_rho_new, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_volume, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_pressure, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_velocity_old, cell_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_weight, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_weight_upwind, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_face, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_deltaCoeffs, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_phi, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_phi_init, face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_face_vector, face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_nuEff, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_rhoD, cell_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_alpha, cell_bytes)); + total_bytes += (cell_bytes * (5 + 2*num_species) + face_bytes * 6 + cell_vec_bytes + face_vec_bytes); + + checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int))); + checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_id, boundary_face_index_bytes)); + total_bytes += (boundary_face_index_bytes + (num_boundary_cells+1) * sizeof(int)); + + checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure_init, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_phi_init, boundary_face_bytes)); + 
checkCudaErrors(cudaMalloc((void**)&d_boundary_phi, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity_init, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_face_vector, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_face, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_deltaCoeffs, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_init, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_init, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_init, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_init, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs, boundary_face_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff_init, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_rho, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_init, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_mut_sct, boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_UpatchType, boundary_face_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_YpatchType, boundary_face_index_bytes)); + for (size_t i = 0; i < num_species; ++i){ + checkCudaErrors(cudaMalloc((void**)&d_boundary_Y_vector[i], boundary_face_bytes)); + 
checkCudaErrors(cudaMalloc((void**)&d_boundary_Y_init_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_Y_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_Y_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_Y_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_Y_vector[i], boundary_face_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_rhoD_vector[i], boundary_face_bytes)); + } + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_Y, boundary_face_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_Y, boundary_face_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_Y, boundary_face_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_Y, boundary_face_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_rhoD, boundary_face_bytes * num_species)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_alpha, boundary_face_bytes)); + + total_bytes += (boundary_face_bytes*10 + boundary_face_vec_bytes * 11); + + // checkCudaErrors(cudaMalloc((void**)&d_A_csr, csr_value_vec_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_b, cell_vec_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_psi, cell_vec_bytes)); + total_bytes += (boundary_face_bytes + boundary_face_vec_bytes * 3); + + checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A, csr_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A_init, csr_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_turbSrc_b, cell_vec_bytes)); + total_bytes += (2*csr_value_bytes + cell_vec_bytes); + + checkCudaErrors(cudaMalloc((void**)&d_permedIndex, face_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_bouPermedIndex, boundary_face_index_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_tmpPermutatedList, csr_col_index_bytes)); + 
total_bytes += (face_index_bytes + boundary_face_index_bytes + csr_col_index_bytes); + + checkCudaErrors(cudaMalloc((void**)&d_grad, num_cells * 9 * sizeof(double))); + checkCudaErrors(cudaMalloc((void**)&d_grad_boundary, boundary_face_bytes * 9)); + checkCudaErrors(cudaMalloc((void**)&d_grad_boundary_init, boundary_cell_bytes * 9)); + total_bytes += (num_cells * 9 * sizeof(double) + boundary_face_bytes * 9 + boundary_cell_bytes * 9); // FIXME: rename + + checkCudaErrors(cudaMalloc((void**)&d_hDiffCorrFlux, 3 * cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_diffAlphaD, cell_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_hDiffCorrFlux, 3 * boundary_face_bytes)); + + fprintf(stderr, "Total bytes malloc on GPU: %.2fMB\n", total_bytes * 1.0 / 1024 / 1024); + + checkCudaErrors(cudaMemcpyAsync(d_A_csr_row_index, h_A_csr_row_index, csr_row_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_A_csr_col_index, h_A_csr_col_index, csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_A_csr_diag_index, h_A_csr_diag_index, cell_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_offset, h_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int), cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_id, h_boundary_cell_id, boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + + checkCudaErrors(cudaMemcpyAsync(d_volume, h_volume, cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_weight, h_weight, face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_face, h_face, face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_deltaCoeffs, h_deltaCoeffs, face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_face_vector, h_face_vector, face_vec_bytes, cudaMemcpyHostToDevice, stream)); + 
checkCudaErrors(cudaMemcpyAsync(d_boundary_face_vector, h_boundary_face_vector, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_face, h_boundary_face, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_deltaCoeffs, h_boundary_deltaCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_UpatchType, patch_type[0].data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_YpatchType, patch_type[1].data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + + checkCudaErrors(cudaMemcpyAsync(d_permedIndex, permedIndex.data(), face_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_bouPermedIndex, boundPermutationList.data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_tmpPermutatedList, tmpPermutatedList.data(), csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); + }; + + ~dfMatrixDataBase(){ + std::cout << "Destructor called." 
<< std::endl; + // TODO: free pointers + + }; +}; + diff --git a/src_gpu_orig/dfMatrixDataBase.cu b/src_gpu_orig/dfMatrixDataBase.cu new file mode 100644 index 000000000..d4f5a7ab0 --- /dev/null +++ b/src_gpu_orig/dfMatrixDataBase.cu @@ -0,0 +1,48 @@ +#include "dfMatrixDataBase.H" + + +void constructBoundarySelector(std::vector& patchTypeSelector, const std::string& patchTypeStr, + const int patchSize) +{ + boundaryConditions patchCondition; + std::vector tmpSelector; + static std::map BCMap = { + {"zeroGradient", zeroGradient}, + {"fixedValue", fixedValue}, + {"empty", empty}, + {"coupled", coupled} + }; + auto iter = BCMap.find(patchTypeStr); + if (iter != BCMap.end()) { + patchCondition = iter->second; + } else { + throw std::runtime_error("Unknown boundary condition: " + patchTypeStr); + } + // zeroGradient labeled as 0, fixedValue labeled as 1, coupled labeled as 2 + switch (patchCondition){ + case zeroGradient: + { + tmpSelector.resize(patchSize, 0); + patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); + break; + } + case fixedValue: + { + tmpSelector.resize(patchSize, 1); + patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); + break; + } + case empty: + { + tmpSelector.resize(patchSize, 2); + patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); + break; + } + case coupled: + { + tmpSelector.resize(patchSize, 3); + patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end()); + break; + } + } +} diff --git a/src_gpu/dfRhoEqn.H b/src_gpu_orig/dfRhoEqn.H similarity index 100% rename from src_gpu/dfRhoEqn.H rename to src_gpu_orig/dfRhoEqn.H diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu_orig/dfRhoEqn.cu similarity index 100% rename from src_gpu/dfRhoEqn.cu rename to src_gpu_orig/dfRhoEqn.cu diff --git a/src_gpu/dfUEqn.H b/src_gpu_orig/dfUEqn.H similarity index 100% rename from src_gpu/dfUEqn.H rename to 
src_gpu_orig/dfUEqn.H diff --git a/src_gpu/dfUEqn.cu b/src_gpu_orig/dfUEqn.cu similarity index 100% rename from src_gpu/dfUEqn.cu rename to src_gpu_orig/dfUEqn.cu diff --git a/src_gpu/dfYEqn.H b/src_gpu_orig/dfYEqn.H similarity index 100% rename from src_gpu/dfYEqn.H rename to src_gpu_orig/dfYEqn.H diff --git a/src_gpu/dfYEqn.cu b/src_gpu_orig/dfYEqn.cu similarity index 100% rename from src_gpu/dfYEqn.cu rename to src_gpu_orig/dfYEqn.cu From a59a190d2b7fc92d65874d054f20cb3bf410bd7e Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Thu, 3 Aug 2023 00:32:54 +0800 Subject: [PATCH 02/25] add unittest of fvm::div(phi, U) --- GPUTest/GPUTestBase.H | 141 ++++++++++++++++++++++++++++++++++++ GPUTest/createGPUSolver.H | 60 +++++++-------- GPUTest/unittest.C | 9 ++- src_gpu/CMakeLists.txt | 5 +- src_gpu/dfMatrixDataBase.H | 5 +- src_gpu/dfMatrixDataBase.cu | 45 +++++++++++- src_gpu/dfMatrixOpBase.H | 30 ++++++++ src_gpu/dfMatrixOpBase.cu | 140 +++++++++++++++++++++++++++++++++++ 8 files changed, 395 insertions(+), 40 deletions(-) create mode 100644 GPUTest/GPUTestBase.H create mode 100644 src_gpu/dfMatrixOpBase.H create mode 100644 src_gpu/dfMatrixOpBase.cu diff --git a/GPUTest/GPUTestBase.H b/GPUTest/GPUTestBase.H new file mode 100644 index 000000000..e1ffb0bd5 --- /dev/null +++ b/GPUTest/GPUTestBase.H @@ -0,0 +1,141 @@ + +enum initType{ + original, + randomInit +}; + +// unittest of fvm::div(phi, U) +void test_fvm_div_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfaceScalarField& phi, volVectorField& U, initType type) { + int offset = 0; + + // deal with init type + if (type == initType::randomInit) { + // random init phi to (-0.5, 0.5) + // internal + double *phi_internal_ptr = &phi[0]; + std::vector init_phi_internal; + init_phi_internal.resize(dfDataBase.num_surfaces); + for (int i = 0; i < dfDataBase.num_surfaces; i++) { + init_phi_internal[i] = (rand() % 10000 - 5000) / 10000.0; + } + memcpy(phi_internal_ptr, init_phi_internal.data(), 
dfDataBase.surface_value_bytes); + // boundary + offset = 0; + forAll(U.boundaryField(), patchi) + { + fvsPatchScalarField& patchPhi = phi.boundaryFieldRef()[patchi]; + int patchsize = patchPhi.size(); + double *phi_boundary_ptr = &patchPhi[0]; + std::vector init_phi_boundary; + init_phi_boundary.resize(patchsize); + for (int i = 0; i < patchsize; i++) { + init_phi_boundary[i] = (rand() % 10000 - 5000) / 10000.0; + } + memcpy(phi_boundary_ptr, init_phi_boundary.data(), patchsize * sizeof(double)); + offset += patchsize; + } + // TODO: random init weight to (0, 1) + // failed, weight is const. + } + + // run CPU + fvVectorMatrix df_U = fvm::div(phi, U); + + // run GPU + // run GPU - preProcess + // prepare phi + memcpy(dfDataBase.h_phi, &phi[0], dfDataBase.surface_value_bytes); + offset = 0; + forAll(U.boundaryField(), patchi) + { + const fvsPatchScalarField& patchPhi = phi.boundaryField()[patchi]; + int patchsize = patchPhi.size(); + memcpy(dfDataBase.h_boundary_phi + offset, &patchPhi[0], patchsize * sizeof(double)); + offset += patchsize; + } + checkCudaErrors(cudaMemcpyAsync(dfDataBase.d_phi, dfDataBase.h_phi, dfDataBase.surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(dfDataBase.d_boundary_phi, dfDataBase.h_boundary_phi, dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + // prepare patch_type + std::vector patch_type_U; + patch_type_U.resize(dfDataBase.num_patches); + forAll(U.boundaryField(), patchi) + { + constructBoundarySelectorPerPatch(&(patch_type_U[patchi]), U.boundaryField()[patchi].type()); + } + // prepare boundary coeffs + // TODO: updating boundary coeffs should be complemented later + double *d_value_internal_coeffs_U = nullptr; + double *d_value_boundary_coeffs_U = nullptr; + double *d_gradient_internal_coeffs_U = nullptr; + double *d_gradient_boundary_coeffs_U = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_value_internal_coeffs_U, 
dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_value_boundary_coeffs_U, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_gradient_internal_coeffs_U, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_gradient_boundary_coeffs_U, dfDataBase.boundary_surface_value_vec_bytes)); + update_boundary_coeffs_vector(dfDataBase.stream, dfDataBase.num_patches, + dfDataBase.patch_size.data(), patch_type_U.data(), + d_value_internal_coeffs_U, d_value_boundary_coeffs_U, + d_gradient_internal_coeffs_U, d_gradient_boundary_coeffs_U); + // prepare ldu + double *d_lower = nullptr; + double *d_upper = nullptr; + double *d_diag = nullptr; + double *d_internal_coeffs = nullptr; + double *d_boundary_coeffs = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_lower, dfDataBase.surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_upper, dfDataBase.surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_diag, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + // run GPU - Process + fvm_div_scalar(dfDataBase.stream, dfDataBase.num_surfaces, dfDataBase.d_owner, dfDataBase.d_neighbor, + dfDataBase.d_phi, dfDataBase.d_weight, + d_lower, d_upper, d_diag, // end for internal + dfDataBase.num_patches, dfDataBase.patch_size.data(), patch_type_U.data(), + dfDataBase.d_boundary_phi, d_value_internal_coeffs_U, d_value_boundary_coeffs_U, + d_internal_coeffs, d_boundary_coeffs); + // run GPU - postProcess + std::vector h_lower; + h_lower.resize(dfDataBase.num_surfaces); + std::vector h_upper; + h_upper.resize(dfDataBase.num_surfaces); + std::vector h_diag; + h_diag.resize(dfDataBase.num_cells); + std::vector h_internal_coeffs; + h_internal_coeffs.resize(dfDataBase.num_boundary_surfaces * 3); 
+ std::vector h_boundary_coeffs; + h_boundary_coeffs.resize(dfDataBase.num_boundary_surfaces * 3); + checkCudaErrors(cudaMemcpy(h_lower.data(), d_lower, dfDataBase.surface_value_bytes, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(h_upper.data(), d_upper, dfDataBase.surface_value_bytes, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(h_diag.data(), d_diag, dfDataBase.cell_value_bytes, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(h_internal_coeffs.data(), d_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(h_boundary_coeffs.data(), d_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaFree(d_lower)); + checkCudaErrors(cudaFree(d_upper)); + checkCudaErrors(cudaFree(d_diag)); + checkCudaErrors(cudaFree(d_internal_coeffs)); + checkCudaErrors(cudaFree(d_boundary_coeffs)); + checkCudaErrors(cudaFree(d_value_internal_coeffs_U)); + checkCudaErrors(cudaFree(d_value_boundary_coeffs_U)); + checkCudaErrors(cudaFree(d_gradient_internal_coeffs_U)); + checkCudaErrors(cudaFree(d_gradient_boundary_coeffs_U)); + + // compare CPU and GPU results + checkVectorEqual(dfDataBase.num_surfaces, &df_U.lower()[0], h_lower.data(), 1e-14); + checkVectorEqual(dfDataBase.num_surfaces, &df_U.upper()[0], h_upper.data(), 1e-14); + checkVectorEqual(dfDataBase.num_cells, &df_U.diag()[0], h_diag.data(), 1e-14); + std::vector cpu_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); + std::vector cpu_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); + offset = 0; + forAll(U.boundaryField(), patchi) + { + int patchSize = U.boundaryField()[patchi].size(); + const double* internal_coeff_ptr = &df_U.internalCoeffs()[patchi][0][0]; + const double* boundary_coeff_ptr = &df_U.boundaryCoeffs()[patchi][0][0]; + memcpy(cpu_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchSize * 3 * sizeof(double)); + memcpy(cpu_boundary_coeffs.data() + offset * 
3, boundary_coeff_ptr, patchSize * 3 * sizeof(double)); + offset += patchSize; + } + checkVectorEqual(dfDataBase.num_boundary_surfaces * 3, cpu_internal_coeffs.data(), h_internal_coeffs.data(), 1e-14); + checkVectorEqual(dfDataBase.num_boundary_surfaces * 3, cpu_boundary_coeffs.data(), h_boundary_coeffs.data(), 1e-14); +} diff --git a/GPUTest/createGPUSolver.H b/GPUTest/createGPUSolver.H index 9a6c289ab..0f1e20eca 100644 --- a/GPUTest/createGPUSolver.H +++ b/GPUTest/createGPUSolver.H @@ -1,32 +1,29 @@ + dfMatrixDataBase dfDataBase; void createGPUBase(fvMesh& mesh, PtrList& Y) { - // obtain variables from fvMesh + // prepare constant values: num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, num_species, rdelta_t const labelUList& owner = mesh.owner(); const labelUList& neighbour = mesh.neighbour(); int num_cells = mesh.nCells(); int num_surfaces = neighbour.size(); - - - // prepare num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_sizes, num_species, rdelta_t - // - obtain boundary size info from mesh - int patchSize = 0, num_patches = 0, num_boundary_surfaces = 0; - std::vector patch_sizes; + int num_boundary_surfaces = 0; + int num_patches = 0; + std::vector patch_size; forAll(mesh.boundary(), patchi) { labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); - patchSize = sub_boundary.size(); - - patch_sizes.push_back(patchSize); - num_boundary_surfaces += patchSize; - num_patches ++; + int patchsize = sub_boundary.size(); + patch_size.push_back(patchsize); + num_boundary_surfaces += patchsize; + num_patches++; } - dfDataBase.setConstantValues(num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_sizes, Y.size(), 1e-6); // TODO: get deltaT fomr time API + // TODO: get deltaT fomr time API + dfDataBase.setConstantValues(num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, Y.size(), 1e-6); - // prepare owner, neighbor + // prepare constant indexes: owner, neighbor 
dfDataBase.setConstantIndexes(&owner[0], &neighbour[0]); // prepare internal and boundary of sf, mag_sf, weights, delta_coeffs, volume - // - obtain boundary field info from mesh double *boundary_sf = new double[3 * num_boundary_surfaces]; double *boundary_mag_sf = new double[num_boundary_surfaces]; double *boundary_delta_coeffs = new double[num_boundary_surfaces]; @@ -36,12 +33,12 @@ void createGPUBase(fvMesh& mesh, PtrList& Y) { const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; - patchSize = pMagSf.size(); + int patchsize = pMagSf.size(); - memcpy(boundary_sf + 3*offset, &pSf[0][0], 3*patchSize*sizeof(double)); - memcpy(boundary_mag_sf + offset, &pMagSf[0], patchSize*sizeof(double)); - memcpy(boundary_delta_coeffs + offset, &pDeltaCoeffs[0], patchSize*sizeof(double)); - offset += patchSize; + memcpy(boundary_sf + 3*offset, &pSf[0][0], 3*patchsize*sizeof(double)); + memcpy(boundary_mag_sf + offset, &pMagSf[0], patchsize*sizeof(double)); + memcpy(boundary_delta_coeffs + offset, &pDeltaCoeffs[0], patchsize*sizeof(double)); + offset += patchsize; } dfDataBase.createConstantFieldsInternal(); @@ -49,23 +46,20 @@ void createGPUBase(fvMesh& mesh, PtrList& Y) { dfDataBase.initConstantFieldsInternal(&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.surfaceInterpolation::weights()[0], &mesh.nonOrthDeltaCoeffs()[0], &mesh.V()[0]); dfDataBase.initConstantFieldsBoundary(boundary_sf, boundary_mag_sf, boundary_delta_coeffs); - // prepare internal and boundary of xxx - // - obtain init_Y - double *h_Y = new double[Y.size() * num_cells]; - double *boundary_Y = new double[Y.size() * num_boundary_surfaces]; + // prepare internal and boundary of Y + dfDataBase.createNonConstantFieldsInternal(); + dfDataBase.createNonConstantFieldsBoundary(); forAll(Y, speciesI) { volScalarField& Yi = Y[speciesI]; - memcpy(h_Y + speciesI * num_cells, &Yi[0], num_cells * sizeof(double)); + memcpy(dfDataBase.h_y 
+ speciesI * num_cells, &Yi[0], num_cells * sizeof(double)); offset = 0; forAll(Yi.boundaryField(), patchi) { const scalarField& patchYi = Yi.boundaryField()[patchi]; - patchSize = patchYi.size(); - memcpy(boundary_Y + speciesI * num_boundary_surfaces + offset, &patchYi[0], patchSize*sizeof(double)); - offset += patchSize; + int patchsize = patchYi.size(); + memcpy(dfDataBase.h_boundary_y + speciesI * num_boundary_surfaces + offset, &patchYi[0], patchsize*sizeof(double)); + offset += patchsize; } } - dfDataBase.createNonConstantFieldsInternal(); - dfDataBase.createNonConstantFieldsBoundary(); - dfDataBase.initNonConstantFieldsInternal(h_Y); - dfDataBase.initNonConstantFieldsBoundary(boundary_Y); -}; \ No newline at end of file + dfDataBase.initNonConstantFieldsInternal(dfDataBase.h_y); + dfDataBase.initNonConstantFieldsBoundary(dfDataBase.h_boundary_y); +}; diff --git a/GPUTest/unittest.C b/GPUTest/unittest.C index 2e3d55ce5..b57a8efd6 100644 --- a/GPUTest/unittest.C +++ b/GPUTest/unittest.C @@ -45,11 +45,14 @@ Description #include "basicThermo.H" #include "CombustionModel.H" -#include "dfMatrixDataBase.H" #include #include #include "upwind.H" + +#include "dfMatrixDataBase.H" +#include "dfMatrixOpBase.H" #include "createGPUSolver.H" +#include "GPUTestBase.H" int main(int argc, char *argv[]) { @@ -93,6 +96,10 @@ int main(int argc, char *argv[]) } createGPUBase(mesh, Y); + + // unittest of fvm::div(phi, U) + test_fvm_div_scalar(dfDataBase, mesh, phi, U, initType::original); + test_fvm_div_scalar(dfDataBase, mesh, phi, U, initType::randomInit); } return 0; } diff --git a/src_gpu/CMakeLists.txt b/src_gpu/CMakeLists.txt index 015a1d11b..d82c86df5 100644 --- a/src_gpu/CMakeLists.txt +++ b/src_gpu/CMakeLists.txt @@ -21,8 +21,9 @@ include_directories( ) add_library(${PROJECT_NAME} - SHARED - dfMatrixDataBase.cu) + SHARED + dfMatrixDataBase.cu + dfMatrixOpBase.cu) target_link_libraries(${PROJECT_NAME} ${MPI_LIBRARIES} diff --git a/src_gpu/dfMatrixDataBase.H 
b/src_gpu/dfMatrixDataBase.H index c2e1446ec..efcb78190 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -48,6 +48,7 @@ enum boundaryConditions{ empty }; +void constructBoundarySelectorPerPatch(int *patchTypeSelector, const std::string& patchTypeStr); void constructBoundarySelector(std::vector& patchTypeSelector, const std::string& patchTypeStr, const int patchSize); struct dfMatrixDataBase @@ -66,7 +67,7 @@ struct dfMatrixDataBase int num_boundary_surfaces = 0; int num_patches = 0; int num_species = 0; - std::vector patch_sizes; + std::vector patch_size; double rdelta_t = 0; // constant values -- ldu bytesize @@ -169,7 +170,7 @@ struct dfMatrixDataBase // member function void setConstantValues(int num_cells, int num_surfaces, int num_boundary_surfaces, - int num_patches, std::vector patch_sizes, + int num_patches, std::vector patch_size, int num_species, double rdelta_t); void setConstantIndexes(const int *owner, const int *neighbor); diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index 4ecbc25c8..4bcbe88a4 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -1,5 +1,46 @@ #include "dfMatrixDataBase.H" +void constructBoundarySelectorPerPatch(int *patchTypeSelector, const std::string& patchTypeStr) +{ + boundaryConditions patchCondition; + std::vector tmpSelector; + static std::map BCMap = { + {"zeroGradient", zeroGradient}, + {"fixedValue", fixedValue}, + {"empty", empty}, + {"coupled", coupled} + }; + auto iter = BCMap.find(patchTypeStr); + if (iter != BCMap.end()) { + patchCondition = iter->second; + } else { + throw std::runtime_error("Unknown boundary condition: " + patchTypeStr); + } + // zeroGradient labeled as 0, fixedValue labeled as 1, coupled labeled as 2 + switch (patchCondition){ + case zeroGradient: + { + *patchTypeSelector = 0; + break; + } + case fixedValue: + { + *patchTypeSelector = 1; + break; + } + case empty: + { + *patchTypeSelector = 2; + break; + } + case coupled: + 
{ + *patchTypeSelector = 3; + break; + } + } +} + dfMatrixDataBase::dfMatrixDataBase() { checkCudaErrors(cudaStreamCreate(&stream)); } @@ -15,14 +56,14 @@ dfMatrixDataBase::~dfMatrixDataBase() { } void dfMatrixDataBase::setConstantValues(int num_cells, int num_surfaces, int num_boundary_surfaces, - int num_patches, std::vector patch_sizes, + int num_patches, std::vector patch_size, int num_species, double rdelta_t) { // constant values -- basic this->num_cells = num_cells; this->num_surfaces = num_surfaces; this->num_boundary_surfaces = num_boundary_surfaces; this->num_patches = num_patches; - this->patch_sizes = patch_sizes; + this->patch_size = patch_size; this->num_species = num_species; this->rdelta_t = rdelta_t; diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H new file mode 100644 index 000000000..47692e239 --- /dev/null +++ b/src_gpu/dfMatrixOpBase.H @@ -0,0 +1,30 @@ +#pragma once + +void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, + const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, + const double *lower, const double *upper, const double *diag, const double *source, + const double *internal_coeffs, const double *boundary_coeffs, + double *A, double *b); + +void update_boundary_coeffs_vector(cudaStream_t stream, int num_patches, + const int *patch_size, const int *patch_type, + double *value_internal_coeffs, double *value_boundary_coeffs, + double *gradient_internal_coeffs, double *gradient_boundary_coeffs); + +// void fvm_ddt(); + +void fvm_div_scalar(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, + const double *phi, const double *weight, + double *lower, double *upper, double *diag, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, + double *internal_coeffs, double *boundary_coeffs); + +// void 
fvm_laplacian(); +// +// void fvc_ddt(); +// +// void fvc_grad_surface(); +// +// void fvc_div_cell(); + diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu new file mode 100644 index 000000000..6c533e05e --- /dev/null +++ b/src_gpu/dfMatrixOpBase.cu @@ -0,0 +1,140 @@ +#include "dfMatrixOpBase.H" +#include "dfMatrixDataBase.H" + +#include +#include "cuda_profiler_api.h" + +__global__ void update_boundary_coeffs_zeroGradient_vector(int num, int offset, + double *value_internal_coeffs, double *value_boundary_coeffs, + double *gradient_internal_coeffs, double *gradient_boundary_coeffs) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num) + return; + + int start_index = offset + index; + // valueInternalCoeffs = 1 + // valueBoundaryCoeffs = 0 + // gradientInternalCoeffs = 0 + // gradientBoundaryCoeffs = 0 + value_internal_coeffs[start_index * 3 + 0] = 1; + value_internal_coeffs[start_index * 3 + 1] = 1; + value_internal_coeffs[start_index * 3 + 2] = 1; + value_boundary_coeffs[start_index * 3 + 0] = 0; + value_boundary_coeffs[start_index * 3 + 1] = 0; + value_boundary_coeffs[start_index * 3 + 2] = 0; + gradient_internal_coeffs[start_index * 3 + 0] = 0; + gradient_internal_coeffs[start_index * 3 + 1] = 0; + gradient_internal_coeffs[start_index * 3 + 2] = 0; + gradient_boundary_coeffs[start_index * 3 + 0] = 0; + gradient_boundary_coeffs[start_index * 3 + 1] = 0; + gradient_boundary_coeffs[start_index * 3 + 2] = 0; +} + +__global__ void fvm_div_scalar_internal(int num_surfaces, + const int *lower_index, const int *upper_index, + const double *phi, const double *weight, + double *lower, double *upper, double *diag) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_surfaces) + return; + + double w = weight[index]; + double f = phi[index]; + + lower[index] += (-w) * f; + upper[index] += (1 - w) * f; + + int l = lower_index[index]; + int u = upper_index[index]; + atomicAdd(&(diag[l]), w * f); + 
atomicAdd(&(diag[u]), (w - 1) * f); +} + +__global__ void fvm_div_scalar_boundary(int num, int offset, + const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, + double *internal_coeffs, double *boundary_coeffs) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num) + return; + + int start_index = offset + index; + double boundary_f = boundary_phi[start_index]; + internal_coeffs[start_index * 3 + 0] = boundary_f * value_internal_coeffs[start_index * 3 + 0]; + internal_coeffs[start_index * 3 + 1] = boundary_f * value_internal_coeffs[start_index * 3 + 1]; + internal_coeffs[start_index * 3 + 2] = boundary_f * value_internal_coeffs[start_index * 3 + 2]; + boundary_coeffs[start_index * 3 + 0] = boundary_f * value_boundary_coeffs[start_index * 3 + 0]; + boundary_coeffs[start_index * 3 + 1] = boundary_f * value_boundary_coeffs[start_index * 3 + 1]; + boundary_coeffs[start_index * 3 + 2] = boundary_f * value_boundary_coeffs[start_index * 3 + 2]; +} + +void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, + const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, + const double *lower, const double *upper, const double *diag, const double *source, + const double *internal_coeffs, const double *boundary_coeffs, + double *A, double *b) +{ + +} + +void update_boundary_coeffs_vector(cudaStream_t stream, int num_patches, + const int *patch_size, const int *patch_type, + double *value_internal_coeffs, double *value_boundary_coeffs, + double *gradient_internal_coeffs, double *gradient_boundary_coeffs) +{ + size_t threads_per_block = 1024; + size_t blocks_per_grid = 1; + + int offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 256; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just basic patch type now + // TODO: just vector version now + if (patch_type[i] == boundaryConditions::zeroGradient) { 
+ update_boundary_coeffs_zeroGradient_vector<<>>(patch_size[i], offset, + value_internal_coeffs, value_boundary_coeffs, gradient_internal_coeffs, gradient_boundary_coeffs); + } else if (patch_type[i] == boundaryConditions::fixedValue) { + // xxx + } else if (0) { + // xxx + } + offset += patch_size[i]; + } +} + +void fvm_div_scalar(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, + const double *phi, const double *weight, + double *lower, double *upper, double *diag, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, + double *internal_coeffs, double *boundary_coeffs) +{ + size_t threads_per_block = 1024; + size_t blocks_per_grid = 1; + + blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + fvm_div_scalar_internal<<>>(num_surfaces, + lowerAddr, upperAddr, + phi, weight, lower, upper, diag); + + int offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 256; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just basic patch type now + if (patch_type[i] == boundaryConditions::zeroGradient + || patch_type[i] == boundaryConditions::fixedValue) { + // TODO: just vector version now + fvm_div_scalar_boundary<<>>(patch_size[i], offset, + boundary_phi, value_internal_coeffs, value_boundary_coeffs, + internal_coeffs, boundary_coeffs); + } else if (0) { + // xxx + } + offset += patch_size[i]; + } +} + From 1b25756d75b6c9719b2396dc317de8da4dcc2411 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Thu, 3 Aug 2023 18:30:26 +0800 Subject: [PATCH 03/25] simplify unittest --- GPUTest/GPUTestBase.H | 294 ++++++++++++++++++++++-------------- src_gpu/dfMatrixDataBase.H | 20 ++- src_gpu/dfMatrixDataBase.cu | 74 +++++++-- 3 files changed, 261 insertions(+), 127 deletions(-) diff --git a/GPUTest/GPUTestBase.H 
b/GPUTest/GPUTestBase.H index e1ffb0bd5..bce191a9e 100644 --- a/GPUTest/GPUTestBase.H +++ b/GPUTest/GPUTestBase.H @@ -4,138 +4,204 @@ enum initType{ randomInit }; -// unittest of fvm::div(phi, U) -void test_fvm_div_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfaceScalarField& phi, volVectorField& U, initType type) { - int offset = 0; +struct testGPUDataBase { + // some fvm ops don't use d_source; + // some fvm ops don't use d_internal_coeffs and d_boundary_coeffs; + // all the fvc ops only use d_source + double *d_lower = nullptr; + double *d_upper = nullptr; + double *d_diag = nullptr; + double *d_source = nullptr; + double *d_internal_coeffs = nullptr; + double *d_boundary_coeffs = nullptr; - // deal with init type - if (type == initType::randomInit) { - // random init phi to (-0.5, 0.5) + double *d_value_internal_coeffs = nullptr; + double *d_value_boundary_coeffs = nullptr; + double *d_gradient_internal_coeffs = nullptr; + double *d_gradient_boundary_coeffs = nullptr; + + std::vector patch_type; + + // constructor + testGPUDataBase() {} + + // deconstructor + ~testGPUDataBase() { + if (d_lower) checkCudaErrors(cudaFree(d_lower)); + if (d_upper) checkCudaErrors(cudaFree(d_upper)); + if (d_diag) checkCudaErrors(cudaFree(d_diag)); + if (d_source) checkCudaErrors(cudaFree(d_source)); + if (d_internal_coeffs) checkCudaErrors(cudaFree(d_internal_coeffs)); + if (d_boundary_coeffs) checkCudaErrors(cudaFree(d_boundary_coeffs)); + + if (d_value_internal_coeffs) checkCudaErrors(cudaFree(d_value_internal_coeffs)); + if (d_value_boundary_coeffs) checkCudaErrors(cudaFree(d_value_boundary_coeffs)); + if (d_gradient_internal_coeffs) checkCudaErrors(cudaFree(d_gradient_internal_coeffs)); + if (d_gradient_boundary_coeffs) checkCudaErrors(cudaFree(d_gradient_boundary_coeffs)); + } +}; + +void randomInitSurfaceScalar(surfaceScalarField& field) { + // random init field value to (-0.5, 0.5) // internal - double *phi_internal_ptr = &phi[0]; - std::vector 
init_phi_internal; - init_phi_internal.resize(dfDataBase.num_surfaces); + double *field_internal_ptr = &field[0]; + std::vector init_field_internal; + init_field_internal.resize(dfDataBase.num_surfaces); for (int i = 0; i < dfDataBase.num_surfaces; i++) { - init_phi_internal[i] = (rand() % 10000 - 5000) / 10000.0; + init_field_internal[i] = (rand() % 10000 - 5000) / 10000.0; } - memcpy(phi_internal_ptr, init_phi_internal.data(), dfDataBase.surface_value_bytes); + memcpy(field_internal_ptr, init_field_internal.data(), dfDataBase.surface_value_bytes); // boundary - offset = 0; - forAll(U.boundaryField(), patchi) + int offset = 0; + forAll(field.boundaryField(), patchi) { - fvsPatchScalarField& patchPhi = phi.boundaryFieldRef()[patchi]; - int patchsize = patchPhi.size(); - double *phi_boundary_ptr = &patchPhi[0]; - std::vector init_phi_boundary; - init_phi_boundary.resize(patchsize); + fvsPatchScalarField& patchField = field.boundaryFieldRef()[patchi]; + int patchsize = patchField.size(); + double *field_boundary_ptr = &patchField[0]; + std::vector init_field_boundary; + init_field_boundary.resize(patchsize); for (int i = 0; i < patchsize; i++) { - init_phi_boundary[i] = (rand() % 10000 - 5000) / 10000.0; + init_field_boundary[i] = (rand() % 10000 - 5000) / 10000.0; } - memcpy(phi_boundary_ptr, init_phi_boundary.data(), patchsize * sizeof(double)); + memcpy(field_boundary_ptr, init_field_boundary.data(), patchsize * sizeof(double)); offset += patchsize; } - // TODO: random init weight to (0, 1) - // failed, weight is const. 
- } - - // run CPU - fvVectorMatrix df_U = fvm::div(phi, U); +} - // run GPU - // run GPU - preProcess - // prepare phi - memcpy(dfDataBase.h_phi, &phi[0], dfDataBase.surface_value_bytes); - offset = 0; - forAll(U.boundaryField(), patchi) +void uploadSurfaceScalar(dfMatrixDataBase& dfDataBase, const surfaceScalarField& field, const char* fieldAlias) { + double *h_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::internal); + double *h_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::boundary); + double *d_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::internal); + double *d_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::boundary); + // internal + memcpy(h_internal_field, &field[0], dfDataBase.surface_value_bytes); + // boundary + int offset = 0; + forAll(field.boundaryField(), patchi) { - const fvsPatchScalarField& patchPhi = phi.boundaryField()[patchi]; - int patchsize = patchPhi.size(); - memcpy(dfDataBase.h_boundary_phi + offset, &patchPhi[0], patchsize * sizeof(double)); + const fvsPatchScalarField& patchField = field.boundaryField()[patchi]; + int patchsize = patchField.size(); + memcpy(h_boundary_field + offset, &patchField[0], patchsize * sizeof(double)); offset += patchsize; } - checkCudaErrors(cudaMemcpyAsync(dfDataBase.d_phi, dfDataBase.h_phi, dfDataBase.surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); - checkCudaErrors(cudaMemcpyAsync(dfDataBase.d_boundary_phi, dfDataBase.h_boundary_phi, dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); - // prepare patch_type - std::vector patch_type_U; - patch_type_U.resize(dfDataBase.num_patches); - forAll(U.boundaryField(), patchi) + // transfer + checkCudaErrors(cudaMemcpyAsync(d_internal_field, h_internal_field, dfDataBase.surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + 
checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); +} + +void buildTestGPUDataBaseVector(const dfMatrixDataBase& dfDataBase, testGPUDataBase& testData, const volVectorField& field) { + // ldu + checkCudaErrors(cudaMalloc((void**)&testData.d_lower, dfDataBase.surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_upper, dfDataBase.surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_diag, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + // boundary coeffs + checkCudaErrors(cudaMalloc((void**)&testData.d_value_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_value_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + // patch type + testData.patch_type.resize(dfDataBase.num_patches); + forAll(field.boundaryField(), patchi) { - constructBoundarySelectorPerPatch(&(patch_type_U[patchi]), U.boundaryField()[patchi].type()); + constructBoundarySelectorPerPatch(&(testData.patch_type[patchi]), field.boundaryField()[patchi].type()); } +} + +void updateBoundaryCoeffsVector(const dfMatrixDataBase& dfDataBase, testGPUDataBase& testData) { + update_boundary_coeffs_vector(dfDataBase.stream, dfDataBase.num_patches, + dfDataBase.patch_size.data(), testData.patch_type.data(), + testData.d_value_internal_coeffs, testData.d_value_boundary_coeffs, + testData.d_gradient_internal_coeffs, testData.d_gradient_boundary_coeffs); +} + 
+void compareResultVector(const dfMatrixDataBase& dfDataBase, const testGPUDataBase& testData, fvVectorMatrix& dfMatrix, bool printFlag) { + if (testData.d_lower) { + std::vector h_lower; + h_lower.resize(dfDataBase.num_surfaces); + checkCudaErrors(cudaMemcpy(h_lower.data(), testData.d_lower, dfDataBase.surface_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_surfaces, &dfMatrix.lower()[0], h_lower.data(), 1e-14, printFlag); + } + if (testData.d_upper) { + std::vector h_upper; + h_upper.resize(dfDataBase.num_surfaces); + checkCudaErrors(cudaMemcpy(h_upper.data(), testData.d_upper, dfDataBase.surface_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_surfaces, &dfMatrix.upper()[0], h_upper.data(), 1e-14, printFlag); + } + if (testData.d_diag) { + std::vector h_diag; + h_diag.resize(dfDataBase.num_cells); + checkCudaErrors(cudaMemcpy(h_diag.data(), testData.d_diag, dfDataBase.cell_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells, &dfMatrix.diag()[0], h_diag.data(), 1e-14, printFlag); + } + if (testData.d_source) { + std::vector h_source; + h_source.resize(dfDataBase.num_cells * 3); + checkCudaErrors(cudaMemcpy(h_source.data(), testData.d_source, dfDataBase.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells, &dfMatrix.source()[0][0], h_source.data(), 1e-14, printFlag); + } + if (testData.d_internal_coeffs) { + std::vector h_internal_coeffs; + h_internal_coeffs.resize(dfDataBase.num_boundary_surfaces * 3); + checkCudaErrors(cudaMemcpy(h_internal_coeffs.data(), testData.d_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); + std::vector cpu_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); + int offset = 0; + for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) + { + int patchsize = dfDataBase.patch_size[patchi]; + const double* internal_coeff_ptr = &dfMatrix.internalCoeffs()[patchi][0][0]; + 
memcpy(cpu_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double)); + offset += patchsize; + } + checkVectorEqual(dfDataBase.num_boundary_surfaces * 3, cpu_internal_coeffs.data(), h_internal_coeffs.data(), 1e-14, printFlag); + } + if (testData.d_boundary_coeffs) { + std::vector h_boundary_coeffs; + h_boundary_coeffs.resize(dfDataBase.num_boundary_surfaces * 3); + checkCudaErrors(cudaMemcpy(h_boundary_coeffs.data(), testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); + std::vector cpu_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); + int offset = 0; + for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) + { + int patchsize = dfDataBase.patch_size[patchi]; + const double* boundary_coeff_ptr = &dfMatrix.boundaryCoeffs()[patchi][0][0]; + memcpy(cpu_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double)); + offset += patchsize; + } + checkVectorEqual(dfDataBase.num_boundary_surfaces * 3, cpu_boundary_coeffs.data(), h_boundary_coeffs.data(), 1e-14, printFlag); + } +} + +// unittest of fvm::div(phi, U) +void test_fvm_div_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfaceScalarField& phi, volVectorField& U, initType type) { + if (type == initType::randomInit) { + randomInitSurfaceScalar(phi); + // TODO: random init weight failed, weight is const. 
+ } + + // run CPU + fvVectorMatrix dfMatrix = fvm::div(phi, U); + + // prepare for run GPU + // prepare phi field + uploadSurfaceScalar(dfDataBase, phi, "phi"); + // prepare testData + testGPUDataBase testData; + buildTestGPUDataBaseVector(dfDataBase, testData, U); // prepare boundary coeffs // TODO: updating boundary coeffs should be complemented later - double *d_value_internal_coeffs_U = nullptr; - double *d_value_boundary_coeffs_U = nullptr; - double *d_gradient_internal_coeffs_U = nullptr; - double *d_gradient_boundary_coeffs_U = nullptr; - checkCudaErrors(cudaMalloc((void**)&d_value_internal_coeffs_U, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_value_boundary_coeffs_U, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_gradient_internal_coeffs_U, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_gradient_boundary_coeffs_U, dfDataBase.boundary_surface_value_vec_bytes)); - update_boundary_coeffs_vector(dfDataBase.stream, dfDataBase.num_patches, - dfDataBase.patch_size.data(), patch_type_U.data(), - d_value_internal_coeffs_U, d_value_boundary_coeffs_U, - d_gradient_internal_coeffs_U, d_gradient_boundary_coeffs_U); - // prepare ldu - double *d_lower = nullptr; - double *d_upper = nullptr; - double *d_diag = nullptr; - double *d_internal_coeffs = nullptr; - double *d_boundary_coeffs = nullptr; - checkCudaErrors(cudaMalloc((void**)&d_lower, dfDataBase.surface_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_upper, dfDataBase.surface_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_diag, dfDataBase.cell_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - // run GPU - Process + updateBoundaryCoeffsVector(dfDataBase, testData); + + // run GPU 
fvm_div_scalar(dfDataBase.stream, dfDataBase.num_surfaces, dfDataBase.d_owner, dfDataBase.d_neighbor, dfDataBase.d_phi, dfDataBase.d_weight, - d_lower, d_upper, d_diag, // end for internal - dfDataBase.num_patches, dfDataBase.patch_size.data(), patch_type_U.data(), - dfDataBase.d_boundary_phi, d_value_internal_coeffs_U, d_value_boundary_coeffs_U, - d_internal_coeffs, d_boundary_coeffs); - // run GPU - postProcess - std::vector h_lower; - h_lower.resize(dfDataBase.num_surfaces); - std::vector h_upper; - h_upper.resize(dfDataBase.num_surfaces); - std::vector h_diag; - h_diag.resize(dfDataBase.num_cells); - std::vector h_internal_coeffs; - h_internal_coeffs.resize(dfDataBase.num_boundary_surfaces * 3); - std::vector h_boundary_coeffs; - h_boundary_coeffs.resize(dfDataBase.num_boundary_surfaces * 3); - checkCudaErrors(cudaMemcpy(h_lower.data(), d_lower, dfDataBase.surface_value_bytes, cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(h_upper.data(), d_upper, dfDataBase.surface_value_bytes, cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(h_diag.data(), d_diag, dfDataBase.cell_value_bytes, cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(h_internal_coeffs.data(), d_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(h_boundary_coeffs.data(), d_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaFree(d_lower)); - checkCudaErrors(cudaFree(d_upper)); - checkCudaErrors(cudaFree(d_diag)); - checkCudaErrors(cudaFree(d_internal_coeffs)); - checkCudaErrors(cudaFree(d_boundary_coeffs)); - checkCudaErrors(cudaFree(d_value_internal_coeffs_U)); - checkCudaErrors(cudaFree(d_value_boundary_coeffs_U)); - checkCudaErrors(cudaFree(d_gradient_internal_coeffs_U)); - checkCudaErrors(cudaFree(d_gradient_boundary_coeffs_U)); - - // compare CPU and GPU results - checkVectorEqual(dfDataBase.num_surfaces, &df_U.lower()[0], h_lower.data(), 1e-14); - 
checkVectorEqual(dfDataBase.num_surfaces, &df_U.upper()[0], h_upper.data(), 1e-14); - checkVectorEqual(dfDataBase.num_cells, &df_U.diag()[0], h_diag.data(), 1e-14); - std::vector cpu_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); - std::vector cpu_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); - offset = 0; - forAll(U.boundaryField(), patchi) - { - int patchSize = U.boundaryField()[patchi].size(); - const double* internal_coeff_ptr = &df_U.internalCoeffs()[patchi][0][0]; - const double* boundary_coeff_ptr = &df_U.boundaryCoeffs()[patchi][0][0]; - memcpy(cpu_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchSize * 3 * sizeof(double)); - memcpy(cpu_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchSize * 3 * sizeof(double)); - offset += patchSize; - } - checkVectorEqual(dfDataBase.num_boundary_surfaces * 3, cpu_internal_coeffs.data(), h_internal_coeffs.data(), 1e-14); - checkVectorEqual(dfDataBase.num_boundary_surfaces * 3, cpu_boundary_coeffs.data(), h_boundary_coeffs.data(), 1e-14); + testData.d_lower, testData.d_upper, testData.d_diag, // end for internal + dfDataBase.num_patches, dfDataBase.patch_size.data(), testData.patch_type.data(), + dfDataBase.d_boundary_phi, testData.d_value_internal_coeffs, testData.d_value_boundary_coeffs, + testData.d_internal_coeffs, testData.d_boundary_coeffs); + + // compare result + bool printFlag = false; + compareResultVector(dfDataBase, testData, dfMatrix, printFlag); } diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index efcb78190..4d8a7d29d 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -12,7 +12,7 @@ #include #include #include - +#include static const char *_cudaGetErrorEnum(cudaError_t error) { return cudaGetErrorName(error); @@ -30,17 +30,29 @@ void check(T result, char const *const func, const char *const file, #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) -inline void checkVectorEqual(int count, const double* 
basevec, double* vec, double max_relative_error) { +inline void checkVectorEqual(int count, const double* basevec, double* vec, double max_relative_error, bool print = false) { for (size_t i = 0; i < count; ++i) { double abs_diff = fabs(basevec[i] - vec[i]); double rel_diff = fabs(basevec[i] - vec[i]) / fabs(basevec[i]); + if (print) + fprintf(stderr, "index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff); // if (abs_diff > 1e-12 && rel_diff > max_relative_error && !std::isinf(rel_diff)) if (abs_diff > 1e-15 && rel_diff > max_relative_error) fprintf(stderr, "mismatch index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff); } } +enum location { + cpu, + gpu +}; + +enum position { + internal, + boundary +}; + enum boundaryConditions{ zeroGradient, fixedValue, @@ -162,6 +174,8 @@ struct dfMatrixDataBase double *h_boundary_p= nullptr; double *h_boundary_phi= nullptr; + std::unordered_map fieldPointerMap; + // constructor dfMatrixDataBase(); @@ -186,5 +200,7 @@ struct dfMatrixDataBase void initNonConstantFieldsInternal(const double *y); void initNonConstantFieldsBoundary(const double *boundary_y); + // getter + double* getFieldPointer(const char* fieldAlias, location loc, position pos); }; diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index 4bcbe88a4..2ef707bbc 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -187,12 +187,20 @@ void dfMatrixDataBase::createConstantFieldsInternal() { checkCudaErrors(cudaMalloc((void**)&d_weight, surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_delta_coeffs, surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_volume, cell_value_bytes)); + fieldPointerMap["d_sf"] = d_sf; + fieldPointerMap["d_mag_sf"] = d_mag_sf; + fieldPointerMap["d_weight"] = d_weight; + fieldPointerMap["d_delta_coeffs"] = d_delta_coeffs; + fieldPointerMap["d_volume"] = d_volume; } void 
dfMatrixDataBase::createConstantFieldsBoundary() { checkCudaErrors(cudaMalloc((void**)&d_boundary_sf, boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_mag_sf, boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_delta_coeffs, boundary_surface_value_bytes)); + fieldPointerMap["d_boundary_sf"] = d_boundary_sf; + fieldPointerMap["d_boundary_mag_sf"] = d_boundary_mag_sf; + fieldPointerMap["d_boundary_delta_coeffs"] = d_boundary_delta_coeffs; } void dfMatrixDataBase::initConstantFieldsInternal(const double *sf, const double *mag_sf, @@ -217,24 +225,36 @@ void dfMatrixDataBase::createNonConstantFieldsInternal() { checkCudaErrors(cudaMalloc((void**)&d_y, cell_value_bytes * num_species)); checkCudaErrors(cudaMalloc((void**)&d_he, cell_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_p, cell_value_bytes)); + fieldPointerMap["d_rho"] = d_rho; + fieldPointerMap["d_u"] = d_u; + fieldPointerMap["d_y"] = d_y; + fieldPointerMap["d_he"] = d_he; + fieldPointerMap["d_p"] = d_p; - checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_u_old, cell_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_y_old, cell_value_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_he_old, cell_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_p_old, cell_value_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_value_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_u_old, cell_value_vec_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_y_old, cell_value_bytes * num_species)); + // checkCudaErrors(cudaMalloc((void**)&d_he_old, cell_value_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_p_old, cell_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_phi, surface_value_bytes)); + fieldPointerMap["d_phi"] = d_phi; // computed on GPU, used on CPU, need memcpyd2h checkCudaErrors(cudaMallocHost((void**)&h_rho, cell_value_bytes)); 
checkCudaErrors(cudaMallocHost((void**)&h_u, cell_value_vec_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_y, cell_value_bytes * num_species)); checkCudaErrors(cudaMallocHost((void**)&h_he, cell_value_bytes)); + fieldPointerMap["h_rho"] = h_rho; + fieldPointerMap["h_u"] = h_u; + fieldPointerMap["h_y"] = h_y; + fieldPointerMap["h_he"] = h_he; // computed on CPU, used on GPU, need memcpyh2d checkCudaErrors(cudaMallocHost((void**)&h_p, cell_value_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_phi, surface_value_bytes)); + fieldPointerMap["h_p"] = h_p; + fieldPointerMap["h_phi"] = h_phi; } void dfMatrixDataBase::createNonConstantFieldsBoundary() { @@ -243,24 +263,36 @@ void dfMatrixDataBase::createNonConstantFieldsBoundary() { checkCudaErrors(cudaMalloc((void**)&d_boundary_y, boundary_surface_value_bytes * num_species)); checkCudaErrors(cudaMalloc((void**)&d_boundary_he, boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_p, boundary_surface_value_bytes)); - - checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_old, boundary_surface_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_u_old, boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_y_old, boundary_surface_value_bytes * num_species)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_he_old, boundary_surface_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&d_boundary_p_old, boundary_surface_value_bytes)); + fieldPointerMap["d_boundary_rho"] = d_boundary_rho; + fieldPointerMap["d_boundary_u"] = d_boundary_u; + fieldPointerMap["d_boundary_y"] = d_boundary_y; + fieldPointerMap["d_boundary_he"] = d_boundary_he; + fieldPointerMap["d_boundary_p"] = d_boundary_p; + + // checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_old, boundary_surface_value_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_boundary_u_old, boundary_surface_value_vec_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_boundary_y_old, boundary_surface_value_bytes 
* num_species)); + // checkCudaErrors(cudaMalloc((void**)&d_boundary_he_old, boundary_surface_value_bytes)); + // checkCudaErrors(cudaMalloc((void**)&d_boundary_p_old, boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_phi, boundary_surface_value_bytes)); + fieldPointerMap["d_boundary_phi"] = d_boundary_phi; // computed on GPU, used on CPU, need memcpyd2h checkCudaErrors(cudaMallocHost((void**)&h_boundary_rho, boundary_surface_value_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_boundary_u, boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_boundary_y, boundary_surface_value_bytes * num_species)); checkCudaErrors(cudaMallocHost((void**)&h_boundary_he, boundary_surface_value_bytes)); + fieldPointerMap["h_boundary_rho"] = h_boundary_rho; + fieldPointerMap["h_boundary_u"] = h_boundary_u; + fieldPointerMap["h_boundary_y"] = h_boundary_y; + fieldPointerMap["h_boundary_he"] = h_boundary_he; // computed on CPU, used on GPU, need memcpyh2d checkCudaErrors(cudaMallocHost((void**)&h_boundary_p, boundary_surface_value_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_boundary_phi, boundary_surface_value_bytes)); + fieldPointerMap["h_boundary_p"] = h_boundary_p; + fieldPointerMap["h_boundary_phi"] = h_boundary_phi; } void dfMatrixDataBase::initNonConstantFieldsInternal(const double *y) { @@ -270,3 +302,23 @@ void dfMatrixDataBase::initNonConstantFieldsInternal(const double *y) { void dfMatrixDataBase::initNonConstantFieldsBoundary(const double *boundary_y) { checkCudaErrors(cudaMemcpyAsync(d_boundary_y, boundary_y, boundary_surface_value_bytes* num_species, cudaMemcpyHostToDevice, stream)); } + +double* dfMatrixDataBase::getFieldPointer(const char* fieldAlias, location loc, position pos) { + char mergedName[256]; + if (pos == position::internal) { + sprintf(mergedName, "%s_%s", (loc == location::cpu) ? 
"h" : "d", fieldAlias); + } else if (pos == position::boundary) { + sprintf(mergedName, "%s_boundary_%s", (loc == location::cpu) ? "h" : "d", fieldAlias); + } + + double *pointer = nullptr; + if (fieldPointerMap.find(std::string(mergedName)) != fieldPointerMap.end()) { + pointer = fieldPointerMap[std::string(mergedName)]; + } + if (pointer == nullptr) { + fprintf(stderr, "Warning! getFieldPointer of %s returns nullptr!\n", mergedName); + } + //fprintf(stderr, "fieldAlias: %s, mergedName: %s, pointer: %p\n", fieldAlias, mergedName, pointer); + + return pointer; +} From cbd7b49ef5ffa6d80c2b99722ddbb399fab26fd5 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Fri, 4 Aug 2023 00:25:14 +0800 Subject: [PATCH 04/25] add initial version of GPU new UEqn --- applications/solvers/dfLowMachFoam/Make/files | 2 +- .../solvers/dfLowMachFoam/createGPUSolver.H | 93 ++++++ applications/solvers/dfLowMachFoam/new_UEqn.H | 47 +++ .../solvers/dfLowMachFoam/new_dfLowMachFoam.C | 108 ++++++ src_gpu/AmgXSolver.H | 310 ++++++++++++++++++ src_gpu/AmgXSolver.cu | 296 +++++++++++++++++ src_gpu/CMakeLists.txt | 4 +- src_gpu/dfMatrixOpBase.H | 3 + src_gpu/dfMatrixOpBase.cu | 37 +++ src_gpu/dfUEqn.H | 99 ++++++ src_gpu/dfUEqn.cu | 201 ++++++++++++ 11 files changed, 1198 insertions(+), 2 deletions(-) create mode 100644 applications/solvers/dfLowMachFoam/createGPUSolver.H create mode 100644 applications/solvers/dfLowMachFoam/new_UEqn.H create mode 100644 applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C create mode 100644 src_gpu/AmgXSolver.H create mode 100644 src_gpu/AmgXSolver.cu create mode 100644 src_gpu/dfUEqn.H create mode 100644 src_gpu/dfUEqn.cu diff --git a/applications/solvers/dfLowMachFoam/Make/files b/applications/solvers/dfLowMachFoam/Make/files index 9b7e89945..4eff5915e 100644 --- a/applications/solvers/dfLowMachFoam/Make/files +++ b/applications/solvers/dfLowMachFoam/Make/files @@ -1,3 +1,3 @@ -dfLowMachFoam.C +new_dfLowMachFoam.C EXE = $(DF_APPBIN)/dfLowMachFoam diff --git 
a/applications/solvers/dfLowMachFoam/createGPUSolver.H b/applications/solvers/dfLowMachFoam/createGPUSolver.H new file mode 100644 index 000000000..5d16f7b80 --- /dev/null +++ b/applications/solvers/dfLowMachFoam/createGPUSolver.H @@ -0,0 +1,93 @@ +dfMatrixDataBase dfDataBase; +//dfRhoEqn rhoEqn_GPU; +dfUEqn UEqn_GPU(dfDataBase); +//dfYEqn YEqn_GPU; +//dfEEqn EEqn_GPU; + +void createGPUBase(fvMesh& mesh, PtrList& Y) { + // prepare constant values: num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, num_species, rdelta_t + const labelUList& owner = mesh.owner(); + const labelUList& neighbour = mesh.neighbour(); + int num_cells = mesh.nCells(); + int num_surfaces = neighbour.size(); + int num_boundary_surfaces = 0; + int num_patches = 0; + std::vector patch_size; + forAll(mesh.boundary(), patchi) { + labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); + int patchsize = sub_boundary.size(); + patch_size.push_back(patchsize); + num_boundary_surfaces += patchsize; + num_patches++; + } + // TODO: get deltaT fomr time API + dfDataBase.setConstantValues(num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, Y.size(), 1e-6); + + // prepare constant indexes: owner, neighbor + dfDataBase.setConstantIndexes(&owner[0], &neighbour[0]); + + // prepare internal and boundary of sf, mag_sf, weights, delta_coeffs, volume + double *boundary_sf = new double[3 * num_boundary_surfaces]; + double *boundary_mag_sf = new double[num_boundary_surfaces]; + double *boundary_delta_coeffs = new double[num_boundary_surfaces]; + int offset = 0; + forAll(mesh.boundary(), patchi) { + const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; + const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; + const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; + + int patchsize = pMagSf.size(); + + memcpy(boundary_sf + 3*offset, &pSf[0][0], 3*patchsize*sizeof(double)); + memcpy(boundary_mag_sf + offset, &pMagSf[0], 
patchsize*sizeof(double)); + memcpy(boundary_delta_coeffs + offset, &pDeltaCoeffs[0], patchsize*sizeof(double)); + offset += patchsize; + } + + dfDataBase.createConstantFieldsInternal(); + dfDataBase.createConstantFieldsBoundary(); + dfDataBase.initConstantFieldsInternal(&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.surfaceInterpolation::weights()[0], &mesh.nonOrthDeltaCoeffs()[0], &mesh.V()[0]); + dfDataBase.initConstantFieldsBoundary(boundary_sf, boundary_mag_sf, boundary_delta_coeffs); + + // prepare internal and boundary of Y + dfDataBase.createNonConstantFieldsInternal(); + dfDataBase.createNonConstantFieldsBoundary(); + forAll(Y, speciesI) { + volScalarField& Yi = Y[speciesI]; + memcpy(dfDataBase.h_y + speciesI * num_cells, &Yi[0], num_cells * sizeof(double)); + offset = 0; + forAll(Yi.boundaryField(), patchi) { + const scalarField& patchYi = Yi.boundaryField()[patchi]; + int patchsize = patchYi.size(); + memcpy(dfDataBase.h_boundary_y + speciesI * num_boundary_surfaces + offset, &patchYi[0], patchsize*sizeof(double)); + offset += patchsize; + } + } + dfDataBase.initNonConstantFieldsInternal(dfDataBase.h_y); + dfDataBase.initNonConstantFieldsBoundary(dfDataBase.h_boundary_y); +} + +void createGPUUEqn(const IOdictionary& CanteraTorchProperties, const volVectorField& U) { + // prepare mode_string and setting_path + string mode_string = "dDDI"; + string settingPath; + settingPath = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string("")); + UEqn_GPU.setConstantValues(mode_string, settingPath); + + // prepare patch_type + std::vector patch_type; + patch_type.resize(dfDataBase.num_patches); + forAll(U.boundaryField(), patchi) + { + constructBoundarySelectorPerPatch(&(patch_type[patchi]), U.boundaryField()[patchi].type()); + } + UEqn_GPU.setConstantFields(patch_type); + + // prepare internal and boundary of xxx + UEqn_GPU.createNonConstantFieldsInternal(); + UEqn_GPU.createNonConstantFieldsBoundary(); + 
UEqn_GPU.createNonConstantLduAndCsrFields(); + // UEqn_GPU has no internal non-constant fields to be init + // UEqn_GPU.initNonConstantFieldsInternal(); + UEqn_GPU.initNonConstantFieldsBoundary(); +} diff --git a/applications/solvers/dfLowMachFoam/new_UEqn.H b/applications/solvers/dfLowMachFoam/new_UEqn.H new file mode 100644 index 000000000..c38735375 --- /dev/null +++ b/applications/solvers/dfLowMachFoam/new_UEqn.H @@ -0,0 +1,47 @@ +#ifdef GPUSolver_ +// run CPU +tmp tUEqn +( + fvm::div(phi, U) +); +fvVectorMatrix& UEqn = tUEqn.ref(); + +// run GPU +// preProcess +// skip preparing u, p, nu_eff, and rho.boundary used by UEqn_GPU.preProcess() +// TODO: temp, now we only transfer phi(instead of rhoEqn) used by fvm::div(phi, U) +double *h_u = dfDataBase.getFieldPointer("u", location::cpu, position::internal); +double *h_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::internal); +double *h_boundary_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::boundary); +memcpy(h_phi, &phi[0], dfDataBase.surface_value_bytes); +int offset = 0; +forAll(phi.boundaryField(), patchi) +{ + const fvsPatchScalarField& patchPhi = phi.boundaryField()[patchi]; + int patchsize = patchPhi.size(); + memcpy(h_boundary_phi + offset, &patchPhi[0], patchsize * sizeof(double)); + offset += patchsize; +} +UEqn_GPU.preProcessForRhoEqn(h_phi, h_boundary_phi); +// process +UEqn_GPU.process(); +// postProcess +UEqn_GPU.postProcess(h_u); +// checkResult +// TODO: temp, now we compare ldu, finally we compare csr +std::vector h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); +std::vector h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); +offset = 0; +for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) +{ + int patchsize = dfDataBase.patch_size[patchi]; + const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0]; + const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0]; + memcpy(h_internal_coeffs.data() + offset * 3, 
internal_coeff_ptr, patchsize * 3 * sizeof(double)); + memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double)); + offset += patchsize; +} +bool printFlag = false; +UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], + h_internal_coeffs.data(), h_boundary_coeffs.data(), printFlag); +#endif diff --git a/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C new file mode 100644 index 000000000..530a9f7ec --- /dev/null +++ b/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C @@ -0,0 +1,108 @@ +/*---------------------------------------------------------------------------*\ + ========= | + \\ / F ield | OpenFOAM: The Open Source CFD Toolbox + \\ / O peration | Website: https://openfoam.org + \\ / A nd | Copyright (C) 2011-2019 OpenFOAM Foundation + \\/ M anipulation | +------------------------------------------------------------------------------- +License + This file is part of OpenFOAM. + + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . 
+ +Application + unittest + +Description + GPU unittest + +\*---------------------------------------------------------------------------*/ + +#include "dfChemistryModel.H" +#include "CanteraMixture.H" +// #include "hePsiThermo.H" +#include "heRhoThermo.H" + +#include "fvCFD.H" +#include "fluidThermo.H" +#include "turbulentFluidThermoModel.H" +#include "pimpleControl.H" +#include "pressureControl.H" +#include "localEulerDdtScheme.H" +#include "fvcSmooth.H" +#include "PstreamGlobals.H" +#include "basicThermo.H" +#include "CombustionModel.H" + +#include +#include +#include "upwind.H" + +#include "dfMatrixDataBase.H" +#include "dfMatrixOpBase.H" +#include "dfUEqn.H" +#include "createGPUSolver.H" + +int main(int argc, char *argv[]) +{ +#ifdef USE_PYTORCH + pybind11::scoped_interpreter guard{};//start python interpreter +#endif + #include "postProcess.H" + + // #include "setRootCaseLists.H" + #include "listOptions.H" + #include "setRootCase2.H" + #include "listOutput.H" + + #include "createTime.H" + #include "createMesh.H" + #include "createDyMControls.H" + #include "initContinuityErrs.H" + #include "createFields.H" + #include "createRhoUfIfPresent.H" + + turbulence->validate(); + + if (!LTS) + { + #include "compressibleCourantNo.H" + #include "setInitialDeltaT.H" + } + + // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + { + #include "readDyMControls.H" + + if (LTS) + { + #include "setRDeltaT.H" + } + else + { + #include "compressibleCourantNo.H" + #include "setDeltaT.H" + } + + createGPUBase(mesh, Y); + createGPUUEqn(CanteraTorchProperties, U); + + // foreach(timestep) { + #include "new_UEqn.H" + // } + } + return 0; +} + + diff --git a/src_gpu/AmgXSolver.H b/src_gpu/AmgXSolver.H new file mode 100644 index 000000000..190808934 --- /dev/null +++ b/src_gpu/AmgXSolver.H @@ -0,0 +1,310 @@ +/** + * \file AmgXSolver.hpp + * \brief Definition of class AmgXSolver. 
+ * \author Pi-Yueh Chuang (pychuang@gwu.edu) + * \author Matt Martineau (mmartineau@nvidia.com) + * \date 2015-09-01 + * \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba. + * \copyright Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * This project is released under MIT License. + */ + + +#ifndef __AMGX_SOLVER_H__ +#define __AMGX_SOLVER_H__ + +// CUDA +#include + +// STL +# include +# include +# include + +// AmgX +# include + +// PETSc +// # include + + +/** \brief A macro to check the returned CUDA error code. + * + * \param call [in] Function call to CUDA API. + */ +# define CHECK(call) \ +do \ +{ \ + const cudaError_t error_code = call; \ + if (error_code != cudaSuccess) \ + { \ + printf("CUDA Error:\n"); \ + printf(" File: %s\n", __FILE__); \ + printf(" Line: %d\n", __LINE__); \ + printf(" Error code: %d\n", error_code); \ + printf(" Error text: %s\n", \ + cudaGetErrorString(error_code)); \ + exit(1); \ + } \ +} while (0) + + + + + + +/** \brief A wrapper class for coupling PETSc and AmgX. + * + * This class is a wrapper of AmgX library for PETSc. PETSc users only need to + * pass a PETSc matrix and vectors into an AmgXSolver instance to solve their + * linear systems. The class is designed specifically for the situation where + * the number of MPI processes is more than the number of GPU devices. + * + * Eaxmple usage: + * \code + * int main(int argc, char **argv) + * { + * // initialize matrix A, RHS, etc using PETSc + * ... + * + * // create an instance of the solver wrapper + * AmgXSolver solver; + * // initialize the instance with communicator, executation mode, and config file + * solver.initialize(comm, mode, file); + * // set matrix A. Currently it only accept PETSc AIJ matrix + * solver.setA(A); + * // solve. x and rhs are PETSc vectors. 
unkns will be the final result in the end + * solver.solve(unks, rhs); + * // get number of iterations + * int iters; + * solver.getIters(iters); + * // get residual at the last iteration + * double res; + * solver.getResidual(iters, res); + * // finalization + * solver.finalize(); + * + * // other codes + * .... + * + * return 0; + * } + * \endcode + */ +class AmgXSolver +{ + public: + + /** \brief Default constructor. */ + AmgXSolver() = default; + + /** \brief Construct a AmgXSolver instance. + * + * \param comm [in] MPI communicator. + * \param modeStr [in] A string; target mode of AmgX (e.g., dDDI). + * \param cfgFile [in] A string; the path to AmgX configuration file. + */ + AmgXSolver + ( + const std::string &modeStr, + const std::string &cfgFile + ); + + /** \brief Destructor. */ + ~AmgXSolver(); + + /** \brief Initialize a AmgXSolver instance. + * + * \param comm [in] MPI communicator. + * \param modeStr [in] A string; target mode of AmgX (e.g., dDDI). + * \param cfgFile [in] A string; the path to AmgX configuration file. + * + */ + void initialize + ( + const std::string &modeStr, + const std::string &cfgFile + ); + + + /** \brief Finalize this instance. + * + * This function destroys AmgX data. When there are more than one + * AmgXSolver instances, the last one destroyed is also in charge of + * destroying the shared resource object and finalizing AmgX. + * + */ + void finalize(); + + /** \brief Set up the matrix used by AmgX. + * + * This function sets up the AmgX matrix from the provided CSR data + * structures and partition data. + * + * \param nGlobalRows [in] The number of global rows. + * \param nLocalRows [in] The number of local rows on this rank. + * \param nLocalNz [in] The total number of non zero entries locally. + * \param rowOffsets [in] The local CSR matrix row offsets. + * \param colIndicesGlobal [in] The global CSR matrix column indices. + * \param values [in] The local CSR matrix values. + * id of the owning rank for each row. 
+ * + */ + void setOperator + ( + const int nRows, + const int nNz, + const int *rowIndex, + const int *colIndex, + const double *value + ); + + /** \brief Re-sets up an existing AmgX matrix. + * + * Replaces the matrix coefficients with the provided values and performs + * a resetup for the AmgX matrix. + * + * \param nLocalRows [in] The number of local rows on this rank. + * \param nLocalNz [in] The total number of non zero entries locally. + * \param values [in] The local CSR matrix values. + * + */ + void updateOperator + ( + const int nRows, + const int nNz, + const double *value + ); + + /** \brief Solve the linear system. + * + * \p p vector will be used as an initial guess and will be updated to the + * solution by the end of solving. + * + * For cases that use more MPI processes than the number of GPUs, this + * function will do data gathering before solving and data scattering + * after the solving. + * + * \param nLocalRows [in] The number of rows owned by this rank. + * \param pscalar [in, out] The unknown array. + * \param bscalar [in] The RHS array. + * \param matrix [in,out] The AmgX CSR matrix, A. + * + */ + void solve + ( + int nRows, + double* psi, + const double* rhs + ); + + /** \brief Solve the linear system. + * + * \p p vector will be used as an initial guess and will be updated to the + * solution by the end of solving. + * + * For cases that use more MPI processes than the number of GPUs, this + * function will do data gathering before solving and data scattering + * after the solving. + * + * \param nLocalRows [in] The number of rows owned by this rank. + * \param p [in, out] The unknown vector. + * \param b [in] The RHS vector. + * \param matrix [in,out] The AmgX CSR matrix, A. + * + */ + // void solve + // ( + // int nLocalRows, + // Vec& p, + // Vec& b, + // AmgXCSRMatrix& matrix + // ); + + + /** \brief Get the number of iterations of the last solving. + * + * \param iter [out] Number of iterations. 
+ * + */ + void getIters + ( + int &iter + ); + + /** \brief Get the residual at a specific iteration during the last solving. + * + * \param iter [in] Target iteration. + * \param res [out] Returned residual. + * + */ + void getResidual + ( + const int &iter, + double &res + ); + + + private: + + /** \brief Current count of AmgXSolver instances. + * + * This static variable is used to count the number of instances. The + * fisrt instance is responsable for initializing AmgX library and the + * resource instance. + */ + static int count; + + /** \brief A flag indicating if this instance has been initialized. */ + bool isInitialised = false; + + /** \brief A parameter used by AmgX. */ + int ring; + + /** \brief AmgX solver mode. */ + AMGX_Mode mode; + + /** \brief AmgX config object. */ + AMGX_config_handle cfg = nullptr; + + /** \brief AmgX matrix object. */ + AMGX_matrix_handle AmgXA = nullptr; + + /** \brief AmgX vector object representing unknowns. */ + AMGX_vector_handle AmgXP = nullptr; + + /** \brief AmgX vector object representing RHS. */ + AMGX_vector_handle AmgXRHS = nullptr; + + /** \brief AmgX solver object. */ + AMGX_solver_handle solver = nullptr; + + /** \brief AmgX resource object. + * + * Due to the design of AmgX library, using more than one resource + * instance may cause some problems. So we make the resource instance + * as a static member to keep only one instance. + */ + static AMGX_resources_handle rsrc; + + /** \brief Set AmgX solver mode based on the user-provided string. + * + * Available modes are: dDDI, dDFI, dFFI, hDDI, hDFI, hFFI. + * + * \param modeStr [in] a std::string. + */ + void setMode(const std::string &modeStr); + + /** \brief Perform necessary initialization of AmgX. + * + * This function initializes AmgX for current instance. Based on + * \ref AmgXSolver::count "count", only the instance initialized first + * is in charge of initializing AmgX and the resource instance. 
+ * + * \param cfgFile [in] Path to AmgX solver configuration file. + */ + void initAmgX(const std::string &cfgFile); +}; + +#endif + diff --git a/src_gpu/AmgXSolver.cu b/src_gpu/AmgXSolver.cu new file mode 100644 index 000000000..b0076e5c3 --- /dev/null +++ b/src_gpu/AmgXSolver.cu @@ -0,0 +1,296 @@ +/** + * \file AmgXSolver.cpp + * \brief Definition of member functions of the class AmgXSolver. + * \author Pi-Yueh Chuang (pychuang@gwu.edu) + * \author Matt Martineau (mmartineau@nvidia.com) + * \date 2015-09-01 + * \copyright Copyright (c) 2015-2019 Pi-Yueh Chuang, Lorena A. Barba. + * \copyright Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * This project is released under MIT License. + */ + +// AmgXWrapper +#include "AmgXSolver.H" +#include +#include + +// initialize AmgXSolver::count to 0 +int AmgXSolver::count = 0; + +// initialize AmgXSolver::rsrc to nullptr; +AMGX_resources_handle AmgXSolver::rsrc = nullptr; + + +/* \implements AmgXSolver::AmgXSolver */ +AmgXSolver::AmgXSolver(const std::string &modeStr, const std::string &cfgFile) +{ + initialize(modeStr, cfgFile); +} + + +/* \implements AmgXSolver::~AmgXSolver */ +AmgXSolver::~AmgXSolver() +{ + if (isInitialised) finalize(); +} + + +/* \implements AmgXSolver::initialize */ +void AmgXSolver::initialize(const std::string &modeStr, const std::string &cfgFile) +{ + + // if this instance has already been initialized, skip + if (isInitialised) { + fprintf(stderr, + "This AmgXSolver instance has been initialized on this process.\n"); + exit(0); + } + + // increase the number of AmgXSolver instances + count += 1; + + // get the mode of AmgX solver + setMode(modeStr); + + // initialize AmgX + initAmgX(cfgFile); + + // a bool indicating if this instance is initialized + isInitialised = true; + + return; +} + +/* \implements AmgXSolver::setMode */ +void AmgXSolver::setMode(const std::string &modeStr) +{ + if (modeStr == "dDDI") + mode = AMGX_mode_dDDI; + else if (modeStr == "dDFI") + mode = 
AMGX_mode_dDFI; + else if (modeStr == "dFFI") + mode = AMGX_mode_dFFI; + else if (modeStr[0] == 'h') { + printf("CPU mode, %s, is not supported in this wrapper!", + modeStr.c_str()); + exit(0); + } + else { + printf("%s is not an available mode! Available modes are: " + "dDDI, dDFI, dFFI.\n", modeStr.c_str()); + exit(0); + } +} + + +/* \implements AmgXSolver::initAmgX */ + void AmgXSolver::initAmgX(const std::string &cfgFile) +{ + // only the first instance (AmgX solver) is in charge of initializing AmgX + if (count == 1) + { + // initialize AmgX + AMGX_SAFE_CALL(AMGX_initialize()); + + // intialize AmgX plugings + AMGX_SAFE_CALL(AMGX_initialize_plugins()); + + // let AmgX to handle errors returned + AMGX_SAFE_CALL(AMGX_install_signal_handler()); + } + + // create an AmgX configure object + AMGX_SAFE_CALL(AMGX_config_create_from_file(&cfg, cfgFile.c_str())); + + // let AmgX handle returned error codes internally + AMGX_SAFE_CALL(AMGX_config_add_parameters(&cfg, "exception_handling=1")); + + // create an AmgX resource object, only the first instance is in charge + if (count == 1) AMGX_resources_create_simple(&rsrc, cfg); + + // create AmgX vector object for unknowns and RHS + AMGX_vector_create(&AmgXP, rsrc, mode); + AMGX_vector_create(&AmgXRHS, rsrc, mode); + + // create AmgX matrix object for unknowns and RHS + AMGX_matrix_create(&AmgXA, rsrc, mode); + + // create an AmgX solver object + AMGX_solver_create(&solver, rsrc, mode, cfg); + + // obtain the default number of rings based on current configuration + AMGX_config_get_default_number_of_rings(cfg, &ring); +} + +/* \implements AmgXSolver::finalize */ +void AmgXSolver::finalize() +{ + // skip if this instance has not been initialised + if (!isInitialised) + { + fprintf(stderr, + "This AmgXWrapper has not been initialised. 
" + "Please initialise it before finalization.\n"); + exit(0); + } + + // destroy solver instance + AMGX_solver_destroy(solver); + + // destroy matrix instance + AMGX_matrix_destroy(AmgXA); + + // destroy RHS and unknown vectors + AMGX_vector_destroy(AmgXP); + AMGX_vector_destroy(AmgXRHS); + + // only the last instance need to destroy resource and finalizing AmgX + if (count == 1) + { + AMGX_resources_destroy(rsrc); + AMGX_SAFE_CALL(AMGX_config_destroy(cfg)); + + AMGX_SAFE_CALL(AMGX_finalize_plugins()); + AMGX_SAFE_CALL(AMGX_finalize()); + } + else + { + AMGX_config_destroy(cfg); + } + + // decrease the number of instances + count -= 1; + + // change status + isInitialised = false; +} + +/* \implements AmgXSolver::setOperator */ +void AmgXSolver::setOperator +( + const int nRows, + const int nNz, + const int *rowIndex, + const int *colIndex, + const double *value +) +{ + + // Check the matrix size is not larger than tolerated by AmgX + if(nRows > std::numeric_limits::max()) + { + fprintf(stderr, + "AmgX does not support a global number of rows greater than " + "what can be stored in 32 bits (nGlobalRows = %d).\n", + nRows); + exit(0); + } + + if (nNz > std::numeric_limits::max()) + { + fprintf(stderr, + "AmgX does not support non-zeros per (consolidated) rank greater than" + "what can be stored in 32 bits (nLocalNz = %d).\n", + nNz); + exit(0); + } + + // upload matrix A to AmgX + AMGX_matrix_upload_all( + AmgXA, nRows, nNz, 1, 1, rowIndex, colIndex, value, nullptr); + + // bind the matrix A to the solver + AMGX_solver_setup(solver, AmgXA); + + // connect (bind) vectors to the matrix + AMGX_vector_bind(AmgXP, AmgXA); + AMGX_vector_bind(AmgXRHS, AmgXA); +} + + +/* \implements AmgXSolver::updateOperator */ +void AmgXSolver::updateOperator +( + const int nRows, + const int nNz, + const double *value +) +{ + + // Replace the coefficients for the CSR matrix A within AmgX + AMGX_matrix_replace_coefficients(AmgXA, nRows, nNz, value, nullptr); + + // Re-setup the solver (a 
reduced overhead setup that accounts for consistent matrix structure) + AMGX_solver_resetup(solver, AmgXA); +} + +/* \implements AmgXSolver::solve */ +// void AmgXSolver::solve( +// int nLocalRows, Vec& p, Vec& b, AmgXCSRMatrix& matrix) +// { +// double* pscalar; +// double* bscalar; + +// // get pointers to the raw data of local vectors +// VecGetArray(p, &pscalar); +// VecGetArray(b, &bscalar); + +// solve(nLocalRows, pscalar, bscalar, matrix); + +// VecRestoreArray(p, &pscalar); +// VecRestoreArray(b, &bscalar); +// } + + +/* \implements AmgXSolver::solve */ +void AmgXSolver::solve( + int nRows, double* psi, const double* rhs) +{ + // Upload potentially consolidated vectors to AmgX + AMGX_vector_upload(AmgXP, nRows, 1, psi); + AMGX_vector_upload(AmgXRHS, nRows, 1, rhs); + + // Solve + AMGX_solver_solve(solver, AmgXRHS, AmgXP); + + // Get the status of the solver + AMGX_SOLVE_STATUS status; + AMGX_solver_get_status(solver, &status); + + // Check whether the solver successfully solved the problem + if (status != AMGX_SOLVE_SUCCESS) + { + fprintf(stderr, "AmgX solver failed to solve the system! 
" + "The error code is %d.\n", + status); + } + + // Download data from device + AMGX_vector_download(AmgXP, psi); + + // get norm and iteration number + double irnorm = 0., rnorm = 0.; + int nIters = 0; + getResidual(0, irnorm); + getIters(nIters); + getResidual(nIters, rnorm); + printf("Initial residual = %.10lf, Final residual = %.5e, No Iterations %d\n", irnorm, rnorm, nIters); + +} + + +/* \implements AmgXSolver::getIters */ +void AmgXSolver::getIters(int &iter) +{ + // only processes using AmgX will try to get # of iterations + AMGX_solver_get_iterations_number(solver, &iter); +} + + +/* \implements AmgXSolver::getResidual */ +void AmgXSolver::getResidual(const int &iter, double &res) +{ + // only processes using AmgX will try to get residual + AMGX_solver_get_iteration_residual(solver, iter, 0, &res); +} + diff --git a/src_gpu/CMakeLists.txt b/src_gpu/CMakeLists.txt index d82c86df5..ed9070476 100644 --- a/src_gpu/CMakeLists.txt +++ b/src_gpu/CMakeLists.txt @@ -22,8 +22,10 @@ include_directories( add_library(${PROJECT_NAME} SHARED + AmgXSolver.cu dfMatrixDataBase.cu - dfMatrixOpBase.cu) + dfMatrixOpBase.cu + dfUEqn.cu) target_link_libraries(${PROJECT_NAME} ${MPI_LIBRARIES} diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index 47692e239..3e533a281 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -1,5 +1,8 @@ #pragma once +void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output); +void permute_vector_h2d(cudaStream_t stream, int num_cells, const double *input, double *output); + void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, const double *lower, const double *upper, const double *diag, const double *source, diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index 6c533e05e..99801737e 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu 
@@ -4,6 +4,28 @@ #include #include "cuda_profiler_api.h" +__global__ void permute_vector_d2h_kernel(int num_cells, const double *input, double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + output[index * 3 + 0] = input[num_cells * 0 + index]; + output[index * 3 + 1] = input[num_cells * 1 + index]; + output[index * 3 + 2] = input[num_cells * 2 + index]; +} + +__global__ void permute_vector_h2d_kernel(int num_cells, const double *input, double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + output[num_cells * 0 + index] = input[index * 3 + 0]; + output[num_cells * 1 + index] = input[index * 3 + 1]; + output[num_cells * 2 + index] = input[index * 3 + 2]; +} + __global__ void update_boundary_coeffs_zeroGradient_vector(int num, int offset, double *value_internal_coeffs, double *value_boundary_coeffs, double *gradient_internal_coeffs, double *gradient_boundary_coeffs) @@ -45,6 +67,7 @@ __global__ void fvm_div_scalar_internal(int num_surfaces, lower[index] += (-w) * f; upper[index] += (1 - w) * f; + // if (index == 0) printf("index = 0, lower: %.16lf, upper:%.16lf\n", lower[index], upper[index]); int l = lower_index[index]; int u = upper_index[index]; @@ -70,6 +93,20 @@ __global__ void fvm_div_scalar_boundary(int num, int offset, boundary_coeffs[start_index * 3 + 2] = boundary_f * value_boundary_coeffs[start_index * 3 + 2]; } +void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output) +{ + size_t threads_per_block = 256; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + permute_vector_d2h_kernel<<>>(num_cells, input, output); +} + +void permute_vector_h2d(cudaStream_t stream, int num_cells, const double *input, double *output) +{ + size_t threads_per_block = 256; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + 
permute_vector_h2d_kernel<<>>(num_cells, input, output); +} + void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, const double *lower, const double *upper, const double *diag, const double *source, diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H new file mode 100644 index 000000000..0ee570b9d --- /dev/null +++ b/src_gpu/dfUEqn.H @@ -0,0 +1,99 @@ +#pragma once + +#include "AmgXSolver.H" +#include +#include "dfMatrixDataBase.H" +#include "dfMatrixOpBase.H" + +class dfUEqn +{ +private: + dfMatrixDataBase &dataBase_; + + // constant values -- basic + std::string mode_string; + std::string setting_path; + + // constant values -- amgx solvers + AmgXSolver *UxSolver = nullptr; + AmgXSolver *UySolver = nullptr; + AmgXSolver *UzSolver = nullptr; + int num_iteration = 0; + + // constant fields - internal + // 无 + + // constant fields - boundary + std::vector patch_type; + + // non-constant fields - internal + // thermophysical fields + double *d_nu_eff = nullptr; + // computed on CPU, used on GPU, need memcpyh2d - host + double *h_nu_eff = nullptr; + // intermediate fields + double *d_grad_u = nullptr; + double *d_rho_nueff = nullptr; + double *d_permute = nullptr; + + // non-constant fields - boundary + // thermophysical fields + double *d_boundary_nu_eff = nullptr; + // computed on CPU, used on GPU, need memcpyh2d - host + double *h_boundary_nu_eff = nullptr; + // intermediate fields + double *d_boundary_grad_u = nullptr; + double *d_boundary_rho_nueff = nullptr; + // boundary coeff fields + double *d_value_internal_coeffs = nullptr; + double *d_value_boundary_coeffs= nullptr; + double *d_gradient_internal_coeffs= nullptr; + double *d_gradient_boundary_coeffs= nullptr; + + // non-constant fields - ldu + double *d_lower = nullptr; + double *d_upper = nullptr; + double *d_diag = nullptr; + double *d_source = nullptr; + double *d_internal_coeffs = nullptr; + double 
*d_boundary_coeffs = nullptr; + + // non-constant fields - csr + double *d_A = nullptr; + double *d_b = nullptr; + + // field pointer map + std::unordered_map fieldPointerMap; + +public: + // 构造函数 + dfUEqn(dfMatrixDataBase &dataBase) + : dataBase_(dataBase) {} + + // 析构函数 + ~dfUEqn(){} + + // 成员函数 + + // getter函数 + double* getFieldPointer(const char* fieldAlias, location loc, position pos); + + // 初始化构建 + void setConstantValues(const std::string &mode_string, const std::string &setting_path); + void setConstantFields(const std::vector patch_type); + void createNonConstantFieldsInternal(); + void createNonConstantFieldsBoundary(); + void createNonConstantLduAndCsrFields(); + // dfUEqn has no internal non-constant fields to be init + //void initNonConstantFieldsInternal(xxx); + void initNonConstantFieldsBoundary(); + + // 方程运行 + void preProcess(const double *h_u, const double *h_boundary_u, const double *h_p, const double *h_boundary_p, const double *h_nu_eff, const double *h_boundary_nu_eff, const double *h_boundary_rho); + void preProcessForRhoEqn(const double *h_phi, const double *h_boundary_phi); + void process(); + void postProcess(double *h_u); + + void solve(); + void compareResult(const double *lower, const double *upper, const double *diag, const double *internal_coeffs, const double *boundary_coeffs, bool printFlag); +}; diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu new file mode 100644 index 000000000..e6c52ec7f --- /dev/null +++ b/src_gpu/dfUEqn.cu @@ -0,0 +1,201 @@ +#include "dfUEqn.H" + +void dfUEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path) { + this->mode_string = mode_string; + this->setting_path = setting_path; + UxSolver = new AmgXSolver(mode_string, setting_path); + UySolver = new AmgXSolver(mode_string, setting_path); + UzSolver = new AmgXSolver(mode_string, setting_path); +} + +void dfUEqn::setConstantFields(const std::vector patch_type) { + this->patch_type = patch_type; +} + +void 
dfUEqn::createNonConstantFieldsInternal() { + // thermophysical fields + checkCudaErrors(cudaMalloc((void**)&d_nu_eff, dataBase_.cell_value_bytes)); + // computed on CPU, used on GPU, need memcpyh2d + checkCudaErrors(cudaMallocHost((void**)&h_nu_eff , dataBase_.cell_value_bytes)); + // intermediate fields + checkCudaErrors(cudaMalloc((void**)&d_grad_u, dataBase_.cell_value_tsr_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_rho_nueff, dataBase_.cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_permute, dataBase_.cell_value_vec_bytes)); + + // getter for h_nu_eff + fieldPointerMap["h_nu_eff"] = h_nu_eff; +} + +void dfUEqn::createNonConstantFieldsBoundary() { + // thermophysical fields + checkCudaErrors(cudaMalloc((void**)&d_boundary_nu_eff, dataBase_.boundary_surface_value_bytes)); + // computed on CPU, used on GPU, need memcpyh2d + checkCudaErrors(cudaMallocHost((void**)&h_boundary_nu_eff, dataBase_.boundary_surface_value_bytes)); + // intermediate fields + checkCudaErrors(cudaMalloc((void**)&d_boundary_grad_u, dataBase_.boundary_surface_value_tsr_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_nueff, dataBase_.boundary_surface_value_bytes)); + // boundary coeff fields + checkCudaErrors(cudaMalloc((void**)&d_value_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_value_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes)); + + // getter for h_boundary_nu_eff + fieldPointerMap["h_boundary_nu_eff"] = h_boundary_nu_eff; +} + +void dfUEqn::createNonConstantLduAndCsrFields() { + checkCudaErrors(cudaMalloc((void**)&d_lower, dataBase_.surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_upper, dataBase_.surface_value_bytes)); + 
checkCudaErrors(cudaMalloc((void**)&d_diag, dataBase_.cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_source, dataBase_.cell_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_A, dataBase_.csr_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_b, dataBase_.cell_value_vec_bytes)); +} + +void dfUEqn::initNonConstantFieldsBoundary() { + update_boundary_coeffs_vector(dataBase_.stream, dataBase_.num_patches, + dataBase_.patch_size.data(), patch_type.data(), + d_value_internal_coeffs, d_value_boundary_coeffs, + d_gradient_internal_coeffs, d_gradient_boundary_coeffs); +} + +void dfUEqn::preProcess(const double *h_u, const double *h_boundary_u, const double *h_p, const double *h_boundary_p, + const double *h_nu_eff, const double *h_boundary_nu_eff, const double *h_boundary_rho) { + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_u, h_u, dataBase_.cell_value_vec_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_u, h_boundary_u, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_p, h_p, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_p, h_boundary_p, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(d_nu_eff, h_nu_eff, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_nu_eff, h_boundary_nu_eff, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_rho, h_boundary_rho, dataBase_.boundary_surface_value_bytes, 
cudaMemcpyHostToDevice, dataBase_.stream)); + + checkCudaErrors(cudaMemsetAsync(d_lower, 0, dataBase_.surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_upper, 0, dataBase_.surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_diag, 0, dataBase_.cell_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_source, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_internal_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_boundary_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_A, 0, dataBase_.csr_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_b, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); +} + +void dfUEqn::preProcessForRhoEqn(const double *h_phi, const double *h_boundary_phi) { + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi, h_phi, dataBase_.surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_phi, h_boundary_phi, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + + checkCudaErrors(cudaMemsetAsync(d_lower, 0, dataBase_.surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_upper, 0, dataBase_.surface_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_diag, 0, dataBase_.cell_value_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_source, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_internal_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_boundary_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_A, 0, dataBase_.csr_value_vec_bytes, dataBase_.stream)); + 
checkCudaErrors(cudaMemsetAsync(d_b, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); + +} + +void dfUEqn::process() { + // run each fvc or fvm function + fvm_div_scalar(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_phi, dataBase_.d_weight, + d_lower, d_upper, d_diag, // end for internal + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_phi, d_value_internal_coeffs, d_value_boundary_coeffs, + d_internal_coeffs, d_boundary_coeffs); + //ldu_to_csr(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, + // dataBase_.d_lower_to_csr_index, dataBase_.d_upper_to_csr_index, dataBase_.d_diag_to_csr_index, + // d_lower, d_upper, d_diag, d_source, d_internal_coeffs, d_boundary_coeffs, d_A, d_b); + //solve(); +} + +void dfUEqn::solve() { + checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); + + int nNz = dataBase_.num_cells + dataBase_.num_surfaces * 2; // matrix entries + if (num_iteration == 0) // first interation + { + printf("Initializing AmgX Linear Solver\n"); + UxSolver->setOperator(dataBase_.num_cells, nNz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A); + UySolver->setOperator(dataBase_.num_cells, nNz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A + nNz); + UzSolver->setOperator(dataBase_.num_cells, nNz, dataBase_.d_csr_row_index, dataBase_.d_csr_col_index, d_A + 2 * nNz); + } + else + { + UxSolver->updateOperator(dataBase_.num_cells, nNz, d_A); + UySolver->updateOperator(dataBase_.num_cells, nNz, d_A + nNz); + UzSolver->updateOperator(dataBase_.num_cells, nNz, d_A + 2 * nNz); + } + UxSolver->solve(dataBase_.num_cells, dataBase_.d_u, d_b); + UySolver->solve(dataBase_.num_cells, dataBase_.d_u + dataBase_.num_cells, d_b + dataBase_.num_cells); + UzSolver->solve(dataBase_.num_cells, dataBase_.d_u + 2 * dataBase_.num_cells, d_b + 2 * dataBase_.num_cells); + num_iteration++; +} + +void dfUEqn::postProcess(double *h_u) { + 
permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, d_permute); + checkCudaErrors(cudaMemcpyAsync(h_u, d_permute, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); + checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); + + // some boundary conditions may also need vf.boundary, deltaCoeffs.boundary, and weight.boundary + update_boundary_coeffs_vector(dataBase_.stream, dataBase_.num_patches, + dataBase_.patch_size.data(), patch_type.data(), + d_value_internal_coeffs, d_value_boundary_coeffs, + d_gradient_internal_coeffs, d_gradient_boundary_coeffs); +} + +double* dfUEqn::getFieldPointer(const char* fieldAlias, location loc, position pos) { + char mergedName[256]; + if (pos == position::internal) { + sprintf(mergedName, "%s_%s", (loc == location::cpu) ? "h" : "d", fieldAlias); + } else if (pos == position::boundary) { + sprintf(mergedName, "%s_boundary_%s", (loc == location::cpu) ? "h" : "d", fieldAlias); + } + + double *pointer = nullptr; + if (fieldPointerMap.find(std::string(mergedName)) != fieldPointerMap.end()) { + pointer = fieldPointerMap[std::string(mergedName)]; + } + if (pointer == nullptr) { + fprintf(stderr, "Warning! 
getFieldPointer of %s returns nullptr!\n", mergedName); + } + //fprintf(stderr, "fieldAlias: %s, mergedName: %s, pointer: %p\n", fieldAlias, mergedName, pointer); + + return pointer; +} + +void dfUEqn::compareResult(const double *lower, const double *upper, const double *diag, const double *internal_coeffs, const double *boundary_coeffs, bool printFlag) +{ + std::vector h_lower; + h_lower.resize(dataBase_.num_surfaces); + checkCudaErrors(cudaMemcpy(h_lower.data(), d_lower, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dataBase_.num_surfaces, lower, h_lower.data(), 1e-14, printFlag); + + std::vector h_upper; + h_upper.resize(dataBase_.num_surfaces); + checkCudaErrors(cudaMemcpy(h_upper.data(), d_upper, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dataBase_.num_surfaces, upper, h_upper.data(), 1e-14, printFlag); + + std::vector h_diag; + h_diag.resize(dataBase_.num_cells); + checkCudaErrors(cudaMemcpy(h_diag.data(), d_diag, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dataBase_.num_cells, diag, h_diag.data(), 1e-14, printFlag); + + //std::vector h_source; + //h_source.resize(dataBase_.num_cells * 3); + //checkCudaErrors(cudaMemcpy(h_source.data(), d_source, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); + //checkVectorEqual(dataBase_.num_cells, source, h_source.data(), 1e-14, printFlag); + + std::vector h_internal_coeffs; + h_internal_coeffs.resize(dataBase_.num_boundary_surfaces * 3); + checkCudaErrors(cudaMemcpy(h_internal_coeffs.data(), d_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dataBase_.num_boundary_surfaces * 3, internal_coeffs, h_internal_coeffs.data(), 1e-14, printFlag); + + std::vector h_boundary_coeffs; + h_boundary_coeffs.resize(dataBase_.num_boundary_surfaces * 3); + checkCudaErrors(cudaMemcpy(h_boundary_coeffs.data(), d_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes, 
cudaMemcpyDeviceToHost)); + checkVectorEqual(dataBase_.num_boundary_surfaces * 3, boundary_coeffs, h_boundary_coeffs.data(), 1e-14, printFlag); +} + From be72eb01f3024483fc3cdf20e2d0a767317014ba Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Fri, 4 Aug 2023 21:28:32 +0800 Subject: [PATCH 05/25] small fix of fvm_div_boundary --- src_gpu/dfMatrixOpBase.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index 99801737e..4a3c25088 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -85,12 +85,12 @@ __global__ void fvm_div_scalar_boundary(int num, int offset, int start_index = offset + index; double boundary_f = boundary_phi[start_index]; - internal_coeffs[start_index * 3 + 0] = boundary_f * value_internal_coeffs[start_index * 3 + 0]; - internal_coeffs[start_index * 3 + 1] = boundary_f * value_internal_coeffs[start_index * 3 + 1]; - internal_coeffs[start_index * 3 + 2] = boundary_f * value_internal_coeffs[start_index * 3 + 2]; - boundary_coeffs[start_index * 3 + 0] = boundary_f * value_boundary_coeffs[start_index * 3 + 0]; - boundary_coeffs[start_index * 3 + 1] = boundary_f * value_boundary_coeffs[start_index * 3 + 1]; - boundary_coeffs[start_index * 3 + 2] = boundary_f * value_boundary_coeffs[start_index * 3 + 2]; + internal_coeffs[start_index * 3 + 0] += boundary_f * value_internal_coeffs[start_index * 3 + 0]; + internal_coeffs[start_index * 3 + 1] += boundary_f * value_internal_coeffs[start_index * 3 + 1]; + internal_coeffs[start_index * 3 + 2] += boundary_f * value_internal_coeffs[start_index * 3 + 2]; + boundary_coeffs[start_index * 3 + 0] += boundary_f * value_boundary_coeffs[start_index * 3 + 0]; + boundary_coeffs[start_index * 3 + 1] += boundary_f * value_boundary_coeffs[start_index * 3 + 1]; + boundary_coeffs[start_index * 3 + 2] += boundary_f * value_boundary_coeffs[start_index * 3 + 2]; } void permute_vector_d2h(cudaStream_t stream, int num_cells, 
const double *input, double *output) From 7be119075b30fa4ad8843d598e5f178808a6721c Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Fri, 4 Aug 2023 21:33:32 +0800 Subject: [PATCH 06/25] modify fvm_div_scalar to fvm_div_vector --- GPUTest/GPUTestBase.H | 4 ++-- GPUTest/unittest.C | 4 ++-- src_gpu/dfMatrixOpBase.H | 2 +- src_gpu/dfMatrixOpBase.cu | 10 +++++----- src_gpu/dfUEqn.cu | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/GPUTest/GPUTestBase.H b/GPUTest/GPUTestBase.H index bce191a9e..9de15dd90 100644 --- a/GPUTest/GPUTestBase.H +++ b/GPUTest/GPUTestBase.H @@ -174,7 +174,7 @@ void compareResultVector(const dfMatrixDataBase& dfDataBase, const testGPUDataBa } // unittest of fvm::div(phi, U) -void test_fvm_div_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfaceScalarField& phi, volVectorField& U, initType type) { +void test_fvm_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfaceScalarField& phi, volVectorField& U, initType type) { if (type == initType::randomInit) { randomInitSurfaceScalar(phi); // TODO: random init weight failed, weight is const. 
@@ -194,7 +194,7 @@ void test_fvm_div_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfa updateBoundaryCoeffsVector(dfDataBase, testData); // run GPU - fvm_div_scalar(dfDataBase.stream, dfDataBase.num_surfaces, dfDataBase.d_owner, dfDataBase.d_neighbor, + fvm_div_vector(dfDataBase.stream, dfDataBase.num_surfaces, dfDataBase.d_owner, dfDataBase.d_neighbor, dfDataBase.d_phi, dfDataBase.d_weight, testData.d_lower, testData.d_upper, testData.d_diag, // end for internal dfDataBase.num_patches, dfDataBase.patch_size.data(), testData.patch_type.data(), diff --git a/GPUTest/unittest.C b/GPUTest/unittest.C index b57a8efd6..f9826eb35 100644 --- a/GPUTest/unittest.C +++ b/GPUTest/unittest.C @@ -98,8 +98,8 @@ int main(int argc, char *argv[]) createGPUBase(mesh, Y); // unittest of fvm::div(phi, U) - test_fvm_div_scalar(dfDataBase, mesh, phi, U, initType::original); - test_fvm_div_scalar(dfDataBase, mesh, phi, U, initType::randomInit); + test_fvm_div_vector(dfDataBase, mesh, phi, U, initType::original); + test_fvm_div_vector(dfDataBase, mesh, phi, U, initType::randomInit); } return 0; } diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index 3e533a281..cfe953d4e 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -16,7 +16,7 @@ void update_boundary_coeffs_vector(cudaStream_t stream, int num_patches, // void fvm_ddt(); -void fvm_div_scalar(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, +void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *phi, const double *weight, double *lower, double *upper, double *diag, // end for internal int num_patches, const int *patch_size, const int *patch_type, diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index 4a3c25088..8fce760fd 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -53,7 +53,7 @@ __global__ void update_boundary_coeffs_zeroGradient_vector(int num, 
int offset, gradient_boundary_coeffs[start_index * 3 + 2] = 0; } -__global__ void fvm_div_scalar_internal(int num_surfaces, +__global__ void fvm_div_vector_internal(int num_surfaces, const int *lower_index, const int *upper_index, const double *phi, const double *weight, double *lower, double *upper, double *diag) @@ -75,7 +75,7 @@ __global__ void fvm_div_scalar_internal(int num_surfaces, atomicAdd(&(diag[u]), (w - 1) * f); } -__global__ void fvm_div_scalar_boundary(int num, int offset, +__global__ void fvm_div_vector_boundary(int num, int offset, const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs) { @@ -142,7 +142,7 @@ void update_boundary_coeffs_vector(cudaStream_t stream, int num_patches, } } -void fvm_div_scalar(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, +void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *phi, const double *weight, double *lower, double *upper, double *diag, // end for internal @@ -153,7 +153,7 @@ void fvm_div_scalar(cudaStream_t stream, int num_surfaces, const int *lowerAddr, size_t blocks_per_grid = 1; blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - fvm_div_scalar_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_surfaces, + fvm_div_vector_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_surfaces, lowerAddr, upperAddr, phi, weight, lower, upper, diag); @@ -165,7 +165,7 @@ void fvm_div_scalar(cudaStream_t stream, int num_surfaces, const int *lowerAddr, if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { // TODO: just vector version now - fvm_div_scalar_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(patch_size[i], offset, + fvm_div_vector_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(patch_size[i], offset, boundary_phi, value_internal_coeffs, value_boundary_coeffs, internal_coeffs, boundary_coeffs); } else if (0) { diff 
--git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index e6c52ec7f..fbbf9e71d 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -99,7 +99,7 @@ void dfUEqn::preProcessForRhoEqn(const double *h_phi, const double *h_boundary_p void dfUEqn::process() { // run each fvc or fvm function - fvm_div_scalar(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, + fvm_div_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_phi, dataBase_.d_weight, d_lower, d_upper, d_diag, // end for internal dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), From 196f22a1aaeb23729704ba8f1fc918557e86734d Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Fri, 4 Aug 2023 15:41:07 +0000 Subject: [PATCH 07/25] implement fvm::ddt(rho, U) and add unittest for it --- GPUTest/GPUTestBase.H | 165 ++++++++++++++++-- GPUTest/createGPUSolver.H | 3 +- GPUTest/unittest.C | 3 + .../solvers/dfLowMachFoam/createGPUSolver.H | 3 +- src_gpu/dfMatrixDataBase.H | 16 +- src_gpu/dfMatrixDataBase.cu | 6 +- src_gpu/dfMatrixOpBase.H | 17 +- src_gpu/dfMatrixOpBase.cu | 25 +++ 8 files changed, 207 insertions(+), 31 deletions(-) diff --git a/GPUTest/GPUTestBase.H b/GPUTest/GPUTestBase.H index 9de15dd90..d46d3e95b 100644 --- a/GPUTest/GPUTestBase.H +++ b/GPUTest/GPUTestBase.H @@ -55,7 +55,7 @@ void randomInitSurfaceScalar(surfaceScalarField& field) { int offset = 0; forAll(field.boundaryField(), patchi) { - fvsPatchScalarField& patchField = field.boundaryFieldRef()[patchi]; + auto& patchField = field.boundaryFieldRef()[patchi]; int patchsize = patchField.size(); double *field_boundary_ptr = &patchField[0]; std::vector init_field_boundary; @@ -68,6 +68,97 @@ void randomInitSurfaceScalar(surfaceScalarField& field) { } } +void randomInitVolScalar(volScalarField& field) { + // random init field value to (-0.5, 0.5) + // internal + double *field_internal_ptr = &field[0]; + std::vector init_field_internal; + 
init_field_internal.resize(dfDataBase.num_cells); + for (int i = 0; i < dfDataBase.num_cells; i++) { + init_field_internal[i] = (rand() % 10000 - 5000) / 10000.0; + } + memcpy(field_internal_ptr, init_field_internal.data(), dfDataBase.cell_value_bytes); + // boundary + int offset = 0; + forAll(field.boundaryField(), patchi) + { + auto& patchField = field.boundaryFieldRef()[patchi]; + int patchsize = patchField.size(); + double *field_boundary_ptr = &patchField[0]; + std::vector init_field_boundary; + init_field_boundary.resize(patchsize); + for (int i = 0; i < patchsize; i++) { + init_field_boundary[i] = (rand() % 10000 - 5000) / 10000.0; + } + memcpy(field_boundary_ptr, init_field_boundary.data(), patchsize * sizeof(double)); + offset += patchsize; + } +} + +// rho_old need special treatment +void uploadRhoOld(dfMatrixDataBase& dfDataBase, const volScalarField& field) { + double *h_internal_field = dfDataBase.getFieldPointer("rho", location::cpu, position::internal); + double *h_boundary_field = dfDataBase.getFieldPointer("rho", location::cpu, position::boundary); + double *d_internal_field = dfDataBase.getFieldPointer("rho_old", location::gpu, position::internal); + double *d_boundary_field = dfDataBase.getFieldPointer("rho_old", location::gpu, position::boundary); + // internal + memcpy(h_internal_field, &field[0], dfDataBase.cell_value_bytes); + // boundary + int offset = 0; + forAll(field.boundaryField(), patchi) + { + const auto& patchField = field.boundaryField()[patchi]; + int patchsize = patchField.size(); + memcpy(h_boundary_field + offset, &patchField[0], patchsize * sizeof(double)); + offset += patchsize; + } + // transfer + checkCudaErrors(cudaMemcpyAsync(d_internal_field, h_internal_field, dfDataBase.cell_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); +} + +void 
uploadVolScalar(dfMatrixDataBase& dfDataBase, const volScalarField& field, const char* fieldAlias) { + double *h_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::internal); + double *h_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::boundary); + double *d_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::internal); + double *d_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::boundary); + // internal + memcpy(h_internal_field, &field[0], dfDataBase.cell_value_bytes); + // boundary + int offset = 0; + forAll(field.boundaryField(), patchi) + { + const auto& patchField = field.boundaryField()[patchi]; + int patchsize = patchField.size(); + memcpy(h_boundary_field + offset, &patchField[0], patchsize * sizeof(double)); + offset += patchsize; + } + // transfer + checkCudaErrors(cudaMemcpyAsync(d_internal_field, h_internal_field, dfDataBase.cell_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); +} + +void uploadVolVector(dfMatrixDataBase& dfDataBase, const volVectorField& field, const char* fieldAlias) { + double *h_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::internal); + double *h_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::boundary); + double *d_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::internal); + double *d_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::boundary); + // internal + memcpy(h_internal_field, &field[0], dfDataBase.cell_value_vec_bytes); + // boundary + int offset = 0; + forAll(field.boundaryField(), patchi) + { + const auto& patchField = field.boundaryField()[patchi]; + int patchsize = patchField.size(); + 
memcpy(h_boundary_field + offset * 3, &patchField[0], patchsize * 3 * sizeof(double)); + offset += patchsize; + } + // transfer + checkCudaErrors(cudaMemcpyAsync(d_internal_field, h_internal_field, dfDataBase.cell_value_vec_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_vec_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); +} + void uploadSurfaceScalar(dfMatrixDataBase& dfDataBase, const surfaceScalarField& field, const char* fieldAlias) { double *h_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::internal); double *h_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::boundary); @@ -79,7 +170,7 @@ void uploadSurfaceScalar(dfMatrixDataBase& dfDataBase, const surfaceScalarField& int offset = 0; forAll(field.boundaryField(), patchi) { - const fvsPatchScalarField& patchField = field.boundaryField()[patchi]; + const auto& patchField = field.boundaryField()[patchi]; int patchsize = patchField.size(); memcpy(h_boundary_field + offset, &patchField[0], patchsize * sizeof(double)); offset += patchsize; @@ -89,18 +180,31 @@ void uploadSurfaceScalar(dfMatrixDataBase& dfDataBase, const surfaceScalarField& checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); } -void buildTestGPUDataBaseVector(const dfMatrixDataBase& dfDataBase, testGPUDataBase& testData, const volVectorField& field) { +void buildTestGPUDataBaseVector(const dfMatrixDataBase& dfDataBase, testGPUDataBase& testData, const volVectorField& field, + bool lowerFlag, bool upperFlag, bool diagFlag, bool sourceFlag, bool internalCoeffsFlag, bool boundaryCoeffsFlag, + bool valueInternalCoeffsFlag, bool valueBoundaryCoeffsFlag, bool gradientInternalCoeffsFlag, bool gradientBoundaryCoeffsFlag) { // ldu - checkCudaErrors(cudaMalloc((void**)&testData.d_lower, 
dfDataBase.surface_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&testData.d_upper, dfDataBase.surface_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&testData.d_diag, dfDataBase.cell_value_bytes)); - checkCudaErrors(cudaMalloc((void**)&testData.d_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + if (lowerFlag) + checkCudaErrors(cudaMalloc((void**)&testData.d_lower, dfDataBase.surface_value_bytes)); + if (upperFlag) + checkCudaErrors(cudaMalloc((void**)&testData.d_upper, dfDataBase.surface_value_bytes)); + if (diagFlag) + checkCudaErrors(cudaMalloc((void**)&testData.d_diag, dfDataBase.cell_value_bytes)); + if (sourceFlag) + checkCudaErrors(cudaMalloc((void**)&testData.d_source, dfDataBase.cell_value_vec_bytes)); + if (internalCoeffsFlag) + checkCudaErrors(cudaMalloc((void**)&testData.d_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + if (boundaryCoeffsFlag) + checkCudaErrors(cudaMalloc((void**)&testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); // boundary coeffs - checkCudaErrors(cudaMalloc((void**)&testData.d_value_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&testData.d_value_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + if (valueInternalCoeffsFlag) + checkCudaErrors(cudaMalloc((void**)&testData.d_value_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + if (valueBoundaryCoeffsFlag) + checkCudaErrors(cudaMalloc((void**)&testData.d_value_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + if (gradientInternalCoeffsFlag) + 
checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + if (gradientBoundaryCoeffsFlag) + checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); // patch type testData.patch_type.resize(dfDataBase.num_patches); forAll(field.boundaryField(), patchi) @@ -139,7 +243,7 @@ void compareResultVector(const dfMatrixDataBase& dfDataBase, const testGPUDataBa std::vector h_source; h_source.resize(dfDataBase.num_cells * 3); checkCudaErrors(cudaMemcpy(h_source.data(), testData.d_source, dfDataBase.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); - checkVectorEqual(dfDataBase.num_cells, &dfMatrix.source()[0][0], h_source.data(), 1e-14, printFlag); + checkVectorEqual(dfDataBase.num_cells * 3, &dfMatrix.source()[0][0], h_source.data(), 1e-14, printFlag); } if (testData.d_internal_coeffs) { std::vector h_internal_coeffs; @@ -173,6 +277,37 @@ void compareResultVector(const dfMatrixDataBase& dfDataBase, const testGPUDataBa } } +// unittest of fvm::ddt(rho, U) +void test_fvm_ddt_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& rho, volVectorField& U, initType type) { + + if (type == initType::randomInit) { + // random init rho and rho.old + randomInitVolScalar(rho); + rho.oldTime(); + } + + // run CPU + fvVectorMatrix dfMatrix = fvm::ddt(rho, U); + + // prepare for run GPU + // prepare rho, rho.old, U + uploadVolScalar(dfDataBase, rho, "rho"); + uploadRhoOld(dfDataBase, rho.oldTime()); + uploadVolVector(dfDataBase, U.oldTime(), "u"); + // prepare testData + testGPUDataBase testData; + // only use diag and source + buildTestGPUDataBaseVector(dfDataBase, testData, U, false, false, true, true, false, false, false, false, false, false); + // run GPU + fvm_ddt_vector(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.rdelta_t, + dfDataBase.d_rho, dfDataBase.d_rho_old, dfDataBase.d_u, dfDataBase.d_volume, + testData.d_diag, 
testData.d_source); + + // compare result + bool printFlag = false; + compareResultVector(dfDataBase, testData, dfMatrix, printFlag); +} + // unittest of fvm::div(phi, U) void test_fvm_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfaceScalarField& phi, volVectorField& U, initType type) { if (type == initType::randomInit) { @@ -188,7 +323,9 @@ void test_fvm_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfa uploadSurfaceScalar(dfDataBase, phi, "phi"); // prepare testData testGPUDataBase testData; - buildTestGPUDataBaseVector(dfDataBase, testData, U); + // not use source + // gradient_internal_coeffs, gradient_boundary_coeffs are not needed actually, but updateBoundaryCoeffsVector will access them + buildTestGPUDataBaseVector(dfDataBase, testData, U, true, true, true, false, true, true, true, true, true, true); // prepare boundary coeffs // TODO: updating boundary coeffs should be complemented later updateBoundaryCoeffsVector(dfDataBase, testData); diff --git a/GPUTest/createGPUSolver.H b/GPUTest/createGPUSolver.H index 0f1e20eca..3dd593337 100644 --- a/GPUTest/createGPUSolver.H +++ b/GPUTest/createGPUSolver.H @@ -18,7 +18,8 @@ void createGPUBase(fvMesh& mesh, PtrList& Y) { num_patches++; } // TODO: get deltaT fomr time API - dfDataBase.setConstantValues(num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, Y.size(), 1e-6); + double rDeltaT = 1 / 1e-6; + dfDataBase.setConstantValues(num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, Y.size(), rDeltaT); // prepare constant indexes: owner, neighbor dfDataBase.setConstantIndexes(&owner[0], &neighbour[0]); diff --git a/GPUTest/unittest.C b/GPUTest/unittest.C index f9826eb35..dd1f29e53 100644 --- a/GPUTest/unittest.C +++ b/GPUTest/unittest.C @@ -97,6 +97,9 @@ int main(int argc, char *argv[]) createGPUBase(mesh, Y); + // unittest of fvm::ddt(rho, U) + test_fvm_ddt_vector(dfDataBase, mesh, rho, U, initType::original); + 
test_fvm_ddt_vector(dfDataBase, mesh, rho, U, initType::randomInit); // unittest of fvm::div(phi, U) test_fvm_div_vector(dfDataBase, mesh, phi, U, initType::original); test_fvm_div_vector(dfDataBase, mesh, phi, U, initType::randomInit); diff --git a/applications/solvers/dfLowMachFoam/createGPUSolver.H b/applications/solvers/dfLowMachFoam/createGPUSolver.H index 5d16f7b80..d9ce745d7 100644 --- a/applications/solvers/dfLowMachFoam/createGPUSolver.H +++ b/applications/solvers/dfLowMachFoam/createGPUSolver.H @@ -21,7 +21,8 @@ void createGPUBase(fvMesh& mesh, PtrList& Y) { num_patches++; } // TODO: get deltaT fomr time API - dfDataBase.setConstantValues(num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, Y.size(), 1e-6); + double rDeltaT = 1 / 1e-6; + dfDataBase.setConstantValues(num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, Y.size(), rDeltaT); // prepare constant indexes: owner, neighbor dfDataBase.setConstantIndexes(&owner[0], &neighbour[0]); diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 4d8a7d29d..cce7e6adc 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -134,10 +134,10 @@ struct dfMatrixDataBase // fields solved by eqns - old // TODO: not all fields need to store oldTime double *d_rho_old = nullptr; - double *d_u_old = nullptr; - double *d_y_old = nullptr; - double *d_he_old = nullptr; - double *d_p_old = nullptr; + //double *d_u_old = nullptr; + //double *d_y_old = nullptr; + //double *d_he_old = nullptr; + //double *d_p_old = nullptr; // other shared fields between eqns double *d_phi = nullptr; // computed on GPU, used on CPU, need memcpyd2h - host @@ -159,10 +159,10 @@ struct dfMatrixDataBase double *d_boundary_p = nullptr; // fields solved by eqns - old double *d_boundary_rho_old = nullptr; - double *d_boundary_u_old = nullptr; - double *d_boundary_y_old = nullptr; - double *d_boundary_he_old = nullptr; - double *d_boundary_p_old = nullptr; + //double 
*d_boundary_u_old = nullptr; + //double *d_boundary_y_old = nullptr; + //double *d_boundary_he_old = nullptr; + //double *d_boundary_p_old = nullptr; // other shared fields between eqns double *d_boundary_phi = nullptr; // computed on GPU, used on CPU, need memcpyd2h - host diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index 2ef707bbc..341241bf4 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -231,7 +231,8 @@ void dfMatrixDataBase::createNonConstantFieldsInternal() { fieldPointerMap["d_he"] = d_he; fieldPointerMap["d_p"] = d_p; - // checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_value_bytes)); + fieldPointerMap["d_rho_old"] = d_rho_old; // checkCudaErrors(cudaMalloc((void**)&d_u_old, cell_value_vec_bytes)); // checkCudaErrors(cudaMalloc((void**)&d_y_old, cell_value_bytes * num_species)); // checkCudaErrors(cudaMalloc((void**)&d_he_old, cell_value_bytes)); @@ -269,7 +270,8 @@ void dfMatrixDataBase::createNonConstantFieldsBoundary() { fieldPointerMap["d_boundary_he"] = d_boundary_he; fieldPointerMap["d_boundary_p"] = d_boundary_p; - // checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_old, boundary_surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_old, boundary_surface_value_bytes)); + fieldPointerMap["d_boundary_rho_old"] = d_boundary_rho_old; // checkCudaErrors(cudaMalloc((void**)&d_boundary_u_old, boundary_surface_value_vec_bytes)); // checkCudaErrors(cudaMalloc((void**)&d_boundary_y_old, boundary_surface_value_bytes * num_species)); // checkCudaErrors(cudaMalloc((void**)&d_boundary_he_old, boundary_surface_value_bytes)); diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index cfe953d4e..40404944d 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -1,5 +1,6 @@ #pragma once +// tools void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double 
*output); void permute_vector_h2d(cudaStream_t stream, int num_cells, const double *input, double *output); @@ -14,7 +15,11 @@ void update_boundary_coeffs_vector(cudaStream_t stream, int num_patches, double *value_internal_coeffs, double *value_boundary_coeffs, double *gradient_internal_coeffs, double *gradient_boundary_coeffs); -// void fvm_ddt(); +// fvm ops + +void fvm_ddt_vector(cudaStream_t stream, int num_cells, double rDeltaT, + const double *rho, const double *rho_old, const double *vf, const double *volume, + double *diag, double *source); void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *phi, const double *weight, @@ -23,10 +28,12 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs); -// void fvm_laplacian(); -// -// void fvc_ddt(); -// +void fvm_laplacian(); + +// fvc ops + +void fvc_ddt(); + // void fvc_grad_surface(); // // void fvc_div_cell(); diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index 8fce760fd..1728a91dc 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -53,6 +53,21 @@ __global__ void update_boundary_coeffs_zeroGradient_vector(int num, int offset, gradient_boundary_coeffs[start_index * 3 + 2] = 0; } +__global__ void fvm_ddt_vector_kernel(int num_cells, double rDeltaT, + const double *rho, const double *rho_old, const double *vf, const double *volume, + double *diag, double *source) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + diag[index] += rDeltaT * rho[index] * volume[index]; + // TODO: skip moving + source[index * 3 + 0] += rDeltaT * rho_old[index] * vf[index * 3 + 0] * volume[index]; + source[index * 3 + 1] += rDeltaT * rho_old[index] * vf[index * 3 + 1] * volume[index]; + source[index * 3 + 2] += 
rDeltaT * rho_old[index] * vf[index * 3 + 2] * volume[index]; +} + __global__ void fvm_div_vector_internal(int num_surfaces, const int *lower_index, const int *upper_index, const double *phi, const double *weight, @@ -142,6 +157,16 @@ void update_boundary_coeffs_vector(cudaStream_t stream, int num_patches, } } +void fvm_ddt_vector(cudaStream_t stream, int num_cells, double rDeltaT, + const double *rho, const double *rho_old, const double *vf, const double *volume, + double *diag, double *source) +{ + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvm_ddt_vector_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, + rDeltaT, rho, rho_old, vf, volume, diag, source); +} + void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *phi, const double *weight, double *lower, double *upper, double *diag, // end for internal From 46062382354ff8444cc6220af00ba42a39d94097 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Mon, 7 Aug 2023 06:27:09 +0000 Subject: [PATCH 08/25] implement fvm::laplacian(gamma, U) and add unittest for it; fix several old bugs; --- GPUTest/GPUTestBase.H | 120 ++++++++++++++++++++++++++++++++------ GPUTest/unittest.C | 7 ++- src_gpu/dfMatrixOpBase.H | 9 ++- src_gpu/dfMatrixOpBase.cu | 98 ++++++++++++++++++++++++++++--- 4 files changed, 205 insertions(+), 29 deletions(-) diff --git a/GPUTest/GPUTestBase.H b/GPUTest/GPUTestBase.H index d46d3e95b..83c9976f3 100644 --- a/GPUTest/GPUTestBase.H +++ b/GPUTest/GPUTestBase.H @@ -95,8 +95,8 @@ void randomInitVolScalar(volScalarField& field) { } } -// rho_old need special treatment -void uploadRhoOld(dfMatrixDataBase& dfDataBase, const volScalarField& field) { +// rho_old need special treatment: it use h_xxx of rho +void uploadRegisteredRhoOld(dfMatrixDataBase& dfDataBase, const volScalarField& field) { double *h_internal_field = dfDataBase.getFieldPointer("rho", location::cpu, position::internal); double 
*h_boundary_field = dfDataBase.getFieldPointer("rho", location::cpu, position::boundary); double *d_internal_field = dfDataBase.getFieldPointer("rho_old", location::gpu, position::internal); @@ -117,7 +117,7 @@ void uploadRhoOld(dfMatrixDataBase& dfDataBase, const volScalarField& field) { checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); } -void uploadVolScalar(dfMatrixDataBase& dfDataBase, const volScalarField& field, const char* fieldAlias) { +void uploadRegisteredVolScalar(dfMatrixDataBase& dfDataBase, const volScalarField& field, const char* fieldAlias) { double *h_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::internal); double *h_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::boundary); double *d_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::internal); @@ -138,7 +138,7 @@ void uploadVolScalar(dfMatrixDataBase& dfDataBase, const volScalarField& field, checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); } -void uploadVolVector(dfMatrixDataBase& dfDataBase, const volVectorField& field, const char* fieldAlias) { +void uploadRegisteredVolVector(dfMatrixDataBase& dfDataBase, const volVectorField& field, const char* fieldAlias) { double *h_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::internal); double *h_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::boundary); double *d_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::internal); @@ -159,7 +159,7 @@ void uploadVolVector(dfMatrixDataBase& dfDataBase, const volVectorField& field, checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_vec_bytes, cudaMemcpyHostToDevice, 
dfDataBase.stream)); } -void uploadSurfaceScalar(dfMatrixDataBase& dfDataBase, const surfaceScalarField& field, const char* fieldAlias) { +void uploadRegisteredSurfaceScalar(dfMatrixDataBase& dfDataBase, const surfaceScalarField& field, const char* fieldAlias) { double *h_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::internal); double *h_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::boundary); double *d_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::internal); @@ -180,31 +180,67 @@ void uploadSurfaceScalar(dfMatrixDataBase& dfDataBase, const surfaceScalarField& checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); } +void uploadVolScalar(dfMatrixDataBase& dfDataBase, const volScalarField& field, double *d_field, double *d_boundary_field) { + std::vector h_boundary_field; + h_boundary_field.resize(dfDataBase.num_boundary_surfaces); + int offset = 0; + forAll(field.boundaryField(), patchi) + { + const auto& patchField = field.boundaryField()[patchi]; + int patchsize = patchField.size(); + memcpy(h_boundary_field.data() + offset, &patchField[0], patchsize * sizeof(double)); + offset += patchsize; + } + checkCudaErrors(cudaMemcpyAsync(d_field, &field[0], dfDataBase.cell_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field.data(), dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); +} + + void buildTestGPUDataBaseVector(const dfMatrixDataBase& dfDataBase, testGPUDataBase& testData, const volVectorField& field, bool lowerFlag, bool upperFlag, bool diagFlag, bool sourceFlag, bool internalCoeffsFlag, bool boundaryCoeffsFlag, bool valueInternalCoeffsFlag, bool valueBoundaryCoeffsFlag, bool gradientInternalCoeffsFlag, bool gradientBoundaryCoeffsFlag) { // ldu - if 
(lowerFlag) + if (lowerFlag) { checkCudaErrors(cudaMalloc((void**)&testData.d_lower, dfDataBase.surface_value_bytes)); - if (upperFlag) + checkCudaErrors(cudaMemset(testData.d_lower, 0, dfDataBase.surface_value_bytes)); + } + if (upperFlag) { checkCudaErrors(cudaMalloc((void**)&testData.d_upper, dfDataBase.surface_value_bytes)); - if (diagFlag) + checkCudaErrors(cudaMemset(testData.d_upper, 0, dfDataBase.surface_value_bytes)); + } + if (diagFlag) { checkCudaErrors(cudaMalloc((void**)&testData.d_diag, dfDataBase.cell_value_bytes)); - if (sourceFlag) + checkCudaErrors(cudaMemset(testData.d_diag, 0, dfDataBase.cell_value_bytes)); + } + if (sourceFlag) { checkCudaErrors(cudaMalloc((void**)&testData.d_source, dfDataBase.cell_value_vec_bytes)); - if (internalCoeffsFlag) + checkCudaErrors(cudaMemset(testData.d_source, 0, dfDataBase.cell_value_vec_bytes)); + } + if (internalCoeffsFlag) { checkCudaErrors(cudaMalloc((void**)&testData.d_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - if (boundaryCoeffsFlag) + checkCudaErrors(cudaMemset(testData.d_internal_coeffs, 0, dfDataBase.boundary_surface_value_vec_bytes)); + } + if (boundaryCoeffsFlag) { checkCudaErrors(cudaMalloc((void**)&testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMemset(testData.d_boundary_coeffs, 0, dfDataBase.boundary_surface_value_vec_bytes)); + } // boundary coeffs - if (valueInternalCoeffsFlag) + if (valueInternalCoeffsFlag) { checkCudaErrors(cudaMalloc((void**)&testData.d_value_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - if (valueBoundaryCoeffsFlag) + checkCudaErrors(cudaMemset(testData.d_value_internal_coeffs, 0, dfDataBase.boundary_surface_value_vec_bytes)); + } + if (valueBoundaryCoeffsFlag) { checkCudaErrors(cudaMalloc((void**)&testData.d_value_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - if (gradientInternalCoeffsFlag) + checkCudaErrors(cudaMemset(testData.d_value_boundary_coeffs, 0, 
dfDataBase.boundary_surface_value_vec_bytes)); + } + if (gradientInternalCoeffsFlag) { checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - if (gradientBoundaryCoeffsFlag) + checkCudaErrors(cudaMemset(testData.d_gradient_internal_coeffs, 0, dfDataBase.boundary_surface_value_vec_bytes)); + } + if (gradientBoundaryCoeffsFlag) { checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMemset(testData.d_gradient_boundary_coeffs, 0, dfDataBase.boundary_surface_value_vec_bytes)); + } // patch type testData.patch_type.resize(dfDataBase.num_patches); forAll(field.boundaryField(), patchi) @@ -291,9 +327,9 @@ void test_fvm_ddt_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc // prepare for run GPU // prepare rho, rho.old, U - uploadVolScalar(dfDataBase, rho, "rho"); - uploadRhoOld(dfDataBase, rho.oldTime()); - uploadVolVector(dfDataBase, U.oldTime(), "u"); + uploadRegisteredVolScalar(dfDataBase, rho, "rho"); + uploadRegisteredRhoOld(dfDataBase, rho.oldTime()); + uploadRegisteredVolVector(dfDataBase, U.oldTime(), "u"); // prepare testData testGPUDataBase testData; // only use diag and source @@ -320,7 +356,7 @@ void test_fvm_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfa // prepare for run GPU // prepare phi field - uploadSurfaceScalar(dfDataBase, phi, "phi"); + uploadRegisteredSurfaceScalar(dfDataBase, phi, "phi"); // prepare testData testGPUDataBase testData; // not use source @@ -342,3 +378,49 @@ void test_fvm_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfa bool printFlag = false; compareResultVector(dfDataBase, testData, dfMatrix, printFlag); } + +void test_fvm_laplacian_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, + volScalarField& gamma, volVectorField& U, initType type) +{ + if (type == initType::randomInit) { + randomInitVolScalar(gamma); + } + + // run 
CPU + fvVectorMatrix dfMatrix = fvm::laplacian(gamma, U); + + // prepare for run GPU + // prepare gamma on GPU + double *d_gamma = nullptr; + double *d_boundary_gamma = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_gamma, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_gamma, dfDataBase.boundary_surface_value_bytes)); + uploadVolScalar(dfDataBase, gamma, d_gamma, d_boundary_gamma); + // prepare testData + testGPUDataBase testData; + // not use source + // value_internal_coeffs, value_boundary_coeffs are not needed actually, but updateBoundaryCoeffsVector will access them + buildTestGPUDataBaseVector(dfDataBase, testData, U, true, true, true, false, true, true, true, true, true, true); + // prepare boundary coeffs + // TODO: updating boundary coeffs should be complemented later + updateBoundaryCoeffsVector(dfDataBase, testData); + + // run GPU + fvm_laplacian_vector(dfDataBase.stream, dfDataBase.num_surfaces, dfDataBase.num_boundary_surfaces, + dfDataBase.d_owner, dfDataBase.d_neighbor, + dfDataBase.d_weight, dfDataBase.d_mag_sf, dfDataBase.d_delta_coeffs, d_gamma, + testData.d_lower, testData.d_upper, testData.d_diag, // end for internal + dfDataBase.num_patches, dfDataBase.patch_size.data(), testData.patch_type.data(), + dfDataBase.d_boundary_mag_sf, d_boundary_gamma, + testData.d_gradient_internal_coeffs, testData.d_gradient_boundary_coeffs, + testData.d_internal_coeffs, testData.d_boundary_coeffs); + + // compare result + bool printFlag = false; + compareResultVector(dfDataBase, testData, dfMatrix, printFlag); + + // free resources + checkCudaErrors(cudaFree(d_gamma)); + checkCudaErrors(cudaFree(d_boundary_gamma)); +} + diff --git a/GPUTest/unittest.C b/GPUTest/unittest.C index dd1f29e53..78608e46e 100644 --- a/GPUTest/unittest.C +++ b/GPUTest/unittest.C @@ -103,8 +103,13 @@ int main(int argc, char *argv[]) // unittest of fvm::div(phi, U) test_fvm_div_vector(dfDataBase, mesh, phi, U, initType::original); 
test_fvm_div_vector(dfDataBase, mesh, phi, U, initType::randomInit); + // unittest of fvm::laplacian(gamma, U) + const tmp nuEff_tmp(turbulence->nuEff()); + const volScalarField& nuEff = nuEff_tmp(); + volScalarField gamma = rho * nuEff; + test_fvm_laplacian_vector(dfDataBase, mesh, gamma, U, initType::original); + test_fvm_laplacian_vector(dfDataBase, mesh, gamma, U, initType::randomInit); } return 0; } - diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index 40404944d..617a2e787 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -28,7 +28,14 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs); -void fvm_laplacian(); +void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma, + double *lower, double *upper, double *diag, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const double *boundary_mag_sf, const double *boundary_gamma, + const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, + double *internal_coeffs, double *boundary_coeffs); // fvc ops diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index 1728a91dc..5b63cad61 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -80,14 +80,16 @@ __global__ void fvm_div_vector_internal(int num_surfaces, double w = weight[index]; double f = phi[index]; - lower[index] += (-w) * f; - upper[index] += (1 - w) * f; + double lower_value = (-w) * f; + double upper_value = (1 - w) * f; + lower[index] += lower_value; + upper[index] += upper_value; // if (index == 0) printf("index = 0, lower: %.16lf, upper:%.16lf\n", lower[index], 
upper[index]); - int l = lower_index[index]; - int u = upper_index[index]; - atomicAdd(&(diag[l]), w * f); - atomicAdd(&(diag[u]), (w - 1) * f); + int owner = lower_index[index]; + int neighbor = upper_index[index]; + atomicAdd(&(diag[owner]), -lower_value); + atomicAdd(&(diag[neighbor]), -upper_value); } __global__ void fvm_div_vector_boundary(int num, int offset, @@ -108,6 +110,53 @@ __global__ void fvm_div_vector_boundary(int num, int offset, boundary_coeffs[start_index * 3 + 2] += boundary_f * value_boundary_coeffs[start_index * 3 + 2]; } +__global__ void fvm_laplacian_vector_internal(int num_surfaces, + const int *lower_index, const int *upper_index, + const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma, + double *lower, double *upper, double *diag) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_surfaces) + return; + + int owner = lower_index[index]; + int neighbor = upper_index[index]; + + double w = weight[index]; + double upper_face_gamma = w * gamma[owner] + (1 - w) * gamma[neighbor]; + double upper_value = upper_face_gamma * mag_sf[index] * delta_coeffs[index]; + + // laplacian doesn't use the original lower, but use lower = upper + //double lower_face_gamma = w * gamma[neighbor] + (1 - w) * gamma[owner]; + //double lower_value = lower_face_gamma * mag_sf[index] * delta_coeffs[index]; + double lower_value = upper_value; + + lower[index] += lower_value; + upper[index] += upper_value; + + atomicAdd(&(diag[owner]), -lower_value); + atomicAdd(&(diag[neighbor]), -upper_value); +} + +__global__ void fvm_laplacian_vector_boundary(int num, int offset, + const double *boundary_mag_sf, const double *boundary_gamma, + const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, + double *internal_coeffs, double *boundary_coeffs) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num) + return; + + int start_index = offset + index; + double boundary_value = 
boundary_gamma[start_index] * boundary_mag_sf[start_index]; + internal_coeffs[start_index * 3 + 0] += boundary_value * gradient_internal_coeffs[start_index * 3 + 0]; + internal_coeffs[start_index * 3 + 1] += boundary_value * gradient_internal_coeffs[start_index * 3 + 1]; + internal_coeffs[start_index * 3 + 2] += boundary_value * gradient_internal_coeffs[start_index * 3 + 2]; + boundary_coeffs[start_index * 3 + 0] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 0]; + boundary_coeffs[start_index * 3 + 1] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 1]; + boundary_coeffs[start_index * 3 + 2] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 2]; +} + void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output) { size_t threads_per_block = 256; @@ -178,8 +227,7 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, size_t blocks_per_grid = 1; blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - fvm_div_vector_internal<<>>(num_surfaces, - lowerAddr, upperAddr, + fvm_div_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, phi, weight, lower, upper, diag); int offset = 0; @@ -200,3 +248,37 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, } } +void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, // TODO: num_boundary_surfaces may not be in use + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma, + double *lower, double *upper, double *diag, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const double *boundary_mag_sf, const double *boundary_gamma, + const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, + double *internal_coeffs, double *boundary_coeffs) +{ + size_t threads_per_block = 1024; + size_t 
blocks_per_grid = 1; + + blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + fvm_laplacian_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, + weight, mag_sf, delta_coeffs, gamma, lower, upper, diag); + + int offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 256; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just basic patch type now + if (patch_type[i] == boundaryConditions::zeroGradient + || patch_type[i] == boundaryConditions::fixedValue) { + // TODO: just vector version now + fvm_laplacian_vector_boundary<<>>(patch_size[i], offset, + boundary_mag_sf, boundary_gamma, gradient_internal_coeffs, gradient_boundary_coeffs, + internal_coeffs, boundary_coeffs); + } else if (0) { + // xxx + } + offset += patch_size[i]; + } +} + From cc7223d9a8bf13d58a7ec000423958acde27b50f Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Mon, 7 Aug 2023 08:40:42 +0000 Subject: [PATCH 09/25] implement fvc::ddt(rho, K) and add unittest for it; fix several old bugs; --- GPUTest/GPUTestBase.H | 106 +++++++++++++++++++++++++++++++++++++- GPUTest/unittest.C | 7 +++ src_gpu/dfMatrixOpBase.H | 8 +-- src_gpu/dfMatrixOpBase.cu | 29 +++++++++-- 4 files changed, 142 insertions(+), 8 deletions(-) diff --git a/GPUTest/GPUTestBase.H b/GPUTest/GPUTestBase.H index 83c9976f3..b6b7bba62 100644 --- a/GPUTest/GPUTestBase.H +++ b/GPUTest/GPUTestBase.H @@ -195,6 +195,58 @@ void uploadVolScalar(dfMatrixDataBase& dfDataBase, const volScalarField& field, checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field.data(), dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); } +void buildTestGPUDataBaseScalar(const dfMatrixDataBase& dfDataBase, testGPUDataBase& testData, const volScalarField& field, + bool lowerFlag, bool upperFlag, bool diagFlag, bool sourceFlag, bool internalCoeffsFlag, bool boundaryCoeffsFlag, + bool valueInternalCoeffsFlag, bool 
valueBoundaryCoeffsFlag, bool gradientInternalCoeffsFlag, bool gradientBoundaryCoeffsFlag) { + // ldu + if (lowerFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_lower, dfDataBase.surface_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_lower, 0, dfDataBase.surface_value_bytes)); + } + if (upperFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_upper, dfDataBase.surface_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_upper, 0, dfDataBase.surface_value_bytes)); + } + if (diagFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_diag, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_diag, 0, dfDataBase.cell_value_bytes)); + } + if (sourceFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_source, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_source, 0, dfDataBase.cell_value_bytes)); + } + if (internalCoeffsFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_internal_coeffs, dfDataBase.boundary_surface_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_internal_coeffs, 0, dfDataBase.boundary_surface_value_bytes)); + } + if (boundaryCoeffsFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_boundary_coeffs, 0, dfDataBase.boundary_surface_value_bytes)); + } + // boundary coeffs + if (valueInternalCoeffsFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_value_internal_coeffs, dfDataBase.boundary_surface_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_value_internal_coeffs, 0, dfDataBase.boundary_surface_value_bytes)); + } + if (valueBoundaryCoeffsFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_value_boundary_coeffs, dfDataBase.boundary_surface_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_value_boundary_coeffs, 0, dfDataBase.boundary_surface_value_bytes)); + } + if (gradientInternalCoeffsFlag) { + 
checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_internal_coeffs, dfDataBase.boundary_surface_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_gradient_internal_coeffs, 0, dfDataBase.boundary_surface_value_bytes)); + } + if (gradientBoundaryCoeffsFlag) { + checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_boundary_coeffs, dfDataBase.boundary_surface_value_bytes)); + checkCudaErrors(cudaMemset(testData.d_gradient_boundary_coeffs, 0, dfDataBase.boundary_surface_value_bytes)); + } + // patch type + testData.patch_type.resize(dfDataBase.num_patches); + forAll(field.boundaryField(), patchi) + { + constructBoundarySelectorPerPatch(&(testData.patch_type[patchi]), field.boundaryField()[patchi].type()); + } +} void buildTestGPUDataBaseVector(const dfMatrixDataBase& dfDataBase, testGPUDataBase& testData, const volVectorField& field, bool lowerFlag, bool upperFlag, bool diagFlag, bool sourceFlag, bool internalCoeffsFlag, bool boundaryCoeffsFlag, @@ -318,8 +370,8 @@ void test_fvm_ddt_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc if (type == initType::randomInit) { // random init rho and rho.old - randomInitVolScalar(rho); rho.oldTime(); + randomInitVolScalar(rho); } // run CPU @@ -379,6 +431,7 @@ void test_fvm_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfa compareResultVector(dfDataBase, testData, dfMatrix, printFlag); } +// unittest of fvm::laplacian(gamma, vf) void test_fvm_laplacian_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& gamma, volVectorField& U, initType type) { @@ -406,7 +459,7 @@ void test_fvm_laplacian_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, updateBoundaryCoeffsVector(dfDataBase, testData); // run GPU - fvm_laplacian_vector(dfDataBase.stream, dfDataBase.num_surfaces, dfDataBase.num_boundary_surfaces, + fvm_laplacian_vector(dfDataBase.stream, dfDataBase.num_surfaces, dfDataBase.d_owner, dfDataBase.d_neighbor, dfDataBase.d_weight, dfDataBase.d_mag_sf, 
dfDataBase.d_delta_coeffs, d_gamma, testData.d_lower, testData.d_upper, testData.d_diag, // end for internal @@ -424,3 +477,52 @@ void test_fvm_laplacian_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, checkCudaErrors(cudaFree(d_boundary_gamma)); } +// unittest of fvc::ddt(rho, K) +void test_fvc_ddt_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& rho, volScalarField& K, initType type) { + + if (type == initType::randomInit) { + // random init rho and rho.old + rho.oldTime(); + randomInitVolScalar(rho); + K.oldTime(); + randomInitVolScalar(K); + } + + // run CPU + volScalarField fvc_ouput_scalar = fvc::ddt(rho, K); + + // prepare for run GPU + // prepare rho, rho.old on GPU + uploadRegisteredVolScalar(dfDataBase, rho, "rho"); + uploadRegisteredRhoOld(dfDataBase, rho.oldTime()); + // prepare K, K_old on GPU + double *d_K = nullptr; + double *d_K_old = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_K, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_K_old, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMemcpyAsync(d_K, &K[0], dfDataBase.cell_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(d_K_old, &K.oldTime()[0], dfDataBase.cell_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + // there is no need for fvc ops to build testGPUDataBase, just build d_fvc_ouput_scalar directly. + double *d_fvc_ouput_scalar = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_scalar, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_scalar, 0, dfDataBase.cell_value_bytes)); + // run GPU + // fvc_ddt doesn't consider to add fvc_output to source yet, which needs (fvc_output * volume * sign). 
+ fvc_ddt_scalar(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.rdelta_t, + dfDataBase.d_rho, dfDataBase.d_rho_old, d_K, d_K_old, + d_fvc_ouput_scalar); + + // compare result + bool printFlag = false; + std::vector h_fvc_ouput_scalar; + h_fvc_ouput_scalar.resize(dfDataBase.num_cells); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_scalar.data(), d_fvc_ouput_scalar, dfDataBase.cell_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells, &fvc_ouput_scalar[0], h_fvc_ouput_scalar.data(), 1e-12, printFlag); + + // free resources + checkCudaErrors(cudaFree(d_K)); + checkCudaErrors(cudaFree(d_K_old)); + checkCudaErrors(cudaFree(d_fvc_ouput_scalar)); +} + + diff --git a/GPUTest/unittest.C b/GPUTest/unittest.C index 78608e46e..cf58ec093 100644 --- a/GPUTest/unittest.C +++ b/GPUTest/unittest.C @@ -100,15 +100,22 @@ int main(int argc, char *argv[]) // unittest of fvm::ddt(rho, U) test_fvm_ddt_vector(dfDataBase, mesh, rho, U, initType::original); test_fvm_ddt_vector(dfDataBase, mesh, rho, U, initType::randomInit); + // unittest of fvm::div(phi, U) test_fvm_div_vector(dfDataBase, mesh, phi, U, initType::original); test_fvm_div_vector(dfDataBase, mesh, phi, U, initType::randomInit); + // unittest of fvm::laplacian(gamma, U) const tmp nuEff_tmp(turbulence->nuEff()); const volScalarField& nuEff = nuEff_tmp(); volScalarField gamma = rho * nuEff; test_fvm_laplacian_vector(dfDataBase, mesh, gamma, U, initType::original); test_fvm_laplacian_vector(dfDataBase, mesh, gamma, U, initType::randomInit); + + // unittest of fvc::ddt(rho, K) + K = 0.5*magSqr(U); + test_fvc_ddt_scalar(dfDataBase, mesh, rho, K, initType::original); + test_fvc_ddt_scalar(dfDataBase, mesh, rho, K, initType::randomInit); } return 0; } diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index 617a2e787..9de229b14 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -28,7 +28,7 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, 
const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs); -void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, +void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma, double *lower, double *upper, double *diag, // end for internal @@ -38,8 +38,10 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundar double *internal_coeffs, double *boundary_coeffs); // fvc ops - -void fvc_ddt(); +// fvc_ddt doesn't consider to add fvc_output to source yet, which needs (fvc_output * volume * sign). +void fvc_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT, + const double *rho, const double *rho_old, const double *vf, const double *vf_old, + double *output); // void fvc_grad_surface(); // diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index 5b63cad61..c8583cd17 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -157,6 +157,18 @@ __global__ void fvm_laplacian_vector_boundary(int num, int offset, boundary_coeffs[start_index * 3 + 2] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 2]; } +__global__ void fvc_ddt_scalar_kernel(int num_cells, double rDeltaT, + const double *rho, const double *rho_old, const double *vf, const double *vf_old, + double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // TODO: skip moving + output[index] += rDeltaT * (rho[index] * vf[index] - rho_old[index] * vf_old[index]); +} + void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output) { size_t threads_per_block = 256; @@ -197,10 +209,9 @@ void update_boundary_coeffs_vector(cudaStream_t stream, int num_patches, if (patch_type[i] == 
boundaryConditions::zeroGradient) { update_boundary_coeffs_zeroGradient_vector<<>>(patch_size[i], offset, value_internal_coeffs, value_boundary_coeffs, gradient_internal_coeffs, gradient_boundary_coeffs); - } else if (patch_type[i] == boundaryConditions::fixedValue) { - // xxx } else if (0) { // xxx + fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); } offset += patch_size[i]; } @@ -243,12 +254,13 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, internal_coeffs, boundary_coeffs); } else if (0) { // xxx + fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); } offset += patch_size[i]; } } -void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, // TODO: num_boundary_surfaces may not be in use +void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma, double *lower, double *upper, double *diag, // end for internal @@ -277,8 +289,19 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundar internal_coeffs, boundary_coeffs); } else if (0) { // xxx + fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); } offset += patch_size[i]; } } +void fvc_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT, + const double *rho, const double *rho_old, const double *vf, const double *vf_old, + double *output) +{ + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvc_ddt_scalar_kernel<<>>(num_cells, + rDeltaT, rho, rho_old, vf, vf_old, output); +} + From 15e30b43e7586c00b9cbd67d3724a3058ed91b47 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Mon, 7 Aug 2023 11:09:26 +0000 Subject: [PATCH 10/25] workaround to fix a bug of floating-point numerical error for fvc_ddt --- 
GPUTest/GPUTestBase.H | 27 +++++++++++++-------------- GPUTest/unittest.C | 9 +++++++++ src_gpu/dfMatrixDataBase.H | 2 ++ src_gpu/dfMatrixOpBase.cu | 14 +++++++++++++- 4 files changed, 37 insertions(+), 15 deletions(-) diff --git a/GPUTest/GPUTestBase.H b/GPUTest/GPUTestBase.H index b6b7bba62..360abdf2c 100644 --- a/GPUTest/GPUTestBase.H +++ b/GPUTest/GPUTestBase.H @@ -367,11 +367,10 @@ void compareResultVector(const dfMatrixDataBase& dfDataBase, const testGPUDataBa // unittest of fvm::ddt(rho, U) void test_fvm_ddt_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& rho, volVectorField& U, initType type) { - if (type == initType::randomInit) { - // random init rho and rho.old - rho.oldTime(); - randomInitVolScalar(rho); + // random init rho and rho.old + rho.oldTime(); + randomInitVolScalar(rho); } // run CPU @@ -399,8 +398,8 @@ void test_fvm_ddt_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc // unittest of fvm::div(phi, U) void test_fvm_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfaceScalarField& phi, volVectorField& U, initType type) { if (type == initType::randomInit) { - randomInitSurfaceScalar(phi); - // TODO: random init weight failed, weight is const. 
+ phi.oldTime(); + randomInitSurfaceScalar(phi); } // run CPU @@ -436,7 +435,8 @@ void test_fvm_laplacian_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& gamma, volVectorField& U, initType type) { if (type == initType::randomInit) { - randomInitVolScalar(gamma); + gamma.oldTime(); + randomInitVolScalar(gamma); } // run CPU @@ -479,13 +479,12 @@ void test_fvm_laplacian_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, // unittest of fvc::ddt(rho, K) void test_fvc_ddt_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& rho, volScalarField& K, initType type) { - if (type == initType::randomInit) { - // random init rho and rho.old - rho.oldTime(); - randomInitVolScalar(rho); - K.oldTime(); - randomInitVolScalar(K); + // random init rho and rho.old + rho.oldTime(); + randomInitVolScalar(rho); + K.oldTime(); + randomInitVolScalar(K); } // run CPU @@ -517,7 +516,7 @@ void test_fvc_ddt_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc std::vector h_fvc_ouput_scalar; h_fvc_ouput_scalar.resize(dfDataBase.num_cells); checkCudaErrors(cudaMemcpy(h_fvc_ouput_scalar.data(), d_fvc_ouput_scalar, dfDataBase.cell_value_bytes, cudaMemcpyDeviceToHost)); - checkVectorEqual(dfDataBase.num_cells, &fvc_ouput_scalar[0], h_fvc_ouput_scalar.data(), 1e-12, printFlag); + checkVectorEqual(dfDataBase.num_cells, &fvc_ouput_scalar[0], h_fvc_ouput_scalar.data(), 1e-14, printFlag); // free resources checkCudaErrors(cudaFree(d_K)); diff --git a/GPUTest/unittest.C b/GPUTest/unittest.C index cf58ec093..edd5b7856 100644 --- a/GPUTest/unittest.C +++ b/GPUTest/unittest.C @@ -96,26 +96,35 @@ int main(int argc, char *argv[]) } createGPUBase(mesh, Y); + DEBUG_TRACE; // unittest of fvm::ddt(rho, U) test_fvm_ddt_vector(dfDataBase, mesh, rho, U, initType::original); + DEBUG_TRACE; test_fvm_ddt_vector(dfDataBase, mesh, rho, U, initType::randomInit); + DEBUG_TRACE; // unittest of fvm::div(phi, U) test_fvm_div_vector(dfDataBase, mesh, phi, U, 
initType::original); + DEBUG_TRACE; test_fvm_div_vector(dfDataBase, mesh, phi, U, initType::randomInit); + DEBUG_TRACE; // unittest of fvm::laplacian(gamma, U) const tmp nuEff_tmp(turbulence->nuEff()); const volScalarField& nuEff = nuEff_tmp(); volScalarField gamma = rho * nuEff; test_fvm_laplacian_vector(dfDataBase, mesh, gamma, U, initType::original); + DEBUG_TRACE; test_fvm_laplacian_vector(dfDataBase, mesh, gamma, U, initType::randomInit); + DEBUG_TRACE; // unittest of fvc::ddt(rho, K) K = 0.5*magSqr(U); test_fvc_ddt_scalar(dfDataBase, mesh, rho, K, initType::original); + DEBUG_TRACE; test_fvc_ddt_scalar(dfDataBase, mesh, rho, K, initType::randomInit); + DEBUG_TRACE; } return 0; } diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index cce7e6adc..8aee29b45 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -14,6 +14,8 @@ #include #include +#define DEBUG_TRACE fprintf(stderr, "%s %d\n", __FILE__, __LINE__); + static const char *_cudaGetErrorEnum(cudaError_t error) { return cudaGetErrorName(error); } diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index c8583cd17..e8836060f 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -165,8 +165,20 @@ __global__ void fvc_ddt_scalar_kernel(int num_cells, double rDeltaT, if (index >= num_cells) return; + double val_new = rho[index] * vf[index]; + double val_old = rho_old[index] * vf_old[index]; // TODO: skip moving - output[index] += rDeltaT * (rho[index] * vf[index] - rho_old[index] * vf_old[index]); + // TODO: wyr + // for the case of rho = rho_old and vf = vf_old, the floating-point numerical problem will be exposed. 
+ // it expect zero as output, but the gpu result get a sub-normal minimal value for (val_new - val_old), + // which smaller than 1e-16, and then enlarged by rDeltaT (1e6) + // then the comparison of cpu result and gpu result will failed with relative error: inf, + // e.g.: + // cpu data: 0.0000000000000000, gpu data: 0.0000000000298050, relative error: inf + // if I add the print line for intermediate variables of val_new and val_old, the problem disappears. + // It seems that print line will change the compiler behavior, maybe avoiding the fma optimization of compiler. + if (index == -1) printf("index = 0, val_new: %.40lf, val_old: %.40lf\n", val_new, val_old); + output[index] += rDeltaT * (val_new - val_old); } void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output) From a8c68c69a2419f7e9de16a9104f699f37ac93f4d Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Mon, 7 Aug 2023 20:24:16 +0800 Subject: [PATCH 11/25] fix occasional errors of fvm fvm::ddt and fvc::ddt: caused by re-using h_rho between async-uploading rho and async-uploading rho.old --- GPUTest/GPUTestBase.H | 26 ++------------------------ src_gpu/dfMatrixDataBase.H | 2 ++ src_gpu/dfMatrixDataBase.cu | 4 ++++ 3 files changed, 8 insertions(+), 24 deletions(-) diff --git a/GPUTest/GPUTestBase.H b/GPUTest/GPUTestBase.H index 360abdf2c..efd6ba4fd 100644 --- a/GPUTest/GPUTestBase.H +++ b/GPUTest/GPUTestBase.H @@ -95,28 +95,6 @@ void randomInitVolScalar(volScalarField& field) { } } -// rho_old need special treatment: it use h_xxx of rho -void uploadRegisteredRhoOld(dfMatrixDataBase& dfDataBase, const volScalarField& field) { - double *h_internal_field = dfDataBase.getFieldPointer("rho", location::cpu, position::internal); - double *h_boundary_field = dfDataBase.getFieldPointer("rho", location::cpu, position::boundary); - double *d_internal_field = dfDataBase.getFieldPointer("rho_old", location::gpu, position::internal); - double *d_boundary_field = 
dfDataBase.getFieldPointer("rho_old", location::gpu, position::boundary); - // internal - memcpy(h_internal_field, &field[0], dfDataBase.cell_value_bytes); - // boundary - int offset = 0; - forAll(field.boundaryField(), patchi) - { - const auto& patchField = field.boundaryField()[patchi]; - int patchsize = patchField.size(); - memcpy(h_boundary_field + offset, &patchField[0], patchsize * sizeof(double)); - offset += patchsize; - } - // transfer - checkCudaErrors(cudaMemcpyAsync(d_internal_field, h_internal_field, dfDataBase.cell_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); -} - void uploadRegisteredVolScalar(dfMatrixDataBase& dfDataBase, const volScalarField& field, const char* fieldAlias) { double *h_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::internal); double *h_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::boundary); @@ -379,7 +357,7 @@ void test_fvm_ddt_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc // prepare for run GPU // prepare rho, rho.old, U uploadRegisteredVolScalar(dfDataBase, rho, "rho"); - uploadRegisteredRhoOld(dfDataBase, rho.oldTime()); + uploadRegisteredVolScalar(dfDataBase, rho.oldTime(), "rho_old"); uploadRegisteredVolVector(dfDataBase, U.oldTime(), "u"); // prepare testData testGPUDataBase testData; @@ -493,7 +471,7 @@ void test_fvc_ddt_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc // prepare for run GPU // prepare rho, rho.old on GPU uploadRegisteredVolScalar(dfDataBase, rho, "rho"); - uploadRegisteredRhoOld(dfDataBase, rho.oldTime()); + uploadRegisteredVolScalar(dfDataBase, rho.oldTime(), "rho_old"); // prepare K, K_old on GPU double *d_K = nullptr; double *d_K_old = nullptr; diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 8aee29b45..4e0bd4cbe 
100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -144,6 +144,7 @@ struct dfMatrixDataBase double *d_phi = nullptr; // computed on GPU, used on CPU, need memcpyd2h - host double *h_rho = nullptr; + double *h_rho_old = nullptr; double *h_u= nullptr; double *h_y= nullptr; double *h_he= nullptr; @@ -169,6 +170,7 @@ struct dfMatrixDataBase double *d_boundary_phi = nullptr; // computed on GPU, used on CPU, need memcpyd2h - host double *h_boundary_rho = nullptr; + double *h_boundary_rho_old = nullptr; double *h_boundary_u= nullptr; double *h_boundary_y= nullptr; double *h_boundary_he= nullptr; diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index 341241bf4..b426201a2 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -243,10 +243,12 @@ void dfMatrixDataBase::createNonConstantFieldsInternal() { // computed on GPU, used on CPU, need memcpyd2h checkCudaErrors(cudaMallocHost((void**)&h_rho, cell_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_rho_old, cell_value_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_u, cell_value_vec_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_y, cell_value_bytes * num_species)); checkCudaErrors(cudaMallocHost((void**)&h_he, cell_value_bytes)); fieldPointerMap["h_rho"] = h_rho; + fieldPointerMap["h_rho_old"] = h_rho_old; fieldPointerMap["h_u"] = h_u; fieldPointerMap["h_y"] = h_y; fieldPointerMap["h_he"] = h_he; @@ -282,10 +284,12 @@ void dfMatrixDataBase::createNonConstantFieldsBoundary() { // computed on GPU, used on CPU, need memcpyd2h checkCudaErrors(cudaMallocHost((void**)&h_boundary_rho, boundary_surface_value_bytes)); + checkCudaErrors(cudaMallocHost((void**)&h_boundary_rho_old, boundary_surface_value_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_boundary_u, boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMallocHost((void**)&h_boundary_y, boundary_surface_value_bytes * num_species)); 
checkCudaErrors(cudaMallocHost((void**)&h_boundary_he, boundary_surface_value_bytes)); fieldPointerMap["h_boundary_rho"] = h_boundary_rho; + fieldPointerMap["h_boundary_rho_old"] = h_boundary_rho_old; fieldPointerMap["h_boundary_u"] = h_boundary_u; fieldPointerMap["h_boundary_y"] = h_boundary_y; fieldPointerMap["h_boundary_he"] = h_boundary_he; From 24530375a9d3f09ff518967a7c42237c962aae2e Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Mon, 7 Aug 2023 21:07:07 +0800 Subject: [PATCH 12/25] workaround way two (use volatile) to avoid floating-point numerical errors, which may be caused by fma contraction --- src_gpu/dfMatrixOpBase.cu | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index e8836060f..934441c81 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -164,7 +164,8 @@ __global__ void fvc_ddt_scalar_kernel(int num_cells, double rDeltaT, int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_cells) return; - + /* + // workaround way1 (use printf): double val_new = rho[index] * vf[index]; double val_old = rho_old[index] * vf_old[index]; // TODO: skip moving @@ -179,6 +180,12 @@ __global__ void fvc_ddt_scalar_kernel(int num_cells, double rDeltaT, // It seems that print line will change the compiler behavior, maybe avoiding the fma optimization of compiler. if (index == -1) printf("index = 0, val_new: %.40lf, val_old: %.40lf\n", val_new, val_old); output[index] += rDeltaT * (val_new - val_old); + */ + // workaround way2 (use volatile): + // volatile will change the compiler behavior, maybe avoiding the fma optimization of compiler. 
+ volatile double val_new = rho[index] * vf[index]; + volatile double val_old = rho_old[index] * vf_old[index]; + output[index] += rDeltaT * (val_new - val_old); } void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output) From ae31072113dfe38cd5e32ca5468cba90ddc7faeb Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Mon, 7 Aug 2023 21:14:20 +0800 Subject: [PATCH 13/25] workaround way three (use nvcc option -fmad=false) to avoid floating-point numerical errors, which may be caused by fma contraction --- src_gpu/CMakeLists.txt | 2 +- src_gpu/dfMatrixOpBase.cu | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src_gpu/CMakeLists.txt b/src_gpu/CMakeLists.txt index ed9070476..03a7fe6db 100644 --- a/src_gpu/CMakeLists.txt +++ b/src_gpu/CMakeLists.txt @@ -12,7 +12,7 @@ find_package(MPI REQUIRED) find_package(CUDAToolkit REQUIRED) find_library(LIBAMGXSH amgxsh PATHS $ENV{AMGX_DIR}/build) -add_compile_options(-arch=sm_70) +add_compile_options(-arch=sm_70 -fmad=false) include_directories( ${MPI_INCLUDE_PATH} diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index 934441c81..f55b6895a 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -181,11 +181,15 @@ __global__ void fvc_ddt_scalar_kernel(int num_cells, double rDeltaT, if (index == -1) printf("index = 0, val_new: %.40lf, val_old: %.40lf\n", val_new, val_old); output[index] += rDeltaT * (val_new - val_old); */ + /* // workaround way2 (use volatile): // volatile will change the compiler behavior, maybe avoiding the fma optimization of compiler. 
volatile double val_new = rho[index] * vf[index]; volatile double val_old = rho_old[index] * vf_old[index]; output[index] += rDeltaT * (val_new - val_old); + */ + // workaround way3 (use nvcc option -fmad=false) + output[index] += rDeltaT * (rho[index] * vf[index] - rho_old[index] * vf_old[index]); } void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output) From b16100fdb8c4db1d15ea21afbbee90d1947e48e5 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Mon, 7 Aug 2023 23:55:14 +0800 Subject: [PATCH 14/25] use template to simplify unittest --- GPUTest/GPUTestBase.H | 344 +++++++++++++++++------------------------- 1 file changed, 138 insertions(+), 206 deletions(-) diff --git a/GPUTest/GPUTestBase.H b/GPUTest/GPUTestBase.H index efd6ba4fd..42a64cd51 100644 --- a/GPUTest/GPUTestBase.H +++ b/GPUTest/GPUTestBase.H @@ -41,194 +41,118 @@ struct testGPUDataBase { } }; -void randomInitSurfaceScalar(surfaceScalarField& field) { - // random init field value to (-0.5, 0.5) - // internal - double *field_internal_ptr = &field[0]; - std::vector init_field_internal; - init_field_internal.resize(dfDataBase.num_surfaces); - for (int i = 0; i < dfDataBase.num_surfaces; i++) { - init_field_internal[i] = (rand() % 10000 - 5000) / 10000.0; - } - memcpy(field_internal_ptr, init_field_internal.data(), dfDataBase.surface_value_bytes); - // boundary - int offset = 0; - forAll(field.boundaryField(), patchi) - { - auto& patchField = field.boundaryFieldRef()[patchi]; - int patchsize = patchField.size(); - double *field_boundary_ptr = &patchField[0]; - std::vector init_field_boundary; - init_field_boundary.resize(patchsize); - for (int i = 0; i < patchsize; i++) { - init_field_boundary[i] = (rand() % 10000 - 5000) / 10000.0; - } - memcpy(field_boundary_ptr, init_field_boundary.data(), patchsize * sizeof(double)); - offset += patchsize; - } +template +void getTypeInfo(size_t *stride, size_t *internal_size, size_t *boundary_size) { + if ((typeid(T) != 
typeid(surfaceScalarField)) + && (typeid(T) != typeid(surfaceVectorField)) + && (typeid(T) != typeid(volScalarField)) + && (typeid(T) != typeid(volVectorField))) { + fprintf(stderr, "ERROR! Unsupported field type()!\n"); + exit(EXIT_FAILURE); + } + bool isVol = ((typeid(T) == typeid(volScalarField)) || (typeid(T) == typeid(volVectorField))); + bool isVec = ((typeid(T) == typeid(surfaceVectorField)) || (typeid(T) == typeid(volVectorField))); + *stride = isVec ? 3 : 1; + *internal_size = (isVol ? dfDataBase.num_cells : dfDataBase.num_surfaces) * (*stride); + *boundary_size = dfDataBase.num_boundary_surfaces * (*stride); } -void randomInitVolScalar(volScalarField& field) { - // random init field value to (-0.5, 0.5) - // internal - double *field_internal_ptr = &field[0]; - std::vector init_field_internal; - init_field_internal.resize(dfDataBase.num_cells); - for (int i = 0; i < dfDataBase.num_cells; i++) { - init_field_internal[i] = (rand() % 10000 - 5000) / 10000.0; - } - memcpy(field_internal_ptr, init_field_internal.data(), dfDataBase.cell_value_bytes); - // boundary - int offset = 0; - forAll(field.boundaryField(), patchi) - { - auto& patchField = field.boundaryFieldRef()[patchi]; - int patchsize = patchField.size(); - double *field_boundary_ptr = &patchField[0]; - std::vector init_field_boundary; - init_field_boundary.resize(patchsize); - for (int i = 0; i < patchsize; i++) { - init_field_boundary[i] = (rand() % 10000 - 5000) / 10000.0; - } - memcpy(field_boundary_ptr, init_field_boundary.data(), patchsize * sizeof(double)); - offset += patchsize; - } -} +template +void randomInitField(T& field) { + size_t stride = 0; + size_t internal_size = 0; + size_t boundary_size = 0; + getTypeInfo(&stride, &internal_size, &boundary_size); + size_t internal_value_bytes = internal_size * sizeof(double); -void uploadRegisteredVolScalar(dfMatrixDataBase& dfDataBase, const volScalarField& field, const char* fieldAlias) { - double *h_internal_field = 
dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::internal); - double *h_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::boundary); - double *d_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::internal); - double *d_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::boundary); + // random init field value to (-0.5, 0.5) // internal - memcpy(h_internal_field, &field[0], dfDataBase.cell_value_bytes); - // boundary - int offset = 0; - forAll(field.boundaryField(), patchi) - { - const auto& patchField = field.boundaryField()[patchi]; - int patchsize = patchField.size(); - memcpy(h_boundary_field + offset, &patchField[0], patchsize * sizeof(double)); - offset += patchsize; + double *field_internal_ptr = &field[0]; + std::vector init_field_internal; + init_field_internal.resize(internal_size); + for (size_t i = 0; i < internal_size; i++) { + init_field_internal[i] = (rand() % 10000 - 5000) / 10000.0; } - // transfer - checkCudaErrors(cudaMemcpyAsync(d_internal_field, h_internal_field, dfDataBase.cell_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); -} - -void uploadRegisteredVolVector(dfMatrixDataBase& dfDataBase, const volVectorField& field, const char* fieldAlias) { - double *h_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::internal); - double *h_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::boundary); - double *d_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::internal); - double *d_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::boundary); - // internal - memcpy(h_internal_field, &field[0], dfDataBase.cell_value_vec_bytes); + memcpy(field_internal_ptr, 
init_field_internal.data(), internal_value_bytes); // boundary - int offset = 0; forAll(field.boundaryField(), patchi) { - const auto& patchField = field.boundaryField()[patchi]; - int patchsize = patchField.size(); - memcpy(h_boundary_field + offset * 3, &patchField[0], patchsize * 3 * sizeof(double)); - offset += patchsize; + auto& patchField = field.boundaryFieldRef()[patchi]; + size_t patchsize = patchField.size(); + double *field_boundary_ptr = &patchField[0]; + std::vector init_field_boundary; + init_field_boundary.resize(patchsize * stride); + for (size_t i = 0; i < patchsize * stride; i++) { + init_field_boundary[i] = (rand() % 10000 - 5000) / 10000.0; + } + memcpy(field_boundary_ptr, init_field_boundary.data(), patchsize * stride * sizeof(double)); } - // transfer - checkCudaErrors(cudaMemcpyAsync(d_internal_field, h_internal_field, dfDataBase.cell_value_vec_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_vec_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); } -void uploadRegisteredSurfaceScalar(dfMatrixDataBase& dfDataBase, const surfaceScalarField& field, const char* fieldAlias) { +template +void uploadRegisteredField(dfMatrixDataBase& dfDataBase, const T& field, const char* fieldAlias) { + size_t stride = 0; + size_t internal_size = 0; + size_t boundary_size = 0; + getTypeInfo(&stride, &internal_size, &boundary_size); + size_t internal_value_bytes = internal_size * sizeof(double); + size_t boundary_value_bytes = boundary_size * sizeof(double); + double *h_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::internal); double *h_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::cpu, position::boundary); double *d_internal_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::internal); double *d_boundary_field = dfDataBase.getFieldPointer(fieldAlias, location::gpu, position::boundary); 
+ // internal - memcpy(h_internal_field, &field[0], dfDataBase.surface_value_bytes); + memcpy(h_internal_field, &field[0], internal_value_bytes); // boundary int offset = 0; forAll(field.boundaryField(), patchi) { const auto& patchField = field.boundaryField()[patchi]; int patchsize = patchField.size(); - memcpy(h_boundary_field + offset, &patchField[0], patchsize * sizeof(double)); + memcpy(h_boundary_field + offset * stride, &patchField[0], patchsize * stride * sizeof(double)); offset += patchsize; } // transfer - checkCudaErrors(cudaMemcpyAsync(d_internal_field, h_internal_field, dfDataBase.surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(d_internal_field, h_internal_field, internal_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field, boundary_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); } -void uploadVolScalar(dfMatrixDataBase& dfDataBase, const volScalarField& field, double *d_field, double *d_boundary_field) { +template +void uploadField(dfMatrixDataBase& dfDataBase, const T& field, double *d_field, double *d_boundary_field) { + size_t stride = 0; + size_t internal_size = 0; + size_t boundary_size = 0; + getTypeInfo(&stride, &internal_size, &boundary_size); + size_t internal_value_bytes = internal_size * sizeof(double); + size_t boundary_value_bytes = boundary_size * sizeof(double); + std::vector h_boundary_field; - h_boundary_field.resize(dfDataBase.num_boundary_surfaces); + h_boundary_field.resize(boundary_size); int offset = 0; forAll(field.boundaryField(), patchi) { const auto& patchField = field.boundaryField()[patchi]; int patchsize = patchField.size(); - memcpy(h_boundary_field.data() + offset, &patchField[0], patchsize * sizeof(double)); + 
memcpy(h_boundary_field.data() + offset * stride, &patchField[0], patchsize * stride * sizeof(double)); offset += patchsize; } - checkCudaErrors(cudaMemcpyAsync(d_field, &field[0], dfDataBase.cell_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); - checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field.data(), dfDataBase.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(d_field, &field[0], internal_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_field, h_boundary_field.data(), boundary_value_bytes, cudaMemcpyHostToDevice, dfDataBase.stream)); } -void buildTestGPUDataBaseScalar(const dfMatrixDataBase& dfDataBase, testGPUDataBase& testData, const volScalarField& field, +template +void buildTestGPUDataBase(const dfMatrixDataBase& dfDataBase, testGPUDataBase& testData, const T& field, bool lowerFlag, bool upperFlag, bool diagFlag, bool sourceFlag, bool internalCoeffsFlag, bool boundaryCoeffsFlag, bool valueInternalCoeffsFlag, bool valueBoundaryCoeffsFlag, bool gradientInternalCoeffsFlag, bool gradientBoundaryCoeffsFlag) { - // ldu - if (lowerFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_lower, dfDataBase.surface_value_bytes)); - checkCudaErrors(cudaMemset(testData.d_lower, 0, dfDataBase.surface_value_bytes)); - } - if (upperFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_upper, dfDataBase.surface_value_bytes)); - checkCudaErrors(cudaMemset(testData.d_upper, 0, dfDataBase.surface_value_bytes)); - } - if (diagFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_diag, dfDataBase.cell_value_bytes)); - checkCudaErrors(cudaMemset(testData.d_diag, 0, dfDataBase.cell_value_bytes)); + if ((typeid(T) != typeid(volScalarField)) && (typeid(T) != typeid(volVectorField))) { + fprintf(stderr, "ERROR! 
Unsupported field type()!\n"); + exit(EXIT_FAILURE); } - if (sourceFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_source, dfDataBase.cell_value_bytes)); - checkCudaErrors(cudaMemset(testData.d_source, 0, dfDataBase.cell_value_bytes)); - } - if (internalCoeffsFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_internal_coeffs, dfDataBase.boundary_surface_value_bytes)); - checkCudaErrors(cudaMemset(testData.d_internal_coeffs, 0, dfDataBase.boundary_surface_value_bytes)); - } - if (boundaryCoeffsFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_bytes)); - checkCudaErrors(cudaMemset(testData.d_boundary_coeffs, 0, dfDataBase.boundary_surface_value_bytes)); - } - // boundary coeffs - if (valueInternalCoeffsFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_value_internal_coeffs, dfDataBase.boundary_surface_value_bytes)); - checkCudaErrors(cudaMemset(testData.d_value_internal_coeffs, 0, dfDataBase.boundary_surface_value_bytes)); - } - if (valueBoundaryCoeffsFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_value_boundary_coeffs, dfDataBase.boundary_surface_value_bytes)); - checkCudaErrors(cudaMemset(testData.d_value_boundary_coeffs, 0, dfDataBase.boundary_surface_value_bytes)); - } - if (gradientInternalCoeffsFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_internal_coeffs, dfDataBase.boundary_surface_value_bytes)); - checkCudaErrors(cudaMemset(testData.d_gradient_internal_coeffs, 0, dfDataBase.boundary_surface_value_bytes)); - } - if (gradientBoundaryCoeffsFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_boundary_coeffs, dfDataBase.boundary_surface_value_bytes)); - checkCudaErrors(cudaMemset(testData.d_gradient_boundary_coeffs, 0, dfDataBase.boundary_surface_value_bytes)); - } - // patch type - testData.patch_type.resize(dfDataBase.num_patches); - forAll(field.boundaryField(), patchi) - { - 
constructBoundarySelectorPerPatch(&(testData.patch_type[patchi]), field.boundaryField()[patchi].type()); - } -} + bool isVec = (typeid(T) == typeid(volVectorField)); + size_t stride = isVec ? 3 : 1; -void buildTestGPUDataBaseVector(const dfMatrixDataBase& dfDataBase, testGPUDataBase& testData, const volVectorField& field, - bool lowerFlag, bool upperFlag, bool diagFlag, bool sourceFlag, bool internalCoeffsFlag, bool boundaryCoeffsFlag, - bool valueInternalCoeffsFlag, bool valueBoundaryCoeffsFlag, bool gradientInternalCoeffsFlag, bool gradientBoundaryCoeffsFlag) { // ldu if (lowerFlag) { checkCudaErrors(cudaMalloc((void**)&testData.d_lower, dfDataBase.surface_value_bytes)); @@ -243,33 +167,33 @@ void buildTestGPUDataBaseVector(const dfMatrixDataBase& dfDataBase, testGPUDataB checkCudaErrors(cudaMemset(testData.d_diag, 0, dfDataBase.cell_value_bytes)); } if (sourceFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_source, dfDataBase.cell_value_vec_bytes)); - checkCudaErrors(cudaMemset(testData.d_source, 0, dfDataBase.cell_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_source, dfDataBase.cell_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_source, 0, dfDataBase.cell_value_bytes * stride)); } if (internalCoeffsFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMemset(testData.d_internal_coeffs, 0, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_internal_coeffs, dfDataBase.boundary_surface_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_internal_coeffs, 0, dfDataBase.boundary_surface_value_bytes * stride)); } if (boundaryCoeffsFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMemset(testData.d_boundary_coeffs, 0, dfDataBase.boundary_surface_value_vec_bytes)); + 
checkCudaErrors(cudaMalloc((void**)&testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_boundary_coeffs, 0, dfDataBase.boundary_surface_value_bytes * stride)); } // boundary coeffs if (valueInternalCoeffsFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_value_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMemset(testData.d_value_internal_coeffs, 0, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_value_internal_coeffs, dfDataBase.boundary_surface_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_value_internal_coeffs, 0, dfDataBase.boundary_surface_value_bytes * stride)); } if (valueBoundaryCoeffsFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_value_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMemset(testData.d_value_boundary_coeffs, 0, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_value_boundary_coeffs, dfDataBase.boundary_surface_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_value_boundary_coeffs, 0, dfDataBase.boundary_surface_value_bytes * stride)); } if (gradientInternalCoeffsFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - checkCudaErrors(cudaMemset(testData.d_gradient_internal_coeffs, 0, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_internal_coeffs, dfDataBase.boundary_surface_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_gradient_internal_coeffs, 0, dfDataBase.boundary_surface_value_bytes * stride)); } if (gradientBoundaryCoeffsFlag) { - checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes)); - 
checkCudaErrors(cudaMemset(testData.d_gradient_boundary_coeffs, 0, dfDataBase.boundary_surface_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&testData.d_gradient_boundary_coeffs, dfDataBase.boundary_surface_value_bytes * stride)); + checkCudaErrors(cudaMemset(testData.d_gradient_boundary_coeffs, 0, dfDataBase.boundary_surface_value_bytes * stride)); } // patch type testData.patch_type.resize(dfDataBase.num_patches); @@ -279,14 +203,16 @@ void buildTestGPUDataBaseVector(const dfMatrixDataBase& dfDataBase, testGPUDataB } } -void updateBoundaryCoeffsVector(const dfMatrixDataBase& dfDataBase, testGPUDataBase& testData) { - update_boundary_coeffs_vector(dfDataBase.stream, dfDataBase.num_patches, - dfDataBase.patch_size.data(), testData.patch_type.data(), - testData.d_value_internal_coeffs, testData.d_value_boundary_coeffs, - testData.d_gradient_internal_coeffs, testData.d_gradient_boundary_coeffs); -} - +// TODO: It seems that compareResult of scalar and vector can't be merged void compareResultVector(const dfMatrixDataBase& dfDataBase, const testGPUDataBase& testData, fvVectorMatrix& dfMatrix, bool printFlag) { + //if ((typeid(T) != typeid(fvScalarMatrix)) && (typeid(T) != typeid(fvVectorMatrix))) { + // fprintf(stderr, "ERROR! Unsupported field type()!\n"); + // exit(EXIT_FAILURE); + //} + //bool isVec = (typeid(T) == typeid(fvVectorMatrix)); + //size_t stride = isVec ? 
3 : 1; + + size_t stride = 3; if (testData.d_lower) { std::vector h_lower; h_lower.resize(dfDataBase.num_surfaces); @@ -307,48 +233,51 @@ void compareResultVector(const dfMatrixDataBase& dfDataBase, const testGPUDataBa } if (testData.d_source) { std::vector h_source; - h_source.resize(dfDataBase.num_cells * 3); - checkCudaErrors(cudaMemcpy(h_source.data(), testData.d_source, dfDataBase.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); - checkVectorEqual(dfDataBase.num_cells * 3, &dfMatrix.source()[0][0], h_source.data(), 1e-14, printFlag); + h_source.resize(dfDataBase.num_cells * stride); + checkCudaErrors(cudaMemcpy(h_source.data(), testData.d_source, dfDataBase.cell_value_bytes * stride, cudaMemcpyDeviceToHost)); + //void *source_ptr = isVec ? (&dfMatrix.source()[0][0]) : (&dfMatrix.source()[0]); + double *source_ptr = &dfMatrix.source()[0][0]; + checkVectorEqual(dfDataBase.num_cells * stride, source_ptr, h_source.data(), 1e-14, printFlag); } if (testData.d_internal_coeffs) { std::vector h_internal_coeffs; - h_internal_coeffs.resize(dfDataBase.num_boundary_surfaces * 3); - checkCudaErrors(cudaMemcpy(h_internal_coeffs.data(), testData.d_internal_coeffs, dfDataBase.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); - std::vector cpu_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); + h_internal_coeffs.resize(dfDataBase.num_boundary_surfaces * stride); + checkCudaErrors(cudaMemcpy(h_internal_coeffs.data(), testData.d_internal_coeffs, dfDataBase.boundary_surface_value_bytes * stride, cudaMemcpyDeviceToHost)); + std::vector cpu_internal_coeffs(dfDataBase.num_boundary_surfaces * stride); int offset = 0; for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) { int patchsize = dfDataBase.patch_size[patchi]; - const double* internal_coeff_ptr = &dfMatrix.internalCoeffs()[patchi][0][0]; - memcpy(cpu_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double)); + //const void* internal_coeff_ptr = isVec ? 
(&dfMatrix.internalCoeffs()[patchi][0][0]) : (&dfMatrix.internalCoeffs()[patchi][0]); + const void* internal_coeff_ptr = &dfMatrix.internalCoeffs()[patchi][0][0]; + memcpy(cpu_internal_coeffs.data() + offset * stride, internal_coeff_ptr, patchsize * stride * sizeof(double)); offset += patchsize; } - checkVectorEqual(dfDataBase.num_boundary_surfaces * 3, cpu_internal_coeffs.data(), h_internal_coeffs.data(), 1e-14, printFlag); + checkVectorEqual(dfDataBase.num_boundary_surfaces * stride, cpu_internal_coeffs.data(), h_internal_coeffs.data(), 1e-14, printFlag); } if (testData.d_boundary_coeffs) { std::vector h_boundary_coeffs; - h_boundary_coeffs.resize(dfDataBase.num_boundary_surfaces * 3); - checkCudaErrors(cudaMemcpy(h_boundary_coeffs.data(), testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); - std::vector cpu_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); + h_boundary_coeffs.resize(dfDataBase.num_boundary_surfaces * stride); + checkCudaErrors(cudaMemcpy(h_boundary_coeffs.data(), testData.d_boundary_coeffs, dfDataBase.boundary_surface_value_bytes * stride, cudaMemcpyDeviceToHost)); + std::vector cpu_boundary_coeffs(dfDataBase.num_boundary_surfaces * stride); int offset = 0; for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) { int patchsize = dfDataBase.patch_size[patchi]; - const double* boundary_coeff_ptr = &dfMatrix.boundaryCoeffs()[patchi][0][0]; - memcpy(cpu_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double)); + //const void* boundary_coeff_ptr = isVec ? 
(&dfMatrix.boundaryCoeffs()[patchi][0][0]) : (&dfMatrix.boundaryCoeffs()[patchi][0]); + const void* boundary_coeff_ptr = &dfMatrix.boundaryCoeffs()[patchi][0][0]; + memcpy(cpu_boundary_coeffs.data() + offset * stride, boundary_coeff_ptr, patchsize * stride * sizeof(double)); offset += patchsize; } - checkVectorEqual(dfDataBase.num_boundary_surfaces * 3, cpu_boundary_coeffs.data(), h_boundary_coeffs.data(), 1e-14, printFlag); + checkVectorEqual(dfDataBase.num_boundary_surfaces * stride, cpu_boundary_coeffs.data(), h_boundary_coeffs.data(), 1e-14, printFlag); } } // unittest of fvm::ddt(rho, U) void test_fvm_ddt_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& rho, volVectorField& U, initType type) { if (type == initType::randomInit) { - // random init rho and rho.old rho.oldTime(); - randomInitVolScalar(rho); + randomInitField(rho); } // run CPU @@ -356,13 +285,13 @@ void test_fvm_ddt_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc // prepare for run GPU // prepare rho, rho.old, U - uploadRegisteredVolScalar(dfDataBase, rho, "rho"); - uploadRegisteredVolScalar(dfDataBase, rho.oldTime(), "rho_old"); - uploadRegisteredVolVector(dfDataBase, U.oldTime(), "u"); + uploadRegisteredField(dfDataBase, rho, "rho"); + uploadRegisteredField(dfDataBase, rho.oldTime(), "rho_old"); + uploadRegisteredField(dfDataBase, U.oldTime(), "u"); // prepare testData testGPUDataBase testData; // only use diag and source - buildTestGPUDataBaseVector(dfDataBase, testData, U, false, false, true, true, false, false, false, false, false, false); + buildTestGPUDataBase(dfDataBase, testData, U, false, false, true, true, false, false, false, false, false, false); // run GPU fvm_ddt_vector(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.rdelta_t, dfDataBase.d_rho, dfDataBase.d_rho_old, dfDataBase.d_u, dfDataBase.d_volume, @@ -377,7 +306,7 @@ void test_fvm_ddt_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc void 
test_fvm_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfaceScalarField& phi, volVectorField& U, initType type) { if (type == initType::randomInit) { phi.oldTime(); - randomInitSurfaceScalar(phi); + randomInitField(phi); } // run CPU @@ -385,15 +314,18 @@ void test_fvm_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfa // prepare for run GPU // prepare phi field - uploadRegisteredSurfaceScalar(dfDataBase, phi, "phi"); + uploadRegisteredField(dfDataBase, phi, "phi"); // prepare testData testGPUDataBase testData; // not use source - // gradient_internal_coeffs, gradient_boundary_coeffs are not needed actually, but updateBoundaryCoeffsVector will access them - buildTestGPUDataBaseVector(dfDataBase, testData, U, true, true, true, false, true, true, true, true, true, true); + // gradient_internal_coeffs, gradient_boundary_coeffs are not needed actually, but update_boundary_coeffs_vector will access them + buildTestGPUDataBase(dfDataBase, testData, U, true, true, true, false, true, true, true, true, true, true); // prepare boundary coeffs // TODO: updating boundary coeffs should be complemented later - updateBoundaryCoeffsVector(dfDataBase, testData); + update_boundary_coeffs_vector(dfDataBase.stream, dfDataBase.num_patches, + dfDataBase.patch_size.data(), testData.patch_type.data(), + testData.d_value_internal_coeffs, testData.d_value_boundary_coeffs, + testData.d_gradient_internal_coeffs, testData.d_gradient_boundary_coeffs); // run GPU fvm_div_vector(dfDataBase.stream, dfDataBase.num_surfaces, dfDataBase.d_owner, dfDataBase.d_neighbor, @@ -414,7 +346,7 @@ void test_fvm_laplacian_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, { if (type == initType::randomInit) { gamma.oldTime(); - randomInitVolScalar(gamma); + randomInitField(gamma); } // run CPU @@ -426,15 +358,18 @@ void test_fvm_laplacian_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, double *d_boundary_gamma = nullptr; 
checkCudaErrors(cudaMalloc((void**)&d_gamma, dfDataBase.cell_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_gamma, dfDataBase.boundary_surface_value_bytes)); - uploadVolScalar(dfDataBase, gamma, d_gamma, d_boundary_gamma); + uploadField(dfDataBase, gamma, d_gamma, d_boundary_gamma); // prepare testData testGPUDataBase testData; // not use source - // value_internal_coeffs, value_boundary_coeffs are not needed actually, but updateBoundaryCoeffsVector will access them - buildTestGPUDataBaseVector(dfDataBase, testData, U, true, true, true, false, true, true, true, true, true, true); + // value_internal_coeffs, value_boundary_coeffs are not needed actually, but update_boundary_coeffs_vector will access them + buildTestGPUDataBase(dfDataBase, testData, U, true, true, true, false, true, true, true, true, true, true); // prepare boundary coeffs // TODO: updating boundary coeffs should be complemented later - updateBoundaryCoeffsVector(dfDataBase, testData); + update_boundary_coeffs_vector(dfDataBase.stream, dfDataBase.num_patches, + dfDataBase.patch_size.data(), testData.patch_type.data(), + testData.d_value_internal_coeffs, testData.d_value_boundary_coeffs, + testData.d_gradient_internal_coeffs, testData.d_gradient_boundary_coeffs); // run GPU fvm_laplacian_vector(dfDataBase.stream, dfDataBase.num_surfaces, @@ -458,11 +393,10 @@ void test_fvm_laplacian_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, // unittest of fvc::ddt(rho, K) void test_fvc_ddt_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& rho, volScalarField& K, initType type) { if (type == initType::randomInit) { - // random init rho and rho.old rho.oldTime(); - randomInitVolScalar(rho); + randomInitField(rho); K.oldTime(); - randomInitVolScalar(K); + randomInitField(K); } // run CPU @@ -470,8 +404,8 @@ void test_fvc_ddt_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc // prepare for run GPU // prepare rho, rho.old on GPU - 
uploadRegisteredVolScalar(dfDataBase, rho, "rho"); - uploadRegisteredVolScalar(dfDataBase, rho.oldTime(), "rho_old"); + uploadRegisteredField(dfDataBase, rho, "rho"); + uploadRegisteredField(dfDataBase, rho.oldTime(), "rho_old"); // prepare K, K_old on GPU double *d_K = nullptr; double *d_K_old = nullptr; @@ -501,5 +435,3 @@ void test_fvc_ddt_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc checkCudaErrors(cudaFree(d_K_old)); checkCudaErrors(cudaFree(d_fvc_ouput_scalar)); } - - From 8b3805fb699040f1e9ee89502c1e02389bbb1311 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Tue, 8 Aug 2023 00:05:29 +0800 Subject: [PATCH 15/25] modify getTypeInfo to support tensor type --- GPUTest/GPUTestBase.H | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/GPUTest/GPUTestBase.H b/GPUTest/GPUTestBase.H index 42a64cd51..e35588006 100644 --- a/GPUTest/GPUTestBase.H +++ b/GPUTest/GPUTestBase.H @@ -43,18 +43,33 @@ struct testGPUDataBase { template void getTypeInfo(size_t *stride, size_t *internal_size, size_t *boundary_size) { - if ((typeid(T) != typeid(surfaceScalarField)) - && (typeid(T) != typeid(surfaceVectorField)) - && (typeid(T) != typeid(volScalarField)) - && (typeid(T) != typeid(volVectorField))) { + size_t s = 1; + bool isVol = false; + if (typeid(T) == typeid(surfaceScalarField)) { + s = 1; + isVol = false; + } else if (typeid(T) == typeid(surfaceVectorField)) { + s = 3; + isVol = false; + } else if (typeid(T) == typeid(surfaceTensorField)) { + s = 9; + isVol = false; + } else if (typeid(T) == typeid(volScalarField)) { + s = 1; + isVol = true; + } else if (typeid(T) == typeid(volVectorField)) { + s = 3; + isVol = true; + } else if (typeid(T) == typeid(volTensorField)) { + s = 9; + isVol = true; + } else { fprintf(stderr, "ERROR! 
Unsupported field type()!\n"); exit(EXIT_FAILURE); } - bool isVol = ((typeid(T) == typeid(volScalarField)) || (typeid(T) == typeid(volVectorField))); - bool isVec = ((typeid(T) == typeid(surfaceVectorField)) || (typeid(T) == typeid(volVectorField))); - *stride = isVec ? 3 : 1; - *internal_size = (isVol ? dfDataBase.num_cells : dfDataBase.num_surfaces) * (*stride); - *boundary_size = dfDataBase.num_boundary_surfaces * (*stride); + *stride = s; + *internal_size = (isVol ? dfDataBase.num_cells : dfDataBase.num_surfaces) * s; + *boundary_size = dfDataBase.num_boundary_surfaces * s; } template From 776aa69b519825144cfb9a2ebfcbf39fe4697e8d Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Thu, 10 Aug 2023 00:26:32 +0800 Subject: [PATCH 16/25] first commit for debugging --- GPUTest/GPUTestBase.H | 69 ++++++ GPUTest/Make/options | 2 + GPUTest/createGPUSolver.H | 5 +- GPUTest/unittest.C | 13 + GPUTestRef/GenFvMatrix.H | 248 ++++++++++++++++++++ GPUTestRef/Make/files | 4 + GPUTestRef/Make/options | 31 +++ GPUTestRef/gaussConvectionScheme.C | 305 ++++++++++++++++++++++++ GPUTestRef/gaussGrad.C | 332 ++++++++++++++++++++++++++ src_gpu/dfMatrixDataBase.H | 3 +- src_gpu/dfMatrixDataBase.cu | 4 +- src_gpu/dfMatrixOpBase.H | 12 + src_gpu/dfMatrixOpBase.cu | 365 +++++++++++++++++++++++++++++ 13 files changed, 1390 insertions(+), 3 deletions(-) create mode 100644 GPUTestRef/GenFvMatrix.H create mode 100644 GPUTestRef/Make/files create mode 100644 GPUTestRef/Make/options create mode 100644 GPUTestRef/gaussConvectionScheme.C create mode 100644 GPUTestRef/gaussGrad.C diff --git a/GPUTest/GPUTestBase.H b/GPUTest/GPUTestBase.H index e35588006..38676528e 100644 --- a/GPUTest/GPUTestBase.H +++ b/GPUTest/GPUTestBase.H @@ -450,3 +450,72 @@ void test_fvc_ddt_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc checkCudaErrors(cudaFree(d_K_old)); checkCudaErrors(cudaFree(d_fvc_ouput_scalar)); } + +// unittest of fvc::grad(U) +void test_fvc_grad_vector(dfMatrixDataBase& dfDataBase, 
Foam::fvMesh& mesh, volVectorField& U, initType type) { + // if (type == initType::randomInit) { + // U.oldTime(); + // randomInitField(U); + // } + + // run CPU + volTensorField fvc_ouput_tensor = fvc::grad(U); + // volTensorField fvc_ouput_tensor = gaussGradSchemeGrad(U); + + // prepare for run GPU + // prepare U on GPU + uploadRegisteredField(dfDataBase, U, "u"); + + double *d_fvc_ouput_tensor = nullptr, *d_fvc_ouput_boundary_tensor = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_tensor, dfDataBase.cell_value_tsr_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_boundary_tensor, dfDataBase.boundary_surface_value_tsr_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_tensor, 0, dfDataBase.cell_value_tsr_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_boundary_tensor, 0, dfDataBase.boundary_surface_value_tsr_bytes)); + + // only need patch_type + testGPUDataBase testData; + buildTestGPUDataBase(dfDataBase, testData, U, false, false, false, false, false, false, false, false, false, false); + + fvc_grad_vector(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.num_surfaces, + dfDataBase.d_owner, dfDataBase.d_neighbor, + dfDataBase.d_weight, dfDataBase.d_sf, dfDataBase.d_u, d_fvc_ouput_tensor, + dfDataBase.num_patches, dfDataBase.patch_size.data(), testData.patch_type.data(), + dfDataBase.d_boundary_face_cell, dfDataBase.d_boundary_u, dfDataBase.d_boundary_sf, + dfDataBase.d_volume, dfDataBase.d_boundary_mag_sf, d_fvc_ouput_boundary_tensor, dfDataBase.d_boundary_delta_coeffs); + + // compare result + bool printFlag = false; + std::vector h_fvc_ouput_tensor(dfDataBase.num_cells * 9); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_tensor.data(), d_fvc_ouput_tensor, dfDataBase.cell_value_tsr_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells * 9, &fvc_ouput_tensor[0][0], h_fvc_ouput_tensor.data(), 1e-14, printFlag); +} + +// unittest of fvc::div(phi) +void test_fvc_div_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, 
surfaceScalarField& phi, initType type) { + if (type == initType::randomInit) { + phi.oldTime(); + randomInitField(phi); + } + + // run CPU + volScalarField fvc_ouput_scalar = fvc::div(phi); + // volScalarField fvc_ouput_scalar = gaussConvectionSchemeFvcDiv(phi); + + // prepare for run GPU + // prepare phi on GPU + uploadRegisteredField(dfDataBase, phi, "phi"); + + double *d_fvc_ouput_scalar = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_scalar, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_scalar, 0, dfDataBase.cell_value_bytes)); + + fvc_div_surface_scalar(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.num_surfaces, dfDataBase.num_boundary_surfaces, + dfDataBase.d_owner, dfDataBase.d_neighbor, + dfDataBase.d_phi, dfDataBase.d_boundary_face_cell, dfDataBase.d_boundary_phi, dfDataBase.d_volume, d_fvc_ouput_scalar); + + // compare result + bool printFlag = false; + std::vector h_fvc_ouput_scalar(dfDataBase.num_cells); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_scalar.data(), d_fvc_ouput_scalar, dfDataBase.cell_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells, &fvc_ouput_scalar[0], h_fvc_ouput_scalar.data(), 1e-14, printFlag); +} diff --git a/GPUTest/Make/options b/GPUTest/Make/options index 637eb0e9b..197663050 100644 --- a/GPUTest/Make/options +++ b/GPUTest/Make/options @@ -24,6 +24,7 @@ EXE_INC = -std=c++14 \ -I$(DF_SRC)/dfCombustionModels/lnInclude \ -I$(CANTERA_ROOT)/include \ -I$(DF_ROOT)/src_gpu \ + -I$(DF_ROOT)/GPUTestRef/lnInclude \ -I/usr/local/cuda-11.6/include \ -I$(AMGX_DIR)/include @@ -39,6 +40,7 @@ EXE_LIBS = \ -ldfCanteraMixture \ -ldfChemistryModel \ -ldfCombustionModels \ + -ldfGenMatrix \ $(CANTERA_ROOT)/lib/libcantera.so \ /usr/local/cuda-11.6/lib64/libcudart.so \ $(AMGX_DIR)/build/libamgxsh.so \ diff --git a/GPUTest/createGPUSolver.H b/GPUTest/createGPUSolver.H index 3dd593337..478b15ed1 100644 --- a/GPUTest/createGPUSolver.H +++ b/GPUTest/createGPUSolver.H @@ -28,24 +28,27 
@@ void createGPUBase(fvMesh& mesh, PtrList& Y) { double *boundary_sf = new double[3 * num_boundary_surfaces]; double *boundary_mag_sf = new double[num_boundary_surfaces]; double *boundary_delta_coeffs = new double[num_boundary_surfaces]; + int *boundary_face_cell = new int[num_boundary_surfaces]; int offset = 0; forAll(mesh.boundary(), patchi) { const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; + const labelUList& pFaceCells = mesh.boundary()[patchi].faceCells(); int patchsize = pMagSf.size(); memcpy(boundary_sf + 3*offset, &pSf[0][0], 3*patchsize*sizeof(double)); memcpy(boundary_mag_sf + offset, &pMagSf[0], patchsize*sizeof(double)); memcpy(boundary_delta_coeffs + offset, &pDeltaCoeffs[0], patchsize*sizeof(double)); + memcpy(boundary_face_cell + offset, &pFaceCells[0], patchsize * sizeof(int)); offset += patchsize; } dfDataBase.createConstantFieldsInternal(); dfDataBase.createConstantFieldsBoundary(); dfDataBase.initConstantFieldsInternal(&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.surfaceInterpolation::weights()[0], &mesh.nonOrthDeltaCoeffs()[0], &mesh.V()[0]); - dfDataBase.initConstantFieldsBoundary(boundary_sf, boundary_mag_sf, boundary_delta_coeffs); + dfDataBase.initConstantFieldsBoundary(boundary_sf, boundary_mag_sf, boundary_delta_coeffs, boundary_face_cell); // prepare internal and boundary of Y dfDataBase.createNonConstantFieldsInternal(); diff --git a/GPUTest/unittest.C b/GPUTest/unittest.C index edd5b7856..ccbaefa71 100644 --- a/GPUTest/unittest.C +++ b/GPUTest/unittest.C @@ -49,6 +49,9 @@ Description #include #include "upwind.H" +// debug +#include "GenFvMatrix.H" + #include "dfMatrixDataBase.H" #include "dfMatrixOpBase.H" #include "createGPUSolver.H" @@ -125,6 +128,16 @@ int main(int argc, char *argv[]) DEBUG_TRACE; test_fvc_ddt_scalar(dfDataBase, mesh, rho, K, initType::randomInit); 
DEBUG_TRACE; + + // unittest of fvc::grad(U) + test_fvc_grad_vector(dfDataBase, mesh, U, initType::original); + DEBUG_TRACE; + + // unittest of fvc::div(phi) + test_fvc_div_scalar(dfDataBase, mesh, phi, initType::original); + DEBUG_TRACE; + test_fvc_div_scalar(dfDataBase, mesh, phi, initType::randomInit); + DEBUG_TRACE; } return 0; } diff --git a/GPUTestRef/GenFvMatrix.H b/GPUTestRef/GenFvMatrix.H new file mode 100644 index 000000000..d328fe504 --- /dev/null +++ b/GPUTestRef/GenFvMatrix.H @@ -0,0 +1,248 @@ +#pragma once + +#include "tmp.H" +#include "dimensionedType.H" +#include "volFieldsFwd.H" +#include "surfaceFieldsFwd.H" +#include "typeInfo.H" +#include "runTimeSelectionTables.H" +#include "fvMatrices.H" +#include "fvMesh.H" +#include "turbulentFluidThermoModel.H" +#include "CombustionModel.H" +#include +#include +#include "PstreamGlobals.H" + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +namespace Foam +{ + + +// namespace fv +// { + +// fvm::ddt +template +tmp> +EulerDdtSchemeFvmDdt +( + const volScalarField& rho, + const GeometricField& vf +); + +template +tmp> +EulerDdtSchemeFvmDdt +( + const GeometricField& vf +); + +// fvc::ddt +template +tmp> +EulerDdtSchemeFvcDdt +( + const volScalarField& rho, + const GeometricField& vf +); + +template +tmp> +EulerDdtSchemeFvcDdt +( + const GeometricField& vf +); + +// fvc::ddtCorr +tmp +EulerDdtSchemeFvcDdtCorr +( + const volScalarField& rho, + const volVectorField& U, + const surfaceScalarField& phi, + const autoPtr& Uf +); + +tmp +EulerDdtSchemeFvcDdtPhiCoeff +( + const volVectorField& U, + const surfaceScalarField& phi, + const surfaceScalarField& phiCorr, + const volScalarField& rho +); + +template +Foam::tmp> +UEqn_H +( + fvMatrix& UEqn +); + +tmp +rAUConstructor +( + fvMatrix& UEqn +); + +tmp +rhorAUfConstructor +( + const volScalarField& rhorAU, + const surfaceScalarField& linear_weights +); + +tmp +phiHbyAConstructor +( + const volScalarField& rho, + const volVectorField& 
HbyA, + const surfaceScalarField& rhorAUf, + const surfaceScalarField& tddtCorr, + const surfaceScalarField& linear_weights +); + + +// fvm::div +template +tmp> +gaussConvectionSchemeFvmDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +); + +template +tmp> +gaussConvectionSchemeFvmDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf, + const word& name +); + +// fvc::div +template +tmp> +gaussConvectionSchemeFvcDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +); + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const GeometricField& ssf +); + +// fvc::grad +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradSchemeGrad +( + const GeometricField& vsf +); + + +template +tmp> +gaussLaplacianSchemeFvmLaplacianUncorrected +( + const surfaceScalarField& gammaMagSf, + const surfaceScalarField& deltaCoeffs, + const GeometricField& vf +); + +// fvm::laplacian +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gammaScalarVol, + const GeometricField& vf +); + +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gamma, + const GeometricField& vf +); + +// turbulence->divDevRhoReff(U) +tmp +turbulenceModelLinearViscousStressDivDevRhoReff +( + volVectorField& U, + compressible::turbulenceModel& turbulence +); + +tmp +GenMatrix_U( + const volScalarField& rho, + volVectorField& U, + const surfaceScalarField& phi, + const volScalarField& p, + compressible::turbulenceModel& turbulence +); + +tmp +GenMatrix_Y( + const volScalarField& rho, + volScalarField& Yi, + const surfaceScalarField& phi, + const surfaceScalarField& phiUc, + const volScalarField& rhoD, + const volScalarField& mut, + const Switch splitting, + const scalar Sct, + CombustionModel& combustion, + fv::convectionScheme& mvConvection +); + +tmp +GenMatrix_E( + const volScalarField& rho, + volScalarField& he, + const surfaceScalarField& phi, + 
const volScalarField& K, + const volScalarField& dpdt, + const volScalarField& alphaEff, + const volScalarField& diffAlphaD, + const volVectorField& hDiffCorrFlux, + const surfaceScalarField& linear_weights +); + +tmp +GenMatrix_p( + const volScalarField& rho, + volScalarField& p, + const surfaceScalarField& phiHbyA, + const surfaceScalarField& rhorAUf, + const volScalarField& phi +); + + +void check_fvmatrix_equal(fvScalarMatrix& a,fvScalarMatrix& b); +void check_fvmatrix_equal(fvVectorMatrix& a,fvVectorMatrix& b); + +void check_field_equal(Field& a, Field& b); + + +} // End namespace Foam + + +// ************************************************************************* // \ No newline at end of file diff --git a/GPUTestRef/Make/files b/GPUTestRef/Make/files new file mode 100644 index 000000000..1137c3eed --- /dev/null +++ b/GPUTestRef/Make/files @@ -0,0 +1,4 @@ +gaussGrad.C +gaussConvectionScheme.C + +LIB = $(DF_LIBBIN)/libdfGenMatrix \ No newline at end of file diff --git a/GPUTestRef/Make/options b/GPUTestRef/Make/options new file mode 100644 index 000000000..0523a67e8 --- /dev/null +++ b/GPUTestRef/Make/options @@ -0,0 +1,31 @@ +-include $(GENERAL_RULES)/mplibType + +EXE_INC = \ + -g \ + $(PFLAGS) $(PINC) \ + -I$(LIB_SRC)/transportModels/compressible/lnInclude \ + -I$(LIB_SRC)/thermophysicalModels/basic/lnInclude \ + -I$(LIB_SRC)/TurbulenceModels/turbulenceModels/lnInclude \ + -I$(LIB_SRC)/TurbulenceModels/compressible/lnInclude \ + -I$(LIB_SRC)/finiteVolume/cfdTools \ + -I$(LIB_SRC)/finiteVolume/lnInclude \ + -I$(LIB_SRC)/meshTools/lnInclude \ + -I$(LIB_SRC)/sampling/lnInclude \ + -I$(LIB_SRC)/dynamicFvMesh/lnInclude \ + -I$(LIB_SRC)/Pstream/mpi \ + -I$(DF_SRC)/dfCanteraMixture/lnInclude \ + -I$(DF_SRC)/dfChemistryModel/lnInclude \ + -I$(DF_SRC)/dfCombustionModels/lnInclude \ + -I$(LIB_SRC)/parallel/decompose/decompositionMethods/lnInclude \ + -I$(LIB_SRC)/meshTools/lnInclude \ + -I$(LIB_SRC)/fileFormats/lnInclude \ + -I$(LIB_SRC)/triSurface/lnInclude \ + 
-I$(LIB_SRC)/surfMesh/lnInclude \ + -I$(LIB_SRC)/dynamicMesh/lnInclude \ + -I$(LIB_SRC)/finiteVolume/lnInclude \ + -I$(CANTERA_ROOT)/include + +EXE_LIBS = \ + -lOpenFOAM \ + -ltriSurface \ + -lmeshTools \ No newline at end of file diff --git a/GPUTestRef/gaussConvectionScheme.C b/GPUTestRef/gaussConvectionScheme.C new file mode 100644 index 000000000..83d11bda9 --- /dev/null +++ b/GPUTestRef/gaussConvectionScheme.C @@ -0,0 +1,305 @@ +/*---------------------------------------------------------------------------*\ + ========= | + \\ / F ield | OpenFOAM: The Open Source CFD Toolbox + \\ / O peration | Website: https://openfoam.org + \\ / A nd | Copyright (C) 2011-2018 OpenFOAM Foundation + \\/ M anipulation | +------------------------------------------------------------------------------- +License + This file is part of OpenFOAM. + + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . 
+ +\*---------------------------------------------------------------------------*/ + +#include "GenFvMatrix.H" +#include "fvcSurfaceIntegrate.H" +#include "fvMatrices.H" +#include "gaussConvectionScheme.H" + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +namespace Foam +{ + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +template +tmp> +gaussConvectionSchemeFvmDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf, + const word& name +) +{ + const fvMesh& mesh = vf.mesh(); + + tmp> cs = fv::convectionScheme::New(mesh,faceFlux,mesh.divScheme(name)); + fv::gaussConvectionScheme& gcs = dynamic_cast&>(cs.ref()); + + tmp tweights = gcs.interpScheme().weights(vf); + const surfaceScalarField& weights = tweights(); + + tmp> tfvm + ( + new fvMatrix + ( + vf, + faceFlux.dimensions()*vf.dimensions() + ) + ); + fvMatrix& fvm = tfvm.ref(); + fvm.lower() = -weights.primitiveField()*faceFlux.primitiveField(); + fvm.upper() = fvm.lower() + faceFlux.primitiveField(); + fvm.negSumDiag(); + forAll(vf.boundaryField(), patchi) + { + const fvPatchField& psf = vf.boundaryField()[patchi]; + const fvsPatchScalarField& patchFlux = faceFlux.boundaryField()[patchi]; + const fvsPatchScalarField& pw = weights.boundaryField()[patchi]; + + fvm.internalCoeffs()[patchi] = patchFlux*psf.valueInternalCoeffs(pw); + fvm.boundaryCoeffs()[patchi] = -patchFlux*psf.valueBoundaryCoeffs(pw); + } + if (gcs.interpScheme().corrected()) + { + fvm += fvc::surfaceIntegrate(faceFlux*gcs.interpScheme().correction(vf)); + } + return tfvm; +} + +template +tmp> +gaussConvectionSchemeFvmDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +) +{ + word name("div("+faceFlux.name()+','+vf.name()+')'); + return gaussConvectionSchemeFvmDiv(faceFlux,vf,name); +} + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const 
surfaceScalarField& faceFlux, + const GeometricField& vf +) +{ + word name("div("+faceFlux.name()+','+vf.name()+')'); + return gaussConvectionSchemeFvcDiv(faceFlux, vf, name); +} + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf, + const word& name +) +{ + Info << "gaussConvectionSchemeFvcDiv start" << endl; + + const fvMesh& mesh = vf.mesh(); + + Istream& divIntScheme = mesh.divScheme(name); + word divScheme(divIntScheme); + + tmp> tinterpScheme_ = + surfaceInterpolationScheme::New(mesh, faceFlux, divIntScheme); + + // tmp> tinterpScheme_ = + // tmp> + // ( + // new linear(mesh) + // ); + + + // surfaceInterpolationScheme interpScheme_ = tinterpScheme_.ref(); + + tmp> tConvection + ( + fvc::surfaceIntegrate(gaussConvectionSchemeFlux(faceFlux, vf, tinterpScheme_)) + ); + + tConvection.ref().rename + ( + "convection(" + faceFlux.name() + ',' + vf.name() + ')' + ); + + return tConvection; +} + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const GeometricField& ssf +) +{ + return tmp> + ( + new GeometricField + ( + "div("+ssf.name()+')', + fvcSurfaceIntegrate(ssf) + ) + ); +} + +template +tmp> +fvcSurfaceIntegrate +( + const GeometricField& ssf +) +{ + const fvMesh& mesh = ssf.mesh(); + + tmp> tvf + ( + new GeometricField + ( + IOobject + ( + "surfaceIntegrate("+ssf.name()+')', + ssf.instance(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensioned + ( + "0", + ssf.dimensions()/dimVol, + Zero + ), + extrapolatedCalculatedFvPatchField::typeName + ) + ); + GeometricField& vf = tvf.ref(); + + fvcSurfaceIntegrate(vf.primitiveFieldRef(), ssf); + vf.correctBoundaryConditions(); + + return tvf; +} + +template +void fvcSurfaceIntegrate +( + Field& ivf, + const GeometricField& ssf +) +{ + const fvMesh& mesh = ssf.mesh(); + + const labelUList& owner = mesh.owner(); + const labelUList& neighbour = mesh.neighbour(); + + const Field& issf = ssf; + + forAll(owner, facei) + { + 
ivf[owner[facei]] += issf[facei]; + ivf[neighbour[facei]] -= issf[facei]; + } + Info << "ivfcpu[473]before bou = " << ivf[473] << endl; + + forAll(mesh.boundary(), patchi) + { + const labelUList& pFaceCells = + mesh.boundary()[patchi].faceCells(); + + const fvsPatchField& pssf = ssf.boundaryField()[patchi]; + + forAll(mesh.boundary()[patchi], facei) + { + ivf[pFaceCells[facei]] += pssf[facei]; + if (pFaceCells[facei] == 473) + { + Info << "pssfcpu[473] += " << pssf[facei] << endl; + } + + } + } + + Info << "ivfcpu[473] = " << ivf[473] << endl; + + ivf /= mesh.Vsc(); + + printf("vol cpu = %.15e\n", mesh.Vsc()()[473]); +} + +template +tmp> +gaussConvectionSchemeFlux +( + const surfaceScalarField& faceFlux, + const GeometricField& vf, + tmp> tinterpScheme +) +{ + Info << vf.name() <> +gaussConvectionSchemeFvmDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +); + +template +tmp> +gaussConvectionSchemeFvmDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +); + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const surfaceScalarField& faceFlux, + const GeometricField& vf +); + + +template +tmp> +gaussConvectionSchemeFvcDiv +( + const GeometricField& ssf +); + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +} // End namespace Foam + +// ************************************************************************* // \ No newline at end of file diff --git a/GPUTestRef/gaussGrad.C b/GPUTestRef/gaussGrad.C new file mode 100644 index 000000000..401eab38b --- /dev/null +++ b/GPUTestRef/gaussGrad.C @@ -0,0 +1,332 @@ +/*---------------------------------------------------------------------------*\ + ========= | + \\ / F ield | OpenFOAM: The Open Source CFD Toolbox + \\ / O peration | Website: https://openfoam.org + \\ / A nd | Copyright (C) 2011-2018 OpenFOAM Foundation + \\/ M anipulation | +------------------------------------------------------------------------------- +License + This file is part of 
OpenFOAM. + + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . + +\*---------------------------------------------------------------------------*/ + +#include "gaussGrad.H" +#include "extrapolatedCalculatedFvPatchField.H" + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +namespace Foam +{ + + + +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradSchemeGrad +( + const GeometricField& vsf +) +{ + return gaussGradSchemeGrad(vsf, "grad(" + vsf.name() + ')'); +} + +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradSchemeGrad +( + const GeometricField& vsf, + const word& name +) +{ + const fvMesh& mesh = vsf.mesh(); + + typedef typename outerProduct::type GradType; + typedef GeometricField GradFieldType; + + if (!mesh.changing() && mesh.cache(name)) + { + if (!mesh.objectRegistry::template foundObject(name)) + { + solution::cachePrintMessage("Calculating and caching", name, vsf); + tmp tgGrad = gaussGradCalcGrad(vsf, name); + regIOobject::store(tgGrad.ptr()); + } + + solution::cachePrintMessage("Retrieving", name, vsf); + GradFieldType& gGrad = + mesh.objectRegistry::template lookupObjectRef + ( + name + ); + + if (gGrad.upToDate(vsf)) + { + return gGrad; + } + else + { + solution::cachePrintMessage("Deleting", name, vsf); + gGrad.release(); + delete &gGrad; + + 
solution::cachePrintMessage("Recalculating", name, vsf); + tmp tgGrad = gaussGradCalcGrad(vsf, name); + + solution::cachePrintMessage("Storing", name, vsf); + regIOobject::store(tgGrad.ptr()); + GradFieldType& gGrad = + mesh.objectRegistry::template lookupObjectRef + ( + name + ); + + return gGrad; + } + } + else + { + if (mesh.objectRegistry::template foundObject(name)) + { + GradFieldType& gGrad = + mesh.objectRegistry::template lookupObjectRef + ( + name + ); + + if (gGrad.ownedByRegistry()) + { + solution::cachePrintMessage("Deleting", name, vsf); + gGrad.release(); + delete &gGrad; + } + } + + solution::cachePrintMessage("Calculating", name, vsf); + return gaussGradCalcGrad(vsf, name); + } +} + +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradCalcGrad +( + const GeometricField& vsf, + const word& name +) +{ + const fvMesh& mesh = vsf.mesh(); + + tmp> tinterpScheme_ = + tmp> + ( + new linear(mesh) + ); + + typedef typename outerProduct::type GradType; + + tmp> tinterpolate = tinterpScheme_().interpolate(vsf); + + tmp> tgGrad + ( + gaussGradGradf(tinterpolate.ref(), name) + ); + GeometricField& gGrad = tgGrad.ref(); + + gaussGradCorrectBoundaryConditions(vsf, gGrad); + + return tgGrad; +} + +template +void gaussGradCorrectBoundaryConditions +( + const GeometricField& vsf, + GeometricField + < + typename outerProduct::type, fvPatchField, volMesh + >& gGrad +) +{ + typename GeometricField + < + typename outerProduct::type, fvPatchField, volMesh + >::Boundary& gGradbf = gGrad.boundaryFieldRef(); + + forAll(vsf.boundaryField(), patchi) + { + if (!vsf.boundaryField()[patchi].coupled()) + { + const vectorField n + ( + vsf.mesh().Sf().boundaryField()[patchi] + / vsf.mesh().magSf().boundaryField()[patchi] + ); + + gGradbf[patchi] += n * + ( + vsf.boundaryField()[patchi].snGrad() + - (n & gGradbf[patchi]) + ); + } + } +} + +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, 
+ volMesh + > +> +gaussGradGradf +( + const GeometricField& ssf, + const word& name +) +{ + typedef typename outerProduct::type GradType; + + const fvMesh& mesh = ssf.mesh(); + + tmp> tgGrad + ( + new GeometricField + ( + IOobject + ( + name, + ssf.instance(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensioned + ( + "0", + ssf.dimensions()/dimLength, + Zero + ), + extrapolatedCalculatedFvPatchField::typeName + ) + ); + GeometricField& gGrad = tgGrad.ref(); + + const labelUList& owner = mesh.owner(); + const labelUList& neighbour = mesh.neighbour(); + const vectorField& Sf = mesh.Sf(); + + Field& igGrad = gGrad; + const Field& issf = ssf; + + forAll(owner, facei) + { + GradType Sfssf = Sf[facei]*issf[facei]; + + igGrad[owner[facei]] += Sfssf; + igGrad[neighbour[facei]] -= Sfssf; + } + + forAll(mesh.boundary(), patchi) + { + const labelUList& pFaceCells = + mesh.boundary()[patchi].faceCells(); + + const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; + + const fvsPatchField& pssf = ssf.boundaryField()[patchi]; + + forAll(mesh.boundary()[patchi], facei) + { + igGrad[pFaceCells[facei]] += pSf[facei]*pssf[facei]; + if (pFaceCells[facei] == 0) + { + // Info << "CPU add = " << pSf[facei]*pssf[facei] << endl; + // Info << "surface CPU = " << pSf[facei] << endl; + // Info << "field CPU = " << pssf[facei] << endl; + } + } + } + + igGrad /= mesh.V(); + + gGrad.correctBoundaryConditions(); + + return tgGrad; +} + + +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradSchemeGrad +( + const GeometricField& vsf +); + + +template +tmp +< + GeometricField + < + typename outerProduct::type, + fvPatchField, + volMesh + > +> +gaussGradSchemeGrad +( + const GeometricField& vsf +); + +} // End namespace Foam + +// ************************************************************************* // \ No newline at end of file diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 
4e0bd4cbe..7eb8b9ec2 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -124,6 +124,7 @@ struct dfMatrixDataBase double *d_boundary_mag_sf = nullptr; double *d_boundary_weight = nullptr; double *d_boundary_delta_coeffs = nullptr; + int *d_boundary_face_cell = nullptr; // non-constant fields - internal // TODO: further estimate @@ -197,7 +198,7 @@ struct dfMatrixDataBase void initConstantFieldsInternal(const double *sf, const double *mag_sf, const double *weight, const double *delta_coeffs, const double *volume); void initConstantFieldsBoundary(const double *boundary_sf, const double *boundary_mag_sf, - const double *boundary_delta_coeffs); + const double *boundary_delta_coeffs, const int *boundary_face_cell); void createNonConstantFieldsInternal(); void createNonConstantFieldsBoundary(); diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index b426201a2..cb6a44d5f 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -198,6 +198,7 @@ void dfMatrixDataBase::createConstantFieldsBoundary() { checkCudaErrors(cudaMalloc((void**)&d_boundary_sf, boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_mag_sf, boundary_surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_delta_coeffs, boundary_surface_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_boundary_face_cell, boundary_surface_index_bytes)); fieldPointerMap["d_boundary_sf"] = d_boundary_sf; fieldPointerMap["d_boundary_mag_sf"] = d_boundary_mag_sf; fieldPointerMap["d_boundary_delta_coeffs"] = d_boundary_delta_coeffs; @@ -213,10 +214,11 @@ void dfMatrixDataBase::initConstantFieldsInternal(const double *sf, const double } void dfMatrixDataBase::initConstantFieldsBoundary(const double *boundary_sf, const double *boundary_mag_sf, - const double *boundary_delta_coeffs) { + const double *boundary_delta_coeffs, const int *boundary_face_cell) { checkCudaErrors(cudaMemcpyAsync(d_boundary_sf, boundary_sf, 
boundary_surface_value_vec_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_boundary_mag_sf, boundary_mag_sf, boundary_surface_value_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_boundary_delta_coeffs, boundary_delta_coeffs, boundary_surface_value_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(d_boundary_face_cell, boundary_face_cell, boundary_surface_index_bytes, cudaMemcpyHostToDevice, stream)); } void dfMatrixDataBase::createNonConstantFieldsInternal() { diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index 9de229b14..b4015f0a6 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -43,6 +43,18 @@ void fvc_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *vf_old, double *output); + +void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *Sf, const double *vf, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, + const double *volume, const double *boundary_mag_Sf, double *boundary_output, + const double *boundary_deltaCoeffs); + +void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, + const int *lowerAddr, const int *upperAddr, const double *ssf, const int *boundary_cell_face, + const double *boundary_ssf, const double *volume, double *output); // void fvc_grad_surface(); // // void fvc_div_cell(); diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index f55b6895a..e397b4232 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -192,6 +192,295 @@ __global__ void fvc_ddt_scalar_kernel(int num_cells, double rDeltaT, output[index] += rDeltaT 
// ---------------------------------------------------------------------------
// Device kernels and host wrappers for the explicit (fvc) operators:
//   - fvc::grad of a cell-centred vector field (Gauss theorem, linear
//     face interpolation, with uncoupled boundary-condition correction)
//   - fvc::div of a surface scalar field (surface integration)
// Storage conventions (assumed throughout this file):
//   vectors : field[cell * 3 + component]
//   tensors : field[cell * 9 + row * 3 + col]   (row-major per cell)
// ---------------------------------------------------------------------------

// Internal-face contribution of the Gauss gradient.
// For each face: interpolate vf to the face, ssf = w*vf_own + (1-w)*vf_nei,
// then accumulate the outer product Sf (x) ssf into the owner cell and
// subtract it from the neighbour cell.  atomicAdd is required because many
// faces share the same cell.
__global__ void fvc_grad_vector_internal(int num_surfaces,
        const int *lower_index, const int *upper_index, const double *face_vector,
        const double *weight, const double *field_vector,
        double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_surfaces)
        return;

    double w = weight[index];
    int owner = lower_index[index];
    int neighbor = upper_index[index];

    // linear face interpolation: ssf = w*(vf_own - vf_nei) + vf_nei
    double ssf[3];
    #pragma unroll
    for (int c = 0; c < 3; ++c) {
        double vo = field_vector[owner * 3 + c];
        double vn = field_vector[neighbor * 3 + c];
        ssf[c] = w * (vo - vn) + vn;
    }

    #pragma unroll
    for (int r = 0; r < 3; ++r) {
        double sf = face_vector[index * 3 + r];
        #pragma unroll
        for (int c = 0; c < 3; ++c) {
            double grad = sf * ssf[c];
            atomicAdd(&(output[owner * 9 + r * 3 + c]), grad);
            atomicAdd(&(output[neighbor * 9 + r * 3 + c]), -grad);
        }
    }
}

// Boundary-face contribution of the Gauss gradient for uncoupled patches:
// the face value of vf is the patch value itself, so accumulate
// Sf_b (x) vf_b into the adjacent cell only.
// TODO: this function is implemented for uncoupled boundary conditions,
// so it should use a more specific function name.
__global__ void fvc_grad_vector_boundary(int num, int offset, const int *face2Cells,
        const double *boundary_face_vector, const double *boundary_field_vector, double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    int start_index = offset + index;   // global boundary-face index
    int cellIndex = face2Cells[start_index];

    #pragma unroll
    for (int r = 0; r < 3; ++r) {
        double sf = boundary_face_vector[start_index * 3 + r];
        #pragma unroll
        for (int c = 0; c < 3; ++c) {
            atomicAdd(&(output[cellIndex * 9 + r * 3 + c]),
                      sf * boundary_field_vector[start_index * 3 + c]);
        }
    }
}

// Divide an accumulated cell tensor field by the cell volumes
// (final step of the Gauss surface integration).
__global__ void divide_cell_volume_tsr(int num_cells, const double* volume, double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    double rvol = 1.0 / volume[index];
    #pragma unroll
    for (int i = 0; i < 9; ++i)
        output[index * 9 + i] *= rvol;
}

// Divide an accumulated cell scalar field by the cell volumes.
__global__ void divide_cell_volume_scalar(int num_cells, const double* volume, double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_cells)
        return;

    output[index] = output[index] / volume[index];
}

// Evaluate the boundary gradient for a zeroGradient patch:
// replace the surface-normal component of the interior gradient so that
// n . grad(vf) = 0 on the face, keeping the tangential part.
// NOTE: `vf` is kept in the signature for interface symmetry with the
// fixedValue variant but is not needed when sn_grad == 0.
__global__ void fvc_grad_vector_correctBC_zeroGradient(int num, int offset, const int *face2Cells,
        const double *internal_grad, const double *vf, const double *boundary_sf,
        const double *boundary_mag_sf, double *boundary_grad)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    int start_index = offset + index;
    int cellIndex = face2Cells[start_index];

    double g[9];
    #pragma unroll
    for (int i = 0; i < 9; ++i)
        g[i] = internal_grad[cellIndex * 9 + i];

    // fix: the face geometry must be indexed by the boundary-face index,
    // not by the adjacent cell index (matches the fixedValue kernel).
    double n[3];
    #pragma unroll
    for (int r = 0; r < 3; ++r)
        n[r] = boundary_sf[start_index * 3 + r] / boundary_mag_sf[start_index];

    #pragma unroll
    for (int c = 0; c < 3; ++c) {
        // sn_grad = 0 for zeroGradient
        double corr = -(n[0] * g[0 * 3 + c] + n[1] * g[1 * 3 + c] + n[2] * g[2 * 3 + c]);
        #pragma unroll
        for (int r = 0; r < 3; ++r)
            boundary_grad[start_index * 9 + r * 3 + c] = g[r * 3 + c] + n[r] * corr;
    }
}

// Evaluate the boundary gradient for a fixedValue patch:
// replace the surface-normal component of the interior gradient with the
// one-sided surface-normal gradient deltaCoeffs * (vf_b - vf_cell).
__global__ void fvc_grad_vector_correctBC_fixedValue(int num, int offset, const int *face2Cells,
        const double *internal_grad, const double *vf, const double *boundary_sf,
        const double *boundary_mag_sf, double *boundary_grad,
        const double *boundary_deltaCoeffs, const double *boundary_vf)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num)
        return;

    int start_index = offset + index;
    int cellIndex = face2Cells[start_index];

    double g[9];
    #pragma unroll
    for (int i = 0; i < 9; ++i)
        g[i] = internal_grad[cellIndex * 9 + i];

    double n[3];
    #pragma unroll
    for (int r = 0; r < 3; ++r)
        n[r] = boundary_sf[start_index * 3 + r] / boundary_mag_sf[start_index];

    #pragma unroll
    for (int c = 0; c < 3; ++c) {
        // sn_grad: one-sided gradient implied by the fixedValue BC
        double sn_grad = boundary_deltaCoeffs[start_index]
                * (boundary_vf[start_index * 3 + c] - vf[cellIndex * 3 + c]);
        double corr = sn_grad
                - (n[0] * g[0 * 3 + c] + n[1] * g[1 * 3 + c] + n[2] * g[2 * 3 + c]);
        #pragma unroll
        for (int r = 0; r < 3; ++r)
            boundary_grad[start_index * 9 + r * 3 + c] = g[r * 3 + c] + n[r] * corr;
    }
}

// Internal-face contribution of fvc::div of a surface scalar field:
// add the face flux to the owner cell, subtract it from the neighbour.
__global__ void fvc_div_surface_scalar_internal(int num_surfaces,
        const int *lower_index, const int *upper_index, const double *ssf,
        double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_surfaces)
        return;

    double issf = ssf[index];
    atomicAdd(&(output[lower_index[index]]), issf);   // owner
    atomicAdd(&(output[upper_index[index]]), -issf);  // neighbour
}

// Boundary-face contribution of fvc::div of a surface scalar field.
__global__ void fvc_div_surface_scalar_boundary(int num_boundary_face, const int *face2Cells,
        const double *boundary_ssf, double *output)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= num_boundary_face)
        return;

    atomicAdd(&(output[face2Cells[index]]), boundary_ssf[index]);
}

// Host wrapper: Gauss gradient of a cell-centred vector field.
// Accumulates internal and boundary face contributions into `output`,
// divides by the cell volumes, then evaluates the patch gradients into
// `boundary_output` according to each patch's boundary condition.
void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces,
        const int *lowerAddr, const int *upperAddr,
        const double *weight, const double *Sf, const double *vf, double *output, // end for internal
        int num_patches, const int *patch_size, const int *patch_type,
        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf,
        const double *volume, const double *boundary_mag_Sf, double *boundary_output,
        const double *boundary_deltaCoeffs)
{
    size_t threads_per_block = 1024;
    size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block;
    fvc_grad_vector_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(
            num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output);

    // finish constructing the grad field, except for dividing by cell volume
    int offset = 0;
    for (int i = 0; i < num_patches; i++) {
        threads_per_block = 256;
        blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
        // TODO: just basic patch types for now
        if (patch_type[i] == boundaryConditions::zeroGradient
                || patch_type[i] == boundaryConditions::fixedValue) {
            // TODO: just the vector version for now
            fvc_grad_vector_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(
                    patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output);
        } else {
            // fix: was a dead `else if (0)` branch, so unsupported boundary
            // conditions were silently ignored
            fprintf(stderr, "boundaryConditions other than zeroGradient/fixedValue are not supported yet!\n");
        }
        offset += patch_size[i];
    }

    // divide by cell volume
    threads_per_block = 1024;
    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
    divide_cell_volume_tsr<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, volume, output);

    // correct boundary conditions
    offset = 0;
    for (int i = 0; i < num_patches; i++) {
        threads_per_block = 256;
        blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
        // fix: the correctBC kernels read the *internal* field at the adjacent
        // cell, so they must receive `vf`, not `boundary_vf`
        if (patch_type[i] == boundaryConditions::zeroGradient) {
            fvc_grad_vector_correctBC_zeroGradient<<<blocks_per_grid, threads_per_block, 0, stream>>>(
                    patch_size[i], offset, boundary_cell_face,
                    output, vf, boundary_Sf, boundary_mag_Sf, boundary_output);
        } else if (patch_type[i] == boundaryConditions::fixedValue) {
            fvc_grad_vector_correctBC_fixedValue<<<blocks_per_grid, threads_per_block, 0, stream>>>(
                    patch_size[i], offset, boundary_cell_face,
                    output, vf, boundary_Sf, boundary_mag_Sf, boundary_output,
                    boundary_deltaCoeffs, boundary_vf);
        } else {
            fprintf(stderr, "boundaryConditions other than zeroGradient/fixedValue are not supported yet!\n");
        }
        offset += patch_size[i];
    }
}

// Host wrapper: fvc::div of a surface scalar field.
// Surface-integrates the internal and boundary fluxes into `output`,
// then divides by the cell volumes.
void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces,
        const int *lowerAddr, const int *upperAddr, const double *ssf, const int *boundary_cell_face,
        const double *boundary_ssf, const double *volume, double *output)
{
    size_t threads_per_block = 1024;
    size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block;
    fvc_div_surface_scalar_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(
            num_surfaces, lowerAddr, upperAddr, ssf, output);

    blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block;
    fvc_div_surface_scalar_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(
            num_boundary_surfaces, boundary_cell_face, boundary_ssf, output);

    // divide by cell volume
    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
    divide_cell_volume_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, volume, output);
}
*stride, size_t *internal_size, size_t *boundary_size) { *boundary_size = dfDataBase.num_boundary_surfaces * s; } + +template +void getFieldPtr(std::queue& fieldPtrQue, T& field){ + fieldPtrQue.push(&field[0]); + forAll(field.boundaryField(), patchi){ + auto& patchField = field.boundaryFieldRef()[patchi]; + fieldPtrQue.push(&patchField[0]); + } +}; + +// template +// void getFieldPtr(std::vector& fieldPtrQue, T& field){ +// fieldPtrQue.push_back(&field[0]); +// forAll(field.boundaryField(), patchi){ +// auto& patchField = field.boundaryFieldRef()[patchi]; +// fieldPtrQue.push_back(&patchField[0]); +// Info << "patchi " << patchi << endl; +// } +// }; + + template void randomInitField(T& field) { size_t stride = 0; size_t internal_size = 0; size_t boundary_size = 0; getTypeInfo(&stride, &internal_size, &boundary_size); - size_t internal_value_bytes = internal_size * sizeof(double); + size_t internal_value_bytes = internal_size * sizeof(double) * stride; + std::queue fieldPtrQue; + // std::vector fieldPtrQue; + getFieldPtr(fieldPtrQue, field); // random init field value to (-0.5, 0.5) // internal - double *field_internal_ptr = &field[0]; + double *&field_internal_ptr = fieldPtrQue.front(); fieldPtrQue.pop(); + // double *field_internal_ptr = fieldPtrQue[0]; std::vector init_field_internal; - init_field_internal.resize(internal_size); - for (size_t i = 0; i < internal_size; i++) { + init_field_internal.resize(internal_size * stride); + for (size_t i = 0; i < internal_size * stride; i++) { init_field_internal[i] = (rand() % 10000 - 5000) / 10000.0; } memcpy(field_internal_ptr, init_field_internal.data(), internal_value_bytes); // boundary + int ptrIndex = 1; forAll(field.boundaryField(), patchi) { auto& patchField = field.boundaryFieldRef()[patchi]; size_t patchsize = patchField.size(); - double *field_boundary_ptr = &patchField[0]; + double *&field_boundary_ptr = fieldPtrQue.front(); fieldPtrQue.pop(); + // double *field_boundary_ptr = fieldPtrQue[ptrIndex]; + // 
ptrIndex ++; std::vector init_field_boundary; init_field_boundary.resize(patchsize * stride); for (size_t i = 0; i < patchsize * stride; i++) { @@ -296,7 +324,8 @@ void test_fvm_ddt_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc } // run CPU - fvVectorMatrix dfMatrix = fvm::ddt(rho, U); + // fvVectorMatrix dfMatrix = fvm::ddt(rho, U); + fvVectorMatrix dfMatrix = EulerDdtSchemeFvmDdt(rho, U); // prepare for run GPU // prepare rho, rho.old, U @@ -325,7 +354,8 @@ void test_fvm_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfa } // run CPU - fvVectorMatrix dfMatrix = fvm::div(phi, U); + // fvVectorMatrix dfMatrix = fvm::div(phi, U); + fvVectorMatrix dfMatrix = gaussConvectionSchemeFvmDiv(phi, U); // prepare for run GPU // prepare phi field @@ -365,7 +395,8 @@ void test_fvm_laplacian_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, } // run CPU - fvVectorMatrix dfMatrix = fvm::laplacian(gamma, U); + // fvVectorMatrix dfMatrix = fvm::laplacian(gamma, U); + fvVectorMatrix dfMatrix = gaussLaplacianSchemeFvmLaplacian(gamma, U); // prepare for run GPU // prepare gamma on GPU @@ -415,7 +446,8 @@ void test_fvc_ddt_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc } // run CPU - volScalarField fvc_ouput_scalar = fvc::ddt(rho, K); + // volScalarField fvc_ouput_scalar = fvc::ddt(rho, K); + volScalarField fvc_ouput_scalar = EulerDdtSchemeFvcDdt(rho, K); // prepare for run GPU // prepare rho, rho.old on GPU @@ -453,10 +485,10 @@ void test_fvc_ddt_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volSc // unittest of fvc::grad(U) void test_fvc_grad_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volVectorField& U, initType type) { - // if (type == initType::randomInit) { - // U.oldTime(); - // randomInitField(U); - // } + if (type == initType::randomInit) { + U.oldTime(); + randomInitField(U); + } // run CPU volTensorField fvc_ouput_tensor = fvc::grad(U); @@ -519,3 +551,60 @@ void 
test_fvc_div_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, surfa checkCudaErrors(cudaMemcpy(h_fvc_ouput_scalar.data(), d_fvc_ouput_scalar, dfDataBase.cell_value_bytes, cudaMemcpyDeviceToHost)); checkVectorEqual(dfDataBase.num_cells, &fvc_ouput_scalar[0], h_fvc_ouput_scalar.data(), 1e-14, printFlag); } + +// unittest of fvc::div(U) +void test_fvc_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volVectorField& U, initType type) { + if (type == initType::randomInit) { + U.oldTime(); + randomInitField(U); + } + + // run CPU + // volScalarField fvc_ouput_scalar = fvc::div(U); + volScalarField fvc_ouput_scalar = gaussDivFvcdiv(U); + + // prepare for run GPU + // prepare phi on GPU + uploadRegisteredField(dfDataBase, U, "u"); + + double *d_fvc_ouput_scalar = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_scalar, dfDataBase.cell_value_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_scalar, 0, dfDataBase.cell_value_bytes)); + + // only need patch_type + testGPUDataBase testData; + buildTestGPUDataBase(dfDataBase, testData, U, false, false, false, false, false, false, false, false, false, false); + + fvc_div_cell_vector(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.num_surfaces, + dfDataBase.d_owner, dfDataBase.d_neighbor, + dfDataBase.d_weight, dfDataBase.d_sf, dfDataBase.d_u, d_fvc_ouput_scalar, + dfDataBase.num_patches, dfDataBase.patch_size.data(), testData.patch_type.data(), + dfDataBase.d_boundary_face_cell, dfDataBase.d_boundary_u, dfDataBase.d_boundary_sf, + dfDataBase.d_volume); + + // compare result + bool printFlag = false; + std::vector h_fvc_ouput_scalar(dfDataBase.num_cells); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_scalar.data(), d_fvc_ouput_scalar, dfDataBase.cell_value_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells, &fvc_ouput_scalar[0], h_fvc_ouput_scalar.data(), 1e-14, printFlag); +} + + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // +template <> +void 
getFieldPtr(std::queue& fieldPtrQue, volVectorField& field) { + fieldPtrQue.push(&field[0][0]); + forAll(field.boundaryField(), patchi){ + auto& patchField = field.boundaryFieldRef()[patchi]; + fieldPtrQue.push(&patchField[0][0]); + } +}; + +template <> +void getFieldPtr(std::queue& fieldPtrQue, volTensorField& field) { + fieldPtrQue.push(&field[0][0]); + forAll(field.boundaryField(), patchi){ + auto& patchField = field.boundaryFieldRef()[patchi]; + fieldPtrQue.push(&patchField[0][0]); + } +}; \ No newline at end of file diff --git a/GPUTest/unittest.C b/GPUTest/unittest.C index ccbaefa71..0577b3d2e 100644 --- a/GPUTest/unittest.C +++ b/GPUTest/unittest.C @@ -51,6 +51,8 @@ Description // debug #include "GenFvMatrix.H" +#include +#include #include "dfMatrixDataBase.H" #include "dfMatrixOpBase.H" @@ -132,12 +134,20 @@ int main(int argc, char *argv[]) // unittest of fvc::grad(U) test_fvc_grad_vector(dfDataBase, mesh, U, initType::original); DEBUG_TRACE; + test_fvc_grad_vector(dfDataBase, mesh, U, initType::randomInit); + DEBUG_TRACE; // unittest of fvc::div(phi) test_fvc_div_scalar(dfDataBase, mesh, phi, initType::original); DEBUG_TRACE; test_fvc_div_scalar(dfDataBase, mesh, phi, initType::randomInit); DEBUG_TRACE; + + // unittest of fvc::div(U) + test_fvc_div_vector(dfDataBase, mesh, U, initType::original); + DEBUG_TRACE; + test_fvc_div_vector(dfDataBase, mesh, U, initType::randomInit); + DEBUG_TRACE; } return 0; } diff --git a/GPUTestRef/EulerDdtScheme.C b/GPUTestRef/EulerDdtScheme.C new file mode 100644 index 000000000..0875e0033 --- /dev/null +++ b/GPUTestRef/EulerDdtScheme.C @@ -0,0 +1,322 @@ +/*---------------------------------------------------------------------------*\ + ========= | + \\ / F ield | OpenFOAM: The Open Source CFD Toolbox + \\ / O peration | Website: https://openfoam.org + \\ / A nd | Copyright (C) 2011-2018 OpenFOAM Foundation + \\/ M anipulation | +------------------------------------------------------------------------------- +License + This 
file is part of OpenFOAM. + + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . + +\*---------------------------------------------------------------------------*/ + +#include "GenFvMatrix.H" + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +namespace Foam +{ + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +// namespace fv +// { + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +template +tmp> +EulerDdtSchemeFvmDdt +( + const volScalarField& rho, + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + + tmp> tfvm + ( + new fvMatrix + ( + vf, + rho.dimensions()*vf.dimensions()*dimVol/dimTime + ) + ); + fvMatrix& fvm = tfvm.ref(); + + scalar rDeltaT = 1.0/mesh.time().deltaTValue(); + + fvm.diag() = rDeltaT*rho.primitiveField()*mesh.Vsc(); + + if (mesh.moving()) + { + fvm.source() = rDeltaT + *rho.oldTime().primitiveField() + *vf.oldTime().primitiveField()*mesh.Vsc0(); + } + else + { + fvm.source() = rDeltaT + *rho.oldTime().primitiveField() + *vf.oldTime().primitiveField()*mesh.Vsc(); + } + return tfvm; +} + +template +tmp> +EulerDdtSchemeFvcDdt +( + const volScalarField& rho, + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + + dimensionedScalar rDeltaT = 1.0/mesh.time().deltaT(); + + IOobject ddtIOobject + ( + "ddt("+rho.name()+','+vf.name()+')', + mesh.time().timeName(), + mesh + ); + + if 
(mesh.moving()) + { + return tmp> + ( + new GeometricField + ( + ddtIOobject, + rDeltaT* + ( + rho()*vf() + - rho.oldTime()() + *vf.oldTime()()*mesh.Vsc0()/mesh.Vsc() + ), + rDeltaT.value()* + ( + rho.boundaryField()*vf.boundaryField() + - rho.oldTime().boundaryField() + *vf.oldTime().boundaryField() + ) + ) + ); + } + else + { + return tmp> + ( + new GeometricField + ( + ddtIOobject, + rDeltaT*(rho*vf - rho.oldTime()*vf.oldTime()) + ) + ); + } +} + + +tmp +EulerDdtSchemeFvcDdtCorr +( + const volScalarField& rho, + const volVectorField& U, + const surfaceScalarField& phi, + const autoPtr& Uf +) +{ + Info << "EulerDdtSchemeFvcDdtCorr start" << endl; + + const fvMesh& mesh = U.mesh(); + + dimensionedScalar rDeltaT = 1.0/mesh.time().deltaT(); + + GeometricField rhoU0 + ( + rho.oldTime() * U.oldTime() + ); + + surfaceScalarField phiCorr + ( + phi.oldTime() - fvc::dotInterpolate(mesh.Sf(), rhoU0) + ); + + return tmp + ( + new surfaceScalarField + ( + IOobject + ( + "ddtCorr(" + + rho.name() + ',' + U.name() + ',' + phi.name() + ')', + mesh.time().timeName(), + mesh + ), + EulerDdtSchemeFvcDdtPhiCoeff + ( + rhoU0, + phi.oldTime(), + phiCorr, + rho.oldTime() + )*rDeltaT*phiCorr + ) + ); + +} + +tmp +EulerDdtSchemeFvcDdtPhiCoeff +( + const volVectorField& U, + const surfaceScalarField& phi, + const surfaceScalarField& phiCorr, + const volScalarField& rho +) +{ + const fvMesh& mesh = U.mesh(); + tmp tddtCouplingCoeff = scalar(1) - min(mag(phiCorr)/(mag(phi) + dimensionedScalar("small", phi.dimensions(), SMALL)),scalar(1)); + + surfaceScalarField& ddtCouplingCoeff = tddtCouplingCoeff.ref(); + + surfaceScalarField::Boundary& ccbf = ddtCouplingCoeff.boundaryFieldRef(); + + forAll(U.boundaryField(), patchi) + { + if + ( U.boundaryField()[patchi].fixesValue() + || isA(mesh.boundary()[patchi]) + ) + { + ccbf[patchi] = 0.0; + } + } + + return tddtCouplingCoeff; +} + +template +tmp> +EulerDdtSchemeFvmDdt +( + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + + tmp> 
tfvm + ( + new fvMatrix + ( + vf, + vf.dimensions()*dimVol/dimTime + ) + ); + + fvMatrix& fvm = tfvm.ref(); + + scalar rDeltaT = 1.0/mesh.time().deltaTValue(); + + fvm.diag() = rDeltaT*mesh.Vsc(); + + if (mesh.moving()) + { + fvm.source() = rDeltaT*vf.oldTime().primitiveField()*mesh.Vsc0(); + } + else + { + fvm.source() = rDeltaT*vf.oldTime().primitiveField()*mesh.Vsc(); + } + + return tfvm; +} + +template +tmp> +EulerDdtSchemeFvcDdt +( + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + + dimensionedScalar rDeltaT = 1.0/mesh.time().deltaT(); + + IOobject ddtIOobject + ( + "ddt("+vf.name()+')', + mesh.time().timeName(), + mesh + ); + + return tmp> + ( + new GeometricField + ( + ddtIOobject, + rDeltaT*(vf - vf.oldTime()) + ) + ); +} + +template +tmp> +EulerDdtSchemeFvmDdt +( + const volScalarField& rho, + const GeometricField& vf +); + +template +tmp> +EulerDdtSchemeFvmDdt +( + const volScalarField& rho, + const GeometricField& vf +); + +template +tmp> +EulerDdtSchemeFvcDdt +( + const volScalarField& rho, + const GeometricField& vf +); + +template +tmp> +EulerDdtSchemeFvmDdt +( + const GeometricField& vf +); + +template +tmp> +EulerDdtSchemeFvcDdt +( + const GeometricField& vf +); + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +// } // End namespace fv + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +} // End namespace Foam + +// ************************************************************************* // \ No newline at end of file diff --git a/GPUTestRef/GenFvMatrix.H b/GPUTestRef/GenFvMatrix.H index d328fe504..d76fa94d9 100644 --- a/GPUTestRef/GenFvMatrix.H +++ b/GPUTestRef/GenFvMatrix.H @@ -139,6 +139,19 @@ gaussConvectionSchemeFvcDiv const GeometricField& ssf ); +template +tmp +< + GeometricField + < + typename innerProduct::type, fvPatchField, volMesh + > +> +gaussDivFvcdiv +( + const GeometricField& vf +); + // fvc::grad template tmp diff --git a/GPUTestRef/Make/files 
b/GPUTestRef/Make/files index 1137c3eed..314f1f495 100644 --- a/GPUTestRef/Make/files +++ b/GPUTestRef/Make/files @@ -1,4 +1,6 @@ gaussGrad.C gaussConvectionScheme.C +gaussLaplacianScheme.C +EulerDdtScheme.C LIB = $(DF_LIBBIN)/libdfGenMatrix \ No newline at end of file diff --git a/GPUTestRef/gaussConvectionScheme.C b/GPUTestRef/gaussConvectionScheme.C index 83d11bda9..b8157d2d1 100644 --- a/GPUTestRef/gaussConvectionScheme.C +++ b/GPUTestRef/gaussConvectionScheme.C @@ -164,6 +164,42 @@ gaussConvectionSchemeFvcDiv ); } +template +tmp +< + GeometricField + < + typename innerProduct::type, fvPatchField, volMesh + > +> +gaussDivFvcdiv +( + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + Istream& divIntScheme = mesh.divScheme("div("+vf.name()+')'); + word divScheme(divIntScheme); + + tmp> tinterpScheme_ = + surfaceInterpolationScheme::New(mesh, divIntScheme); + + tmp + < + GeometricField + ::type, fvPatchField, volMesh> + > tDiv + ( + fvcSurfaceIntegrate + ( + (tinterpScheme_().dotInterpolate(mesh.Sf(), vf))() + ) + ); + + + return tDiv; +} + template tmp> fvcSurfaceIntegrate @@ -222,7 +258,6 @@ void fvcSurfaceIntegrate ivf[owner[facei]] += issf[facei]; ivf[neighbour[facei]] -= issf[facei]; } - Info << "ivfcpu[473]before bou = " << ivf[473] << endl; forAll(mesh.boundary(), patchi) { @@ -234,19 +269,10 @@ void fvcSurfaceIntegrate forAll(mesh.boundary()[patchi], facei) { ivf[pFaceCells[facei]] += pssf[facei]; - if (pFaceCells[facei] == 473) - { - Info << "pssfcpu[473] += " << pssf[facei] << endl; - } - } } - Info << "ivfcpu[473] = " << ivf[473] << endl; - ivf /= mesh.Vsc(); - - printf("vol cpu = %.15e\n", mesh.Vsc()()[473]); } template @@ -298,6 +324,26 @@ gaussConvectionSchemeFvcDiv const GeometricField& ssf ); +template +tmp> +gaussConvectionSchemeFvcDiv +( + const GeometricField& ssf +); + +template +tmp +< + GeometricField + < + typename innerProduct::type, fvPatchField, volMesh + > +> +gaussDivFvcdiv +( + const GeometricField& vf +); + // * * * * 
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // } // End namespace Foam diff --git a/GPUTestRef/gaussLaplacianScheme.C b/GPUTestRef/gaussLaplacianScheme.C new file mode 100644 index 000000000..ed321ceda --- /dev/null +++ b/GPUTestRef/gaussLaplacianScheme.C @@ -0,0 +1,273 @@ +/*---------------------------------------------------------------------------* + ========= | + / F ield | OpenFOAM: The Open Source CFD Toolbox + / O peration | Website: https://openfoam.org + / A nd | Copyright (C) 2011-2018 OpenFOAM Foundation +/ M anipulation | +------------------------------------------------------------------------------- +License + This file is part of OpenFOAM. + + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . 
+ +*---------------------------------------------------------------------------*/ + +#include "gaussLaplacianScheme.H" +#include "surfaceInterpolate.H" +#include "fvcDiv.H" +#include "fvcGrad.H" +#include "fvMatrices.H" +#include "snGradScheme.H" +#include "linear.H" +#include "orthogonalSnGrad.H" + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +namespace Foam +{ + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +template +tmp> +gaussLaplacianSchemeFvmLaplacianUncorrected +( + const surfaceScalarField& gammaMagSf, + const surfaceScalarField& deltaCoeffs, + const GeometricField& vf +) +{ + tmp> tfvm + ( + new fvMatrix + ( + vf, + deltaCoeffs.dimensions()*gammaMagSf.dimensions()*vf.dimensions() + ) + ); + fvMatrix& fvm = tfvm.ref(); + + fvm.upper() = deltaCoeffs.primitiveField()*gammaMagSf.primitiveField(); + fvm.negSumDiag(); + + forAll(vf.boundaryField(), patchi) + { + const fvPatchField& pvf = vf.boundaryField()[patchi]; + const fvsPatchScalarField& pGamma = gammaMagSf.boundaryField()[patchi]; + const fvsPatchScalarField& pDeltaCoeffs = + deltaCoeffs.boundaryField()[patchi]; + + if (pvf.coupled()) + { + fvm.internalCoeffs()[patchi] = + pGamma*pvf.gradientInternalCoeffs(pDeltaCoeffs); + fvm.boundaryCoeffs()[patchi] = + -pGamma*pvf.gradientBoundaryCoeffs(pDeltaCoeffs); + } + else + { + fvm.internalCoeffs()[patchi] = pGamma*pvf.gradientInternalCoeffs(); + fvm.boundaryCoeffs()[patchi] = -pGamma*pvf.gradientBoundaryCoeffs(); + } + } + + return tfvm; +} + + +template +tmp> +gaussLaplacianSchemeGammaSnGradCorr +( + const surfaceVectorField& SfGammaCorr, + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + + tmp> tgammaSnGradCorr + ( + new GeometricField + ( + IOobject + ( + "gammaSnGradCorr("+vf.name()+')', + vf.instance(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + 
SfGammaCorr.dimensions() + *vf.dimensions()*mesh.deltaCoeffs().dimensions() + ) + ); + + for (direction cmpt = 0; cmpt < pTraits::nComponents; cmpt++) + { + tgammaSnGradCorr.ref().replace + ( + cmpt, + fvc::dotInterpolate(SfGammaCorr, fvc::grad(vf.component(cmpt))) + ); + } + + return tgammaSnGradCorr; +} + +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gammaScalarVol, + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + tmp> tinterpGammaScheme_(new linear(mesh)); + tmp> tsnGradScheme_(new fv::orthogonalSnGrad(mesh)); + + tmp> tgamma = tinterpGammaScheme_().interpolate(gammaScalarVol); + const GeometricField& gamma = tgamma.ref(); + + GeometricField gammaMagSf + ( + gamma*mesh.magSf() + ); + + tmp> tfvm = gaussLaplacianSchemeFvmLaplacianUncorrected + ( + gammaMagSf, + tsnGradScheme_().deltaCoeffs(vf), + vf + ); + fvMatrix& fvm = tfvm.ref(); + + if (tsnGradScheme_().corrected()) + { + if (mesh.fluxRequired(vf.name())) + { + fvm.faceFluxCorrectionPtr() = new + GeometricField + ( + gammaMagSf*tsnGradScheme_().correction(vf) + ); + + fvm.source() -= + mesh.V()* + fvc::div + ( + *fvm.faceFluxCorrectionPtr() + )().primitiveField(); + } + else + { + fvm.source() -= + mesh.V()* + fvc::div + ( + gammaMagSf*tsnGradScheme_().correction(vf) + )().primitiveField(); + } + } + return tfvm; +} + +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gamma, + const GeometricField& vf +) +{ + const fvMesh& mesh = vf.mesh(); + tmp> tsnGradScheme_(new fv::orthogonalSnGrad(mesh)); + + GeometricField gammaMagSf + ( + gamma*mesh.magSf() + ); + + tmp> tfvm = gaussLaplacianSchemeFvmLaplacianUncorrected + ( + gammaMagSf, + tsnGradScheme_().deltaCoeffs(vf), + vf + ); + fvMatrix& fvm = tfvm.ref(); + + if (tsnGradScheme_().corrected()) + { + if (mesh.fluxRequired(vf.name())) + { + fvm.faceFluxCorrectionPtr() = new + GeometricField + ( + gammaMagSf*tsnGradScheme_().correction(vf) + ); + + fvm.source() -= + mesh.V()* + fvc::div 
+ ( + *fvm.faceFluxCorrectionPtr() + )().primitiveField(); + } + else + { + fvm.source() -= + mesh.V()* + fvc::div + ( + gammaMagSf*tsnGradScheme_().correction(vf) + )().primitiveField(); + } + } + return tfvm; +} + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gammaScalarVol, + const GeometricField& vf +); + +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gammaScalarVol, + const GeometricField& vf +); + +template +tmp> +gaussLaplacianSchemeFvmLaplacian +( + const GeometricField& gamma, + const GeometricField& vf +); + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +} // End namespace Foam + +// ************************************************************************* // \ No newline at end of file diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index b4015f0a6..35ac78c82 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -55,7 +55,11 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, const int *lowerAddr, const int *upperAddr, const double *ssf, const int *boundary_cell_face, const double *boundary_ssf, const double *volume, double *output); -// void fvc_grad_surface(); -// -// void fvc_div_cell(); + +void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *Sf, const double *vf, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, + const double *volume); diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index e397b4232..39ad1f6a3 100644 --- 
a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -244,13 +244,6 @@ __global__ void fvc_grad_vector_internal(int num_surfaces, atomicAdd(&(output[neighbor * 9 + 6]), -grad_zx); atomicAdd(&(output[neighbor * 9 + 7]), -grad_zy); atomicAdd(&(output[neighbor * 9 + 8]), -grad_zz); - - // if (owner == 0) - // { - // printf("tensor[0] = (%.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e)\n", output[owner * 9 + 0], - // output[owner * 9 + 1], output[owner * 9 + 2], output[owner * 9 + 3], output[owner * 9 + 4], output[owner * 9 + 5], - // output[owner * 9 + 6], output[owner * 9 + 7], output[owner * 9 + 8]); - // } } // update boundary of interpolation field @@ -276,12 +269,6 @@ __global__ void fvc_grad_vector_boundary(int num, int offset, const int *face2Ce int cellIndex = face2Cells[start_index]; - // if (cellIndex == 0) - // { - // printf("surface vector = (%.5e, %.5e, %.5e)\n field vector = (%.5e, %.5e, %.5e)\n", bouSfx, bouSfy, bouSfz, - // boussfx, boussfy, boussfz); - // } - double grad_xx = bouSfx * boussfx; double grad_xy = bouSfx * boussfy; double grad_xz = bouSfx * boussfz; @@ -329,11 +316,6 @@ __global__ void divide_cell_volume_scalar(int num_cells, const double* volume, d double vol = volume[index]; - if (index == 473) - { - printf("vol gpu = %.15e\n", vol); - } - output[index] = output[index] / vol; } @@ -451,11 +433,6 @@ __global__ void fvc_div_surface_scalar_internal(int num_surfaces, // neighbor atomicAdd(&(output[neighbor]), -issf); - - if (index == 0) - { - printf("output[3511]before = %.5e\n", output[473]); - } } __global__ void fvc_div_surface_scalar_boundary(int num_boundary_face, const int *face2Cells, @@ -468,17 +445,61 @@ __global__ void fvc_div_surface_scalar_boundary(int num_boundary_face, const int int cellIndex = face2Cells[index]; atomicAdd(&(output[cellIndex]), boundary_ssf[index]); +} - // if (index == 0) - // { - // printf("output[3511] = %.5e\n", output[3511]); - // } +__global__ void fvc_div_cell_vector_internal(int 
num_surfaces, + const int *lower_index, const int *upper_index, + const double *field_vector, const double *weight, const double *face_vector, + double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_surfaces) + return; + + double w = weight[index]; + double Sfx = face_vector[index * 3 + 0]; + double Sfy = face_vector[index * 3 + 1]; + double Sfz = face_vector[index * 3 + 2]; + + int owner = lower_index[index]; + int neighbor = upper_index[index]; + + double ssfx = (w * (field_vector[owner * 3 + 0] - field_vector[neighbor * 3 + 0]) + field_vector[neighbor * 3 + 0]); + double ssfy = (w * (field_vector[owner * 3 + 1] - field_vector[neighbor * 3 + 1]) + field_vector[neighbor * 3 + 1]); + double ssfz = (w * (field_vector[owner * 3 + 2] - field_vector[neighbor * 3 + 2]) + field_vector[neighbor * 3 + 2]); + + double div = Sfx * ssfx + Sfy * ssfy + Sfz * ssfz; + + // owner + atomicAdd(&(output[owner]), div); + + // neighbour + atomicAdd(&(output[neighbor]), -div); +} + +__global__ void fvc_div_cell_vector_boundary(int num, int offset, const int *face2Cells, + const double *boundary_face_vector, const double *boundary_field_vector, double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num) + return; + + int start_index = offset + index; + + double bouSfx = boundary_face_vector[start_index * 3 + 0]; + double bouSfy = boundary_face_vector[start_index * 3 + 1]; + double bouSfz = boundary_face_vector[start_index * 3 + 2]; + + double boussfx = boundary_field_vector[start_index * 3 + 0]; + double boussfy = boundary_field_vector[start_index * 3 + 1]; + double boussfz = boundary_field_vector[start_index * 3 + 2]; + + int cellIndex = face2Cells[start_index]; + + double bouDiv = bouSfx * boussfx + bouSfy * boussfy + bouSfz * boussfz; + + atomicAdd(&(output[cellIndex]), bouDiv); - if (cellIndex == 473) - { - printf("output[473] = %.5e\n", output[473]); - printf("boundary_ssf[473] = %.5e\n", boundary_ssf[index]); - } 
} void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output) @@ -693,3 +714,37 @@ void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces divide_cell_volume_scalar<<>>(num_cells, volume, output); } +void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *Sf, const double *vf, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, + const double *volume) +{ + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + fvc_div_cell_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output); + + int offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 256; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just basic patch type now + if (patch_type[i] == boundaryConditions::zeroGradient + || patch_type[i] == boundaryConditions::fixedValue) { + // TODO: just vector version now + fvc_div_cell_vector_boundary<<>>(patch_size[i], offset, boundary_cell_face, + boundary_Sf, boundary_vf, output); + } else if (0) { + // xxx + fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); + } + offset += patch_size[i]; + } + + // divide cell volume + threads_per_block = 1024; + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + divide_cell_volume_scalar<<>>(num_cells, volume, output); +} + From 7e29a106203cd90cd8192b1fca04c26e14dad7e5 Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Sat, 12 Aug 2023 23:09:50 +0800 Subject: [PATCH 18/25] add the comparison with the original method --- GPUTest/GPUTestBase.H | 40 +- GPUTest/GPUTestRefBase.H | 63 +++ GPUTest/Make/options | 4 +- 
GPUTest/createGPUSolver.H | 45 ++ GPUTest/unittest.C | 19 +- src_gpu/dfMatrixOpBase.H | 6 + src_gpu/dfMatrixOpBase.cu | 196 +++++++- src_gpu_orig/CMakeLists.txt | 15 +- ...atrixDataBase.H => dfMatrixDataBaseOrig.H} | 44 +- ...rixDataBase.cu => dfMatrixDataBaseOrig.cu} | 2 +- src_gpu_orig/dfMatrixOpBaseOrig.H | 9 + src_gpu_orig/dfMatrixOpBaseOrig.cu | 460 ++++++++++++++++++ 12 files changed, 847 insertions(+), 56 deletions(-) create mode 100644 GPUTest/GPUTestRefBase.H rename src_gpu_orig/{dfMatrixDataBase.H => dfMatrixDataBaseOrig.H} (95%) rename src_gpu_orig/{dfMatrixDataBase.cu => dfMatrixDataBaseOrig.cu} (97%) create mode 100644 src_gpu_orig/dfMatrixOpBaseOrig.H create mode 100644 src_gpu_orig/dfMatrixOpBaseOrig.cu diff --git a/GPUTest/GPUTestBase.H b/GPUTest/GPUTestBase.H index b5f5b944b..2520485a7 100644 --- a/GPUTest/GPUTestBase.H +++ b/GPUTest/GPUTestBase.H @@ -491,8 +491,8 @@ void test_fvc_grad_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volV } // run CPU - volTensorField fvc_ouput_tensor = fvc::grad(U); - // volTensorField fvc_ouput_tensor = gaussGradSchemeGrad(U); + // volTensorField fvc_ouput_tensor = fvc::grad(U); + volTensorField fvc_ouput_tensor = gaussGradSchemeGrad(U); // prepare for run GPU // prepare U on GPU @@ -589,6 +589,42 @@ void test_fvc_div_vector(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volVe checkVectorEqual(dfDataBase.num_cells, &fvc_ouput_scalar[0], h_fvc_ouput_scalar.data(), 1e-14, printFlag); } +// unittest of fvc::grad(p) +void test_fvc_grad_scalar(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& p, initType type) { + if (type == initType::randomInit) { + p.oldTime(); + randomInitField(p); + } + + // run CPU + // volVectorField fvc_ouput_vector = fvc::grad(p); + volVectorField fvc_ouput_vector = gaussGradSchemeGrad(p); + + // prepare for run GPU + // prepare p on GPU + uploadRegisteredField(dfDataBase, p, "p"); + + double *d_fvc_ouput_vector = nullptr; + 
checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_vector, dfDataBase.cell_value_vec_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_vector, 0, dfDataBase.cell_value_vec_bytes)); + + // only need patch_type + testGPUDataBase testData; + buildTestGPUDataBase(dfDataBase, testData, p, false, false, false, false, false, false, false, false, false, false); + + fvc_grad_cell_scalar(dfDataBase.stream, dfDataBase.num_cells, dfDataBase.num_surfaces, + dfDataBase.d_owner, dfDataBase.d_neighbor, + dfDataBase.d_weight, dfDataBase.d_sf, dfDataBase.d_p, d_fvc_ouput_vector, + dfDataBase.num_patches, dfDataBase.patch_size.data(), testData.patch_type.data(), + dfDataBase.d_boundary_face_cell, dfDataBase.d_boundary_p, dfDataBase.d_boundary_sf, dfDataBase.d_volume); + + // compare result + bool printFlag = false; + std::vector h_fvc_ouput_vector(dfDataBase.num_cells * 3); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_vector.data(), d_fvc_ouput_vector, dfDataBase.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells * 3, &fvc_ouput_vector[0][0], h_fvc_ouput_vector.data(), 1e-14, printFlag); +} + // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // template <> diff --git a/GPUTest/GPUTestRefBase.H b/GPUTest/GPUTestRefBase.H new file mode 100644 index 000000000..754219e64 --- /dev/null +++ b/GPUTest/GPUTestRefBase.H @@ -0,0 +1,63 @@ + +// unittest of fvc::grad(U) +void test_fvc_grad_vector_orig(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volVectorField& U, initType type, + dfMatrixDataBaseOrig* dfDataBaseOrig) +{ + if (type == initType::randomInit) { + U.oldTime(); + randomInitField(U); + } + + // run CPU + // volTensorField fvc_ouput_tensor = fvc::grad(U); + volTensorField fvc_ouput_tensor = gaussGradSchemeGrad(U); + + // prepare for run GPU + // prepare U on GPU + uploadRegisteredField(dfDataBase, U, "u"); + + double *d_fvc_ouput_tensor = nullptr, *d_fvc_ouput_boundary_tensor = nullptr, *d_fvc_ouput_boundary_tensor_init = 
nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_tensor, dfDataBase.cell_value_tsr_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_boundary_tensor, dfDataBase.boundary_surface_value_tsr_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_boundary_tensor_init, dfDataBase.boundary_surface_value_tsr_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_tensor, 0, dfDataBase.cell_value_tsr_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_boundary_tensor, 0, dfDataBase.boundary_surface_value_tsr_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_boundary_tensor_init, 0, dfDataBase.boundary_surface_value_tsr_bytes)); + + fvc_grad_vector_orig(dfDataBase.stream, dfDataBaseOrig, dfDataBase, d_fvc_ouput_tensor, d_fvc_ouput_boundary_tensor_init, d_fvc_ouput_boundary_tensor); + + // compare result + bool printFlag = false; + std::vector h_fvc_ouput_tensor(dfDataBase.num_cells * 9); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_tensor.data(), d_fvc_ouput_tensor, dfDataBase.cell_value_tsr_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells * 9, &fvc_ouput_tensor[0][0], h_fvc_ouput_tensor.data(), 1e-14, printFlag); +} + +void test_fvc_grad_scalar_orig(dfMatrixDataBase& dfDataBase, Foam::fvMesh& mesh, volScalarField& p, initType type, + dfMatrixDataBaseOrig* dfDataBaseOrig) +{ + if (type == initType::randomInit) { + p.oldTime(); + randomInitField(p); + } + + // run CPU + // volVectorField fvc_ouput_vector = fvc::grad(p); + volVectorField fvc_ouput_vector = gaussGradSchemeGrad(p); + + // prepare for run GPU + // prepare p on GPU + uploadRegisteredField(dfDataBase, p, "p"); + + double *d_fvc_ouput_vector = nullptr; + checkCudaErrors(cudaMalloc((void**)&d_fvc_ouput_vector, dfDataBase.cell_value_vec_bytes)); + checkCudaErrors(cudaMemset(d_fvc_ouput_vector, 0, dfDataBase.cell_value_vec_bytes)); + + fvc_grad_scalar_orig(dfDataBase.stream, dfDataBaseOrig, dfDataBase, d_fvc_ouput_vector); + + // compare result + bool printFlag = false; + std::vector 
h_fvc_ouput_vector(dfDataBase.num_cells * 3); + checkCudaErrors(cudaMemcpy(h_fvc_ouput_vector.data(), d_fvc_ouput_vector, dfDataBase.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dfDataBase.num_cells * 3, &fvc_ouput_vector[0][0], h_fvc_ouput_vector.data(), 1e-14, printFlag); +} \ No newline at end of file diff --git a/GPUTest/Make/options b/GPUTest/Make/options index 197663050..e8e07b6a5 100644 --- a/GPUTest/Make/options +++ b/GPUTest/Make/options @@ -24,6 +24,7 @@ EXE_INC = -std=c++14 \ -I$(DF_SRC)/dfCombustionModels/lnInclude \ -I$(CANTERA_ROOT)/include \ -I$(DF_ROOT)/src_gpu \ + -I$(DF_ROOT)/src_gpu_orig \ -I$(DF_ROOT)/GPUTestRef/lnInclude \ -I/usr/local/cuda-11.6/include \ -I$(AMGX_DIR)/include @@ -44,5 +45,6 @@ EXE_LIBS = \ $(CANTERA_ROOT)/lib/libcantera.so \ /usr/local/cuda-11.6/lib64/libcudart.so \ $(AMGX_DIR)/build/libamgxsh.so \ - $(DF_ROOT)/src_gpu/build/libdfMatrix.so + $(DF_ROOT)/src_gpu/build/libdfMatrix.so \ + $(DF_ROOT)/src_gpu_orig/build/libdfMatrixOrig.so diff --git a/GPUTest/createGPUSolver.H b/GPUTest/createGPUSolver.H index 478b15ed1..516386473 100644 --- a/GPUTest/createGPUSolver.H +++ b/GPUTest/createGPUSolver.H @@ -67,3 +67,48 @@ void createGPUBase(fvMesh& mesh, PtrList& Y) { dfDataBase.initNonConstantFieldsInternal(dfDataBase.h_y); dfDataBase.initNonConstantFieldsBoundary(dfDataBase.h_boundary_y); }; + + +dfMatrixDataBaseOrig* createGPUBaseOrig(fvMesh& mesh, PtrList& Y, volVectorField& U) { + const labelUList& owner = mesh.owner(); + const labelUList& neighbour = mesh.neighbour(); + int num_cells = mesh.nCells(); + int num_surfaces = neighbour.size(); + + std::vector boundaryCellIndex; + std::vector boundary_face_vector_init; + std::vector boundary_face_init; + std::vector boundary_deltaCoeffs_init; + std::vector> patchTypes; + std::vector patchTypeU, patchTypeY; + int num_boundary_faces = 0; + int patchSize; + forAll(mesh.boundary(), patchi) + { + labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); + 
patchSize = sub_boundary.size(); + const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; + const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; + const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; + + boundaryCellIndex.insert(boundaryCellIndex.end(), &sub_boundary[0], &sub_boundary[0]+patchSize); + boundary_face_vector_init.insert(boundary_face_vector_init.end(), &pSf[0][0], &pSf[0][0]+3*patchSize); + boundary_face_init.insert(boundary_face_init.end(), &pMagSf[0], &pMagSf[0]+patchSize); + boundary_deltaCoeffs_init.insert(boundary_deltaCoeffs_init.end(), &pDeltaCoeffs[0], &pDeltaCoeffs[0]+patchSize); + num_boundary_faces += patchSize; + + constructBoundarySelector(patchTypeU, U.boundaryField()[patchi].type(), patchSize); + constructBoundarySelector(patchTypeY, Y[0].boundaryField()[patchi].type(), patchSize); + } + patchTypes.emplace_back(patchTypeU); + patchTypes.emplace_back(patchTypeY); + + int num_boundary_cells; + + dfMatrixDataBaseOrig* dfDataBase = new dfMatrixDataBaseOrig(num_surfaces, num_cells, num_boundary_faces, Y.size(), num_boundary_cells, + &neighbour[0], &owner[0], &mesh.V()[0], &mesh.surfaceInterpolation::weights()[0], + &mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.nonOrthDeltaCoeffs()[0], boundary_face_vector_init, + boundary_face_init, boundary_deltaCoeffs_init, boundaryCellIndex, patchTypes); + + return dfDataBase; +} \ No newline at end of file diff --git a/GPUTest/unittest.C b/GPUTest/unittest.C index 0577b3d2e..80eafef9d 100644 --- a/GPUTest/unittest.C +++ b/GPUTest/unittest.C @@ -55,9 +55,12 @@ Description #include #include "dfMatrixDataBase.H" +#include "dfMatrixDataBaseOrig.H" #include "dfMatrixOpBase.H" +#include "dfMatrixOpBaseOrig.H" #include "createGPUSolver.H" #include "GPUTestBase.H" +#include "GPUTestRefBase.H" int main(int argc, char *argv[]) { @@ -102,6 +105,8 @@ int main(int argc, char *argv[]) createGPUBase(mesh, Y); DEBUG_TRACE; + dfMatrixDataBaseOrig* dfDataBaseOrig = 
createGPUBaseOrig(mesh, Y, U); + DEBUG_TRACE; // unittest of fvm::ddt(rho, U) test_fvm_ddt_vector(dfDataBase, mesh, rho, U, initType::original); @@ -134,7 +139,9 @@ int main(int argc, char *argv[]) // unittest of fvc::grad(U) test_fvc_grad_vector(dfDataBase, mesh, U, initType::original); DEBUG_TRACE; - test_fvc_grad_vector(dfDataBase, mesh, U, initType::randomInit); + // test_fvc_grad_vector(dfDataBase, mesh, U, initType::randomInit); + // DEBUG_TRACE; + test_fvc_grad_vector_orig(dfDataBase, mesh, U, initType::original, dfDataBaseOrig); DEBUG_TRACE; // unittest of fvc::div(phi) @@ -146,8 +153,16 @@ int main(int argc, char *argv[]) // unittest of fvc::div(U) test_fvc_div_vector(dfDataBase, mesh, U, initType::original); DEBUG_TRACE; - test_fvc_div_vector(dfDataBase, mesh, U, initType::randomInit); + // test_fvc_div_vector(dfDataBase, mesh, U, initType::randomInit); + // DEBUG_TRACE; + + // unittest of fvc::grad(p) + test_fvc_grad_scalar(dfDataBase, mesh, p, initType::original); + DEBUG_TRACE; + test_fvc_grad_scalar(dfDataBase, mesh, p, initType::randomInit); DEBUG_TRACE; + test_fvc_grad_scalar_orig(dfDataBase, mesh, p, initType::original, dfDataBaseOrig); + DEBUG_TRACE } return 0; } diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index 35ac78c82..109f20c3f 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -63,3 +63,9 @@ void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume); +void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *Sf, const double *vf, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume); + diff --git 
a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index 39ad1f6a3..7a76db89c 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -290,6 +290,81 @@ __global__ void fvc_grad_vector_boundary(int num, int offset, const int *face2Ce atomicAdd(&(output[cellIndex * 9 + 8]), grad_zz); } +__global__ void fvc_grad_scalar_internal(int num_cells, int num_surfaces, + const int *lower_index, const int *upper_index, const double *face_vector, + const double *weight, const double *vf, double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_surfaces) + return; + + double w = weight[index]; + double Sfx = face_vector[index * 3 + 0]; + double Sfy = face_vector[index * 3 + 1]; + double Sfz = face_vector[index * 3 + 2]; + + int owner = lower_index[index]; + int neighbor = upper_index[index]; + + double ssf = (w * (vf[owner] - vf[neighbor]) + vf[neighbor]); + + double grad_x = Sfx * ssf; + double grad_y = Sfy * ssf; + double grad_z = Sfz * ssf; + + // // owner + // atomicAdd(&(output[num_cells * 0 + owner]), grad_x); + // atomicAdd(&(output[num_cells * 1 + owner]), grad_y); + // atomicAdd(&(output[num_cells * 2 + owner]), grad_z); + + // // neighbour + // atomicAdd(&(output[num_cells * 0 + neighbor]), -grad_x); + // atomicAdd(&(output[num_cells * 1 + neighbor]), -grad_y); + // atomicAdd(&(output[num_cells * 2 + neighbor]), -grad_z); + + // owner + atomicAdd(&(output[owner * 3 + 0]), grad_x); + atomicAdd(&(output[owner * 3 + 1]), grad_y); + atomicAdd(&(output[owner * 3 + 2]), grad_z); + + // neighbour + atomicAdd(&(output[neighbor * 3 + 0]), -grad_x); + atomicAdd(&(output[neighbor * 3 + 1]), -grad_y); + atomicAdd(&(output[neighbor * 3 + 2]), -grad_z); + +} + +__global__ void fvc_grad_scalar_boundary(int num_cells, int num, int offset, const int *face2Cells, + const double *boundary_face_vector, const double *boundary_vf, double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num) + return; + + 
int start_index = offset + index; + + double bouvf = boundary_vf[start_index]; + double bouSfx = boundary_face_vector[start_index * 3 + 0]; + double bouSfy = boundary_face_vector[start_index * 3 + 1]; + double bouSfz = boundary_face_vector[start_index * 3 + 2]; + + int cellIndex = face2Cells[start_index]; + + double grad_x = bouSfx * bouvf; + double grad_y = bouSfy * bouvf; + double grad_z = bouSfz * bouvf; + + atomicAdd(&(output[cellIndex * 3 + 0]), grad_x); + atomicAdd(&(output[cellIndex * 3 + 1]), grad_y); + atomicAdd(&(output[cellIndex * 3 + 2]), grad_z); + + // if (cellIndex == 5) + // { + // printf("Sfx = %.10e, ssf = %.10e\n", bouSfx, bouvf); + // printf("gradx = %.10e, output = %.10e\n\n", grad_x, output[5]); + // } +} + __global__ void divide_cell_volume_tsr(int num_cells, const double* volume, double *output) { int index = blockDim.x * blockIdx.x + threadIdx.x; @@ -308,6 +383,19 @@ __global__ void divide_cell_volume_tsr(int num_cells, const double* volume, doub output[index * 9 + 8] = output[index * 9 + 8] / vol; } +__global__ void divide_cell_volume_vec(int num_cells, const double* volume, double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + double vol = volume[index]; + + output[index * 3 + 0] = output[index * 3 + 0] / vol; + output[index * 3 + 1] = output[index * 3 + 1] / vol; + output[index * 3 + 2] = output[index * 3 + 2] / vol; +} + __global__ void divide_cell_volume_scalar(int num_cells, const double* volume, double *output) { int index = blockDim.x * blockIdx.x + threadIdx.x; @@ -646,11 +734,28 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, const double *volume, const double *boundary_mag_Sf, double *boundary_output, const double *boundary_deltaCoeffs) { + float time_elapsed = 0; + cudaEvent_t start, stop; + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + + // checkCudaErrors(cudaStreamSynchronize(stream)); + 
checkCudaErrors(cudaEventRecord(start, 0)); + size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; fvc_grad_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output); - + + // checkCudaErrors(cudaStreamSynchronize(stream)); + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("\nfvc_grad_vector_new internal 执行时间:%f(ms)\n", time_elapsed); + + // checkCudaErrors(cudaStreamSynchronize(stream)); + checkCudaErrors(cudaEventRecord(start, 0)); int offset = 0; // finish conctruct grad field except dividing cell volume for (int i = 0; i < num_patches; i++) { @@ -668,14 +773,33 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, } offset += patch_size[i]; } + // checkCudaErrors(cudaStreamSynchronize(stream)); + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("fvc_grad_vector_new boundary1 执行时间:%f(ms)\n", time_elapsed); // divide cell volume - threads_per_block = 1024; + // checkCudaErrors(cudaStreamSynchronize(stream)); + checkCudaErrors(cudaEventRecord(start, 0)); + + threads_per_block = 512; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; divide_cell_volume_tsr<<>>(num_cells, volume, output); - offset = 0; + // checkCudaErrors(cudaStreamSynchronize(stream)); + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("fvc_grad_vector_new divide_cell 执行时间:%f(ms)\n", time_elapsed); + // correct boundary conditions + // 
checkCudaErrors(cudaStreamSynchronize(stream)); + checkCudaErrors(cudaEventRecord(start, 0)); + + offset = 0; for (int i = 0; i < num_patches; i++) { threads_per_block = 256; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; @@ -693,6 +817,13 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, } offset += patch_size[i]; } + // checkCudaErrors(cudaStreamSynchronize(stream)); + + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("fvc_grad_vector_new boundary2 执行时间:%f(ms)\n", time_elapsed); } void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, @@ -748,3 +879,62 @@ void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, divide_cell_volume_scalar<<>>(num_cells, volume, output); } +void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *Sf, const double *vf, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume) +{ + float time_elapsed = 0; + cudaEvent_t start, stop; + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + checkCudaErrors(cudaEventRecord(start, 0)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + fvc_grad_scalar_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, + Sf, weight, vf, output); + + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, 
start, stop)); + printf("\nfvc_grad_scalar_new internal 执行时间:%f(ms)\n", time_elapsed); + + checkCudaErrors(cudaEventRecord(start, 0)); + + int offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 256; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just non-coupled patch type now + if (patch_type[i] == boundaryConditions::zeroGradient + || patch_type[i] == boundaryConditions::fixedValue) { + fvc_grad_scalar_boundary<<>>(num_cells, patch_size[i], offset, boundary_cell_face, + boundary_Sf, boundary_vf, output); + } else if (0) { + // xxx + fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); + } + offset += patch_size[i]; + } + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("fvc_grad_scalar_new boundary 执行时间:%f(ms)\n", time_elapsed); + + checkCudaErrors(cudaEventRecord(start, 0)); + + // divide cell volume + threads_per_block = 1024; + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + divide_cell_volume_vec<<>>(num_cells, volume, output); + + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("fvc_grad_scalar_new divide_cell_vector 执行时间:%f(ms)\n", time_elapsed); +} diff --git a/src_gpu_orig/CMakeLists.txt b/src_gpu_orig/CMakeLists.txt index 6e4a7efef..3a6d59825 100644 --- a/src_gpu_orig/CMakeLists.txt +++ b/src_gpu_orig/CMakeLists.txt @@ -3,7 +3,7 @@ # cmake_minimum_required(VERSION 3.5) -project(dfMatrix LANGUAGES CXX CUDA) +project(dfMatrixOrig LANGUAGES CXX CUDA) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -12,27 +12,26 @@ find_package(MPI REQUIRED) find_package(CUDAToolkit REQUIRED) 
find_library(LIBAMGXSH amgxsh PATHS $ENV{AMGX_DIR}/build) +add_compile_options(-arch=sm_70 -fmad=false) + include_directories( ${MPI_INCLUDE_PATH} ${CUDA_INCLUDE_DIRS} $ENV{AMGX_DIR}/include + $ENV{DF_ROOT}/src_gpu ) add_library(${PROJECT_NAME} SHARED - dfUEqn.cu - dfRhoEqn.cu - dfYEqn.cu - dfEEqn.cu - AmgXSolver.cu - dfMatrixDataBase.cu) + dfMatrixDataBaseOrig.cu + dfMatrixOpBaseOrig.cu) target_link_libraries(${PROJECT_NAME} ${MPI_LIBRARIES} ${CUDA_LIBRARIES} ${LIBAMGXSH} ) -target_compile_options(dfMatrix PUBLIC -g) +target_compile_options(dfMatrixOrig PUBLIC -g) option(DFMATRIX_ENABLE_DETAILED_DEBUG "Enable detailed debug build" OFF) if (DFMATRIX_ENABLE_DETAILED_DEBUG) target_compile_definitions(${PROJECT_NAME} PRIVATE DEBUG) diff --git a/src_gpu_orig/dfMatrixDataBase.H b/src_gpu_orig/dfMatrixDataBaseOrig.H similarity index 95% rename from src_gpu_orig/dfMatrixDataBase.H rename to src_gpu_orig/dfMatrixDataBaseOrig.H index 8efb4bf62..e4a06d861 100644 --- a/src_gpu_orig/dfMatrixDataBase.H +++ b/src_gpu_orig/dfMatrixDataBaseOrig.H @@ -12,45 +12,12 @@ #include #include #include +#include "dfMatrixDataBase.H" -static const char *_cudaGetErrorEnum(cudaError_t error) { - return cudaGetErrorName(error); -} - -template -void check(T result, char const *const func, const char *const file, - int const line) { - if (result) { - fprintf(stderr, "cuda error at %s:%d code=%d(%s) \"%s\" \n", file, line, - static_cast(result), _cudaGetErrorEnum(result), func); - exit(EXIT_FAILURE); - } -} - -#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) - -inline void checkVectorEqual(int count, const double* basevec, double* vec, double max_relative_error) { - for (size_t i = 0; i < count; ++i) - { - double abs_diff = fabs(basevec[i] - vec[i]); - double rel_diff = fabs(basevec[i] - vec[i]) / fabs(basevec[i]); - // if (abs_diff > 1e-12 && rel_diff > max_relative_error && !std::isinf(rel_diff)) - if (abs_diff > 1e-15 && rel_diff > max_relative_error) - fprintf(stderr, 
"mismatch index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff); - } -} - -enum boundaryConditions{ - zeroGradient, - fixedValue, - coupled, - empty -}; - void constructBoundarySelector(std::vector& patchTypeSelector, const std::string& patchTypeStr, const int patchSize); -struct dfMatrixDataBase +struct dfMatrixDataBaseOrig { // - cuda resource cudaStream_t stream; @@ -219,8 +186,8 @@ struct dfMatrixDataBase double* d_nuEff = nullptr; // constructor - dfMatrixDataBase(); - dfMatrixDataBase(int num_surfaces, int num_cells, int num_boundary_faces, int num_species, int & num_boundary_cells_output, + dfMatrixDataBaseOrig(); + dfMatrixDataBaseOrig(int num_surfaces, int num_cells, int num_boundary_faces, int num_species, int & num_boundary_cells_output, const int *neighbour, const int *owner, const double* volume, const double* weight, const double* face_vector, const double* face, const double* deltaCoeffs, std::vector boundary_face_vector_init, std::vector boundary_face_init, std::vector boundary_deltaCoeffs_init, std::vector boundary_cell_id_init, std::vector> patch_type_init) @@ -632,10 +599,9 @@ struct dfMatrixDataBase checkCudaErrors(cudaMemcpyAsync(d_tmpPermutatedList, tmpPermutatedList.data(), csr_col_index_bytes, cudaMemcpyHostToDevice, stream)); }; - ~dfMatrixDataBase(){ + ~dfMatrixDataBaseOrig(){ std::cout << "Destructor called." 
<< std::endl; // TODO: free pointers - }; }; diff --git a/src_gpu_orig/dfMatrixDataBase.cu b/src_gpu_orig/dfMatrixDataBaseOrig.cu similarity index 97% rename from src_gpu_orig/dfMatrixDataBase.cu rename to src_gpu_orig/dfMatrixDataBaseOrig.cu index d4f5a7ab0..7eb0ba593 100644 --- a/src_gpu_orig/dfMatrixDataBase.cu +++ b/src_gpu_orig/dfMatrixDataBaseOrig.cu @@ -1,4 +1,4 @@ -#include "dfMatrixDataBase.H" +#include "dfMatrixDataBaseOrig.H" void constructBoundarySelector(std::vector& patchTypeSelector, const std::string& patchTypeStr, diff --git a/src_gpu_orig/dfMatrixOpBaseOrig.H b/src_gpu_orig/dfMatrixOpBaseOrig.H new file mode 100644 index 000000000..0f61b558b --- /dev/null +++ b/src_gpu_orig/dfMatrixOpBaseOrig.H @@ -0,0 +1,9 @@ +#pragma once + +#include "dfMatrixDataBaseOrig.H" +#include "dfMatrixDataBase.H" + +void fvc_grad_vector_orig(cudaStream_t stream, dfMatrixDataBaseOrig* dataBaseOrig, dfMatrixDataBase& dataBase, double *d_grad, + double *d_grad_boundary_init, double *d_grad_boundary); + +void fvc_grad_scalar_orig(cudaStream_t stream, dfMatrixDataBaseOrig* dataBaseOrig, dfMatrixDataBase& dataBase, double *d_grad); \ No newline at end of file diff --git a/src_gpu_orig/dfMatrixOpBaseOrig.cu b/src_gpu_orig/dfMatrixOpBaseOrig.cu new file mode 100644 index 000000000..95737ab12 --- /dev/null +++ b/src_gpu_orig/dfMatrixOpBaseOrig.cu @@ -0,0 +1,460 @@ +#include "dfMatrixOpBaseOrig.H" + + +__global__ void fvc_grad_vector_internal(int num_cells, + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *sf, const double *vf, const double *tlambdas, const double *volume, + double *grad) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int row_elements = csr_row_index[index + 1] - row_index; + int diag_index = csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double 
own_vf_x = vf[index * 3 + 0]; + double own_vf_y = vf[index * 3 + 1]; + double own_vf_z = vf[index * 3 + 2]; + double grad_xx = 0; + double grad_xy = 0; + double grad_xz = 0; + double grad_yx = 0; + double grad_yy = 0; + double grad_yz = 0; + double grad_zx = 0; + double grad_zy = 0; + double grad_zz = 0; + // lower + for (int i = 0; i < diag_index; i++) + { + int neighbor_index = neighbor_offset + i; + int neighbor_cell_id = csr_col_index[row_index + i]; + double w = tlambdas[neighbor_index]; + double sf_x = sf[neighbor_index * 3 + 0]; + double sf_y = sf[neighbor_index * 3 + 1]; + double sf_z = sf[neighbor_index * 3 + 2]; + double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0]; + double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1]; + double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2]; + double face_x = (1 - w) * own_vf_x + w * neighbor_vf_x; + double face_y = (1 - w) * own_vf_y + w * neighbor_vf_y; + double face_z = (1 - w) * own_vf_z + w * neighbor_vf_z; + grad_xx -= sf_x * face_x; + grad_xy -= sf_x * face_y; + grad_xz -= sf_x * face_z; + grad_yx -= sf_y * face_x; + grad_yy -= sf_y * face_y; + grad_yz -= sf_y * face_z; + grad_zx -= sf_z * face_x; + grad_zy -= sf_z * face_y; + grad_zz -= sf_z * face_z; + } + // upper + for (int i = diag_index + 1; i < row_elements; i++) + { + int neighbor_index = neighbor_offset + i - 1; + int neighbor_cell_id = csr_col_index[row_index + i]; + double w = tlambdas[neighbor_index]; + double sf_x = sf[neighbor_index * 3 + 0]; + double sf_y = sf[neighbor_index * 3 + 1]; + double sf_z = sf[neighbor_index * 3 + 2]; + double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0]; + double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1]; + double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2]; + double face_x = w * own_vf_x + (1 - w) * neighbor_vf_x; + double face_y = w * own_vf_y + (1 - w) * neighbor_vf_y; + double face_z = w * own_vf_z + (1 - w) * neighbor_vf_z; + grad_xx += sf_x * face_x; + grad_xy += sf_x * face_y; + grad_xz += sf_x * face_z; + 
grad_yx += sf_y * face_x; + grad_yy += sf_y * face_y; + grad_yz += sf_y * face_z; + grad_zx += sf_z * face_x; + grad_zy += sf_z * face_y; + grad_zz += sf_z * face_z; + // if (index == 0) + // { + // printf("grad_xx = %.20lf\n", grad_xx); + // // printf("sf_x = %.20lf\n", sf_x); + // // printf("face_x = %.20lf\n", face_x); + // } + } + double vol = volume[index]; + grad[index * 9 + 0] = grad_xx / vol; + grad[index * 9 + 1] = grad_xy / vol; + grad[index * 9 + 2] = grad_xz / vol; + grad[index * 9 + 3] = grad_yx / vol; + grad[index * 9 + 4] = grad_yy / vol; + grad[index * 9 + 5] = grad_yz / vol; + grad[index * 9 + 6] = grad_zx / vol; + grad[index * 9 + 7] = grad_zy / vol; + grad[index * 9 + 8] = grad_zz / vol; + + + // if (index == 2257) + // { + // printf("grad[2257] = (%.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e)\n", grad[index * 9 + 0], grad[index * 9 + 1], grad[index * 9 + 2], + // grad[index * 9 + 3], grad[index * 9 + 4], grad[index * 9 + 5], grad[index * 9 + 6], grad[index * 9 + 7], grad[index * 9 + 8]); + // } +} + +__global__ void fvc_grad_vector_boundary(int num_cells, int num_boundary_cells, const int *bouPermedIndex, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_sf, const double *boundary_vf, const double *volume, + double *grad, double *grad_boundary_init) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + double grad_xx = 0; + double grad_xy = 0; + double grad_xz = 0; + double grad_yx = 0; + double grad_yy = 0; + double grad_yz = 0; + double grad_zx = 0; + double grad_zy = 0; + double grad_zz = 0; + for (int i = cell_offset; i < next_cell_offset; i++) + { + int p = bouPermedIndex[i]; + double sf_x = boundary_sf[i * 3 + 0]; + double sf_y = boundary_sf[i * 3 + 1]; + double sf_z = boundary_sf[i * 3 
+ 2]; + double vf_x = boundary_vf[p * 3 + 0]; + double vf_y = boundary_vf[p * 3 + 1]; + double vf_z = boundary_vf[p * 3 + 2]; + grad_xx += sf_x * vf_x; + grad_xy += sf_x * vf_y; + grad_xz += sf_x * vf_z; + grad_yx += sf_y * vf_x; + grad_yy += sf_y * vf_y; + grad_yz += sf_y * vf_z; + grad_zx += sf_z * vf_x; + grad_zy += sf_z * vf_y; + grad_zz += sf_z * vf_z; + } + + double vol = volume[cell_index]; + + grad[cell_index * 9 + 0] += grad_xx / vol; + grad[cell_index * 9 + 1] += grad_xy / vol; + grad[cell_index * 9 + 2] += grad_xz / vol; + grad[cell_index * 9 + 3] += grad_yx / vol; + grad[cell_index * 9 + 4] += grad_yy / vol; + grad[cell_index * 9 + 5] += grad_yz / vol; + grad[cell_index * 9 + 6] += grad_zx / vol; + grad[cell_index * 9 + 7] += grad_zy / vol; + grad[cell_index * 9 + 8] += grad_zz / vol; + + grad_boundary_init[index * 9 + 0] = grad[cell_index * 9 + 0]; + grad_boundary_init[index * 9 + 1] = grad[cell_index * 9 + 1]; + grad_boundary_init[index * 9 + 2] = grad[cell_index * 9 + 2]; + grad_boundary_init[index * 9 + 3] = grad[cell_index * 9 + 3]; + grad_boundary_init[index * 9 + 4] = grad[cell_index * 9 + 4]; + grad_boundary_init[index * 9 + 5] = grad[cell_index * 9 + 5]; + grad_boundary_init[index * 9 + 6] = grad[cell_index * 9 + 6]; + grad_boundary_init[index * 9 + 7] = grad[cell_index * 9 + 7]; + grad_boundary_init[index * 9 + 8] = grad[cell_index * 9 + 8]; + + // if (index == 0) + // { + // printf("grad[0] = (%.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e)\n", grad[index * 9 + 0], grad[index * 9 + 1], grad[index * 9 + 2], + // grad[index * 9 + 3], grad[index * 9 + 4], grad[index * 9 + 5], grad[index * 9 + 6], grad[index * 9 + 7], grad[index * 9 + 8]); + // } +} + +__global__ void correct_boundary_conditions(int num_boundary_cells, const int *bouPermedIndex, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_sf, const double *mag_sf, + double *boundary_grad_init, double *boundary_grad, const double 
*boundary_deltaCoeffs, + const double *internal_velocity, const double *boundary_velocity, const int *U_patch_type) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + // initialize boundary_grad + double grad_xx = boundary_grad_init[index * 9 + 0]; + double grad_xy = boundary_grad_init[index * 9 + 1]; + double grad_xz = boundary_grad_init[index * 9 + 2]; + double grad_yx = boundary_grad_init[index * 9 + 3]; + double grad_yy = boundary_grad_init[index * 9 + 4]; + double grad_yz = boundary_grad_init[index * 9 + 5]; + double grad_zx = boundary_grad_init[index * 9 + 6]; + double grad_zy = boundary_grad_init[index * 9 + 7]; + double grad_zz = boundary_grad_init[index * 9 + 8]; + + double internal_U_x = internal_velocity[cell_index * 3 + 0]; + double internal_U_y = internal_velocity[cell_index * 3 + 1]; + double internal_U_z = internal_velocity[cell_index * 3 + 2]; + + for (int i = cell_offset; i < next_cell_offset; i++) + { + // OpenFoam code + // const vectorField n + // ( + // vsf.mesh().Sf().boundaryField()[patchi] + // / vsf.mesh().magSf().boundaryField()[patchi] + // ); + // gGradbf[patchi] += n * + // ( + // vsf.boundaryField()[patchi].snGrad() + // - (n & gGradbf[patchi]) + // ); + // template // fixedValue + // Foam::tmp> Foam::fvPatchField::snGrad() const + // { + // return patch_.deltaCoeffs()*(*this - patchInternalField()); + // } + + double n_x = boundary_sf[i * 3 + 0] / mag_sf[i]; + double n_y = boundary_sf[i * 3 + 1] / mag_sf[i]; + double n_z = boundary_sf[i * 3 + 2] / mag_sf[i]; + + int p = bouPermedIndex[i]; + + double sn_grad_x, sn_grad_y, sn_grad_z; + int patchIndex = U_patch_type[i]; + if (patchIndex == 0) { // zeroGradient + sn_grad_x = 0; + sn_grad_y = 0; + sn_grad_z = 0; + } else if (patchIndex == 1) { // fixedValue + sn_grad_x = 
boundary_deltaCoeffs[i] * (boundary_velocity[p * 3 + 0] - internal_velocity[cell_index * 3 + 0]); + sn_grad_y = boundary_deltaCoeffs[i] * (boundary_velocity[p * 3 + 1] - internal_velocity[cell_index * 3 + 1]); + sn_grad_z = boundary_deltaCoeffs[i] * (boundary_velocity[p * 3 + 2] - internal_velocity[cell_index * 3 + 2]); + // if (index == 1) + // { + // printf("cell_index = %d\n", cell_index); + // printf("boundary_velocity = %e\n", boundary_velocity[i * 3 + 1]); + // printf("internal_velocity = %e\n", internal_velocity[cell_index * 3 + 0]); + // } + + } + // TODO: implement other BCs + double grad_correction_x = sn_grad_x - (n_x * grad_xx + n_y * grad_yx + n_z * grad_zx); + double grad_correction_y = sn_grad_y - (n_x * grad_xy + n_y * grad_yy + n_z * grad_zy); + double grad_correction_z = sn_grad_z - (n_x * grad_xz + n_y * grad_yz + n_z * grad_zz); + boundary_grad[i * 9 + 0] = grad_xx + n_x * grad_correction_x; + boundary_grad[i * 9 + 1] = grad_xy + n_x * grad_correction_y; + boundary_grad[i * 9 + 2] = grad_xz + n_x * grad_correction_z; + boundary_grad[i * 9 + 3] = grad_yx + n_y * grad_correction_x; + boundary_grad[i * 9 + 4] = grad_yy + n_y * grad_correction_y; + boundary_grad[i * 9 + 5] = grad_yz + n_y * grad_correction_z; + boundary_grad[i * 9 + 6] = grad_zx + n_z * grad_correction_x; + boundary_grad[i * 9 + 7] = grad_zy + n_z * grad_correction_y; + boundary_grad[i * 9 + 8] = grad_zz + n_z * grad_correction_z; + + } +} + +__global__ void fvc_grad_scalar_internal(int num_cells, + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *face_vector, const double *weight, const double *pressure, const double *volume, + const double *b_input, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + // A_csr has one more element in each row: itself + int row_index = csr_row_index[index]; + int next_row_index = csr_row_index[index + 1]; + int diag_index = 
csr_diag_index[index]; + int neighbor_offset = csr_row_index[index] - index; + + double own_cell_p = pressure[index]; + double grad_bx = 0; + double grad_by = 0; + double grad_bz = 0; + double grad_bx_low = 0; + double grad_bx_upp = 0; + double grad_by_low = 0; + double grad_by_upp = 0; + double grad_bz_low = 0; + double grad_bz_upp = 0; + for (int i = row_index; i < next_row_index; i++) + { + int inner_index = i - row_index; + // lower + if (inner_index < diag_index) + { + int neighbor_index = neighbor_offset + inner_index; + double w = weight[neighbor_index]; + double sfx = face_vector[neighbor_index * 3 + 0]; + double sfy = face_vector[neighbor_index * 3 + 1]; + double sfz = face_vector[neighbor_index * 3 + 2]; + int neighbor_cell_id = csr_col_index[row_index + inner_index]; + double neighbor_cell_p = pressure[neighbor_cell_id]; + double face_p = (1 - w) * own_cell_p + w * neighbor_cell_p; + grad_bx_low -= face_p * sfx; + grad_by_low -= face_p * sfy; + grad_bz_low -= face_p * sfz; + } + // upper + if (inner_index > diag_index) + { + int neighbor_index = neighbor_offset + inner_index - 1; + double w = weight[neighbor_index]; + double sfx = face_vector[neighbor_index * 3 + 0]; + double sfy = face_vector[neighbor_index * 3 + 1]; + double sfz = face_vector[neighbor_index * 3 + 2]; + int neighbor_cell_id = csr_col_index[row_index + inner_index]; + double neighbor_cell_p = pressure[neighbor_cell_id]; + double face_p = w * own_cell_p + (1 - w) * neighbor_cell_p; + grad_bx_upp += face_p * sfx; + grad_by_upp += face_p * sfy; + grad_bz_upp += face_p * sfz; + } + } + double vol = volume[index]; + b_output[index * 3 + 0] = b_input[index * 3 + 0] + (grad_bx_low + grad_bx_upp) / vol; + b_output[index * 3 + 1] = b_input[index * 3 + 1] + (grad_by_low + grad_by_upp) / vol; + b_output[index * 3 + 2] = b_input[index * 3 + 2] + (grad_bz_low + grad_bz_upp) / vol; + // b_output[index * 3 + 0] = b_input[index * 3 + 0] + grad_bx_low + grad_bx_upp; + // b_output[index * 3 + 1] = 
b_input[index * 3 + 1] + grad_by_low + grad_by_upp; + // b_output[index * 3 + 2] = b_input[index * 3 + 2] + grad_bz_low + grad_bz_upp; + +} + +__global__ void fvc_grad_scalar_boundary(int num_cells, int num_boundary_cells, const int *bouPermedIndex, + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_face_vector, const double *boundary_pressure, const double *volume, + const double *b_input, double *b_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_cells) + return; + + int cell_offset = boundary_cell_offset[index]; + int next_cell_offset = boundary_cell_offset[index + 1]; + int cell_index = boundary_cell_id[cell_offset]; + + // compute boundary gradient + double grad_bx = 0; + double grad_by = 0; + double grad_bz = 0; + for (int i = cell_offset; i < next_cell_offset; i++) + { + int p = bouPermedIndex[i]; + double sfx = boundary_face_vector[i * 3 + 0]; + double sfy = boundary_face_vector[i * 3 + 1]; + double sfz = boundary_face_vector[i * 3 + 2]; + double face_p = boundary_pressure[p]; + grad_bx += face_p * sfx; + grad_by += face_p * sfy; + grad_bz += face_p * sfz; + } + + //// correct the boundary gradient + // double nx = boundary_face_vector[face_index * 3 + 0] / magSf[face_index]; + // double ny = boundary_face_vector[face_index * 3 + 1] / magSf[face_index]; + // double nz = boundary_face_vector[face_index * 3 + 2] / magSf[face_index]; + // double sn_grad = 0; + // double grad_correction = sn_grad * volume[cell_index] - (nx * grad_bx + ny * grad_by + nz * grad_bz); + // grad_bx += nx * grad_correction; + // grad_by += ny * grad_correction; + // grad_bz += nz * grad_correction; + + double vol = volume[cell_index]; + b_output[cell_index * 3 + 0] = b_input[cell_index * 3 + 0] + grad_bx / vol; + b_output[cell_index * 3 + 1] = b_input[cell_index * 3 + 1] + grad_by / vol; + b_output[cell_index * 3 + 2] = b_input[cell_index * 3 + 2] + grad_bz / vol; +} + + +void 
fvc_grad_vector_orig(cudaStream_t stream, dfMatrixDataBaseOrig* dataBaseOrig, dfMatrixDataBase& dataBase, double *d_grad, + double *d_grad_boundary_init, double *d_grad_boundary) +{ + float time_elapsed = 0; + cudaEvent_t start, stop; + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + checkCudaErrors(cudaEventRecord(start, 0)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (dataBase.num_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_vector_internal<<>>(dataBase.num_cells, + dataBaseOrig->d_A_csr_row_index, dataBaseOrig->d_A_csr_col_index, dataBaseOrig->d_A_csr_diag_index, + dataBaseOrig->d_face_vector, dataBase.d_u, dataBaseOrig->d_weight, dataBaseOrig->d_volume, d_grad); + + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("\nfvc_grad_vector_orig internal 执行时间:%f(ms)\n", time_elapsed); + + + checkCudaErrors(cudaEventRecord(start, 0)); + blocks_per_grid = (dataBaseOrig->num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_vector_boundary<<>>(dataBase.num_cells, dataBaseOrig->num_boundary_cells, dataBaseOrig->d_bouPermedIndex, + dataBaseOrig->d_boundary_cell_offset, dataBaseOrig->d_boundary_cell_id, dataBaseOrig->d_boundary_face_vector, + dataBase.d_boundary_u, dataBase.d_volume, d_grad, d_grad_boundary_init); + + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("fvc_grad_vector_orig boundary1 执行时间:%f(ms)\n", time_elapsed); + + + checkCudaErrors(cudaEventRecord(start, 0)); + correct_boundary_conditions<<>>(dataBaseOrig->num_boundary_cells, dataBaseOrig->d_bouPermedIndex, + dataBaseOrig->d_boundary_cell_offset, dataBaseOrig->d_boundary_cell_id, 
dataBaseOrig->d_boundary_face_vector, + dataBaseOrig->d_boundary_face, d_grad_boundary_init, d_grad_boundary, dataBaseOrig->d_boundary_deltaCoeffs, + dataBase.d_u, dataBase.d_boundary_u, dataBaseOrig->d_boundary_UpatchType); + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("fvc_grad_vector_orig boundary2 执行时间:%f(ms)\n", time_elapsed); +} + +void fvc_grad_scalar_orig(cudaStream_t stream, dfMatrixDataBaseOrig* dataBaseOrig, dfMatrixDataBase& dataBase, double *d_grad) +{ + float time_elapsed = 0; + cudaEvent_t start, stop; + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + checkCudaErrors(cudaEventRecord(start, 0)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (dataBase.num_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_scalar_internal<<>>(dataBase.num_cells, + dataBaseOrig->d_A_csr_row_index, dataBaseOrig->d_A_csr_col_index, dataBaseOrig->d_A_csr_diag_index, + dataBaseOrig->d_face_vector, dataBaseOrig->d_weight, dataBase.d_p, dataBaseOrig->d_volume, d_grad, d_grad); + + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("\nfvc_grad_scalar_orig internal 执行时间:%f(ms)\n", time_elapsed); + + checkCudaErrors(cudaEventRecord(start, 0)); + + blocks_per_grid = (dataBaseOrig->num_boundary_cells + threads_per_block - 1) / threads_per_block; + fvc_grad_scalar_boundary<<>>(dataBase.num_cells, dataBaseOrig->num_boundary_cells, dataBaseOrig->d_bouPermedIndex, + dataBaseOrig->d_boundary_cell_offset, dataBaseOrig->d_boundary_cell_id, + dataBaseOrig->d_boundary_face_vector, dataBase.d_boundary_p, dataBaseOrig->d_volume, d_grad, d_grad); + + checkCudaErrors(cudaEventRecord(stop, 0)); + 
checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); + printf("fvc_grad_scalar_orig boundary 执行时间:%f(ms)\n", time_elapsed); +} \ No newline at end of file From db5a689cce42bbc436817f0f64cad671385c5f10 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Tue, 15 Aug 2023 17:59:50 +0800 Subject: [PATCH 19/25] run pass basic ueqn_gpu --- .../solvers/dfLowMachFoam/createGPUSolver.H | 5 +- applications/solvers/dfLowMachFoam/new_UEqn.H | 54 +++- .../solvers/dfLowMachFoam/new_dfLowMachFoam.C | 2 + src_gpu/dfMatrixDataBase.H | 6 +- src_gpu/dfMatrixDataBase.cu | 6 + src_gpu/dfMatrixOpBase.H | 16 ++ src_gpu/dfMatrixOpBase.cu | 266 +++++++++++++----- src_gpu/dfUEqn.H | 5 +- src_gpu/dfUEqn.cu | 98 ++++--- 9 files changed, 344 insertions(+), 114 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/createGPUSolver.H b/applications/solvers/dfLowMachFoam/createGPUSolver.H index d9ce745d7..94fff1125 100644 --- a/applications/solvers/dfLowMachFoam/createGPUSolver.H +++ b/applications/solvers/dfLowMachFoam/createGPUSolver.H @@ -31,24 +31,27 @@ void createGPUBase(fvMesh& mesh, PtrList& Y) { double *boundary_sf = new double[3 * num_boundary_surfaces]; double *boundary_mag_sf = new double[num_boundary_surfaces]; double *boundary_delta_coeffs = new double[num_boundary_surfaces]; + int *boundary_face_cell = new int[num_boundary_surfaces]; int offset = 0; forAll(mesh.boundary(), patchi) { const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; + const labelUList& pFaceCells = mesh.boundary()[patchi].faceCells(); int patchsize = pMagSf.size(); memcpy(boundary_sf + 3*offset, &pSf[0][0], 3*patchsize*sizeof(double)); memcpy(boundary_mag_sf + offset, &pMagSf[0], patchsize*sizeof(double)); memcpy(boundary_delta_coeffs + offset, 
&pDeltaCoeffs[0], patchsize*sizeof(double)); + memcpy(boundary_face_cell + offset, &pFaceCells[0], patchsize * sizeof(int)); offset += patchsize; } dfDataBase.createConstantFieldsInternal(); dfDataBase.createConstantFieldsBoundary(); dfDataBase.initConstantFieldsInternal(&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.surfaceInterpolation::weights()[0], &mesh.nonOrthDeltaCoeffs()[0], &mesh.V()[0]); - dfDataBase.initConstantFieldsBoundary(boundary_sf, boundary_mag_sf, boundary_delta_coeffs); + dfDataBase.initConstantFieldsBoundary(boundary_sf, boundary_mag_sf, boundary_delta_coeffs, boundary_face_cell); // prepare internal and boundary of Y dfDataBase.createNonConstantFieldsInternal(); diff --git a/applications/solvers/dfLowMachFoam/new_UEqn.H b/applications/solvers/dfLowMachFoam/new_UEqn.H index c38735375..3d84f3631 100644 --- a/applications/solvers/dfLowMachFoam/new_UEqn.H +++ b/applications/solvers/dfLowMachFoam/new_UEqn.H @@ -1,18 +1,22 @@ #ifdef GPUSolver_ -// run CPU +const tmp nuEff_tmp(turbulence->nuEff()); +const volScalarField& nuEff = nuEff_tmp(); + +// run CPU, for temp tmp tUEqn ( - fvm::div(phi, U) + fvm::ddt(rho, U) + fvm::div(phi, U) == -fvc::grad(p) + //turbulence->divDevRhoReff(U) ); fvVectorMatrix& UEqn = tUEqn.ref(); // run GPU // preProcess -// skip preparing u, p, nu_eff, and rho.boundary used by UEqn_GPU.preProcess() -// TODO: temp, now we only transfer phi(instead of rhoEqn) used by fvm::div(phi, U) -double *h_u = dfDataBase.getFieldPointer("u", location::cpu, position::internal); +// TODO: preProcessForRhoEqn for temp, now we only transfer phi(instead of rhoEqn) used by fvm::div(phi, U) +double *h_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::internal); double *h_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::internal); double *h_boundary_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::boundary); +memcpy(h_rho, &rho[0], dfDataBase.cell_value_bytes); memcpy(h_phi, &phi[0], 
dfDataBase.surface_value_bytes); int offset = 0; forAll(phi.boundaryField(), patchi) @@ -22,13 +26,46 @@ forAll(phi.boundaryField(), patchi) memcpy(h_boundary_phi + offset, &patchPhi[0], patchsize * sizeof(double)); offset += patchsize; } -UEqn_GPU.preProcessForRhoEqn(h_phi, h_boundary_phi); +UEqn_GPU.preProcessForRhoEqn(h_rho, h_phi, h_boundary_phi); +DEBUG_TRACE; +// preparing u, p, nu_eff, and rho.boundary used by UEqn_GPU.preProcess() +double *h_u = dfDataBase.getFieldPointer("u", location::cpu, position::internal); +double *h_boundary_u = dfDataBase.getFieldPointer("u", location::cpu, position::boundary); +double *h_p = dfDataBase.getFieldPointer("p", location::cpu, position::internal); +double *h_boundary_p = dfDataBase.getFieldPointer("p", location::cpu, position::boundary); +double *h_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::internal); +double *h_boundary_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::boundary); +double *h_boundary_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::boundary); +memcpy(h_u, &U.oldTime()[0][0], dfDataBase.cell_value_vec_bytes); +memcpy(h_p, &p[0], dfDataBase.cell_value_bytes); +memcpy(h_nu_eff, &nuEff[0], dfDataBase.cell_value_bytes); +offset = 0; +forAll(U.boundaryField(), patchi) +{ + const fvPatchVectorField& patchU = U.boundaryField()[patchi]; + const fvPatchScalarField& patchP = p.boundaryField()[patchi]; + const fvPatchScalarField& patchNuEff = nuEff.boundaryField()[patchi]; + const fvPatchScalarField& patchRho = rho.boundaryField()[patchi]; + int patchsize = patchU.size(); + memcpy(h_boundary_u + offset * 3, &patchU[0][0], patchsize * 3 * sizeof(double)); + memcpy(h_boundary_p + offset, &patchP[0], patchsize * sizeof(double)); + memcpy(h_boundary_nu_eff + offset, &patchNuEff[0], patchsize * sizeof(double)); + memcpy(h_boundary_rho + offset, &patchRho[0], patchsize * sizeof(double)); + offset += patchsize; +} +UEqn_GPU.preProcess(h_u, h_boundary_u, h_p, 
h_boundary_p, h_nu_eff, h_boundary_nu_eff, h_boundary_rho); +DEBUG_TRACE; + // process UEqn_GPU.process(); +DEBUG_TRACE; + // postProcess UEqn_GPU.postProcess(h_u); +DEBUG_TRACE; + // checkResult -// TODO: temp, now we compare ldu, finally we compare csr +// TODO: for temp, now we compare ldu, finally we compare csr std::vector h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); std::vector h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); offset = 0; @@ -42,6 +79,7 @@ for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) offset += patchsize; } bool printFlag = false; -UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], +UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0], h_internal_coeffs.data(), h_boundary_coeffs.data(), printFlag); +DEBUG_TRACE; #endif diff --git a/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C index 530a9f7ec..0deffb40f 100644 --- a/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C +++ b/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C @@ -99,7 +99,9 @@ int main(int argc, char *argv[]) createGPUUEqn(CanteraTorchProperties, U); // foreach(timestep) { + dfDataBase.preTimeStep(&rho.oldTime()[0]); #include "new_UEqn.H" + dfDataBase.postTimeStep(); // } } return 0; diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 7eb8b9ec2..69d20d7af 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -41,7 +41,7 @@ inline void checkVectorEqual(int count, const double* basevec, double* vec, doub fprintf(stderr, "index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff); // if (abs_diff > 1e-12 && rel_diff > max_relative_error && !std::isinf(rel_diff)) if (abs_diff > 1e-15 && rel_diff > max_relative_error) - fprintf(stderr, "mismatch index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", 
i, basevec[i], vec[i], rel_diff); + fprintf(stderr, "mismatch index %d, cpu data: %.30lf, gpu data: %.30lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff); } } @@ -63,7 +63,6 @@ enum boundaryConditions{ }; void constructBoundarySelectorPerPatch(int *patchTypeSelector, const std::string& patchTypeStr); -void constructBoundarySelector(std::vector& patchTypeSelector, const std::string& patchTypeStr, const int patchSize); struct dfMatrixDataBase { @@ -205,6 +204,9 @@ struct dfMatrixDataBase void initNonConstantFieldsInternal(const double *y); void initNonConstantFieldsBoundary(const double *boundary_y); + void preTimeStep(const double *rho_old); + void postTimeStep(); + // getter double* getFieldPointer(const char* fieldAlias, location loc, position pos); }; diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index cb6a44d5f..64b35f956 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -311,6 +311,12 @@ void dfMatrixDataBase::initNonConstantFieldsBoundary(const double *boundary_y) { checkCudaErrors(cudaMemcpyAsync(d_boundary_y, boundary_y, boundary_surface_value_bytes* num_species, cudaMemcpyHostToDevice, stream)); } +void dfMatrixDataBase::preTimeStep(const double *rho_old) { + checkCudaErrors(cudaMemcpyAsync(d_rho_old, rho_old, cell_value_bytes, cudaMemcpyHostToDevice, stream)); +} + +void dfMatrixDataBase::postTimeStep() {} + double* dfMatrixDataBase::getFieldPointer(const char* fieldAlias, location loc, position pos) { char mergedName[256]; if (pos == position::internal) { diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index 109f20c3f..a415a8a1b 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -4,6 +4,12 @@ void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output); void permute_vector_h2d(cudaStream_t stream, int num_cells, const double *input, double *output); +void field_multiply_scalar(cudaStream_t stream, + int num_cells, 
const double *input1, const double *input2, double *output, + int num_boundary_surfaces, const double *boundary_input1, const double *boundary_input2, double *boundary_output); + +void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volume, const double *fvc_output, double *source); + void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, const double *lower, const double *upper, const double *diag, const double *source, @@ -57,6 +63,13 @@ void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces const double *boundary_ssf, const double *volume, double *output); void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *Sf, const double *vf, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, + const double *volume); + +void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *Sf, const double *vf, double *output, // end for internal int num_patches, const int *patch_size, const int *patch_type, @@ -69,3 +82,6 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_patches, const int *patch_size, const int *patch_type, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume); +// others +void scale_dev2T_tensor(cudaStream_t stream, int num_cells, const double *vf1, double *vf2, + int num_boundary_surfaces, const double *boundary_vf1, double *boundary_vf2); diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index 7a76db89c..e4b2a25e9 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ 
b/src_gpu/dfMatrixOpBase.cu @@ -26,6 +26,30 @@ __global__ void permute_vector_h2d_kernel(int num_cells, const double *input, do output[num_cells * 2 + index] = input[index * 3 + 2]; } +__global__ void field_multiply_scalar_kernel(int num_cells, int num_boundary_surfaces, + const double *input1, const double *input2, double *output, + const double *boundary_input1, const double *boundary_input2, double *boundary_output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index < num_cells) { + output[index] = input1[index] * input2[index]; + } + if (index < num_boundary_surfaces) { + boundary_output[index] = boundary_input1[index] * boundary_input2[index]; + } +} + +__global__ void fvc_to_source_vector_kernel(int num_cells, const double *volume, const double *fvc_output, double *source) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + source[index * 3 + 0] += fvc_output[index * 3 + 0] * volume[index]; + source[index * 3 + 1] += fvc_output[index * 3 + 1] * volume[index]; + source[index * 3 + 2] += fvc_output[index * 3 + 2] * volume[index]; +} + __global__ void update_boundary_coeffs_zeroGradient_vector(int num, int offset, double *value_internal_coeffs, double *value_boundary_coeffs, double *gradient_internal_coeffs, double *gradient_boundary_coeffs) @@ -53,6 +77,34 @@ __global__ void update_boundary_coeffs_zeroGradient_vector(int num, int offset, gradient_boundary_coeffs[start_index * 3 + 2] = 0; } +__global__ void scale_dev2t_tensor_kernel(int num, const double *vf1, double *vf2) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num) + return; + + double scale = vf1[index]; + double val_xx = vf2[index * 9 + 0]; + double val_xy = vf2[index * 9 + 1]; + double val_xz = vf2[index * 9 + 2]; + double val_yx = vf2[index * 9 + 3]; + double val_yy = vf2[index * 9 + 4]; + double val_yz = vf2[index * 9 + 5]; + double val_zx = vf2[index * 9 + 6]; + double val_zy = vf2[index * 9 + 7]; + double val_zz 
= vf2[index * 9 + 8]; + double trace_coeff = (2. / 3.) * (val_xx + val_yy + val_zz); + vf2[index * 9 + 0] = scale * (val_xx - trace_coeff); + vf2[index * 9 + 1] = scale * val_yx; + vf2[index * 9 + 2] = scale * val_zx; + vf2[index * 9 + 3] = scale * val_xy; + vf2[index * 9 + 4] = scale * (val_yy - trace_coeff); + vf2[index * 9 + 5] = scale * val_zy; + vf2[index * 9 + 6] = scale * val_xz; + vf2[index * 9 + 7] = scale * val_yz; + vf2[index * 9 + 8] = scale * (val_zz - trace_coeff); +} + __global__ void fvm_ddt_vector_kernel(int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *volume, double *diag, double *source) @@ -590,6 +642,79 @@ __global__ void fvc_div_cell_vector_boundary(int num, int offset, const int *fac } +__global__ void fvc_div_cell_tensor_internal(int num_surfaces, + const int *lower_index, const int *upper_index, + const double *vf, const double *weight, const double *face_vector, + double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_surfaces) + return; + + double w = weight[index]; + double Sfx = face_vector[index * 3 + 0]; + double Sfy = face_vector[index * 3 + 1]; + double Sfz = face_vector[index * 3 + 2]; + int owner = lower_index[index]; + int neighbor = upper_index[index]; + + double ssf_xx = (w * (vf[owner * 9 + 0] - vf[neighbor * 9 + 0]) + vf[neighbor * 9 + 0]); + double ssf_xy = (w * (vf[owner * 9 + 1] - vf[neighbor * 9 + 1]) + vf[neighbor * 9 + 1]); + double ssf_xz = (w * (vf[owner * 9 + 2] - vf[neighbor * 9 + 2]) + vf[neighbor * 9 + 2]); + double ssf_yx = (w * (vf[owner * 9 + 3] - vf[neighbor * 9 + 3]) + vf[neighbor * 9 + 3]); + double ssf_yy = (w * (vf[owner * 9 + 4] - vf[neighbor * 9 + 4]) + vf[neighbor * 9 + 4]); + double ssf_yz = (w * (vf[owner * 9 + 5] - vf[neighbor * 9 + 5]) + vf[neighbor * 9 + 5]); + double ssf_zx = (w * (vf[owner * 9 + 6] - vf[neighbor * 9 + 6]) + vf[neighbor * 9 + 6]); + double ssf_zy = (w * (vf[owner * 9 + 7] - vf[neighbor * 9 
+ 7]) + vf[neighbor * 9 + 7]); + double ssf_zz = (w * (vf[owner * 9 + 8] - vf[neighbor * 9 + 8]) + vf[neighbor * 9 + 8]); + double div_x = Sfx * ssf_xx + Sfy * ssf_xy + Sfz * ssf_xz; + double div_y = Sfx * ssf_yx + Sfy * ssf_yy + Sfz * ssf_yz; + double div_z = Sfx * ssf_zx + Sfy * ssf_zy + Sfz * ssf_zz; + + // owner + atomicAdd(&(output[owner * 3 + 0]), div_x); + atomicAdd(&(output[owner * 3 + 1]), div_y); + atomicAdd(&(output[owner * 3 + 2]), div_z); + + // neighbour + atomicAdd(&(output[neighbor * 3 + 0]), -div_x); + atomicAdd(&(output[neighbor * 3 + 1]), -div_y); + atomicAdd(&(output[neighbor * 3 + 2]), -div_z); +} + +__global__ void fvc_div_cell_tensor_boundary(int num, int offset, const int *face2Cells, + const double *boundary_face_vector, const double *boundary_vf, double *output) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num) + return; + + int start_index = offset + index; + + double bouSfx = boundary_face_vector[start_index * 3 + 0]; + double bouSfy = boundary_face_vector[start_index * 3 + 1]; + double bouSfz = boundary_face_vector[start_index * 3 + 2]; + + double boussf_xx = boundary_vf[start_index * 9 + 0]; + double boussf_xy = boundary_vf[start_index * 9 + 1]; + double boussf_xz = boundary_vf[start_index * 9 + 2]; + double boussf_yx = boundary_vf[start_index * 9 + 3]; + double boussf_yy = boundary_vf[start_index * 9 + 4]; + double boussf_yz = boundary_vf[start_index * 9 + 5]; + double boussf_zx = boundary_vf[start_index * 9 + 6]; + double boussf_zy = boundary_vf[start_index * 9 + 7]; + double boussf_zz = boundary_vf[start_index * 9 + 8]; + int cellIndex = face2Cells[start_index]; + + double bouDiv_x = bouSfx * boussf_xx + bouSfy * boussf_xy + bouSfz * boussf_xz; + double bouDiv_y = bouSfx * boussf_yx + bouSfy * boussf_yy + bouSfz * boussf_yz; + double bouDiv_z = bouSfx * boussf_zx + bouSfy * boussf_zy + bouSfz * boussf_zz; + + atomicAdd(&(output[cellIndex * 3 + 0]), bouDiv_x); + atomicAdd(&(output[cellIndex * 3 + 1]), 
bouDiv_y); + atomicAdd(&(output[cellIndex * 3 + 2]), bouDiv_z); +} + void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output) { size_t threads_per_block = 256; @@ -604,6 +729,24 @@ void permute_vector_h2d(cudaStream_t stream, int num_cells, const double *input, permute_vector_h2d_kernel<<>>(num_cells, input, output); } +void field_multiply_scalar(cudaStream_t stream, + int num_cells, const double *input1, const double *input2, double *output, + int num_boundary_surfaces, const double *boundary_input1, const double *boundary_input2, double *boundary_output) +{ + size_t threads_per_block = 256; + size_t blocks_per_grid = (std::max(num_cells, num_boundary_surfaces) + threads_per_block - 1) / threads_per_block; + field_multiply_scalar_kernel<<>>(num_cells, num_boundary_surfaces, + input1, input2, output, boundary_input1, boundary_input2, boundary_output); +} + +void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volume, const double *fvc_output, double *source) +{ + size_t threads_per_block = 256; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + fvc_to_source_vector_kernel<<>>(num_cells, + volume, fvc_output, source); +} + void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, const double *lower, const double *upper, const double *diag, const double *source, @@ -720,6 +863,8 @@ void fvc_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *vf_old, double *output) { + checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * sizeof(double), stream)); + size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvc_ddt_scalar_kernel<<>>(num_cells, @@ -734,28 +879,12 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int 
num_surfaces, const double *volume, const double *boundary_mag_Sf, double *boundary_output, const double *boundary_deltaCoeffs) { - float time_elapsed = 0; - cudaEvent_t start, stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - - // checkCudaErrors(cudaStreamSynchronize(stream)); - checkCudaErrors(cudaEventRecord(start, 0)); - + checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 9 * sizeof(double), stream)); size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; fvc_grad_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output); - // checkCudaErrors(cudaStreamSynchronize(stream)); - checkCudaErrors(cudaEventRecord(stop, 0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); - printf("\nfvc_grad_vector_new internal 执行时间:%f(ms)\n", time_elapsed); - - // checkCudaErrors(cudaStreamSynchronize(stream)); - checkCudaErrors(cudaEventRecord(start, 0)); int offset = 0; // finish conctruct grad field except dividing cell volume for (int i = 0; i < num_patches; i++) { @@ -773,32 +902,13 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, } offset += patch_size[i]; } - // checkCudaErrors(cudaStreamSynchronize(stream)); - checkCudaErrors(cudaEventRecord(stop, 0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); - printf("fvc_grad_vector_new boundary1 执行时间:%f(ms)\n", time_elapsed); // divide cell volume - // checkCudaErrors(cudaStreamSynchronize(stream)); - checkCudaErrors(cudaEventRecord(start, 0)); - threads_per_block = 512; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; divide_cell_volume_tsr<<>>(num_cells, volume, output); - // 
checkCudaErrors(cudaStreamSynchronize(stream)); - checkCudaErrors(cudaEventRecord(stop, 0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); - printf("fvc_grad_vector_new divide_cell 执行时间:%f(ms)\n", time_elapsed); - // correct boundary conditions - // checkCudaErrors(cudaStreamSynchronize(stream)); - checkCudaErrors(cudaEventRecord(start, 0)); - offset = 0; for (int i = 0; i < num_patches; i++) { threads_per_block = 256; @@ -817,19 +927,25 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, } offset += patch_size[i]; } - // checkCudaErrors(cudaStreamSynchronize(stream)); +} - checkCudaErrors(cudaEventRecord(stop, 0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); - printf("fvc_grad_vector_new boundary2 执行时间:%f(ms)\n", time_elapsed); +void scale_dev2T_tensor(cudaStream_t stream, int num_cells, const double *vf1, double *vf2, + int num_boundary_surfaces, const double *boundary_vf1, double *boundary_vf2) +{ + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + scale_dev2t_tensor_kernel<<>>(num_cells, vf1, vf2); + + blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; + scale_dev2t_tensor_kernel<<>>(num_boundary_surfaces, boundary_vf1, boundary_vf2); } void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, const int *lowerAddr, const int *upperAddr, const double *ssf, const int *boundary_cell_face, const double *boundary_ssf, const double *volume, double *output) { + checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * sizeof(double), stream)); + size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / 
threads_per_block; fvc_div_surface_scalar_internal<<>>(num_surfaces, lowerAddr, upperAddr, ssf, output); @@ -852,6 +968,8 @@ void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume) { + checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * sizeof(double), stream)); + size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; fvc_div_cell_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output); @@ -879,31 +997,54 @@ void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, divide_cell_volume_scalar<<>>(num_cells, volume, output); } +void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *weight, const double *Sf, const double *vf, double *output, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, + const double *volume) +{ + checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 3 * sizeof(double), stream)); + + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + fvc_div_cell_tensor_internal<<>>(num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output); + + int offset = 0; + for (int i = 0; i < num_patches; i++) { + threads_per_block = 256; + blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; + // TODO: just basic patch type now + if (patch_type[i] == boundaryConditions::zeroGradient + || patch_type[i] == boundaryConditions::fixedValue) { + // TODO: just vector version now + fvc_div_cell_tensor_boundary<<>>(patch_size[i], offset, boundary_cell_face, + boundary_Sf, boundary_vf, output); + } else if (0) { + // xxx + fprintf(stderr, 
"boundaryConditions other than zeroGradient are not support yet!\n"); + } + offset += patch_size[i]; + } + + // divide cell volume + threads_per_block = 1024; + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + divide_cell_volume_vec<<>>(num_cells, volume, output); +} + void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *Sf, const double *vf, double *output, // end for internal int num_patches, const int *patch_size, const int *patch_type, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume) { - float time_elapsed = 0; - cudaEvent_t start, stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - checkCudaErrors(cudaEventRecord(start, 0)); - + checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 3 * sizeof(double), stream)); size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; fvc_grad_scalar_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output); - checkCudaErrors(cudaEventRecord(stop, 0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); - printf("\nfvc_grad_scalar_new internal 执行时间:%f(ms)\n", time_elapsed); - - checkCudaErrors(cudaEventRecord(start, 0)); - int offset = 0; for (int i = 0; i < num_patches; i++) { threads_per_block = 256; @@ -919,22 +1060,9 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, } offset += patch_size[i]; } - checkCudaErrors(cudaEventRecord(stop, 0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); - printf("fvc_grad_scalar_new boundary 执行时间:%f(ms)\n", 
time_elapsed); - - checkCudaErrors(cudaEventRecord(start, 0)); // divide cell volume threads_per_block = 1024; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; divide_cell_volume_vec<<>>(num_cells, volume, output); - - checkCudaErrors(cudaEventRecord(stop, 0)); - checkCudaErrors(cudaEventSynchronize(start)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop)); - printf("fvc_grad_scalar_new divide_cell_vector 执行时间:%f(ms)\n", time_elapsed); } diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index 0ee570b9d..880b9c347 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -35,6 +35,7 @@ private: double *d_grad_u = nullptr; double *d_rho_nueff = nullptr; double *d_permute = nullptr; + double *d_fvc_output = nullptr; // non-constant fields - boundary // thermophysical fields @@ -89,11 +90,11 @@ public: void initNonConstantFieldsBoundary(); // 方程运行 + void preProcessForRhoEqn(const double *h_rho, const double *h_phi, const double *h_boundary_phi); void preProcess(const double *h_u, const double *h_boundary_u, const double *h_p, const double *h_boundary_p, const double *h_nu_eff, const double *h_boundary_nu_eff, const double *h_boundary_rho); - void preProcessForRhoEqn(const double *h_phi, const double *h_boundary_phi); void process(); void postProcess(double *h_u); void solve(); - void compareResult(const double *lower, const double *upper, const double *diag, const double *internal_coeffs, const double *boundary_coeffs, bool printFlag); + void compareResult(const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs, bool printFlag); }; diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index fbbf9e71d..1e8065721 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -20,6 +20,7 @@ void dfUEqn::createNonConstantFieldsInternal() { // intermediate fields 
checkCudaErrors(cudaMalloc((void**)&d_grad_u, dataBase_.cell_value_tsr_bytes)); checkCudaErrors(cudaMalloc((void**)&d_rho_nueff, dataBase_.cell_value_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_fvc_output, dataBase_.cell_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_permute, dataBase_.cell_value_vec_bytes)); // getter for h_nu_eff @@ -62,6 +63,12 @@ void dfUEqn::initNonConstantFieldsBoundary() { d_gradient_internal_coeffs, d_gradient_boundary_coeffs); } +void dfUEqn::preProcessForRhoEqn(const double *h_rho, const double *h_phi, const double *h_boundary_phi) { + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_rho, h_rho, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi, h_phi, dataBase_.surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_phi, h_boundary_phi, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); +} + void dfUEqn::preProcess(const double *h_u, const double *h_boundary_u, const double *h_p, const double *h_boundary_p, const double *h_nu_eff, const double *h_boundary_nu_eff, const double *h_boundary_rho) { checkCudaErrors(cudaMemcpyAsync(dataBase_.d_u, h_u, dataBase_.cell_value_vec_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); @@ -82,37 +89,57 @@ void dfUEqn::preProcess(const double *h_u, const double *h_boundary_u, const dou checkCudaErrors(cudaMemsetAsync(d_b, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); } -void dfUEqn::preProcessForRhoEqn(const double *h_phi, const double *h_boundary_phi) { - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi, h_phi, dataBase_.surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_phi, h_boundary_phi, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); - - checkCudaErrors(cudaMemsetAsync(d_lower, 0, dataBase_.surface_value_bytes, 
dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_upper, 0, dataBase_.surface_value_bytes, dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_diag, 0, dataBase_.cell_value_bytes, dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_source, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_internal_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_boundary_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_A, 0, dataBase_.csr_value_vec_bytes, dataBase_.stream)); - checkCudaErrors(cudaMemsetAsync(d_b, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); - -} - void dfUEqn::process() { - // run each fvc or fvm function - fvm_div_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, - dataBase_.d_phi, dataBase_.d_weight, - d_lower, d_upper, d_diag, // end for internal - dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), - dataBase_.d_boundary_phi, d_value_internal_coeffs, d_value_boundary_coeffs, - d_internal_coeffs, d_boundary_coeffs); - //ldu_to_csr(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, - // dataBase_.d_lower_to_csr_index, dataBase_.d_upper_to_csr_index, dataBase_.d_diag_to_csr_index, - // d_lower, d_upper, d_diag, d_source, d_internal_coeffs, d_boundary_coeffs, d_A, d_b); - //solve(); + fvm_ddt_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, + dataBase_.d_rho, dataBase_.d_rho_old, dataBase_.d_u, dataBase_.d_volume, + d_diag, d_source); + fvm_div_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_phi, dataBase_.d_weight, + d_lower, d_upper, d_diag, // end for internal + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_phi, d_value_internal_coeffs, d_value_boundary_coeffs, + d_internal_coeffs, 
d_boundary_coeffs); + //field_multiply_scalar(dataBase_.stream, + // dataBase_.num_cells, dataBase_.d_rho, d_nu_eff, d_rho_nueff, // end for internal + // dataBase_.num_boundary_surfaces, dataBase_.d_boundary_rho, d_boundary_nu_eff, d_boundary_rho_nueff); + //fvm_laplacian_vector(dataBase_.stream, dataBase_.num_surfaces, + // dataBase_.d_owner, dataBase_.d_neighbor, + // dataBase_.d_weight, dataBase_.d_mag_sf, dataBase_.d_delta_coeffs, d_rho_nueff, + // d_lower, d_upper, d_diag, // end for internal + // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + // dataBase_.d_boundary_mag_sf, d_boundary_rho_nueff, + // d_gradient_internal_coeffs, d_gradient_boundary_coeffs, + // d_internal_coeffs, d_boundary_coeffs); + //fvc_grad_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, + // dataBase_.d_owner, dataBase_.d_neighbor, + // dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_u, d_grad_u, + // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + // dataBase_.d_boundary_face_cell, dataBase_.d_boundary_u, dataBase_.d_boundary_sf, + // dataBase_.d_volume, dataBase_.d_boundary_mag_sf, d_boundary_grad_u, dataBase_.d_boundary_delta_coeffs); + //scale_dev2T_tensor(dataBase_.stream, dataBase_.num_cells, d_rho_nueff, d_grad_u, // end for internal + // dataBase_.num_boundary_surfaces, d_boundary_rho_nueff, d_boundary_grad_u); + //fvc_div_cell_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, + // dataBase_.d_owner, dataBase_.d_neighbor, + // dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_fvc_output, // end for internal + // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + // dataBase_.d_boundary_face_cell, d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume); + //fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, + // dataBase_.d_volume, d_fvc_output, d_source); + fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, + 
dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_fvc_output, + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume); + fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, + dataBase_.d_volume, d_fvc_output, d_source); + //solve(); } void dfUEqn::solve() { - checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); + //ldu_to_csr(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, + // dataBase_.d_lower_to_csr_index, dataBase_.d_upper_to_csr_index, dataBase_.d_diag_to_csr_index, + // d_lower, d_upper, d_diag, d_source, d_internal_coeffs, d_boundary_coeffs, d_A, d_b); + ////checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); int nNz = dataBase_.num_cells + dataBase_.num_surfaces * 2; // matrix entries if (num_iteration == 0) // first interation @@ -166,36 +193,43 @@ double* dfUEqn::getFieldPointer(const char* fieldAlias, location loc, position p return pointer; } -void dfUEqn::compareResult(const double *lower, const double *upper, const double *diag, const double *internal_coeffs, const double *boundary_coeffs, bool printFlag) +void dfUEqn::compareResult(const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs, bool printFlag) { + DEBUG_TRACE; std::vector h_lower; h_lower.resize(dataBase_.num_surfaces); checkCudaErrors(cudaMemcpy(h_lower.data(), d_lower, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost)); checkVectorEqual(dataBase_.num_surfaces, lower, h_lower.data(), 1e-14, printFlag); + DEBUG_TRACE; std::vector h_upper; h_upper.resize(dataBase_.num_surfaces); checkCudaErrors(cudaMemcpy(h_upper.data(), d_upper, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost)); checkVectorEqual(dataBase_.num_surfaces, upper, h_upper.data(), 1e-14, printFlag); + DEBUG_TRACE; std::vector h_diag; 
h_diag.resize(dataBase_.num_cells); checkCudaErrors(cudaMemcpy(h_diag.data(), d_diag, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost)); checkVectorEqual(dataBase_.num_cells, diag, h_diag.data(), 1e-14, printFlag); + DEBUG_TRACE; - //std::vector h_source; - //h_source.resize(dataBase_.num_cells * 3); - //checkCudaErrors(cudaMemcpy(h_source.data(), d_source, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); - //checkVectorEqual(dataBase_.num_cells, source, h_source.data(), 1e-14, printFlag); + std::vector h_source; + h_source.resize(dataBase_.num_cells * 3); + checkCudaErrors(cudaMemcpy(h_source.data(), d_source, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); + checkVectorEqual(dataBase_.num_cells, source, h_source.data(), 1e-14, printFlag); + DEBUG_TRACE; std::vector h_internal_coeffs; h_internal_coeffs.resize(dataBase_.num_boundary_surfaces * 3); checkCudaErrors(cudaMemcpy(h_internal_coeffs.data(), d_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); checkVectorEqual(dataBase_.num_boundary_surfaces * 3, internal_coeffs, h_internal_coeffs.data(), 1e-14, printFlag); + DEBUG_TRACE; std::vector h_boundary_coeffs; h_boundary_coeffs.resize(dataBase_.num_boundary_surfaces * 3); checkCudaErrors(cudaMemcpy(h_boundary_coeffs.data(), d_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); checkVectorEqual(dataBase_.num_boundary_surfaces * 3, boundary_coeffs, h_boundary_coeffs.data(), 1e-14, printFlag); + DEBUG_TRACE; } From 04d5512a627a3d5bf1e6b258070bf4b67187c223 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Tue, 15 Aug 2023 19:04:48 +0800 Subject: [PATCH 20/25] fvc/fvm ops support sign --- applications/solvers/dfLowMachFoam/new_UEqn.H | 3 +- src_gpu/dfMatrixOpBase.H | 19 +- src_gpu/dfMatrixOpBase.cu | 169 +++++++++--------- src_gpu/dfUEqn.cu | 6 +- 4 files changed, 100 insertions(+), 97 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/new_UEqn.H 
b/applications/solvers/dfLowMachFoam/new_UEqn.H index 3d84f3631..1b5487139 100644 --- a/applications/solvers/dfLowMachFoam/new_UEqn.H +++ b/applications/solvers/dfLowMachFoam/new_UEqn.H @@ -5,7 +5,8 @@ const volScalarField& nuEff = nuEff_tmp(); // run CPU, for temp tmp tUEqn ( - fvm::ddt(rho, U) + fvm::div(phi, U) == -fvc::grad(p) + -fvm::ddt(rho, U) - fvm::div(phi, U) == fvc::grad(p) + //fvm::ddt(rho, U) + fvm::div(phi, U) == -fvc::grad(p) //turbulence->divDevRhoReff(U) ); fvVectorMatrix& UEqn = tUEqn.ref(); diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index a415a8a1b..ae303cedb 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -25,14 +25,14 @@ void update_boundary_coeffs_vector(cudaStream_t stream, int num_patches, void fvm_ddt_vector(cudaStream_t stream, int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *volume, - double *diag, double *source); + double *diag, double *source, double sign = 1.); void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *phi, const double *weight, double *lower, double *upper, double *diag, // end for internal int num_patches, const int *patch_size, const int *patch_type, const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, - double *internal_coeffs, double *boundary_coeffs); + double *internal_coeffs, double *boundary_coeffs, double sign = 1.); void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, @@ -41,14 +41,13 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_patches, const int *patch_size, const int *patch_type, const double *boundary_mag_sf, const double *boundary_gamma, const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, - double *internal_coeffs, double *boundary_coeffs); + double *internal_coeffs, double 
*boundary_coeffs, double sign = 1.); // fvc ops // fvc_ddt doesn't consider to add fvc_output to source yet, which needs (fvc_output * volume * sign). void fvc_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *vf_old, - double *output); - + double *output, double sign = 1.); void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, const int *lowerAddr, const int *upperAddr, @@ -56,31 +55,31 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, int num_patches, const int *patch_size, const int *patch_type, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, const double *boundary_mag_Sf, double *boundary_output, - const double *boundary_deltaCoeffs); + const double *boundary_deltaCoeffs, double sign = 1.); void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, const int *lowerAddr, const int *upperAddr, const double *ssf, const int *boundary_cell_face, - const double *boundary_ssf, const double *volume, double *output); + const double *boundary_ssf, const double *volume, double *output, double sign = 1.); void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *Sf, const double *vf, double *output, // end for internal int num_patches, const int *patch_size, const int *patch_type, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, - const double *volume); + const double *volume, double sign = 1.); void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *Sf, const double *vf, double *output, // end for internal int num_patches, const int *patch_size, const int *patch_type, const int 
*boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, - const double *volume); + const double *volume, double sign = 1.); void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *Sf, const double *vf, double *output, // end for internal int num_patches, const int *patch_size, const int *patch_type, - const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume); + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign = 1.); // others void scale_dev2T_tensor(cudaStream_t stream, int num_cells, const double *vf1, double *vf2, diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index e4b2a25e9..fd90ce480 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -107,23 +107,23 @@ __global__ void scale_dev2t_tensor_kernel(int num, const double *vf1, double *vf __global__ void fvm_ddt_vector_kernel(int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *volume, - double *diag, double *source) + double *diag, double *source, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_cells) return; - diag[index] += rDeltaT * rho[index] * volume[index]; + diag[index] += rDeltaT * rho[index] * volume[index] * sign; // TODO: skip moving - source[index * 3 + 0] += rDeltaT * rho_old[index] * vf[index * 3 + 0] * volume[index]; - source[index * 3 + 1] += rDeltaT * rho_old[index] * vf[index * 3 + 1] * volume[index]; - source[index * 3 + 2] += rDeltaT * rho_old[index] * vf[index * 3 + 2] * volume[index]; + source[index * 3 + 0] += rDeltaT * rho_old[index] * vf[index * 3 + 0] * volume[index] * sign; + source[index * 3 + 1] += rDeltaT * rho_old[index] * vf[index * 3 + 1] * volume[index] * sign; + source[index * 3 + 2] += rDeltaT * rho_old[index] * 
vf[index * 3 + 2] * volume[index] * sign; } __global__ void fvm_div_vector_internal(int num_surfaces, const int *lower_index, const int *upper_index, const double *phi, const double *weight, - double *lower, double *upper, double *diag) + double *lower, double *upper, double *diag, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_surfaces) @@ -132,8 +132,8 @@ __global__ void fvm_div_vector_internal(int num_surfaces, double w = weight[index]; double f = phi[index]; - double lower_value = (-w) * f; - double upper_value = (1 - w) * f; + double lower_value = (-w) * f * sign; + double upper_value = (1 - w) * f * sign; lower[index] += lower_value; upper[index] += upper_value; // if (index == 0) printf("index = 0, lower: %.16lf, upper:%.16lf\n", lower[index], upper[index]); @@ -146,7 +146,7 @@ __global__ void fvm_div_vector_internal(int num_surfaces, __global__ void fvm_div_vector_boundary(int num, int offset, const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, - double *internal_coeffs, double *boundary_coeffs) + double *internal_coeffs, double *boundary_coeffs, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num) @@ -154,18 +154,18 @@ __global__ void fvm_div_vector_boundary(int num, int offset, int start_index = offset + index; double boundary_f = boundary_phi[start_index]; - internal_coeffs[start_index * 3 + 0] += boundary_f * value_internal_coeffs[start_index * 3 + 0]; - internal_coeffs[start_index * 3 + 1] += boundary_f * value_internal_coeffs[start_index * 3 + 1]; - internal_coeffs[start_index * 3 + 2] += boundary_f * value_internal_coeffs[start_index * 3 + 2]; - boundary_coeffs[start_index * 3 + 0] += boundary_f * value_boundary_coeffs[start_index * 3 + 0]; - boundary_coeffs[start_index * 3 + 1] += boundary_f * value_boundary_coeffs[start_index * 3 + 1]; - boundary_coeffs[start_index * 3 + 2] += boundary_f * value_boundary_coeffs[start_index * 3 + 2]; 
+ internal_coeffs[start_index * 3 + 0] += boundary_f * value_internal_coeffs[start_index * 3 + 0] * sign; + internal_coeffs[start_index * 3 + 1] += boundary_f * value_internal_coeffs[start_index * 3 + 1] * sign; + internal_coeffs[start_index * 3 + 2] += boundary_f * value_internal_coeffs[start_index * 3 + 2] * sign; + boundary_coeffs[start_index * 3 + 0] += boundary_f * value_boundary_coeffs[start_index * 3 + 0] * sign; + boundary_coeffs[start_index * 3 + 1] += boundary_f * value_boundary_coeffs[start_index * 3 + 1] * sign; + boundary_coeffs[start_index * 3 + 2] += boundary_f * value_boundary_coeffs[start_index * 3 + 2] * sign; } __global__ void fvm_laplacian_vector_internal(int num_surfaces, const int *lower_index, const int *upper_index, const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma, - double *lower, double *upper, double *diag) + double *lower, double *upper, double *diag, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_surfaces) @@ -183,6 +183,9 @@ __global__ void fvm_laplacian_vector_internal(int num_surfaces, //double lower_value = lower_face_gamma * mag_sf[index] * delta_coeffs[index]; double lower_value = upper_value; + lower_value = lower_value * sign; + upper_value = upper_value * sign; + lower[index] += lower_value; upper[index] += upper_value; @@ -193,7 +196,7 @@ __global__ void fvm_laplacian_vector_internal(int num_surfaces, __global__ void fvm_laplacian_vector_boundary(int num, int offset, const double *boundary_mag_sf, const double *boundary_gamma, const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, - double *internal_coeffs, double *boundary_coeffs) + double *internal_coeffs, double *boundary_coeffs, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num) @@ -201,17 +204,17 @@ __global__ void fvm_laplacian_vector_boundary(int num, int offset, int start_index = offset + index; double boundary_value = 
boundary_gamma[start_index] * boundary_mag_sf[start_index]; - internal_coeffs[start_index * 3 + 0] += boundary_value * gradient_internal_coeffs[start_index * 3 + 0]; - internal_coeffs[start_index * 3 + 1] += boundary_value * gradient_internal_coeffs[start_index * 3 + 1]; - internal_coeffs[start_index * 3 + 2] += boundary_value * gradient_internal_coeffs[start_index * 3 + 2]; - boundary_coeffs[start_index * 3 + 0] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 0]; - boundary_coeffs[start_index * 3 + 1] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 1]; - boundary_coeffs[start_index * 3 + 2] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 2]; + internal_coeffs[start_index * 3 + 0] += boundary_value * gradient_internal_coeffs[start_index * 3 + 0] * sign; + internal_coeffs[start_index * 3 + 1] += boundary_value * gradient_internal_coeffs[start_index * 3 + 1] * sign; + internal_coeffs[start_index * 3 + 2] += boundary_value * gradient_internal_coeffs[start_index * 3 + 2] * sign; + boundary_coeffs[start_index * 3 + 0] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 0] * sign; + boundary_coeffs[start_index * 3 + 1] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 1] * sign; + boundary_coeffs[start_index * 3 + 2] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 2] * sign; } __global__ void fvc_ddt_scalar_kernel(int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *vf_old, - double *output) + double *output, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_cells) @@ -241,7 +244,7 @@ __global__ void fvc_ddt_scalar_kernel(int num_cells, double rDeltaT, output[index] += rDeltaT * (val_new - val_old); */ // workaround way3 (use nvcc option -fmad=false) - output[index] += rDeltaT * (rho[index] * vf[index] - rho_old[index] * vf_old[index]); + output[index] += rDeltaT * (rho[index] * vf[index] - rho_old[index] 
* vf_old[index]) * sign; } __global__ void fvc_grad_vector_internal(int num_surfaces, @@ -417,25 +420,25 @@ __global__ void fvc_grad_scalar_boundary(int num_cells, int num, int offset, con // } } -__global__ void divide_cell_volume_tsr(int num_cells, const double* volume, double *output) +__global__ void divide_cell_volume_tsr(int num_cells, const double* volume, double *output, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_cells) return; double vol = volume[index]; - output[index * 9 + 0] = output[index * 9 + 0] / vol; - output[index * 9 + 1] = output[index * 9 + 1] / vol; - output[index * 9 + 2] = output[index * 9 + 2] / vol; - output[index * 9 + 3] = output[index * 9 + 3] / vol; - output[index * 9 + 4] = output[index * 9 + 4] / vol; - output[index * 9 + 5] = output[index * 9 + 5] / vol; - output[index * 9 + 6] = output[index * 9 + 6] / vol; - output[index * 9 + 7] = output[index * 9 + 7] / vol; - output[index * 9 + 8] = output[index * 9 + 8] / vol; + output[index * 9 + 0] = output[index * 9 + 0] / vol * sign; + output[index * 9 + 1] = output[index * 9 + 1] / vol * sign; + output[index * 9 + 2] = output[index * 9 + 2] / vol * sign; + output[index * 9 + 3] = output[index * 9 + 3] / vol * sign; + output[index * 9 + 4] = output[index * 9 + 4] / vol * sign; + output[index * 9 + 5] = output[index * 9 + 5] / vol * sign; + output[index * 9 + 6] = output[index * 9 + 6] / vol * sign; + output[index * 9 + 7] = output[index * 9 + 7] / vol * sign; + output[index * 9 + 8] = output[index * 9 + 8] / vol * sign; } -__global__ void divide_cell_volume_vec(int num_cells, const double* volume, double *output) +__global__ void divide_cell_volume_vec(int num_cells, const double* volume, double *output, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_cells) @@ -443,12 +446,12 @@ __global__ void divide_cell_volume_vec(int num_cells, const double* volume, doub double vol = volume[index]; - output[index * 3 + 0] = 
output[index * 3 + 0] / vol; - output[index * 3 + 1] = output[index * 3 + 1] / vol; - output[index * 3 + 2] = output[index * 3 + 2] / vol; + output[index * 3 + 0] = output[index * 3 + 0] / vol * sign; + output[index * 3 + 1] = output[index * 3 + 1] / vol * sign; + output[index * 3 + 2] = output[index * 3 + 2] / vol * sign; } -__global__ void divide_cell_volume_scalar(int num_cells, const double* volume, double *output) +__global__ void divide_cell_volume_scalar(int num_cells, const double* volume, double *output, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_cells) @@ -456,12 +459,12 @@ __global__ void divide_cell_volume_scalar(int num_cells, const double* volume, d double vol = volume[index]; - output[index] = output[index] / vol; + output[index] = output[index] / vol * sign; } __global__ void fvc_grad_vector_correctBC_zeroGradient(int num, int offset, const int *face2Cells, const double *internal_grad, const double *vf, const double *boundary_sf, - const double *boundary_mag_sf, double *boundary_grad) + const double *boundary_mag_sf, double *boundary_grad, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num) @@ -493,21 +496,21 @@ __global__ void fvc_grad_vector_correctBC_zeroGradient(int num, int offset, cons double grad_correction_y = - (n_x * grad_xy + n_y * grad_yy + n_z * grad_zy); double grad_correction_z = - (n_x * grad_xz + n_y * grad_yz + n_z * grad_zz); - boundary_grad[start_index * 9 + 0] = grad_xx + n_x * grad_correction_x; - boundary_grad[start_index * 9 + 1] = grad_xy + n_x * grad_correction_y; - boundary_grad[start_index * 9 + 2] = grad_xz + n_x * grad_correction_z; - boundary_grad[start_index * 9 + 3] = grad_yx + n_y * grad_correction_x; - boundary_grad[start_index * 9 + 4] = grad_yy + n_y * grad_correction_y; - boundary_grad[start_index * 9 + 5] = grad_yz + n_y * grad_correction_z; - boundary_grad[start_index * 9 + 6] = grad_zx + n_z * grad_correction_x; - 
boundary_grad[start_index * 9 + 7] = grad_zy + n_z * grad_correction_y; - boundary_grad[start_index * 9 + 8] = grad_zz + n_z * grad_correction_z; + boundary_grad[start_index * 9 + 0] = (grad_xx + n_x * grad_correction_x) * sign; + boundary_grad[start_index * 9 + 1] = (grad_xy + n_x * grad_correction_y) * sign; + boundary_grad[start_index * 9 + 2] = (grad_xz + n_x * grad_correction_z) * sign; + boundary_grad[start_index * 9 + 3] = (grad_yx + n_y * grad_correction_x) * sign; + boundary_grad[start_index * 9 + 4] = (grad_yy + n_y * grad_correction_y) * sign; + boundary_grad[start_index * 9 + 5] = (grad_yz + n_y * grad_correction_z) * sign; + boundary_grad[start_index * 9 + 6] = (grad_zx + n_z * grad_correction_x) * sign; + boundary_grad[start_index * 9 + 7] = (grad_zy + n_z * grad_correction_y) * sign; + boundary_grad[start_index * 9 + 8] = (grad_zz + n_z * grad_correction_z) * sign; } __global__ void fvc_grad_vector_correctBC_fixedValue(int num, int offset, const int *face2Cells, const double *internal_grad, const double *vf, const double *boundary_sf, const double *boundary_mag_sf, double *boundary_grad, - const double *boundary_deltaCoeffs, const double *boundary_vf) + const double *boundary_deltaCoeffs, const double *boundary_vf, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num) @@ -544,15 +547,15 @@ __global__ void fvc_grad_vector_correctBC_fixedValue(int num, int offset, const double grad_correction_y = sn_grad_y - (n_x * grad_xy + n_y * grad_yy + n_z * grad_zy); double grad_correction_z = sn_grad_z - (n_x * grad_xz + n_y * grad_yz + n_z * grad_zz); - boundary_grad[start_index * 9 + 0] = grad_xx + n_x * grad_correction_x; - boundary_grad[start_index * 9 + 1] = grad_xy + n_x * grad_correction_y; - boundary_grad[start_index * 9 + 2] = grad_xz + n_x * grad_correction_z; - boundary_grad[start_index * 9 + 3] = grad_yx + n_y * grad_correction_x; - boundary_grad[start_index * 9 + 4] = grad_yy + n_y * grad_correction_y; - 
boundary_grad[start_index * 9 + 5] = grad_yz + n_y * grad_correction_z; - boundary_grad[start_index * 9 + 6] = grad_zx + n_z * grad_correction_x; - boundary_grad[start_index * 9 + 7] = grad_zy + n_z * grad_correction_y; - boundary_grad[start_index * 9 + 8] = grad_zz + n_z * grad_correction_z; + boundary_grad[start_index * 9 + 0] = (grad_xx + n_x * grad_correction_x) * sign; + boundary_grad[start_index * 9 + 1] = (grad_xy + n_x * grad_correction_y) * sign; + boundary_grad[start_index * 9 + 2] = (grad_xz + n_x * grad_correction_z) * sign; + boundary_grad[start_index * 9 + 3] = (grad_yx + n_y * grad_correction_x) * sign; + boundary_grad[start_index * 9 + 4] = (grad_yy + n_y * grad_correction_y) * sign; + boundary_grad[start_index * 9 + 5] = (grad_yz + n_y * grad_correction_z) * sign; + boundary_grad[start_index * 9 + 6] = (grad_zx + n_z * grad_correction_x) * sign; + boundary_grad[start_index * 9 + 7] = (grad_zy + n_z * grad_correction_y) * sign; + boundary_grad[start_index * 9 + 8] = (grad_zz + n_z * grad_correction_z) * sign; } __global__ void fvc_div_surface_scalar_internal(int num_surfaces, @@ -783,12 +786,12 @@ void update_boundary_coeffs_vector(cudaStream_t stream, int num_patches, void fvm_ddt_vector(cudaStream_t stream, int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *volume, - double *diag, double *source) + double *diag, double *source, double sign) { size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvm_ddt_vector_kernel<<>>(num_cells, - rDeltaT, rho, rho_old, vf, volume, diag, source); + rDeltaT, rho, rho_old, vf, volume, diag, source, sign); } void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, @@ -796,14 +799,14 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, double *lower, double *upper, double *diag, // end for internal int num_patches, const int 
*patch_size, const int *patch_type, const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, - double *internal_coeffs, double *boundary_coeffs) + double *internal_coeffs, double *boundary_coeffs, double sign) { size_t threads_per_block = 1024; size_t blocks_per_grid = 1; blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; fvm_div_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, - phi, weight, lower, upper, diag); + phi, weight, lower, upper, diag, sign); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -815,7 +818,7 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, // TODO: just vector version now fvm_div_vector_boundary<<>>(patch_size[i], offset, boundary_phi, value_internal_coeffs, value_boundary_coeffs, - internal_coeffs, boundary_coeffs); + internal_coeffs, boundary_coeffs, sign); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); @@ -831,14 +834,14 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_patches, const int *patch_size, const int *patch_type, const double *boundary_mag_sf, const double *boundary_gamma, const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, - double *internal_coeffs, double *boundary_coeffs) + double *internal_coeffs, double *boundary_coeffs, double sign) { size_t threads_per_block = 1024; size_t blocks_per_grid = 1; blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; fvm_laplacian_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, - weight, mag_sf, delta_coeffs, gamma, lower, upper, diag); + weight, mag_sf, delta_coeffs, gamma, lower, upper, diag, sign); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -850,7 +853,7 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, // TODO: just vector version now fvm_laplacian_vector_boundary<<>>(patch_size[i], 
offset, boundary_mag_sf, boundary_gamma, gradient_internal_coeffs, gradient_boundary_coeffs, - internal_coeffs, boundary_coeffs); + internal_coeffs, boundary_coeffs, sign); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); @@ -861,14 +864,14 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, void fvc_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *vf_old, - double *output) + double *output, double sign) { checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * sizeof(double), stream)); size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvc_ddt_scalar_kernel<<>>(num_cells, - rDeltaT, rho, rho_old, vf, vf_old, output); + rDeltaT, rho, rho_old, vf, vf_old, output, sign); } void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, @@ -877,7 +880,7 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, int num_patches, const int *patch_size, const int *patch_type, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, const double *boundary_mag_Sf, double *boundary_output, - const double *boundary_deltaCoeffs) + const double *boundary_deltaCoeffs, double sign) { checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 9 * sizeof(double), stream)); size_t threads_per_block = 1024; @@ -906,7 +909,7 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, // divide cell volume threads_per_block = 512; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - divide_cell_volume_tsr<<>>(num_cells, volume, output); + divide_cell_volume_tsr<<>>(num_cells, volume, output, sign); // correct boundary conditions offset = 0; @@ -917,10 +920,10 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, if 
(patch_type[i] == boundaryConditions::zeroGradient) { // TODO: just vector version now fvc_grad_vector_correctBC_zeroGradient<<>>(patch_size[i], offset, boundary_cell_face, - output, boundary_vf, boundary_Sf, boundary_mag_Sf, boundary_output); + output, boundary_vf, boundary_Sf, boundary_mag_Sf, boundary_output, sign); } else if (patch_type[i] == boundaryConditions::fixedValue) { fvc_grad_vector_correctBC_fixedValue<<>>(patch_size[i], offset, boundary_cell_face, - output, boundary_vf, boundary_Sf, boundary_mag_Sf, boundary_output, boundary_deltaCoeffs, boundary_vf); + output, boundary_vf, boundary_Sf, boundary_mag_Sf, boundary_output, boundary_deltaCoeffs, boundary_vf, sign); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); @@ -942,7 +945,7 @@ void scale_dev2T_tensor(cudaStream_t stream, int num_cells, const double *vf1, d void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, const int *lowerAddr, const int *upperAddr, const double *ssf, const int *boundary_cell_face, - const double *boundary_ssf, const double *volume, double *output) + const double *boundary_ssf, const double *volume, double *output, double sign) { checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * sizeof(double), stream)); @@ -958,7 +961,7 @@ void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces // divide cell volume threads_per_block = 1024; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - divide_cell_volume_scalar<<>>(num_cells, volume, output); + divide_cell_volume_scalar<<>>(num_cells, volume, output, sign); } void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, @@ -966,7 +969,7 @@ void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, const double *weight, const double *Sf, const double *vf, double *output, // end for internal int num_patches, const int *patch_size, const 
int *patch_type, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, - const double *volume) + const double *volume, double sign) { checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * sizeof(double), stream)); @@ -994,7 +997,7 @@ void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, // divide cell volume threads_per_block = 1024; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - divide_cell_volume_scalar<<>>(num_cells, volume, output); + divide_cell_volume_scalar<<>>(num_cells, volume, output, sign); } void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, @@ -1002,7 +1005,7 @@ void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, const double *weight, const double *Sf, const double *vf, double *output, // end for internal int num_patches, const int *patch_size, const int *patch_type, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, - const double *volume) + const double *volume, double sign) { checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 3 * sizeof(double), stream)); @@ -1030,14 +1033,14 @@ void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, // divide cell volume threads_per_block = 1024; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - divide_cell_volume_vec<<>>(num_cells, volume, output); + divide_cell_volume_vec<<>>(num_cells, volume, output, sign); } void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *Sf, const double *vf, double *output, // end for internal int num_patches, const int *patch_size, const int *patch_type, - const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume) + const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const 
double *volume, double sign) { checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 3 * sizeof(double), stream)); size_t threads_per_block = 1024; @@ -1064,5 +1067,5 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, // divide cell volume threads_per_block = 1024; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - divide_cell_volume_vec<<>>(num_cells, volume, output); + divide_cell_volume_vec<<>>(num_cells, volume, output, sign); } diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 1e8065721..f20808ec8 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -92,13 +92,13 @@ void dfUEqn::preProcess(const double *h_u, const double *h_boundary_u, const dou void dfUEqn::process() { fvm_ddt_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, dataBase_.d_rho, dataBase_.d_rho_old, dataBase_.d_u, dataBase_.d_volume, - d_diag, d_source); + d_diag, d_source, -1.); fvm_div_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_phi, dataBase_.d_weight, d_lower, d_upper, d_diag, // end for internal dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_phi, d_value_internal_coeffs, d_value_boundary_coeffs, - d_internal_coeffs, d_boundary_coeffs); + d_internal_coeffs, d_boundary_coeffs, -1.); //field_multiply_scalar(dataBase_.stream, // dataBase_.num_cells, dataBase_.d_rho, d_nu_eff, d_rho_nueff, // end for internal // dataBase_.num_boundary_surfaces, dataBase_.d_boundary_rho, d_boundary_nu_eff, d_boundary_rho_nueff); @@ -129,7 +129,7 @@ void dfUEqn::process() { dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_fvc_output, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), - dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume); + dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, 
dataBase_.d_boundary_sf, dataBase_.d_volume, -1.); fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.d_volume, d_fvc_output, d_source); //solve(); From cbb74cd69e44cd1017a62e4e0a67ec33627f0a57 Mon Sep 17 00:00:00 2001 From: STwangyingrui Date: Tue, 15 Aug 2023 19:49:38 +0800 Subject: [PATCH 21/25] use cuda graph in ueqn --- .../solvers/dfLowMachFoam/new_dfLowMachFoam.C | 10 +- src_gpu/dfMatrixDataBase.H | 5 - src_gpu/dfMatrixDataBase.cu | 4 - src_gpu/dfUEqn.H | 13 ++- src_gpu/dfUEqn.cu | 110 +++++++++++------- 5 files changed, 84 insertions(+), 58 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C index 0deffb40f..a8368f5af 100644 --- a/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C +++ b/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C @@ -98,11 +98,11 @@ int main(int argc, char *argv[]) createGPUBase(mesh, Y); createGPUUEqn(CanteraTorchProperties, U); - // foreach(timestep) { - dfDataBase.preTimeStep(&rho.oldTime()[0]); - #include "new_UEqn.H" - dfDataBase.postTimeStep(); - // } + for (int timestep = 0; timestep < 10; timestep++) { + dfDataBase.preTimeStep(&rho.oldTime()[0]); + #include "new_UEqn.H" + dfDataBase.postTimeStep(); + } } return 0; } diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H index 69d20d7af..cac7264a8 100644 --- a/src_gpu/dfMatrixDataBase.H +++ b/src_gpu/dfMatrixDataBase.H @@ -68,11 +68,6 @@ struct dfMatrixDataBase { // cuda resource cudaStream_t stream; - // maybe one graph for one eqn before using self-developed solver - // and should be located in each eqn. 
- cudaGraph_t graph; - cudaGraphExec_t graph_instance; - bool graph_created=false; // constant values -- basic int num_cells = 0; diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index 64b35f956..8c2c26faf 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -48,10 +48,6 @@ dfMatrixDataBase::dfMatrixDataBase() { dfMatrixDataBase::~dfMatrixDataBase() { // destroy cuda resources checkCudaErrors(cudaStreamDestroy(stream)); - if (graph_created) { - checkCudaErrors(cudaGraphExecDestroy(graph_instance)); - checkCudaErrors(cudaGraphDestroy(graph)); - } // TODO: free pointers } diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index 880b9c347..4d1c25697 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -10,6 +10,12 @@ class dfUEqn private: dfMatrixDataBase &dataBase_; + // cuda resource + // one graph for one eqn before using self-developed solver + cudaGraph_t graph; + cudaGraphExec_t graph_instance; + bool graph_created=false; + // constant values -- basic std::string mode_string; std::string setting_path; @@ -72,7 +78,12 @@ public: : dataBase_(dataBase) {} // 析构函数 - ~dfUEqn(){} + ~dfUEqn(){ + if (graph_created) { + checkCudaErrors(cudaGraphExecDestroy(graph_instance)); + checkCudaErrors(cudaGraphDestroy(graph)); + } + } // 成员函数 diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index f20808ec8..470550cd0 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -90,48 +90,73 @@ void dfUEqn::preProcess(const double *h_u, const double *h_boundary_u, const dou } void dfUEqn::process() { - fvm_ddt_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, - dataBase_.d_rho, dataBase_.d_rho_old, dataBase_.d_u, dataBase_.d_volume, - d_diag, d_source, -1.); - fvm_div_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, - dataBase_.d_phi, dataBase_.d_weight, - d_lower, d_upper, d_diag, // end for internal - dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), - 
dataBase_.d_boundary_phi, d_value_internal_coeffs, d_value_boundary_coeffs, - d_internal_coeffs, d_boundary_coeffs, -1.); - //field_multiply_scalar(dataBase_.stream, - // dataBase_.num_cells, dataBase_.d_rho, d_nu_eff, d_rho_nueff, // end for internal - // dataBase_.num_boundary_surfaces, dataBase_.d_boundary_rho, d_boundary_nu_eff, d_boundary_rho_nueff); - //fvm_laplacian_vector(dataBase_.stream, dataBase_.num_surfaces, - // dataBase_.d_owner, dataBase_.d_neighbor, - // dataBase_.d_weight, dataBase_.d_mag_sf, dataBase_.d_delta_coeffs, d_rho_nueff, - // d_lower, d_upper, d_diag, // end for internal - // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), - // dataBase_.d_boundary_mag_sf, d_boundary_rho_nueff, - // d_gradient_internal_coeffs, d_gradient_boundary_coeffs, - // d_internal_coeffs, d_boundary_coeffs); - //fvc_grad_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, - // dataBase_.d_owner, dataBase_.d_neighbor, - // dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_u, d_grad_u, - // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), - // dataBase_.d_boundary_face_cell, dataBase_.d_boundary_u, dataBase_.d_boundary_sf, - // dataBase_.d_volume, dataBase_.d_boundary_mag_sf, d_boundary_grad_u, dataBase_.d_boundary_delta_coeffs); - //scale_dev2T_tensor(dataBase_.stream, dataBase_.num_cells, d_rho_nueff, d_grad_u, // end for internal - // dataBase_.num_boundary_surfaces, d_boundary_rho_nueff, d_boundary_grad_u); - //fvc_div_cell_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, - // dataBase_.d_owner, dataBase_.d_neighbor, - // dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_fvc_output, // end for internal - // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), - // dataBase_.d_boundary_face_cell, d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume); - //fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, - // dataBase_.d_volume, d_fvc_output, 
d_source); - fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, - dataBase_.d_owner, dataBase_.d_neighbor, - dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_fvc_output, - dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), - dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume, -1.); - fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, - dataBase_.d_volume, d_fvc_output, d_source); + //使用event计算时间 + float time_elapsed=0; + cudaEvent_t start,stop; + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + checkCudaErrors(cudaEventRecord(start,0)); + + if(!graph_created) { + DEBUG_TRACE; + checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); + + fvm_ddt_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, + dataBase_.d_rho, dataBase_.d_rho_old, dataBase_.d_u, dataBase_.d_volume, + d_diag, d_source, -1.); + fvm_div_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_phi, dataBase_.d_weight, + d_lower, d_upper, d_diag, // end for internal + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_phi, d_value_internal_coeffs, d_value_boundary_coeffs, + d_internal_coeffs, d_boundary_coeffs, -1.); + //field_multiply_scalar(dataBase_.stream, + // dataBase_.num_cells, dataBase_.d_rho, d_nu_eff, d_rho_nueff, // end for internal + // dataBase_.num_boundary_surfaces, dataBase_.d_boundary_rho, d_boundary_nu_eff, d_boundary_rho_nueff); + //fvm_laplacian_vector(dataBase_.stream, dataBase_.num_surfaces, + // dataBase_.d_owner, dataBase_.d_neighbor, + // dataBase_.d_weight, dataBase_.d_mag_sf, dataBase_.d_delta_coeffs, d_rho_nueff, + // d_lower, d_upper, d_diag, // end for internal + // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + // dataBase_.d_boundary_mag_sf, 
d_boundary_rho_nueff, + // d_gradient_internal_coeffs, d_gradient_boundary_coeffs, + // d_internal_coeffs, d_boundary_coeffs); + //fvc_grad_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, + // dataBase_.d_owner, dataBase_.d_neighbor, + // dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_u, d_grad_u, + // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + // dataBase_.d_boundary_face_cell, dataBase_.d_boundary_u, dataBase_.d_boundary_sf, + // dataBase_.d_volume, dataBase_.d_boundary_mag_sf, d_boundary_grad_u, dataBase_.d_boundary_delta_coeffs); + //scale_dev2T_tensor(dataBase_.stream, dataBase_.num_cells, d_rho_nueff, d_grad_u, // end for internal + // dataBase_.num_boundary_surfaces, d_boundary_rho_nueff, d_boundary_grad_u); + //fvc_div_cell_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, + // dataBase_.d_owner, dataBase_.d_neighbor, + // dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_fvc_output, // end for internal + // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + // dataBase_.d_boundary_face_cell, d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume); + //fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, + // dataBase_.d_volume, d_fvc_output, d_source); + fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, + dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_fvc_output, + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume, -1.); + fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, + dataBase_.d_volume, d_fvc_output, d_source); + + checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); + checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0)); + graph_created = true; + } + DEBUG_TRACE; + 
checkCudaErrors(cudaGraphLaunch(graph_instance, dataBase_.stream)); + + checkCudaErrors(cudaEventRecord(stop,0)); + checkCudaErrors(cudaEventSynchronize(start)); + checkCudaErrors(cudaEventSynchronize(stop)); + checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); + fprintf(stderr, "ueqn process time:%f(ms)\n",time_elapsed); + //solve(); } @@ -139,7 +164,6 @@ void dfUEqn::solve() { //ldu_to_csr(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, // dataBase_.d_lower_to_csr_index, dataBase_.d_upper_to_csr_index, dataBase_.d_diag_to_csr_index, // d_lower, d_upper, d_diag, d_source, d_internal_coeffs, d_boundary_coeffs, d_A, d_b); - ////checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); int nNz = dataBase_.num_cells + dataBase_.num_surfaces * 2; // matrix entries if (num_iteration == 0) // first interation From 50eee68f0c5f338e5f5b8a35caa042bc0924bf76 Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Wed, 16 Aug 2023 18:22:40 +0800 Subject: [PATCH 22/25] fix bugs in turbulence term --- .../solvers/dfLowMachFoam/Make/options | 4 +- applications/solvers/dfLowMachFoam/new_UEqn.H | 16 +++-- .../solvers/dfLowMachFoam/new_dfLowMachFoam.C | 7 +- src_gpu/dfMatrixOpBase.cu | 44 +++++++++--- src_gpu/dfUEqn.H | 4 +- src_gpu/dfUEqn.cu | 69 +++++++++++-------- 6 files changed, 97 insertions(+), 47 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/Make/options b/applications/solvers/dfLowMachFoam/Make/options index e2a57bd00..e1959ada3 100644 --- a/applications/solvers/dfLowMachFoam/Make/options +++ b/applications/solvers/dfLowMachFoam/Make/options @@ -29,7 +29,8 @@ EXE_INC = -std=c++14 \ $(PYTHON_INC_DIR) \ $(if $(AMGX_DIR), -I$(DF_ROOT)/src_gpu,) \ $(if $(AMGX_DIR), -I/usr/local/cuda-11.6/include,) \ - $(if $(AMGX_DIR), -I$(AMGX_DIR)/include,) + $(if $(AMGX_DIR), -I$(AMGX_DIR)/include,) \ + -I$(DF_ROOT)/GPUTestRef/lnInclude \ EXE_LIBS = \ -lcompressibleTransportModels \ @@ -43,6 +44,7 @@ EXE_LIBS = \ -ldfCanteraMixture \ -ldfChemistryModel \ 
-ldfCombustionModels \ + -ldfGenMatrix \ $(CANTERA_ROOT)/lib/libcantera.so \ $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libtorch.so,) \ $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libc10.so,) \ diff --git a/applications/solvers/dfLowMachFoam/new_UEqn.H b/applications/solvers/dfLowMachFoam/new_UEqn.H index 1b5487139..231c38c29 100644 --- a/applications/solvers/dfLowMachFoam/new_UEqn.H +++ b/applications/solvers/dfLowMachFoam/new_UEqn.H @@ -5,10 +5,16 @@ const volScalarField& nuEff = nuEff_tmp(); // run CPU, for temp tmp tUEqn ( - -fvm::ddt(rho, U) - fvm::div(phi, U) == fvc::grad(p) - //fvm::ddt(rho, U) + fvm::div(phi, U) == -fvc::grad(p) - //turbulence->divDevRhoReff(U) + fvm::ddt(rho, U) + fvm::div(phi, U) + + turbulence->divDevRhoReff(U) + == -fvc::grad(p) ); +// tmp tUEqn_ref // test turbulence->divDevRhoReff(U) +// ( +// - fvc::div((turbulence->rho()*turbulence->nuEff())*dev2(Foam::T(fvc::grad(U)))) +// - fvm::laplacian(turbulence->rho()*turbulence->nuEff(), U) +// ); + fvVectorMatrix& UEqn = tUEqn.ref(); // run GPU @@ -81,6 +87,8 @@ for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) } bool printFlag = false; UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0], - h_internal_coeffs.data(), h_boundary_coeffs.data(), printFlag); + h_internal_coeffs.data(), h_boundary_coeffs.data(), + // &DivTensor[0][0], + printFlag); DEBUG_TRACE; #endif diff --git a/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C index a8368f5af..7d867687f 100644 --- a/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C +++ b/applications/solvers/dfLowMachFoam/new_dfLowMachFoam.C @@ -51,9 +51,12 @@ Description #include "dfMatrixDataBase.H" #include "dfMatrixOpBase.H" +#include "GenFvMatrix.H" #include "dfUEqn.H" #include "createGPUSolver.H" +#define GPUSolver_ + int main(int argc, char *argv[]) { #ifdef USE_PYTORCH @@ -98,11 +101,11 @@ int main(int argc, char *argv[]) 
createGPUBase(mesh, Y); createGPUUEqn(CanteraTorchProperties, U); - for (int timestep = 0; timestep < 10; timestep++) { + // for (int timestep = 0; timestep < 10; timestep++) { dfDataBase.preTimeStep(&rho.oldTime()[0]); #include "new_UEqn.H" dfDataBase.postTimeStep(); - } + // } } return 0; } diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index fd90ce480..f8d71cca7 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -103,6 +103,13 @@ __global__ void scale_dev2t_tensor_kernel(int num, const double *vf1, double *vf vf2[index * 9 + 6] = scale * val_xz; vf2[index * 9 + 7] = scale * val_yz; vf2[index * 9 + 8] = scale * (val_zz - trace_coeff); + + // if (index == 0) + // { + // printf("bou_grad_U = (%.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e)", vf2[0], vf2[1], vf2[2], + // vf2[3], vf2[4], vf2[5], vf2[6], vf2[7], vf2[8]); + // } + } __global__ void fvm_ddt_vector_kernel(int num_cells, double rDeltaT, @@ -488,9 +495,9 @@ __global__ void fvc_grad_vector_correctBC_zeroGradient(int num, int offset, cons double vfy = vf[cellIndex * 3 + 1]; double vfz = vf[cellIndex * 3 + 2]; - double n_x = boundary_sf[cellIndex * 3 + 0] / boundary_mag_sf[cellIndex]; - double n_y = boundary_sf[cellIndex * 3 + 1] / boundary_mag_sf[cellIndex]; - double n_z = boundary_sf[cellIndex * 3 + 2] / boundary_mag_sf[cellIndex]; + double n_x = boundary_sf[start_index * 3 + 0] / boundary_mag_sf[start_index]; + double n_y = boundary_sf[start_index * 3 + 1] / boundary_mag_sf[start_index]; + double n_z = boundary_sf[start_index * 3 + 2] / boundary_mag_sf[start_index]; double grad_correction_x = - (n_x * grad_xx + n_y * grad_yx + n_z * grad_zx); // sn_grad_x = 0 double grad_correction_y = - (n_x * grad_xy + n_y * grad_yy + n_z * grad_zy); @@ -670,9 +677,9 @@ __global__ void fvc_div_cell_tensor_internal(int num_surfaces, double ssf_zx = (w * (vf[owner * 9 + 6] - vf[neighbor * 9 + 6]) + vf[neighbor * 9 + 6]); double ssf_zy = (w * (vf[owner * 9 + 7] - vf[neighbor 
* 9 + 7]) + vf[neighbor * 9 + 7]); double ssf_zz = (w * (vf[owner * 9 + 8] - vf[neighbor * 9 + 8]) + vf[neighbor * 9 + 8]); - double div_x = Sfx * ssf_xx + Sfy * ssf_xy + Sfz * ssf_xz; - double div_y = Sfx * ssf_yx + Sfy * ssf_yy + Sfz * ssf_yz; - double div_z = Sfx * ssf_zx + Sfy * ssf_zy + Sfz * ssf_zz; + double div_x = Sfx * ssf_xx + Sfy * ssf_yx + Sfz * ssf_zx; + double div_y = Sfx * ssf_xy + Sfy * ssf_yy + Sfz * ssf_zy; + double div_z = Sfx * ssf_xz + Sfy * ssf_yz + Sfz * ssf_zz; // owner atomicAdd(&(output[owner * 3 + 0]), div_x); @@ -683,6 +690,8 @@ __global__ void fvc_div_cell_tensor_internal(int num_surfaces, atomicAdd(&(output[neighbor * 3 + 0]), -div_x); atomicAdd(&(output[neighbor * 3 + 1]), -div_y); atomicAdd(&(output[neighbor * 3 + 2]), -div_z); + + } __global__ void fvc_div_cell_tensor_boundary(int num, int offset, const int *face2Cells, @@ -709,13 +718,30 @@ __global__ void fvc_div_cell_tensor_boundary(int num, int offset, const int *fac double boussf_zz = boundary_vf[start_index * 9 + 8]; int cellIndex = face2Cells[start_index]; - double bouDiv_x = bouSfx * boussf_xx + bouSfy * boussf_xy + bouSfz * boussf_xz; - double bouDiv_y = bouSfx * boussf_yx + bouSfy * boussf_yy + bouSfz * boussf_yz; - double bouDiv_z = bouSfx * boussf_zx + bouSfy * boussf_zy + bouSfz * boussf_zz; + double bouDiv_x = bouSfx * boussf_xx + bouSfy * boussf_yx + bouSfz * boussf_zx; + double bouDiv_y = bouSfx * boussf_xy + bouSfy * boussf_yy + bouSfz * boussf_zy; + double bouDiv_z = bouSfx * boussf_xz + bouSfy * boussf_yz + bouSfz * boussf_zz; atomicAdd(&(output[cellIndex * 3 + 0]), bouDiv_x); atomicAdd(&(output[cellIndex * 3 + 1]), bouDiv_y); atomicAdd(&(output[cellIndex * 3 + 2]), bouDiv_z); + + // if (cellIndex == 0) + // { + // // printf("gpu output[0] = %.5e, %.5e, %.5e\n", output[0], output[1], output[2]); + // // printf("gpu output[0] += %.5e, %.5e, %.5e\n", bouDiv_x, bouDiv_y, bouDiv_z); + // printf("gpu bouvf[0] = (%.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e)\n", 
+ // boussf_xx, boussf_xy, boussf_xz, boussf_yx, boussf_yy, boussf_yz, boussf_zx, boussf_zy, boussf_zz); + // printf("gpu bouSf[0] = (%.5e, %.5e, %.5e)\n", bouSfx, bouSfy, bouSfz); + // printf("gpu boufinal[0] = (%.5e, %.5e, %.5e)\n", bouDiv_x, bouDiv_y, bouDiv_z); + // printf("bouIndex = %d\n\n", start_index); + // } + + // if (index == 0) + // { + // printf("bou_grad_U = (%.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e)", vf2[0], vf2[1], vf2[2], + // vf2[3], vf2[4], vf2[5], vf2[6], vf2[7], vf2[8]); + // } } void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output) diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index 4d1c25697..7b28f082c 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -107,5 +107,7 @@ public: void postProcess(double *h_u); void solve(); - void compareResult(const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs, bool printFlag); + void compareResult(const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs, + // const double *tmpVal, + bool printFlag); }; diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 470550cd0..90efcf670 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -103,39 +103,39 @@ void dfUEqn::process() { fvm_ddt_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, dataBase_.d_rho, dataBase_.d_rho_old, dataBase_.d_u, dataBase_.d_volume, - d_diag, d_source, -1.); + d_diag, d_source, 1.); fvm_div_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_phi, dataBase_.d_weight, d_lower, d_upper, d_diag, // end for internal dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_phi, d_value_internal_coeffs, d_value_boundary_coeffs, - d_internal_coeffs, d_boundary_coeffs, -1.); - 
//field_multiply_scalar(dataBase_.stream, - // dataBase_.num_cells, dataBase_.d_rho, d_nu_eff, d_rho_nueff, // end for internal - // dataBase_.num_boundary_surfaces, dataBase_.d_boundary_rho, d_boundary_nu_eff, d_boundary_rho_nueff); - //fvm_laplacian_vector(dataBase_.stream, dataBase_.num_surfaces, - // dataBase_.d_owner, dataBase_.d_neighbor, - // dataBase_.d_weight, dataBase_.d_mag_sf, dataBase_.d_delta_coeffs, d_rho_nueff, - // d_lower, d_upper, d_diag, // end for internal - // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), - // dataBase_.d_boundary_mag_sf, d_boundary_rho_nueff, - // d_gradient_internal_coeffs, d_gradient_boundary_coeffs, - // d_internal_coeffs, d_boundary_coeffs); - //fvc_grad_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, - // dataBase_.d_owner, dataBase_.d_neighbor, - // dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_u, d_grad_u, - // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), - // dataBase_.d_boundary_face_cell, dataBase_.d_boundary_u, dataBase_.d_boundary_sf, - // dataBase_.d_volume, dataBase_.d_boundary_mag_sf, d_boundary_grad_u, dataBase_.d_boundary_delta_coeffs); - //scale_dev2T_tensor(dataBase_.stream, dataBase_.num_cells, d_rho_nueff, d_grad_u, // end for internal - // dataBase_.num_boundary_surfaces, d_boundary_rho_nueff, d_boundary_grad_u); - //fvc_div_cell_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, - // dataBase_.d_owner, dataBase_.d_neighbor, - // dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_fvc_output, // end for internal - // dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), - // dataBase_.d_boundary_face_cell, d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume); - //fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, - // dataBase_.d_volume, d_fvc_output, d_source); + d_internal_coeffs, d_boundary_coeffs, 1.); + field_multiply_scalar(dataBase_.stream, + dataBase_.num_cells, 
dataBase_.d_rho, d_nu_eff, d_rho_nueff, // end for internal + dataBase_.num_boundary_surfaces, dataBase_.d_boundary_rho, d_boundary_nu_eff, d_boundary_rho_nueff); + fvm_laplacian_vector(dataBase_.stream, dataBase_.num_surfaces, + dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_weight, dataBase_.d_mag_sf, dataBase_.d_delta_coeffs, d_rho_nueff, + d_lower, d_upper, d_diag, // end for internal + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_mag_sf, d_boundary_rho_nueff, + d_gradient_internal_coeffs, d_gradient_boundary_coeffs, + d_internal_coeffs, d_boundary_coeffs, -1); + fvc_grad_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, + dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_u, d_grad_u, + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_face_cell, dataBase_.d_boundary_u, dataBase_.d_boundary_sf, + dataBase_.d_volume, dataBase_.d_boundary_mag_sf, d_boundary_grad_u, dataBase_.d_boundary_delta_coeffs); + scale_dev2T_tensor(dataBase_.stream, dataBase_.num_cells, d_rho_nueff, d_grad_u, // end for internal + dataBase_.num_boundary_surfaces, d_boundary_rho_nueff, d_boundary_grad_u); + fvc_div_cell_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, + dataBase_.d_owner, dataBase_.d_neighbor, + dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_fvc_output, // end for internal + dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), + dataBase_.d_boundary_face_cell, d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume); + fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, + dataBase_.d_volume, d_fvc_output, d_source); fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_fvc_output, @@ -217,7 +217,10 @@ double* dfUEqn::getFieldPointer(const 
char* fieldAlias, location loc, position p return pointer; } -void dfUEqn::compareResult(const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs, bool printFlag) +void dfUEqn::compareResult(const double *lower, const double *upper, const double *diag, + const double *source, const double *internal_coeffs, const double *boundary_coeffs, + // const double *tmpVal, + bool printFlag) { DEBUG_TRACE; std::vector h_lower; @@ -241,7 +244,7 @@ void dfUEqn::compareResult(const double *lower, const double *upper, const doubl std::vector h_source; h_source.resize(dataBase_.num_cells * 3); checkCudaErrors(cudaMemcpy(h_source.data(), d_source, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); - checkVectorEqual(dataBase_.num_cells, source, h_source.data(), 1e-14, printFlag); + checkVectorEqual(dataBase_.num_cells * 3, source, h_source.data(), 1e-14, printFlag); DEBUG_TRACE; std::vector h_internal_coeffs; @@ -255,5 +258,11 @@ void dfUEqn::compareResult(const double *lower, const double *upper, const doubl checkCudaErrors(cudaMemcpy(h_boundary_coeffs.data(), d_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); checkVectorEqual(dataBase_.num_boundary_surfaces * 3, boundary_coeffs, h_boundary_coeffs.data(), 1e-14, printFlag); DEBUG_TRACE; + + // std::vector h_tmpVal; + // h_tmpVal.resize(dataBase_.num_cells * 3); + // checkCudaErrors(cudaMemcpy(h_tmpVal.data(), d_fvc_output, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); + // checkVectorEqual(dataBase_.num_cells * 3, tmpVal, h_tmpVal.data(), 1e-14, printFlag); + // DEBUG_TRACE; } From f69147863645ccc2eca9b52e5bf8eae836f04e10 Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Thu, 17 Aug 2023 21:52:39 +0800 Subject: [PATCH 23/25] primiry opt & add time monitor --- applications/solvers/dfLowMachFoam/new_UEqn.H | 22 ++ src_gpu/dfMatrixOpBase.H | 20 +- src_gpu/dfMatrixOpBase.cu | 343 
++++++++++-------- src_gpu/dfUEqn.H | 2 + src_gpu/dfUEqn.cu | 54 ++- 5 files changed, 273 insertions(+), 168 deletions(-) diff --git a/applications/solvers/dfLowMachFoam/new_UEqn.H b/applications/solvers/dfLowMachFoam/new_UEqn.H index 231c38c29..9d94d27b6 100644 --- a/applications/solvers/dfLowMachFoam/new_UEqn.H +++ b/applications/solvers/dfLowMachFoam/new_UEqn.H @@ -20,6 +20,7 @@ fvVectorMatrix& UEqn = tUEqn.ref(); // run GPU // preProcess // TODO: preProcessForRhoEqn for temp, now we only transfer phi(instead of rhoEqn) used by fvm::div(phi, U) +UEqn_GPU.sync(); double *h_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::internal); double *h_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::internal); double *h_boundary_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::boundary); @@ -35,6 +36,7 @@ forAll(phi.boundaryField(), patchi) } UEqn_GPU.preProcessForRhoEqn(h_rho, h_phi, h_boundary_phi); DEBUG_TRACE; +clock_t start = std::clock(); // preparing u, p, nu_eff, and rho.boundary used by UEqn_GPU.preProcess() double *h_u = dfDataBase.getFieldPointer("u", location::cpu, position::internal); double *h_boundary_u = dfDataBase.getFieldPointer("u", location::cpu, position::boundary); @@ -43,9 +45,17 @@ double *h_boundary_p = dfDataBase.getFieldPointer("p", location::cpu, position:: double *h_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::internal); double *h_boundary_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::boundary); double *h_boundary_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::boundary); +double end = std::clock(); +Info << "get pointer" << double(end - start) / double(CLOCKS_PER_SEC) << endl; + +start = std::clock(); memcpy(h_u, &U.oldTime()[0][0], dfDataBase.cell_value_vec_bytes); memcpy(h_p, &p[0], dfDataBase.cell_value_bytes); memcpy(h_nu_eff, &nuEff[0], dfDataBase.cell_value_bytes); +end = std::clock(); +Info << "copy to pinned memory" << 
double(end - start) / double(CLOCKS_PER_SEC) << endl; + +start = std::clock(); offset = 0; forAll(U.boundaryField(), patchi) { @@ -60,12 +70,24 @@ forAll(U.boundaryField(), patchi) memcpy(h_boundary_rho + offset, &patchRho[0], patchsize * sizeof(double)); offset += patchsize; } +end = std::clock(); +Info << "CPU prepare boundary time" << double(end - start) / double(CLOCKS_PER_SEC) << endl; + +start = std::clock(); UEqn_GPU.preProcess(h_u, h_boundary_u, h_p, h_boundary_p, h_nu_eff, h_boundary_nu_eff, h_boundary_rho); DEBUG_TRACE; +UEqn_GPU.sync(); +end = std::clock(); +Info << "GPU preProcess time" << double(end - start) / double(CLOCKS_PER_SEC) << endl; // process +start = std::clock(); UEqn_GPU.process(); +end = std::clock(); DEBUG_TRACE; +UEqn_GPU.sync(); +// end = std::clock(); +Info << "GPU process time" << double(end - start) / double(CLOCKS_PER_SEC) << endl; // postProcess UEqn_GPU.postProcess(h_u); diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index ae303cedb..f64220186 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -1,5 +1,21 @@ #pragma once +#define TICK_INIT \ + float time_elapsed_kernel=0;\ + cudaEvent_t start_kernel, stop_kernel;\ + checkCudaErrors(cudaEventCreate(&start_kernel));\ + checkCudaErrors(cudaEventCreate(&stop_kernel)); + +#define TICK_START \ + checkCudaErrors(cudaEventRecord(start_kernel,0)); + +#define TICK_END(prefix) \ + checkCudaErrors(cudaEventRecord(stop_kernel,0));\ + checkCudaErrors(cudaEventSynchronize(start_kernel));\ + checkCudaErrors(cudaEventSynchronize(stop_kernel));\ + checkCudaErrors(cudaEventElapsedTime(&time_elapsed_kernel,start_kernel,stop_kernel));\ + printf("try %s 执行时间:%lf(ms)\n", #prefix, time_elapsed_kernel); + // tools void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output); void permute_vector_h2d(cudaStream_t stream, int num_cells, const double *input, double *output); @@ -49,7 +65,7 @@ void fvc_ddt_scalar(cudaStream_t stream, 
int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *vf_old, double *output, double sign = 1.); -void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, +void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *Sf, const double *vf, double *output, // end for internal int num_patches, const int *patch_size, const int *patch_type, @@ -68,7 +84,7 @@ void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign = 1.); -void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, +void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *Sf, const double *vf, double *output, // end for internal int num_patches, const int *patch_size, const int *patch_type, diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index f8d71cca7..d4f6ea7f8 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -45,9 +45,12 @@ __global__ void fvc_to_source_vector_kernel(int num_cells, const double *volume, if (index >= num_cells) return; - source[index * 3 + 0] += fvc_output[index * 3 + 0] * volume[index]; - source[index * 3 + 1] += fvc_output[index * 3 + 1] * volume[index]; - source[index * 3 + 2] += fvc_output[index * 3 + 2] * volume[index]; + // source[index * 3 + 0] += fvc_output[index * 3 + 0] * volume[index]; + // source[index * 3 + 1] += fvc_output[index * 3 + 1] * volume[index]; + // source[index * 3 + 2] += fvc_output[index * 3 + 2] * volume[index]; + source[index * 3 + 0] += fvc_output[index * 3 + 0]; + source[index * 3 + 1] += fvc_output[index * 3 + 1]; + source[index * 3 + 2] += 
fvc_output[index * 3 + 2]; } __global__ void update_boundary_coeffs_zeroGradient_vector(int num, int offset, @@ -84,25 +87,25 @@ __global__ void scale_dev2t_tensor_kernel(int num, const double *vf1, double *vf return; double scale = vf1[index]; - double val_xx = vf2[index * 9 + 0]; - double val_xy = vf2[index * 9 + 1]; - double val_xz = vf2[index * 9 + 2]; - double val_yx = vf2[index * 9 + 3]; - double val_yy = vf2[index * 9 + 4]; - double val_yz = vf2[index * 9 + 5]; - double val_zx = vf2[index * 9 + 6]; - double val_zy = vf2[index * 9 + 7]; - double val_zz = vf2[index * 9 + 8]; + double val_xx = vf2[num * 0 + index]; + double val_xy = vf2[num * 1 + index]; + double val_xz = vf2[num * 2 + index]; + double val_yx = vf2[num * 3 + index]; + double val_yy = vf2[num * 4 + index]; + double val_yz = vf2[num * 5 + index]; + double val_zx = vf2[num * 6 + index]; + double val_zy = vf2[num * 7 + index]; + double val_zz = vf2[num * 8 + index]; double trace_coeff = (2. / 3.) * (val_xx + val_yy + val_zz); - vf2[index * 9 + 0] = scale * (val_xx - trace_coeff); - vf2[index * 9 + 1] = scale * val_yx; - vf2[index * 9 + 2] = scale * val_zx; - vf2[index * 9 + 3] = scale * val_xy; - vf2[index * 9 + 4] = scale * (val_yy - trace_coeff); - vf2[index * 9 + 5] = scale * val_zy; - vf2[index * 9 + 6] = scale * val_xz; - vf2[index * 9 + 7] = scale * val_yz; - vf2[index * 9 + 8] = scale * (val_zz - trace_coeff); + vf2[num * 0 + index] = scale * (val_xx - trace_coeff); + vf2[num * 1 + index] = scale * val_yx; + vf2[num * 2 + index] = scale * val_zx; + vf2[num * 3 + index] = scale * val_xy; + vf2[num * 4 + index] = scale * (val_yy - trace_coeff); + vf2[num * 5 + index] = scale * val_zy; + vf2[num * 6 + index] = scale * val_xz; + vf2[num * 7 + index] = scale * val_yz; + vf2[num * 8 + index] = scale * (val_zz - trace_coeff); // if (index == 0) // { @@ -120,11 +123,14 @@ __global__ void fvm_ddt_vector_kernel(int num_cells, double rDeltaT, if (index >= num_cells) return; - diag[index] += rDeltaT * 
rho[index] * volume[index] * sign; + double vol = volume[index]; + double rho_old_kernel = rho_old[index]; + + diag[index] += rDeltaT * rho[index] * vol * sign; // TODO: skip moving - source[index * 3 + 0] += rDeltaT * rho_old[index] * vf[index * 3 + 0] * volume[index] * sign; - source[index * 3 + 1] += rDeltaT * rho_old[index] * vf[index * 3 + 1] * volume[index] * sign; - source[index * 3 + 2] += rDeltaT * rho_old[index] * vf[index * 3 + 2] * volume[index] * sign; + source[index * 3 + 0] += rDeltaT * rho_old_kernel * vf[index * 3 + 0] * vol * sign; + source[index * 3 + 1] += rDeltaT * rho_old_kernel * vf[index * 3 + 1] * vol * sign; + source[index * 3 + 2] += rDeltaT * rho_old_kernel * vf[index * 3 + 2] * vol * sign; } __global__ void fvm_div_vector_internal(int num_surfaces, @@ -254,7 +260,7 @@ __global__ void fvc_ddt_scalar_kernel(int num_cells, double rDeltaT, output[index] += rDeltaT * (rho[index] * vf[index] - rho_old[index] * vf_old[index]) * sign; } -__global__ void fvc_grad_vector_internal(int num_surfaces, +__global__ void fvc_grad_vector_internal(int num_cells, int num_surfaces, const int *lower_index, const int *upper_index, const double *face_vector, const double *weight, const double *field_vector, double *output) @@ -286,34 +292,35 @@ __global__ void fvc_grad_vector_internal(int num_surfaces, double grad_zz = Sfz * ssfz; // owner - atomicAdd(&(output[owner * 9 + 0]), grad_xx); - atomicAdd(&(output[owner * 9 + 1]), grad_xy); - atomicAdd(&(output[owner * 9 + 2]), grad_xz); - atomicAdd(&(output[owner * 9 + 3]), grad_yx); - atomicAdd(&(output[owner * 9 + 4]), grad_yy); - atomicAdd(&(output[owner * 9 + 5]), grad_yz); - atomicAdd(&(output[owner * 9 + 6]), grad_zx); - atomicAdd(&(output[owner * 9 + 7]), grad_zy); - atomicAdd(&(output[owner * 9 + 8]), grad_zz); + atomicAdd(&(output[num_cells * 0 + owner]), grad_xx); + atomicAdd(&(output[num_cells * 1 + owner]), grad_xy); + atomicAdd(&(output[num_cells * 2 + owner]), grad_xz); + atomicAdd(&(output[num_cells * 
3 + owner]), grad_yx); + atomicAdd(&(output[num_cells * 4 + owner]), grad_yy); + atomicAdd(&(output[num_cells * 5 + owner]), grad_yz); + atomicAdd(&(output[num_cells * 6 + owner]), grad_zx); + atomicAdd(&(output[num_cells * 7 + owner]), grad_zy); + atomicAdd(&(output[num_cells * 8 + owner]), grad_zz); // neighbour - atomicAdd(&(output[neighbor * 9 + 0]), -grad_xx); - atomicAdd(&(output[neighbor * 9 + 1]), -grad_xy); - atomicAdd(&(output[neighbor * 9 + 2]), -grad_xz); - atomicAdd(&(output[neighbor * 9 + 3]), -grad_yx); - atomicAdd(&(output[neighbor * 9 + 4]), -grad_yy); - atomicAdd(&(output[neighbor * 9 + 5]), -grad_yz); - atomicAdd(&(output[neighbor * 9 + 6]), -grad_zx); - atomicAdd(&(output[neighbor * 9 + 7]), -grad_zy); - atomicAdd(&(output[neighbor * 9 + 8]), -grad_zz); + atomicAdd(&(output[num_cells * 0 + neighbor]), -grad_xx); + atomicAdd(&(output[num_cells * 1 + neighbor]), -grad_xy); + atomicAdd(&(output[num_cells * 2 + neighbor]), -grad_xz); + atomicAdd(&(output[num_cells * 3 + neighbor]), -grad_yx); + atomicAdd(&(output[num_cells * 4 + neighbor]), -grad_yy); + atomicAdd(&(output[num_cells * 5 + neighbor]), -grad_yz); + atomicAdd(&(output[num_cells * 6 + neighbor]), -grad_zx); + atomicAdd(&(output[num_cells * 7 + neighbor]), -grad_zy); + atomicAdd(&(output[num_cells * 8 + neighbor]), -grad_zz); } // update boundary of interpolation field // calculate the grad field // TODO: this function is implemented for uncoupled boundary conditions // so it should use the more specific func name -__global__ void fvc_grad_vector_boundary(int num, int offset, const int *face2Cells, - const double *boundary_face_vector, const double *boundary_field_vector, double *output) +__global__ void fvc_grad_vector_boundary(int num_cells, int num, + int offset, const int *face2Cells, const double *boundary_face_vector, + const double *boundary_field_vector, double *output) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num) @@ -341,20 +348,20 @@ __global__ void 
fvc_grad_vector_boundary(int num, int offset, const int *face2Ce double grad_zy = bouSfz * boussfy; double grad_zz = bouSfz * boussfz; - atomicAdd(&(output[cellIndex * 9 + 0]), grad_xx); - atomicAdd(&(output[cellIndex * 9 + 1]), grad_xy); - atomicAdd(&(output[cellIndex * 9 + 2]), grad_xz); - atomicAdd(&(output[cellIndex * 9 + 3]), grad_yx); - atomicAdd(&(output[cellIndex * 9 + 4]), grad_yy); - atomicAdd(&(output[cellIndex * 9 + 5]), grad_yz); - atomicAdd(&(output[cellIndex * 9 + 6]), grad_zx); - atomicAdd(&(output[cellIndex * 9 + 7]), grad_zy); - atomicAdd(&(output[cellIndex * 9 + 8]), grad_zz); + atomicAdd(&(output[num_cells * 0 + cellIndex]), grad_xx); + atomicAdd(&(output[num_cells * 1 + cellIndex]), grad_xy); + atomicAdd(&(output[num_cells * 2 + cellIndex]), grad_xz); + atomicAdd(&(output[num_cells * 3 + cellIndex]), grad_yx); + atomicAdd(&(output[num_cells * 4 + cellIndex]), grad_yy); + atomicAdd(&(output[num_cells * 5 + cellIndex]), grad_yz); + atomicAdd(&(output[num_cells * 6 + cellIndex]), grad_zx); + atomicAdd(&(output[num_cells * 7 + cellIndex]), grad_zy); + atomicAdd(&(output[num_cells * 8 + cellIndex]), grad_zz); } __global__ void fvc_grad_scalar_internal(int num_cells, int num_surfaces, const int *lower_index, const int *upper_index, const double *face_vector, - const double *weight, const double *vf, double *output) + const double *weight, const double *vf, double *output, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_surfaces) @@ -370,9 +377,9 @@ __global__ void fvc_grad_scalar_internal(int num_cells, int num_surfaces, double ssf = (w * (vf[owner] - vf[neighbor]) + vf[neighbor]); - double grad_x = Sfx * ssf; - double grad_y = Sfy * ssf; - double grad_z = Sfz * ssf; + double grad_x = Sfx * ssf * sign; + double grad_y = Sfy * ssf * sign; + double grad_z = Sfz * ssf * sign; // // owner // atomicAdd(&(output[num_cells * 0 + owner]), grad_x); @@ -397,7 +404,7 @@ __global__ void fvc_grad_scalar_internal(int num_cells, 
int num_surfaces, } __global__ void fvc_grad_scalar_boundary(int num_cells, int num, int offset, const int *face2Cells, - const double *boundary_face_vector, const double *boundary_vf, double *output) + const double *boundary_face_vector, const double *boundary_vf, double *output, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num) @@ -416,9 +423,9 @@ __global__ void fvc_grad_scalar_boundary(int num_cells, int num, int offset, con double grad_y = bouSfy * bouvf; double grad_z = bouSfz * bouvf; - atomicAdd(&(output[cellIndex * 3 + 0]), grad_x); - atomicAdd(&(output[cellIndex * 3 + 1]), grad_y); - atomicAdd(&(output[cellIndex * 3 + 2]), grad_z); + atomicAdd(&(output[cellIndex * 3 + 0]), grad_x * sign); + atomicAdd(&(output[cellIndex * 3 + 1]), grad_y * sign); + atomicAdd(&(output[cellIndex * 3 + 2]), grad_z * sign); // if (cellIndex == 5) // { @@ -434,15 +441,15 @@ __global__ void divide_cell_volume_tsr(int num_cells, const double* volume, doub return; double vol = volume[index]; - output[index * 9 + 0] = output[index * 9 + 0] / vol * sign; - output[index * 9 + 1] = output[index * 9 + 1] / vol * sign; - output[index * 9 + 2] = output[index * 9 + 2] / vol * sign; - output[index * 9 + 3] = output[index * 9 + 3] / vol * sign; - output[index * 9 + 4] = output[index * 9 + 4] / vol * sign; - output[index * 9 + 5] = output[index * 9 + 5] / vol * sign; - output[index * 9 + 6] = output[index * 9 + 6] / vol * sign; - output[index * 9 + 7] = output[index * 9 + 7] / vol * sign; - output[index * 9 + 8] = output[index * 9 + 8] / vol * sign; + output[num_cells * 0 + index] = output[num_cells * 0 + index] / vol * sign; + output[num_cells * 1 + index] = output[num_cells * 1 + index] / vol * sign; + output[num_cells * 2 + index] = output[num_cells * 2 + index] / vol * sign; + output[num_cells * 3 + index] = output[num_cells * 3 + index] / vol * sign; + output[num_cells * 4 + index] = output[num_cells * 4 + index] / vol * sign; + output[num_cells * 5 + 
index] = output[num_cells * 5 + index] / vol * sign; + output[num_cells * 6 + index] = output[num_cells * 6 + index] / vol * sign; + output[num_cells * 7 + index] = output[num_cells * 7 + index] / vol * sign; + output[num_cells * 8 + index] = output[num_cells * 8 + index] / vol * sign; } __global__ void divide_cell_volume_vec(int num_cells, const double* volume, double *output, double sign) @@ -469,7 +476,8 @@ __global__ void divide_cell_volume_scalar(int num_cells, const double* volume, d output[index] = output[index] / vol * sign; } -__global__ void fvc_grad_vector_correctBC_zeroGradient(int num, int offset, const int *face2Cells, +__global__ void fvc_grad_vector_correctBC_zeroGradient(int num_cells, int num_boundary_surfaces, + int num, int offset, const int *face2Cells, const double *internal_grad, const double *vf, const double *boundary_sf, const double *boundary_mag_sf, double *boundary_grad, double sign) { @@ -481,15 +489,15 @@ __global__ void fvc_grad_vector_correctBC_zeroGradient(int num, int offset, cons int cellIndex = face2Cells[start_index]; - double grad_xx = internal_grad[cellIndex * 9 + 0]; - double grad_xy = internal_grad[cellIndex * 9 + 1]; - double grad_xz = internal_grad[cellIndex * 9 + 2]; - double grad_yx = internal_grad[cellIndex * 9 + 3]; - double grad_yy = internal_grad[cellIndex * 9 + 4]; - double grad_yz = internal_grad[cellIndex * 9 + 5]; - double grad_zx = internal_grad[cellIndex * 9 + 6]; - double grad_zy = internal_grad[cellIndex * 9 + 7]; - double grad_zz = internal_grad[cellIndex * 9 + 8]; + double grad_xx = internal_grad[num_cells * 0 + cellIndex]; + double grad_xy = internal_grad[num_cells * 1 + cellIndex]; + double grad_xz = internal_grad[num_cells * 2 + cellIndex]; + double grad_yx = internal_grad[num_cells * 3 + cellIndex]; + double grad_yy = internal_grad[num_cells * 4 + cellIndex]; + double grad_yz = internal_grad[num_cells * 5 + cellIndex]; + double grad_zx = internal_grad[num_cells * 6 + cellIndex]; + double grad_zy = 
internal_grad[num_cells * 7 + cellIndex]; + double grad_zz = internal_grad[num_cells * 8 + cellIndex]; double vfx = vf[cellIndex * 3 + 0]; double vfy = vf[cellIndex * 3 + 1]; @@ -503,15 +511,15 @@ __global__ void fvc_grad_vector_correctBC_zeroGradient(int num, int offset, cons double grad_correction_y = - (n_x * grad_xy + n_y * grad_yy + n_z * grad_zy); double grad_correction_z = - (n_x * grad_xz + n_y * grad_yz + n_z * grad_zz); - boundary_grad[start_index * 9 + 0] = (grad_xx + n_x * grad_correction_x) * sign; - boundary_grad[start_index * 9 + 1] = (grad_xy + n_x * grad_correction_y) * sign; - boundary_grad[start_index * 9 + 2] = (grad_xz + n_x * grad_correction_z) * sign; - boundary_grad[start_index * 9 + 3] = (grad_yx + n_y * grad_correction_x) * sign; - boundary_grad[start_index * 9 + 4] = (grad_yy + n_y * grad_correction_y) * sign; - boundary_grad[start_index * 9 + 5] = (grad_yz + n_y * grad_correction_z) * sign; - boundary_grad[start_index * 9 + 6] = (grad_zx + n_z * grad_correction_x) * sign; - boundary_grad[start_index * 9 + 7] = (grad_zy + n_z * grad_correction_y) * sign; - boundary_grad[start_index * 9 + 8] = (grad_zz + n_z * grad_correction_z) * sign; + boundary_grad[num_boundary_surfaces * 0 + start_index] = (grad_xx + n_x * grad_correction_x) * sign; + boundary_grad[num_boundary_surfaces * 1 + start_index] = (grad_xy + n_x * grad_correction_y) * sign; + boundary_grad[num_boundary_surfaces * 2 + start_index] = (grad_xz + n_x * grad_correction_z) * sign; + boundary_grad[num_boundary_surfaces * 3 + start_index] = (grad_yx + n_y * grad_correction_x) * sign; + boundary_grad[num_boundary_surfaces * 4 + start_index] = (grad_yy + n_y * grad_correction_y) * sign; + boundary_grad[num_boundary_surfaces * 5 + start_index] = (grad_yz + n_y * grad_correction_z) * sign; + boundary_grad[num_boundary_surfaces * 6 + start_index] = (grad_zx + n_z * grad_correction_x) * sign; + boundary_grad[num_boundary_surfaces * 7 + start_index] = (grad_zy + n_z * grad_correction_y) * 
sign; + boundary_grad[num_boundary_surfaces * 8 + start_index] = (grad_zz + n_z * grad_correction_z) * sign; } __global__ void fvc_grad_vector_correctBC_fixedValue(int num, int offset, const int *face2Cells, @@ -652,10 +660,10 @@ __global__ void fvc_div_cell_vector_boundary(int num, int offset, const int *fac } -__global__ void fvc_div_cell_tensor_internal(int num_surfaces, +__global__ void fvc_div_cell_tensor_internal(int num_cells, int num_surfaces, const int *lower_index, const int *upper_index, const double *vf, const double *weight, const double *face_vector, - double *output) + double *output, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_surfaces) @@ -668,19 +676,19 @@ __global__ void fvc_div_cell_tensor_internal(int num_surfaces, int owner = lower_index[index]; int neighbor = upper_index[index]; - double ssf_xx = (w * (vf[owner * 9 + 0] - vf[neighbor * 9 + 0]) + vf[neighbor * 9 + 0]); - double ssf_xy = (w * (vf[owner * 9 + 1] - vf[neighbor * 9 + 1]) + vf[neighbor * 9 + 1]); - double ssf_xz = (w * (vf[owner * 9 + 2] - vf[neighbor * 9 + 2]) + vf[neighbor * 9 + 2]); - double ssf_yx = (w * (vf[owner * 9 + 3] - vf[neighbor * 9 + 3]) + vf[neighbor * 9 + 3]); - double ssf_yy = (w * (vf[owner * 9 + 4] - vf[neighbor * 9 + 4]) + vf[neighbor * 9 + 4]); - double ssf_yz = (w * (vf[owner * 9 + 5] - vf[neighbor * 9 + 5]) + vf[neighbor * 9 + 5]); - double ssf_zx = (w * (vf[owner * 9 + 6] - vf[neighbor * 9 + 6]) + vf[neighbor * 9 + 6]); - double ssf_zy = (w * (vf[owner * 9 + 7] - vf[neighbor * 9 + 7]) + vf[neighbor * 9 + 7]); - double ssf_zz = (w * (vf[owner * 9 + 8] - vf[neighbor * 9 + 8]) + vf[neighbor * 9 + 8]); - double div_x = Sfx * ssf_xx + Sfy * ssf_yx + Sfz * ssf_zx; - double div_y = Sfx * ssf_xy + Sfy * ssf_yy + Sfz * ssf_zy; - double div_z = Sfx * ssf_xz + Sfy * ssf_yz + Sfz * ssf_zz; - + double ssf_xx = (w * (vf[num_cells * 0 + owner] - vf[num_cells * 0 + neighbor]) + vf[num_cells * 0 + neighbor]); + double ssf_xy = (w * 
(vf[num_cells * 1 + owner] - vf[num_cells * 1 + neighbor]) + vf[num_cells * 1 + neighbor]); + double ssf_xz = (w * (vf[num_cells * 2 + owner] - vf[num_cells * 2 + neighbor]) + vf[num_cells * 2 + neighbor]); + double ssf_yx = (w * (vf[num_cells * 3 + owner] - vf[num_cells * 3 + neighbor]) + vf[num_cells * 3 + neighbor]); + double ssf_yy = (w * (vf[num_cells * 4 + owner] - vf[num_cells * 4 + neighbor]) + vf[num_cells * 4 + neighbor]); + double ssf_yz = (w * (vf[num_cells * 5 + owner] - vf[num_cells * 5 + neighbor]) + vf[num_cells * 5 + neighbor]); + double ssf_zx = (w * (vf[num_cells * 6 + owner] - vf[num_cells * 6 + neighbor]) + vf[num_cells * 6 + neighbor]); + double ssf_zy = (w * (vf[num_cells * 7 + owner] - vf[num_cells * 7 + neighbor]) + vf[num_cells * 7 + neighbor]); + double ssf_zz = (w * (vf[num_cells * 8 + owner] - vf[num_cells * 8 + neighbor]) + vf[num_cells * 8 + neighbor]); + double div_x = (Sfx * ssf_xx + Sfy * ssf_yx + Sfz * ssf_zx) * sign; + double div_y = (Sfx * ssf_xy + Sfy * ssf_yy + Sfz * ssf_zy) * sign; + double div_z = (Sfx * ssf_xz + Sfy * ssf_yz + Sfz * ssf_zz) * sign; + // owner atomicAdd(&(output[owner * 3 + 0]), div_x); atomicAdd(&(output[owner * 3 + 1]), div_y); @@ -694,8 +702,8 @@ __global__ void fvc_div_cell_tensor_internal(int num_surfaces, } -__global__ void fvc_div_cell_tensor_boundary(int num, int offset, const int *face2Cells, - const double *boundary_face_vector, const double *boundary_vf, double *output) +__global__ void fvc_div_cell_tensor_boundary(int num_boundary_faces, int num, int offset, const int *face2Cells, + const double *boundary_face_vector, const double *boundary_vf, double *output, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num) @@ -707,20 +715,20 @@ __global__ void fvc_div_cell_tensor_boundary(int num, int offset, const int *fac double bouSfy = boundary_face_vector[start_index * 3 + 1]; double bouSfz = boundary_face_vector[start_index * 3 + 2]; - double boussf_xx = 
boundary_vf[start_index * 9 + 0]; - double boussf_xy = boundary_vf[start_index * 9 + 1]; - double boussf_xz = boundary_vf[start_index * 9 + 2]; - double boussf_yx = boundary_vf[start_index * 9 + 3]; - double boussf_yy = boundary_vf[start_index * 9 + 4]; - double boussf_yz = boundary_vf[start_index * 9 + 5]; - double boussf_zx = boundary_vf[start_index * 9 + 6]; - double boussf_zy = boundary_vf[start_index * 9 + 7]; - double boussf_zz = boundary_vf[start_index * 9 + 8]; + double boussf_xx = boundary_vf[num_boundary_faces * 0 + start_index]; + double boussf_xy = boundary_vf[num_boundary_faces * 1 + start_index]; + double boussf_xz = boundary_vf[num_boundary_faces * 2 + start_index]; + double boussf_yx = boundary_vf[num_boundary_faces * 3 + start_index]; + double boussf_yy = boundary_vf[num_boundary_faces * 4 + start_index]; + double boussf_yz = boundary_vf[num_boundary_faces * 5 + start_index]; + double boussf_zx = boundary_vf[num_boundary_faces * 6 + start_index]; + double boussf_zy = boundary_vf[num_boundary_faces * 7 + start_index]; + double boussf_zz = boundary_vf[num_boundary_faces * 8 + start_index]; int cellIndex = face2Cells[start_index]; - double bouDiv_x = bouSfx * boussf_xx + bouSfy * boussf_yx + bouSfz * boussf_zx; - double bouDiv_y = bouSfx * boussf_xy + bouSfy * boussf_yy + bouSfz * boussf_zy; - double bouDiv_z = bouSfx * boussf_xz + bouSfy * boussf_yz + bouSfz * boussf_zz; + double bouDiv_x = (bouSfx * boussf_xx + bouSfy * boussf_yx + bouSfz * boussf_zx) * sign; + double bouDiv_y = (bouSfx * boussf_xy + bouSfy * boussf_yy + bouSfz * boussf_zy) * sign; + double bouDiv_z = (bouSfx * boussf_xz + bouSfy * boussf_yz + bouSfz * boussf_zz) * sign; atomicAdd(&(output[cellIndex * 3 + 0]), bouDiv_x); atomicAdd(&(output[cellIndex * 3 + 1]), bouDiv_y); @@ -762,10 +770,13 @@ void field_multiply_scalar(cudaStream_t stream, int num_cells, const double *input1, const double *input2, double *output, int num_boundary_surfaces, const double *boundary_input1, const double 
*boundary_input2, double *boundary_output) { + TICK_INIT; size_t threads_per_block = 256; size_t blocks_per_grid = (std::max(num_cells, num_boundary_surfaces) + threads_per_block - 1) / threads_per_block; + TICK_START; field_multiply_scalar_kernel<<>>(num_cells, num_boundary_surfaces, input1, input2, output, boundary_input1, boundary_input2, boundary_output); + TICK_END(field_multiply_scalar_kernel); } void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volume, const double *fvc_output, double *source) @@ -814,10 +825,14 @@ void fvm_ddt_vector(cudaStream_t stream, int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *volume, double *diag, double *source, double sign) { + printf("#############kernel profile#############\n"); + TICK_INIT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + TICK_START; fvm_ddt_vector_kernel<<>>(num_cells, rDeltaT, rho, rho_old, vf, volume, diag, source, sign); + TICK_END(fvm_ddt_vector_kernel); } void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, @@ -827,24 +842,29 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) { + TICK_INIT; size_t threads_per_block = 1024; size_t blocks_per_grid = 1; blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + TICK_START; fvm_div_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, phi, weight, lower, upper, diag, sign); + TICK_END(fvm_div_vector_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { - threads_per_block = 256; + threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now if 
(patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { // TODO: just vector version now + TICK_START; fvm_div_vector_boundary<<>>(patch_size[i], offset, boundary_phi, value_internal_coeffs, value_boundary_coeffs, internal_coeffs, boundary_coeffs, sign); + TICK_END(fvm_div_vector_boundary); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); @@ -862,24 +882,28 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) { + TICK_INIT; size_t threads_per_block = 1024; size_t blocks_per_grid = 1; blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + TICK_START; fvm_laplacian_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, weight, mag_sf, delta_coeffs, gamma, lower, upper, diag, sign); - + TICK_END(fvm_laplacian_vector_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { - threads_per_block = 256; + threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { // TODO: just vector version now + TICK_START; fvm_laplacian_vector_boundary<<>>(patch_size[i], offset, boundary_mag_sf, boundary_gamma, gradient_internal_coeffs, gradient_boundary_coeffs, internal_coeffs, boundary_coeffs, sign); + TICK_END(fvm_laplacian_vector_boundary); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); @@ -900,7 +924,7 @@ void fvc_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT, rDeltaT, rho, rho_old, vf, vf_old, output, sign); } -void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, +void fvc_grad_vector(cudaStream_t 
stream, int num_cells, int num_surfaces, int num_boundary_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *Sf, const double *vf, double *output, // end for internal int num_patches, const int *patch_size, const int *patch_type, @@ -909,22 +933,28 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, const double *boundary_deltaCoeffs, double sign) { checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 9 * sizeof(double), stream)); + TICK_INIT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - fvc_grad_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, + TICK_START; + fvc_grad_vector_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output); + TICK_END(fvc_grad_vector_internal); int offset = 0; // finish conctruct grad field except dividing cell volume for (int i = 0; i < num_patches; i++) { - threads_per_block = 256; + threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { // TODO: just vector version now - fvc_grad_vector_boundary<<>>(patch_size[i], offset, boundary_cell_face, + TICK_START; + fvc_grad_vector_boundary<<>>(num_cells, + patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output); + TICK_END(fvc_grad_vector_boundary); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); @@ -935,19 +965,25 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, // divide cell volume threads_per_block = 512; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + TICK_START; divide_cell_volume_tsr<<>>(num_cells, volume, output, sign); + TICK_END(divide_cell_volume_tsr); // correct boundary conditions 
offset = 0; for (int i = 0; i < num_patches; i++) { - threads_per_block = 256; + threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now if (patch_type[i] == boundaryConditions::zeroGradient) { // TODO: just vector version now - fvc_grad_vector_correctBC_zeroGradient<<>>(patch_size[i], offset, boundary_cell_face, + TICK_START; + fvc_grad_vector_correctBC_zeroGradient<<>>(num_cells, num_boundary_surfaces, + patch_size[i], offset, boundary_cell_face, output, boundary_vf, boundary_Sf, boundary_mag_Sf, boundary_output, sign); + TICK_END(fvc_grad_vector_correctBC_zeroGradient); } else if (patch_type[i] == boundaryConditions::fixedValue) { + // TODO: implement fixedValue version fvc_grad_vector_correctBC_fixedValue<<>>(patch_size[i], offset, boundary_cell_face, output, boundary_vf, boundary_Sf, boundary_mag_Sf, boundary_output, boundary_deltaCoeffs, boundary_vf, sign); } else if (0) { @@ -961,9 +997,12 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, void scale_dev2T_tensor(cudaStream_t stream, int num_cells, const double *vf1, double *vf2, int num_boundary_surfaces, const double *boundary_vf1, double *boundary_vf2) { + TICK_INIT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + TICK_START; scale_dev2t_tensor_kernel<<>>(num_cells, vf1, vf2); + TICK_END(scale_dev2t_tensor_kernel); blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; scale_dev2t_tensor_kernel<<>>(num_boundary_surfaces, boundary_vf1, boundary_vf2); @@ -1026,29 +1065,33 @@ void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, divide_cell_volume_scalar<<>>(num_cells, volume, output, sign); } -void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, +void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, 
const int *lowerAddr, const int *upperAddr, const double *weight, const double *Sf, const double *vf, double *output, // end for internal int num_patches, const int *patch_size, const int *patch_type, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign) { - checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 3 * sizeof(double), stream)); - + // checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 3 * sizeof(double), stream)); + TICK_INIT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - fvc_div_cell_tensor_internal<<>>(num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output); + TICK_START; + fvc_div_cell_tensor_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output, sign); + TICK_END(fvc_div_cell_tensor_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { - threads_per_block = 256; + threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just basic patch type now if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { // TODO: just vector version now - fvc_div_cell_tensor_boundary<<>>(patch_size[i], offset, boundary_cell_face, - boundary_Sf, boundary_vf, output); + TICK_START; + fvc_div_cell_tensor_boundary<<>>(num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, + boundary_Sf, boundary_vf, output, sign); + TICK_END(fvc_div_cell_tensor_boundary); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); @@ -1056,10 +1099,10 @@ void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, offset += patch_size[i]; } - // divide cell volume - threads_per_block = 1024; - blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - divide_cell_volume_vec<<>>(num_cells, volume, 
output, sign); + // // divide cell volume + // threads_per_block = 1024; + // blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + // divide_cell_volume_vec<<>>(num_cells, volume, output, sign); } void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, @@ -1068,21 +1111,25 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_patches, const int *patch_size, const int *patch_type, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign) { - checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 3 * sizeof(double), stream)); + TICK_INIT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + TICK_START; fvc_grad_scalar_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, - Sf, weight, vf, output); + Sf, weight, vf, output, sign); + TICK_END(fvc_grad_scalar_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { - threads_per_block = 256; + threads_per_block = 64; blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block; // TODO: just non-coupled patch type now if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { + TICK_START; fvc_grad_scalar_boundary<<>>(num_cells, patch_size[i], offset, boundary_cell_face, - boundary_Sf, boundary_vf, output); + boundary_Sf, boundary_vf, output, sign); + TICK_END(fvc_grad_scalar_internal); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); @@ -1090,8 +1137,8 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, offset += patch_size[i]; } - // divide cell volume - threads_per_block = 1024; - blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - divide_cell_volume_vec<<>>(num_cells, volume, output, sign); + // // divide cell 
volume + // threads_per_block = 1024; + // blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + // divide_cell_volume_vec<<>>(num_cells, volume, output, sign); } diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index 7b28f082c..80cdc7144 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -110,4 +110,6 @@ public: void compareResult(const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs, // const double *tmpVal, bool printFlag); + + void sync(); }; diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 90efcf670..73b7516c5 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -97,9 +97,9 @@ void dfUEqn::process() { checkCudaErrors(cudaEventCreate(&stop)); checkCudaErrors(cudaEventRecord(start,0)); - if(!graph_created) { - DEBUG_TRACE; - checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); + // if(!graph_created) { + // DEBUG_TRACE; + // checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); fvm_ddt_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, dataBase_.d_rho, dataBase_.d_rho_old, dataBase_.d_u, dataBase_.d_volume, @@ -110,6 +110,7 @@ void dfUEqn::process() { dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_phi, d_value_internal_coeffs, d_value_boundary_coeffs, d_internal_coeffs, d_boundary_coeffs, 1.); + //TODO: merge bellow six kernels field_multiply_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.d_rho, d_nu_eff, d_rho_nueff, // end for internal dataBase_.num_boundary_surfaces, dataBase_.d_boundary_rho, d_boundary_nu_eff, d_boundary_rho_nueff); @@ -121,7 +122,7 @@ void dfUEqn::process() { dataBase_.d_boundary_mag_sf, d_boundary_rho_nueff, d_gradient_internal_coeffs, d_gradient_boundary_coeffs, d_internal_coeffs, d_boundary_coeffs, -1); - fvc_grad_vector(dataBase_.stream, dataBase_.num_cells, 
dataBase_.num_surfaces, + fvc_grad_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_u, d_grad_u, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), @@ -129,37 +130,43 @@ void dfUEqn::process() { dataBase_.d_volume, dataBase_.d_boundary_mag_sf, d_boundary_grad_u, dataBase_.d_boundary_delta_coeffs); scale_dev2T_tensor(dataBase_.stream, dataBase_.num_cells, d_rho_nueff, d_grad_u, // end for internal dataBase_.num_boundary_surfaces, d_boundary_rho_nueff, d_boundary_grad_u); - fvc_div_cell_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, + fvc_div_cell_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, - dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_fvc_output, // end for internal + dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_source, // end for internal dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_face_cell, d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume); - fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, - dataBase_.d_volume, d_fvc_output, d_source); + // fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, + // dataBase_.d_volume, d_fvc_output, d_source); + // TODO: merge bellow two kernel fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, - dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_fvc_output, + dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_source, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume, -1.); - fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, - dataBase_.d_volume, d_fvc_output, 
d_source); + // fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, + // dataBase_.d_volume, d_fvc_output, d_source); - checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); - checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0)); - graph_created = true; - } - DEBUG_TRACE; - checkCudaErrors(cudaGraphLaunch(graph_instance, dataBase_.stream)); + // checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); + // checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0)); + // graph_created = true; + // } + // DEBUG_TRACE; + // checkCudaErrors(cudaGraphLaunch(graph_instance, dataBase_.stream)); checkCudaErrors(cudaEventRecord(stop,0)); checkCudaErrors(cudaEventSynchronize(start)); checkCudaErrors(cudaEventSynchronize(stop)); checkCudaErrors(cudaEventElapsedTime(&time_elapsed,start,stop)); - fprintf(stderr, "ueqn process time:%f(ms)\n",time_elapsed); + fprintf(stderr, "ueqn process time:%f(ms)\n",time_elapsed); //solve(); } +void dfUEqn::sync() +{ + checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); +} + void dfUEqn::solve() { //ldu_to_csr(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, // dataBase_.d_lower_to_csr_index, dataBase_.d_upper_to_csr_index, dataBase_.d_diag_to_csr_index, @@ -226,24 +233,35 @@ void dfUEqn::compareResult(const double *lower, const double *upper, const doubl std::vector h_lower; h_lower.resize(dataBase_.num_surfaces); checkCudaErrors(cudaMemcpy(h_lower.data(), d_lower, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost)); + fprintf(stderr, "check h_lower"); checkVectorEqual(dataBase_.num_surfaces, lower, h_lower.data(), 1e-14, printFlag); DEBUG_TRACE; std::vector h_upper; h_upper.resize(dataBase_.num_surfaces); checkCudaErrors(cudaMemcpy(h_upper.data(), d_upper, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost)); + fprintf(stderr, "check h_upper"); checkVectorEqual(dataBase_.num_surfaces, upper, h_upper.data(), 1e-14, printFlag); DEBUG_TRACE; 
std::vector h_diag; h_diag.resize(dataBase_.num_cells); checkCudaErrors(cudaMemcpy(h_diag.data(), d_diag, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost)); + fprintf(stderr, "check h_diag"); checkVectorEqual(dataBase_.num_cells, diag, h_diag.data(), 1e-14, printFlag); DEBUG_TRACE; std::vector h_source; + // , h_source_ref; h_source.resize(dataBase_.num_cells * 3); + // h_source_ref.resize(dataBase_.num_cells * 3); + // for (int i = 0; i < dataBase_.num_cells; i++) { + // h_source_ref[0 * dataBase_.num_cells + i] = source[i * 3 + 0]; + // h_source_ref[1 * dataBase_.num_cells + i] = source[i * 3 + 1]; + // h_source_ref[2 * dataBase_.num_cells + i] = source[i * 3 + 2]; + // } checkCudaErrors(cudaMemcpy(h_source.data(), d_source, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); + fprintf(stderr, "check h_source"); checkVectorEqual(dataBase_.num_cells * 3, source, h_source.data(), 1e-14, printFlag); DEBUG_TRACE; From 7a7accf67d6d7e264697c91e201cc01fa7610f90 Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Mon, 21 Aug 2023 21:12:17 +0800 Subject: [PATCH 24/25] add solve part, fix some bugs --- applications/solvers/dfLowMachFoam/new_UEqn.H | 2 +- .../solvers/dfLowMachFoam_new/CMakeLists.txt | 126 +++++ applications/solvers/dfLowMachFoam_new/EEqn.H | 141 ++++++ .../solvers/dfLowMachFoam_new/Make/files | 3 + .../solvers/dfLowMachFoam_new/Make/options | 58 +++ applications/solvers/dfLowMachFoam_new/UEqn.H | 247 ++++++++++ applications/solvers/dfLowMachFoam_new/YEqn.H | 207 ++++++++ .../solvers/dfLowMachFoam_new/YEqn_RR.H | 61 +++ .../solvers/dfLowMachFoam_new/correctPhi.H | 12 + .../solvers/dfLowMachFoam_new/createFields.H | 176 +++++++ .../dfLowMachFoam_new/createGPUSolver.H | 97 ++++ .../dfLowMachFoam_new/createdfSolver.H | 65 +++ .../solvers/dfLowMachFoam_new/dfLowMachFoam.C | 447 ++++++++++++++++++ applications/solvers/dfLowMachFoam_new/pEqn.H | 203 ++++++++ .../solvers/dfLowMachFoam_new/pcEqn.H | 130 +++++ .../solvers/dfLowMachFoam_new/rhoEqn.H | 86 ++++ 
.../solvers/dfLowMachFoam_new/setRDeltaT.H | 85 ++++ .../solvers/dfLowMachFoam_new/setRootCase2.H | 5 + src_gpu/dfMatrixDataBase.cu | 2 +- src_gpu/dfMatrixOpBase.H | 30 +- src_gpu/dfMatrixOpBase.cu | 406 ++++++++++------ src_gpu/dfUEqn.H | 5 +- src_gpu/dfUEqn.cu | 98 ++-- 23 files changed, 2495 insertions(+), 197 deletions(-) create mode 100644 applications/solvers/dfLowMachFoam_new/CMakeLists.txt create mode 100644 applications/solvers/dfLowMachFoam_new/EEqn.H create mode 100644 applications/solvers/dfLowMachFoam_new/Make/files create mode 100644 applications/solvers/dfLowMachFoam_new/Make/options create mode 100644 applications/solvers/dfLowMachFoam_new/UEqn.H create mode 100644 applications/solvers/dfLowMachFoam_new/YEqn.H create mode 100644 applications/solvers/dfLowMachFoam_new/YEqn_RR.H create mode 100644 applications/solvers/dfLowMachFoam_new/correctPhi.H create mode 100644 applications/solvers/dfLowMachFoam_new/createFields.H create mode 100644 applications/solvers/dfLowMachFoam_new/createGPUSolver.H create mode 100644 applications/solvers/dfLowMachFoam_new/createdfSolver.H create mode 100644 applications/solvers/dfLowMachFoam_new/dfLowMachFoam.C create mode 100644 applications/solvers/dfLowMachFoam_new/pEqn.H create mode 100644 applications/solvers/dfLowMachFoam_new/pcEqn.H create mode 100644 applications/solvers/dfLowMachFoam_new/rhoEqn.H create mode 100644 applications/solvers/dfLowMachFoam_new/setRDeltaT.H create mode 100644 applications/solvers/dfLowMachFoam_new/setRootCase2.H diff --git a/applications/solvers/dfLowMachFoam/new_UEqn.H b/applications/solvers/dfLowMachFoam/new_UEqn.H index 9d94d27b6..41b804a4b 100644 --- a/applications/solvers/dfLowMachFoam/new_UEqn.H +++ b/applications/solvers/dfLowMachFoam/new_UEqn.H @@ -107,7 +107,7 @@ for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double)); offset += patchsize; } -bool printFlag = false; +bool 
printFlag = true; UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0], h_internal_coeffs.data(), h_boundary_coeffs.data(), // &DivTensor[0][0], diff --git a/applications/solvers/dfLowMachFoam_new/CMakeLists.txt b/applications/solvers/dfLowMachFoam_new/CMakeLists.txt new file mode 100644 index 000000000..645289a64 --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/CMakeLists.txt @@ -0,0 +1,126 @@ +cmake_minimum_required(VERSION 3.5) +project(dfLowMachFoam LANGUAGES CXX) +FIND_PACKAGE(MPI REQUIRED) +FIND_PACKAGE(OpenMP REQUIRED) +FIND_PACKAGE(CUDA REQUIRED) + +# Check valid thirdParty +if(DEFINED ENV{WM_PROJECT_DIR}) + MESSAGE(STATUS "OpenFOAM: " $ENV{WM_PROJECT_DIR}) +else() + message(FATAL_ERROR "OpenFOAM is not sourced") +endif(DEFINED ENV{WM_PROJECT_DIR}) + +if(DEFINED ENV{CANTERA_ROOT}) + MESSAGE(STATUS "libcantera: " $ENV{CANTERA_ROOT}) + SET(CANTERA_ROOT $ENV{CANTERA_ROOT}) +else() + message(FATAL_ERROR "libcantera directory is not specified") +endif(DEFINED ENV{CANTERA_ROOT}) + +# define variables +SET(OpenFOAM_LIB_DIR $ENV{FOAM_LIBBIN}) +SET(OpenFOAM_SRC $ENV{FOAM_SRC}) + +SET(DF_ROOT $ENV{DF_ROOT}) +SET(DF_SRC $ENV{DF_SRC}) +SET(SRC_ORIG $ENV{SRC_ORIG}) + +# set compilation options +SET(CMAKE_EXE_LINKER_FLAGS "-fuse-ld=bfd -Xlinker --add-needed -Xlinker --no-as-needed") +SET (CMAKE_C_FLAGS ${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}) +SET (CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}) + +SET(CMAKE_C_COMPILER g++) +SET(PATH_LIB_OPENMPI "openmpi-system") # Foundation version +SET(EXE_COMPILE_OPTION "-std=c++11 -m64 -Dlinux64 -DWM_ARCH_OPTION=64 +-DWM_DP -DWM_LABEL_SIZE=32 -Wall -Wextra -Wold-style-cast -Wnon-virtual-dtor +-Wno-unused-parameter -Wno-invalid-offsetof -Wno-attributes -O3 +-DNoRepository -ftemplate-depth-100 -std=c++14 +-Wno-unused-variable -Wno-unused-but-set-variable -Wno-old-style-cast -DOMPI_SKIP_MPICXX +-pthread -fPIC") +add_definitions("${EXE_COMPILE_OPTION}") + +# add header files 
+FUNCTION(R_SEARCH search_path return_list) + FILE(GLOB_RECURSE new_list ${search_path}/*.H) + SET(dir_list "") + FOREACH(file_path ${new_list}) + GET_FILENAME_COMPONENT(dir_path ${file_path} PATH) + SET(dir_list ${dir_list} ${dir_path}) + ENDFOREACH() + LIST(REMOVE_DUPLICATES dir_list) + SET(${return_list} ${dir_list} PARENT_SCOPE) +ENDFUNCTION(R_SEARCH) + +R_SEARCH(${DF_SRC}/dfCombustionModels dfcombustion_inc) +R_SEARCH(${DF_SRC}/dfCanteraMixture dfcantera_inc) +R_SEARCH(${DF_SRC}/lagrangian/intermediate dflagrangianinter_inc) +R_SEARCH(${DF_SRC}/lagrangian/spray dflagrangianspray_inc) +R_SEARCH(${DF_SRC}/lagrangian/turbulence dflagrangianturb_inc) +R_SEARCH(${DF_SRC}/dfChemistryModel dfchemistry_inc) +R_SEARCH(${DF_SRC}/thermophysicalModels/thermophysicalProperties dfthermophysicalprop_inc) +R_SEARCH(${DF_SRC}/thermophysicalModels/thermophysicalProperties dfthermophysicalprop_inc) +R_SEARCH(${DF_SRC}/thermophysicalModels/basic dfthermophysicalbasic_inc) +R_SEARCH(${DF_SRC}/thermophysicalModels/SLGThermo dfthermophysicalslg_inc) +R_SEARCH(${DF_SRC}/TurbulenceModels dfturbulence_inc) +R_SEARCH(${DF_SRC}/dynamicMesh dfnewdynamic_inc) +R_SEARCH(${DF_SRC}/dynamicFvMesh dffvdynamic_inc) + +include_directories( + ${OpenFOAM_SRC}/finiteVolume/lnInclude + ${OpenFOAM_SRC}/OSspecific/POSIX/lnInclude + ${OpenFOAM_SRC}/OpenFOAM/lnInclude + ${OpenFOAM_SRC}/transportModels/compressible/lnInclude + ${OpenFOAM_SRC}/thermophysicalModels/basic/lnInclude + ${OpenFOAM_SRC}/TurbulenceModels/turbulenceModels/lnInclude + ${OpenFOAM_SRC}/TurbulenceModels/compressible/lnInclude + ${OpenFOAM_SRC}/finiteVolume/cfdTools + ${OpenFOAM_SRC}/finiteVolume/lnInclude + ${OpenFOAM_SRC}/meshTools/lnInclude + ${OpenFOAM_SRC}/sampling/lnInclude + ${OpenFOAM_SRC}/dynamicFvMesh/lnInclude + ${OpenFOAM_SRC}/Pstream/mpi + ${dfcantera_inc} + ${dfchemistry_inc} + ${dfcombustion_inc} + ${CANTERA_ROOT}/include + ${MPI_INCLUDE_PATH} + ${PROJECT_SOURCE_DIR} + ${CUDA_INCLUDE_DIRS} + /home/runze/AmgX/AMGX/include 
+ /home/runze/deepflame-dev/src_gpu +) + +# add execution +add_executable(${PROJECT_NAME} ${PROJECT_SOURCE_DIR}/dfLowMachFoam.C) + +target_link_libraries(${PROJECT_NAME} + $ENV{FOAM_LIBBIN}/libfiniteVolume.so libmeshTools.so libcompressibleTransportModels.so + libturbulenceModels.so libsampling.so libOpenFOAM.so + ${CANTERA_ROOT}/lib/libcantera_shared.so.2 + ${DF_ROOT}/lib/libdfChemistryModel.so + ${DF_ROOT}/lib/libdfCanteraMixture.so + ${DF_ROOT}/lib/libdfFluidThermophysicalModels.so + ${DF_ROOT}/lib/libdfCombustionModels.so + $ENV{FOAM_LIBBIN}/openmpi-system/libPstream.so + ${MPI_LIBRARIES} + ${CUDA_LIBRARIES} + /home/runze/AmgX/AMGX/build/libamgxsh.so + /home/runze/deepflame-dev/src_gpu/build/libdfMatrix.so +) + +if(DEFINED ENV{PYTHON_INC_DIR}) + add_definitions(-DUSE_PYTORCH) + find_package (Python REQUIRED COMPONENTS Interpreter Development) + find_package(pybind11) + include_directories( + ${Python_INCLUDE_DIRS} + ${pybind11_INCLUDE_DIR}/pybind11 + ) + target_link_libraries(${PROJECT_NAME} ${Python_LIBRARIES}) +endif() + +# install +set(CMAKE_INSTALL_PREFIX ${DF_ROOT}) +install(TARGETS ${PROJECT_NAME} DESTINATION bin) diff --git a/applications/solvers/dfLowMachFoam_new/EEqn.H b/applications/solvers/dfLowMachFoam_new/EEqn.H new file mode 100644 index 000000000..896baaa06 --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/EEqn.H @@ -0,0 +1,141 @@ +{ + volScalarField& he = thermo.he(); +#ifdef GPUSolver_ + start1 = std::clock(); + UEqn_GPU.updatePsi(&U[0][0]); + UEqn_GPU.correctBoundaryConditions(); + U.correctBoundaryConditions(); + K = 0.5*magSqr(U); + end1 = std::clock(); + time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); + + // prepare data on CPU + start1 = std::clock(); + start2 = std::clock(); + // const tmp alphaEff_tmp(thermo.alpha()); + // const volScalarField& alphaEff = alphaEff_tmp(); + double *alphaEff = nullptr; // tmp + end2 = 
std::clock(); + int eeqn_offset = 0; + int patchNum = 0; + + forAll(he.boundaryField(), patchi) + { + patchNum++; + const fvsPatchScalarField& pw = mesh.surfaceInterpolation::weights().boundaryField()[patchi]; + int patchSize = pw.size(); + + // construct gradient manually + const fvPatchScalarField& hew = he.boundaryField()[patchi]; + const basicThermo& bThermo = basicThermo::lookupThermo(hew); + const scalarField& ppw = bThermo.p().boundaryField()[patchi]; + fvPatchScalarField& Tw = + const_cast(bThermo.T().boundaryField()[patchi]); + scalarField& Tw_v = Tw; + + Tw.evaluate(); + const scalarField& patchDeltaCoeff = mesh.boundary()[patchi].deltaCoeffs(); + const scalarField heInternal = bThermo.he(ppw, Tw, patchi)(); + const scalarField heBoundary = bThermo.he(ppw, Tw, mesh.boundary()[patchi].faceCells())(); + const scalarField patchGradMau = patchDeltaCoeff * (heInternal - heBoundary); + + const scalarField& patchK = K.boundaryField()[patchi]; + // const scalarField& patchAlphaEff = alphaEff.boundaryField()[patchi]; // not H2Dcopy when use UnityLewis + // const scalarField& patchGrad = he.boundaryField()[patchi].gradientBoundaryCoeffs(); // gradient_ + + // const DimensionedField& patchHa_ = he.boundaryField()[patchi]; + // const gradientEnergyFvPatchScalarField patchHa(mesh.boundary()[patchi], patchHa_); + // const scalarField& patchGrad = patchHa.gradient(); // gradient_ + memcpy(boundary_K + eeqn_offset, &patchK[0], patchSize*sizeof(double)); + // memcpy(boundary_alphaEff + eeqn_offset, &patchAlphaEff[0], patchSize*sizeof(double)); + memcpy(boundary_gradient + eeqn_offset, &patchGradMau[0], patchSize*sizeof(double)); + + eeqn_offset += patchSize; + } + end1 = std::clock(); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); + fprintf(stderr, 
"time_monitor_EEqn_mtxAssembly_CPU_prepare: %lf, build alphaEff time: %lf, patchNum: %d\n", + time_monitor_EEqn_mtxAssembly_CPU_prepare, + double(end2 - start2) / double(CLOCKS_PER_SEC), patchNum); + + // prepare data on GPU + start1 = std::clock(); + he.oldTime(); + K.oldTime(); + EEqn_GPU.prepare_data(&he.oldTime()[0], &K[0], &K.oldTime()[0], alphaEff, + &dpdt[0], boundary_K, boundary_alphaEff, boundary_gradient); + EEqn_GPU.sync(); + end1 = std::clock(); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly_GPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); + + start1 = std::clock(); + EEqn_GPU.initializeTimeStep(); + EEqn_GPU.fvm_ddt(); + EEqn_GPU.fvm_div(); + EEqn_GPU.fvm_laplacian(); + EEqn_GPU.fvc_ddt(); + EEqn_GPU.fvc_div_phi_scalar(); + EEqn_GPU.fvc_div_vector(); + EEqn_GPU.add_to_source(); + EEqn_GPU.sync(); + end1 = std::clock(); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC); + + // check value of mtxAssembly, no time monitor + // EEqn_GPU.checkValue(true); + + start1 = std::clock(); + EEqn_GPU.solve(); + end1 = std::clock(); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); + + start1 = std::clock(); + EEqn_GPU.updatePsi(&he[0]); + he.correctBoundaryConditions(); + he.write(); + end1 = std::clock(); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); +#else + start1 = std::clock(); + fvScalarMatrix EEqn + ( + + fvm::ddt(rho, he) + mvConvection->fvmDiv(phi, he) + + fvc::ddt(rho, K) + fvc::div(phi, K) + - dpdt + == + ( + 
turbName == "laminar" + ? + ( + fvm::laplacian(turbulence->alpha(), he) + - diffAlphaD + + fvc::div(hDiffCorrFlux) + ) + : + ( + fvm::laplacian(turbulence->alphaEff(), he) + ) + ) + ); + end1 = std::clock(); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + + EEqn.relax(); + start1 = std::clock(); + EEqn.solve("ha"); + end1 = std::clock(); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); +#endif +} diff --git a/applications/solvers/dfLowMachFoam_new/Make/files b/applications/solvers/dfLowMachFoam_new/Make/files new file mode 100644 index 000000000..92df9b4e3 --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/Make/files @@ -0,0 +1,3 @@ +dfLowMachFoam.C + +EXE = $(DF_APPBIN)/dfLowMachFoam_new diff --git a/applications/solvers/dfLowMachFoam_new/Make/options b/applications/solvers/dfLowMachFoam_new/Make/options new file mode 100644 index 000000000..bda93210e --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/Make/options @@ -0,0 +1,58 @@ +-include $(GENERAL_RULES)/mplibType + +EXE_INC = -std=c++14 \ + -g \ + -fopenmp \ + -Wno-unused-variable \ + -Wno-unused-but-set-variable \ + -Wno-old-style-cast \ + $(PFLAGS) $(PINC) \ + $(if $(LIBTORCH_ROOT),-DUSE_LIBTORCH,) \ + $(if $(PYTHON_INC_DIR),-DUSE_PYTORCH,) \ + -I$(LIB_SRC)/transportModels/compressible/lnInclude \ + -I$(LIB_SRC)/thermophysicalModels/basic/lnInclude \ + -I$(LIB_SRC)/TurbulenceModels/turbulenceModels/lnInclude \ + -I$(LIB_SRC)/TurbulenceModels/compressible/lnInclude \ + -I$(LIB_SRC)/finiteVolume/cfdTools \ + -I$(LIB_SRC)/finiteVolume/lnInclude \ + -I$(LIB_SRC)/meshTools/lnInclude \ + -I$(LIB_SRC)/sampling/lnInclude \ + -I$(LIB_SRC)/dynamicFvMesh/lnInclude \ + -I$(LIB_SRC)/Pstream/mpi \ + -I$(DF_SRC)/dfCanteraMixture/lnInclude \ + -I$(DF_SRC)/dfChemistryModel/lnInclude \ + 
-I$(DF_SRC)/dfCombustionModels/lnInclude \ + -I$(CANTERA_ROOT)/include \ + $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include,) \ + $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include/torch/csrc/api/include,) \ + $(PYTHON_INC_DIR) \ + $(if $(AMGX_DIR), -I$(DF_ROOT)/src_gpu,) \ + $(if $(AMGX_DIR), -I/usr/local/cuda-11.6/include,) \ + $(if $(AMGX_DIR), -I$(AMGX_DIR)/include,) \ + -I$(DF_ROOT)/GPUTestRef/lnInclude \ + +EXE_LIBS = \ + -lcompressibleTransportModels \ + -lturbulenceModels \ + -lfiniteVolume \ + -lmeshTools \ + -lsampling \ + -L$(DF_LIBBIN) \ + -ldfFluidThermophysicalModels \ + -ldfCompressibleTurbulenceModels \ + -ldfCanteraMixture \ + -ldfChemistryModel \ + -ldfCombustionModels \ + -ldfGenMatrix \ + $(CANTERA_ROOT)/lib/libcantera.so \ + $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libtorch.so,) \ + $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libc10.so,) \ + $(if $(LIBTORCH_ROOT),-rdynamic,) \ + $(if $(LIBTORCH_ROOT),-lpthread,) \ + $(if $(LIBTORCH_ROOT),$(DF_SRC)/dfChemistryModel/DNNInferencer/build/libDNNInferencer.so,) \ + $(if $(PYTHON_LIB_DIR),-L$(PYTHON_LIB_DIR),) \ + $(if $(PYTHON_LIB_DIR),-lpython3.8,) \ + $(if $(AMGX_DIR), /usr/local/cuda-11.6/lib64/libcudart.so,) \ + $(if $(AMGX_DIR), $(DF_ROOT)/src_gpu/build/libdfMatrix.so,) \ + $(if $(AMGX_DIR), $(AMGX_DIR)/build/libamgxsh.so,) + diff --git a/applications/solvers/dfLowMachFoam_new/UEqn.H b/applications/solvers/dfLowMachFoam_new/UEqn.H new file mode 100644 index 000000000..38934abdb --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/UEqn.H @@ -0,0 +1,247 @@ +// Solve the Momentum equation +#ifdef GPUSolver_ + start1 = std::clock(); + int offset = 0; + const tmp nuEff_tmp(turbulence->nuEff()); + const volScalarField& nuEff = nuEff_tmp(); + forAll(U.boundaryField(), patchi) + { + const scalarField& patchP = p.boundaryField()[patchi]; + const vectorField& patchU = U.boundaryField()[patchi]; + const scalarField& patchRho = rho.boundaryField()[patchi]; + const scalarField& patchNuEff = 
nuEff.boundaryField()[patchi]; + + int patchSize = patchP.size(); + + // boundary pressure + memcpy(boundary_pressure_init+offset, &patchP[0], patchSize*sizeof(double)); + // boundary velocity + memcpy(boundary_velocity_init+3*offset, &patchU[0][0], 3*patchSize*sizeof(double)); + // boundary nuEff + memcpy(boundary_nuEff_init+offset, &patchNuEff[0], patchSize*sizeof(double)); + // boundary rho + memcpy(boundary_rho_init+offset, &patchRho[0], patchSize*sizeof(double)); + offset += patchSize; + } + end1 = std::clock(); + time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); + + start1 = std::clock(); + UEqn_GPU.initializeTimeStep(); + U.oldTime(); + UEqn_GPU.fvm_ddt(&U.oldTime()[0][0]); + UEqn_GPU.fvm_div(boundary_pressure_init, boundary_velocity_init, boundary_nuEff_init, boundary_rho_init); + UEqn_GPU.fvc_grad(&p[0]); + UEqn_GPU.fvc_grad_vector(); + UEqn_GPU.dev2T(); + UEqn_GPU.fvc_div_tensor(&nuEff[0]); + UEqn_GPU.fvm_laplacian(); + UEqn_GPU.sync(); + end1 = std::clock(); + time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC); + + // start2 = std::clock(); + // fvVectorMatrix turb_source + // ( + // turbulence->divDevRhoReff(U) + // ); + // end2 = std::clock(); + // time_monitor_CPU += double(end2 - start2) / double(CLOCKS_PER_SEC); + + // UEqn_GPU.add_fvMatrix(&turb_source.lower()[0], &turb_source.diag()[0], &turb_source.upper()[0], &turb_source.source()[0][0]); + // end1 = std::clock(); + // time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + // time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + + // check value + // U.oldTime(); + // 
tmp tUEqn + // ( + // fvm::ddt(rho, U) + // + + // fvm::div(phi, U) + // + + // turbulence->divDevRhoReff(U) + // == -fvc::grad(p) + // ); + // fvVectorMatrix& UEqn = tUEqn.ref(); + // printf("b_cpu = %e\n", UEqn.source()[1][1]); + // forAll(U.boundaryField(), patchi){ + // labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); + // forAll(sub_boundary, i){ + // if (sub_boundary[i] == 1){ + // printf("b_cpu_bou = %e\n", UEqn.boundaryCoeffs()[patchi][i][1]); + // printf("patchi = %d, i = %d\n", patchi, i); + // } + // } + // } + // if (pimple.momentumPredictor()) + // { + // solve(UEqn); + // Info << "U_CPU\n" << U << endl; + // K = 0.5*magSqr(U); + // } + // UEqn_GPU.checkValue(true); +#elif defined GPUSolverNew_ + const tmp nuEff_tmp(turbulence->nuEff()); + const volScalarField& nuEff = nuEff_tmp(); + + // run CPU, for temp + tmp tUEqn + ( + fvm::ddt(rho, U) + + + fvm::div(phi, U) + + + turbulence->divDevRhoReff(U) + == -fvc::grad(p) + ); + fvVectorMatrix& UEqn = tUEqn.ref(); + + // run GPU + // preProcess + // TODO: preProcessForRhoEqn for temp, now we only transfer phi(instead of rhoEqn) used by fvm::div(phi, U) + UEqn_GPU.sync(); + double *h_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::internal); + double *h_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::internal); + double *h_boundary_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::boundary); + memcpy(h_rho, &rho[0], dfDataBase.cell_value_bytes); + memcpy(h_phi, &phi[0], dfDataBase.surface_value_bytes); + int offset = 0; + forAll(phi.boundaryField(), patchi) + { + const fvsPatchScalarField& patchPhi = phi.boundaryField()[patchi]; + int patchsize = patchPhi.size(); + memcpy(h_boundary_phi + offset, &patchPhi[0], patchsize * sizeof(double)); + offset += patchsize; + } + UEqn_GPU.preProcessForRhoEqn(h_rho, h_phi, h_boundary_phi); + DEBUG_TRACE; + + TICK_START; + // preparing u, p, nu_eff, and rho.boundary used by UEqn_GPU.preProcess() + double *h_u = 
dfDataBase.getFieldPointer("u", location::cpu, position::internal); + double *h_boundary_u = dfDataBase.getFieldPointer("u", location::cpu, position::boundary); + double *h_p = dfDataBase.getFieldPointer("p", location::cpu, position::internal); + double *h_boundary_p = dfDataBase.getFieldPointer("p", location::cpu, position::boundary); + double *h_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::internal); + double *h_boundary_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::boundary); + double *h_boundary_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::boundary); + TICK_STOP(get pointer); + + TICK_START; + U.oldTime(); + memcpy(h_u, &U.oldTime()[0][0], dfDataBase.cell_value_vec_bytes); + memcpy(h_p, &p[0], dfDataBase.cell_value_bytes); + memcpy(h_nu_eff, &nuEff[0], dfDataBase.cell_value_bytes); + TICK_STOP(copy to pinned memory); + + TICK_START; + offset = 0; + forAll(U.boundaryField(), patchi) + { + const fvPatchVectorField& patchU = U.boundaryField()[patchi]; + const fvPatchScalarField& patchP = p.boundaryField()[patchi]; + const fvPatchScalarField& patchNuEff = nuEff.boundaryField()[patchi]; + const fvPatchScalarField& patchRho = rho.boundaryField()[patchi]; + int patchsize = patchU.size(); + memcpy(h_boundary_u + offset * 3, &patchU[0][0], patchsize * 3 * sizeof(double)); + memcpy(h_boundary_p + offset, &patchP[0], patchsize * sizeof(double)); + memcpy(h_boundary_nu_eff + offset, &patchNuEff[0], patchsize * sizeof(double)); + memcpy(h_boundary_rho + offset, &patchRho[0], patchsize * sizeof(double)); + offset += patchsize; + } + TICK_STOP(CPU prepare boundary time); + + TICK_START; + UEqn_GPU.preProcess(h_u, h_boundary_u, h_p, h_boundary_p, h_nu_eff, h_boundary_nu_eff, h_boundary_rho); + DEBUG_TRACE; + UEqn_GPU.sync(); + TICK_STOP(GPU preProcess time); + + // process + TICK_START; + UEqn_GPU.process(); + DEBUG_TRACE; + UEqn_GPU.sync(); + TICK_STOP(GPU process time); + + TICK_START; + UEqn_GPU.solve(); + 
TICK_STOP(GPU solve time); + + // postProcess + TICK_START; + UEqn_GPU.postProcess(h_u); + U.correctBoundaryConditions(); + DEBUG_TRACE; + TICK_STOP(post process time); + + // checkResult + // TODO: for temp, now we compare ldu, finally we compare csr + std::vector h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); + std::vector h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); + offset = 0; + for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) + { + int patchsize = dfDataBase.patch_size[patchi]; + const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0]; + const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0]; + memcpy(h_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double)); + memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double)); + offset += patchsize; + } + bool printFlag = false; + UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0], + h_internal_coeffs.data(), h_boundary_coeffs.data(), + // &DivTensor[0][0], + printFlag); + DEBUG_TRACE; +#else + start1 = std::clock(); + tmp tUEqn + ( + fvm::ddt(rho, U) + fvm::div(phi, U) + + turbulence->divDevRhoReff(U) + == -fvc::grad(p) + ); + fvVectorMatrix& UEqn = tUEqn.ref(); + + end1 = std::clock(); + time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + + UEqn.relax(); + start1 = std::clock(); + if (pimple.momentumPredictor()) + { + solve(UEqn); + + K = 0.5*magSqr(U); + } + end1 = std::clock(); + time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); +#endif + +// start1 = std::clock(); +// // // std::thread t(&dfMatrix::solve, &UEqn_GPU); +// UEqn_GPU.solve(); +// end1 = std::clock(); +// time_monitor_UEqn += double(end1 - start1) / 
double(CLOCKS_PER_SEC); +// time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); + +// start1 = std::clock(); +// // // t.join(); +// // UEqn_GPU.updatePsi(&U[0][0]); +// K = 0.5*magSqr(U); +// end1 = std::clock(); +// time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); +// time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); +// time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC); +// // Info << "U_amgx = " << U << endl; + diff --git a/applications/solvers/dfLowMachFoam_new/YEqn.H b/applications/solvers/dfLowMachFoam_new/YEqn.H new file mode 100644 index 000000000..76570b24d --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/YEqn.H @@ -0,0 +1,207 @@ +hDiffCorrFlux = Zero; +diffAlphaD = Zero; +sumYDiffError = Zero; + +tmp> mvConvection +( + fv::convectionScheme::New + ( + mesh, + fields, + phi, + mesh.divScheme("div(phi,Yi_h)") + ) +); +#ifdef GPUSolver_ + start1 = std::clock(); + UEqn_GPU.solve(); + end1 = std::clock(); + time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); + + start1 = std::clock(); + std::vector Y_old(Y.size()), boundary_Y(Y.size()), boundary_hai(Y.size()), boundary_rhoD(Y.size()); + std::vector hai(Y.size()), rhoD(Y.size()); + for (size_t i = 0; i < Y.size(); ++i) + { + volScalarField& Yi = Y[i]; + Yi.oldTime(); + Y_old[i] = &Yi.oldTime()[0]; + if (updateBoundaryFields) + { + cudaMallocHost(&boundary_Y[i], num_boundary_faces*sizeof(double)); + } + const volScalarField& haii = chemistry->hai(i); + const volScalarField& rhoDi = chemistry->rhoD(i); + // hai[i] = &haii[0]; + rhoD[i] = &rhoDi[0]; + // cudaMallocHost(&boundary_hai[i], num_boundary_faces*sizeof(double)); + cudaMallocHost(&boundary_rhoD[i], num_boundary_faces*sizeof(double)); + int offset = 0; + forAll(Yi.boundaryField(), patchi) + { + const scalarField& patchYi = Yi.boundaryField()[patchi]; + // const 
scalarField& patchHaii = haii.boundaryField()[patchi]; + const scalarField& patchRhoDi = rhoDi.boundaryField()[patchi]; + int patchSize = patchYi.size(); + + if (updateBoundaryFields) + { + memcpy(boundary_Y[i] + offset, &patchYi[0], patchSize*sizeof(double)); + } + // memcpy(boundary_hai[i] + offset, &patchHaii[0], patchSize*sizeof(double)); + memcpy(boundary_rhoD[i] + offset, &patchRhoDi[0], patchSize*sizeof(double)); + offset += patchSize; + } + // if (i == 5) + // { + // Info << "rhoD_CPU" << rhoDi << endl; + // } + + } + // Info << "rhoD from nuEff\n" << nuEff * rho / 0.7 << endl; + updateBoundaryFields = false; + volScalarField mut_sct = turbulence->mut().ref()/Sct; + double *boundary_mutsct = nullptr; + cudaMallocHost(&boundary_mutsct, num_boundary_faces*sizeof(double)); + int offset = 0; + forAll(p.boundaryField(), patchi) + { + const scalarField& patchMut_sct = mut_sct.boundaryField()[patchi]; + int patchSize = patchMut_sct.size(); + memcpy(boundary_mutsct + offset, &patchMut_sct[0], patchSize*sizeof(double)); + offset += patchSize; + + // debug + // const fvsPatchScalarField& pw = mesh.surfaceInterpolation::weights().boundaryField()[patchi]; + // Field valueInternalCoeffs = Y[5].boundaryField()[patchi].valueInternalCoeffs(pw); + // Field valueBoundaryCoeffs = Y[5].boundaryField()[patchi].valueBoundaryCoeffs(pw); + // Field gradientInternalCoeffs = Y[5].boundaryField()[patchi].gradientInternalCoeffs(); + // Field gradientBoundaryCoeffs = Y[5].boundaryField()[patchi].gradientBoundaryCoeffs(); + // Info << "valueInternalCoeffs\n" << valueInternalCoeffs << endl; + // Info << "valueBoundaryCoeffs\n" << valueBoundaryCoeffs << endl; + // Info << "gradientInternalCoeffs\n" << gradientInternalCoeffs << endl; + // Info << "gradientBoundaryCoeffs\n" << gradientBoundaryCoeffs << endl; + } + end1 = std::clock(); + time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + 
time_monitor_YEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); + //fprintf(stderr, "time_monitor_YEqn_mtxAssembly_CPU_prepare: %lf\n", time_monitor_YEqn_mtxAssembly_CPU_prepare); + + start1 = std::clock(); + YEqn_GPU.initializeTimeStep(); + YEqn_GPU.upwindWeight(); + YEqn_GPU.fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(Y_old, boundary_Y, + hai, boundary_hai, rhoD, boundary_rhoD, &mut_sct[0], boundary_mutsct, &thermo.alpha()[0]); + YEqn_GPU.fvm_ddt(); + YEqn_GPU.fvm_div_phi(); + YEqn_GPU.fvm_div_phiUc(); + YEqn_GPU.sync(); + // YEqn_GPU.checkValue(true, "of_output_H2.txt"); + end1 = std::clock(); + time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC); + //fprintf(stderr, "time_monitor_YEqn_mtxAssembly_GPU_run: %lf\n", time_monitor_YEqn_mtxAssembly_GPU_run); + + start1 = std::clock(); + YEqn_GPU.solve(); + end1 = std::clock(); + time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); +#else + start1 = std::clock(); + forAll(Y, i) + { + sumYDiffError += chemistry->rhoD(i)*fvc::grad(Y[i]); + } + // Info << "sumYDiffError\n" << sumYDiffError << endl; + const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf(); + start1 = std::clock(); + time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); +#endif + +//MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); +label flag_mpi_init; +MPI_Initialized(&flag_mpi_init); +if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); + +{ + if (!splitting) + { + std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); + combustion->correct(); + //label flag_mpi_init; + 
//MPI_Initialized(&flag_mpi_init); + if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); + std::chrono::steady_clock::time_point stop = std::chrono::steady_clock::now(); + std::chrono::duration processingTime = std::chrono::duration_cast>(stop - start); + time_monitor_chem += processingTime.count(); + } + +#ifdef GPUSolver_ + start1 = std::clock(); + forAll(Y, i) + { + volScalarField& Yi = Y[i]; + YEqn_GPU.updatePsi(&Yi[0], i); + Yi.correctBoundaryConditions(); + } + YEqn_GPU.correctBoundaryConditions(); + end1 = std::clock(); + time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); +#else + start2 = std::clock(); + volScalarField Yt(0.0*Y[0]); + int speciesIndex = 0; + forAll(Y, i) + { + volScalarField& Yi = Y[i]; + hDiffCorrFlux += chemistry->hai(i)*(chemistry->rhoD(i)*fvc::grad(Yi) - Yi*sumYDiffError); + diffAlphaD += fvc::laplacian(thermo.alpha()*chemistry->hai(i), Yi); + if (i != inertIndex) + { + start1 = std::clock(); + tmp DEff = chemistry->rhoD(i) + turbulence->mut()/Sct; + + fvScalarMatrix YiEqn + ( + fvm::ddt(rho, Yi) + + + ( + turbName == "laminar" + ? (mvConvection->fvmDiv(phi, Yi) + mvConvection->fvmDiv(phiUc, Yi)) + : mvConvection->fvmDiv(phi, Yi) + ) + == + ( + splitting + ? 
fvm::laplacian(DEff(), Yi) + : (fvm::laplacian(DEff(), Yi) + combustion->R(Yi)) + ) + ); + + end1 = std::clock(); + time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + YiEqn.relax(); + + start1 = std::clock(); + YiEqn.solve("Yi"); + end1 = std::clock(); + time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); + + Yi.max(0.0); + Yt += Yi; + ++speciesIndex; + } + } + + Y[inertIndex] = scalar(1) - Yt; + Y[inertIndex].max(0.0); + end2 = std::clock(); + time_monitor_YEqn += double(end2 - start2) / double(CLOCKS_PER_SEC); +#endif +} diff --git a/applications/solvers/dfLowMachFoam_new/YEqn_RR.H b/applications/solvers/dfLowMachFoam_new/YEqn_RR.H new file mode 100644 index 000000000..f5752e95e --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/YEqn_RR.H @@ -0,0 +1,61 @@ +if (!(timeIndex % 2)) +{ + volScalarField Yt(0.0*Y[0]); + + scalar dtSave = runTime.deltaT().value(); + runTime.setDeltaT(dtSave * 2); + + start = std::clock(); + combustion->correct(); + + label flag_mpi_init; + MPI_Initialized(&flag_mpi_init); + if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); + end = std::clock(); + time_monitor_chem += double(end - start) / double(CLOCKS_PER_SEC); + + forAll(Y, i) + { + volScalarField& Yi = Y[i]; + + if (i != inertIndex) + { + volScalarField& Yi = Y[i]; + fvScalarMatrix YiEqn + ( + fvm::ddt(rho, Yi) + == + combustion->R(Yi) + ); + + YiEqn.relax(); + + YiEqn.solve("Yi"); + + Yi.max(0.0); + Yt += Yi; + } + } + Y[inertIndex] = scalar(1) - Yt; + Y[inertIndex].max(0.0); + + forAll (Y, i) + { + volScalarField& tYi = Y[i].oldTime(); + + forAll(tYi, celli) + { + tYi[celli] = Y[i][celli]; + } + volScalarField::Boundary& Bf = tYi.boundaryFieldRef(); + forAll(Bf, patchi) + { + forAll(Bf[patchi], facei) + { + Bf[patchi][facei] = Y[i].boundaryField()[patchi][facei]; + } + } + } + + runTime.setDeltaT(dtSave); +} \ No newline at end of file diff --git a/applications/solvers/dfLowMachFoam_new/correctPhi.H 
b/applications/solvers/dfLowMachFoam_new/correctPhi.H new file mode 100644 index 000000000..3cd82d29e --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/correctPhi.H @@ -0,0 +1,12 @@ +CorrectPhi +( + U, + phi, + p, + rho, + psi, + dimensionedScalar("rAUf", dimTime, 1), + divrhoU(), + pimple, + true +); diff --git a/applications/solvers/dfLowMachFoam_new/createFields.H b/applications/solvers/dfLowMachFoam_new/createFields.H new file mode 100644 index 000000000..9e750c334 --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/createFields.H @@ -0,0 +1,176 @@ +#include "createRDeltaT.H" + +Info<< "Reading thermophysical properties\n" << endl; + +// fluidThermo* pThermo = new hePsiThermo(mesh, word::null); +fluidThermo* pThermo = new heRhoThermo(mesh, word::null); +fluidThermo& thermo = *pThermo; +// thermo.validate(args.executable(), "ha"); + +const volScalarField& psi = thermo.psi(); +volScalarField& p = thermo.p(); +volScalarField& T = thermo.T(); +volScalarField rho +( + IOobject + ( + "rho", + runTime.timeName(), + mesh, + IOobject::READ_IF_PRESENT, + IOobject::AUTO_WRITE + ), + thermo.rho() +); + + +Info<< "Reading field U\n" << endl; +volVectorField U +( + IOobject + ( + "U", + runTime.timeName(), + mesh, + IOobject::MUST_READ, + IOobject::AUTO_WRITE + ), + mesh +); + +#include "compressibleCreatePhi.H" + +pressureControl pressureControl(p, rho, pimple.dict(), false); + +mesh.setFluxRequired(p.name()); + +Info<< "Creating turbulence model\n" << endl; +autoPtr turbulence +( + compressible::turbulenceModel::New + ( + rho, + U, + phi, + thermo + ) +); + +Info<< "Creating field dpdt\n" << endl; +volScalarField dpdt +( + IOobject + ( + "dpdt", + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedScalar("dpdt",p.dimensions()/dimTime, 0) +); + + +Info<< "Creating reaction model\n" << endl; +autoPtr> combustion +( + CombustionModel::New(thermo, turbulence()) +); +Info<< "end Creating reaction model\n" << endl; + + 
+const word combModelName(mesh.objectRegistry::lookupObject("combustionProperties").lookup("combustionModel")); +Info << "Combustion Model Name is confirmed as "<< combModelName << endl; + +const word turbName(mesh.objectRegistry::lookupObject("turbulenceProperties").lookup("simulationType")); + +dfChemistryModel* chemistry = combustion->chemistry(); +PtrList& Y = chemistry->Y(); +const word inertSpecie(chemistry->lookup("inertSpecie")); +const label inertIndex(chemistry->species()[inertSpecie]); +chemistry->setEnergyName("ha"); +chemistry->updateEnergy(); + + +chemistry->correctThermo(); +Info<< "At initial time, min/max(T) = " << min(T).value() << ", " << max(T).value() << endl; + +//for dpdt + +Info<< "Creating field kinetic energy K\n" << endl; +volScalarField K("K", 0.5*magSqr(U)); + +multivariateSurfaceInterpolationScheme::fieldTable fields; + +if(combModelName!="flareFGM") +{ +forAll(Y, i) +{ + fields.add(Y[i]); +} +fields.add(thermo.he()); +} + + +const scalar Sct = chemistry->lookupOrDefault("Sct", 1.); +volScalarField diffAlphaD +( + IOobject + ( + "diffAlphaD", + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedScalar(dimEnergy/dimTime/dimVolume, 0) +); +volVectorField hDiffCorrFlux +( + IOobject + ( + "hDiffCorrFlux", + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedVector(dimensionSet(1,0,-3,0,0,0,0), Zero) +); +volVectorField sumYDiffError +( + IOobject + ( + "sumYDiffError", + runTime.timeName(), + mesh, + IOobject::NO_READ, + IOobject::NO_WRITE + ), + mesh, + dimensionedVector("sumYDiffError", dimDynamicViscosity/dimLength, Zero) +); + +IOdictionary CanteraTorchProperties +( + IOobject + ( + "CanteraTorchProperties", + runTime.constant(), + mesh, + IOobject::MUST_READ, + IOobject::NO_WRITE + ) +); +const Switch splitting = CanteraTorchProperties.lookupOrDefault("splittingStrategy", false); +#ifdef USE_PYTORCH + const Switch log_ = 
CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("log", false); + const Switch torch_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("torch", false); +#endif +#ifdef USE_LIBTORCH + const Switch log_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("log", false); + const Switch torch_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("torch", false); +#endif diff --git a/applications/solvers/dfLowMachFoam_new/createGPUSolver.H b/applications/solvers/dfLowMachFoam_new/createGPUSolver.H new file mode 100644 index 000000000..94fff1125 --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/createGPUSolver.H @@ -0,0 +1,97 @@ +dfMatrixDataBase dfDataBase; +//dfRhoEqn rhoEqn_GPU; +dfUEqn UEqn_GPU(dfDataBase); +//dfYEqn YEqn_GPU; +//dfEEqn EEqn_GPU; + +void createGPUBase(fvMesh& mesh, PtrList& Y) { + // prepare constant values: num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, num_species, rdelta_t + const labelUList& owner = mesh.owner(); + const labelUList& neighbour = mesh.neighbour(); + int num_cells = mesh.nCells(); + int num_surfaces = neighbour.size(); + int num_boundary_surfaces = 0; + int num_patches = 0; + std::vector patch_size; + forAll(mesh.boundary(), patchi) { + labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); + int patchsize = sub_boundary.size(); + patch_size.push_back(patchsize); + num_boundary_surfaces += patchsize; + num_patches++; + } + // TODO: get deltaT fomr time API + double rDeltaT = 1 / 1e-6; + dfDataBase.setConstantValues(num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, Y.size(), rDeltaT); + + // prepare constant indexes: owner, neighbor + dfDataBase.setConstantIndexes(&owner[0], &neighbour[0]); + + // prepare internal and boundary of sf, mag_sf, weights, delta_coeffs, volume + double *boundary_sf = new double[3 * num_boundary_surfaces]; + double *boundary_mag_sf = new double[num_boundary_surfaces]; + double 
*boundary_delta_coeffs = new double[num_boundary_surfaces]; + int *boundary_face_cell = new int[num_boundary_surfaces]; + int offset = 0; + forAll(mesh.boundary(), patchi) { + const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; + const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; + const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; + const labelUList& pFaceCells = mesh.boundary()[patchi].faceCells(); + + int patchsize = pMagSf.size(); + + memcpy(boundary_sf + 3*offset, &pSf[0][0], 3*patchsize*sizeof(double)); + memcpy(boundary_mag_sf + offset, &pMagSf[0], patchsize*sizeof(double)); + memcpy(boundary_delta_coeffs + offset, &pDeltaCoeffs[0], patchsize*sizeof(double)); + memcpy(boundary_face_cell + offset, &pFaceCells[0], patchsize * sizeof(int)); + offset += patchsize; + } + + dfDataBase.createConstantFieldsInternal(); + dfDataBase.createConstantFieldsBoundary(); + dfDataBase.initConstantFieldsInternal(&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.surfaceInterpolation::weights()[0], &mesh.nonOrthDeltaCoeffs()[0], &mesh.V()[0]); + dfDataBase.initConstantFieldsBoundary(boundary_sf, boundary_mag_sf, boundary_delta_coeffs, boundary_face_cell); + + // prepare internal and boundary of Y + dfDataBase.createNonConstantFieldsInternal(); + dfDataBase.createNonConstantFieldsBoundary(); + forAll(Y, speciesI) { + volScalarField& Yi = Y[speciesI]; + memcpy(dfDataBase.h_y + speciesI * num_cells, &Yi[0], num_cells * sizeof(double)); + offset = 0; + forAll(Yi.boundaryField(), patchi) { + const scalarField& patchYi = Yi.boundaryField()[patchi]; + int patchsize = patchYi.size(); + memcpy(dfDataBase.h_boundary_y + speciesI * num_boundary_surfaces + offset, &patchYi[0], patchsize*sizeof(double)); + offset += patchsize; + } + } + dfDataBase.initNonConstantFieldsInternal(dfDataBase.h_y); + dfDataBase.initNonConstantFieldsBoundary(dfDataBase.h_boundary_y); +} + +void createGPUUEqn(const IOdictionary& CanteraTorchProperties, const 
volVectorField& U) { + // prepare mode_string and setting_path + string mode_string = "dDDI"; + string settingPath; + settingPath = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string("")); + UEqn_GPU.setConstantValues(mode_string, settingPath); + + // prepare patch_type + std::vector patch_type; + patch_type.resize(dfDataBase.num_patches); + forAll(U.boundaryField(), patchi) + { + constructBoundarySelectorPerPatch(&(patch_type[patchi]), U.boundaryField()[patchi].type()); + } + UEqn_GPU.setConstantFields(patch_type); + + // prepare internal and boundary of xxx + UEqn_GPU.createNonConstantFieldsInternal(); + UEqn_GPU.createNonConstantFieldsBoundary(); + UEqn_GPU.createNonConstantLduAndCsrFields(); + // UEqn_GPU has no internal non-constant fields to be init + // UEqn_GPU.initNonConstantFieldsInternal(); + UEqn_GPU.initNonConstantFieldsBoundary(); +} diff --git a/applications/solvers/dfLowMachFoam_new/createdfSolver.H b/applications/solvers/dfLowMachFoam_new/createdfSolver.H new file mode 100644 index 000000000..3c5593833 --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/createdfSolver.H @@ -0,0 +1,65 @@ +const labelUList& owner = mesh.owner(); +const labelUList& neighbour = mesh.neighbour(); +int num_cells = mesh.nCells(); +int num_surfaces = neighbour.size(); + +std::vector boundaryCellIndex; +std::vector boundary_face_vector_init; +std::vector boundary_face_init; +std::vector boundary_deltaCoeffs_init; +std::vector> patchTypes; +std::vector patchTypeU, patchTypeY; +int num_boundary_faces = 0; +int patchSize; +forAll(mesh.boundary(), patchi) +{ + labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); + patchSize = sub_boundary.size(); + const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; + const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; + const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; + + boundaryCellIndex.insert(boundaryCellIndex.end(), 
&sub_boundary[0], &sub_boundary[0]+patchSize); + boundary_face_vector_init.insert(boundary_face_vector_init.end(), &pSf[0][0], &pSf[0][0]+3*patchSize); + boundary_face_init.insert(boundary_face_init.end(), &pMagSf[0], &pMagSf[0]+patchSize); + boundary_deltaCoeffs_init.insert(boundary_deltaCoeffs_init.end(), &pDeltaCoeffs[0], &pDeltaCoeffs[0]+patchSize); + num_boundary_faces += patchSize; + + constructBoundarySelector(patchTypeU, U.boundaryField()[patchi].type(), patchSize); + constructBoundarySelector(patchTypeY, Y[0].boundaryField()[patchi].type(), patchSize); +} +patchTypes.emplace_back(patchTypeU); +patchTypes.emplace_back(patchTypeY); + +int num_boundary_cells; + +string settingPath; +settingPath = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string("")); + +#ifdef GPUSolver_ + dfMatrixDataBase dfDataBase(num_surfaces, num_cells, num_boundary_faces, Y.size(), num_boundary_cells, &neighbour[0], &owner[0], &mesh.V()[0], &mesh.surfaceInterpolation::weights()[0], + &mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.nonOrthDeltaCoeffs()[0], boundary_face_vector_init, boundary_face_init, boundary_deltaCoeffs_init, boundaryCellIndex, patchTypes); + dfRhoEqn rhoEqn_GPU(dfDataBase); + dfUEqn UEqn_GPU(dfDataBase, "dDDI", settingPath); + dfYEqn YEqn_GPU(dfDataBase, "dDDI", settingPath, inertIndex); + dfEEqn EEqn_GPU(dfDataBase, "dDDI", settingPath); + + double *ueqn_internalCoeffs_init, *ueqn_boundaryCoeffs_init, *boundary_pressure_init, *boundary_velocity_init, + *boundary_nuEff_init, *boundary_rho_init, *ueqn_laplac_internalCoeffs_init, *ueqn_laplac_boundaryCoeffs_init, *boundary_phi_init; + cudaMallocHost(&ueqn_internalCoeffs_init, 3*num_boundary_faces*sizeof(double)); + cudaMallocHost(&ueqn_boundaryCoeffs_init, 3*num_boundary_faces*sizeof(double)); + cudaMallocHost(&ueqn_laplac_internalCoeffs_init, 3*num_boundary_faces*sizeof(double)); + cudaMallocHost(&ueqn_laplac_boundaryCoeffs_init, 3*num_boundary_faces*sizeof(double)); + 
cudaMallocHost(&boundary_velocity_init, 3*num_boundary_faces*sizeof(double)); + cudaMallocHost(&boundary_pressure_init, num_boundary_faces*sizeof(double)); + cudaMallocHost(&boundary_nuEff_init, num_boundary_faces*sizeof(double)); + cudaMallocHost(&boundary_rho_init, num_boundary_faces*sizeof(double)); + cudaMallocHost(&boundary_phi_init, num_boundary_faces*sizeof(double)); + + double *boundary_alphaEff, *boundary_K, *boundary_gradient; + cudaMallocHost(&boundary_K, num_boundary_faces*sizeof(double)); + cudaMallocHost(&boundary_alphaEff, num_boundary_faces*sizeof(double)); + cudaMallocHost(&boundary_gradient, num_boundary_faces * sizeof(double)); + + bool updateBoundaryFields = true; // make sure that the boundary fields do H2D copy at 1st timestep +#endif diff --git a/applications/solvers/dfLowMachFoam_new/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam_new/dfLowMachFoam.C new file mode 100644 index 000000000..f5b6ec90d --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/dfLowMachFoam.C @@ -0,0 +1,447 @@ +/*---------------------------------------------------------------------------*\ + ========= | + \\ / F ield | OpenFOAM: The Open Source CFD Toolbox + \\ / O peration | Website: https://openfoam.org + \\ / A nd | Copyright (C) 2011-2019 OpenFOAM Foundation + \\/ M anipulation | +------------------------------------------------------------------------------- +License + This file is part of OpenFOAM. + + OpenFOAM is free software: you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OpenFOAM is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. 
+ + You should have received a copy of the GNU General Public License + along with OpenFOAM. If not, see . + +Application + rhoPimpleFoam + +Description + Transient solver for turbulent flow of compressible fluids for HVAC and + similar applications, with optional mesh motion and mesh topology changes. + + Uses the flexible PIMPLE (PISO-SIMPLE) solution for time-resolved and + pseudo-transient simulations. + +\*---------------------------------------------------------------------------*/ + +#include "dfChemistryModel.H" +#include "CanteraMixture.H" +// #include "hePsiThermo.H" +#include "heRhoThermo.H" + +#ifdef USE_PYTORCH +#include +#include +#include //used to convert +#endif + +#ifdef USE_LIBTORCH +#include +#include "DNNInferencer.H" +#endif + +#include "fvCFD.H" +#include "fluidThermo.H" +#include "turbulentFluidThermoModel.H" +#include "pimpleControl.H" +#include "pressureControl.H" +#include "localEulerDdtScheme.H" +#include "fvcSmooth.H" +#include "PstreamGlobals.H" +#include "basicThermo.H" +#include "CombustionModel.H" + +#define GPUSolverNew_ +#define TIME + +#ifdef GPUSolverNew_ +#include "dfUEqn.H" +// #include "dfYEqn.H" +// #include "dfRhoEqn.H" +// #include "dfEEqn.H" +#include "dfMatrixDataBase.H" +#include "dfMatrixOpBase.H" +#include +#include + +#include "createGPUSolver.H" + +#include "upwind.H" +#include "GenFvMatrix.H" +#endif + +#ifdef TIME + #define TICK_START \ + start_new = std::clock(); + #define TICK_STOP(prefix) \ + stop_new = std::clock(); \ + Foam::Info << #prefix << " time = " << double(stop_new - start_new) / double(CLOCKS_PER_SEC) << " s" << Foam::endl; +#else + #define TICK_START + #define TICK_STOP(prefix) +#endif + +// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + +int main(int argc, char *argv[]) +{ +#ifdef USE_PYTORCH + pybind11::scoped_interpreter guard{};//start python interpreter +#endif + #include "postProcess.H" + + // unsigned int flags = 0; + // 
checkCudaErrors(cudaGetDeviceFlags(&flags)); + // flags |= cudaDeviceScheduleYield; + // checkCudaErrors(cudaSetDeviceFlags(flags)); + + // #include "setRootCaseLists.H" + #include "listOptions.H" + #include "setRootCase2.H" + #include "listOutput.H" + + #include "createTime.H" + #include "createMesh.H" + #include "createDyMControls.H" + #include "initContinuityErrs.H" + #include "createFields.H" + #include "createRhoUfIfPresent.H" + + double time_monitor_init = 0; + + double time_monitor_other = 0; + double time_monitor_rho = 0; + double time_monitor_U = 0; + double time_monitor_Y = 0; + double time_monitor_E = 0; + double time_monitor_p = 0; + double time_monitor_chemistry_correctThermo = 0; + double time_monitor_turbulence_correct = 0; + double time_monitor_chem = 0; // combustion correct + + double time_monitor_rhoEqn = 0; + double time_monitor_rhoEqn_mtxAssembly = 0; + double time_monitor_rhoEqn_mtxAssembly_CPU_prepare = 0; + double time_monitor_rhoEqn_mtxAssembly_GPU_run = 0; + double time_monitor_rhoEqn_solve = 0; + double time_monitor_rhoEqn_correctBC = 0; + + double time_monitor_UEqn = 0; + double time_monitor_UEqn_mtxAssembly = 0; + double time_monitor_UEqn_mtxAssembly_CPU_prepare = 0; + double time_monitor_UEqn_mtxAssembly_GPU_run = 0; + double time_monitor_UEqn_solve = 0; + double time_monitor_UEqn_correctBC = 0; + double time_monitor_UEqn_H = 0; + double time_monitor_UEqn_H_GPU_run = 0; + double time_monitor_UEqn_H_correctBC = 0; + double time_monitor_UEqn_A = 0; + double time_monitor_UEqn_A_GPU_run = 0; + double time_monitor_UEqn_A_correctBC = 0; + + double time_monitor_YEqn = 0; + double time_monitor_YEqn_mtxAssembly = 0; + double time_monitor_YEqn_mtxAssembly_CPU_prepare = 0; + double time_monitor_YEqn_mtxAssembly_GPU_run = 0; + double time_monitor_YEqn_solve = 0; + double time_monitor_YEqn_correctBC = 0; + + double time_monitor_EEqn = 0; + double time_monitor_EEqn_mtxAssembly = 0; + double time_monitor_EEqn_mtxAssembly_CPU_prepare = 0; + double 
time_monitor_EEqn_mtxAssembly_GPU_prepare = 0; + double time_monitor_EEqn_mtxAssembly_GPU_run = 0; + double time_monitor_EEqn_solve = 0; + double time_monitor_EEqn_correctBC = 0; + + double time_monitor_pEqn = 0; + double time_monitor_pEqn_solve = 0; + + label timeIndex = 0; + clock_t start, end, start1, end1, start2, end2; + clock_t start_new, stop_new; + double time_new = 0; + + turbulence->validate(); + + if (!LTS) + { + #include "compressibleCourantNo.H" + #include "setInitialDeltaT.H" + } + + start1 = std::clock(); + createGPUBase(mesh, Y); + createGPUUEqn(CanteraTorchProperties, U); + + end1 = std::clock(); + time_monitor_init += double(end1 - start1) / double(CLOCKS_PER_SEC); + + // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // + + Info<< "\nStarting time loop\n" << endl; + + while (runTime.run()) + { + timeIndex ++; + + #include "readDyMControls.H" + + if (LTS) + { + #include "setRDeltaT.H" + } + else + { + #include "compressibleCourantNo.H" + #include "setDeltaT.H" + } + + runTime++; + + Info<< "Time = " << runTime.timeName() << nl << endl; + dfDataBase.preTimeStep(&rho.oldTime()[0]); + clock_t loop_start = std::clock(); + // --- Pressure-velocity PIMPLE corrector loop + while (pimple.loop()) + { + start = std::clock(); + if (splitting) + { + #include "YEqn_RR.H" + } + if (pimple.firstPimpleIter() || moveMeshOuterCorrectors) + { + // Store momentum to set rhoUf for introduced faces. 
+ autoPtr rhoU; + if (rhoUf.valid()) + { + rhoU = new volVectorField("rhoU", rho*U); + } + } + end = std::clock(); + time_monitor_other += double(end - start) / double(CLOCKS_PER_SEC); + + start = std::clock(); + if (pimple.firstPimpleIter() && !pimple.simpleRho()) + { + #include "rhoEqn.H" + } + end = std::clock(); + time_monitor_rho += double(end - start) / double(CLOCKS_PER_SEC); + + start = std::clock(); + #include "UEqn.H" + end = std::clock(); + time_monitor_U += double(end - start) / double(CLOCKS_PER_SEC); + + if(combModelName!="ESF" && combModelName!="flareFGM" && combModelName!="DeePFGM") + { + start = std::clock(); + #include "YEqn.H" + end = std::clock(); + time_monitor_Y += double(end - start) / double(CLOCKS_PER_SEC); + + start = std::clock(); + #include "EEqn.H" + end = std::clock(); + time_monitor_E += double(end - start) / double(CLOCKS_PER_SEC); + + start = std::clock(); + chemistry->correctThermo(); + end = std::clock(); + time_monitor_chemistry_correctThermo += double(end - start) / double(CLOCKS_PER_SEC); + } + else + { + combustion->correct(); + } + + Info<< "min/max(T) = " << min(T).value() << ", " << max(T).value() << endl; + + // --- Pressure corrector loop + + start = std::clock(); + while (pimple.correct()) + { + if (pimple.consistent()) + { + // #include "pcEqn.H" + } + else + { + #include "pEqn.H" + } + } + end = std::clock(); + time_monitor_p += double(end - start) / double(CLOCKS_PER_SEC); + + start = std::clock(); + if (pimple.turbCorr()) + { + turbulence->correct(); + } + end = std::clock(); + time_monitor_turbulence_correct += double(end - start) / double(CLOCKS_PER_SEC); + } + clock_t loop_end = std::clock(); + double loop_time = double(loop_end - loop_start) / double(CLOCKS_PER_SEC); + + rho = thermo.rho(); + + dfDataBase.postTimeStep(); + + runTime.write(); + Info<< "========Time Spent in diffenet parts========"<< endl; + Info<< "loop Time = " << loop_time << " s" << endl; + Info<< "other Time = " << time_monitor_other << " s" 
<< endl; + Info<< "rho Equations = " << time_monitor_rho << " s" << endl; + Info<< "U Equations = " << time_monitor_U << " s" << endl; + Info<< "Y Equations = " << time_monitor_Y - time_monitor_chem << " s" << endl; + Info<< "E Equations = " << time_monitor_E << " s" << endl; + Info<< "p Equations = " << time_monitor_p << " s" << endl; + Info<< "chemistry correctThermo = " << time_monitor_chemistry_correctThermo << " s" << endl; + Info<< "turbulence correct = " << time_monitor_turbulence_correct << " s" << endl; + Info<< "combustion correct(in Y) = " << time_monitor_chem << " s" << endl; + Info<< "percentage of chemistry = " << time_monitor_chem / loop_time * 100 << " %" << endl; + Info<< "percentage of rho/U/Y/E = " << (time_monitor_E + time_monitor_Y + time_monitor_U + time_monitor_rho - time_monitor_chem) / loop_time * 100 << " %" << endl; + + + Info<< "========Time details of each equation======="<< endl; + + Info<< "rhoEqn Time = " << time_monitor_rhoEqn << " s" << endl; + Info<< "rhoEqn assamble = " << time_monitor_rhoEqn_mtxAssembly << " s" << endl; + Info<< "rhoEqn assamble(CPU prepare) = " << time_monitor_rhoEqn_mtxAssembly_CPU_prepare << " s" << endl; + Info<< "rhoEqn assamble(GPU run) = " << time_monitor_rhoEqn_mtxAssembly_GPU_run << " s" << endl; + Info<< "rhoEqn solve = " << time_monitor_rhoEqn_solve << " s" << endl; + Info<< "rhoEqn correct boundary = " << time_monitor_rhoEqn_correctBC << " s" << endl; + + Info<< "UEqn Time = " << time_monitor_UEqn << " s" << endl; + Info<< "UEqn assamble = " << time_monitor_UEqn_mtxAssembly << " s" << endl; + Info<< "UEqn assamble(CPU prepare) = " << time_monitor_UEqn_mtxAssembly_CPU_prepare << " s" << endl; + Info<< "UEqn assamble(GPU run) = " << time_monitor_UEqn_mtxAssembly_GPU_run << " s" << endl; + Info<< "UEqn solve = " << time_monitor_UEqn_solve << " s" << endl; + Info<< "UEqn correct boundary = " << time_monitor_UEqn_correctBC << " s" << endl; + Info<< "UEqn H = " << time_monitor_UEqn_H << " s" << endl; + 
Info<< "UEqn H(GPU run) = " << time_monitor_UEqn_H_GPU_run << " s" << endl; + Info<< "UEqn H(correct boundary) = " << time_monitor_UEqn_H_correctBC << " s" << endl; + Info<< "UEqn A = " << time_monitor_UEqn_A << " s" << endl; + Info<< "UEqn A(GPU run) = " << time_monitor_UEqn_A_GPU_run << " s" << endl; + Info<< "UEqn A(correct boundary) = " << time_monitor_UEqn_A_correctBC << " s" << endl; + + Info<< "YEqn Time = " << time_monitor_YEqn << " s" << endl; + Info<< "YEqn assamble = " << time_monitor_YEqn_mtxAssembly << " s" << endl; + Info<< "YEqn assamble(CPU prepare) = " << time_monitor_YEqn_mtxAssembly_CPU_prepare << " s" << endl; + Info<< "YEqn assamble(GPU run) = " << time_monitor_YEqn_mtxAssembly_GPU_run << " s" << endl; + Info<< "YEqn solve = " << time_monitor_YEqn_solve << " s" << endl; + Info<< "YEqn correct boundary = " << time_monitor_YEqn_correctBC << " s" << endl; + + Info<< "EEqn Time = " << time_monitor_EEqn << " s" << endl; + Info<< "EEqn assamble = " << time_monitor_EEqn_mtxAssembly << " s" << endl; + Info<< "EEqn assamble(CPU prepare) = " << time_monitor_EEqn_mtxAssembly_CPU_prepare << " s" << endl; + Info<< "EEqn assamble(GPU prepare) = " << time_monitor_EEqn_mtxAssembly_GPU_prepare << " s" << endl; + Info<< "EEqn assamble(GPU run) = " << time_monitor_EEqn_mtxAssembly_GPU_run << " s" << endl; + Info<< "EEqn solve = " << time_monitor_EEqn_solve << " s" << endl; + Info<< "EEqn correct boundary = " << time_monitor_EEqn_correctBC << " s" << endl; + + Info<< "pEqn Time = " << time_monitor_pEqn << " s" << endl; + Info<< "pEqn Time solve = " << time_monitor_pEqn_solve << " s" << endl; + + Info<< "============================================"<. + +Global + rhoEqn + +Description + Solve the continuity for density. 
+ +\*---------------------------------------------------------------------------*/ +#ifdef GPUSolver_ +{ + start1 = std::clock(); + rho.oldTime(); + + int offset = 0; + forAll(U.boundaryField(), patchi) + { + const fvsPatchScalarField& patchFlux = phi.boundaryField()[patchi]; + int patchSize = patchFlux.size(); + memcpy(boundary_phi_init+offset, &patchFlux[0], patchSize*sizeof(double)); + offset += patchSize; + } + end1 = std::clock(); + time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_rhoEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_rhoEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); + + start1 = std::clock(); + rhoEqn_GPU.initializeTimeStep(); + rhoEqn_GPU.fvc_div(&phi[0], boundary_phi_init); + rhoEqn_GPU.fvm_ddt(&rho.oldTime()[0]); + rhoEqn_GPU.sync(); + end1 = std::clock(); + time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_rhoEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_rhoEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC); + + start1 = std::clock(); + rhoEqn_GPU.updatePsi(&rho.primitiveFieldRef()[0]); + rho.correctBoundaryConditions(); + end1 = std::clock(); + time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_rhoEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); +} +#else +{ + start1 = std::clock(); + fvScalarMatrix rhoEqn + ( + fvm::ddt(rho) + + fvc::div(phi) + ); + end1 = std::clock(); + time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_rhoEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + + start1 = std::clock(); + rhoEqn.solve(); + end1 = std::clock(); + time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_rhoEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); +} +#endif + +// 
************************************************************************* // diff --git a/applications/solvers/dfLowMachFoam_new/setRDeltaT.H b/applications/solvers/dfLowMachFoam_new/setRDeltaT.H new file mode 100644 index 000000000..074d05e3d --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/setRDeltaT.H @@ -0,0 +1,85 @@ +{ + volScalarField& rDeltaT = trDeltaT.ref(); + + const dictionary& pimpleDict = pimple.dict(); + + scalar maxCo + ( + pimpleDict.lookupOrDefault("maxCo", 0.8) + ); + + scalar rDeltaTSmoothingCoeff + ( + pimpleDict.lookupOrDefault("rDeltaTSmoothingCoeff", 0.02) + ); + + scalar rDeltaTDampingCoeff + ( + pimpleDict.lookupOrDefault("rDeltaTDampingCoeff", 1.0) + ); + + scalar maxDeltaT + ( + pimpleDict.lookupOrDefault("maxDeltaT", great) + ); + + volScalarField rDeltaT0("rDeltaT0", rDeltaT); + + // Set the reciprocal time-step from the local Courant number + rDeltaT.ref() = max + ( + 1/dimensionedScalar(dimTime, maxDeltaT), + fvc::surfaceSum(mag(phi))()() + /((2*maxCo)*mesh.V()*rho()) + ); + + if (pimple.transonic()) + { + surfaceScalarField phid + ( + "phid", + fvc::interpolate(psi)*fvc::flux(U) + ); + + rDeltaT.ref() = max + ( + rDeltaT(), + fvc::surfaceSum(mag(phid))()() + /((2*maxCo)*mesh.V()*psi()) + ); + } + + // Update tho boundary values of the reciprocal time-step + rDeltaT.correctBoundaryConditions(); + + Info<< "Flow time scale min/max = " + << gMin(1/rDeltaT.primitiveField()) + << ", " << gMax(1/rDeltaT.primitiveField()) << endl; + + if (rDeltaTSmoothingCoeff < 1.0) + { + fvc::smooth(rDeltaT, rDeltaTSmoothingCoeff); + } + + Info<< "Smoothed flow time scale min/max = " + << gMin(1/rDeltaT.primitiveField()) + << ", " << gMax(1/rDeltaT.primitiveField()) << endl; + + // Limit rate of change of time scale + // - reduce as much as required + // - only increase at a fraction of old time scale + if + ( + rDeltaTDampingCoeff < 1.0 + && runTime.timeIndex() > runTime.startTimeIndex() + 1 + ) + { + rDeltaT = + rDeltaT0 + 
*max(rDeltaT/rDeltaT0, scalar(1) - rDeltaTDampingCoeff); + + Info<< "Damped flow time scale min/max = " + << gMin(1/rDeltaT.primitiveField()) + << ", " << gMax(1/rDeltaT.primitiveField()) << endl; + } +} diff --git a/applications/solvers/dfLowMachFoam_new/setRootCase2.H b/applications/solvers/dfLowMachFoam_new/setRootCase2.H new file mode 100644 index 000000000..45d966e63 --- /dev/null +++ b/applications/solvers/dfLowMachFoam_new/setRootCase2.H @@ -0,0 +1,5 @@ +Foam::argList args(argc,argv,true,true,/*initialise=*/false); +if (!args.checkRootCase()) +{ + Foam::FatalError.exit(); +} \ No newline at end of file diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu index 8c2c26faf..4e49faf99 100644 --- a/src_gpu/dfMatrixDataBase.cu +++ b/src_gpu/dfMatrixDataBase.cu @@ -132,7 +132,6 @@ void dfMatrixDataBase::setConstantIndexes(const int *owner, const int *neighbor) std::vector lowCSRIndex, uppCSRIndex, diagCSRIndex, CSRRowIndex, CSRColIndex; int uppIndexInCSR = 0, uppIndexInLdu = 0, lowIndexInCSR = 0, lowIndexInLdu = 0, lowNumInLdu = 0; - CSRRowIndex.push_back(0); CSRColIndex.resize(2 * num_surfaces + num_cells); lowCSRIndex.resize(num_surfaces); for (int i = 0; i < num_cells; ++i) { @@ -161,6 +160,7 @@ void dfMatrixDataBase::setConstantIndexes(const int *owner, const int *neighbor) diagCSRIndex.push_back(diagIndexInCSR); CSRColIndex[diagIndexInCSR] = i; // fill diag entry in CSRColIndex } + CSRRowIndex.push_back(2 * num_surfaces + num_cells); checkCudaErrors(cudaMalloc((void**)&d_lower_to_csr_index, surface_index_bytes)); checkCudaErrors(cudaMalloc((void**)&d_diag_to_csr_index, cell_index_bytes)); diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H index f64220186..71dd82c38 100644 --- a/src_gpu/dfMatrixOpBase.H +++ b/src_gpu/dfMatrixOpBase.H @@ -1,20 +1,5 @@ #pragma once - -#define TICK_INIT \ - float time_elapsed_kernel=0;\ - cudaEvent_t start_kernel, stop_kernel;\ - checkCudaErrors(cudaEventCreate(&start_kernel));\ - 
checkCudaErrors(cudaEventCreate(&stop_kernel)); - -#define TICK_START \ - checkCudaErrors(cudaEventRecord(start_kernel,0)); - -#define TICK_END(prefix) \ - checkCudaErrors(cudaEventRecord(stop_kernel,0));\ - checkCudaErrors(cudaEventSynchronize(start_kernel));\ - checkCudaErrors(cudaEventSynchronize(stop_kernel));\ - checkCudaErrors(cudaEventElapsedTime(&time_elapsed_kernel,start_kernel,stop_kernel));\ - printf("try %s 执行时间:%lf(ms)\n", #prefix, time_elapsed_kernel); +// #define TIME_GPU // tools void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output); @@ -26,13 +11,13 @@ void field_multiply_scalar(cudaStream_t stream, void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volume, const double *fvc_output, double *source); -void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, - const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, +void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surface, + const int* boundary_cell_face, const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs, - double *A, double *b); + double *A, double *b, double *diag_vec); -void update_boundary_coeffs_vector(cudaStream_t stream, int num_patches, +void update_boundary_coeffs_vector(cudaStream_t stream, int num_boundary_surfaces, int num_patches, const int *patch_size, const int *patch_type, double *value_internal_coeffs, double *value_boundary_coeffs, double *gradient_internal_coeffs, double *gradient_boundary_coeffs); @@ -43,14 +28,15 @@ void fvm_ddt_vector(cudaStream_t stream, int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *volume, double *diag, double *source, double sign = 1.); -void 
fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, +void fvm_div_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, + const int *lowerAddr, const int *upperAddr, + const double *phi, const double *weight, + double *lower, double *upper, double *diag, // end for internal + int num_patches, const int *patch_size, const int *patch_type, + const double *boundary_phi, const double *value_internal_coeffs, + const double *value_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign = 1.); -void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, +void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma, double *lower, double *upper, double *diag, // end for internal diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu index d4f6ea7f8..e3616fac3 100644 --- a/src_gpu/dfMatrixOpBase.cu +++ b/src_gpu/dfMatrixOpBase.cu @@ -4,6 +4,35 @@ #include #include "cuda_profiler_api.h" +#ifdef TIME_GPU + #define TICK_INIT_EVENT \ + float time_elapsed_kernel=0;\ + cudaEvent_t start_kernel, stop_kernel;\ + checkCudaErrors(cudaEventCreate(&start_kernel));\ + checkCudaErrors(cudaEventCreate(&stop_kernel)); + + #define TICK_START_EVENT \ + checkCudaErrors(cudaEventRecord(start_kernel,0)); + + #define TICK_END_EVENT(prefix) \ + checkCudaErrors(cudaEventRecord(stop_kernel,0));\ + checkCudaErrors(cudaEventSynchronize(start_kernel));\ + checkCudaErrors(cudaEventSynchronize(stop_kernel));\ + checkCudaErrors(cudaEventElapsedTime(&time_elapsed_kernel,start_kernel,stop_kernel));\ + printf("try %s 执行时间:%lf(ms)\n", #prefix, time_elapsed_kernel); +#else + #define TICK_INIT_EVENT + #define TICK_START_EVENT + #define TICK_END_EVENT(prefix) +#endif + +__global__ void warmup(int num_cells) +{ + int index = blockDim.x * blockIdx.x + 
threadIdx.x; + if (index >= num_cells) + return; +} + __global__ void permute_vector_d2h_kernel(int num_cells, const double *input, double *output) { int index = blockDim.x * blockIdx.x + threadIdx.x; @@ -53,7 +82,7 @@ __global__ void fvc_to_source_vector_kernel(int num_cells, const double *volume, source[index * 3 + 2] += fvc_output[index * 3 + 2]; } -__global__ void update_boundary_coeffs_zeroGradient_vector(int num, int offset, +__global__ void update_boundary_coeffs_zeroGradient_vector(int num_boundary_surfaces, int num, int offset, double *value_internal_coeffs, double *value_boundary_coeffs, double *gradient_internal_coeffs, double *gradient_boundary_coeffs) { @@ -66,18 +95,18 @@ __global__ void update_boundary_coeffs_zeroGradient_vector(int num, int offset, // valueBoundaryCoeffs = 0 // gradientInternalCoeffs = 0 // gradientBoundaryCoeffs = 0 - value_internal_coeffs[start_index * 3 + 0] = 1; - value_internal_coeffs[start_index * 3 + 1] = 1; - value_internal_coeffs[start_index * 3 + 2] = 1; - value_boundary_coeffs[start_index * 3 + 0] = 0; - value_boundary_coeffs[start_index * 3 + 1] = 0; - value_boundary_coeffs[start_index * 3 + 2] = 0; - gradient_internal_coeffs[start_index * 3 + 0] = 0; - gradient_internal_coeffs[start_index * 3 + 1] = 0; - gradient_internal_coeffs[start_index * 3 + 2] = 0; - gradient_boundary_coeffs[start_index * 3 + 0] = 0; - gradient_boundary_coeffs[start_index * 3 + 1] = 0; - gradient_boundary_coeffs[start_index * 3 + 2] = 0; + value_internal_coeffs[num_boundary_surfaces * 0 + start_index] = 1; + value_internal_coeffs[num_boundary_surfaces * 1 + start_index] = 1; + value_internal_coeffs[num_boundary_surfaces * 2 + start_index] = 1; + value_boundary_coeffs[num_boundary_surfaces * 0 + start_index] = 0; + value_boundary_coeffs[num_boundary_surfaces * 1 + start_index] = 0; + value_boundary_coeffs[num_boundary_surfaces * 2 + start_index] = 0; + gradient_internal_coeffs[num_boundary_surfaces * 0 + start_index] = 0; + 
gradient_internal_coeffs[num_boundary_surfaces * 1 + start_index] = 0; + gradient_internal_coeffs[num_boundary_surfaces * 2 + start_index] = 0; + gradient_boundary_coeffs[num_boundary_surfaces * 0 + start_index] = 0; + gradient_boundary_coeffs[num_boundary_surfaces * 1 + start_index] = 0; + gradient_boundary_coeffs[num_boundary_surfaces * 2 + start_index] = 0; } __global__ void scale_dev2t_tensor_kernel(int num, const double *vf1, double *vf2) @@ -128,9 +157,9 @@ __global__ void fvm_ddt_vector_kernel(int num_cells, double rDeltaT, diag[index] += rDeltaT * rho[index] * vol * sign; // TODO: skip moving - source[index * 3 + 0] += rDeltaT * rho_old_kernel * vf[index * 3 + 0] * vol * sign; - source[index * 3 + 1] += rDeltaT * rho_old_kernel * vf[index * 3 + 1] * vol * sign; - source[index * 3 + 2] += rDeltaT * rho_old_kernel * vf[index * 3 + 2] * vol * sign; + source[num_cells * 0 + index] += rDeltaT * rho_old_kernel * vf[num_cells * 0 + index] * vol * sign; + source[num_cells * 1 + index] += rDeltaT * rho_old_kernel * vf[num_cells * 1 + index] * vol * sign; + source[num_cells * 2 + index] += rDeltaT * rho_old_kernel * vf[num_cells * 2 + index] * vol * sign; } __global__ void fvm_div_vector_internal(int num_surfaces, @@ -157,7 +186,8 @@ __global__ void fvm_div_vector_internal(int num_surfaces, atomicAdd(&(diag[neighbor]), -upper_value); } -__global__ void fvm_div_vector_boundary(int num, int offset, +// TODO: modify the data structure of internal and boundary coeffs +__global__ void fvm_div_vector_boundary(int num_boundary_surfaces, int num, int offset, const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) { @@ -167,12 +197,12 @@ __global__ void fvm_div_vector_boundary(int num, int offset, int start_index = offset + index; double boundary_f = boundary_phi[start_index]; - internal_coeffs[start_index * 3 + 0] += boundary_f * value_internal_coeffs[start_index * 3 
+ 0] * sign; - internal_coeffs[start_index * 3 + 1] += boundary_f * value_internal_coeffs[start_index * 3 + 1] * sign; - internal_coeffs[start_index * 3 + 2] += boundary_f * value_internal_coeffs[start_index * 3 + 2] * sign; - boundary_coeffs[start_index * 3 + 0] += boundary_f * value_boundary_coeffs[start_index * 3 + 0] * sign; - boundary_coeffs[start_index * 3 + 1] += boundary_f * value_boundary_coeffs[start_index * 3 + 1] * sign; - boundary_coeffs[start_index * 3 + 2] += boundary_f * value_boundary_coeffs[start_index * 3 + 2] * sign; + internal_coeffs[num_boundary_surfaces * 0 + start_index] += boundary_f * value_internal_coeffs[num_boundary_surfaces * 0 + start_index] * sign; + internal_coeffs[num_boundary_surfaces * 1 + start_index] += boundary_f * value_internal_coeffs[num_boundary_surfaces * 1 + start_index] * sign; + internal_coeffs[num_boundary_surfaces * 2 + start_index] += boundary_f * value_internal_coeffs[num_boundary_surfaces * 2 + start_index] * sign; + boundary_coeffs[num_boundary_surfaces * 0 + start_index] += boundary_f * value_boundary_coeffs[num_boundary_surfaces * 0 + start_index] * sign; + boundary_coeffs[num_boundary_surfaces * 1 + start_index] += boundary_f * value_boundary_coeffs[num_boundary_surfaces * 1 + start_index] * sign; + boundary_coeffs[num_boundary_surfaces * 2 + start_index] += boundary_f * value_boundary_coeffs[num_boundary_surfaces * 2 + start_index] * sign; } __global__ void fvm_laplacian_vector_internal(int num_surfaces, @@ -206,7 +236,7 @@ __global__ void fvm_laplacian_vector_internal(int num_surfaces, atomicAdd(&(diag[neighbor]), -upper_value); } -__global__ void fvm_laplacian_vector_boundary(int num, int offset, +__global__ void fvm_laplacian_vector_boundary(int num_boundary_surfaces, int num, int offset, const double *boundary_mag_sf, const double *boundary_gamma, const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) @@ -217,12 
+247,12 @@ __global__ void fvm_laplacian_vector_boundary(int num, int offset, int start_index = offset + index; double boundary_value = boundary_gamma[start_index] * boundary_mag_sf[start_index]; - internal_coeffs[start_index * 3 + 0] += boundary_value * gradient_internal_coeffs[start_index * 3 + 0] * sign; - internal_coeffs[start_index * 3 + 1] += boundary_value * gradient_internal_coeffs[start_index * 3 + 1] * sign; - internal_coeffs[start_index * 3 + 2] += boundary_value * gradient_internal_coeffs[start_index * 3 + 2] * sign; - boundary_coeffs[start_index * 3 + 0] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 0] * sign; - boundary_coeffs[start_index * 3 + 1] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 1] * sign; - boundary_coeffs[start_index * 3 + 2] += boundary_value * gradient_boundary_coeffs[start_index * 3 + 2] * sign; + internal_coeffs[num_boundary_surfaces * 0 + start_index] += boundary_value * gradient_internal_coeffs[num_boundary_surfaces * 0 + start_index] * sign; + internal_coeffs[num_boundary_surfaces * 1 + start_index] += boundary_value * gradient_internal_coeffs[num_boundary_surfaces * 1 + start_index] * sign; + internal_coeffs[num_boundary_surfaces * 2 + start_index] += boundary_value * gradient_internal_coeffs[num_boundary_surfaces * 2 + start_index] * sign; + boundary_coeffs[num_boundary_surfaces * 0 + start_index] += boundary_value * gradient_boundary_coeffs[num_boundary_surfaces * 0 + start_index] * sign; + boundary_coeffs[num_boundary_surfaces * 1 + start_index] += boundary_value * gradient_boundary_coeffs[num_boundary_surfaces * 1 + start_index] * sign; + boundary_coeffs[num_boundary_surfaces * 2 + start_index] += boundary_value * gradient_boundary_coeffs[num_boundary_surfaces * 2 + start_index] * sign; } __global__ void fvc_ddt_scalar_kernel(int num_cells, double rDeltaT, @@ -277,9 +307,9 @@ __global__ void fvc_grad_vector_internal(int num_cells, int num_surfaces, int owner = lower_index[index]; int 
neighbor = upper_index[index]; - double ssfx = (w * (field_vector[owner * 3 + 0] - field_vector[neighbor * 3 + 0]) + field_vector[neighbor * 3 + 0]); - double ssfy = (w * (field_vector[owner * 3 + 1] - field_vector[neighbor * 3 + 1]) + field_vector[neighbor * 3 + 1]); - double ssfz = (w * (field_vector[owner * 3 + 2] - field_vector[neighbor * 3 + 2]) + field_vector[neighbor * 3 + 2]); + double ssfx = (w * (field_vector[num_cells * 0 + owner] - field_vector[num_cells * 0 + neighbor]) + field_vector[num_cells * 0 + neighbor]); + double ssfy = (w * (field_vector[num_cells * 1 + owner] - field_vector[num_cells * 1 + neighbor]) + field_vector[num_cells * 1 + neighbor]); + double ssfz = (w * (field_vector[num_cells * 2 + owner] - field_vector[num_cells * 2 + neighbor]) + field_vector[num_cells * 2 + neighbor]); double grad_xx = Sfx * ssfx; double grad_xy = Sfx * ssfy; @@ -291,26 +321,45 @@ __global__ void fvc_grad_vector_internal(int num_cells, int num_surfaces, double grad_zy = Sfz * ssfy; double grad_zz = Sfz * ssfz; - // owner - atomicAdd(&(output[num_cells * 0 + owner]), grad_xx); - atomicAdd(&(output[num_cells * 1 + owner]), grad_xy); - atomicAdd(&(output[num_cells * 2 + owner]), grad_xz); - atomicAdd(&(output[num_cells * 3 + owner]), grad_yx); - atomicAdd(&(output[num_cells * 4 + owner]), grad_yy); - atomicAdd(&(output[num_cells * 5 + owner]), grad_yz); - atomicAdd(&(output[num_cells * 6 + owner]), grad_zx); - atomicAdd(&(output[num_cells * 7 + owner]), grad_zy); - atomicAdd(&(output[num_cells * 8 + owner]), grad_zz); + // // owner + // atomicAdd(&(output[num_cells * 0 + owner]), grad_xx); + // atomicAdd(&(output[num_cells * 1 + owner]), grad_xy); + // atomicAdd(&(output[num_cells * 2 + owner]), grad_xz); + // atomicAdd(&(output[num_cells * 3 + owner]), grad_yx); + // atomicAdd(&(output[num_cells * 4 + owner]), grad_yy); + // atomicAdd(&(output[num_cells * 5 + owner]), grad_yz); + // atomicAdd(&(output[num_cells * 6 + owner]), grad_zx); + // 
atomicAdd(&(output[num_cells * 7 + owner]), grad_zy); + // atomicAdd(&(output[num_cells * 8 + owner]), grad_zz); - // neighbour + // // neighbour + // atomicAdd(&(output[num_cells * 0 + neighbor]), -grad_xx); + // atomicAdd(&(output[num_cells * 1 + neighbor]), -grad_xy); + // atomicAdd(&(output[num_cells * 2 + neighbor]), -grad_xz); + // atomicAdd(&(output[num_cells * 3 + neighbor]), -grad_yx); + // atomicAdd(&(output[num_cells * 4 + neighbor]), -grad_yy); + // atomicAdd(&(output[num_cells * 5 + neighbor]), -grad_yz); + // atomicAdd(&(output[num_cells * 6 + neighbor]), -grad_zx); + // atomicAdd(&(output[num_cells * 7 + neighbor]), -grad_zy); + // atomicAdd(&(output[num_cells * 8 + neighbor]), -grad_zz); + + atomicAdd(&(output[num_cells * 0 + owner]), grad_xx); atomicAdd(&(output[num_cells * 0 + neighbor]), -grad_xx); + atomicAdd(&(output[num_cells * 1 + owner]), grad_xy); atomicAdd(&(output[num_cells * 1 + neighbor]), -grad_xy); + atomicAdd(&(output[num_cells * 2 + owner]), grad_xz); atomicAdd(&(output[num_cells * 2 + neighbor]), -grad_xz); + atomicAdd(&(output[num_cells * 3 + owner]), grad_yx); atomicAdd(&(output[num_cells * 3 + neighbor]), -grad_yx); + atomicAdd(&(output[num_cells * 4 + owner]), grad_yy); atomicAdd(&(output[num_cells * 4 + neighbor]), -grad_yy); + atomicAdd(&(output[num_cells * 5 + owner]), grad_yz); atomicAdd(&(output[num_cells * 5 + neighbor]), -grad_yz); + atomicAdd(&(output[num_cells * 6 + owner]), grad_zx); atomicAdd(&(output[num_cells * 6 + neighbor]), -grad_zx); + atomicAdd(&(output[num_cells * 7 + owner]), grad_zy); atomicAdd(&(output[num_cells * 7 + neighbor]), -grad_zy); + atomicAdd(&(output[num_cells * 8 + owner]), grad_zz); atomicAdd(&(output[num_cells * 8 + neighbor]), -grad_zz); } @@ -381,25 +430,15 @@ __global__ void fvc_grad_scalar_internal(int num_cells, int num_surfaces, double grad_y = Sfy * ssf * sign; double grad_z = Sfz * ssf * sign; - // // owner - // atomicAdd(&(output[num_cells * 0 + owner]), grad_x); - // 
atomicAdd(&(output[num_cells * 1 + owner]), grad_y); - // atomicAdd(&(output[num_cells * 2 + owner]), grad_z); - - // // neighbour - // atomicAdd(&(output[num_cells * 0 + neighbor]), -grad_x); - // atomicAdd(&(output[num_cells * 1 + neighbor]), -grad_y); - // atomicAdd(&(output[num_cells * 2 + neighbor]), -grad_z); - // owner - atomicAdd(&(output[owner * 3 + 0]), grad_x); - atomicAdd(&(output[owner * 3 + 1]), grad_y); - atomicAdd(&(output[owner * 3 + 2]), grad_z); + atomicAdd(&(output[num_cells * 0 + owner]), grad_x); + atomicAdd(&(output[num_cells * 1 + owner]), grad_y); + atomicAdd(&(output[num_cells * 2 + owner]), grad_z); // neighbour - atomicAdd(&(output[neighbor * 3 + 0]), -grad_x); - atomicAdd(&(output[neighbor * 3 + 1]), -grad_y); - atomicAdd(&(output[neighbor * 3 + 2]), -grad_z); + atomicAdd(&(output[num_cells * 0 + neighbor]), -grad_x); + atomicAdd(&(output[num_cells * 1 + neighbor]), -grad_y); + atomicAdd(&(output[num_cells * 2 + neighbor]), -grad_z); } @@ -423,9 +462,9 @@ __global__ void fvc_grad_scalar_boundary(int num_cells, int num, int offset, con double grad_y = bouSfy * bouvf; double grad_z = bouSfz * bouvf; - atomicAdd(&(output[cellIndex * 3 + 0]), grad_x * sign); - atomicAdd(&(output[cellIndex * 3 + 1]), grad_y * sign); - atomicAdd(&(output[cellIndex * 3 + 2]), grad_z * sign); + atomicAdd(&(output[num_cells * 0 + cellIndex]), grad_x * sign); + atomicAdd(&(output[num_cells * 1 + cellIndex]), grad_y * sign); + atomicAdd(&(output[num_cells * 2 + cellIndex]), grad_z * sign); // if (cellIndex == 5) // { @@ -690,19 +729,17 @@ __global__ void fvc_div_cell_tensor_internal(int num_cells, int num_surfaces, double div_z = (Sfx * ssf_xz + Sfy * ssf_yz + Sfz * ssf_zz) * sign; // owner - atomicAdd(&(output[owner * 3 + 0]), div_x); - atomicAdd(&(output[owner * 3 + 1]), div_y); - atomicAdd(&(output[owner * 3 + 2]), div_z); + atomicAdd(&(output[num_cells * 0 + owner]), div_x); + atomicAdd(&(output[num_cells * 1 + owner]), div_y); + atomicAdd(&(output[num_cells * 
2 + owner]), div_z); // neighbour - atomicAdd(&(output[neighbor * 3 + 0]), -div_x); - atomicAdd(&(output[neighbor * 3 + 1]), -div_y); - atomicAdd(&(output[neighbor * 3 + 2]), -div_z); - - + atomicAdd(&(output[num_cells * 0 + neighbor]), -div_x); + atomicAdd(&(output[num_cells * 1 + neighbor]), -div_y); + atomicAdd(&(output[num_cells * 2 + neighbor]), -div_z); } -__global__ void fvc_div_cell_tensor_boundary(int num_boundary_faces, int num, int offset, const int *face2Cells, +__global__ void fvc_div_cell_tensor_boundary(int num_cells, int num_boundary_faces, int num, int offset, const int *face2Cells, const double *boundary_face_vector, const double *boundary_vf, double *output, double sign) { int index = blockDim.x * blockIdx.x + threadIdx.x; @@ -730,9 +767,9 @@ __global__ void fvc_div_cell_tensor_boundary(int num_boundary_faces, int num, in double bouDiv_y = (bouSfx * boussf_xy + bouSfy * boussf_yy + bouSfz * boussf_zy) * sign; double bouDiv_z = (bouSfx * boussf_xz + bouSfy * boussf_yz + bouSfz * boussf_zz) * sign; - atomicAdd(&(output[cellIndex * 3 + 0]), bouDiv_x); - atomicAdd(&(output[cellIndex * 3 + 1]), bouDiv_y); - atomicAdd(&(output[cellIndex * 3 + 2]), bouDiv_z); + atomicAdd(&(output[num_cells * 0 + cellIndex]), bouDiv_x); + atomicAdd(&(output[num_cells * 1 + cellIndex]), bouDiv_y); + atomicAdd(&(output[num_cells * 2 + cellIndex]), bouDiv_z); // if (cellIndex == 0) // { @@ -752,6 +789,85 @@ __global__ void fvc_div_cell_tensor_boundary(int num_boundary_faces, int num, in // } } +__global__ void constructVecDiag(int num_cells, const double *diag, double *diag_vec, + const double *source, double *b) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + diag_vec[num_cells * 0 + index] = diag[index]; + diag_vec[num_cells * 1 + index] = diag[index]; + diag_vec[num_cells * 2 + index] = diag[index]; + + b[num_cells * 0 + index] = source[num_cells * 0 + index]; + b[num_cells * 1 + index] = source[num_cells * 1 + index]; + 
b[num_cells * 2 + index] = source[num_cells * 2 + index]; +} + +__global__ void addBoundaryDiagSrc(int num_cells, int num_boundary_surfaces, const int *face2Cells, + const double *internal_coeffs, const double *boundary_coeffs, double *diag_vec, double *b) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_surfaces) + return; + + int cellIndex = face2Cells[index]; + + double internalCoeffx = internal_coeffs[num_boundary_surfaces * 0 + index]; + double internalCoeffy = internal_coeffs[num_boundary_surfaces * 1 + index]; + double internalCoeffz = internal_coeffs[num_boundary_surfaces * 2 + index]; + + double boundaryCoeffx = boundary_coeffs[num_boundary_surfaces * 0 + index]; + double boundaryCoeffy = boundary_coeffs[num_boundary_surfaces * 1 + index]; + double boundaryCoeffz = boundary_coeffs[num_boundary_surfaces * 2 + index]; + + atomicAdd(&diag_vec[num_cells * 0 + cellIndex], internalCoeffx); + atomicAdd(&diag_vec[num_cells * 1 + cellIndex], internalCoeffy); + atomicAdd(&diag_vec[num_cells * 2 + cellIndex], internalCoeffz); + + atomicAdd(&b[num_cells * 0 + cellIndex], boundaryCoeffx); + atomicAdd(&b[num_cells * 1 + cellIndex], boundaryCoeffy); + atomicAdd(&b[num_cells * 2 + cellIndex], boundaryCoeffz); +} + +__global__ void ldu_to_csr_offDiag(int num_cells, int num_surfaces, + const int *lowCSRIndex, const int *uppCSRIndex, + const double *lower, const double *upper, + double *A_csr) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_surfaces) + return; + + int uppIndex = uppCSRIndex[index]; + int lowIndex = lowCSRIndex[index]; + double upp = upper[index]; + double low = lower[index]; + A_csr[(num_cells + 2 * num_surfaces) * 0 + uppIndex] = upper[index]; + A_csr[(num_cells + 2 * num_surfaces) * 1 + uppIndex] = upper[index]; + A_csr[(num_cells + 2 * num_surfaces) * 2 + uppIndex] = upper[index]; + + A_csr[(num_cells + 2 * num_surfaces) * 0 + lowIndex] = lower[index]; + A_csr[(num_cells + 2 * num_surfaces) * 1 + 
lowIndex] = lower[index]; + A_csr[(num_cells + 2 * num_surfaces) * 2 + lowIndex] = lower[index]; +} + +__global__ void ldu_to_csr_Diag(int num_cells, int num_surfaces, + const int *diagCSRIndex, const double *diag_vec, + double *A_csr) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_cells) + return; + + int diagIndex = diagCSRIndex[index]; + A_csr[(num_cells + 2 * num_surfaces) * 0 + diagIndex] = diag_vec[num_cells * 0 + index]; + A_csr[(num_cells + 2 * num_surfaces) * 1 + diagIndex] = diag_vec[num_cells * 1 + index]; + A_csr[(num_cells + 2 * num_surfaces) * 2 + diagIndex] = diag_vec[num_cells * 2 + index]; +} + + void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output) { size_t threads_per_block = 256; @@ -770,13 +886,13 @@ void field_multiply_scalar(cudaStream_t stream, int num_cells, const double *input1, const double *input2, double *output, int num_boundary_surfaces, const double *boundary_input1, const double *boundary_input2, double *boundary_output) { - TICK_INIT; + TICK_INIT_EVENT; size_t threads_per_block = 256; size_t blocks_per_grid = (std::max(num_cells, num_boundary_surfaces) + threads_per_block - 1) / threads_per_block; - TICK_START; + TICK_START_EVENT; field_multiply_scalar_kernel<<>>(num_cells, num_boundary_surfaces, input1, input2, output, boundary_input1, boundary_input2, boundary_output); - TICK_END(field_multiply_scalar_kernel); + TICK_END_EVENT(field_multiply_scalar_kernel); } void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volume, const double *fvc_output, double *source) @@ -787,16 +903,35 @@ void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volu volume, fvc_output, source); } -void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, - const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, +void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int 
num_boundary_surface, + const int* boundary_cell_face, const int *lower_to_csr_index, const int *upper_to_csr_index, const int *diag_to_csr_index, const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs, - double *A, double *b) + double *A, double *b, double *diag_vec) { + // construct new diag with size of 3*num_cells + size_t threads_per_block = 1024; + size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + constructVecDiag<<>>(num_cells, diag, diag_vec, source, b); + + // add coeff to source and diagnal + blocks_per_grid = (num_boundary_surface + threads_per_block - 1) / threads_per_block; + addBoundaryDiagSrc<<>>(num_cells, num_boundary_surface, + boundary_cell_face, internal_coeffs, boundary_coeffs, diag_vec, b); + + // convert offdiag + blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + ldu_to_csr_offDiag<<>>(num_cells, num_surfaces, + lower_to_csr_index, upper_to_csr_index, lower, upper, A); + + // convert diag + blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; + ldu_to_csr_Diag<<>>(num_cells, num_surfaces, + diag_to_csr_index, diag_vec, A); } -void update_boundary_coeffs_vector(cudaStream_t stream, int num_patches, +void update_boundary_coeffs_vector(cudaStream_t stream, int num_boundary_surfaces, int num_patches, const int *patch_size, const int *patch_type, double *value_internal_coeffs, double *value_boundary_coeffs, double *gradient_internal_coeffs, double *gradient_boundary_coeffs) @@ -811,7 +946,7 @@ void update_boundary_coeffs_vector(cudaStream_t stream, int num_patches, // TODO: just basic patch type now // TODO: just vector version now if (patch_type[i] == boundaryConditions::zeroGradient) { - update_boundary_coeffs_zeroGradient_vector<<>>(patch_size[i], offset, + update_boundary_coeffs_zeroGradient_vector<<>>(num_boundary_surfaces, patch_size[i], offset, value_internal_coeffs, 
value_boundary_coeffs, gradient_internal_coeffs, gradient_boundary_coeffs); } else if (0) { // xxx @@ -825,32 +960,41 @@ void fvm_ddt_vector(cudaStream_t stream, int num_cells, double rDeltaT, const double *rho, const double *rho_old, const double *vf, const double *volume, double *diag, double *source, double sign) { +#ifdef TIME_GPU printf("#############kernel profile#############\n"); - TICK_INIT; - size_t threads_per_block = 1024; +#endif + TICK_INIT_EVENT; + size_t threads_per_block = 64; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START; +#ifdef TIME_GPU + printf("warm up ...\n"); + warmup<<>>(num_cells); +#endif + TICK_START_EVENT; fvm_ddt_vector_kernel<<>>(num_cells, rDeltaT, rho, rho_old, vf, volume, diag, source, sign); - TICK_END(fvm_ddt_vector_kernel); + TICK_END_EVENT(fvm_ddt_vector_kernel); } -void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr, +void fvm_div_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, + const int *lowerAddr, const int *upperAddr, const double *phi, const double *weight, double *lower, double *upper, double *diag, // end for internal int num_patches, const int *patch_size, const int *patch_type, const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) { - TICK_INIT; - size_t threads_per_block = 1024; - size_t blocks_per_grid = 1; - - blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START; + TICK_INIT_EVENT; + size_t threads_per_block = 256; + size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; +#ifdef TIME_GPU + printf("warm up ...\n"); + warmup<<>>(num_surfaces); +#endif + TICK_START_EVENT; fvm_div_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, phi, weight, lower, upper, diag, sign); - TICK_END(fvm_div_vector_internal); + 
TICK_END_EVENT(fvm_div_vector_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -860,11 +1004,11 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { // TODO: just vector version now - TICK_START; - fvm_div_vector_boundary<<>>(patch_size[i], offset, + TICK_START_EVENT; + fvm_div_vector_boundary<<>>(num_boundary_surfaces, patch_size[i], offset, boundary_phi, value_internal_coeffs, value_boundary_coeffs, internal_coeffs, boundary_coeffs, sign); - TICK_END(fvm_div_vector_boundary); + TICK_END_EVENT(fvm_div_vector_boundary); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); @@ -873,7 +1017,7 @@ void fvm_div_vector(cudaStream_t stream, int num_surfaces, const int *lowerAddr, } } -void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, +void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, const int *lowerAddr, const int *upperAddr, const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma, double *lower, double *upper, double *diag, // end for internal @@ -882,15 +1026,13 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs, double *internal_coeffs, double *boundary_coeffs, double sign) { - TICK_INIT; + TICK_INIT_EVENT; size_t threads_per_block = 1024; - size_t blocks_per_grid = 1; - - blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START; + size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; + TICK_START_EVENT; fvm_laplacian_vector_internal<<>>(num_surfaces, lowerAddr, upperAddr, weight, mag_sf, delta_coeffs, gamma, lower, upper, diag, sign); - TICK_END(fvm_laplacian_vector_internal); + 
TICK_END_EVENT(fvm_laplacian_vector_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { threads_per_block = 64; @@ -899,11 +1041,11 @@ void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { // TODO: just vector version now - TICK_START; - fvm_laplacian_vector_boundary<<>>(patch_size[i], offset, + TICK_START_EVENT; + fvm_laplacian_vector_boundary<<>>(num_boundary_surfaces, patch_size[i], offset, boundary_mag_sf, boundary_gamma, gradient_internal_coeffs, gradient_boundary_coeffs, internal_coeffs, boundary_coeffs, sign); - TICK_END(fvm_laplacian_vector_boundary); + TICK_END_EVENT(fvm_laplacian_vector_boundary); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); @@ -933,13 +1075,13 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, int n const double *boundary_deltaCoeffs, double sign) { checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 9 * sizeof(double), stream)); - TICK_INIT; - size_t threads_per_block = 1024; + TICK_INIT_EVENT; + size_t threads_per_block = 32; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START; + TICK_START_EVENT; fvc_grad_vector_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output); - TICK_END(fvc_grad_vector_internal); + TICK_END_EVENT(fvc_grad_vector_internal); int offset = 0; // finish conctruct grad field except dividing cell volume @@ -950,11 +1092,11 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, int n if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { // TODO: just vector version now - TICK_START; + TICK_START_EVENT; fvc_grad_vector_boundary<<>>(num_cells, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output); - 
TICK_END(fvc_grad_vector_boundary); + TICK_END_EVENT(fvc_grad_vector_boundary); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); @@ -965,9 +1107,9 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, int n // divide cell volume threads_per_block = 512; blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START; + TICK_START_EVENT; divide_cell_volume_tsr<<>>(num_cells, volume, output, sign); - TICK_END(divide_cell_volume_tsr); + TICK_END_EVENT(divide_cell_volume_tsr); // correct boundary conditions offset = 0; @@ -977,11 +1119,11 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, int n // TODO: just basic patch type now if (patch_type[i] == boundaryConditions::zeroGradient) { // TODO: just vector version now - TICK_START; + TICK_START_EVENT; fvc_grad_vector_correctBC_zeroGradient<<>>(num_cells, num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, output, boundary_vf, boundary_Sf, boundary_mag_Sf, boundary_output, sign); - TICK_END(fvc_grad_vector_correctBC_zeroGradient); + TICK_END_EVENT(fvc_grad_vector_correctBC_zeroGradient); } else if (patch_type[i] == boundaryConditions::fixedValue) { // TODO: implement fixedValue version fvc_grad_vector_correctBC_fixedValue<<>>(patch_size[i], offset, boundary_cell_face, @@ -997,12 +1139,12 @@ void fvc_grad_vector(cudaStream_t stream, int num_cells, int num_surfaces, int n void scale_dev2T_tensor(cudaStream_t stream, int num_cells, const double *vf1, double *vf2, int num_boundary_surfaces, const double *boundary_vf1, double *boundary_vf2) { - TICK_INIT; + TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; - TICK_START; + TICK_START_EVENT; scale_dev2t_tensor_kernel<<>>(num_cells, vf1, vf2); - TICK_END(scale_dev2t_tensor_kernel); + TICK_END_EVENT(scale_dev2t_tensor_kernel); blocks_per_grid = 
(num_boundary_surfaces + threads_per_block - 1) / threads_per_block; scale_dev2t_tensor_kernel<<>>(num_boundary_surfaces, boundary_vf1, boundary_vf2); @@ -1073,12 +1215,12 @@ void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, i const double *volume, double sign) { // checkCudaErrors(cudaMemsetAsync(output, 0, num_cells * 3 * sizeof(double), stream)); - TICK_INIT; + TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - TICK_START; + TICK_START_EVENT; fvc_div_cell_tensor_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output, sign); - TICK_END(fvc_div_cell_tensor_internal); + TICK_END_EVENT(fvc_div_cell_tensor_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -1088,10 +1230,10 @@ void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, i if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { // TODO: just vector version now - TICK_START; - fvc_div_cell_tensor_boundary<<>>(num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, + TICK_START_EVENT; + fvc_div_cell_tensor_boundary<<>>(num_cells, num_boundary_surfaces, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output, sign); - TICK_END(fvc_div_cell_tensor_boundary); + TICK_END_EVENT(fvc_div_cell_tensor_boundary); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); @@ -1111,13 +1253,13 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_patches, const int *patch_size, const int *patch_type, const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign) { - TICK_INIT; + TICK_INIT_EVENT; size_t threads_per_block = 1024; size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; - 
TICK_START; + TICK_START_EVENT; fvc_grad_scalar_internal<<>>(num_cells, num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output, sign); - TICK_END(fvc_grad_scalar_internal); + TICK_END_EVENT(fvc_grad_scalar_internal); int offset = 0; for (int i = 0; i < num_patches; i++) { @@ -1126,10 +1268,10 @@ void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, // TODO: just non-coupled patch type now if (patch_type[i] == boundaryConditions::zeroGradient || patch_type[i] == boundaryConditions::fixedValue) { - TICK_START; + TICK_START_EVENT; fvc_grad_scalar_boundary<<>>(num_cells, patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output, sign); - TICK_END(fvc_grad_scalar_internal); + TICK_END_EVENT(fvc_grad_scalar_boundary); } else if (0) { // xxx fprintf(stderr, "boundaryConditions other than zeroGradient are not support yet!\n"); diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index 80cdc7144..49edc1b7a 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -41,7 +41,7 @@ private: double *d_grad_u = nullptr; double *d_rho_nueff = nullptr; double *d_permute = nullptr; - double *d_fvc_output = nullptr; + double *d_fvc_output = nullptr; // TODO: no need anymore // non-constant fields - boundary // thermophysical fields @@ -64,10 +64,11 @@ private: double *d_source = nullptr; double *d_internal_coeffs = nullptr; double *d_boundary_coeffs = nullptr; + double *d_diag_vector = nullptr; // non-constant fields - csr double *d_A = nullptr; - double *d_b = nullptr; + double *d_b = nullptr; // TODO: needless // field pointer map std::unordered_map fieldPointerMap; diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 73b7516c5..d30c06131 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -50,6 +50,7 @@ void dfUEqn::createNonConstantLduAndCsrFields() { checkCudaErrors(cudaMalloc((void**)&d_upper, dataBase_.surface_value_bytes)); checkCudaErrors(cudaMalloc((void**)&d_diag, dataBase_.cell_value_bytes)); 
checkCudaErrors(cudaMalloc((void**)&d_source, dataBase_.cell_value_vec_bytes)); + checkCudaErrors(cudaMalloc((void**)&d_diag_vector, dataBase_.cell_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes)); checkCudaErrors(cudaMalloc((void**)&d_A, dataBase_.csr_value_vec_bytes)); @@ -57,7 +58,7 @@ void dfUEqn::createNonConstantLduAndCsrFields() { } void dfUEqn::initNonConstantFieldsBoundary() { - update_boundary_coeffs_vector(dataBase_.stream, dataBase_.num_patches, + update_boundary_coeffs_vector(dataBase_.stream, dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), d_value_internal_coeffs, d_value_boundary_coeffs, d_gradient_internal_coeffs, d_gradient_boundary_coeffs); @@ -87,6 +88,7 @@ void dfUEqn::preProcess(const double *h_u, const double *h_boundary_u, const dou checkCudaErrors(cudaMemsetAsync(d_boundary_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(d_A, 0, dataBase_.csr_value_vec_bytes, dataBase_.stream)); checkCudaErrors(cudaMemsetAsync(d_b, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); + checkCudaErrors(cudaMemsetAsync(d_diag_vector, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream)); // TODO: maybe a better way } void dfUEqn::process() { @@ -97,24 +99,27 @@ void dfUEqn::process() { checkCudaErrors(cudaEventCreate(&stop)); checkCudaErrors(cudaEventRecord(start,0)); - // if(!graph_created) { - // DEBUG_TRACE; - // checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); +#ifndef TIME_GPU + if(!graph_created) { + DEBUG_TRACE; + checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal)); +#endif + permute_vector_h2d(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, d_permute); fvm_ddt_vector(dataBase_.stream, 
dataBase_.num_cells, dataBase_.rdelta_t, - dataBase_.d_rho, dataBase_.d_rho_old, dataBase_.d_u, dataBase_.d_volume, + dataBase_.d_rho, dataBase_.d_rho_old, d_permute, dataBase_.d_volume, d_diag, d_source, 1.); - fvm_div_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, + fvm_div_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, + dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_phi, dataBase_.d_weight, d_lower, d_upper, d_diag, // end for internal dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_phi, d_value_internal_coeffs, d_value_boundary_coeffs, d_internal_coeffs, d_boundary_coeffs, 1.); - //TODO: merge bellow six kernels field_multiply_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.d_rho, d_nu_eff, d_rho_nueff, // end for internal dataBase_.num_boundary_surfaces, dataBase_.d_boundary_rho, d_boundary_nu_eff, d_boundary_rho_nueff); - fvm_laplacian_vector(dataBase_.stream, dataBase_.num_surfaces, + fvm_laplacian_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_weight, dataBase_.d_mag_sf, dataBase_.d_delta_coeffs, d_rho_nueff, d_lower, d_upper, d_diag, // end for internal @@ -124,7 +129,7 @@ void dfUEqn::process() { d_internal_coeffs, d_boundary_coeffs, -1); fvc_grad_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, - dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_u, d_grad_u, + dataBase_.d_weight, dataBase_.d_sf, d_permute, d_grad_u, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_face_cell, dataBase_.d_boundary_u, dataBase_.d_boundary_sf, dataBase_.d_volume, dataBase_.d_boundary_mag_sf, d_boundary_grad_u, dataBase_.d_boundary_delta_coeffs); @@ -137,7 +142,6 @@ void dfUEqn::process() { dataBase_.d_boundary_face_cell, 
d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume); // fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, // dataBase_.d_volume, d_fvc_output, d_source); - // TODO: merge bellow two kernel fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_source, @@ -146,12 +150,14 @@ void dfUEqn::process() { // fvc_to_source_vector(dataBase_.stream, dataBase_.num_cells, // dataBase_.d_volume, d_fvc_output, d_source); - // checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); - // checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0)); - // graph_created = true; - // } - // DEBUG_TRACE; - // checkCudaErrors(cudaGraphLaunch(graph_instance, dataBase_.stream)); +#ifndef TIME_GPU + checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph)); + checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0)); + graph_created = true; + } + DEBUG_TRACE; + checkCudaErrors(cudaGraphLaunch(graph_instance, dataBase_.stream)); +#endif checkCudaErrors(cudaEventRecord(stop,0)); checkCudaErrors(cudaEventSynchronize(start)); @@ -168,11 +174,14 @@ void dfUEqn::sync() } void dfUEqn::solve() { - //ldu_to_csr(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, - // dataBase_.d_lower_to_csr_index, dataBase_.d_upper_to_csr_index, dataBase_.d_diag_to_csr_index, - // d_lower, d_upper, d_diag, d_source, d_internal_coeffs, d_boundary_coeffs, d_A, d_b); + ldu_to_csr(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, + dataBase_.d_boundary_face_cell, + dataBase_.d_lower_to_csr_index, dataBase_.d_upper_to_csr_index, dataBase_.d_diag_to_csr_index, + d_lower, d_upper, d_diag, d_source, d_internal_coeffs, d_boundary_coeffs, d_A, d_b, d_diag_vector); int nNz = dataBase_.num_cells + dataBase_.num_surfaces * 2; // matrix entries + sync(); + if (num_iteration == 0) 
// first interation { printf("Initializing AmgX Linear Solver\n"); @@ -186,19 +195,19 @@ void dfUEqn::solve() { UySolver->updateOperator(dataBase_.num_cells, nNz, d_A + nNz); UzSolver->updateOperator(dataBase_.num_cells, nNz, d_A + 2 * nNz); } - UxSolver->solve(dataBase_.num_cells, dataBase_.d_u, d_b); - UySolver->solve(dataBase_.num_cells, dataBase_.d_u + dataBase_.num_cells, d_b + dataBase_.num_cells); - UzSolver->solve(dataBase_.num_cells, dataBase_.d_u + 2 * dataBase_.num_cells, d_b + 2 * dataBase_.num_cells); + UxSolver->solve(dataBase_.num_cells, d_permute, d_b); + UySolver->solve(dataBase_.num_cells, d_permute + dataBase_.num_cells, d_b + dataBase_.num_cells); + UzSolver->solve(dataBase_.num_cells, d_permute + 2 * dataBase_.num_cells, d_b + 2 * dataBase_.num_cells); num_iteration++; } -void dfUEqn::postProcess(double *h_u) { - permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, d_permute); - checkCudaErrors(cudaMemcpyAsync(h_u, d_permute, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); +void dfUEqn::postProcess(double *h_u) { // TODO: Here may be a bug + permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, d_permute, dataBase_.d_u); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_u, d_permute, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); // some boundary conditions may also need vf.boundary, deltaCoeffs.boundary, and weight.boundary - update_boundary_coeffs_vector(dataBase_.stream, dataBase_.num_patches, + update_boundary_coeffs_vector(dataBase_.stream, dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), d_value_internal_coeffs, d_value_boundary_coeffs, d_gradient_internal_coeffs, d_gradient_boundary_coeffs); @@ -251,30 +260,41 @@ void dfUEqn::compareResult(const double *lower, const double *upper, const doubl checkVectorEqual(dataBase_.num_cells, diag, h_diag.data(), 
1e-14, printFlag); DEBUG_TRACE; - std::vector h_source; - // , h_source_ref; + std::vector h_source, h_source_ref; h_source.resize(dataBase_.num_cells * 3); - // h_source_ref.resize(dataBase_.num_cells * 3); - // for (int i = 0; i < dataBase_.num_cells; i++) { - // h_source_ref[0 * dataBase_.num_cells + i] = source[i * 3 + 0]; - // h_source_ref[1 * dataBase_.num_cells + i] = source[i * 3 + 1]; - // h_source_ref[2 * dataBase_.num_cells + i] = source[i * 3 + 2]; - // } + h_source_ref.resize(dataBase_.num_cells * 3); + for (int i = 0; i < dataBase_.num_cells; i++) { + h_source_ref[0 * dataBase_.num_cells + i] = source[i * 3 + 0]; + h_source_ref[1 * dataBase_.num_cells + i] = source[i * 3 + 1]; + h_source_ref[2 * dataBase_.num_cells + i] = source[i * 3 + 2]; + } checkCudaErrors(cudaMemcpy(h_source.data(), d_source, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); fprintf(stderr, "check h_source"); - checkVectorEqual(dataBase_.num_cells * 3, source, h_source.data(), 1e-14, printFlag); + checkVectorEqual(dataBase_.num_cells * 3, h_source_ref.data(), h_source.data(), 1e-14, printFlag); DEBUG_TRACE; - std::vector h_internal_coeffs; + std::vector h_internal_coeffs, h_internal_coeffs_ref; h_internal_coeffs.resize(dataBase_.num_boundary_surfaces * 3); + h_internal_coeffs_ref.resize(dataBase_.num_boundary_surfaces * 3); + for (int i = 0; i < dataBase_.num_boundary_surfaces; i++) { + h_internal_coeffs_ref[0 * dataBase_.num_boundary_surfaces + i] = internal_coeffs[i * 3 + 0]; + h_internal_coeffs_ref[1 * dataBase_.num_boundary_surfaces + i] = internal_coeffs[i * 3 + 1]; + h_internal_coeffs_ref[2 * dataBase_.num_boundary_surfaces + i] = internal_coeffs[i * 3 + 2]; + } checkCudaErrors(cudaMemcpy(h_internal_coeffs.data(), d_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); - checkVectorEqual(dataBase_.num_boundary_surfaces * 3, internal_coeffs, h_internal_coeffs.data(), 1e-14, printFlag); + 
checkVectorEqual(dataBase_.num_boundary_surfaces * 3, h_internal_coeffs_ref.data(), h_internal_coeffs.data(), 1e-14, printFlag); DEBUG_TRACE; - std::vector h_boundary_coeffs; + std::vector h_boundary_coeffs, h_boundary_coeffs_ref; h_boundary_coeffs.resize(dataBase_.num_boundary_surfaces * 3); + h_boundary_coeffs_ref.resize(dataBase_.num_boundary_surfaces * 3); + for (int i = 0; i < dataBase_.num_boundary_surfaces; i++) { + h_boundary_coeffs_ref[0 * dataBase_.num_boundary_surfaces + i] = boundary_coeffs[i * 3 + 0]; + h_boundary_coeffs_ref[1 * dataBase_.num_boundary_surfaces + i] = boundary_coeffs[i * 3 + 1]; + h_boundary_coeffs_ref[2 * dataBase_.num_boundary_surfaces + i] = boundary_coeffs[i * 3 + 2]; + } checkCudaErrors(cudaMemcpy(h_boundary_coeffs.data(), d_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost)); - checkVectorEqual(dataBase_.num_boundary_surfaces * 3, boundary_coeffs, h_boundary_coeffs.data(), 1e-14, printFlag); + checkVectorEqual(dataBase_.num_boundary_surfaces * 3, h_boundary_coeffs_ref.data(), h_boundary_coeffs.data(), 1e-14, printFlag); DEBUG_TRACE; // std::vector h_tmpVal; From d6844885a067df06f3ddac8c5268f984524d0815 Mon Sep 17 00:00:00 2001 From: maorz1998 Date: Mon, 21 Aug 2023 21:19:21 +0800 Subject: [PATCH 25/25] modify app --- applications/solvers/dfLowMachFoam/Make/files | 2 +- .../solvers/dfLowMachFoam/Make/options | 1 - applications/solvers/dfLowMachFoam/UEqn.H | 115 +++++ .../solvers/dfLowMachFoam/dfLowMachFoam.C | 46 +- .../solvers/dfLowMachFoam_new/CMakeLists.txt | 126 ----- applications/solvers/dfLowMachFoam_new/EEqn.H | 141 ------ .../solvers/dfLowMachFoam_new/Make/files | 3 - .../solvers/dfLowMachFoam_new/Make/options | 58 --- applications/solvers/dfLowMachFoam_new/UEqn.H | 247 ---------- applications/solvers/dfLowMachFoam_new/YEqn.H | 207 -------- .../solvers/dfLowMachFoam_new/YEqn_RR.H | 61 --- .../solvers/dfLowMachFoam_new/correctPhi.H | 12 - .../solvers/dfLowMachFoam_new/createFields.H | 176 
------- .../dfLowMachFoam_new/createGPUSolver.H | 97 ---- .../dfLowMachFoam_new/createdfSolver.H | 65 --- .../solvers/dfLowMachFoam_new/dfLowMachFoam.C | 447 ------------------ applications/solvers/dfLowMachFoam_new/pEqn.H | 203 -------- .../solvers/dfLowMachFoam_new/pcEqn.H | 130 ----- .../solvers/dfLowMachFoam_new/rhoEqn.H | 86 ---- .../solvers/dfLowMachFoam_new/setRDeltaT.H | 85 ---- .../solvers/dfLowMachFoam_new/setRootCase2.H | 5 - 21 files changed, 154 insertions(+), 2159 deletions(-) delete mode 100644 applications/solvers/dfLowMachFoam_new/CMakeLists.txt delete mode 100644 applications/solvers/dfLowMachFoam_new/EEqn.H delete mode 100644 applications/solvers/dfLowMachFoam_new/Make/files delete mode 100644 applications/solvers/dfLowMachFoam_new/Make/options delete mode 100644 applications/solvers/dfLowMachFoam_new/UEqn.H delete mode 100644 applications/solvers/dfLowMachFoam_new/YEqn.H delete mode 100644 applications/solvers/dfLowMachFoam_new/YEqn_RR.H delete mode 100644 applications/solvers/dfLowMachFoam_new/correctPhi.H delete mode 100644 applications/solvers/dfLowMachFoam_new/createFields.H delete mode 100644 applications/solvers/dfLowMachFoam_new/createGPUSolver.H delete mode 100644 applications/solvers/dfLowMachFoam_new/createdfSolver.H delete mode 100644 applications/solvers/dfLowMachFoam_new/dfLowMachFoam.C delete mode 100644 applications/solvers/dfLowMachFoam_new/pEqn.H delete mode 100644 applications/solvers/dfLowMachFoam_new/pcEqn.H delete mode 100644 applications/solvers/dfLowMachFoam_new/rhoEqn.H delete mode 100644 applications/solvers/dfLowMachFoam_new/setRDeltaT.H delete mode 100644 applications/solvers/dfLowMachFoam_new/setRootCase2.H diff --git a/applications/solvers/dfLowMachFoam/Make/files b/applications/solvers/dfLowMachFoam/Make/files index 4eff5915e..9b7e89945 100644 --- a/applications/solvers/dfLowMachFoam/Make/files +++ b/applications/solvers/dfLowMachFoam/Make/files @@ -1,3 +1,3 @@ -new_dfLowMachFoam.C +dfLowMachFoam.C EXE = 
$(DF_APPBIN)/dfLowMachFoam diff --git a/applications/solvers/dfLowMachFoam/Make/options b/applications/solvers/dfLowMachFoam/Make/options index e1959ada3..bda93210e 100644 --- a/applications/solvers/dfLowMachFoam/Make/options +++ b/applications/solvers/dfLowMachFoam/Make/options @@ -9,7 +9,6 @@ EXE_INC = -std=c++14 \ $(PFLAGS) $(PINC) \ $(if $(LIBTORCH_ROOT),-DUSE_LIBTORCH,) \ $(if $(PYTHON_INC_DIR),-DUSE_PYTORCH,) \ - $(if $(AMGX_DIR),-DGPUSolver_,) \ -I$(LIB_SRC)/transportModels/compressible/lnInclude \ -I$(LIB_SRC)/thermophysicalModels/basic/lnInclude \ -I$(LIB_SRC)/TurbulenceModels/turbulenceModels/lnInclude \ diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H index c3ee91068..38934abdb 100644 --- a/applications/solvers/dfLowMachFoam/UEqn.H +++ b/applications/solvers/dfLowMachFoam/UEqn.H @@ -86,6 +86,121 @@ // K = 0.5*magSqr(U); // } // UEqn_GPU.checkValue(true); +#elif defined GPUSolverNew_ + const tmp nuEff_tmp(turbulence->nuEff()); + const volScalarField& nuEff = nuEff_tmp(); + + // run CPU, for temp + tmp tUEqn + ( + fvm::ddt(rho, U) + + + fvm::div(phi, U) + + + turbulence->divDevRhoReff(U) + == -fvc::grad(p) + ); + fvVectorMatrix& UEqn = tUEqn.ref(); + + // run GPU + // preProcess + // TODO: preProcessForRhoEqn for temp, now we only transfer phi(instead of rhoEqn) used by fvm::div(phi, U) + UEqn_GPU.sync(); + double *h_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::internal); + double *h_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::internal); + double *h_boundary_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::boundary); + memcpy(h_rho, &rho[0], dfDataBase.cell_value_bytes); + memcpy(h_phi, &phi[0], dfDataBase.surface_value_bytes); + int offset = 0; + forAll(phi.boundaryField(), patchi) + { + const fvsPatchScalarField& patchPhi = phi.boundaryField()[patchi]; + int patchsize = patchPhi.size(); + memcpy(h_boundary_phi + offset, &patchPhi[0], patchsize * 
sizeof(double)); + offset += patchsize; + } + UEqn_GPU.preProcessForRhoEqn(h_rho, h_phi, h_boundary_phi); + DEBUG_TRACE; + + TICK_START; + // preparing u, p, nu_eff, and rho.boundary used by UEqn_GPU.preProcess() + double *h_u = dfDataBase.getFieldPointer("u", location::cpu, position::internal); + double *h_boundary_u = dfDataBase.getFieldPointer("u", location::cpu, position::boundary); + double *h_p = dfDataBase.getFieldPointer("p", location::cpu, position::internal); + double *h_boundary_p = dfDataBase.getFieldPointer("p", location::cpu, position::boundary); + double *h_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::internal); + double *h_boundary_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::boundary); + double *h_boundary_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::boundary); + TICK_STOP(get pointer); + + TICK_START; + U.oldTime(); + memcpy(h_u, &U.oldTime()[0][0], dfDataBase.cell_value_vec_bytes); + memcpy(h_p, &p[0], dfDataBase.cell_value_bytes); + memcpy(h_nu_eff, &nuEff[0], dfDataBase.cell_value_bytes); + TICK_STOP(copy to pinned memory); + + TICK_START; + offset = 0; + forAll(U.boundaryField(), patchi) + { + const fvPatchVectorField& patchU = U.boundaryField()[patchi]; + const fvPatchScalarField& patchP = p.boundaryField()[patchi]; + const fvPatchScalarField& patchNuEff = nuEff.boundaryField()[patchi]; + const fvPatchScalarField& patchRho = rho.boundaryField()[patchi]; + int patchsize = patchU.size(); + memcpy(h_boundary_u + offset * 3, &patchU[0][0], patchsize * 3 * sizeof(double)); + memcpy(h_boundary_p + offset, &patchP[0], patchsize * sizeof(double)); + memcpy(h_boundary_nu_eff + offset, &patchNuEff[0], patchsize * sizeof(double)); + memcpy(h_boundary_rho + offset, &patchRho[0], patchsize * sizeof(double)); + offset += patchsize; + } + TICK_STOP(CPU prepare boundary time); + + TICK_START; + UEqn_GPU.preProcess(h_u, h_boundary_u, h_p, h_boundary_p, h_nu_eff, h_boundary_nu_eff, 
h_boundary_rho); + DEBUG_TRACE; + UEqn_GPU.sync(); + TICK_STOP(GPU preProcess time); + + // process + TICK_START; + UEqn_GPU.process(); + DEBUG_TRACE; + UEqn_GPU.sync(); + TICK_STOP(GPU process time); + + TICK_START; + UEqn_GPU.solve(); + TICK_STOP(GPU solve time); + + // postProcess + TICK_START; + UEqn_GPU.postProcess(h_u); + U.correctBoundaryConditions(); + DEBUG_TRACE; + TICK_STOP(post process time); + + // checkResult + // TODO: for temp, now we compare ldu, finally we compare csr + std::vector h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); + std::vector h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); + offset = 0; + for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) + { + int patchsize = dfDataBase.patch_size[patchi]; + const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0]; + const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0]; + memcpy(h_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double)); + memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double)); + offset += patchsize; + } + bool printFlag = false; + UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0], + h_internal_coeffs.data(), h_boundary_coeffs.data(), + // &DivTensor[0][0], + printFlag); + DEBUG_TRACE; #else start1 = std::clock(); tmp tUEqn diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C index db6b25b18..6ea4251af 100644 --- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C +++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C @@ -60,14 +60,34 @@ Description #include "basicThermo.H" #include "CombustionModel.H" -#ifdef GPUSolver_ +#define GPUSolverNew_ +#define TIME + +#ifdef GPUSolverNew_ #include "dfUEqn.H" -#include "dfYEqn.H" -#include "dfRhoEqn.H" -#include "dfEEqn.H" +// #include "dfYEqn.H" +// #include "dfRhoEqn.H" +// #include 
"dfEEqn.H" +#include "dfMatrixDataBase.H" +#include "dfMatrixOpBase.H" #include #include + +#include "createGPUSolver.H" + #include "upwind.H" +#include "GenFvMatrix.H" +#endif + +#ifdef TIME + #define TICK_START \ + start_new = std::clock(); + #define TICK_STOP(prefix) \ + stop_new = std::clock(); \ + Foam::Info << #prefix << " time = " << double(stop_new - start_new) / double(CLOCKS_PER_SEC) << " s" << Foam::endl; +#else + #define TICK_START + #define TICK_STOP(prefix) #endif // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // @@ -148,6 +168,8 @@ int main(int argc, char *argv[]) label timeIndex = 0; clock_t start, end, start1, end1, start2, end2; + clock_t start_new, stop_new; + double time_new = 0; turbulence->validate(); @@ -158,9 +180,11 @@ int main(int argc, char *argv[]) } start1 = std::clock(); - #ifdef GPUSolver_ - #include "createdfSolver.H" - #endif +#ifdef GPUSolverNew_ + createGPUBase(mesh, Y); + createGPUUEqn(CanteraTorchProperties, U); +#endif + end1 = std::clock(); time_monitor_init += double(end1 - start1) / double(CLOCKS_PER_SEC); @@ -187,7 +211,9 @@ int main(int argc, char *argv[]) runTime++; Info<< "Time = " << runTime.timeName() << nl << endl; - +#ifdef GPUSolverNew_ + dfDataBase.preTimeStep(&rho.oldTime()[0]); +#endif clock_t loop_start = std::clock(); // --- Pressure-velocity PIMPLE corrector loop while (pimple.loop()) @@ -276,6 +302,10 @@ int main(int argc, char *argv[]) rho = thermo.rho(); +#ifdef GPUSolverNew_ + dfDataBase.postTimeStep(); +#endif + runTime.write(); Info<< "========Time Spent in diffenet parts========"<< endl; Info<< "loop Time = " << loop_time << " s" << endl; diff --git a/applications/solvers/dfLowMachFoam_new/CMakeLists.txt b/applications/solvers/dfLowMachFoam_new/CMakeLists.txt deleted file mode 100644 index 645289a64..000000000 --- a/applications/solvers/dfLowMachFoam_new/CMakeLists.txt +++ /dev/null @@ -1,126 +0,0 @@ -cmake_minimum_required(VERSION 3.5) -project(dfLowMachFoam LANGUAGES CXX) 
-FIND_PACKAGE(MPI REQUIRED) -FIND_PACKAGE(OpenMP REQUIRED) -FIND_PACKAGE(CUDA REQUIRED) - -# Check valid thirdParty -if(DEFINED ENV{WM_PROJECT_DIR}) - MESSAGE(STATUS "OpenFOAM: " $ENV{WM_PROJECT_DIR}) -else() - message(FATAL_ERROR "OpenFOAM is not sourced") -endif(DEFINED ENV{WM_PROJECT_DIR}) - -if(DEFINED ENV{CANTERA_ROOT}) - MESSAGE(STATUS "libcantera: " $ENV{CANTERA_ROOT}) - SET(CANTERA_ROOT $ENV{CANTERA_ROOT}) -else() - message(FATAL_ERROR "libcantera directory is not specified") -endif(DEFINED ENV{CANTERA_ROOT}) - -# define variables -SET(OpenFOAM_LIB_DIR $ENV{FOAM_LIBBIN}) -SET(OpenFOAM_SRC $ENV{FOAM_SRC}) - -SET(DF_ROOT $ENV{DF_ROOT}) -SET(DF_SRC $ENV{DF_SRC}) -SET(SRC_ORIG $ENV{SRC_ORIG}) - -# set compilation options -SET(CMAKE_EXE_LINKER_FLAGS "-fuse-ld=bfd -Xlinker --add-needed -Xlinker --no-as-needed") -SET (CMAKE_C_FLAGS ${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}) -SET (CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}) - -SET(CMAKE_C_COMPILER g++) -SET(PATH_LIB_OPENMPI "openmpi-system") # Foundation version -SET(EXE_COMPILE_OPTION "-std=c++11 -m64 -Dlinux64 -DWM_ARCH_OPTION=64 --DWM_DP -DWM_LABEL_SIZE=32 -Wall -Wextra -Wold-style-cast -Wnon-virtual-dtor --Wno-unused-parameter -Wno-invalid-offsetof -Wno-attributes -O3 --DNoRepository -ftemplate-depth-100 -std=c++14 --Wno-unused-variable -Wno-unused-but-set-variable -Wno-old-style-cast -DOMPI_SKIP_MPICXX --pthread -fPIC") -add_definitions("${EXE_COMPILE_OPTION}") - -# add header files -FUNCTION(R_SEARCH search_path return_list) - FILE(GLOB_RECURSE new_list ${search_path}/*.H) - SET(dir_list "") - FOREACH(file_path ${new_list}) - GET_FILENAME_COMPONENT(dir_path ${file_path} PATH) - SET(dir_list ${dir_list} ${dir_path}) - ENDFOREACH() - LIST(REMOVE_DUPLICATES dir_list) - SET(${return_list} ${dir_list} PARENT_SCOPE) -ENDFUNCTION(R_SEARCH) - -R_SEARCH(${DF_SRC}/dfCombustionModels dfcombustion_inc) -R_SEARCH(${DF_SRC}/dfCanteraMixture dfcantera_inc) -R_SEARCH(${DF_SRC}/lagrangian/intermediate 
dflagrangianinter_inc) -R_SEARCH(${DF_SRC}/lagrangian/spray dflagrangianspray_inc) -R_SEARCH(${DF_SRC}/lagrangian/turbulence dflagrangianturb_inc) -R_SEARCH(${DF_SRC}/dfChemistryModel dfchemistry_inc) -R_SEARCH(${DF_SRC}/thermophysicalModels/thermophysicalProperties dfthermophysicalprop_inc) -R_SEARCH(${DF_SRC}/thermophysicalModels/thermophysicalProperties dfthermophysicalprop_inc) -R_SEARCH(${DF_SRC}/thermophysicalModels/basic dfthermophysicalbasic_inc) -R_SEARCH(${DF_SRC}/thermophysicalModels/SLGThermo dfthermophysicalslg_inc) -R_SEARCH(${DF_SRC}/TurbulenceModels dfturbulence_inc) -R_SEARCH(${DF_SRC}/dynamicMesh dfnewdynamic_inc) -R_SEARCH(${DF_SRC}/dynamicFvMesh dffvdynamic_inc) - -include_directories( - ${OpenFOAM_SRC}/finiteVolume/lnInclude - ${OpenFOAM_SRC}/OSspecific/POSIX/lnInclude - ${OpenFOAM_SRC}/OpenFOAM/lnInclude - ${OpenFOAM_SRC}/transportModels/compressible/lnInclude - ${OpenFOAM_SRC}/thermophysicalModels/basic/lnInclude - ${OpenFOAM_SRC}/TurbulenceModels/turbulenceModels/lnInclude - ${OpenFOAM_SRC}/TurbulenceModels/compressible/lnInclude - ${OpenFOAM_SRC}/finiteVolume/cfdTools - ${OpenFOAM_SRC}/finiteVolume/lnInclude - ${OpenFOAM_SRC}/meshTools/lnInclude - ${OpenFOAM_SRC}/sampling/lnInclude - ${OpenFOAM_SRC}/dynamicFvMesh/lnInclude - ${OpenFOAM_SRC}/Pstream/mpi - ${dfcantera_inc} - ${dfchemistry_inc} - ${dfcombustion_inc} - ${CANTERA_ROOT}/include - ${MPI_INCLUDE_PATH} - ${PROJECT_SOURCE_DIR} - ${CUDA_INCLUDE_DIRS} - /home/runze/AmgX/AMGX/include - /home/runze/deepflame-dev/src_gpu -) - -# add execution -add_executable(${PROJECT_NAME} ${PROJECT_SOURCE_DIR}/dfLowMachFoam.C) - -target_link_libraries(${PROJECT_NAME} - $ENV{FOAM_LIBBIN}/libfiniteVolume.so libmeshTools.so libcompressibleTransportModels.so - libturbulenceModels.so libsampling.so libOpenFOAM.so - ${CANTERA_ROOT}/lib/libcantera_shared.so.2 - ${DF_ROOT}/lib/libdfChemistryModel.so - ${DF_ROOT}/lib/libdfCanteraMixture.so - ${DF_ROOT}/lib/libdfFluidThermophysicalModels.so - 
${DF_ROOT}/lib/libdfCombustionModels.so - $ENV{FOAM_LIBBIN}/openmpi-system/libPstream.so - ${MPI_LIBRARIES} - ${CUDA_LIBRARIES} - /home/runze/AmgX/AMGX/build/libamgxsh.so - /home/runze/deepflame-dev/src_gpu/build/libdfMatrix.so -) - -if(DEFINED ENV{PYTHON_INC_DIR}) - add_definitions(-DUSE_PYTORCH) - find_package (Python REQUIRED COMPONENTS Interpreter Development) - find_package(pybind11) - include_directories( - ${Python_INCLUDE_DIRS} - ${pybind11_INCLUDE_DIR}/pybind11 - ) - target_link_libraries(${PROJECT_NAME} ${Python_LIBRARIES}) -endif() - -# install -set(CMAKE_INSTALL_PREFIX ${DF_ROOT}) -install(TARGETS ${PROJECT_NAME} DESTINATION bin) diff --git a/applications/solvers/dfLowMachFoam_new/EEqn.H b/applications/solvers/dfLowMachFoam_new/EEqn.H deleted file mode 100644 index 896baaa06..000000000 --- a/applications/solvers/dfLowMachFoam_new/EEqn.H +++ /dev/null @@ -1,141 +0,0 @@ -{ - volScalarField& he = thermo.he(); -#ifdef GPUSolver_ - start1 = std::clock(); - UEqn_GPU.updatePsi(&U[0][0]); - UEqn_GPU.correctBoundaryConditions(); - U.correctBoundaryConditions(); - K = 0.5*magSqr(U); - end1 = std::clock(); - time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); - - // prepare data on CPU - start1 = std::clock(); - start2 = std::clock(); - // const tmp alphaEff_tmp(thermo.alpha()); - // const volScalarField& alphaEff = alphaEff_tmp(); - double *alphaEff = nullptr; // tmp - end2 = std::clock(); - int eeqn_offset = 0; - int patchNum = 0; - - forAll(he.boundaryField(), patchi) - { - patchNum++; - const fvsPatchScalarField& pw = mesh.surfaceInterpolation::weights().boundaryField()[patchi]; - int patchSize = pw.size(); - - // construct gradient manually - const fvPatchScalarField& hew = he.boundaryField()[patchi]; - const basicThermo& bThermo = basicThermo::lookupThermo(hew); - const scalarField& ppw = bThermo.p().boundaryField()[patchi]; - fvPatchScalarField& Tw = - 
const_cast(bThermo.T().boundaryField()[patchi]); - scalarField& Tw_v = Tw; - - Tw.evaluate(); - const scalarField& patchDeltaCoeff = mesh.boundary()[patchi].deltaCoeffs(); - const scalarField heInternal = bThermo.he(ppw, Tw, patchi)(); - const scalarField heBoundary = bThermo.he(ppw, Tw, mesh.boundary()[patchi].faceCells())(); - const scalarField patchGradMau = patchDeltaCoeff * (heInternal - heBoundary); - - const scalarField& patchK = K.boundaryField()[patchi]; - // const scalarField& patchAlphaEff = alphaEff.boundaryField()[patchi]; // not H2Dcopy when use UnityLewis - // const scalarField& patchGrad = he.boundaryField()[patchi].gradientBoundaryCoeffs(); // gradient_ - - // const DimensionedField& patchHa_ = he.boundaryField()[patchi]; - // const gradientEnergyFvPatchScalarField patchHa(mesh.boundary()[patchi], patchHa_); - // const scalarField& patchGrad = patchHa.gradient(); // gradient_ - memcpy(boundary_K + eeqn_offset, &patchK[0], patchSize*sizeof(double)); - // memcpy(boundary_alphaEff + eeqn_offset, &patchAlphaEff[0], patchSize*sizeof(double)); - memcpy(boundary_gradient + eeqn_offset, &patchGradMau[0], patchSize*sizeof(double)); - - eeqn_offset += patchSize; - } - end1 = std::clock(); - time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_EEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); - fprintf(stderr, "time_monitor_EEqn_mtxAssembly_CPU_prepare: %lf, build alphaEff time: %lf, patchNum: %d\n", - time_monitor_EEqn_mtxAssembly_CPU_prepare, - double(end2 - start2) / double(CLOCKS_PER_SEC), patchNum); - - // prepare data on GPU - start1 = std::clock(); - he.oldTime(); - K.oldTime(); - EEqn_GPU.prepare_data(&he.oldTime()[0], &K[0], &K.oldTime()[0], alphaEff, - &dpdt[0], boundary_K, boundary_alphaEff, boundary_gradient); - EEqn_GPU.sync(); - end1 = std::clock(); - time_monitor_EEqn += double(end1 - start1) / 
double(CLOCKS_PER_SEC); - time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_EEqn_mtxAssembly_GPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); - - start1 = std::clock(); - EEqn_GPU.initializeTimeStep(); - EEqn_GPU.fvm_ddt(); - EEqn_GPU.fvm_div(); - EEqn_GPU.fvm_laplacian(); - EEqn_GPU.fvc_ddt(); - EEqn_GPU.fvc_div_phi_scalar(); - EEqn_GPU.fvc_div_vector(); - EEqn_GPU.add_to_source(); - EEqn_GPU.sync(); - end1 = std::clock(); - time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_EEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC); - - // check value of mtxAssembly, no time monitor - // EEqn_GPU.checkValue(true); - - start1 = std::clock(); - EEqn_GPU.solve(); - end1 = std::clock(); - time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_EEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); - - start1 = std::clock(); - EEqn_GPU.updatePsi(&he[0]); - he.correctBoundaryConditions(); - he.write(); - end1 = std::clock(); - time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_EEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); -#else - start1 = std::clock(); - fvScalarMatrix EEqn - ( - - fvm::ddt(rho, he) + mvConvection->fvmDiv(phi, he) - + fvc::ddt(rho, K) + fvc::div(phi, K) - - dpdt - == - ( - turbName == "laminar" - ? 
- ( - fvm::laplacian(turbulence->alpha(), he) - - diffAlphaD - + fvc::div(hDiffCorrFlux) - ) - : - ( - fvm::laplacian(turbulence->alphaEff(), he) - ) - ) - ); - end1 = std::clock(); - time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - - EEqn.relax(); - start1 = std::clock(); - EEqn.solve("ha"); - end1 = std::clock(); - time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_EEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); -#endif -} diff --git a/applications/solvers/dfLowMachFoam_new/Make/files b/applications/solvers/dfLowMachFoam_new/Make/files deleted file mode 100644 index 92df9b4e3..000000000 --- a/applications/solvers/dfLowMachFoam_new/Make/files +++ /dev/null @@ -1,3 +0,0 @@ -dfLowMachFoam.C - -EXE = $(DF_APPBIN)/dfLowMachFoam_new diff --git a/applications/solvers/dfLowMachFoam_new/Make/options b/applications/solvers/dfLowMachFoam_new/Make/options deleted file mode 100644 index bda93210e..000000000 --- a/applications/solvers/dfLowMachFoam_new/Make/options +++ /dev/null @@ -1,58 +0,0 @@ --include $(GENERAL_RULES)/mplibType - -EXE_INC = -std=c++14 \ - -g \ - -fopenmp \ - -Wno-unused-variable \ - -Wno-unused-but-set-variable \ - -Wno-old-style-cast \ - $(PFLAGS) $(PINC) \ - $(if $(LIBTORCH_ROOT),-DUSE_LIBTORCH,) \ - $(if $(PYTHON_INC_DIR),-DUSE_PYTORCH,) \ - -I$(LIB_SRC)/transportModels/compressible/lnInclude \ - -I$(LIB_SRC)/thermophysicalModels/basic/lnInclude \ - -I$(LIB_SRC)/TurbulenceModels/turbulenceModels/lnInclude \ - -I$(LIB_SRC)/TurbulenceModels/compressible/lnInclude \ - -I$(LIB_SRC)/finiteVolume/cfdTools \ - -I$(LIB_SRC)/finiteVolume/lnInclude \ - -I$(LIB_SRC)/meshTools/lnInclude \ - -I$(LIB_SRC)/sampling/lnInclude \ - -I$(LIB_SRC)/dynamicFvMesh/lnInclude \ - -I$(LIB_SRC)/Pstream/mpi \ - -I$(DF_SRC)/dfCanteraMixture/lnInclude \ - -I$(DF_SRC)/dfChemistryModel/lnInclude \ - 
-I$(DF_SRC)/dfCombustionModels/lnInclude \ - -I$(CANTERA_ROOT)/include \ - $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include,) \ - $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include/torch/csrc/api/include,) \ - $(PYTHON_INC_DIR) \ - $(if $(AMGX_DIR), -I$(DF_ROOT)/src_gpu,) \ - $(if $(AMGX_DIR), -I/usr/local/cuda-11.6/include,) \ - $(if $(AMGX_DIR), -I$(AMGX_DIR)/include,) \ - -I$(DF_ROOT)/GPUTestRef/lnInclude \ - -EXE_LIBS = \ - -lcompressibleTransportModels \ - -lturbulenceModels \ - -lfiniteVolume \ - -lmeshTools \ - -lsampling \ - -L$(DF_LIBBIN) \ - -ldfFluidThermophysicalModels \ - -ldfCompressibleTurbulenceModels \ - -ldfCanteraMixture \ - -ldfChemistryModel \ - -ldfCombustionModels \ - -ldfGenMatrix \ - $(CANTERA_ROOT)/lib/libcantera.so \ - $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libtorch.so,) \ - $(if $(LIBTORCH_ROOT),$(LIBTORCH_ROOT)/lib/libc10.so,) \ - $(if $(LIBTORCH_ROOT),-rdynamic,) \ - $(if $(LIBTORCH_ROOT),-lpthread,) \ - $(if $(LIBTORCH_ROOT),$(DF_SRC)/dfChemistryModel/DNNInferencer/build/libDNNInferencer.so,) \ - $(if $(PYTHON_LIB_DIR),-L$(PYTHON_LIB_DIR),) \ - $(if $(PYTHON_LIB_DIR),-lpython3.8,) \ - $(if $(AMGX_DIR), /usr/local/cuda-11.6/lib64/libcudart.so,) \ - $(if $(AMGX_DIR), $(DF_ROOT)/src_gpu/build/libdfMatrix.so,) \ - $(if $(AMGX_DIR), $(AMGX_DIR)/build/libamgxsh.so,) - diff --git a/applications/solvers/dfLowMachFoam_new/UEqn.H b/applications/solvers/dfLowMachFoam_new/UEqn.H deleted file mode 100644 index 38934abdb..000000000 --- a/applications/solvers/dfLowMachFoam_new/UEqn.H +++ /dev/null @@ -1,247 +0,0 @@ -// Solve the Momentum equation -#ifdef GPUSolver_ - start1 = std::clock(); - int offset = 0; - const tmp nuEff_tmp(turbulence->nuEff()); - const volScalarField& nuEff = nuEff_tmp(); - forAll(U.boundaryField(), patchi) - { - const scalarField& patchP = p.boundaryField()[patchi]; - const vectorField& patchU = U.boundaryField()[patchi]; - const scalarField& patchRho = rho.boundaryField()[patchi]; - const scalarField& patchNuEff = 
nuEff.boundaryField()[patchi]; - - int patchSize = patchP.size(); - - // boundary pressure - memcpy(boundary_pressure_init+offset, &patchP[0], patchSize*sizeof(double)); - // boundary velocity - memcpy(boundary_velocity_init+3*offset, &patchU[0][0], 3*patchSize*sizeof(double)); - // boundary nuEff - memcpy(boundary_nuEff_init+offset, &patchNuEff[0], patchSize*sizeof(double)); - // boundary rho - memcpy(boundary_rho_init+offset, &patchRho[0], patchSize*sizeof(double)); - offset += patchSize; - } - end1 = std::clock(); - time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); - - start1 = std::clock(); - UEqn_GPU.initializeTimeStep(); - U.oldTime(); - UEqn_GPU.fvm_ddt(&U.oldTime()[0][0]); - UEqn_GPU.fvm_div(boundary_pressure_init, boundary_velocity_init, boundary_nuEff_init, boundary_rho_init); - UEqn_GPU.fvc_grad(&p[0]); - UEqn_GPU.fvc_grad_vector(); - UEqn_GPU.dev2T(); - UEqn_GPU.fvc_div_tensor(&nuEff[0]); - UEqn_GPU.fvm_laplacian(); - UEqn_GPU.sync(); - end1 = std::clock(); - time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC); - - // start2 = std::clock(); - // fvVectorMatrix turb_source - // ( - // turbulence->divDevRhoReff(U) - // ); - // end2 = std::clock(); - // time_monitor_CPU += double(end2 - start2) / double(CLOCKS_PER_SEC); - - // UEqn_GPU.add_fvMatrix(&turb_source.lower()[0], &turb_source.diag()[0], &turb_source.upper()[0], &turb_source.source()[0][0]); - // end1 = std::clock(); - // time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - // time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - - // check value - // U.oldTime(); - // 
tmp tUEqn - // ( - // fvm::ddt(rho, U) - // + - // fvm::div(phi, U) - // + - // turbulence->divDevRhoReff(U) - // == -fvc::grad(p) - // ); - // fvVectorMatrix& UEqn = tUEqn.ref(); - // printf("b_cpu = %e\n", UEqn.source()[1][1]); - // forAll(U.boundaryField(), patchi){ - // labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); - // forAll(sub_boundary, i){ - // if (sub_boundary[i] == 1){ - // printf("b_cpu_bou = %e\n", UEqn.boundaryCoeffs()[patchi][i][1]); - // printf("patchi = %d, i = %d\n", patchi, i); - // } - // } - // } - // if (pimple.momentumPredictor()) - // { - // solve(UEqn); - // Info << "U_CPU\n" << U << endl; - // K = 0.5*magSqr(U); - // } - // UEqn_GPU.checkValue(true); -#elif defined GPUSolverNew_ - const tmp nuEff_tmp(turbulence->nuEff()); - const volScalarField& nuEff = nuEff_tmp(); - - // run CPU, for temp - tmp tUEqn - ( - fvm::ddt(rho, U) - + - fvm::div(phi, U) - + - turbulence->divDevRhoReff(U) - == -fvc::grad(p) - ); - fvVectorMatrix& UEqn = tUEqn.ref(); - - // run GPU - // preProcess - // TODO: preProcessForRhoEqn for temp, now we only transfer phi(instead of rhoEqn) used by fvm::div(phi, U) - UEqn_GPU.sync(); - double *h_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::internal); - double *h_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::internal); - double *h_boundary_phi = dfDataBase.getFieldPointer("phi", location::cpu, position::boundary); - memcpy(h_rho, &rho[0], dfDataBase.cell_value_bytes); - memcpy(h_phi, &phi[0], dfDataBase.surface_value_bytes); - int offset = 0; - forAll(phi.boundaryField(), patchi) - { - const fvsPatchScalarField& patchPhi = phi.boundaryField()[patchi]; - int patchsize = patchPhi.size(); - memcpy(h_boundary_phi + offset, &patchPhi[0], patchsize * sizeof(double)); - offset += patchsize; - } - UEqn_GPU.preProcessForRhoEqn(h_rho, h_phi, h_boundary_phi); - DEBUG_TRACE; - - TICK_START; - // preparing u, p, nu_eff, and rho.boundary used by UEqn_GPU.preProcess() - double *h_u = 
dfDataBase.getFieldPointer("u", location::cpu, position::internal); - double *h_boundary_u = dfDataBase.getFieldPointer("u", location::cpu, position::boundary); - double *h_p = dfDataBase.getFieldPointer("p", location::cpu, position::internal); - double *h_boundary_p = dfDataBase.getFieldPointer("p", location::cpu, position::boundary); - double *h_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::internal); - double *h_boundary_nu_eff = UEqn_GPU.getFieldPointer("nu_eff", location::cpu, position::boundary); - double *h_boundary_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::boundary); - TICK_STOP(get pointer); - - TICK_START; - U.oldTime(); - memcpy(h_u, &U.oldTime()[0][0], dfDataBase.cell_value_vec_bytes); - memcpy(h_p, &p[0], dfDataBase.cell_value_bytes); - memcpy(h_nu_eff, &nuEff[0], dfDataBase.cell_value_bytes); - TICK_STOP(copy to pinned memory); - - TICK_START; - offset = 0; - forAll(U.boundaryField(), patchi) - { - const fvPatchVectorField& patchU = U.boundaryField()[patchi]; - const fvPatchScalarField& patchP = p.boundaryField()[patchi]; - const fvPatchScalarField& patchNuEff = nuEff.boundaryField()[patchi]; - const fvPatchScalarField& patchRho = rho.boundaryField()[patchi]; - int patchsize = patchU.size(); - memcpy(h_boundary_u + offset * 3, &patchU[0][0], patchsize * 3 * sizeof(double)); - memcpy(h_boundary_p + offset, &patchP[0], patchsize * sizeof(double)); - memcpy(h_boundary_nu_eff + offset, &patchNuEff[0], patchsize * sizeof(double)); - memcpy(h_boundary_rho + offset, &patchRho[0], patchsize * sizeof(double)); - offset += patchsize; - } - TICK_STOP(CPU prepare boundary time); - - TICK_START; - UEqn_GPU.preProcess(h_u, h_boundary_u, h_p, h_boundary_p, h_nu_eff, h_boundary_nu_eff, h_boundary_rho); - DEBUG_TRACE; - UEqn_GPU.sync(); - TICK_STOP(GPU preProcess time); - - // process - TICK_START; - UEqn_GPU.process(); - DEBUG_TRACE; - UEqn_GPU.sync(); - TICK_STOP(GPU process time); - - TICK_START; - UEqn_GPU.solve(); - 
TICK_STOP(GPU solve time); - - // postProcess - TICK_START; - UEqn_GPU.postProcess(h_u); - U.correctBoundaryConditions(); - DEBUG_TRACE; - TICK_STOP(post process time); - - // checkResult - // TODO: for temp, now we compare ldu, finally we compare csr - std::vector h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3); - std::vector h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3); - offset = 0; - for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++) - { - int patchsize = dfDataBase.patch_size[patchi]; - const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0]; - const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0]; - memcpy(h_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double)); - memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double)); - offset += patchsize; - } - bool printFlag = false; - UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0], - h_internal_coeffs.data(), h_boundary_coeffs.data(), - // &DivTensor[0][0], - printFlag); - DEBUG_TRACE; -#else - start1 = std::clock(); - tmp tUEqn - ( - fvm::ddt(rho, U) + fvm::div(phi, U) - + turbulence->divDevRhoReff(U) - == -fvc::grad(p) - ); - fvVectorMatrix& UEqn = tUEqn.ref(); - - end1 = std::clock(); - time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - - UEqn.relax(); - start1 = std::clock(); - if (pimple.momentumPredictor()) - { - solve(UEqn); - - K = 0.5*magSqr(U); - } - end1 = std::clock(); - time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); -#endif - -// start1 = std::clock(); -// // // std::thread t(&dfMatrix::solve, &UEqn_GPU); -// UEqn_GPU.solve(); -// end1 = std::clock(); -// time_monitor_UEqn += double(end1 - start1) / 
double(CLOCKS_PER_SEC); -// time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); - -// start1 = std::clock(); -// // // t.join(); -// // UEqn_GPU.updatePsi(&U[0][0]); -// K = 0.5*magSqr(U); -// end1 = std::clock(); -// time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); -// time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); -// time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC); -// // Info << "U_amgx = " << U << endl; - diff --git a/applications/solvers/dfLowMachFoam_new/YEqn.H b/applications/solvers/dfLowMachFoam_new/YEqn.H deleted file mode 100644 index 76570b24d..000000000 --- a/applications/solvers/dfLowMachFoam_new/YEqn.H +++ /dev/null @@ -1,207 +0,0 @@ -hDiffCorrFlux = Zero; -diffAlphaD = Zero; -sumYDiffError = Zero; - -tmp> mvConvection -( - fv::convectionScheme::New - ( - mesh, - fields, - phi, - mesh.divScheme("div(phi,Yi_h)") - ) -); -#ifdef GPUSolver_ - start1 = std::clock(); - UEqn_GPU.solve(); - end1 = std::clock(); - time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); - - start1 = std::clock(); - std::vector Y_old(Y.size()), boundary_Y(Y.size()), boundary_hai(Y.size()), boundary_rhoD(Y.size()); - std::vector hai(Y.size()), rhoD(Y.size()); - for (size_t i = 0; i < Y.size(); ++i) - { - volScalarField& Yi = Y[i]; - Yi.oldTime(); - Y_old[i] = &Yi.oldTime()[0]; - if (updateBoundaryFields) - { - cudaMallocHost(&boundary_Y[i], num_boundary_faces*sizeof(double)); - } - const volScalarField& haii = chemistry->hai(i); - const volScalarField& rhoDi = chemistry->rhoD(i); - // hai[i] = &haii[0]; - rhoD[i] = &rhoDi[0]; - // cudaMallocHost(&boundary_hai[i], num_boundary_faces*sizeof(double)); - cudaMallocHost(&boundary_rhoD[i], num_boundary_faces*sizeof(double)); - int offset = 0; - forAll(Yi.boundaryField(), patchi) - { - const scalarField& patchYi = Yi.boundaryField()[patchi]; - // const 
scalarField& patchHaii = haii.boundaryField()[patchi]; - const scalarField& patchRhoDi = rhoDi.boundaryField()[patchi]; - int patchSize = patchYi.size(); - - if (updateBoundaryFields) - { - memcpy(boundary_Y[i] + offset, &patchYi[0], patchSize*sizeof(double)); - } - // memcpy(boundary_hai[i] + offset, &patchHaii[0], patchSize*sizeof(double)); - memcpy(boundary_rhoD[i] + offset, &patchRhoDi[0], patchSize*sizeof(double)); - offset += patchSize; - } - // if (i == 5) - // { - // Info << "rhoD_CPU" << rhoDi << endl; - // } - - } - // Info << "rhoD from nuEff\n" << nuEff * rho / 0.7 << endl; - updateBoundaryFields = false; - volScalarField mut_sct = turbulence->mut().ref()/Sct; - double *boundary_mutsct = nullptr; - cudaMallocHost(&boundary_mutsct, num_boundary_faces*sizeof(double)); - int offset = 0; - forAll(p.boundaryField(), patchi) - { - const scalarField& patchMut_sct = mut_sct.boundaryField()[patchi]; - int patchSize = patchMut_sct.size(); - memcpy(boundary_mutsct + offset, &patchMut_sct[0], patchSize*sizeof(double)); - offset += patchSize; - - // debug - // const fvsPatchScalarField& pw = mesh.surfaceInterpolation::weights().boundaryField()[patchi]; - // Field valueInternalCoeffs = Y[5].boundaryField()[patchi].valueInternalCoeffs(pw); - // Field valueBoundaryCoeffs = Y[5].boundaryField()[patchi].valueBoundaryCoeffs(pw); - // Field gradientInternalCoeffs = Y[5].boundaryField()[patchi].gradientInternalCoeffs(); - // Field gradientBoundaryCoeffs = Y[5].boundaryField()[patchi].gradientBoundaryCoeffs(); - // Info << "valueInternalCoeffs\n" << valueInternalCoeffs << endl; - // Info << "valueBoundaryCoeffs\n" << valueBoundaryCoeffs << endl; - // Info << "gradientInternalCoeffs\n" << gradientInternalCoeffs << endl; - // Info << "gradientBoundaryCoeffs\n" << gradientBoundaryCoeffs << endl; - } - end1 = std::clock(); - time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - 
time_monitor_YEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); - //fprintf(stderr, "time_monitor_YEqn_mtxAssembly_CPU_prepare: %lf\n", time_monitor_YEqn_mtxAssembly_CPU_prepare); - - start1 = std::clock(); - YEqn_GPU.initializeTimeStep(); - YEqn_GPU.upwindWeight(); - YEqn_GPU.fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(Y_old, boundary_Y, - hai, boundary_hai, rhoD, boundary_rhoD, &mut_sct[0], boundary_mutsct, &thermo.alpha()[0]); - YEqn_GPU.fvm_ddt(); - YEqn_GPU.fvm_div_phi(); - YEqn_GPU.fvm_div_phiUc(); - YEqn_GPU.sync(); - // YEqn_GPU.checkValue(true, "of_output_H2.txt"); - end1 = std::clock(); - time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_YEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC); - //fprintf(stderr, "time_monitor_YEqn_mtxAssembly_GPU_run: %lf\n", time_monitor_YEqn_mtxAssembly_GPU_run); - - start1 = std::clock(); - YEqn_GPU.solve(); - end1 = std::clock(); - time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); -#else - start1 = std::clock(); - forAll(Y, i) - { - sumYDiffError += chemistry->rhoD(i)*fvc::grad(Y[i]); - } - // Info << "sumYDiffError\n" << sumYDiffError << endl; - const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf(); - start1 = std::clock(); - time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); -#endif - -//MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); -label flag_mpi_init; -MPI_Initialized(&flag_mpi_init); -if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); - -{ - if (!splitting) - { - std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); - combustion->correct(); - //label flag_mpi_init; - 
//MPI_Initialized(&flag_mpi_init); - if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); - std::chrono::steady_clock::time_point stop = std::chrono::steady_clock::now(); - std::chrono::duration processingTime = std::chrono::duration_cast>(stop - start); - time_monitor_chem += processingTime.count(); - } - -#ifdef GPUSolver_ - start1 = std::clock(); - forAll(Y, i) - { - volScalarField& Yi = Y[i]; - YEqn_GPU.updatePsi(&Yi[0], i); - Yi.correctBoundaryConditions(); - } - YEqn_GPU.correctBoundaryConditions(); - end1 = std::clock(); - time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_YEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); -#else - start2 = std::clock(); - volScalarField Yt(0.0*Y[0]); - int speciesIndex = 0; - forAll(Y, i) - { - volScalarField& Yi = Y[i]; - hDiffCorrFlux += chemistry->hai(i)*(chemistry->rhoD(i)*fvc::grad(Yi) - Yi*sumYDiffError); - diffAlphaD += fvc::laplacian(thermo.alpha()*chemistry->hai(i), Yi); - if (i != inertIndex) - { - start1 = std::clock(); - tmp DEff = chemistry->rhoD(i) + turbulence->mut()/Sct; - - fvScalarMatrix YiEqn - ( - fvm::ddt(rho, Yi) - + - ( - turbName == "laminar" - ? (mvConvection->fvmDiv(phi, Yi) + mvConvection->fvmDiv(phiUc, Yi)) - : mvConvection->fvmDiv(phi, Yi) - ) - == - ( - splitting - ? 
fvm::laplacian(DEff(), Yi) - : (fvm::laplacian(DEff(), Yi) + combustion->R(Yi)) - ) - ); - - end1 = std::clock(); - time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - YiEqn.relax(); - - start1 = std::clock(); - YiEqn.solve("Yi"); - end1 = std::clock(); - time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); - - Yi.max(0.0); - Yt += Yi; - ++speciesIndex; - } - } - - Y[inertIndex] = scalar(1) - Yt; - Y[inertIndex].max(0.0); - end2 = std::clock(); - time_monitor_YEqn += double(end2 - start2) / double(CLOCKS_PER_SEC); -#endif -} diff --git a/applications/solvers/dfLowMachFoam_new/YEqn_RR.H b/applications/solvers/dfLowMachFoam_new/YEqn_RR.H deleted file mode 100644 index f5752e95e..000000000 --- a/applications/solvers/dfLowMachFoam_new/YEqn_RR.H +++ /dev/null @@ -1,61 +0,0 @@ -if (!(timeIndex % 2)) -{ - volScalarField Yt(0.0*Y[0]); - - scalar dtSave = runTime.deltaT().value(); - runTime.setDeltaT(dtSave * 2); - - start = std::clock(); - combustion->correct(); - - label flag_mpi_init; - MPI_Initialized(&flag_mpi_init); - if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); - end = std::clock(); - time_monitor_chem += double(end - start) / double(CLOCKS_PER_SEC); - - forAll(Y, i) - { - volScalarField& Yi = Y[i]; - - if (i != inertIndex) - { - volScalarField& Yi = Y[i]; - fvScalarMatrix YiEqn - ( - fvm::ddt(rho, Yi) - == - combustion->R(Yi) - ); - - YiEqn.relax(); - - YiEqn.solve("Yi"); - - Yi.max(0.0); - Yt += Yi; - } - } - Y[inertIndex] = scalar(1) - Yt; - Y[inertIndex].max(0.0); - - forAll (Y, i) - { - volScalarField& tYi = Y[i].oldTime(); - - forAll(tYi, celli) - { - tYi[celli] = Y[i][celli]; - } - volScalarField::Boundary& Bf = tYi.boundaryFieldRef(); - forAll(Bf, patchi) - { - forAll(Bf[patchi], facei) - { - Bf[patchi][facei] = Y[i].boundaryField()[patchi][facei]; - } - } - } - - runTime.setDeltaT(dtSave); -} \ No newline at end of file diff --git a/applications/solvers/dfLowMachFoam_new/correctPhi.H 
b/applications/solvers/dfLowMachFoam_new/correctPhi.H deleted file mode 100644 index 3cd82d29e..000000000 --- a/applications/solvers/dfLowMachFoam_new/correctPhi.H +++ /dev/null @@ -1,12 +0,0 @@ -CorrectPhi -( - U, - phi, - p, - rho, - psi, - dimensionedScalar("rAUf", dimTime, 1), - divrhoU(), - pimple, - true -); diff --git a/applications/solvers/dfLowMachFoam_new/createFields.H b/applications/solvers/dfLowMachFoam_new/createFields.H deleted file mode 100644 index 9e750c334..000000000 --- a/applications/solvers/dfLowMachFoam_new/createFields.H +++ /dev/null @@ -1,176 +0,0 @@ -#include "createRDeltaT.H" - -Info<< "Reading thermophysical properties\n" << endl; - -// fluidThermo* pThermo = new hePsiThermo(mesh, word::null); -fluidThermo* pThermo = new heRhoThermo(mesh, word::null); -fluidThermo& thermo = *pThermo; -// thermo.validate(args.executable(), "ha"); - -const volScalarField& psi = thermo.psi(); -volScalarField& p = thermo.p(); -volScalarField& T = thermo.T(); -volScalarField rho -( - IOobject - ( - "rho", - runTime.timeName(), - mesh, - IOobject::READ_IF_PRESENT, - IOobject::AUTO_WRITE - ), - thermo.rho() -); - - -Info<< "Reading field U\n" << endl; -volVectorField U -( - IOobject - ( - "U", - runTime.timeName(), - mesh, - IOobject::MUST_READ, - IOobject::AUTO_WRITE - ), - mesh -); - -#include "compressibleCreatePhi.H" - -pressureControl pressureControl(p, rho, pimple.dict(), false); - -mesh.setFluxRequired(p.name()); - -Info<< "Creating turbulence model\n" << endl; -autoPtr turbulence -( - compressible::turbulenceModel::New - ( - rho, - U, - phi, - thermo - ) -); - -Info<< "Creating field dpdt\n" << endl; -volScalarField dpdt -( - IOobject - ( - "dpdt", - runTime.timeName(), - mesh, - IOobject::NO_READ, - IOobject::NO_WRITE - ), - mesh, - dimensionedScalar("dpdt",p.dimensions()/dimTime, 0) -); - - -Info<< "Creating reaction model\n" << endl; -autoPtr> combustion -( - CombustionModel::New(thermo, turbulence()) -); -Info<< "end Creating reaction model\n" << 
endl; - - -const word combModelName(mesh.objectRegistry::lookupObject("combustionProperties").lookup("combustionModel")); -Info << "Combustion Model Name is confirmed as "<< combModelName << endl; - -const word turbName(mesh.objectRegistry::lookupObject("turbulenceProperties").lookup("simulationType")); - -dfChemistryModel* chemistry = combustion->chemistry(); -PtrList& Y = chemistry->Y(); -const word inertSpecie(chemistry->lookup("inertSpecie")); -const label inertIndex(chemistry->species()[inertSpecie]); -chemistry->setEnergyName("ha"); -chemistry->updateEnergy(); - - -chemistry->correctThermo(); -Info<< "At initial time, min/max(T) = " << min(T).value() << ", " << max(T).value() << endl; - -//for dpdt - -Info<< "Creating field kinetic energy K\n" << endl; -volScalarField K("K", 0.5*magSqr(U)); - -multivariateSurfaceInterpolationScheme::fieldTable fields; - -if(combModelName!="flareFGM") -{ -forAll(Y, i) -{ - fields.add(Y[i]); -} -fields.add(thermo.he()); -} - - -const scalar Sct = chemistry->lookupOrDefault("Sct", 1.); -volScalarField diffAlphaD -( - IOobject - ( - "diffAlphaD", - runTime.timeName(), - mesh, - IOobject::NO_READ, - IOobject::NO_WRITE - ), - mesh, - dimensionedScalar(dimEnergy/dimTime/dimVolume, 0) -); -volVectorField hDiffCorrFlux -( - IOobject - ( - "hDiffCorrFlux", - runTime.timeName(), - mesh, - IOobject::NO_READ, - IOobject::NO_WRITE - ), - mesh, - dimensionedVector(dimensionSet(1,0,-3,0,0,0,0), Zero) -); -volVectorField sumYDiffError -( - IOobject - ( - "sumYDiffError", - runTime.timeName(), - mesh, - IOobject::NO_READ, - IOobject::NO_WRITE - ), - mesh, - dimensionedVector("sumYDiffError", dimDynamicViscosity/dimLength, Zero) -); - -IOdictionary CanteraTorchProperties -( - IOobject - ( - "CanteraTorchProperties", - runTime.constant(), - mesh, - IOobject::MUST_READ, - IOobject::NO_WRITE - ) -); -const Switch splitting = CanteraTorchProperties.lookupOrDefault("splittingStrategy", false); -#ifdef USE_PYTORCH - const Switch log_ = 
CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("log", false); - const Switch torch_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("torch", false); -#endif -#ifdef USE_LIBTORCH - const Switch log_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("log", false); - const Switch torch_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("torch", false); -#endif diff --git a/applications/solvers/dfLowMachFoam_new/createGPUSolver.H b/applications/solvers/dfLowMachFoam_new/createGPUSolver.H deleted file mode 100644 index 94fff1125..000000000 --- a/applications/solvers/dfLowMachFoam_new/createGPUSolver.H +++ /dev/null @@ -1,97 +0,0 @@ -dfMatrixDataBase dfDataBase; -//dfRhoEqn rhoEqn_GPU; -dfUEqn UEqn_GPU(dfDataBase); -//dfYEqn YEqn_GPU; -//dfEEqn EEqn_GPU; - -void createGPUBase(fvMesh& mesh, PtrList& Y) { - // prepare constant values: num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, num_species, rdelta_t - const labelUList& owner = mesh.owner(); - const labelUList& neighbour = mesh.neighbour(); - int num_cells = mesh.nCells(); - int num_surfaces = neighbour.size(); - int num_boundary_surfaces = 0; - int num_patches = 0; - std::vector patch_size; - forAll(mesh.boundary(), patchi) { - labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); - int patchsize = sub_boundary.size(); - patch_size.push_back(patchsize); - num_boundary_surfaces += patchsize; - num_patches++; - } - // TODO: get deltaT fomr time API - double rDeltaT = 1 / 1e-6; - dfDataBase.setConstantValues(num_cells, num_surfaces, num_boundary_surfaces, num_patches, patch_size, Y.size(), rDeltaT); - - // prepare constant indexes: owner, neighbor - dfDataBase.setConstantIndexes(&owner[0], &neighbour[0]); - - // prepare internal and boundary of sf, mag_sf, weights, delta_coeffs, volume - double *boundary_sf = new double[3 * num_boundary_surfaces]; - double *boundary_mag_sf = new double[num_boundary_surfaces]; - double 
*boundary_delta_coeffs = new double[num_boundary_surfaces]; - int *boundary_face_cell = new int[num_boundary_surfaces]; - int offset = 0; - forAll(mesh.boundary(), patchi) { - const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; - const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; - const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; - const labelUList& pFaceCells = mesh.boundary()[patchi].faceCells(); - - int patchsize = pMagSf.size(); - - memcpy(boundary_sf + 3*offset, &pSf[0][0], 3*patchsize*sizeof(double)); - memcpy(boundary_mag_sf + offset, &pMagSf[0], patchsize*sizeof(double)); - memcpy(boundary_delta_coeffs + offset, &pDeltaCoeffs[0], patchsize*sizeof(double)); - memcpy(boundary_face_cell + offset, &pFaceCells[0], patchsize * sizeof(int)); - offset += patchsize; - } - - dfDataBase.createConstantFieldsInternal(); - dfDataBase.createConstantFieldsBoundary(); - dfDataBase.initConstantFieldsInternal(&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.surfaceInterpolation::weights()[0], &mesh.nonOrthDeltaCoeffs()[0], &mesh.V()[0]); - dfDataBase.initConstantFieldsBoundary(boundary_sf, boundary_mag_sf, boundary_delta_coeffs, boundary_face_cell); - - // prepare internal and boundary of Y - dfDataBase.createNonConstantFieldsInternal(); - dfDataBase.createNonConstantFieldsBoundary(); - forAll(Y, speciesI) { - volScalarField& Yi = Y[speciesI]; - memcpy(dfDataBase.h_y + speciesI * num_cells, &Yi[0], num_cells * sizeof(double)); - offset = 0; - forAll(Yi.boundaryField(), patchi) { - const scalarField& patchYi = Yi.boundaryField()[patchi]; - int patchsize = patchYi.size(); - memcpy(dfDataBase.h_boundary_y + speciesI * num_boundary_surfaces + offset, &patchYi[0], patchsize*sizeof(double)); - offset += patchsize; - } - } - dfDataBase.initNonConstantFieldsInternal(dfDataBase.h_y); - dfDataBase.initNonConstantFieldsBoundary(dfDataBase.h_boundary_y); -} - -void createGPUUEqn(const IOdictionary& CanteraTorchProperties, const 
volVectorField& U) { - // prepare mode_string and setting_path - string mode_string = "dDDI"; - string settingPath; - settingPath = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string("")); - UEqn_GPU.setConstantValues(mode_string, settingPath); - - // prepare patch_type - std::vector patch_type; - patch_type.resize(dfDataBase.num_patches); - forAll(U.boundaryField(), patchi) - { - constructBoundarySelectorPerPatch(&(patch_type[patchi]), U.boundaryField()[patchi].type()); - } - UEqn_GPU.setConstantFields(patch_type); - - // prepare internal and boundary of xxx - UEqn_GPU.createNonConstantFieldsInternal(); - UEqn_GPU.createNonConstantFieldsBoundary(); - UEqn_GPU.createNonConstantLduAndCsrFields(); - // UEqn_GPU has no internal non-constant fields to be init - // UEqn_GPU.initNonConstantFieldsInternal(); - UEqn_GPU.initNonConstantFieldsBoundary(); -} diff --git a/applications/solvers/dfLowMachFoam_new/createdfSolver.H b/applications/solvers/dfLowMachFoam_new/createdfSolver.H deleted file mode 100644 index 3c5593833..000000000 --- a/applications/solvers/dfLowMachFoam_new/createdfSolver.H +++ /dev/null @@ -1,65 +0,0 @@ -const labelUList& owner = mesh.owner(); -const labelUList& neighbour = mesh.neighbour(); -int num_cells = mesh.nCells(); -int num_surfaces = neighbour.size(); - -std::vector boundaryCellIndex; -std::vector boundary_face_vector_init; -std::vector boundary_face_init; -std::vector boundary_deltaCoeffs_init; -std::vector> patchTypes; -std::vector patchTypeU, patchTypeY; -int num_boundary_faces = 0; -int patchSize; -forAll(mesh.boundary(), patchi) -{ - labelUList sub_boundary = mesh.boundary()[patchi].faceCells(); - patchSize = sub_boundary.size(); - const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; - const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi]; - const scalarField& pDeltaCoeffs = mesh.nonOrthDeltaCoeffs().boundaryField()[patchi]; - - boundaryCellIndex.insert(boundaryCellIndex.end(), 
&sub_boundary[0], &sub_boundary[0]+patchSize); - boundary_face_vector_init.insert(boundary_face_vector_init.end(), &pSf[0][0], &pSf[0][0]+3*patchSize); - boundary_face_init.insert(boundary_face_init.end(), &pMagSf[0], &pMagSf[0]+patchSize); - boundary_deltaCoeffs_init.insert(boundary_deltaCoeffs_init.end(), &pDeltaCoeffs[0], &pDeltaCoeffs[0]+patchSize); - num_boundary_faces += patchSize; - - constructBoundarySelector(patchTypeU, U.boundaryField()[patchi].type(), patchSize); - constructBoundarySelector(patchTypeY, Y[0].boundaryField()[patchi].type(), patchSize); -} -patchTypes.emplace_back(patchTypeU); -patchTypes.emplace_back(patchTypeY); - -int num_boundary_cells; - -string settingPath; -settingPath = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string("")); - -#ifdef GPUSolver_ - dfMatrixDataBase dfDataBase(num_surfaces, num_cells, num_boundary_faces, Y.size(), num_boundary_cells, &neighbour[0], &owner[0], &mesh.V()[0], &mesh.surfaceInterpolation::weights()[0], - &mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.nonOrthDeltaCoeffs()[0], boundary_face_vector_init, boundary_face_init, boundary_deltaCoeffs_init, boundaryCellIndex, patchTypes); - dfRhoEqn rhoEqn_GPU(dfDataBase); - dfUEqn UEqn_GPU(dfDataBase, "dDDI", settingPath); - dfYEqn YEqn_GPU(dfDataBase, "dDDI", settingPath, inertIndex); - dfEEqn EEqn_GPU(dfDataBase, "dDDI", settingPath); - - double *ueqn_internalCoeffs_init, *ueqn_boundaryCoeffs_init, *boundary_pressure_init, *boundary_velocity_init, - *boundary_nuEff_init, *boundary_rho_init, *ueqn_laplac_internalCoeffs_init, *ueqn_laplac_boundaryCoeffs_init, *boundary_phi_init; - cudaMallocHost(&ueqn_internalCoeffs_init, 3*num_boundary_faces*sizeof(double)); - cudaMallocHost(&ueqn_boundaryCoeffs_init, 3*num_boundary_faces*sizeof(double)); - cudaMallocHost(&ueqn_laplac_internalCoeffs_init, 3*num_boundary_faces*sizeof(double)); - cudaMallocHost(&ueqn_laplac_boundaryCoeffs_init, 3*num_boundary_faces*sizeof(double)); - 
cudaMallocHost(&boundary_velocity_init, 3*num_boundary_faces*sizeof(double)); - cudaMallocHost(&boundary_pressure_init, num_boundary_faces*sizeof(double)); - cudaMallocHost(&boundary_nuEff_init, num_boundary_faces*sizeof(double)); - cudaMallocHost(&boundary_rho_init, num_boundary_faces*sizeof(double)); - cudaMallocHost(&boundary_phi_init, num_boundary_faces*sizeof(double)); - - double *boundary_alphaEff, *boundary_K, *boundary_gradient; - cudaMallocHost(&boundary_K, num_boundary_faces*sizeof(double)); - cudaMallocHost(&boundary_alphaEff, num_boundary_faces*sizeof(double)); - cudaMallocHost(&boundary_gradient, num_boundary_faces * sizeof(double)); - - bool updateBoundaryFields = true; // make sure that the boundary fields do H2D copy at 1st timestep -#endif diff --git a/applications/solvers/dfLowMachFoam_new/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam_new/dfLowMachFoam.C deleted file mode 100644 index f5b6ec90d..000000000 --- a/applications/solvers/dfLowMachFoam_new/dfLowMachFoam.C +++ /dev/null @@ -1,447 +0,0 @@ -/*---------------------------------------------------------------------------*\ - ========= | - \\ / F ield | OpenFOAM: The Open Source CFD Toolbox - \\ / O peration | Website: https://openfoam.org - \\ / A nd | Copyright (C) 2011-2019 OpenFOAM Foundation - \\/ M anipulation | -------------------------------------------------------------------------------- -License - This file is part of OpenFOAM. - - OpenFOAM is free software: you can redistribute it and/or modify it - under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - OpenFOAM is distributed in the hope that it will be useful, but WITHOUT - ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - for more details. 
- - You should have received a copy of the GNU General Public License - along with OpenFOAM. If not, see . - -Application - rhoPimpleFoam - -Description - Transient solver for turbulent flow of compressible fluids for HVAC and - similar applications, with optional mesh motion and mesh topology changes. - - Uses the flexible PIMPLE (PISO-SIMPLE) solution for time-resolved and - pseudo-transient simulations. - -\*---------------------------------------------------------------------------*/ - -#include "dfChemistryModel.H" -#include "CanteraMixture.H" -// #include "hePsiThermo.H" -#include "heRhoThermo.H" - -#ifdef USE_PYTORCH -#include -#include -#include //used to convert -#endif - -#ifdef USE_LIBTORCH -#include -#include "DNNInferencer.H" -#endif - -#include "fvCFD.H" -#include "fluidThermo.H" -#include "turbulentFluidThermoModel.H" -#include "pimpleControl.H" -#include "pressureControl.H" -#include "localEulerDdtScheme.H" -#include "fvcSmooth.H" -#include "PstreamGlobals.H" -#include "basicThermo.H" -#include "CombustionModel.H" - -#define GPUSolverNew_ -#define TIME - -#ifdef GPUSolverNew_ -#include "dfUEqn.H" -// #include "dfYEqn.H" -// #include "dfRhoEqn.H" -// #include "dfEEqn.H" -#include "dfMatrixDataBase.H" -#include "dfMatrixOpBase.H" -#include -#include - -#include "createGPUSolver.H" - -#include "upwind.H" -#include "GenFvMatrix.H" -#endif - -#ifdef TIME - #define TICK_START \ - start_new = std::clock(); - #define TICK_STOP(prefix) \ - stop_new = std::clock(); \ - Foam::Info << #prefix << " time = " << double(stop_new - start_new) / double(CLOCKS_PER_SEC) << " s" << Foam::endl; -#else - #define TICK_START - #define TICK_STOP(prefix) -#endif - -// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // - -int main(int argc, char *argv[]) -{ -#ifdef USE_PYTORCH - pybind11::scoped_interpreter guard{};//start python interpreter -#endif - #include "postProcess.H" - - // unsigned int flags = 0; - // 
checkCudaErrors(cudaGetDeviceFlags(&flags)); - // flags |= cudaDeviceScheduleYield; - // checkCudaErrors(cudaSetDeviceFlags(flags)); - - // #include "setRootCaseLists.H" - #include "listOptions.H" - #include "setRootCase2.H" - #include "listOutput.H" - - #include "createTime.H" - #include "createMesh.H" - #include "createDyMControls.H" - #include "initContinuityErrs.H" - #include "createFields.H" - #include "createRhoUfIfPresent.H" - - double time_monitor_init = 0; - - double time_monitor_other = 0; - double time_monitor_rho = 0; - double time_monitor_U = 0; - double time_monitor_Y = 0; - double time_monitor_E = 0; - double time_monitor_p = 0; - double time_monitor_chemistry_correctThermo = 0; - double time_monitor_turbulence_correct = 0; - double time_monitor_chem = 0; // combustion correct - - double time_monitor_rhoEqn = 0; - double time_monitor_rhoEqn_mtxAssembly = 0; - double time_monitor_rhoEqn_mtxAssembly_CPU_prepare = 0; - double time_monitor_rhoEqn_mtxAssembly_GPU_run = 0; - double time_monitor_rhoEqn_solve = 0; - double time_monitor_rhoEqn_correctBC = 0; - - double time_monitor_UEqn = 0; - double time_monitor_UEqn_mtxAssembly = 0; - double time_monitor_UEqn_mtxAssembly_CPU_prepare = 0; - double time_monitor_UEqn_mtxAssembly_GPU_run = 0; - double time_monitor_UEqn_solve = 0; - double time_monitor_UEqn_correctBC = 0; - double time_monitor_UEqn_H = 0; - double time_monitor_UEqn_H_GPU_run = 0; - double time_monitor_UEqn_H_correctBC = 0; - double time_monitor_UEqn_A = 0; - double time_monitor_UEqn_A_GPU_run = 0; - double time_monitor_UEqn_A_correctBC = 0; - - double time_monitor_YEqn = 0; - double time_monitor_YEqn_mtxAssembly = 0; - double time_monitor_YEqn_mtxAssembly_CPU_prepare = 0; - double time_monitor_YEqn_mtxAssembly_GPU_run = 0; - double time_monitor_YEqn_solve = 0; - double time_monitor_YEqn_correctBC = 0; - - double time_monitor_EEqn = 0; - double time_monitor_EEqn_mtxAssembly = 0; - double time_monitor_EEqn_mtxAssembly_CPU_prepare = 0; - double 
time_monitor_EEqn_mtxAssembly_GPU_prepare = 0; - double time_monitor_EEqn_mtxAssembly_GPU_run = 0; - double time_monitor_EEqn_solve = 0; - double time_monitor_EEqn_correctBC = 0; - - double time_monitor_pEqn = 0; - double time_monitor_pEqn_solve = 0; - - label timeIndex = 0; - clock_t start, end, start1, end1, start2, end2; - clock_t start_new, stop_new; - double time_new = 0; - - turbulence->validate(); - - if (!LTS) - { - #include "compressibleCourantNo.H" - #include "setInitialDeltaT.H" - } - - start1 = std::clock(); - createGPUBase(mesh, Y); - createGPUUEqn(CanteraTorchProperties, U); - - end1 = std::clock(); - time_monitor_init += double(end1 - start1) / double(CLOCKS_PER_SEC); - - // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * // - - Info<< "\nStarting time loop\n" << endl; - - while (runTime.run()) - { - timeIndex ++; - - #include "readDyMControls.H" - - if (LTS) - { - #include "setRDeltaT.H" - } - else - { - #include "compressibleCourantNo.H" - #include "setDeltaT.H" - } - - runTime++; - - Info<< "Time = " << runTime.timeName() << nl << endl; - dfDataBase.preTimeStep(&rho.oldTime()[0]); - clock_t loop_start = std::clock(); - // --- Pressure-velocity PIMPLE corrector loop - while (pimple.loop()) - { - start = std::clock(); - if (splitting) - { - #include "YEqn_RR.H" - } - if (pimple.firstPimpleIter() || moveMeshOuterCorrectors) - { - // Store momentum to set rhoUf for introduced faces. 
- autoPtr rhoU; - if (rhoUf.valid()) - { - rhoU = new volVectorField("rhoU", rho*U); - } - } - end = std::clock(); - time_monitor_other += double(end - start) / double(CLOCKS_PER_SEC); - - start = std::clock(); - if (pimple.firstPimpleIter() && !pimple.simpleRho()) - { - #include "rhoEqn.H" - } - end = std::clock(); - time_monitor_rho += double(end - start) / double(CLOCKS_PER_SEC); - - start = std::clock(); - #include "UEqn.H" - end = std::clock(); - time_monitor_U += double(end - start) / double(CLOCKS_PER_SEC); - - if(combModelName!="ESF" && combModelName!="flareFGM" && combModelName!="DeePFGM") - { - start = std::clock(); - #include "YEqn.H" - end = std::clock(); - time_monitor_Y += double(end - start) / double(CLOCKS_PER_SEC); - - start = std::clock(); - #include "EEqn.H" - end = std::clock(); - time_monitor_E += double(end - start) / double(CLOCKS_PER_SEC); - - start = std::clock(); - chemistry->correctThermo(); - end = std::clock(); - time_monitor_chemistry_correctThermo += double(end - start) / double(CLOCKS_PER_SEC); - } - else - { - combustion->correct(); - } - - Info<< "min/max(T) = " << min(T).value() << ", " << max(T).value() << endl; - - // --- Pressure corrector loop - - start = std::clock(); - while (pimple.correct()) - { - if (pimple.consistent()) - { - // #include "pcEqn.H" - } - else - { - #include "pEqn.H" - } - } - end = std::clock(); - time_monitor_p += double(end - start) / double(CLOCKS_PER_SEC); - - start = std::clock(); - if (pimple.turbCorr()) - { - turbulence->correct(); - } - end = std::clock(); - time_monitor_turbulence_correct += double(end - start) / double(CLOCKS_PER_SEC); - } - clock_t loop_end = std::clock(); - double loop_time = double(loop_end - loop_start) / double(CLOCKS_PER_SEC); - - rho = thermo.rho(); - - dfDataBase.postTimeStep(); - - runTime.write(); - Info<< "========Time Spent in diffenet parts========"<< endl; - Info<< "loop Time = " << loop_time << " s" << endl; - Info<< "other Time = " << time_monitor_other << " s" 
<< endl; - Info<< "rho Equations = " << time_monitor_rho << " s" << endl; - Info<< "U Equations = " << time_monitor_U << " s" << endl; - Info<< "Y Equations = " << time_monitor_Y - time_monitor_chem << " s" << endl; - Info<< "E Equations = " << time_monitor_E << " s" << endl; - Info<< "p Equations = " << time_monitor_p << " s" << endl; - Info<< "chemistry correctThermo = " << time_monitor_chemistry_correctThermo << " s" << endl; - Info<< "turbulence correct = " << time_monitor_turbulence_correct << " s" << endl; - Info<< "combustion correct(in Y) = " << time_monitor_chem << " s" << endl; - Info<< "percentage of chemistry = " << time_monitor_chem / loop_time * 100 << " %" << endl; - Info<< "percentage of rho/U/Y/E = " << (time_monitor_E + time_monitor_Y + time_monitor_U + time_monitor_rho - time_monitor_chem) / loop_time * 100 << " %" << endl; - - - Info<< "========Time details of each equation======="<< endl; - - Info<< "rhoEqn Time = " << time_monitor_rhoEqn << " s" << endl; - Info<< "rhoEqn assamble = " << time_monitor_rhoEqn_mtxAssembly << " s" << endl; - Info<< "rhoEqn assamble(CPU prepare) = " << time_monitor_rhoEqn_mtxAssembly_CPU_prepare << " s" << endl; - Info<< "rhoEqn assamble(GPU run) = " << time_monitor_rhoEqn_mtxAssembly_GPU_run << " s" << endl; - Info<< "rhoEqn solve = " << time_monitor_rhoEqn_solve << " s" << endl; - Info<< "rhoEqn correct boundary = " << time_monitor_rhoEqn_correctBC << " s" << endl; - - Info<< "UEqn Time = " << time_monitor_UEqn << " s" << endl; - Info<< "UEqn assamble = " << time_monitor_UEqn_mtxAssembly << " s" << endl; - Info<< "UEqn assamble(CPU prepare) = " << time_monitor_UEqn_mtxAssembly_CPU_prepare << " s" << endl; - Info<< "UEqn assamble(GPU run) = " << time_monitor_UEqn_mtxAssembly_GPU_run << " s" << endl; - Info<< "UEqn solve = " << time_monitor_UEqn_solve << " s" << endl; - Info<< "UEqn correct boundary = " << time_monitor_UEqn_correctBC << " s" << endl; - Info<< "UEqn H = " << time_monitor_UEqn_H << " s" << endl; - 
Info<< "UEqn H(GPU run) = " << time_monitor_UEqn_H_GPU_run << " s" << endl; - Info<< "UEqn H(correct boundary) = " << time_monitor_UEqn_H_correctBC << " s" << endl; - Info<< "UEqn A = " << time_monitor_UEqn_A << " s" << endl; - Info<< "UEqn A(GPU run) = " << time_monitor_UEqn_A_GPU_run << " s" << endl; - Info<< "UEqn A(correct boundary) = " << time_monitor_UEqn_A_correctBC << " s" << endl; - - Info<< "YEqn Time = " << time_monitor_YEqn << " s" << endl; - Info<< "YEqn assamble = " << time_monitor_YEqn_mtxAssembly << " s" << endl; - Info<< "YEqn assamble(CPU prepare) = " << time_monitor_YEqn_mtxAssembly_CPU_prepare << " s" << endl; - Info<< "YEqn assamble(GPU run) = " << time_monitor_YEqn_mtxAssembly_GPU_run << " s" << endl; - Info<< "YEqn solve = " << time_monitor_YEqn_solve << " s" << endl; - Info<< "YEqn correct boundary = " << time_monitor_YEqn_correctBC << " s" << endl; - - Info<< "EEqn Time = " << time_monitor_EEqn << " s" << endl; - Info<< "EEqn assamble = " << time_monitor_EEqn_mtxAssembly << " s" << endl; - Info<< "EEqn assamble(CPU prepare) = " << time_monitor_EEqn_mtxAssembly_CPU_prepare << " s" << endl; - Info<< "EEqn assamble(GPU prepare) = " << time_monitor_EEqn_mtxAssembly_GPU_prepare << " s" << endl; - Info<< "EEqn assamble(GPU run) = " << time_monitor_EEqn_mtxAssembly_GPU_run << " s" << endl; - Info<< "EEqn solve = " << time_monitor_EEqn_solve << " s" << endl; - Info<< "EEqn correct boundary = " << time_monitor_EEqn_correctBC << " s" << endl; - - Info<< "pEqn Time = " << time_monitor_pEqn << " s" << endl; - Info<< "pEqn Time solve = " << time_monitor_pEqn_solve << " s" << endl; - - Info<< "============================================"<. - -Global - rhoEqn - -Description - Solve the continuity for density. 
- -\*---------------------------------------------------------------------------*/ -#ifdef GPUSolver_ -{ - start1 = std::clock(); - rho.oldTime(); - - int offset = 0; - forAll(U.boundaryField(), patchi) - { - const fvsPatchScalarField& patchFlux = phi.boundaryField()[patchi]; - int patchSize = patchFlux.size(); - memcpy(boundary_phi_init+offset, &patchFlux[0], patchSize*sizeof(double)); - offset += patchSize; - } - end1 = std::clock(); - time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_rhoEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_rhoEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); - - start1 = std::clock(); - rhoEqn_GPU.initializeTimeStep(); - rhoEqn_GPU.fvc_div(&phi[0], boundary_phi_init); - rhoEqn_GPU.fvm_ddt(&rho.oldTime()[0]); - rhoEqn_GPU.sync(); - end1 = std::clock(); - time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_rhoEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_rhoEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC); - - start1 = std::clock(); - rhoEqn_GPU.updatePsi(&rho.primitiveFieldRef()[0]); - rho.correctBoundaryConditions(); - end1 = std::clock(); - time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_rhoEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC); -} -#else -{ - start1 = std::clock(); - fvScalarMatrix rhoEqn - ( - fvm::ddt(rho) - + fvc::div(phi) - ); - end1 = std::clock(); - time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_rhoEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - - start1 = std::clock(); - rhoEqn.solve(); - end1 = std::clock(); - time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); - time_monitor_rhoEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC); -} -#endif - -// 
************************************************************************* // diff --git a/applications/solvers/dfLowMachFoam_new/setRDeltaT.H b/applications/solvers/dfLowMachFoam_new/setRDeltaT.H deleted file mode 100644 index 074d05e3d..000000000 --- a/applications/solvers/dfLowMachFoam_new/setRDeltaT.H +++ /dev/null @@ -1,85 +0,0 @@ -{ - volScalarField& rDeltaT = trDeltaT.ref(); - - const dictionary& pimpleDict = pimple.dict(); - - scalar maxCo - ( - pimpleDict.lookupOrDefault("maxCo", 0.8) - ); - - scalar rDeltaTSmoothingCoeff - ( - pimpleDict.lookupOrDefault("rDeltaTSmoothingCoeff", 0.02) - ); - - scalar rDeltaTDampingCoeff - ( - pimpleDict.lookupOrDefault("rDeltaTDampingCoeff", 1.0) - ); - - scalar maxDeltaT - ( - pimpleDict.lookupOrDefault("maxDeltaT", great) - ); - - volScalarField rDeltaT0("rDeltaT0", rDeltaT); - - // Set the reciprocal time-step from the local Courant number - rDeltaT.ref() = max - ( - 1/dimensionedScalar(dimTime, maxDeltaT), - fvc::surfaceSum(mag(phi))()() - /((2*maxCo)*mesh.V()*rho()) - ); - - if (pimple.transonic()) - { - surfaceScalarField phid - ( - "phid", - fvc::interpolate(psi)*fvc::flux(U) - ); - - rDeltaT.ref() = max - ( - rDeltaT(), - fvc::surfaceSum(mag(phid))()() - /((2*maxCo)*mesh.V()*psi()) - ); - } - - // Update tho boundary values of the reciprocal time-step - rDeltaT.correctBoundaryConditions(); - - Info<< "Flow time scale min/max = " - << gMin(1/rDeltaT.primitiveField()) - << ", " << gMax(1/rDeltaT.primitiveField()) << endl; - - if (rDeltaTSmoothingCoeff < 1.0) - { - fvc::smooth(rDeltaT, rDeltaTSmoothingCoeff); - } - - Info<< "Smoothed flow time scale min/max = " - << gMin(1/rDeltaT.primitiveField()) - << ", " << gMax(1/rDeltaT.primitiveField()) << endl; - - // Limit rate of change of time scale - // - reduce as much as required - // - only increase at a fraction of old time scale - if - ( - rDeltaTDampingCoeff < 1.0 - && runTime.timeIndex() > runTime.startTimeIndex() + 1 - ) - { - rDeltaT = - rDeltaT0 - 
*max(rDeltaT/rDeltaT0, scalar(1) - rDeltaTDampingCoeff); - - Info<< "Damped flow time scale min/max = " - << gMin(1/rDeltaT.primitiveField()) - << ", " << gMax(1/rDeltaT.primitiveField()) << endl; - } -} diff --git a/applications/solvers/dfLowMachFoam_new/setRootCase2.H b/applications/solvers/dfLowMachFoam_new/setRootCase2.H deleted file mode 100644 index 45d966e63..000000000 --- a/applications/solvers/dfLowMachFoam_new/setRootCase2.H +++ /dev/null @@ -1,5 +0,0 @@ -Foam::argList args(argc,argv,true,true,/*initialise=*/false); -if (!args.checkRootCase()) -{ - Foam::FatalError.exit(); -} \ No newline at end of file