Skip to content
Merged

Gpu #307

Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
62 commits
Select commit Hold shift + click to select a range
4f4ca6d
add the validated cuda kernels for ueqn
STwangyingrui Apr 4, 2023
b026103
Merge pull request #239 from STwangyingrui/GPU
maorz1998 Apr 4, 2023
ecf8d1d
introduce new class dfMatrix and construct ueqn matrix with csr format
maorz1998 Apr 5, 2023
5aefe64
modify dfLowMachFoam.C & dfMatrix.cu
maorz1998 Apr 5, 2023
bd3e506
Merge pull request #240 from maorz1998/GPU
maorz1998 Apr 5, 2023
6255add
use amgx to solve dfMatrix
maorz1998 Apr 8, 2023
e985539
fix bugs and perform preliminary optimization
maorz1998 Apr 9, 2023
0ffd0f2
use GPU to accelerate permutation
maorz1998 Apr 9, 2023
8035a15
Merge pull request #244 from maorz1998/GPU
maorz1998 Apr 9, 2023
6161ba8
recent modification
maorz1998 May 4, 2023
e06acc7
Merge pull request #253 from maorz1998/GPU
maorz1998 May 4, 2023
7728d82
initial commit of turbulence->divDevRhoReff(U)
maorz1998 May 10, 2023
e533177
Merge pull request #255 from maorz1998/GPU
maorz1998 May 10, 2023
8554ef9
fix bugs in constructing turbulence->divDevRhoReff(U)
maorz1998 May 17, 2023
81486b4
clean version
maorz1998 May 18, 2023
f231592
fix a bug
maorz1998 May 18, 2023
656691f
conditional compilation
maorz1998 May 19, 2023
f5243bf
preliminary refactor
maorz1998 May 24, 2023
fdb4d38
fix conflicts
maorz1998 May 24, 2023
894300a
Merge pull request #272 from maorz1998/GPU
maorz1998 May 24, 2023
d125f38
rhoEqn & fix bugs in UEqn
maorz1998 May 29, 2023
e8e4caa
Merge pull request #277 from maorz1998/GPU
maorz1998 May 29, 2023
a3e1a1d
preliminary GPU version of YEqn
maorz1998 Jun 2, 2023
5cd1e53
clean version
maorz1998 Jun 2, 2023
3b5fbd3
Merge pull request #281 from maorz1998/GPU
maorz1998 Jun 2, 2023
7ee80e5
implement eeqn, fvm_div debugging
STwangyingrui Jun 2, 2023
e847450
fix bug in fvc_grad_internal_face
STwangyingrui Jun 5, 2023
56f23a3
fix EEqn_GPU
STwangyingrui Jun 5, 2023
9aa40f8
Merge pull request #283 from STwangyingrui/yr/GPU-dev
maorz1998 Jun 5, 2023
3ea932d
GPU version of UEqn.H and UEqn.A
maorz1998 Jun 6, 2023
7775dd9
upwind scheme
maorz1998 Jun 6, 2023
753e85c
Merge branch 'GPU' of github.com:maorz1998/deepflame-dev into GPU
maorz1998 Jun 6, 2023
89fcd96
Merge pull request #284 from maorz1998/GPU
maorz1998 Jun 6, 2023
fbc6754
remove redundant data transfer between equations and construct bounda…
maorz1998 Jun 7, 2023
b295447
Merge pull request #286 from maorz1998/GPU
maorz1998 Jun 7, 2023
f827b9d
add GPU version of hDiffCorrFlux and diffAlphaD in YEqn, merge branch…
STwangyingrui Jun 8, 2023
b158080
refactors: same stream cross equations, initializeTimeStep for each e…
STwangyingrui Jun 8, 2023
9316c68
refactor YEqn: minimize memcpy for species of rhoD and hai
STwangyingrui Jun 8, 2023
5a0918b
refactor profile, add permute_psi for UEqn, add sync for each equation
STwangyingrui Jun 9, 2023
4f97772
move Yt to GPU
STwangyingrui Jun 10, 2023
1929947
small refactor of time profiling
STwangyingrui Jun 11, 2023
f0dcdce
Merge pull request #289 from STwangyingrui/yr/GPU-dev
maorz1998 Jun 11, 2023
45015c3
primary implementation of correctBoundaryConditions on GPU
maorz1998 Jun 11, 2023
35be654
Merge pull request #291 from maorz1998/GPU
maorz1998 Jun 11, 2023
81e8603
optimize yeqn: merge species
STwangyingrui Jun 12, 2023
4943b42
Merge pull request #292 from STwangyingrui/yr/GPU-dev
maorz1998 Jun 12, 2023
fe02927
eliminate redundant operations when using UnityLewis to enhance perfo…
maorz1998 Jun 13, 2023
3aefd4b
Merge pull request #294 from maorz1998/GPU
maorz1998 Jun 13, 2023
4fbc6f8
add fixedValue boundary, debugging
maorz1998 Jun 26, 2023
9d800f9
Merge branch 'GPU' of github.com:maorz1998/deepflame-dev into GPU
maorz1998 Jun 26, 2023
83bbbf6
Merge pull request #301 from maorz1998/GPU
maorz1998 Jun 26, 2023
bfff1f7
fix bugs in fixedValue BC
maorz1998 Jun 28, 2023
36c4d31
clean version
maorz1998 Jun 28, 2023
425721d
Merge pull request #305 from maorz1998/GPU
maorz1998 Jun 28, 2023
c08e462
update configure and installation for GPU solver
maorz1998 Jun 28, 2023
e419f71
modify docs
maorz1998 Jun 28, 2023
416b4a5
modify docs
maorz1998 Jun 28, 2023
6c53c42
Update install.rst
maorz1998 Jun 28, 2023
0d54b4b
Merge pull request #306 from maorz1998/GPU
maorz1998 Jun 28, 2023
9e95820
Merge branch 'master' into GPU
maorz1998 Jun 28, 2023
ffc20a6
Update dfChemistryModel.C
maorz1998 Jun 29, 2023
420afda
Update UEqn.H
maorz1998 Jun 29, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,6 @@ __pycache__/
lib/
bin/
.vscode/
result/
*result*
*profile*
1 change: 1 addition & 0 deletions Allwclean
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@ wclean ./applications/solvers/dfHighSpeedFoam
rm -rf src_orig/
rm -rf bin/
rm -rf lib/
rm -rf src_gpu/build
10 changes: 10 additions & 0 deletions applications/solvers/dfLowMachFoam/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
cmake_minimum_required(VERSION 3.5)
project(dfLowMachFoam LANGUAGES CXX)
FIND_PACKAGE(MPI REQUIRED)
FIND_PACKAGE(OpenMP REQUIRED)
FIND_PACKAGE(CUDA REQUIRED)

# Check valid thirdParty
if(DEFINED ENV{WM_PROJECT_DIR})
Expand All @@ -26,6 +28,8 @@ SET(SRC_ORIG $ENV{SRC_ORIG})

# set compilation options
SET(CMAKE_EXE_LINKER_FLAGS "-fuse-ld=bfd -Xlinker --add-needed -Xlinker --no-as-needed")
# Append the OpenMP flags to the existing compiler flags.
# The value must be quoted: an unquoted SET(VAR a b) stores the list "a;b",
# and the ';' then ends up on the compiler command line whenever the
# original flags variable is not empty.
SET (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")

SET(CMAKE_C_COMPILER g++)
SET(PATH_LIB_OPENMPI "openmpi-system") # Foundation version
Expand Down Expand Up @@ -83,6 +87,9 @@ include_directories(
${CANTERA_ROOT}/include
${MPI_INCLUDE_PATH}
${PROJECT_SOURCE_DIR}
${CUDA_INCLUDE_DIRS}
# AmgX headers and the GPU matrix sources are located through the AMGX_DIR
# and DF_ROOT environment variables rather than through absolute paths that
# only exist on one developer's machine.
$ENV{AMGX_DIR}/include
$ENV{DF_ROOT}/src_gpu
)

# add execution
Expand All @@ -98,6 +105,9 @@ target_link_libraries(${PROJECT_NAME}
${DF_ROOT}/lib/libdfCombustionModels.so
$ENV{FOAM_LIBBIN}/openmpi-system/libPstream.so
${MPI_LIBRARIES}
${CUDA_LIBRARIES}
# Link AmgX and libdfMatrix relative to AMGX_DIR / DF_ROOT instead of an
# absolute path inside a specific user's home directory.
$ENV{AMGX_DIR}/build/libamgxsh.so
${DF_ROOT}/src_gpu/build/libdfMatrix.so
)

if(DEFINED ENV{PYTHON_INC_DIR})
Expand Down
120 changes: 116 additions & 4 deletions applications/solvers/dfLowMachFoam/EEqn.H
Original file line number Diff line number Diff line change
@@ -1,8 +1,113 @@
{
volScalarField& he = thermo.he();
#ifdef GPUSolver_
// Finish the momentum step before the energy equation is assembled.
// The elapsed time of this section is charged to the UEqn counters
// (time_monitor_UEqn / time_monitor_UEqn_correctBC), not to EEqn.
// Note: std::clock() reports processor time, not wall-clock time.
start1 = std::clock();
// Hand the raw storage of U to the GPU matrix object
// (presumably the GPU solution is copied back into U here - TODO confirm).
UEqn_GPU.updatePsi(&U[0][0]);
// Boundary conditions are applied on the GPU object and on the CPU field.
UEqn_GPU.correctBoundaryConditions();
U.correctBoundaryConditions();
// Kinetic energy is recomputed from the current U.
K = 0.5*magSqr(U);
end1 = std::clock();
time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_UEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC);

// prepare data on CPU
// Packs per-patch boundary data (K and a hand-built he gradient) into the
// flat buffers boundary_K / boundary_gradient, patch after patch, and
// accumulates the elapsed time into the EEqn CPU-prepare counters.
start1 = std::clock();
// start2/end2 bracket only the alphaEff construction, which is currently
// disabled (see the commented-out lines), so the reported time is ~0.
start2 = std::clock();
// const tmp<volScalarField> alphaEff_tmp(thermo.alpha());
// const volScalarField& alphaEff = alphaEff_tmp();
// alphaEff is left as a null pointer here and is later passed on as-is.
double *alphaEff = nullptr; // tmp
end2 = std::clock();
// Running write position (in elements) inside the flat boundary buffers.
int eeqn_offset = 0;
// Number of patches visited; only used in the diagnostic print below.
int patchNum = 0;

forAll(he.boundaryField(), patchi)
{
patchNum++;
// The patch size is taken from the interpolation-weights boundary field.
const fvsPatchScalarField& pw = mesh.surfaceInterpolation::weights().boundaryField()[patchi];
int patchSize = pw.size();

// construct gradient manually
const fvPatchScalarField& hew = he.boundaryField()[patchi];
const basicThermo& bThermo = basicThermo::lookupThermo(hew);
const scalarField& ppw = bThermo.p().boundaryField()[patchi];
// const_cast is needed because the patch temperature is evaluated in place below.
fvPatchScalarField& Tw =
const_cast<fvPatchScalarField&>(bThermo.T().boundaryField()[patchi]);
// Tw_v is not used anywhere else in this block.
scalarField& Tw_v = Tw;

Tw.evaluate();
const scalarField& patchDeltaCoeff = mesh.boundary()[patchi].deltaCoeffs();
// heInternal is evaluated with the patch index, heBoundary with the patch's
// faceCells list.
// NOTE(review): the two names look swapped relative to what the calls
// select (patch vs. adjacent cells) - confirm; the difference below is
// unaffected by the naming.
const scalarField heInternal = bThermo.he(ppw, Tw, patchi)();
const scalarField heBoundary = bThermo.he(ppw, Tw, mesh.boundary()[patchi].faceCells())();
// Gradient = deltaCoeffs * (he from the patchi call - he from the faceCells call).
const scalarField patchGradMau = patchDeltaCoeff * (heInternal - heBoundary);

const scalarField& patchK = K.boundaryField()[patchi];
// const scalarField& patchAlphaEff = alphaEff.boundaryField()[patchi]; // not H2Dcopy when use UnityLewis
// const scalarField& patchGrad = he.boundaryField()[patchi].gradientBoundaryCoeffs(); // gradient_

// const DimensionedField<scalar, volMesh>& patchHa_ = he.boundaryField()[patchi];
// const gradientEnergyFvPatchScalarField patchHa(mesh.boundary()[patchi], patchHa_);
// const scalarField& patchGrad = patchHa.gradient(); // gradient_
// Copy this patch's values behind those of the previous patches.
// The byte count uses sizeof(double), i.e. it assumes the field scalar
// type is double - TODO confirm for the build in use.
memcpy(boundary_K + eeqn_offset, &patchK[0], patchSize*sizeof(double));
// boundary_alphaEff is not filled in this block (copy disabled below).
// memcpy(boundary_alphaEff + eeqn_offset, &patchAlphaEff[0], patchSize*sizeof(double));
memcpy(boundary_gradient + eeqn_offset, &patchGradMau[0], patchSize*sizeof(double));

eeqn_offset += patchSize;
}
end1 = std::clock();
time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_EEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);
// Diagnostic output on stderr, emitted every time this block runs.
// The first value is the accumulated counter, not the time of this pass.
fprintf(stderr, "time_monitor_EEqn_mtxAssembly_CPU_prepare: %lf, build alphaEff time: %lf, patchNum: %d\n",
time_monitor_EEqn_mtxAssembly_CPU_prepare,
double(end2 - start2) / double(CLOCKS_PER_SEC), patchNum);

// prepare data on GPU
// Hands the CPU-side field and boundary pointers to EEqn_GPU and adds the
// elapsed time to the EEqn GPU-prepare counters.
start1 = std::clock();
// oldTime() is called for he and K before their old-time storage is
// addressed below (presumably to make sure the old-time fields exist - TODO confirm).
he.oldTime();
K.oldTime();
// alphaEff is passed as received from the CPU-prepare step; boundary_K,
// boundary_alphaEff and boundary_gradient are the flat per-patch buffers.
EEqn_GPU.prepare_data(&he.oldTime()[0], &K[0], &K.oldTime()[0], alphaEff,
&dpdt[0], boundary_K, boundary_alphaEff, boundary_gradient);
// sync() is called before the timer is read
// (presumably so pending GPU work is included in the measurement - TODO confirm).
EEqn_GPU.sync();
end1 = std::clock();
time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_EEqn_mtxAssembly_GPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);

// Assemble the energy equation on the GPU and add the elapsed time to the
// EEqn GPU-run counters.
start1 = std::clock();
EEqn_GPU.initializeTimeStep();
// The discretisation steps are issued in this fixed order
// (presumably each call accumulates one operator's contribution - TODO confirm in dfMatrix).
EEqn_GPU.fvm_ddt();
EEqn_GPU.fvm_div();
EEqn_GPU.fvm_laplacian();
EEqn_GPU.fvc_ddt();
EEqn_GPU.fvc_div_phi_scalar();
EEqn_GPU.fvc_div_vector();
EEqn_GPU.add_to_source();
// sync() precedes the timer read, as in the other GPU sections.
EEqn_GPU.sync();
end1 = std::clock();
time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_EEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC);

// check value of mtxAssembly, no time monitor
// EEqn_GPU.checkValue(true);

// Solve the assembled energy equation via EEqn_GPU and add the elapsed
// time to the EEqn solve counters.
// NOTE(review): no explicit sync() is issued before the timer is read here,
// unlike the assembly sections - confirm that solve() itself blocks.
start1 = std::clock();
EEqn_GPU.solve();
end1 = std::clock();
time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_EEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);

// Bring the result back into he, apply its boundary conditions, and add
// the elapsed time to the EEqn correctBC counters.
start1 = std::clock();
// The raw storage of he is handed to the GPU object
// (presumably the solution is copied into it - TODO confirm).
EEqn_GPU.updatePsi(&he[0]);
he.correctBoundaryConditions();
// NOTE(review): he.write() is called every time this block runs and its
// cost is included in the correctBC timing; looks like a debugging
// leftover - confirm whether output should be left to the regular write step.
he.write();
end1 = std::clock();
time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_EEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC);
#else
start1 = std::clock();
fvScalarMatrix EEqn
(
(

fvm::ddt(rho, he) + mvConvection->fvmDiv(phi, he)
+ fvc::ddt(rho, K) + fvc::div(phi, K)
Expand All @@ -22,8 +127,15 @@
)
)
);
end1 = std::clock();
time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);

EEqn.relax();

EEqn.solve("ha");
EEqn.relax();
start1 = std::clock();
EEqn.solve("ha");
end1 = std::clock();
time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_EEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
#endif
}
14 changes: 12 additions & 2 deletions applications/solvers/dfLowMachFoam/Make/options
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
-include $(GENERAL_RULES)/mplibType

EXE_INC = -std=c++14 \
-g \
-fopenmp \
-Wno-unused-variable \
-Wno-unused-but-set-variable \
-Wno-old-style-cast \
$(PFLAGS) $(PINC) \
$(if $(LIBTORCH_ROOT),-DUSE_LIBTORCH,) \
$(if $(PYTHON_INC_DIR),-DUSE_PYTORCH,) \
$(if $(AMGX_DIR),-DGPUSolver_,) \
-I$(LIB_SRC)/transportModels/compressible/lnInclude \
-I$(LIB_SRC)/thermophysicalModels/basic/lnInclude \
-I$(LIB_SRC)/TurbulenceModels/turbulenceModels/lnInclude \
Expand All @@ -23,7 +26,10 @@ EXE_INC = -std=c++14 \
-I$(CANTERA_ROOT)/include \
$(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include,) \
$(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include/torch/csrc/api/include,) \
$(PYTHON_INC_DIR)
$(PYTHON_INC_DIR) \
$(if $(AMGX_DIR), -I$(DF_ROOT)/src_gpu,) \
$(if $(AMGX_DIR), -I/usr/local/cuda-11.6/include,) \
$(if $(AMGX_DIR), -I$(AMGX_DIR)/include,)

EXE_LIBS = \
-lcompressibleTransportModels \
Expand All @@ -44,4 +50,8 @@ EXE_LIBS = \
$(if $(LIBTORCH_ROOT),-lpthread,) \
$(if $(LIBTORCH_ROOT),$(DF_SRC)/dfChemistryModel/DNNInferencer/build/libDNNInferencer.so,) \
$(if $(PYTHON_LIB_DIR),-L$(PYTHON_LIB_DIR),) \
$(if $(PYTHON_LIB_DIR),-lpython3.8,)
$(if $(PYTHON_LIB_DIR),-lpython3.8,) \
$(if $(AMGX_DIR), /usr/local/cuda-11.6/lib64/libcudart.so,) \
$(if $(AMGX_DIR), $(DF_ROOT)/src_gpu/build/libdfMatrix.so,) \
$(if $(AMGX_DIR), $(AMGX_DIR)/build/libamgxsh.so,)

139 changes: 127 additions & 12 deletions applications/solvers/dfLowMachFoam/UEqn.H
Original file line number Diff line number Diff line change
@@ -1,17 +1,132 @@
// Solve the Momentum equation
#ifdef GPUSolver_
start1 = std::clock();
int offset = 0;
const tmp<volScalarField> nuEff_tmp(turbulence->nuEff());
const volScalarField& nuEff = nuEff_tmp();
forAll(U.boundaryField(), patchi)
{
const scalarField& patchP = p.boundaryField()[patchi];
const vectorField& patchU = U.boundaryField()[patchi];
const scalarField& patchRho = rho.boundaryField()[patchi];
const scalarField& patchNuEff = nuEff.boundaryField()[patchi];

tmp<fvVectorMatrix> tUEqn
(
fvm::ddt(rho, U) + fvm::div(phi, U)
+ turbulence->divDevRhoReff(U)
);
fvVectorMatrix& UEqn = tUEqn.ref();
int patchSize = patchP.size();

UEqn.relax();
// boundary pressure
memcpy(boundary_pressure_init+offset, &patchP[0], patchSize*sizeof(double));
// boundary velocity
memcpy(boundary_velocity_init+3*offset, &patchU[0][0], 3*patchSize*sizeof(double));
// boundary nuEff
memcpy(boundary_nuEff_init+offset, &patchNuEff[0], patchSize*sizeof(double));
// boundary rho
memcpy(boundary_rho_init+offset, &patchRho[0], patchSize*sizeof(double));
offset += patchSize;
}
end1 = std::clock();
time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_UEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);

if (pimple.momentumPredictor())
{
solve(UEqn == -fvc::grad(p));
start1 = std::clock();
UEqn_GPU.initializeTimeStep();
U.oldTime();
UEqn_GPU.fvm_ddt(&U.oldTime()[0][0]);
UEqn_GPU.fvm_div(boundary_pressure_init, boundary_velocity_init, boundary_nuEff_init, boundary_rho_init);
UEqn_GPU.fvc_grad(&p[0]);
UEqn_GPU.fvc_grad_vector();
UEqn_GPU.dev2T();
UEqn_GPU.fvc_div_tensor(&nuEff[0]);
UEqn_GPU.fvm_laplacian();
UEqn_GPU.sync();
end1 = std::clock();
time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_UEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC);

// start2 = std::clock();
// fvVectorMatrix turb_source
// (
// turbulence->divDevRhoReff(U)
// );
// end2 = std::clock();
// time_monitor_CPU += double(end2 - start2) / double(CLOCKS_PER_SEC);

// UEqn_GPU.add_fvMatrix(&turb_source.lower()[0], &turb_source.diag()[0], &turb_source.upper()[0], &turb_source.source()[0][0]);
// end1 = std::clock();
// time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
// time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);

// check value
// U.oldTime();
// tmp<fvVectorMatrix> tUEqn
// (
// fvm::ddt(rho, U)
// +
// fvm::div(phi, U)
// +
// turbulence->divDevRhoReff(U)
// == -fvc::grad(p)
// );
// fvVectorMatrix& UEqn = tUEqn.ref();
// printf("b_cpu = %e\n", UEqn.source()[1][1]);
// forAll(U.boundaryField(), patchi){
// labelUList sub_boundary = mesh.boundary()[patchi].faceCells();
// forAll(sub_boundary, i){
// if (sub_boundary[i] == 1){
// printf("b_cpu_bou = %e\n", UEqn.boundaryCoeffs()[patchi][i][1]);
// printf("patchi = %d, i = %d\n", patchi, i);
// }
// }
// }
// if (pimple.momentumPredictor())
// {
// solve(UEqn);
// Info << "U_CPU\n" << U << endl;
// K = 0.5*magSqr(U);
// }
// UEqn_GPU.checkValue(true);
#else
start1 = std::clock();
tmp<fvVectorMatrix> tUEqn
(
fvm::ddt(rho, U) + fvm::div(phi, U)
+ turbulence->divDevRhoReff(U)
== -fvc::grad(p)
);
fvVectorMatrix& UEqn = tUEqn.ref();

end1 = std::clock();
time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);

UEqn.relax();
start1 = std::clock();
if (pimple.momentumPredictor())
{
solve(UEqn);

K = 0.5*magSqr(U);
}
end1 = std::clock();
time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
#endif

// start1 = std::clock();
// // // std::thread t(&dfMatrix::solve, &UEqn_GPU);
// UEqn_GPU.solve();
// end1 = std::clock();
// time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
// time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);

// start1 = std::clock();
// // // t.join();
// // UEqn_GPU.updatePsi(&U[0][0]);
// K = 0.5*magSqr(U);
// end1 = std::clock();
// time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
// time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
// time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC);
// // Info << "U_amgx = " << U << endl;

K = 0.5*magSqr(U);
}
Loading