Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 3 additions & 21 deletions applications/solvers/dfLowMachFoam/UEqn.H
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ end1 = std::clock();

#ifdef GPUSolver_
start1 = std::clock();
UEqn_GPU.fvm_ddt(&rho.oldTime()[0], &rho[0], &U.oldTime()[0][0]);
UEqn_GPU.fvm_ddt(&U.oldTime()[0][0]);
start2 = std::clock();
int offset = 0;
const tmp<volScalarField> nuEff_tmp(turbulence->nuEff());
Expand Down Expand Up @@ -69,24 +69,6 @@ end1 = std::clock();
memcpy(boundary_nuEff_init+offset, &patchNuEff[0], patchSize*sizeof(double));
// boundary rho
memcpy(boundary_rho_init+offset, &patchRho[0], patchSize*sizeof(double));

// // only need to construct once
// std::copy(&ueqn_internalCoeffs_vec[0][0], &ueqn_internalCoeffs_vec[0][0]+3*patchSize, ueqn_internalCoeffs_init + 3*offset);
// // need to construct every time step
// std::copy(&ueqn_boundaryCoeffs_vec[0][0], &ueqn_boundaryCoeffs_vec[0][0]+3*patchSize, ueqn_boundaryCoeffs_init + 3*offset);
// // laplacian internalCoeffs
// std::copy(&ueqn_laplac_internalCoeffs_vec[0][0], &ueqn_laplac_internalCoeffs_vec[0][0]+3*patchSize, ueqn_laplac_internalCoeffs_init + 3*offset);
// // laplacian boundaryCoeffs
// std::copy(&ueqn_laplac_boundaryCoeffs_vec[0][0], &ueqn_laplac_boundaryCoeffs_vec[0][0]+3*patchSize, ueqn_laplac_boundaryCoeffs_init + 3*offset);
// // boundary pressure
// std::copy(&patchP[0], &patchP[0]+patchSize, boundary_pressure_init+offset);
// // boundary velocity
// std::copy(&patchU[0][0], &patchU[0][0]+3*patchSize, boundary_velocity_init+3*offset);
// // boundary nuEff
// std::copy(&patchNuEff[0], &patchNuEff[0]+patchSize, boundary_nuEff_init+offset);
// // boundary rho
// std::copy(&patchRho[0], &patchRho[0]+patchSize, boundary_rho_init+offset);

offset += patchSize;
}
end1 = std::clock();
Expand All @@ -97,8 +79,8 @@ end1 = std::clock();

start1 = std::clock();
UEqn_GPU.fvm_div(&phi[0], ueqn_internalCoeffs_init, ueqn_boundaryCoeffs_init,
ueqn_laplac_internalCoeffs_init, ueqn_laplac_boundaryCoeffs_init,
boundary_pressure_init, boundary_velocity_init, boundary_nuEff_init, boundary_rho_init);
ueqn_laplac_internalCoeffs_init, ueqn_laplac_boundaryCoeffs_init,
boundary_pressure_init, boundary_velocity_init, boundary_nuEff_init, boundary_rho_init);
UEqn_GPU.fvc_grad(&p[0]);
UEqn_GPU.fvc_grad_vector();
UEqn_GPU.dev2T();
Expand Down
6 changes: 4 additions & 2 deletions applications/solvers/dfLowMachFoam/createdfSolver.H
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,16 @@ dfMatrixDataBase dfDataBase(num_surfaces, num_cells, num_boundary_faces, num_bou
&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.nonOrthDeltaCoeffs()[0], boundary_face_vector_init, boundary_face_init, boundary_deltaCoeffs_init, boundaryCellIndex);

dfUEqn UEqn_GPU(dfDataBase, "dDDI", settingPath);
dfRhoEqn rhoEqn_GPU(dfDataBase);

double *ueqn_internalCoeffs_init, *ueqn_boundaryCoeffs_init, *boundary_pressure_init, *boundary_velocity_init,
*boundary_nuEff_init, *boundary_rho_init, *ueqn_laplac_internalCoeffs_init, *ueqn_laplac_boundaryCoeffs_init;
*boundary_nuEff_init, *boundary_rho_init, *ueqn_laplac_internalCoeffs_init, *ueqn_laplac_boundaryCoeffs_init, *boundary_phi_init;
cudaMallocHost(&ueqn_internalCoeffs_init, 3*num_boundary_faces*sizeof(double));
cudaMallocHost(&ueqn_boundaryCoeffs_init, 3*num_boundary_faces*sizeof(double));
cudaMallocHost(&ueqn_laplac_internalCoeffs_init, 3*num_boundary_faces*sizeof(double));
cudaMallocHost(&ueqn_laplac_boundaryCoeffs_init, 3*num_boundary_faces*sizeof(double));
cudaMallocHost(&boundary_velocity_init, 3*num_boundary_faces*sizeof(double));
cudaMallocHost(&boundary_pressure_init, num_boundary_faces*sizeof(double));
cudaMallocHost(&boundary_nuEff_init, num_boundary_faces*sizeof(double));
cudaMallocHost(&boundary_rho_init, num_boundary_faces*sizeof(double));
cudaMallocHost(&boundary_rho_init, num_boundary_faces*sizeof(double));
cudaMallocHost(&boundary_phi_init, num_boundary_faces*sizeof(double));
1 change: 1 addition & 0 deletions applications/solvers/dfLowMachFoam/dfLowMachFoam.C
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ Description
#include "CombustionModel.H"

#include "dfUEqn.H"
#include "dfRhoEqn.H"
#include <cuda_runtime.h>
#include <thread>

Expand Down
20 changes: 20 additions & 0 deletions applications/solvers/dfLowMachFoam/rhoEqn.H
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ Description

\*---------------------------------------------------------------------------*/

#ifdef CPUSolver_
{
fvScalarMatrix rhoEqn
(
Expand All @@ -38,5 +39,24 @@ Description

rhoEqn.solve();
}
#endif
#ifdef GPUSolver_
{
rho.oldTime();

int offset = 0;
forAll(U.boundaryField(), patchi)
{
const fvsPatchScalarField& patchFlux = phi.boundaryField()[patchi];
int patchSize = patchFlux.size();
memcpy(boundary_phi_init+offset, &patchFlux[0], patchSize*sizeof(double));
offset += patchSize;
}
rhoEqn_GPU.fvc_div(&phi[0], boundary_phi_init);
rhoEqn_GPU.fvm_ddt(&rho.oldTime()[0], &rho[0]);
rhoEqn_GPU.updatePsi(&rho.primitiveFieldRef()[0]);
rho.correctBoundaryConditions();
}
#endif

// ************************************************************************* //
2 changes: 1 addition & 1 deletion src_gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ include_directories(
$ENV{AMGX_DIR}/include
)

add_library(${PROJECT_NAME} SHARED dfUEqn.cu AmgXSolver.cu)
add_library(${PROJECT_NAME} SHARED dfUEqn.cu dfRhoEqn.cu AmgXSolver.cu)

target_link_libraries(${PROJECT_NAME}
${MPI_LIBRARIES}
Expand Down
13 changes: 7 additions & 6 deletions src_gpu/dfMatrixDataBase.H
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ void check(T result, char const *const func, const char *const file,

#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)

void checkVectorEqual(int count, const double* basevec, double* vec, double max_relative_error) {
inline void checkVectorEqual(int count, const double* basevec, double* vec, double max_relative_error) {
for (size_t i = 0; i < count; ++i)
{
double abs_diff = fabs(basevec[i] - vec[i]);
Expand Down Expand Up @@ -110,6 +110,7 @@ struct dfMatrixDataBase
*d_laplac_internal_coeffs_init = nullptr, *d_laplac_boundary_coeffs_init = nullptr,
*d_boundary_pressure = nullptr, *d_boundary_face_vector = nullptr,
*d_boundary_pressure_init = nullptr,
*d_boundary_phi = nullptr, *d_boundary_phi_init = nullptr,
*d_boundary_velocity = nullptr, *d_boundary_velocity_init = nullptr,
*d_boundary_nuEff = nullptr, *d_boundary_nuEff_init = nullptr,
*d_boundary_rho = nullptr, *d_boundary_rho_init = nullptr;
Expand Down Expand Up @@ -167,10 +168,7 @@ struct dfMatrixDataBase
* @brief cuda functions
*/
cudaStream_t stream;
/**
* @brief AmgX functions
*/
AmgXSolver* UxSolver = nullptr, *UySolver = nullptr, *UzSolver = nullptr;

int num_iteration;

double time_monitor_CPU;
Expand All @@ -191,6 +189,7 @@ struct dfMatrixDataBase
{
// allocate field pointer in pin memory
cudaMallocHost(&h_phi_init, num_faces * sizeof(double));
cudaMallocHost(&h_rho_old, num_cells * sizeof(double));

h_weight_vec_init.resize(num_faces);
h_weight_vec.resize(num_faces);
Expand Down Expand Up @@ -478,6 +477,8 @@ struct dfMatrixDataBase

checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure_init, boundary_face_bytes));
checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure, boundary_face_bytes));
checkCudaErrors(cudaMalloc((void**)&d_boundary_phi_init, boundary_face_bytes));
checkCudaErrors(cudaMalloc((void**)&d_boundary_phi, boundary_face_bytes));
checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity_init, boundary_face_vec_bytes));
checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity, boundary_face_vec_bytes));
checkCudaErrors(cudaMalloc((void**)&d_boundary_face_vector, boundary_face_vec_bytes));
Expand All @@ -495,7 +496,7 @@ struct dfMatrixDataBase
checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff_init, boundary_face_bytes));
checkCudaErrors(cudaMalloc((void**)&d_boundary_rho, boundary_face_bytes));
checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_init, boundary_face_bytes));
total_bytes += (boundary_face_bytes*8 + boundary_face_vec_bytes * 11);
total_bytes += (boundary_face_bytes*10 + boundary_face_vec_bytes * 11);

// checkCudaErrors(cudaMalloc((void**)&d_A_csr, csr_value_vec_bytes));
// checkCudaErrors(cudaMalloc((void**)&d_b, cell_vec_bytes));
Expand Down
43 changes: 43 additions & 0 deletions src_gpu/dfRhoEqn.H
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#pragma once
#include "dfMatrixDataBase.H"

/*
fvScalarMatrix rhoEqn
(
fvm::ddt(rho)
+ fvc::div(phi)
);

rhoEqn.solve();
*/

class dfRhoEqn
{
private:
dfMatrixDataBase& dataBase_;
cudaStream_t stream;
int num_iteration;
double time_monitor_CPU;
double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test;

// common variables
int num_cells, cell_bytes, num_surfaces, num_boundary_cells;
int *d_A_csr_row_index, *d_A_csr_diag_index;

// Matrix variables
double *d_b, *d_psi = nullptr;
double *h_b, *h_psi = nullptr;

public:
dfRhoEqn();
dfRhoEqn(dfMatrixDataBase& dataBase);
~dfRhoEqn();

void checkValue(bool print);

void fvc_div(double *phi, double *boundary_phi_init);

void fvm_ddt(double *rho_old, double *rho_new);

void updatePsi(double* Psi);
};
141 changes: 141 additions & 0 deletions src_gpu/dfRhoEqn.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#include "dfRhoEqn.H"

// kernel functions
__global__ void fvc_div_internal_rho(int num_cells, const int *csr_row_index,
const int *csr_diag_index, const int *permedIndex, const double *phi_init,
double *phi_out, const double sign, const double *b_input, double *b_output)
{
int index = blockDim.x * blockIdx.x + threadIdx.x;
if (index >= num_cells)
return;

// A_csr has one more element in each row: itself
int row_index = csr_row_index[index];
int row_elements = csr_row_index[index + 1] - row_index;
int diag_index = csr_diag_index[index];
int neighbor_offset = csr_row_index[index] - index;

double sum = 0;

// lower
for (int i = 0; i < diag_index; i++)
{
int neighbor_index = neighbor_offset + i;
int permute_index = permedIndex[neighbor_index];
double phi = phi_init[permute_index];
phi_out[neighbor_index] = phi;
sum -= phi;
}
// upper
for (int i = diag_index + 1; i < row_elements; i++)
{
int neighbor_index = neighbor_offset + i - 1;
int permute_index = permedIndex[neighbor_index];
double phi = phi_init[permute_index];
phi_out[neighbor_index] = phi;
sum += phi;
}

b_output[index] = b_input[index] + sum * sign;
}

__global__ void fvc_div_boundary_rho(int num_cells, int num_boundary_cells, const int *boundary_cell_offset,
const int *boundary_cell_id, const int *bouPermedIndex, const double *boundary_phi_init,
double *boundary_phi, const double sign, const double *b_input, double *b_output)
{
int index = blockDim.x * blockIdx.x + threadIdx.x;
if (index >= num_boundary_cells)
return;

int cell_offset = boundary_cell_offset[index];
int next_cell_offset = boundary_cell_offset[index + 1];
int cell_index = boundary_cell_id[cell_offset];

double sum = 0;

for (int i = cell_offset; i < next_cell_offset; i++)
{
int permute_index = bouPermedIndex[i];
double phi = boundary_phi_init[permute_index];
boundary_phi[i] = phi;
sum += phi;
}

b_output[cell_index] = b_input[cell_index] + sum * sign;
}

__global__ void fvm_ddt_rho(int num_cells, const double rdelta_t,
const double *rho_old, double *rho_new, const double *volume, const double *b)
{
int index = blockDim.x * blockIdx.x + threadIdx.x;
if (index >= num_cells)
return;

double ddt_diag = rdelta_t * volume[index];
double ddt_source = rdelta_t * rho_old[index] * volume[index];
double source_sum = ddt_source - b[index];

rho_new[index] = source_sum / ddt_diag;
}

// constructor
dfRhoEqn::dfRhoEqn(dfMatrixDataBase &dataBase)
: dataBase_(dataBase)
{
num_cells = dataBase_.num_cells;
cell_bytes = dataBase_.cell_bytes;
num_surfaces = dataBase_.num_surfaces;
num_boundary_cells = dataBase_.num_boundary_cells;

d_A_csr_row_index = dataBase_.d_A_csr_row_index;
d_A_csr_diag_index = dataBase_.d_A_csr_diag_index;

cudaMallocHost(&h_psi, cell_bytes);

checkCudaErrors(cudaMalloc((void **)&d_b, cell_bytes));
checkCudaErrors(cudaMalloc((void **)&d_psi, cell_bytes));

checkCudaErrors(cudaStreamCreate(&stream));
}

void dfRhoEqn::fvc_div(double *phi, double *boundary_phi_init)
{
checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_bytes, stream));
clock_t start = std::clock();
memcpy(dataBase_.h_phi_init, phi, num_surfaces * sizeof(double));

clock_t end = std::clock();
time_monitor_CPU += double(end - start) / double(CLOCKS_PER_SEC);

start = std::clock();
checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi_init, dataBase_.h_phi_init, num_surfaces * sizeof(double), cudaMemcpyHostToDevice, stream));
checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi_init + num_surfaces, dataBase_.d_phi_init, num_surfaces * sizeof(double), cudaMemcpyDeviceToDevice, stream));
checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_phi_init, boundary_phi_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream));
end = std::clock();

size_t threads_per_block = 1024;
size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
fvc_div_internal_rho<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_permedIndex,
dataBase_.d_phi_init, dataBase_.d_phi, 1., d_b, d_b);

blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
fvc_div_boundary_rho<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_cells, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
dataBase_.d_bouPermedIndex, dataBase_.d_boundary_phi_init, dataBase_.d_boundary_phi, 1., d_b, d_b);
}

void dfRhoEqn::fvm_ddt(double *rho_old, double *rho_new)
{
checkCudaErrors(cudaMemcpyAsync(dataBase_.d_rho_old, rho_old, cell_bytes, cudaMemcpyHostToDevice, stream));
size_t threads_per_block = 1024;
size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
fvm_ddt_rho<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, dataBase_.rdelta_t, dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, d_b);
checkCudaErrors(cudaMemcpyAsync(h_psi, dataBase_.d_rho_new, cell_bytes, cudaMemcpyDeviceToHost, stream));
}

void dfRhoEqn::updatePsi(double *Psi)
{
checkCudaErrors(cudaStreamSynchronize(stream));
for (size_t i = 0; i < num_cells; i++)
Psi[i] = h_psi[i];
}
dfRhoEqn::~dfRhoEqn(){}
Loading