diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H index bdef9043f..8e8c0bee0 100644 --- a/applications/solvers/dfLowMachFoam/EEqn.H +++ b/applications/solvers/dfLowMachFoam/EEqn.H @@ -23,13 +23,16 @@ == fvc::div(hDiffCorrFlux) ); + end1 = std::clock(); + time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); - EEqn.relax(); - + // EEqn.relax(); + start1 = std::clock(); EEqn.solve(); - end1 = std::clock(); time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_EEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); #endif #ifdef GPUSolver_ @@ -52,35 +55,22 @@ memcpy(boundary_hDiffCorrFlux + eeqn_offset * 3, &patchhDiffCorrFlux[0][0], 3 * patchSize*sizeof(double)); memcpy(boundary_alphaEff + eeqn_offset, &patchAlphaEff[0], patchSize*sizeof(double)); - Field valueInternalCoeffs = patchFlux*he.boundaryField()[patchi].valueInternalCoeffs(pw); - Field valueBoundaryCoeffs = -patchFlux*he.boundaryField()[patchi].valueBoundaryCoeffs(pw); - Field gradientInternalCoeffs = he.boundaryField()[patchi].gradientInternalCoeffs(); - Field gradientBoundaryCoeffs = he.boundaryField()[patchi].gradientBoundaryCoeffs(); - memcpy(eeqn_valueInternalCoeffs + eeqn_offset, &valueInternalCoeffs[0], patchSize*sizeof(double)); - memcpy(eeqn_valueBoundaryCoeffs + eeqn_offset, &valueBoundaryCoeffs[0], patchSize*sizeof(double)); - memcpy(eeqn_gradientInternalCoeffs + eeqn_offset, &gradientInternalCoeffs[0], patchSize*sizeof(double)); - memcpy(eeqn_gradientBoundaryCoeffs + eeqn_offset, &gradientBoundaryCoeffs[0], patchSize*sizeof(double)); - eeqn_offset += patchSize; } - eeqn_offset = 0; end1 = std::clock(); - time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_EEqn_mtxAssembly_CPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); // prepare data on GPU start1 = std::clock(); EEqn_GPU.prepare_data(&he.oldTime()[0], &K[0], &K.oldTime()[0], &alphaEff[0], &dpdt[0], &diffAlphaD[0], &hDiffCorrFlux[0][0], - boundary_K, boundary_hDiffCorrFlux, boundary_alphaEff, - eeqn_valueInternalCoeffs, eeqn_valueBoundaryCoeffs, - eeqn_gradientInternalCoeffs, eeqn_gradientBoundaryCoeffs); + boundary_K, boundary_hDiffCorrFlux, boundary_alphaEff); if (doSync) EEqn_GPU.sync(); end1 = std::clock(); time_monitor_EEqn_mtxAssembly_GPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); start1 = std::clock(); - EEqn_GPU.resetAb(); + EEqn_GPU.initializeTimeStep(); EEqn_GPU.fvm_ddt(); EEqn_GPU.fvm_div(); EEqn_GPU.fvm_laplacian(); @@ -98,7 +88,7 @@ time_monitor_EEqn_mtxAssembly += double(end2 - start2) / double(CLOCKS_PER_SEC); // check value of mtxAssembly, no time monitor - EEqn_GPU.checkValue(false); + // EEqn_GPU.checkValue(false); start1 = std::clock(); EEqn_GPU.solve(); @@ -109,8 +99,8 @@ start1 = std::clock(); EEqn_GPU.updatePsi(&he[0]); + he.correctBoundaryConditions(); end1 = std::clock(); - time_monitor_CPU += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); #endif diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H index 5a089e0c0..4427ece1b 100644 --- a/applications/solvers/dfLowMachFoam/UEqn.H +++ b/applications/solvers/dfLowMachFoam/UEqn.H @@ -1,6 +1,7 @@ // Solve the Momentum equation #ifdef GPUSolver_ start1 = std::clock(); + UEqn_GPU.initializeTimeStep(); UEqn_GPU.fvm_ddt(&U.oldTime()[0][0]); start2 = std::clock(); int offset = 0; @@ -8,30 +9,13 @@ const volScalarField& nuEff = nuEff_tmp(); forAll(U.boundaryField(), patchi) { - const fvsPatchScalarField& patchFlux = phi.boundaryField()[patchi]; - const fvsPatchScalarField& pw = mesh.surfaceInterpolation::weights().boundaryField()[patchi]; - const scalarField& patchP = p.boundaryField()[patchi]; - const vectorField& pSf = mesh.Sf().boundaryField()[patchi]; const vectorField& patchU = U.boundaryField()[patchi]; const scalarField& patchRho = rho.boundaryField()[patchi]; const scalarField& patchNuEff = nuEff.boundaryField()[patchi]; - int patchSize = pw.size(); - - Field ueqn_internalCoeffs_vec = patchFlux*U.boundaryField()[patchi].valueInternalCoeffs(pw); - Field ueqn_boundaryCoeffs_vec = -patchFlux*U.boundaryField()[patchi].valueBoundaryCoeffs(pw); - Field ueqn_laplac_internalCoeffs_vec = U.boundaryField()[patchi].gradientInternalCoeffs(); - Field ueqn_laplac_boundaryCoeffs_vec = U.boundaryField()[patchi].gradientBoundaryCoeffs(); + int patchSize = patchP.size(); - // only need to construct once - memcpy(ueqn_internalCoeffs_init + 3*offset, &ueqn_internalCoeffs_vec[0][0], 3*patchSize*sizeof(double)); - // need to construct every time step - memcpy(ueqn_boundaryCoeffs_init + 3*offset, &ueqn_boundaryCoeffs_vec[0][0], 3*patchSize*sizeof(double)); - // laplacian internalCoeffs - memcpy(ueqn_laplac_internalCoeffs_init + 3*offset, &ueqn_laplac_internalCoeffs_vec[0][0], 3*patchSize*sizeof(double)); - // laplacian boundaryCoeffs - memcpy(ueqn_laplac_boundaryCoeffs_init + 3*offset, &ueqn_laplac_boundaryCoeffs_vec[0][0], 3*patchSize*sizeof(double)); // boundary pressure memcpy(boundary_pressure_init+offset, &patchP[0], patchSize*sizeof(double)); // boundary velocity @@ -44,14 +28,12 @@ } end1 = std::clock(); end2 = std::clock(); - time_monitor_CPU += double(end2 - start2) / double(CLOCKS_PER_SEC); + time_monitor_UEqn_CPU += double(end2 - start2) / double(CLOCKS_PER_SEC); time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); start1 = std::clock(); - UEqn_GPU.fvm_div(&phi[0], ueqn_internalCoeffs_init, ueqn_boundaryCoeffs_init, - ueqn_laplac_internalCoeffs_init, ueqn_laplac_boundaryCoeffs_init, - boundary_pressure_init, boundary_velocity_init, boundary_nuEff_init, boundary_rho_init); + UEqn_GPU.fvm_div(boundary_pressure_init, boundary_velocity_init, boundary_nuEff_init, boundary_rho_init); UEqn_GPU.fvc_grad(&p[0]); UEqn_GPU.fvc_grad_vector(); UEqn_GPU.dev2T(); diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H index eeaea40e7..c741a8f48 100644 --- a/applications/solvers/dfLowMachFoam/YEqn.H +++ b/applications/solvers/dfLowMachFoam/YEqn.H @@ -1,5 +1,4 @@ start = std::clock(); -cudaSetDevice(0); hDiffCorrFlux = Zero; diffAlphaD = Zero; sumYDiffError = Zero; @@ -15,12 +14,15 @@ tmp> mvConvection ) ); - +start1 = std::clock(); forAll(Y, i) { sumYDiffError += chemistry->rhoD(i)*fvc::grad(Y[i]); } const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf(); +start1 = std::clock(); +time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); +time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); #ifdef GPUSolver_ start1 = std::clock(); @@ -32,13 +34,14 @@ const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf(); #endif #ifdef GPUSolver_ - std::vector Y_new(Y.size()), Y_old(Y.size()), boundary_Y_init(Y.size()), boundary_rhoD_init(Y.size()); + start1 = std::clock(); + start2 = std::clock(); + std::vector Y_old(Y.size()), boundary_Y_init(Y.size()), boundary_rhoD_init(Y.size()); std::vector rhoD_GPU(Y.size()); for (size_t i = 0; i < Y.size(); ++i) { volScalarField& Yi = Y[i]; const volScalarField& rhoDi = chemistry->rhoD(i); - Y_new[i] = &Yi[0]; Y_old[i] = &Yi.oldTime()[0]; rhoD_GPU[i] = &chemistry->rhoD(i)[0]; cudaMallocHost(&boundary_Y_init[i], num_boundary_faces*sizeof(double)); @@ -64,15 +67,25 @@ const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf(); int patchSize = patchMut_sct.size(); boundary_mutsct.insert(boundary_mutsct.end(), &patchMut_sct[0], &patchMut_sct[0] + patchSize); } + end2 = std::clock(); + time_monitor_YEqn_mtxAssembly_CPU_Prepare += double(end2 - start2) / double(CLOCKS_PER_SEC); YEqn_GPU.upwindWeight(); - YEqn_GPU.correctVelocity(Y_new, boundary_Y_init, rhoD_GPU); - YEqn_GPU.fvm_ddt(Y_old); + YEqn_GPU.correctVelocity(Y_old, boundary_Y_init, rhoD_GPU); + YEqn_GPU.fvm_ddt(); YEqn_GPU.fvm_div_phi(); YEqn_GPU.fvm_div_phiUc(); YEqn_GPU.fvm_laplacian(&mut_sct[0], boundary_mutsct.data(), boundary_rhoD_init); + end1 = std::clock(); + time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + + start1 = std::clock(); YEqn_GPU.solve(); + end1 = std::clock(); + time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); #endif //MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM); @@ -108,9 +121,15 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC); if (i != inertIndex) { #ifdef GPUSolver_ + start1 = std::clock(); YEqn_GPU.updatePsi(&Yi[0], speciesIndex); Yi.correctBoundaryConditions(); + end1 = std::clock(); + time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_mtxAssembly_CPU_Prepare += double(end1 - start1) / double(CLOCKS_PER_SEC); #else + start1 = std::clock(); tmp DEff = chemistry->rhoD(i) + turbulence->mut()/Sct; fvScalarMatrix YiEqn ( @@ -124,10 +143,16 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC); : (fvm::laplacian(DEff(), Yi) + combustion->R(Yi)) ) ); + end1 = std::clock(); + time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC); + // YiEqn.relax(); - YiEqn.relax(); - + start1 = std::clock(); YiEqn.solve("Yi"); + end1 = std::clock(); + time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC); + time_monitor_YEqn_Solve += double(end1 - start1) / double(CLOCKS_PER_SEC); #endif Yi.max(0.0); @@ -141,5 +166,4 @@ time_monitor_corrDiff += double(end - start) / double(CLOCKS_PER_SEC); end = std::clock(); time_monitor_Y += double(end - start) / double(CLOCKS_PER_SEC); - cudaSetDevice(0); } diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C index 80bf0bcd8..c2a62797a 100644 --- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C +++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C @@ -102,6 +102,7 @@ int main(int argc, char *argv[]) double time_monitor_UEqn_mtxAssembly=0; double time_monitor_UEqn_Solve=0; double time_monitor_UEqn_sum=0; + double time_monitor_UEqn_CPU=0; double time_monitor_EEqn=0; double time_monitor_EEqn_mtxAssembly=0; double time_monitor_EEqn_mtxAssembly_CPU_Prepare; @@ -109,6 +110,10 @@ int main(int argc, char *argv[]) double time_monitor_EEqn_mtxAssembly_GPU_Run; double time_monitor_EEqn_Solve=0; double time_monitor_EEqn_sum=0; + double time_monitor_YEqn=0; + double time_monitor_YEqn_mtxAssembly=0; + double time_monitor_YEqn_mtxAssembly_CPU_Prepare=0; + double time_monitor_YEqn_Solve=0; double time_monitor_chem=0; double time_monitor_Y=0; double time_monitor_E=0; @@ -213,7 +218,7 @@ int main(int argc, char *argv[]) { if (pimple.consistent()) { - #include "pcEqn.H" + // #include "pcEqn.H" } else { @@ -255,9 +260,12 @@ int main(int argc, char *argv[]) Info<< "EEqn assamble(GPU prepare) = " << time_monitor_EEqn_mtxAssembly_GPU_Prepare << " s" << endl; Info<< "EEqn assamble(GPU run) = " << time_monitor_EEqn_mtxAssembly_GPU_Run << " s" << endl; Info<< "EEqn Time solve = " << time_monitor_EEqn_Solve << " s" << endl; + Info<< "YEqn Time = " << time_monitor_YEqn << " s" << endl; + Info<< "YEqn Time assamble Mtx = " << time_monitor_YEqn_mtxAssembly << " s" << endl; + Info<< "YEqn assamble(CPU prepare) = " << time_monitor_YEqn_mtxAssembly_CPU_Prepare << " s" << endl; + Info<< "YEqn Time solve = " << time_monitor_YEqn_Solve << " s" << endl; Info<< "sum Time = " << (time_monitor_chem + time_monitor_Y + time_monitor_flow + time_monitor_E + time_monitor_corrThermo + time_monitor_corrDiff) << " s" << endl; Info<< "CPU Time (get turb souce) = " << time_monitor_CPU << " s" << endl; - Info<< "UEqn time in EEqn = " << time_monitor_UinE << " s" << endl; Info<< "============================================"<= num_boundary_cells) @@ -211,9 +211,9 @@ __global__ void eeqn_fvc_ddt_kernel(int num_cells, const double rdelta_t, } __global__ void eeqn_fvc_div_vector_internal(int num_cells, - const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, - const double *sf, const double *vf, const double *tlambdas, - const double sign, const double *b_input, double *b_output) + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *sf, const double *vf, const double *tlambdas, + const double sign, const double *b_input, double *b_output) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_cells) @@ -267,9 +267,9 @@ __global__ void eeqn_fvc_div_vector_internal(int num_cells, } __global__ void eeqn_fvc_div_vector_boundary(int num_boundary_cells, - const int *boundary_cell_offset, const int *boundary_cell_id, - const double *boundary_sf, const double *boundary_vf, - const double sign, const double *b_input, double *b_output) + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_sf, const double *boundary_vf, + const double sign, const double *b_input, double *b_output) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_boundary_cells) @@ -316,9 +316,9 @@ __global__ void eeqn_fvc_div_vector_boundary(int num_boundary_cells, } __global__ void eeqn_fvc_div_phi_scalar_internal(int num_cells, - const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, - const double *weight, const double *phi, const double *K, - const double sign, const double *b_input, double *b_output) + const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, + const double *weight, const double *phi, const double *K, + const double sign, const double *b_input, double *b_output) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_cells) @@ -362,9 +362,9 @@ __global__ void eeqn_fvc_div_phi_scalar_internal(int num_cells, } __global__ void eeqn_fvc_div_phi_scalar_boundary(int num_boundary_cells, - const int *boundary_cell_offset, const int *boundary_cell_id, - const double *boundary_phi, const double* boundary_K, - const double sign, const double *b_input, double *b_output) + const int *boundary_cell_offset, const int *boundary_cell_id, + const double *boundary_phi, const double *boundary_K, + const double sign, const double *b_input, double *b_output) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_boundary_cells) @@ -384,12 +384,11 @@ __global__ void eeqn_fvc_div_phi_scalar_boundary(int num_boundary_cells, b_output[cell_index] = b_input[cell_index] + interp * sign; } - __global__ void eeqn_add_to_source_kernel(int num_cells, - const double sign_dpdt, const double *dpdt, - const double sign_diffAlphaD, const double *diffAlphaD, - const double *volume, - const double *b_input, double *b_output) + const double sign_dpdt, const double *dpdt, + const double sign_diffAlphaD, const double *diffAlphaD, + const double *volume, + const double *b_input, double *b_output) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_cells) @@ -399,14 +398,10 @@ __global__ void eeqn_add_to_source_kernel(int num_cells, } __global__ void eeqn_boundaryPermutation(const int num_boundary_faces, const int *bouPermedIndex, - const double *boundary_K_init, const double *boundary_hDiffCorrFlux_init, - const double *boundary_alphaEff_init, - const double *value_internal_coeffs_init, const double *value_boundary_coeffs_init, - const double *gradient_internal_coeffs_init, const double *gradient_boundary_coeffs_init, - double *boundary_K, double *boundary_hDiffCorrFlux, - double *boundary_alphaEff, - double *value_internal_coeffs, double *value_boundary_coeffs, - double *gradient_internal_coeffs, double *gradient_boundary_coeffs) + const double *boundary_K_init, const double *boundary_hDiffCorrFlux_init, + const double *boundary_alphaEff_init, + double *boundary_K, double *boundary_hDiffCorrFlux, + double *boundary_alphaEff) { int index = blockDim.x * blockIdx.x + threadIdx.x; if (index >= num_boundary_faces) @@ -419,10 +414,26 @@ __global__ void eeqn_boundaryPermutation(const int num_boundary_faces, const int boundary_hDiffCorrFlux[index * 3 + 1] = boundary_hDiffCorrFlux_init[p * 3 + 1]; boundary_hDiffCorrFlux[index * 3 + 2] = boundary_hDiffCorrFlux_init[p * 3 + 2]; boundary_alphaEff[index] = boundary_alphaEff_init[p]; - value_internal_coeffs[index] = value_internal_coeffs_init[p]; - value_boundary_coeffs[index] = value_boundary_coeffs_init[p]; - gradient_internal_coeffs[index] = gradient_internal_coeffs_init[p]; - gradient_boundary_coeffs[index] = gradient_boundary_coeffs_init[p]; +} + +__global__ void eeqn_update_BoundaryCoeffs_kernel(int num_boundary_faces, const double *boundary_phi, double *internal_coeffs, + double *boundary_coeffs, double *laplac_internal_coeffs, + double *laplac_boundary_coeffs) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_faces) + return; + + // zeroGradient + double valueInternalCoeffs = 1.; + double valueBoundaryCoeffs = 0.; + double gradientInternalCoeffs = 0.; + double gradientBoundaryCoeffs = 0.; + + internal_coeffs[index] = boundary_phi[index] * valueInternalCoeffs; + boundary_coeffs[index] = -boundary_phi[index] * valueBoundaryCoeffs; + laplac_internal_coeffs[index] = gradientInternalCoeffs; + laplac_boundary_coeffs[index] = gradientBoundaryCoeffs; } // constructor @@ -480,10 +491,9 @@ dfEEqn::dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std } void dfEEqn::prepare_data(const double *he_old, const double *K, const double *K_old, const double *alphaEff, - const double *dpdt, const double *diffAlphaD, const double *hDiffCorrFlux, - const double *boundary_K, const double *boundary_hDiffCorrFlux, const double *boundary_alphaEff, - const double *valueInternalCoeffs, const double *valueBoundaryCoeffs, - const double *gradientInternalCoeffs, const double *gradientBoundaryCoeffs) { + const double *dpdt, const double *diffAlphaD, const double *hDiffCorrFlux, + const double *boundary_K, const double *boundary_hDiffCorrFlux, const double *boundary_alphaEff) +{ // TODO not real async copy now, because some host array are not in pinned memory. // copy the host input array in host memory to the device input array in device memory @@ -499,26 +509,25 @@ void dfEEqn::prepare_data(const double *he_old, const double *K, const double *K checkCudaErrors(cudaMemcpyAsync(d_boundary_K_init, boundary_K, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_boundary_hDiffCorrFlux_init, boundary_hDiffCorrFlux, boundary_face_bytes * 3, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(d_boundary_alphaEff_init, boundary_alphaEff, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_value_internal_coeffs_init, valueInternalCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_value_boundary_coeffs_init, valueBoundaryCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_gradient_internal_coeffs_init, gradientInternalCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(d_gradient_boundary_coeffs_init, gradientBoundaryCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream)); size_t threads_per_block = 1024; size_t blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block; eeqn_boundaryPermutation<<>>(num_boundary_faces, dataBase_.d_bouPermedIndex, - d_boundary_K_init, d_boundary_hDiffCorrFlux_init, d_boundary_alphaEff_init, - d_value_internal_coeffs_init, d_value_boundary_coeffs_init, - d_gradient_internal_coeffs_init, d_gradient_boundary_coeffs_init, - d_boundary_K, d_boundary_hDiffCorrFlux, d_boundary_alphaEff, - d_value_internal_coeffs, d_value_boundary_coeffs, - d_gradient_internal_coeffs, d_gradient_boundary_coeffs); + d_boundary_K_init, d_boundary_hDiffCorrFlux_init, d_boundary_alphaEff_init, + d_boundary_K, d_boundary_hDiffCorrFlux, d_boundary_alphaEff); } -void dfEEqn::resetAb() { +void dfEEqn::initializeTimeStep() +{ // initialize matrix value checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, csr_value_vec_bytes, stream)); checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream)); + // initialize boundary coeffs + size_t threads_per_block = 1024; + size_t blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block; + eeqn_update_BoundaryCoeffs_kernel<<>>(dataBase_.num_boundary_faces, dataBase_.d_boundary_phi, + d_value_internal_coeffs, d_value_boundary_coeffs, + d_gradient_internal_coeffs, d_gradient_boundary_coeffs); } void dfEEqn::fvm_ddt() @@ -526,9 +535,9 @@ void dfEEqn::fvm_ddt() size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; eeqn_fvm_ddt_kernel<<>>(num_cells, dataBase_.rdelta_t, - d_A_csr_row_index, d_A_csr_diag_index, - dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, d_he_old, - 1., d_A_csr, d_b, d_A_csr, d_b); + d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, d_he_old, + 1., d_A_csr, d_b, d_A_csr, d_b); } void dfEEqn::fvm_div() @@ -536,15 +545,15 @@ void dfEEqn::fvm_div() size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; eeqn_fvm_div_internal<<>>(num_cells, - d_A_csr_row_index, d_A_csr_diag_index, - dataBase_.d_weight, dataBase_.d_phi, - 1., d_A_csr, d_b, d_A_csr, d_b); + d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_weight, dataBase_.d_phi, + 1., d_A_csr, d_b, d_A_csr, d_b); blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; eeqn_fvm_div_boundary<<>>(num_boundary_cells, - d_A_csr_row_index, d_A_csr_diag_index, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - d_value_internal_coeffs, d_value_boundary_coeffs, - 1., d_A_csr, d_b, d_A_csr, d_b); + d_A_csr_row_index, d_A_csr_diag_index, + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + d_value_internal_coeffs, d_value_boundary_coeffs, + 1., d_A_csr, d_b, d_A_csr, d_b); } void dfEEqn::fvm_laplacian() @@ -552,14 +561,14 @@ void dfEEqn::fvm_laplacian() size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; eeqn_fvm_laplacian_uncorrected_internal<<>>(num_cells, - d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, d_alphaEff, dataBase_.d_weight, - dataBase_.d_face, dataBase_.d_deltaCoeffs, - -1., d_A_csr, d_A_csr); + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, d_alphaEff, dataBase_.d_weight, + dataBase_.d_face, dataBase_.d_deltaCoeffs, + -1., d_A_csr, d_A_csr); blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; eeqn_fvm_laplacian_uncorrected_boundary<<>>(num_boundary_cells, - d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - d_boundary_alphaEff, dataBase_.d_boundary_face, d_gradient_internal_coeffs, d_gradient_boundary_coeffs, - -1., d_A_csr, d_b, d_A_csr, d_b); + d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + d_boundary_alphaEff, dataBase_.d_boundary_face, d_gradient_internal_coeffs, d_gradient_boundary_coeffs, + -1., d_A_csr, d_b, d_A_csr, d_b); } void dfEEqn::fvc_ddt() @@ -568,8 +577,8 @@ void dfEEqn::fvc_ddt() size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; eeqn_fvc_ddt_kernel<<>>(num_cells, dataBase_.rdelta_t, - dataBase_.d_rho_old, dataBase_.d_rho_new, d_K_old, d_K, dataBase_.d_volume, - -1., d_b, d_b); + dataBase_.d_rho_old, dataBase_.d_rho_new, d_K_old, d_K, dataBase_.d_volume, + -1., d_b, d_b); } void dfEEqn::fvc_div_vector() @@ -578,14 +587,14 @@ void dfEEqn::fvc_div_vector() size_t threads_per_block = 512; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; eeqn_fvc_div_vector_internal<<>>(num_cells, - d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - dataBase_.d_face_vector, d_hDiffCorrFlux, dataBase_.d_weight, - 1., d_b, d_b); + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + dataBase_.d_face_vector, d_hDiffCorrFlux, dataBase_.d_weight, + 1., d_b, d_b); blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; eeqn_fvc_div_vector_boundary<<>>(num_boundary_cells, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - dataBase_.d_boundary_face_vector, d_boundary_hDiffCorrFlux, - 1., d_b, d_b); + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_boundary_face_vector, d_boundary_hDiffCorrFlux, + 1., d_b, d_b); } void dfEEqn::fvc_div_phi_scalar() @@ -594,14 +603,14 @@ void dfEEqn::fvc_div_phi_scalar() size_t threads_per_block = 1024; size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; eeqn_fvc_div_phi_scalar_internal<<>>(num_cells, - d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, - dataBase_.d_weight, dataBase_.d_phi, d_K, - -1., d_b, d_b); + d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, + dataBase_.d_weight, dataBase_.d_phi, d_K, + -1., d_b, d_b); blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; eeqn_fvc_div_phi_scalar_boundary<<>>(num_boundary_cells, - dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, - dataBase_.d_boundary_phi, d_boundary_K, - -1., d_b, d_b); + dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, + dataBase_.d_boundary_phi, d_boundary_K, + -1., d_b, d_b); } void dfEEqn::add_to_source() @@ -612,7 +621,7 @@ void dfEEqn::add_to_source() size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; // " + fvc::ddt(rho,K)" is on the left side of "==", thus should minus from source. eeqn_add_to_source_kernel<<>>(num_cells, - 1., d_dpdt, -1., d_diffAlphaD, dataBase_.d_volume, d_b, d_b); + 1., d_dpdt, -1., d_diffAlphaD, dataBase_.d_volume, d_b, d_b); } void dfEEqn::checkValue(bool print) @@ -701,10 +710,7 @@ void dfEEqn::sync() void dfEEqn::updatePsi(double *Psi) { - for (size_t i = 0; i < num_cells; i++) - { - Psi[i] = h_he_new[i]; - } + memcpy(Psi, h_he_new, cell_bytes); } dfEEqn::~dfEEqn() diff --git a/src_gpu/dfRhoEqn.H b/src_gpu/dfRhoEqn.H index 24bb834d2..35885a517 100644 --- a/src_gpu/dfRhoEqn.H +++ b/src_gpu/dfRhoEqn.H @@ -37,7 +37,7 @@ public: void fvc_div(double *phi, double *boundary_phi_init); - void fvm_ddt(double *rho_old, double *rho_new); + void fvm_ddt(double *rho_old); void updatePsi(double* Psi); }; \ No newline at end of file diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu/dfRhoEqn.cu index 89aa4ae56..fc933f95b 100644 --- a/src_gpu/dfRhoEqn.cu +++ b/src_gpu/dfRhoEqn.cu @@ -123,7 +123,7 @@ void dfRhoEqn::fvc_div(double *phi, double *boundary_phi_init) dataBase_.d_bouPermedIndex, dataBase_.d_boundary_phi_init, dataBase_.d_boundary_phi, 1., d_b, d_b); } -void dfRhoEqn::fvm_ddt(double *rho_old, double *rho_new) +void dfRhoEqn::fvm_ddt(double *rho_old) { checkCudaErrors(cudaMemcpyAsync(dataBase_.d_rho_old, rho_old, cell_bytes, cudaMemcpyHostToDevice, stream)); size_t threads_per_block = 1024; diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H index 8c6ed9c1e..3405a5bbc 100644 --- a/src_gpu/dfUEqn.H +++ b/src_gpu/dfUEqn.H @@ -31,9 +31,7 @@ public: void fvm_ddt(double *vector_old); - void fvm_div(double *phi, double *ueqn_internalCoeffs_init, double *ueqn_boundaryCoeffs_init, - double *ueqn_laplac_internalCoeffs_init, double *ueqn_laplac_boundaryCoeffs_init, - double *boundary_pressure_init, double *boundary_velocity_init, + void fvm_div(double *boundary_pressure_init, double *boundary_velocity_init, double *boundary_nuEff_init, double *boundary_rho_init); void fvc_grad(double *pressure); @@ -57,4 +55,6 @@ public: void updatePsi(double *Psi); void correctPsi(double *Psi); + + void initializeTimeStep(); }; \ No newline at end of file diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu index 5e664cde4..7eb7cbe31 100644 --- a/src_gpu/dfUEqn.cu +++ b/src_gpu/dfUEqn.cu @@ -281,11 +281,7 @@ __global__ void offdiagPermutation(const int num_faces, const int *permedIndex, } __global__ void boundaryPermutation(const int num_boundary_faces, const int *bouPermedIndex, - const double *ueqn_internalCoeffs_init, const double *ueqn_boundaryCoeffs_init, - const double *ueqn_laplac_internalCoeffs_init, const double *ueqn_laplac_boundaryCoeffs_init, const double *boundary_pressure_init, const double *boundary_velocity_init, - double *ueqn_internalCoeffs, double *ueqn_boundaryCoeffs, - double *ueqn_laplac_internalCoeffs, double *ueqn_laplac_boundaryCoeffs, double *boundary_pressure, double *boundary_velocity, double *boundary_nuEff_init, double *boundary_nuEff, double *boundary_rho_init, double *boundary_rho) @@ -295,20 +291,6 @@ __global__ void boundaryPermutation(const int num_boundary_faces, const int *bou return; int p = bouPermedIndex[index]; - ueqn_internalCoeffs[3 * index] = ueqn_internalCoeffs_init[3 * p]; - ueqn_internalCoeffs[3 * index + 1] = ueqn_internalCoeffs_init[3 * p + 1]; - ueqn_internalCoeffs[3 * index + 2] = ueqn_internalCoeffs_init[3 * p + 2]; - ueqn_boundaryCoeffs[3 * index] = ueqn_boundaryCoeffs_init[3 * p]; - ueqn_boundaryCoeffs[3 * index + 1] = ueqn_boundaryCoeffs_init[3 * p + 1]; - ueqn_boundaryCoeffs[3 * index + 2] = ueqn_boundaryCoeffs_init[3 * p + 2]; - - ueqn_laplac_internalCoeffs[3 * index] = ueqn_laplac_internalCoeffs_init[3 * p]; - ueqn_laplac_internalCoeffs[3 * index + 1] = ueqn_laplac_internalCoeffs_init[3 * p + 1]; - ueqn_laplac_internalCoeffs[3 * index + 2] = ueqn_laplac_internalCoeffs_init[3 * p + 2]; - ueqn_laplac_boundaryCoeffs[3 * index] = ueqn_laplac_boundaryCoeffs_init[3 * p]; - ueqn_laplac_boundaryCoeffs[3 * index + 1] = ueqn_laplac_boundaryCoeffs_init[3 * p + 1]; - ueqn_laplac_boundaryCoeffs[3 * index + 2] = ueqn_laplac_boundaryCoeffs_init[3 * p + 2]; - boundary_velocity[3 * index] = boundary_velocity_init[3 * p]; boundary_velocity[3 * index + 1] = boundary_velocity_init[3 * p + 1]; boundary_velocity[3 * index + 2] = boundary_velocity_init[3 * p + 2]; @@ -317,17 +299,6 @@ __global__ void boundaryPermutation(const int num_boundary_faces, const int *bou boundary_nuEff[index] = boundary_nuEff_init[p]; } -__global__ void csrPermutation(const int num_Nz, const int *csrPermedIndex, - const double *d_turbSrc_A_init, double *d_turbSrc_A) -{ - int index = blockDim.x * blockIdx.x + threadIdx.x; - if (index >= num_Nz) - return; - - int p = csrPermedIndex[index]; - d_turbSrc_A[index] = d_turbSrc_A_init[p]; -} - __global__ void fvc_grad_vector_internal(int num_cells, const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index, const double *sf, const double *vf, const double *tlambdas, const double *volume, @@ -999,7 +970,35 @@ __global__ void addDiagDivVolume(int num_cells, const int *csr_row_index, double vol = volume[index]; - A_output[index] = (A_input[index] + A_csr[csr_index] - ueqn_internal_coeffs[index * 3]) / vol; + A_output[index] = (A_input[index] + A_csr[csr_index] - ueqn_internal_coeffs[index * 3]) / vol; +} + +__global__ void ueqn_update_BoundaryCoeffs_kernel(int num_boundary_faces, const double *boundary_phi, double *internal_coeffs, + double *boundary_coeffs, double *laplac_internal_coeffs, + double *laplac_boundary_coeffs) +{ + int index = blockDim.x * blockIdx.x + threadIdx.x; + if (index >= num_boundary_faces) + return; + + // zeroGradient + double valueInternalCoeffs = 1.; + double valueBoundaryCoeffs = 0.; + double gradientInternalCoeffs = 0.; + double gradientBoundaryCoeffs = 0.; + + internal_coeffs[index * 3 + 0] = boundary_phi[index] * valueInternalCoeffs; + internal_coeffs[index * 3 + 1] = boundary_phi[index] * valueInternalCoeffs; + internal_coeffs[index * 3 + 2] = boundary_phi[index] * valueInternalCoeffs; + boundary_coeffs[index * 3 + 0] = -boundary_phi[index] * valueBoundaryCoeffs; + boundary_coeffs[index * 3 + 1] = -boundary_phi[index] * valueBoundaryCoeffs; + boundary_coeffs[index * 3 + 2] = -boundary_phi[index] * valueBoundaryCoeffs; + laplac_internal_coeffs[index * 3 + 0] = gradientInternalCoeffs; + laplac_internal_coeffs[index * 3 + 1] = gradientInternalCoeffs; + laplac_internal_coeffs[index * 3 + 2] = gradientInternalCoeffs; + laplac_boundary_coeffs[index * 3 + 0] = gradientBoundaryCoeffs; + laplac_boundary_coeffs[index * 3 + 1] = gradientBoundaryCoeffs; + laplac_boundary_coeffs[index * 3 + 2] = gradientBoundaryCoeffs; } // constructor @@ -1041,9 +1040,6 @@ dfUEqn::dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std void dfUEqn::fvm_ddt(double *vector_old) { - // initialize matrix value - checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, csr_value_vec_bytes, stream)); - checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream)); // Copy the host input array in host memory to the device input array in device memory clock_t start = std::clock(); checkCudaErrors(cudaMemcpyAsync(dataBase_.d_velocity_old, vector_old, cell_vec_bytes, cudaMemcpyHostToDevice, stream)); @@ -1061,53 +1057,24 @@ void dfUEqn::fvm_ddt(double *vector_old) time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); } -void dfUEqn::fvm_div(double *phi, double *ueqn_internalCoeffs_init, double *ueqn_boundaryCoeffs_init, - double *ueqn_laplac_internalCoeffs_init, double *ueqn_laplac_boundaryCoeffs_init, - double *boundary_pressure_init, double *boundary_velocity_init, +void dfUEqn::fvm_div(double *boundary_pressure_init, double *boundary_velocity_init, double *boundary_nuEff_init, double *boundary_rho_init) { - // copy and permutate face variables - clock_t start = std::clock(); - // std::copy(phi, phi + num_surfaces, h_phi_init); - // std::copy(phi, phi + num_surfaces, h_phi_init + num_surfaces); - memcpy(dataBase_.h_phi_init, phi, num_surfaces * sizeof(double)); - // memcpy(h_phi_init + num_surfaces, phi, num_surfaces*sizeof(double)); - - clock_t end = std::clock(); - time_monitor_CPU += double(end - start) / double(CLOCKS_PER_SEC); - - start = std::clock(); - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi_init, dataBase_.h_phi_init, num_surfaces * sizeof(double), cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi_init + num_surfaces, dataBase_.d_phi_init, num_surfaces * sizeof(double), cudaMemcpyDeviceToDevice, stream)); - end = std::clock(); - time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC); - - start = std::clock(); - size_t threads_per_block = 1024; - size_t blocks_per_grid = (num_faces + threads_per_block - 1) / threads_per_block; - offdiagPermutation<<>>(num_faces, dataBase_.d_permedIndex, dataBase_.d_phi_init, dataBase_.d_phi); - end = std::clock(); - time_monitor_GPU_kernel += double(end - start) / double(CLOCKS_PER_SEC); - // copy and permutate boundary variable - start = std::clock(); - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_internal_coeffs_init, ueqn_internalCoeffs_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_coeffs_init, ueqn_boundaryCoeffs_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_laplac_internal_coeffs_init, ueqn_laplac_internalCoeffs_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_laplac_boundary_coeffs_init, ueqn_laplac_boundaryCoeffs_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); + clock_t start = std::clock(); checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_velocity_init, boundary_velocity_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_pressure_init, boundary_pressure_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_nuEff_init, boundary_nuEff_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_rho_init, boundary_rho_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); - end = std::clock(); + clock_t end = std::clock(); time_monitor_GPU_memcpy += double(end - start) / double(CLOCKS_PER_SEC); start = std::clock(); - blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block; - boundaryPermutation<<>>(dataBase_.num_boundary_faces, dataBase_.d_bouPermedIndex, dataBase_.d_internal_coeffs_init, - dataBase_.d_boundary_coeffs_init, dataBase_.d_laplac_internal_coeffs_init, dataBase_.d_laplac_boundary_coeffs_init, dataBase_.d_boundary_pressure_init, - dataBase_.d_boundary_velocity_init, dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs, dataBase_.d_laplac_internal_coeffs, dataBase_.d_laplac_boundary_coeffs, - dataBase_.d_boundary_pressure, dataBase_.d_boundary_velocity, dataBase_.d_boundary_nuEff_init, dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho_init, dataBase_.d_boundary_rho); + size_t threads_per_block = 1024; + size_t blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block; + boundaryPermutation<<>>(dataBase_.num_boundary_faces, dataBase_.d_bouPermedIndex, dataBase_.d_boundary_pressure_init, + dataBase_.d_boundary_velocity_init, dataBase_.d_boundary_pressure, dataBase_.d_boundary_velocity, + dataBase_.d_boundary_nuEff_init, dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho_init, dataBase_.d_boundary_rho); // launch cuda kernel blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; @@ -1229,8 +1196,6 @@ void dfUEqn::fvm_laplacian() void dfUEqn::A(double *Psi) { - // tmp tdiag(new scalarField(diag())); - // addCmptAvBoundaryDiag(tdiag.ref()); checkCudaErrors(cudaMemsetAsync(d_A, 0, cell_bytes, stream)); size_t threads_per_block = 1024; @@ -1241,7 +1206,7 @@ void dfUEqn::A(double *Psi) blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; addDiagDivVolume<<>>(num_cells, d_A_csr_row_index, d_A_csr_diag_index, d_A_csr, dataBase_.d_volume, d_ueqn_internal_coeffs, d_A, d_A); - + checkCudaErrors(cudaMemcpyAsync(h_A, d_A, cell_bytes, cudaMemcpyDeviceToHost, stream)); checkCudaErrors(cudaStreamSynchronize(stream)); for (size_t i = 0; i < num_cells; i++) @@ -1250,9 +1215,7 @@ void dfUEqn::A(double *Psi) void dfUEqn::H(double *Psi) { - checkCudaErrors(cudaMemsetAsync(d_H, 0, cell_vec_bytes, stream)); - checkCudaErrors(cudaStreamSynchronize(stream)); - + checkCudaErrors(cudaMemsetAsync(d_H, 0, cell_bytes * 3, stream)); size_t threads_per_block = 1024; size_t blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; addBoundaryDiag<<>>(num_cells, num_boundary_cells, d_A_csr_row_index, d_A_csr_diag_index, @@ -1275,7 +1238,20 @@ void dfUEqn::H(double *Psi) } // for (int i = 0; i < num_cells; i++) - // fprintf(stderr, "h_H_GPU[%d]: (%.5e, %.5e, %.5e)\n", i, h_H[i], h_H[num_cells + i], h_H[num_cells * 2 + i]); + // fprintf(stderr, "h_H_GPU[%d]: (%.5e, %.5e, %.5e)\n", i, h_H[i], h_H[num_cells + i], h_H[num_cells * 2 + i]); +} + +void dfUEqn::initializeTimeStep() +{ + // initialize matrix value + checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, csr_value_vec_bytes, stream)); + checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream)); + // initialize boundary coeffs + size_t threads_per_block = 1024; + size_t blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block; + ueqn_update_BoundaryCoeffs_kernel<<>>(dataBase_.num_boundary_faces, dataBase_.d_boundary_phi, + dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs, + dataBase_.d_laplac_internal_coeffs, dataBase_.d_laplac_boundary_coeffs); } void dfUEqn::checkValue(bool print) diff --git a/src_gpu/dfYEqn.H b/src_gpu/dfYEqn.H index e477d0d2e..260baf4f2 100644 --- a/src_gpu/dfYEqn.H +++ b/src_gpu/dfYEqn.H @@ -35,11 +35,11 @@ public: void checkValue(bool print, char *filename); - void correctVelocity(std::vector Y_new, std::vector boundary_Y_init, std::vector rhoD_GPU); + void correctVelocity(std::vector Y_old, std::vector boundary_Y_init, std::vector rhoD_GPU); void upwindWeight(); - void fvm_ddt(std::vector Y_old); + void fvm_ddt(); void fvm_div_phi(); diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu index b507763a0..9ef75747f 100644 --- a/src_gpu/dfYEqn.cu +++ b/src_gpu/dfYEqn.cu @@ -508,7 +508,7 @@ void dfYEqn::upwindWeight() getUpwindWeight<<>>(num_faces, dataBase_.d_phi, dataBase_.d_weight_upwind); } -void dfYEqn::correctVelocity(std::vector Y_new, std::vector boundary_Y_init, std::vector rhoD_GPU) +void dfYEqn::correctVelocity(std::vector Y_old, std::vector boundary_Y_init, std::vector rhoD_GPU) { // initialize variables in each time step checkCudaErrors(cudaMemsetAsync(d_sumYDiffError, 0, 3 * cell_bytes, stream)); @@ -518,7 +518,7 @@ void dfYEqn::correctVelocity(std::vector Y_new, std::vector for (size_t i = 0; i < num_species; ++i) { checkCudaErrors(cudaMemsetAsync(d_sumYDiffError_tmp, 0, 3 * cell_bytes, stream)); - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_Y_new_vector[i], Y_new[i], cell_bytes, cudaMemcpyHostToDevice, stream)); + checkCudaErrors(cudaMemcpyAsync(dataBase_.d_Y_old_vector[i], Y_old[i], cell_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_Y_init_vector[i], boundary_Y_init[i], dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream)); checkCudaErrors(cudaMemcpyAsync(dataBase_.d_rhoD_vector[i], rhoD_GPU[i], cell_bytes, cudaMemcpyHostToDevice, stream)); @@ -527,7 +527,7 @@ void dfYEqn::correctVelocity(std::vector Y_new, std::vector blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; fvc_grad_internal_face_Y<<>>(num_cells, d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, dataBase_.d_volume, - dataBase_.d_face_vector, dataBase_.d_weight, dataBase_.d_Y_new_vector[i], + dataBase_.d_face_vector, dataBase_.d_weight, dataBase_.d_Y_old_vector[i], dataBase_.d_rhoD_vector[i], d_sumYDiffError_tmp, d_sumYDiffError_tmp); blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block; fvc_grad_boundary_face_Y<<>>(num_cells, num_boundary_cells, dataBase_.d_boundary_cell_offset, @@ -554,7 +554,7 @@ void dfYEqn::correctVelocity(std::vector Y_new, std::vector dataBase_.d_boundary_face_vector, d_sumYDiffError_boundary, d_phiUc_boundary); } -void dfYEqn::fvm_ddt(std::vector Y_old) +void dfYEqn::fvm_ddt() { // initialize variables in each time step checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, (num_cells + num_faces) * (num_species - 1) * sizeof(double), stream)); // consider inert species @@ -567,7 +567,6 @@ void dfYEqn::fvm_ddt(std::vector Y_old) { if (i == inertIndex) continue; - checkCudaErrors(cudaMemcpyAsync(dataBase_.d_Y_old_vector[i], Y_old[i], cell_bytes, cudaMemcpyHostToDevice, stream)); // launch cuda kernel threads_per_block = 1024; @@ -757,8 +756,7 @@ void dfYEqn::solve() void dfYEqn::updatePsi(double *Psi, int speciesIndex) { checkCudaErrors(cudaStreamSynchronize(stream)); - for (size_t i = 0; i < num_cells; i++) - Psi[i] = h_psi[i + speciesIndex * num_cells]; + memcpy(Psi, h_psi + speciesIndex * num_cells, cell_bytes); } dfYEqn::~dfYEqn()