From ad1b78ed9e73e62c6b6832523af41e984fb2b08f Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Sat, 2 Aug 2025 22:04:27 +0800 Subject: [PATCH 1/3] add cuda support for cal_force_loc --- source/source_pw/module_pwdft/forces.cpp | 115 ++++++++++++--- .../module_pwdft/kernels/cuda/force_op.cu | 137 +++++++++++++++++- .../source_pw/module_pwdft/kernels/force_op.h | 30 ++++ 3 files changed, 261 insertions(+), 21 deletions(-) diff --git a/source/source_pw/module_pwdft/forces.cpp b/source/source_pw/module_pwdft/forces.cpp index 6aab09161f..a5203e4ccc 100644 --- a/source/source_pw/module_pwdft/forces.cpp +++ b/source/source_pw/module_pwdft/forces.cpp @@ -329,7 +329,7 @@ void Forces::cal_force_loc(const UnitCell& ucell, { ModuleBase::TITLE("Forces", "cal_force_loc"); ModuleBase::timer::tick("Forces", "cal_force_loc"); - + this->device = base_device::get_device_type(this->ctx); std::complex* aux = new std::complex[rho_basis->nmaxgr]; // now, in all pools , the charge are the same, // so, the force calculated by each pool is equal. @@ -368,30 +368,105 @@ void Forces::cal_force_loc(const UnitCell& ucell, // to G space. maybe need fftw with OpenMP rho_basis->real2recip(aux, aux); -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (int iat = 0; iat < this->nat; ++iat) + std::vector tau_h; + std::vector gcar_h; + if(this->device == base_device::GpuDevice) { - // read `it` `ia` from the table - int it = ucell.iat2it[iat]; - int ia = ucell.iat2ia[iat]; - for (int ig = 0; ig < rho_basis->npw; ig++) + tau_h.resize(this->nat * 3); + for(int iat = 0; iat < this->nat; ++iat) { - const double phase = ModuleBase::TWO_PI * (rho_basis->gcar[ig] * ucell.atoms[it].tau[ia]); - double sinp, cosp; - ModuleBase::libm::sincos(phase, &sinp, &cosp); - const double factor - = vloc(it, rho_basis->ig2igg[ig]) * (cosp * aux[ig].imag() + sinp * aux[ig].real()); - forcelc(iat, 0) += rho_basis->gcar[ig][0] * factor; - forcelc(iat, 1) += rho_basis->gcar[ig][1] * factor; - forcelc(iat, 2) += rho_basis->gcar[ig][2] * factor; + int it = ucell.iat2it[iat]; + int ia = ucell.iat2ia[iat]; + tau_h[iat * 3] = ucell.atoms[it].tau[ia].x; + tau_h[iat * 3 + 1] = ucell.atoms[it].tau[ia].y; + tau_h[iat * 3 + 2] = ucell.atoms[it].tau[ia].z; } - forcelc(iat, 0) *= (ucell.tpiba * ucell.omega); - forcelc(iat, 1) *= (ucell.tpiba * ucell.omega); - forcelc(iat, 2) *= (ucell.tpiba * ucell.omega); + + gcar_h.resize(rho_basis->npw * 3); + for(int ig = 0; ig < rho_basis->npw; ++ig) + { + gcar_h[ig * 3] = rho_basis->gcar[ig].x; + gcar_h[ig * 3 + 1] = rho_basis->gcar[ig].y; + gcar_h[ig * 3 + 2] = rho_basis->gcar[ig].z; + } + } + int* iat2it_d = nullptr; + int* ig2gg_d = nullptr; + double* gcar_d = nullptr; + double* tau_d = nullptr; + std::complex* aux_d = nullptr; + double* forcelc_d = nullptr; + double* vloc_d = nullptr; + if(this->device == base_device::GpuDevice) + { + resmem_int_op()(iat2it_d, this->nat); + resmem_int_op()(ig2gg_d, rho_basis->npw); + resmem_var_op()(gcar_d, rho_basis->npw * 3); + resmem_var_op()(tau_d, this->nat * 3); + resmem_complex_op()(aux_d, rho_basis->npw); + resmem_var_op()(forcelc_d, this->nat * 3); + resmem_var_op()(vloc_d, vloc.nr * vloc.nc); + + syncmem_int_h2d_op()(iat2it_d, ucell.iat2it, this->nat); + syncmem_int_h2d_op()(ig2gg_d, rho_basis->ig2igg, rho_basis->npw); + syncmem_var_h2d_op()(gcar_d, gcar_h.data(), rho_basis->npw * 3); + syncmem_var_h2d_op()(tau_d, tau_h.data(), this->nat * 3); + syncmem_complex_h2d_op()(aux_d, aux, rho_basis->npw); + syncmem_var_h2d_op()(forcelc_d, forcelc.c, this->nat * 3); + syncmem_var_h2d_op()(vloc_d, vloc.c, vloc.nr * vloc.nc); } + if(this->device == base_device::GpuDevice) + { + hamilt::cal_force_loc_op()( + this->nat, + rho_basis->npw, + ucell.tpiba * ucell.omega, + iat2it_d, + ig2gg_d, + gcar_d, + tau_d, + aux_d, + vloc_d, + vloc.nc, + forcelc_d); + syncmem_var_d2h_op()(forcelc.c, forcelc_d, this->nat * 3); + } + else{ + #ifdef _OPENMP + #pragma omp parallel for + #endif + for (int iat = 0; iat < this->nat; ++iat) + { + // read `it` `ia` from the table + int it = ucell.iat2it[iat]; + int ia = ucell.iat2ia[iat]; + for (int ig = 0; ig < rho_basis->npw; ig++) + { + const double phase = ModuleBase::TWO_PI * (rho_basis->gcar[ig] * ucell.atoms[it].tau[ia]); + double sinp, cosp; + ModuleBase::libm::sincos(phase, &sinp, &cosp); + const double factor + = vloc(it, rho_basis->ig2igg[ig]) * (cosp * aux[ig].imag() + sinp * aux[ig].real()); + forcelc(iat, 0) += rho_basis->gcar[ig][0] * factor; + forcelc(iat, 1) += rho_basis->gcar[ig][1] * factor; + forcelc(iat, 2) += rho_basis->gcar[ig][2] * factor; + } + forcelc(iat, 0) *= (ucell.tpiba * ucell.omega); + forcelc(iat, 1) *= (ucell.tpiba * ucell.omega); + forcelc(iat, 2) *= (ucell.tpiba * ucell.omega); + } + } + if(this->device == base_device::GpuDevice) + { + delmem_int_op()(iat2it_d); + delmem_int_op()(ig2gg_d); + delmem_var_op()(gcar_d); + delmem_var_op()(tau_d); + delmem_complex_op()(aux_d); + delmem_var_op()(forcelc_d); + delmem_var_op()(vloc_d); + } // this->print(GlobalV::ofs_running, "local forces", forcelc); Parallel_Reduce::reduce_pool(forcelc.c, forcelc.nr * forcelc.nc); delete[] aux; diff --git a/source/source_pw/module_pwdft/kernels/cuda/force_op.cu b/source/source_pw/module_pwdft/kernels/cuda/force_op.cu index 5f4a123868..1c1113ca6b 100644 --- a/source/source_pw/module_pwdft/kernels/cuda/force_op.cu +++ b/source/source_pw/module_pwdft/kernels/cuda/force_op.cu @@ -1,6 +1,7 @@ #include "source_pw/module_pwdft/kernels/force_op.h" // #include "source_psi/kernels/device.h" #include "source_base/module_device/types.h" +#include "source_base/constants.h" #include @@ -10,9 +11,19 @@ #include #define THREADS_PER_BLOCK 256 +#define FULL_MASK 0xffffffff +#define WARP_SIZE 32 namespace hamilt { +template +__forceinline__ +__device__ +void warp_reduce(FPTYPE & val) { + for (int offset = 16; offset > 0; offset >>= 1) { + val += __shfl_down_sync(FULL_MASK, val, offset); + } +} template __global__ void cal_vkb1_nl( @@ -606,6 +617,129 @@ void revertVkbValues( cudaCheckOnDebug(); } +template +__global__ void force_loc_kernel( + const int nat, + const int npw, + const FPTYPE tpiba_omega, + const int* iat2it, + const int* ig2gg_d, + const FPTYPE* gcar_d, + const FPTYPE* tau_d, + const thrust::complex* aux_d, + const FPTYPE* vloc_d, + const int vloc_nc, + FPTYPE* forcelc_d) +{ + const int iat = blockIdx.x; + const int tid = threadIdx.x; + const int warp_id = tid / WARP_SIZE; + const int lane_id = tid % WARP_SIZE; + + if (iat >= nat) return; + const int it = iat2it[iat]; // get the type of atom + + // Initialize force components + FPTYPE force_x = 0.0; + FPTYPE force_y = 0.0; + FPTYPE force_z = 0.0; + + const auto tau_x = tau_d[iat * 3 + 0]; + const auto tau_y = tau_d[iat * 3 + 1]; + const auto tau_z = tau_d[iat * 3 + 2]; + + // Process all plane waves in chunks of blockDim.x + for (int ig = tid; ig < npw; ig += blockDim.x) { + const auto gcar_x = gcar_d[ig * 3 + 0]; + const auto gcar_y = gcar_d[ig * 3 + 1]; + const auto gcar_z = gcar_d[ig * 3 + 2]; + + // Calculate phase factor + const FPTYPE phase = ModuleBase::TWO_PI * (gcar_x * tau_x + + gcar_y * tau_y + + gcar_z * tau_z); + FPTYPE sinp, cosp; + sincos(phase, &sinp, &cosp); + + // Get vloc value + const FPTYPE vloc_val = vloc_d[it * vloc_nc + ig2gg_d[ig]]; + + // Calculate factor + const auto aux_val = aux_d[ig]; + const FPTYPE factor = vloc_val * (cosp * aux_val.imag() + sinp * aux_val.real()); + + // Multiply by gcar components + force_x += gcar_x * factor; + force_y += gcar_y * factor; + force_z += gcar_z * factor; + } + + // Warp-level reduction + warp_reduce(force_x); + warp_reduce(force_y); + warp_reduce(force_z); + + // First thread in each warp writes to shared memory + __shared__ FPTYPE warp_sums_x[THREADS_PER_BLOCK / WARP_SIZE]; // 256 threads / 32 = 8 warps + __shared__ FPTYPE warp_sums_y[THREADS_PER_BLOCK / WARP_SIZE]; + __shared__ FPTYPE warp_sums_z[THREADS_PER_BLOCK / WARP_SIZE]; + + if (lane_id == 0) { + warp_sums_x[warp_id] = force_x; + warp_sums_y[warp_id] = force_y; + warp_sums_z[warp_id] = force_z; + } + + __syncthreads(); + + // Final reduction by first warp + if (warp_id == 0) { + FPTYPE final_x = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_x[lane_id] : 0.0; + FPTYPE final_y = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_y[lane_id] : 0.0; + FPTYPE final_z = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_z[lane_id] : 0.0; + + warp_reduce(final_x); + warp_reduce(final_y); + warp_reduce(final_z); + + if (lane_id == 0) { + forcelc_d[iat * 3 + 0] = final_x * tpiba_omega; + forcelc_d[iat * 3 + 1] = final_y * tpiba_omega; + forcelc_d[iat * 3 + 2] = final_z * tpiba_omega; + } + } +} + +template +void cal_force_loc_op::operator()( + const int nat, + const int npw, + const FPTYPE tpiba_omega, + const int* iat2it, + const int* ig2igg, + const FPTYPE* gcar, + const FPTYPE* tau, + const std::complex* aux, + const FPTYPE* vloc, + const int vloc_nc, + FPTYPE* forcelc) +{ + force_loc_kernel + <<>>(nat, + npw, + tpiba_omega, + iat2it, + ig2igg, + gcar, + tau, + reinterpret_cast*>(aux), + vloc, + vloc_nc, + forcelc); // array of data + +} + + // for revertVkbValues functions instantiation template void revertVkbValues(const int *gcar_zero_ptrs, std::complex *vkb_ptr, const std::complex *vkb_save_ptr, int nkb, int gcar_zero_count, int npw, int ipol, int npwx, const std::complex coeff); // for saveVkbValues functions instantiation @@ -613,8 +747,9 @@ template void saveVkbValues(const int *gcar_zero_ptrs, const std::comple template struct cal_vkb1_nl_op; template struct cal_force_nl_op; +template struct cal_force_loc_op; template struct cal_vkb1_nl_op; template struct cal_force_nl_op; - +template struct cal_force_loc_op; } // namespace hamilt diff --git a/source/source_pw/module_pwdft/kernels/force_op.h b/source/source_pw/module_pwdft/kernels/force_op.h index ccbbec77d7..6648b93101 100644 --- a/source/source_pw/module_pwdft/kernels/force_op.h +++ b/source/source_pw/module_pwdft/kernels/force_op.h @@ -149,6 +149,21 @@ struct cal_force_nl_op FPTYPE* force); }; +template +struct cal_force_loc_op{ + void operator()( + const int nat, + const int npw, + const FPTYPE tpiba_omega, + const int* ig2igg, + const FPTYPE* gcar, + const FPTYPE* tau, + const std::complex* aux, + const FPTYPE* vloc, + const int vloc_nr, + FPTYPE* forcelc); +}; + #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM template struct cal_vkb1_nl_op @@ -275,6 +290,21 @@ void saveVkbValues(const int* gcar_zero_ptrs, int ipol, int npwx); +template +struct cal_force_loc_op{ + void operator()( + const int nat, + const int npw, + const FPTYPE tpiba_omega, + const int* iat2it, + const int* ig2igg, + const FPTYPE* gcar, + const FPTYPE* tau, + const std::complex* aux, + const FPTYPE* vloc, + const int vloc_nr, + FPTYPE* forcelc); +}; #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM } // namespace hamilt #endif // W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_source_pw_HAMILT_PWDFT_KERNELS_FORCE_OP_H \ No newline at end of file From 6f84b857fb6254fa6a86fd181fe8a608bcd4ad0b Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Sat, 2 Aug 2025 22:41:20 +0800 Subject: [PATCH 2/3] fix compilation bug --- source/source_pw/module_pwdft/forces.cpp | 2 +- source/source_pw/module_pwdft/kernels/force_op.h | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/source/source_pw/module_pwdft/forces.cpp b/source/source_pw/module_pwdft/forces.cpp index a5203e4ccc..c621bf72f4 100644 --- a/source/source_pw/module_pwdft/forces.cpp +++ b/source/source_pw/module_pwdft/forces.cpp @@ -418,7 +418,7 @@ void Forces::cal_force_loc(const UnitCell& ucell, if(this->device == base_device::GpuDevice) { - hamilt::cal_force_loc_op()( + hamilt::cal_force_loc_op()( this->nat, rho_basis->npw, ucell.tpiba * ucell.omega, diff --git a/source/source_pw/module_pwdft/kernels/force_op.h b/source/source_pw/module_pwdft/kernels/force_op.h index 6648b93101..464c15c2be 100644 --- a/source/source_pw/module_pwdft/kernels/force_op.h +++ b/source/source_pw/module_pwdft/kernels/force_op.h @@ -164,6 +164,21 @@ struct cal_force_loc_op{ FPTYPE* forcelc); }; +template +struct cal_force_loc_op{ + void operator()( + const int nat, + const int npw, + const FPTYPE tpiba_omega, + const int* iat2it, + const int* ig2igg, + const FPTYPE* gcar, + const FPTYPE* tau, + const std::complex* aux, + const FPTYPE* vloc, + const int vloc_nr, + FPTYPE* forcelc) {}; +}; #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM template struct cal_vkb1_nl_op From 82f193b7ee4091e7ca5605c173b9c2c6035168f1 Mon Sep 17 00:00:00 2001 From: dzzz2001 Date: Sat, 2 Aug 2025 22:51:11 +0800 Subject: [PATCH 3/3] fix bug --- source/source_pw/module_pwdft/kernels/force_op.h | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/source/source_pw/module_pwdft/kernels/force_op.h b/source/source_pw/module_pwdft/kernels/force_op.h index 464c15c2be..ca15580d5a 100644 --- a/source/source_pw/module_pwdft/kernels/force_op.h +++ b/source/source_pw/module_pwdft/kernels/force_op.h @@ -149,21 +149,6 @@ struct cal_force_nl_op FPTYPE* force); }; -template -struct cal_force_loc_op{ - void operator()( - const int nat, - const int npw, - const FPTYPE tpiba_omega, - const int* ig2igg, - const FPTYPE* gcar, - const FPTYPE* tau, - const std::complex* aux, - const FPTYPE* vloc, - const int vloc_nr, - FPTYPE* forcelc); -}; - template struct cal_force_loc_op{ void operator()(