diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
index df29f88989..a4d779366b 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
@@ -15,6 +15,7 @@
 #include "module_hamilt_general/module_ewald/H_Ewald_pw.h"
 #include "module_hamilt_general/module_surchem/surchem.h"
 #include "module_hamilt_general/module_vdw/vdw.h"
+#include "kernels/force_op.h"
 #ifdef _OPENMP
 #include <omp.h>
@@ -531,31 +532,110 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
     // to G space. maybe need fftw with OpenMP
     rho_basis->real2recip(aux, aux);
 
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-    for (int iat = 0; iat < this->nat; ++iat)
-    {
-        // read `it` `ia` from the table
+    // sincos op for G space
+
+    // data preparation
+    std::vector<FPTYPE> tau_flat(this->nat * 3);
+    std::vector<FPTYPE> gcar_flat(rho_basis->npw * 3);
+
+    for (int iat = 0; iat < this->nat; iat++) {
+        int it = ucell.iat2it[iat];
+        int ia = ucell.iat2ia[iat];
+
+        tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][0]);
+        tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][1]);
+        tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][2]);
+    }
+
+    for (int ig = 0; ig < rho_basis->npw; ig++) {
+        gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
+        gcar_flat[ig * 3 + 1] = static_cast<FPTYPE>(rho_basis->gcar[ig][1]);
+        gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
+    }
+
+    // calculate vloc_factors for all atom types
+    std::vector<FPTYPE> vloc_per_type_host(this->nat * rho_basis->npw);
+    for (int iat = 0; iat < this->nat; iat++) {
         int it = ucell.iat2it[iat];
-        int ia = ucell.iat2ia[iat];
-        for (int ig = 0; ig < rho_basis->npw; ig++)
-        {
-            const double phase = ModuleBase::TWO_PI * (rho_basis->gcar[ig] * ucell.atoms[it].tau[ia]);
-            double sinp, cosp;
-            ModuleBase::libm::sincos(phase, &sinp, &cosp);
-            const double factor
-                = vloc(it, rho_basis->ig2igg[ig]) * (cosp * aux[ig].imag() + sinp * aux[ig].real());
-            forcelc(iat, 0) += rho_basis->gcar[ig][0] * factor;
-            forcelc(iat, 1) += rho_basis->gcar[ig][1] * factor;
-            forcelc(iat, 2) += rho_basis->gcar[ig][2] * factor;
+        for (int ig = 0; ig < rho_basis->npw; ig++) {
+            vloc_per_type_host[iat * rho_basis->npw + ig] = static_cast<FPTYPE>(vloc(it, rho_basis->ig2igg[ig]));
         }
-        forcelc(iat, 0) *= (ucell.tpiba * ucell.omega);
-        forcelc(iat, 1) *= (ucell.tpiba * ucell.omega);
-        forcelc(iat, 2) *= (ucell.tpiba * ucell.omega);
+    }
+
+    std::vector<std::complex<FPTYPE>> aux_fptype(rho_basis->npw);
+    for (int ig = 0; ig < rho_basis->npw; ig++) {
+        aux_fptype[ig] = static_cast<std::complex<FPTYPE>>(aux[ig]);
+    }
+
+    FPTYPE* d_gcar = gcar_flat.data();
+    FPTYPE* d_tau = tau_flat.data();
+    FPTYPE* d_vloc_per_type = vloc_per_type_host.data();
+    std::complex<FPTYPE>* d_aux = aux_fptype.data();
+    FPTYPE* d_force = nullptr;
+    std::vector<FPTYPE> force_host(this->nat * 3);
+
+    if (this->device == base_device::GpuDevice)
+    {
+        d_gcar = nullptr;
+        d_tau = nullptr;
+        d_vloc_per_type = nullptr;
+        d_aux = nullptr;
+
+        resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3);
+        resmem_var_op()(this->ctx, d_tau, this->nat * 3);
+        resmem_var_op()(this->ctx, d_vloc_per_type, this->nat * rho_basis->npw);
+        resmem_complex_op()(this->ctx, d_aux, rho_basis->npw);
+        resmem_var_op()(this->ctx, d_force, this->nat * 3);
+
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vloc_per_type, vloc_per_type_host.data(), this->nat * rho_basis->npw);
+        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw);
+
+        base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_force, 0.0, this->nat * 3);
+    }
+    else
+    {
+        d_force = force_host.data();
+        std::fill(force_host.begin(), force_host.end(), static_cast<FPTYPE>(0.0));
+    }
+
+    const FPTYPE scale_factor = static_cast<FPTYPE>(ucell.tpiba * ucell.omega);
+
+    // call op for sincos calculation
+    hamilt::cal_force_loc_sincos_op<FPTYPE, Device>()(
+        this->ctx,
+        this->nat,
+        rho_basis->npw,
+        this->nat,
+        d_gcar,
+        d_tau,
+        d_vloc_per_type,
+        d_aux,
+        scale_factor,
+        d_force
+    );
+
+    if (this->device == base_device::GpuDevice)
+    {
+        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_host.data(), d_force, this->nat * 3);
+
+        delmem_var_op()(this->ctx, d_gcar);
+        delmem_var_op()(this->ctx, d_tau);
+        delmem_var_op()(this->ctx, d_vloc_per_type);
+        delmem_complex_op()(this->ctx, d_aux);
+        delmem_var_op()(this->ctx, d_force);
+    }
+
+    for (int iat = 0; iat < this->nat; iat++) {
+        forcelc(iat, 0) = static_cast<double>(force_host[iat * 3 + 0]);
+        forcelc(iat, 1) = static_cast<double>(force_host[iat * 3 + 1]);
+        forcelc(iat, 2) = static_cast<double>(force_host[iat * 3 + 2]);
     }
-    // this->print(GlobalV::ofs_running, "local forces", forcelc);
     Parallel_Reduce::reduce_pool(forcelc.c, forcelc.nr * forcelc.nc);
     delete[] aux;
     ModuleBase::timer::tick("Forces", "cal_force_loc");
@@ -665,6 +745,119 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         aux[rho_basis->ig_gge0] = std::complex<double>(0.0, 0.0);
     }
 
+    // sincos op for cal_force_ew
+
+    std::vector<FPTYPE> it_facts_host(this->nat);
+    std::vector<FPTYPE> tau_flat(this->nat * 3);
+
+    // iterate over by lookup table
+    for (int iat = 0; iat < this->nat; iat++) {
+        int it = ucell.iat2it[iat];
+        int ia = ucell.iat2ia[iat];
+
+        double zv;
+        if (PARAM.inp.use_paw)
+        {
+#ifdef USE_PAW
+            zv = GlobalC::paw_cell.get_val(it);
+#endif
+        }
+        else
+        {
+            zv = ucell.atoms[it].ncpp.zv;
+        }
+
+        it_facts_host[iat] = static_cast<FPTYPE>(zv * ModuleBase::e2 * ucell.tpiba *
+                                                 ModuleBase::TWO_PI / ucell.omega * fact);
+
+        tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][0]);
+        tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][1]);
+        tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][2]);
+    }
+
+    std::vector<FPTYPE> gcar_flat(rho_basis->npw * 3);
+    for (int ig = 0; ig < rho_basis->npw; ig++) {
+        gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
+        gcar_flat[ig * 3 + 1] = static_cast<FPTYPE>(rho_basis->gcar[ig][1]);
+        gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
+    }
+
+    std::vector<std::complex<FPTYPE>> aux_fptype(rho_basis->npw);
+    for (int ig = 0; ig < rho_basis->npw; ig++) {
+        aux_fptype[ig] = static_cast<std::complex<FPTYPE>>(aux[ig]);
+    }
+
+    FPTYPE* d_gcar = gcar_flat.data();
+    FPTYPE* d_tau = tau_flat.data();
+    FPTYPE* d_it_facts = it_facts_host.data();
+    std::complex<FPTYPE>* d_aux = aux_fptype.data();
+    FPTYPE* d_force_g = nullptr;
+    std::vector<FPTYPE> force_g_host(this->nat * 3);
+
+    if (this->device == base_device::GpuDevice)
+    {
+        d_gcar = nullptr;
+        d_tau = nullptr;
+        d_it_facts = nullptr;
+        d_aux = nullptr;
+
+        resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3);
+        resmem_var_op()(this->ctx, d_tau, this->nat * 3);
+        resmem_var_op()(this->ctx, d_it_facts, this->nat);
+        resmem_complex_op()(this->ctx, d_aux, rho_basis->npw);
+        resmem_var_op()(this->ctx, d_force_g, this->nat * 3);
+
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_it_facts, it_facts_host.data(), this->nat);
+        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw);
+
+        base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_force_g, 0.0, this->nat * 3);
+    }
+    else
+    {
+        d_force_g = force_g_host.data();
+        std::fill(force_g_host.begin(), force_g_host.end(), static_cast<FPTYPE>(0.0));
+    }
+
+    // call op for sincos calculation
+    hamilt::cal_force_ew_sincos_op<FPTYPE, Device>()(
+        this->ctx,
+        this->nat,
+        rho_basis->npw,
+        rho_basis->ig_gge0,
+        d_gcar,
+        d_tau,
+        d_it_facts,
+        d_aux,
+        d_force_g
+    );
+
+    if (this->device == base_device::GpuDevice)
+    {
+        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_g_host.data(), d_force_g, this->nat * 3);
+
+        delmem_var_op()(this->ctx, d_gcar);
+        delmem_var_op()(this->ctx, d_tau);
+        delmem_var_op()(this->ctx, d_it_facts);
+        delmem_complex_op()(this->ctx, d_aux);
+        delmem_var_op()(this->ctx, d_force_g);
+    }
+
+    for (int iat = 0; iat < this->nat; iat++) {
+        forceion(iat, 0) += static_cast<double>(force_g_host[iat * 3 + 0]);
+        forceion(iat, 1) += static_cast<double>(force_g_host[iat * 3 + 1]);
+        forceion(iat, 2) += static_cast<double>(force_g_host[iat * 3 + 2]);
+    }
+
+// calculate real space force
 #ifdef _OPENMP
 #pragma omp parallel
 {
@@ -688,66 +881,7 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         iat_end = iat_beg + iat_end;
         ucell.iat2iait(iat_beg, &ia_beg, &it_beg);
 
-        int iat = iat_beg;
-        int it = it_beg;
-        int ia = ia_beg;
-
-        // preprocess ig_gap for skipping the ig point
-        int ig_gap = (rho_basis->ig_gge0 >= 0 && rho_basis->ig_gge0 < rho_basis->npw) ? rho_basis->ig_gge0 : -1;
-
-        double it_fact = 0.;
-        int last_it = -1;
-
-        // iterating atoms
-        while (iat < iat_end)
-        {
-            if (it != last_it)
-            { // calculate it_tact when it is changed
-                double zv;
-                if (PARAM.inp.use_paw)
-                {
-#ifdef USE_PAW
-                    zv = GlobalC::paw_cell.get_val(it);
-#endif
-                }
-                else
-                {
-                    zv = ucell.atoms[it].ncpp.zv;
-                }
-                it_fact = zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact;
-                last_it = it;
-            }
-
-            if (ucell.atoms[it].na != 0)
-            {
-                const auto ig_loop = [&](int ig_beg, int ig_end) {
-                    for (int ig = ig_beg; ig < ig_end; ig++)
-                    {
-                        const ModuleBase::Vector3<double> gcar = rho_basis->gcar[ig];
-                        const double arg = ModuleBase::TWO_PI * (gcar * ucell.atoms[it].tau[ia]);
-                        double sinp, cosp;
-                        ModuleBase::libm::sincos(arg, &sinp, &cosp);
-                        double sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real();
-                        forceion(iat, 0) += gcar[0] * sumnb;
-                        forceion(iat, 1) += gcar[1] * sumnb;
-                        forceion(iat, 2) += gcar[2] * sumnb;
-                    }
-                };
-
-                // skip ig_gge0 point by separating ig loop into two part
-                ig_loop(0, ig_gap);
-                ig_loop(ig_gap + 1, rho_basis->npw);
-
-                forceion(iat, 0) *= it_fact;
-                forceion(iat, 1) *= it_fact;
-                forceion(iat, 2) *= it_fact;
-
-                ++iat;
-                ucell.step_iait(&ia, &it);
-            }
-        }
-
-        // means that the processor contains G=0 term.
+        // means that the processor contains G=0 term.
         if (rho_basis->ig_gge0 >= 0)
         {
             double rmax = 5.0 / (sqrt(alpha) * ucell.lat0);
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
index 5d0656d105..5ff3ddf2b1 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
@@ -13,6 +13,122 @@
 namespace hamilt
 {
 
+// CUDA kernels for sincos loops
+template <typename FPTYPE>
+__global__ void cal_force_loc_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ntype,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* vloc_per_type,
+    const thrust::complex<FPTYPE>* aux,
+    const FPTYPE scale_factor,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+
+    if (iat >= nat) return;
+
+    // Load atom information to registers
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x +
+                                       gcar[ig * 3 + 1] * tau_y +
+                                       gcar[ig * 3 + 2] * tau_z);
+
+        // Use CUDA intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+
+        // Calculate force factor
+        const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig];
+        const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor);
+    atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor);
+    atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor);
+}
+
+template <typename FPTYPE>
+__global__ void cal_force_ew_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* it_facts,
+    const thrust::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+
+    if (iat >= nat) return;
+
+    // Load atom information to registers
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    const FPTYPE it_fact = it_facts[iat];
+
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Skip G=0 term
+        if (ig == ig_gge0) continue;
+
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x +
+                                       gcar[ig * 3 + 1] * tau_y +
+                                       gcar[ig * 3 + 2] * tau_z);
+
+        // Use CUDA intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+
+        // Calculate Ewald sum contribution (fixed sign error)
+        const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real());
+
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x);
+    atomicAdd(&force[iat * 3 + 1], local_force_y);
+    atomicAdd(&force[iat * 3 + 2], local_force_z);
+}
 
 template <typename FPTYPE>
 __global__ void cal_vkb1_nl(
@@ -188,6 +304,65 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
     cudaCheckOnDebug();
 }
 
+// GPU operators
+template <typename FPTYPE>
+void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ntype,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* vloc_per_type,
+    const std::complex<FPTYPE>* aux,
+    const FPTYPE& scale_factor,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration for GPU load balancing
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
+    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
+
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+
+    cal_force_loc_sincos_kernel<FPTYPE><<<grid, block>>>(
+        nat, npw, ntype, gcar, tau, vloc_per_type,
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
+        scale_factor, force
+    );
+
+    cudaCheckOnDebug();
+}
+
+template <typename FPTYPE>
+void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* it_facts,
+    const std::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration for GPU load balancing
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
+    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
+
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+
+    cal_force_ew_sincos_kernel<FPTYPE><<<grid, block>>>(
+        nat, npw, ig_gge0, gcar, tau, it_facts,
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux), force
+    );
+
+    cudaCheckOnDebug();
+}
+
 template <typename FPTYPE>
 __global__ void cal_force_nl(
     const int ntype,
@@ -613,8 +788,12 @@ template void saveVkbValues<double>(const int *gcar_zero_ptrs, const std::complex<double> *vkb_ptr, std::complex<double> *vkb_save_ptr, int nkb, int gcar_zero_count, int npw, int ipol, int npwx);
 
 template struct cal_vkb1_nl_op<float, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_loc_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_ew_sincos_op<float, base_device::DEVICE_GPU>;
 template struct cal_vkb1_nl_op<double, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<double, base_device::DEVICE_GPU>;
+template struct cal_force_loc_sincos_op<double, base_device::DEVICE_GPU>;
+template struct cal_force_ew_sincos_op<double, base_device::DEVICE_GPU>;
 
 } // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
index 6d797e147d..109321d6c3 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
@@ -1,4 +1,8 @@
 #include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h"
+#include "module_base/libm/libm.h"
+#include "module_base/tool_threading.h"
+#include "module_hamilt_general/module_ewald/H_Ewald_pw.h"
+#include <cmath>
 
 #ifdef _OPENMP
 #include <omp.h>
@@ -424,10 +428,116 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
     }
 };
 
+// CPU implementation of local force sincos operator
+template <typename FPTYPE>
+struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU>
+{
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ntype,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* vloc_per_type,
+                    const std::complex<FPTYPE>* aux,
+                    const FPTYPE& scale_factor,
+                    FPTYPE* force)
+    {
+        const FPTYPE TWO_PI = 2.0 * M_PI;
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+        for (int iat = 0; iat < nat; ++iat)
+        {
+            const FPTYPE tau_x = tau[iat * 3 + 0];
+            const FPTYPE tau_y = tau[iat * 3 + 1];
+            const FPTYPE tau_z = tau[iat * 3 + 2];
+
+            FPTYPE local_force[3] = {0.0, 0.0, 0.0};
+
+            for (int ig = 0; ig < npw; ig++)
+            {
+                const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x +
+                                               gcar[ig * 3 + 1] * tau_y +
+                                               gcar[ig * 3 + 2] * tau_z);
+                FPTYPE sinp, cosp;
+                ModuleBase::libm::sincos(phase, &sinp, &cosp);
+
+                const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig];
+                const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()) * scale_factor;
+
+                local_force[0] += gcar[ig * 3 + 0] * factor;
+                local_force[1] += gcar[ig * 3 + 1] * factor;
+                local_force[2] += gcar[ig * 3 + 2] * factor;
+            }
+
+            force[iat * 3 + 0] = local_force[0];
+            force[iat * 3 + 1] = local_force[1];
+            force[iat * 3 + 2] = local_force[2];
+        }
+    }
+};
+
+// CPU implementation of Ewald force sincos operator
+template <typename FPTYPE>
+struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_CPU>
+{
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* it_facts,
+                    const std::complex<FPTYPE>* aux,
+                    FPTYPE* force)
+    {
+        const FPTYPE TWO_PI = 2.0 * M_PI;
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+        for (int iat = 0; iat < nat; ++iat)
+        {
+            const FPTYPE tau_x = tau[iat * 3 + 0];
+            const FPTYPE tau_y = tau[iat * 3 + 1];
+            const FPTYPE tau_z = tau[iat * 3 + 2];
+            const FPTYPE it_fact = it_facts[iat];
+
+            FPTYPE local_force[3] = {0.0, 0.0, 0.0};
+
+            for (int ig = 0; ig < npw; ig++)
+            {
+                // Skip G=0 term
+                if (ig == ig_gge0) continue;
+
+                const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x +
+                                               gcar[ig * 3 + 1] * tau_y +
+                                               gcar[ig * 3 + 2] * tau_z);
+                FPTYPE sinp, cosp;
+                ModuleBase::libm::sincos(phase, &sinp, &cosp);
+
+                const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real());
+
+                local_force[0] += gcar[ig * 3 + 0] * factor;
+                local_force[1] += gcar[ig * 3 + 1] * factor;
+                local_force[2] += gcar[ig * 3 + 2] * factor;
+            }
+
+            force[iat * 3 + 0] = local_force[0];
+            force[iat * 3 + 1] = local_force[1];
+            force[iat * 3 + 2] = local_force[2];
+        }
+    }
+};
+
 template struct cal_vkb1_nl_op<float, base_device::DEVICE_CPU>;
 template struct cal_force_nl_op<float, base_device::DEVICE_CPU>;
-
+template struct cal_force_loc_sincos_op<float, base_device::DEVICE_CPU>;
+template struct cal_force_ew_sincos_op<float, base_device::DEVICE_CPU>;
 template struct cal_vkb1_nl_op<double, base_device::DEVICE_CPU>;
 template struct cal_force_nl_op<double, base_device::DEVICE_CPU>;
-
+template struct cal_force_loc_sincos_op<double, base_device::DEVICE_CPU>;
+template struct cal_force_ew_sincos_op<double, base_device::DEVICE_CPU>;
 } // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
index 3aa5d4f87e..acf490b278 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
@@ -146,6 +146,64 @@ struct cal_force_nl_op
         const FPTYPE* lambda,
         const std::complex<FPTYPE>* becp,
         const std::complex<FPTYPE>* dbecp,
+        FPTYPE* force);
+};
+
+template <typename FPTYPE, typename Device>
+struct cal_force_loc_sincos_op
+{
+    /// @brief Calculate local pseudopotential forces (sincos loop only)
+    ///
+    /// Input Parameters
+    /// @param ctx - which device this function runs on
+    /// @param nat - number of atoms
+    /// @param npw - number of plane waves
+    /// @param ntype - number of atom types
+    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
+    /// @param tau - atomic positions [nat * 3]
+    /// @param vloc_per_type - precomputed vloc factors per atom [nat * npw]
+    /// @param aux - charge density in G-space [npw]
+    /// @param scale_factor - tpiba * omega
+    ///
+    /// Output Parameters
+    /// @param force - output forces [nat * 3]
+    void operator()(const Device* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ntype,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* vloc_per_type,
+                    const std::complex<FPTYPE>* aux,
+                    const FPTYPE& scale_factor,
+                    FPTYPE* force);
+};
+
+template <typename FPTYPE, typename Device>
+struct cal_force_ew_sincos_op
+{
+    /// @brief Calculate Ewald forces (sincos loop only)
+    ///
+    /// Input Parameters
+    /// @param ctx - which device this function runs on
+    /// @param nat - number of atoms
+    /// @param npw - number of plane waves
+    /// @param ig_gge0 - index of G=0 vector (-1 if not present)
+    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
+    /// @param tau - atomic positions [nat * 3]
+    /// @param it_facts - precomputed it_fact for each atom [nat]
+    /// @param aux - structure factor related array [npw]
+    ///
+    /// Output Parameters
+    /// @param force - output forces [nat * 3]
+    void operator()(const Device* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* it_facts,
+                    const std::complex<FPTYPE>* aux,
         FPTYPE* force);
 };
 
@@ -248,6 +306,35 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>
         FPTYPE* force);
 };
 
+template <typename FPTYPE>
+struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>
+{
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ntype,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* vloc_per_type,
+                    const std::complex<FPTYPE>* aux,
+                    const FPTYPE& scale_factor,
+                    FPTYPE* force);
+};
+
+template <typename FPTYPE>
+struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>
+{
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* it_facts,
+                    const std::complex<FPTYPE>* aux,
+                    FPTYPE* force);
+};
+
 /**
  * @brief revert the vkb values for force_nl calculation
  */
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
index c78b333b86..6bb3a84e7e 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
@@ -618,10 +618,190 @@ template void revertVkbValues<double>(const int *gcar_zero_ptrs, std::complex<double> *vkb_ptr,
 template void saveVkbValues<double>(const int *gcar_zero_ptrs, const std::complex<double> *vkb_ptr, std::complex<double> *vkb_save_ptr, int nkb, int gcar_zero_count, int npw, int ipol, int npwx);
 
+// HIP kernels for sincos loops
+template <typename FPTYPE>
+__global__ void cal_force_loc_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ntype,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* vloc_per_type,
+    const thrust::complex<FPTYPE>* aux,
+    const FPTYPE scale_factor,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+
+    if (iat >= nat) return;
+
+    // Load atom information to registers
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x +
+                                       gcar[ig * 3 + 1] * tau_y +
+                                       gcar[ig * 3 + 2] * tau_z);
+
+        // Use HIP intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+
+        // Calculate force factor
+        const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig];
+        const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor);
+    atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor);
+    atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor);
+}
+
+template <typename FPTYPE>
+__global__ void cal_force_ew_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* it_facts,
+    const thrust::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+
+    if (iat >= nat) return;
+
+    // Load atom information to registers
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    const FPTYPE it_fact = it_facts[iat];
+
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Skip G=0 term
+        if (ig == ig_gge0) continue;
+
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x +
+                                       gcar[ig * 3 + 1] * tau_y +
+                                       gcar[ig * 3 + 2] * tau_z);
+
+        // Use HIP intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+
+        // Calculate Ewald sum contribution (fixed sign error)
+        const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real());
+
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x);
+    atomicAdd(&force[iat * 3 + 1], local_force_y);
+    atomicAdd(&force[iat * 3 + 2], local_force_z);
+}
+
+// GPU operators
+template <typename FPTYPE>
+void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ntype,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* vloc_per_type,
+    const std::complex<FPTYPE>* aux,
+    const FPTYPE& scale_factor,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration for GPU load balancing
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
+    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
+
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+
+    hipLaunchKernelGGL(cal_force_loc_sincos_kernel<FPTYPE>,
+                       grid, block, 0, 0,
+                       nat, npw, ntype, gcar, tau, vloc_per_type,
+                       reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
+                       scale_factor, force);
+
+    hipCheckOnDebug();
+}
+
+template <typename FPTYPE>
+void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* it_facts,
+    const std::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration for GPU load balancing
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
+    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
+
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+
+    hipLaunchKernelGGL(cal_force_ew_sincos_kernel<FPTYPE>,
+                       grid, block, 0, 0,
+                       nat, npw, ig_gge0, gcar, tau, it_facts,
+                       reinterpret_cast<const thrust::complex<FPTYPE>*>(aux), force);
+
+    hipCheckOnDebug();
+}
+
 template struct cal_vkb1_nl_op<float, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_loc_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_ew_sincos_op<float, base_device::DEVICE_GPU>;
 template struct cal_vkb1_nl_op<double, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<double, base_device::DEVICE_GPU>;
+template struct cal_force_loc_sincos_op<double, base_device::DEVICE_GPU>;
+template struct cal_force_ew_sincos_op<double, base_device::DEVICE_GPU>;
 
 } // namespace hamilt