From 94bfa42dfaf27b80dc059e2ed6ac589b86ed2e69 Mon Sep 17 00:00:00 2001 From: Jie Date: Mon, 26 May 2025 15:56:44 +0800 Subject: [PATCH 1/6] implement gpu op for sincos loops --- .../kernels/cuda/force_sincos_op.cu | 204 ++++++++++++++++++ .../hamilt_pwdft/kernels/force_sincos_op.cpp | 123 +++++++++++ .../hamilt_pwdft/kernels/force_sincos_op.h | 106 +++++++++ .../kernels/rocm/force_sincos_op.hip.cu | 203 +++++++++++++++++ 4 files changed, 636 insertions(+) create mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu create mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp create mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h create mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu new file mode 100644 index 0000000000..f97214893e --- /dev/null +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu @@ -0,0 +1,204 @@ +#include "module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h" +#include "module_base/module_device/types.h" + +#include +#include +#include +#include + +#define THREADS_PER_BLOCK 256 + +namespace hamilt { + +// CUDA kernels for sincos loops +template +__global__ void cal_force_loc_sincos_kernel( + const int nat, + const int npw, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_factors, + const thrust::complex* aux, + const FPTYPE scale_factor, + FPTYPE* force) +{ + const FPTYPE TWO_PI = 2.0 * M_PI; + + const int iat = blockIdx.y; + const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; + const int ig_stride = gridDim.x * blockDim.x; + + if (iat >= nat) return; + + // Load atom information to registers + const int it = iat2it[iat]; + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + + // Local accumulation variables + FPTYPE local_force_x = 0.0; + FPTYPE local_force_y = 0.0; + FPTYPE local_force_z = 0.0; + + // Grid-stride loop over G-vectors + for (int ig = ig_start; ig < npw; ig += ig_stride) { + // Calculate phase: 2π * (G · τ) + const FPTYPE phase = TWO_PI * ( + gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z + ); + + // Use CUDA intrinsic for sincos + FPTYPE sinp, cosp; + __sincos(phase, &sinp, &cosp); + + // Calculate force factor + const FPTYPE factor = vloc_factors[ig] * + (cosp * aux[ig].imag() + sinp * aux[ig].real()); + + // Accumulate force contributions + local_force_x += gcar[ig * 3 + 0] * factor; + local_force_y += gcar[ig * 3 + 1] * factor; + local_force_z += gcar[ig * 3 + 2] * factor; + } + + // Atomic add to global memory + atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor); + atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor); + atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor); +} + +template +__global__ void cal_force_ew_sincos_kernel( + const int nat, + const int npw, + const int ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const thrust::complex* aux, + FPTYPE* force) +{ + const FPTYPE TWO_PI = 2.0 * M_PI; + + const int iat = blockIdx.y; + const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; + const int ig_stride = gridDim.x * blockDim.x; + + if (iat >= nat) return; + + // Load atom information to registers + const int it = iat2it[iat]; + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + const FPTYPE it_fact = it_facts[iat]; + + // Local accumulation variables + FPTYPE local_force_x = 0.0; + FPTYPE local_force_y = 0.0; + FPTYPE local_force_z = 0.0; + + // Grid-stride loop over G-vectors + for (int ig = ig_start; ig < npw; ig += ig_stride) { + // Skip G=0 term + if (ig == ig_gge0) continue; + + // Calculate phase: 2π * (G · τ) + const FPTYPE arg = TWO_PI * ( + gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z + ); + + // Use CUDA intrinsic for sincos + FPTYPE sinp, cosp; + __sincos(arg, &sinp, &cosp); + + // Calculate Ewald sum contribution + const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real(); + + // Accumulate force contributions + local_force_x += gcar[ig * 3 + 0] * sumnb; + local_force_y += gcar[ig * 3 + 1] * sumnb; + local_force_z += gcar[ig * 3 + 2] * sumnb; + } + + // Atomic add to global memory + atomicAdd(&force[iat * 3 + 0], local_force_x * it_fact); + atomicAdd(&force[iat * 3 + 1], local_force_y * it_fact); + atomicAdd(&force[iat * 3 + 2], local_force_z * it_fact); +} + +// GPU operator implementations +template +void cal_force_loc_sincos_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_factors, + const std::complex* aux, + const FPTYPE& scale_factor, + FPTYPE* force) +{ + // Calculate optimal grid configuration + const int threads_per_block = THREADS_PER_BLOCK; + const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block); + + dim3 grid(max_blocks_x, nat); + dim3 block(threads_per_block); + + cal_force_loc_sincos_kernel<<>>( + nat, npw, + gcar, tau, iat2it, vloc_factors, + reinterpret_cast*>(aux), + scale_factor, force + ); + + cudaCheckOnDebug(); +} + +template +void cal_force_ew_sincos_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const int& ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const std::complex* aux, + FPTYPE* force) +{ + // Calculate optimal grid configuration + const int threads_per_block = THREADS_PER_BLOCK; + const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block); + + dim3 grid(max_blocks_x, nat); + dim3 block(threads_per_block); + + cal_force_ew_sincos_kernel<<>>( + nat, npw, ig_gge0, + gcar, tau, iat2it, it_facts, + reinterpret_cast*>(aux), + force + ); + + cudaCheckOnDebug(); +} + +template struct cal_force_loc_sincos_op; +template struct cal_force_loc_sincos_op; + +template struct cal_force_ew_sincos_op; +template struct cal_force_ew_sincos_op; + +} // namespace hamilt \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp new file mode 100644 index 0000000000..e7ce8554cc --- /dev/null +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp @@ -0,0 +1,123 @@ +#include "force_sincos_op.h" +#include "module_base/libm/libm.h" + +#ifdef _OPENMP +#include +#endif + +namespace hamilt +{ + +template +struct cal_force_loc_sincos_op +{ + void operator()(const base_device::DEVICE_CPU* ctx, + const int& nat, + const int& npw, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_factors, + const std::complex* aux, + const FPTYPE& scale_factor, + FPTYPE* force) + { + const FPTYPE TWO_PI = 2.0 * M_PI; + +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (int iat = 0; iat < nat; ++iat) + { + const int it = iat2it[iat]; + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + + FPTYPE local_force[3] = {0.0, 0.0, 0.0}; + + for (int ig = 0; ig < npw; ig++) + { + const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z); + FPTYPE sinp, cosp; + ModuleBase::libm::sincos(phase, &sinp, &cosp); + + const FPTYPE factor = vloc_factors[ig] * + (cosp * aux[ig].imag() + sinp * aux[ig].real()); + + local_force[0] += gcar[ig * 3 + 0] * factor; + local_force[1] += gcar[ig * 3 + 1] * factor; + local_force[2] += gcar[ig * 3 + 2] * factor; + } + + force[iat * 3 + 0] += local_force[0] * scale_factor; + force[iat * 3 + 1] += local_force[1] * scale_factor; + force[iat * 3 + 2] += local_force[2] * scale_factor; + } + } +}; + +template +struct cal_force_ew_sincos_op +{ + void operator()(const base_device::DEVICE_CPU* ctx, + const int& nat, + const int& npw, + const int& ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const std::complex* aux, + FPTYPE* force) + { + const FPTYPE TWO_PI = 2.0 * M_PI; + +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (int iat = 0; iat < nat; ++iat) + { + const int it = iat2it[iat]; + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + const FPTYPE it_fact = it_facts[iat]; + + FPTYPE local_force[3] = {0.0, 0.0, 0.0}; + + for (int ig = 0; ig < npw; ig++) + { + // Skip G=0 term + if (ig == ig_gge0) continue; + + const FPTYPE arg = TWO_PI * (gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z); + FPTYPE sinp, cosp; + ModuleBase::libm::sincos(arg, &sinp, &cosp); + + const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real(); + + local_force[0] += gcar[ig * 3 + 0] * sumnb; + local_force[1] += gcar[ig * 3 + 1] * sumnb; + local_force[2] += gcar[ig * 3 + 2] * sumnb; + } + + force[iat * 3 + 0] += local_force[0] * it_fact; + force[iat * 3 + 1] += local_force[1] * it_fact; + force[iat * 3 + 2] += local_force[2] * it_fact; + } + } +}; + +// Template instantiations +template struct cal_force_loc_sincos_op; +template struct cal_force_loc_sincos_op; + +template struct cal_force_ew_sincos_op; +template struct cal_force_ew_sincos_op; + +} // namespace hamilt \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h new file mode 100644 index 0000000000..455b8425a7 --- /dev/null +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h @@ -0,0 +1,106 @@ +#ifndef FORCE_SINCOS_OP_H +#define FORCE_SINCOS_OP_H + +#include "module_base/module_device/types.h" +#include + +namespace hamilt +{ + +template +struct cal_force_loc_sincos_op +{ + /// @brief Calculate local pseudopotential forces (sincos loop only) + /// + /// Input Parameters + /// @param ctx - which device this function runs on + /// @param nat - number of atoms + /// @param npw - number of plane waves + /// @param gcar - G-vector Cartesian coordinates [npw * 3] + /// @param tau - atomic positions [nat * 3] + /// @param iat2it - atom to type mapping [nat] + /// @param vloc_factors - precomputed vloc factors [npw] + /// @param aux - charge density in G-space [npw] + /// @param scale_factor - tpiba * omega + /// + /// Output Parameters + /// @param force - output forces [nat * 3] + void operator()(const Device* ctx, + const int& nat, + const int& npw, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_factors, + const std::complex* aux, + const FPTYPE& scale_factor, + FPTYPE* force); +}; + +template +struct cal_force_ew_sincos_op +{ + /// @brief Calculate Ewald forces (sincos loop only) + /// + /// Input Parameters + /// @param ctx - which device this function runs on + /// @param nat - number of atoms + /// @param npw - number of plane waves + /// @param ig_gge0 - index of G=0 vector (-1 if not present) + /// @param gcar - G-vector Cartesian coordinates [npw * 3] + /// @param tau - atomic positions [nat * 3] + /// @param iat2it - atom to type mapping [nat] + /// @param it_facts - precomputed it_fact for each atom [nat] + /// @param aux - structure factor related array [npw] + /// + /// Output Parameters + /// @param force - output forces [nat * 3] + void operator()(const Device* ctx, + const int& nat, + const int& npw, + const int& ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const std::complex* aux, + FPTYPE* force); +}; + +#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM + +template +struct cal_force_loc_sincos_op +{ + void operator()(const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_factors, + const std::complex* aux, + const FPTYPE& scale_factor, + FPTYPE* force); +}; + +template +struct cal_force_ew_sincos_op +{ + void operator()(const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const int& ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const std::complex* aux, + FPTYPE* force); +}; + +#endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM + +} // namespace hamilt + +#endif // FORCE_SINCOS_OP_H \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu new file mode 100644 index 0000000000..a09cd28bed --- /dev/null +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu @@ -0,0 +1,203 @@ +#include "module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h" + +#include +#include +#include +#include + +#define THREADS_PER_BLOCK 256 + +namespace hamilt { + +// HIP kernels for sincos loops +template +__global__ void cal_force_loc_sincos_kernel( + const int nat, + const int npw, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_factors, + const thrust::complex* aux, + const FPTYPE scale_factor, + FPTYPE* force) +{ + const FPTYPE TWO_PI = 2.0 * M_PI; + + const int iat = blockIdx.y; + const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; + const int ig_stride = gridDim.x * blockDim.x; + + if (iat >= nat) return; + + // Load atom information to registers + const int it = iat2it[iat]; + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + + // Local accumulation variables + FPTYPE local_force_x = 0.0; + FPTYPE local_force_y = 0.0; + FPTYPE local_force_z = 0.0; + + // Grid-stride loop over G-vectors + for (int ig = ig_start; ig < npw; ig += ig_stride) { + // Calculate phase: 2π * (G · τ) + const FPTYPE phase = TWO_PI * ( + gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z + ); + + // Use HIP intrinsic for sincos + FPTYPE sinp, cosp; + __sincos(phase, &sinp, &cosp); + + // Calculate force factor + const FPTYPE factor = vloc_factors[ig] * + (cosp * aux[ig].imag() + sinp * aux[ig].real()); + + // Accumulate force contributions + local_force_x += gcar[ig * 3 + 0] * factor; + local_force_y += gcar[ig * 3 + 1] * factor; + local_force_z += gcar[ig * 3 + 2] * factor; + } + + // Atomic add to global memory + atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor); + atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor); + atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor); +} + +template +__global__ void cal_force_ew_sincos_kernel( + const int nat, + const int npw, + const int ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const thrust::complex* aux, + FPTYPE* force) +{ + const FPTYPE TWO_PI = 2.0 * M_PI; + + const int iat = blockIdx.y; + const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; + const int ig_stride = gridDim.x * blockDim.x; + + if (iat >= nat) return; + + // Load atom information to registers + const int it = iat2it[iat]; + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + const FPTYPE it_fact = it_facts[iat]; + + // Local accumulation variables + FPTYPE local_force_x = 0.0; + FPTYPE local_force_y = 0.0; + FPTYPE local_force_z = 0.0; + + // Grid-stride loop over G-vectors + for (int ig = ig_start; ig < npw; ig += ig_stride) { + // Skip G=0 term + if (ig == ig_gge0) continue; + + // Calculate phase: 2π * (G · τ) + const FPTYPE arg = TWO_PI * ( + gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z + ); + + // Use HIP intrinsic for sincos + FPTYPE sinp, cosp; + __sincos(arg, &sinp, &cosp); + + // Calculate Ewald sum contribution + const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real(); + + // Accumulate force contributions + local_force_x += gcar[ig * 3 + 0] * sumnb; + local_force_y += gcar[ig * 3 + 1] * sumnb; + local_force_z += gcar[ig * 3 + 2] * sumnb; + } + + // Atomic add to global memory + atomicAdd(&force[iat * 3 + 0], local_force_x * it_fact); + atomicAdd(&force[iat * 3 + 1], local_force_y * it_fact); + atomicAdd(&force[iat * 3 + 2], local_force_z * it_fact); +} + +// GPU operator implementations +template +void cal_force_loc_sincos_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_factors, + const std::complex* aux, + const FPTYPE& scale_factor, + FPTYPE* force) +{ + // Calculate optimal grid configuration + const int threads_per_block = THREADS_PER_BLOCK; + const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block); + + dim3 grid(max_blocks_x, nat); + dim3 block(threads_per_block); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_loc_sincos_kernel), grid, block, 0, 0, + nat, npw, + gcar, tau, iat2it, vloc_factors, + reinterpret_cast*>(aux), + scale_factor, force + ); + + hipCheckOnDebug(); +} + +template +void cal_force_ew_sincos_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const int& ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const std::complex* aux, + FPTYPE* force) +{ + // Calculate optimal grid configuration + const int threads_per_block = THREADS_PER_BLOCK; + const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block); + + dim3 grid(max_blocks_x, nat); + dim3 block(threads_per_block); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_ew_sincos_kernel), grid, block, 0, 0, + nat, npw, ig_gge0, + gcar, tau, iat2it, it_facts, + reinterpret_cast*>(aux), + force + ); + + hipCheckOnDebug(); +} + +template struct cal_force_loc_sincos_op; +template struct cal_force_loc_sincos_op; + +template struct cal_force_ew_sincos_op; +template struct cal_force_ew_sincos_op; + +} // namespace hamilt \ No newline at end of file From 69aa10063376e7d5a4b4408e76c4a8451da8c65e Mon Sep 17 00:00:00 2001 From: Jie Date: Tue, 27 May 2025 18:35:51 +0800 Subject: [PATCH 2/6] add cpu kernel for cal_force_loc & cal_force_ew --- .../module_hamilt_pw/hamilt_pwdft/forces.cpp | 452 +++++++++++------- .../hamilt_pwdft/kernels/force_op.cpp | 255 ++++++++++ .../hamilt_pwdft/kernels/force_op.h | 107 +++++ 3 files changed, 636 insertions(+), 178 deletions(-) diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp index df29f88989..52ddf7da63 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp @@ -15,6 +15,8 @@ #include "module_hamilt_general/module_ewald/H_Ewald_pw.h" #include "module_hamilt_general/module_surchem/surchem.h" #include "module_hamilt_general/module_vdw/vdw.h" +#include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h" +#include #ifdef _OPENMP #include @@ -23,6 +25,208 @@ #include "module_cell/module_paw/paw_cell.h" #endif +// Data preparation function for Ewald parallel operator +template +void prepare_ew_parallel_data(const UnitCell& ucell, + ModulePW::PW_Basis* rho_basis, + const std::complex* aux, + const double alpha, + const double fact, + std::vector& gcar_flat, + std::vector& tau_flat, + std::vector& iat2it_array, + std::vector& it_facts, + std::vector& atom_na, + std::vector& zv_values, + std::vector& latvec_flat, + std::vector& G_flat) +{ + const int nat = ucell.nat; + const int npw = rho_basis->npw; + const int ntype = ucell.ntype; + + // Prepare gcar_flat + gcar_flat.resize(npw * 3); + for (int ig = 0; ig < npw; ig++) + { + gcar_flat[ig * 3 + 0] = static_cast(rho_basis->gcar[ig][0]); + gcar_flat[ig * 3 + 1] = static_cast(rho_basis->gcar[ig][1]); + gcar_flat[ig * 3 + 2] = static_cast(rho_basis->gcar[ig][2]); + } + + // Prepare tau_flat and iat2it_array + tau_flat.resize(nat * 3); + iat2it_array.resize(nat); + int iat = 0; + for (int it = 0; it < ntype; it++) + { + for (int ia = 0; ia < ucell.atoms[it].na; ia++) + { + tau_flat[iat * 3 + 0] = static_cast(ucell.atoms[it].tau[ia].x); + tau_flat[iat * 3 + 1] = static_cast(ucell.atoms[it].tau[ia].y); + tau_flat[iat * 3 + 2] = static_cast(ucell.atoms[it].tau[ia].z); + iat2it_array[iat] = it; + iat++; + } + } + + // Prepare it_facts + it_facts.resize(nat); + iat = 0; + for (int it = 0; it < ntype; it++) + { + double zv; + if (PARAM.inp.use_paw) + { +#ifdef USE_PAW + zv = GlobalC::paw_cell.get_val(it); +#else + zv = ucell.atoms[it].ncpp.zv; +#endif + } + else + { + zv = ucell.atoms[it].ncpp.zv; + } + + const FPTYPE it_fact = static_cast(zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact); + + for (int ia = 0; ia < ucell.atoms[it].na; ia++) + { + it_facts[iat] = it_fact; + iat++; + } + } + + // Prepare atom_na + atom_na.resize(ntype); + for (int it = 0; it < ntype; it++) + { + atom_na[it] = ucell.atoms[it].na; + } + + // Prepare zv_values + zv_values.resize(ntype); + for (int it = 0; it < ntype; it++) + { + if (PARAM.inp.use_paw) + { +#ifdef USE_PAW + zv_values[it] = static_cast(GlobalC::paw_cell.get_val(it)); +#else + zv_values[it] = static_cast(ucell.atoms[it].ncpp.zv); +#endif + } + else + { + zv_values[it] = static_cast(ucell.atoms[it].ncpp.zv); + } + } + + // Prepare latvec_flat (row-major order) + latvec_flat.resize(9); + latvec_flat[0] = static_cast(ucell.latvec.e11); + latvec_flat[1] = static_cast(ucell.latvec.e12); + latvec_flat[2] = static_cast(ucell.latvec.e13); + latvec_flat[3] = static_cast(ucell.latvec.e21); + latvec_flat[4] = static_cast(ucell.latvec.e22); + latvec_flat[5] = static_cast(ucell.latvec.e23); + latvec_flat[6] = static_cast(ucell.latvec.e31); + latvec_flat[7] = static_cast(ucell.latvec.e32); + latvec_flat[8] = static_cast(ucell.latvec.e33); + + // Prepare G_flat (row-major order) + G_flat.resize(9); + G_flat[0] = static_cast(ucell.G.e11); + G_flat[1] = static_cast(ucell.G.e12); + G_flat[2] = static_cast(ucell.G.e13); + G_flat[3] = static_cast(ucell.G.e21); + G_flat[4] = static_cast(ucell.G.e22); + G_flat[5] = static_cast(ucell.G.e23); + G_flat[6] = static_cast(ucell.G.e31); + G_flat[7] = static_cast(ucell.G.e32); + G_flat[8] = static_cast(ucell.G.e33); +} + +// Data preparation function for local force sincos operator +template +void prepare_loc_sincos_data(const UnitCell& ucell, + ModulePW::PW_Basis* rho_basis, + const ModuleBase::matrix& vloc, + const std::complex* aux, + std::vector& gcar_flat, + std::vector& tau_flat, + std::vector& iat2it_array, + std::vector& vloc_factors) +{ + const int nat = ucell.nat; + const int npw = rho_basis->npw; + + // Prepare gcar_flat + gcar_flat.resize(npw * 3); + for (int ig = 0; ig < npw; ig++) + { + gcar_flat[ig * 3 + 0] = static_cast(rho_basis->gcar[ig][0]); + gcar_flat[ig * 3 + 1] = static_cast(rho_basis->gcar[ig][1]); + gcar_flat[ig * 3 + 2] = static_cast(rho_basis->gcar[ig][2]); + } + + // Prepare tau_flat and iat2it_array + tau_flat.resize(nat * 3); + iat2it_array.resize(nat); + int iat = 0; + for (int it = 0; it < ucell.ntype; it++) + { + for (int ia = 0; ia < ucell.atoms[it].na; ia++) + { + tau_flat[iat * 3 + 0] = static_cast(ucell.atoms[it].tau[ia].x); + tau_flat[iat * 3 + 1] = static_cast(ucell.atoms[it].tau[ia].y); + tau_flat[iat * 3 + 2] = static_cast(ucell.atoms[it].tau[ia].z); + iat2it_array[iat] = it; + iat++; + } + } + + // Prepare vloc_factors: vloc(it, ig2igg[ig]) for each G-vector and atom type + vloc_factors.resize(npw * ucell.ntype); + for (int it = 0; it < ucell.ntype; it++) + { + for (int ig = 0; ig < npw; ig++) + { + vloc_factors[it * npw + ig] = static_cast(vloc(it, rho_basis->ig2igg[ig])); + } + } +} + +// Data copy back function for local force sincos operator +template +void copy_back_loc_force_data(const FPTYPE* force, + const int nat, + const FPTYPE& scale_factor, + ModuleBase::matrix& forcelc) +{ + for (int iat = 0; iat < nat; iat++) + { + forcelc(iat, 0) = static_cast(force[iat * 3 + 0] * scale_factor); + forcelc(iat, 1) = static_cast(force[iat * 3 + 1] * scale_factor); + forcelc(iat, 2) = static_cast(force[iat * 3 + 2] * scale_factor); + } +} + +// Data copy back function for Ewald parallel operator +template +void copy_back_ew_force_data(const FPTYPE* force, + const int nat, + ModuleBase::matrix& forceion) +{ + for (int iat = 0; iat < nat; iat++) + { + forceion(iat, 0) = static_cast(force[iat * 3 + 0]); + forceion(iat, 1) = static_cast(force[iat * 3 + 1]); + forceion(iat, 2) = static_cast(force[iat * 3 + 2]); + } +} + template void Forces::cal_force(UnitCell& ucell, ModuleBase::matrix& force, @@ -531,30 +735,40 @@ void Forces::cal_force_loc(const UnitCell& ucell, // to G space. maybe need fftw with OpenMP rho_basis->real2recip(aux, aux); -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (int iat = 0; iat < this->nat; ++iat) + // Prepare data for the sincos operator + std::vector gcar_flat, tau_flat, vloc_factors; + std::vector iat2it_array; + prepare_loc_sincos_data(ucell, rho_basis, vloc, aux, + gcar_flat, tau_flat, iat2it_array, vloc_factors); + + // Prepare force array for the operator + std::vector force_flat(this->nat * 3, 0.0); + + // Convert aux to FPTYPE complex array + std::vector> aux_fptype(rho_basis->npw); + for (int ig = 0; ig < rho_basis->npw; ig++) { - // read `it` `ia` from the table - int it = ucell.iat2it[iat]; - int ia = ucell.iat2ia[iat]; - for (int ig = 0; ig < rho_basis->npw; ig++) - { - const double phase = ModuleBase::TWO_PI * (rho_basis->gcar[ig] * ucell.atoms[it].tau[ia]); - double sinp, cosp; - ModuleBase::libm::sincos(phase, &sinp, &cosp); - const double factor - = vloc(it, rho_basis->ig2igg[ig]) * (cosp * aux[ig].imag() + sinp * aux[ig].real()); - forcelc(iat, 0) += rho_basis->gcar[ig][0] * factor; - forcelc(iat, 1) += rho_basis->gcar[ig][1] * factor; - forcelc(iat, 2) += rho_basis->gcar[ig][2] * factor; - } - forcelc(iat, 0) *= (ucell.tpiba * ucell.omega); - forcelc(iat, 1) *= (ucell.tpiba * ucell.omega); - forcelc(iat, 2) *= (ucell.tpiba * ucell.omega); + aux_fptype[ig] = std::complex(static_cast(aux[ig].real()), + static_cast(aux[ig].imag())); } + // Call the sincos operator + hamilt::cal_force_loc_sincos_op sincos_op; + sincos_op(this->ctx, + this->nat, + rho_basis->npw, + ucell.ntype, + gcar_flat.data(), + tau_flat.data(), + iat2it_array.data(), + vloc_factors.data(), + aux_fptype.data(), + force_flat.data()); + + // Copy back the results with scaling + const FPTYPE scale_factor = static_cast(ucell.tpiba * ucell.omega); + copy_back_loc_force_data(force_flat.data(), this->nat, scale_factor, forcelc); + // this->print(GlobalV::ofs_running, "local forces", forcelc); Parallel_Reduce::reduce_pool(forcelc.c, forcelc.nr * forcelc.nc); delete[] aux; @@ -665,166 +879,48 @@ void Forces::cal_force_ew(const UnitCell& ucell, aux[rho_basis->ig_gge0] = std::complex(0.0, 0.0); } -#ifdef _OPENMP -#pragma omp parallel - { - int num_threads = omp_get_num_threads(); - int thread_id = omp_get_thread_num(); -#else - int num_threads = 1; - int thread_id = 0; -#endif + // Prepare data for the parallel operator + std::vector gcar_flat, tau_flat, it_facts, zv_values, latvec_flat, G_flat; + std::vector iat2it_array, atom_na; + prepare_ew_parallel_data(ucell, rho_basis, aux, alpha, fact, + gcar_flat, tau_flat, iat2it_array, it_facts, + atom_na, zv_values, latvec_flat, G_flat); - /* Here is task distribution for multi-thread, - 0. atom will be iterated both in main nat loop and the loop in `if (rho_basis->ig_gge0 >= 0)`. - To avoid syncing, we must calculate work range of each thread by our self - 1. Calculate the iat range [iat_beg, iat_end) by each thread - a. when it is single thread stage, [iat_beg, iat_end) will be [0, nat) - 2. each thread iterate atoms form `iat_beg` to `iat_end-1` - */ - int iat_beg, iat_end; - int it_beg, ia_beg; - ModuleBase::TASK_DIST_1D(num_threads, thread_id, this->nat, iat_beg, iat_end); - iat_end = iat_beg + iat_end; - ucell.iat2iait(iat_beg, &ia_beg, &it_beg); - - int iat = iat_beg; - int it = it_beg; - int ia = ia_beg; - - // preprocess ig_gap for skipping the ig point - int ig_gap = (rho_basis->ig_gge0 >= 0 && rho_basis->ig_gge0 < rho_basis->npw) ? rho_basis->ig_gge0 : -1; - - double it_fact = 0.; - int last_it = -1; - - // iterating atoms - while (iat < iat_end) - { - if (it != last_it) - { // calculate it_tact when it is changed - double zv; - if (PARAM.inp.use_paw) - { -#ifdef USE_PAW - zv = GlobalC::paw_cell.get_val(it); -#endif - } - else - { - zv = ucell.atoms[it].ncpp.zv; - } - it_fact = zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact; - last_it = it; - } - - if (ucell.atoms[it].na != 0) - { - const auto ig_loop = [&](int ig_beg, int ig_end) { - for (int ig = ig_beg; ig < ig_end; ig++) - { - const ModuleBase::Vector3 gcar = rho_basis->gcar[ig]; - const double arg = ModuleBase::TWO_PI * (gcar * ucell.atoms[it].tau[ia]); - double sinp, cosp; - ModuleBase::libm::sincos(arg, &sinp, &cosp); - double sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real(); - forceion(iat, 0) += gcar[0] * sumnb; - forceion(iat, 1) += gcar[1] * sumnb; - forceion(iat, 2) += gcar[2] * sumnb; - } - }; - - // skip ig_gge0 point by separating ig loop into two part - ig_loop(0, ig_gap); - ig_loop(ig_gap + 1, rho_basis->npw); - - forceion(iat, 0) *= it_fact; - forceion(iat, 1) *= it_fact; - forceion(iat, 2) *= it_fact; - - ++iat; - ucell.step_iait(&ia, &it); - } - } - - // means that the processor contains G=0 term. - if (rho_basis->ig_gge0 >= 0) - { - double rmax = 5.0 / (sqrt(alpha) * ucell.lat0); - int nrm = 0; + // Prepare force array for the operator + std::vector force_flat(this->nat * 3, 0.0); - // output of rgen: the number of vectors in the sphere - const int mxr = 200; - // the maximum number of R vectors included in r - ModuleBase::Vector3* r = new ModuleBase::Vector3[mxr]; - double* r2 = new double[mxr]; - ModuleBase::GlobalFunc::ZEROS(r2, mxr); - int* irr = new int[mxr]; - ModuleBase::GlobalFunc::ZEROS(irr, mxr); - // the square modulus of R_j-tau_s-tau_s' - - int iat1 = iat_beg; - int T1 = it_beg; - int I1 = ia_beg; - const double sqa = sqrt(alpha); - const double sq8a_2pi = sqrt(8.0 * alpha / ModuleBase::TWO_PI); - - // iterating atoms. - // do not need to sync threads because task range of each thread is isolated - while (iat1 < iat_end) - { - int iat2 = 0; // mohan fix bug 2011-06-07 - int I2 = 0; - int T2 = 0; - while (iat2 < this->nat) - { - if (iat1 != iat2 && ucell.atoms[T2].na != 0 && ucell.atoms[T1].na != 0) - { - ModuleBase::Vector3 d_tau - = ucell.atoms[T1].tau[I1] - ucell.atoms[T2].tau[I2]; - H_Ewald_pw::rgen(d_tau, rmax, irr, ucell.latvec, ucell.G, r, r2, nrm); - - for (int n = 0; n < nrm; n++) - { - const double rr = sqrt(r2[n]) * ucell.lat0; - - double factor; - if (PARAM.inp.use_paw) - { -#ifdef USE_PAW - factor = GlobalC::paw_cell.get_val(T1) * GlobalC::paw_cell.get_val(T2) * ModuleBase::e2 - / (rr * rr) - * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr)) - * ucell.lat0; -#endif - } - else - { - factor = ucell.atoms[T1].ncpp.zv * ucell.atoms[T2].ncpp.zv - * ModuleBase::e2 / (rr * rr) - * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr)) - * ucell.lat0; - } - - forceion(iat1, 0) -= factor * r[n].x; - forceion(iat1, 1) -= factor * r[n].y; - forceion(iat1, 2) -= factor * r[n].z; - } - } - ++iat2; - ucell.step_iait(&I2, &T2); - } // atom b - ++iat1; - ucell.step_iait(&I1, &T1); - } // atom a - - delete[] r; - delete[] r2; - delete[] irr; - } -#ifdef _OPENMP + // Convert aux to FPTYPE complex array + std::vector> aux_fptype(rho_basis->npw); + for (int ig = 0; ig < rho_basis->npw; ig++) + { + aux_fptype[ig] = std::complex(static_cast(aux[ig].real()), + static_cast(aux[ig].imag())); } -#endif + + // Call the parallel operator + hamilt::cal_force_ew_parallel_op parallel_op; + parallel_op(this->ctx, + this->nat, + rho_basis->npw, + ucell.ntype, + rho_basis->ig_gge0, + gcar_flat.data(), + tau_flat.data(), + iat2it_array.data(), + it_facts.data(), + atom_na.data(), + zv_values.data(), + aux_fptype.data(), + static_cast(alpha), + static_cast(fact), + static_cast(ucell.lat0), + latvec_flat.data(), + G_flat.data(), + PARAM.inp.use_paw, + force_flat.data()); + + // Copy back the results + copy_back_ew_force_data(force_flat.data(), this->nat, forceion); Parallel_Reduce::reduce_pool(forceion.c, forceion.nr * forceion.nc); diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp index 6d797e147d..b5745f2ac5 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp @@ -1,4 +1,8 @@ #include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h" +#include "module_base/libm/libm.h" +#include "module_base/tool_threading.h" +#include "module_hamilt_general/module_ewald/H_Ewald_pw.h" +#include #ifdef _OPENMP #include @@ -424,10 +428,261 @@ struct cal_force_nl_op } }; +// CPU implementation of local force sincos operator +template +struct cal_force_loc_sincos_op +{ + void operator()(const base_device::DEVICE_CPU* ctx, + const int& nat, + const int& npw, + const int& ntype, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_factors, + const std::complex* aux, + FPTYPE* force) + { + const FPTYPE TWO_PI = 2.0 * M_PI; + +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (int iat = 0; iat < nat; ++iat) + { + const int it = iat2it[iat]; + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + + FPTYPE local_force[3] = {0.0, 0.0, 0.0}; + + for (int ig = 0; ig < npw; ig++) + { + const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z); + FPTYPE sinp, cosp; + ModuleBase::libm::sincos(phase, &sinp, &cosp); + + const FPTYPE vloc_factor = vloc_factors[it * npw + ig]; + const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()); + + local_force[0] += gcar[ig * 3 + 0] * factor; + local_force[1] += gcar[ig * 3 + 1] * factor; + local_force[2] += gcar[ig * 3 + 2] * factor; + } + + force[iat * 3 + 0] = local_force[0]; + force[iat * 3 + 1] = local_force[1]; + force[iat * 3 + 2] = local_force[2]; + } + } +}; + +// CPU implementation of Ewald parallel operator +template +struct cal_force_ew_parallel_op +{ + void operator()(const base_device::DEVICE_CPU* ctx, + const int& nat, + const int& npw, + const int& ntype, + const int& ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const int* atom_na, + const FPTYPE* zv_values, + const std::complex* aux, + const FPTYPE& alpha, + const FPTYPE& fact, + const FPTYPE& lat0, + const FPTYPE* latvec, + const FPTYPE* G, + const bool& use_paw, + FPTYPE* force) + { + const FPTYPE TWO_PI = 2.0 * M_PI; + const FPTYPE e2 = 2.0; // ModuleBase::e2 equivalent + +#ifdef _OPENMP +#pragma omp parallel + { + int num_threads = omp_get_num_threads(); + int thread_id = omp_get_thread_num(); +#else + int num_threads = 1; + int thread_id = 0; +#endif + + // Task distribution for multi-thread + int iat_beg, iat_end; + ModuleBase::TASK_DIST_1D(num_threads, thread_id, nat, iat_beg, iat_end); + iat_end = iat_beg + iat_end; + + // Preprocess ig_gap for skipping the ig point + int ig_gap = (ig_gge0 >= 0 && ig_gge0 < npw) ? ig_gge0 : -1; + + // Part 1: sincos calculation (replacing original while loop with for loop) + for (int iat = iat_beg; iat < iat_end; ++iat) + { + const int it = iat2it[iat]; + + // Check if this atom type has atoms + if (atom_na[it] == 0) continue; + + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + const FPTYPE it_fact = it_facts[iat]; + + FPTYPE local_force[3] = {0.0, 0.0, 0.0}; + + // G-vector loop with ig_gap handling (using for loops for GPU compatibility) + for (int ig = 0; ig < npw; ig++) + { + // Skip G=0 term + if (ig == ig_gap) continue; + + const FPTYPE arg = TWO_PI * (gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z); + FPTYPE sinp, cosp; + ModuleBase::libm::sincos(arg, &sinp, &cosp); + + const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real(); + + local_force[0] += gcar[ig * 3 + 0] * sumnb; + local_force[1] += gcar[ig * 3 + 1] * sumnb; + local_force[2] += gcar[ig * 3 + 2] * sumnb; + } + + // Apply it_fact scaling + force[iat * 3 + 0] += local_force[0] * it_fact; + force[iat * 3 + 1] += local_force[1] * it_fact; + force[iat * 3 + 2] += local_force[2] * it_fact; + } + + // Part 2: Real space Ewald calculation (when processor contains G=0 term) + if (ig_gge0 >= 0) + { + const FPTYPE rmax = 5.0 / (sqrt(alpha) * lat0); + const int mxr = 200; + + // Allocate temporary arrays for each thread + ModuleBase::Vector3* r_flat = new ModuleBase::Vector3[mxr]; + double* r2 = new double[mxr]; + int* irr = new int[mxr]; + + // Initialize arrays + for (int i = 0; i < mxr; i++) { + r2[i] = 0.0; + irr[i] = 0; + r_flat[i].x = 0.0; + r_flat[i].y = 0.0; + r_flat[i].z = 0.0; + } + + const FPTYPE sqa = sqrt(alpha); + const FPTYPE sq8a_2pi = sqrt(8.0 * alpha / TWO_PI); + + // Iterate over atoms in assigned range + for (int iat1 = iat_beg; iat1 < iat_end; ++iat1) + { + const int T1 = iat2it[iat1]; + if (atom_na[T1] == 0) continue; + + const FPTYPE tau1_x = tau[iat1 * 3 + 0]; + const FPTYPE tau1_y = tau[iat1 * 3 + 1]; + const FPTYPE tau1_z = tau[iat1 * 3 + 2]; + + // Iterate over all other atoms + for (int iat2 = 0; iat2 < nat; ++iat2) + { + const int T2 = iat2it[iat2]; + + if (iat1 != iat2 && atom_na[T2] != 0 && atom_na[T1] != 0) + { + const FPTYPE tau2_x = tau[iat2 * 3 + 0]; + const FPTYPE tau2_y = tau[iat2 * 3 + 1]; + const FPTYPE tau2_z = tau[iat2 * 3 + 2]; + + // Calculate d_tau + const FPTYPE d_tau_x = tau1_x - tau2_x; + const FPTYPE d_tau_y = tau1_y - tau2_y; + const FPTYPE d_tau_z = tau1_z - tau2_z; + + // Reconstruct Matrix3 objects from flat arrays for rgen call + ModuleBase::Matrix3 latvec_matrix( + static_cast(latvec[0]), static_cast(latvec[1]), static_cast(latvec[2]), + static_cast(latvec[3]), static_cast(latvec[4]), static_cast(latvec[5]), + static_cast(latvec[6]), static_cast(latvec[7]), static_cast(latvec[8]) + ); + + ModuleBase::Matrix3 G_matrix( + static_cast(G[0]), static_cast(G[1]), static_cast(G[2]), + static_cast(G[3]), static_cast(G[4]), static_cast(G[5]), + static_cast(G[6]), static_cast(G[7]), static_cast(G[8]) + ); + + ModuleBase::Vector3 dtau_vec( + static_cast(d_tau_x), + static_cast(d_tau_y), + static_cast(d_tau_z) + ); + + // Call H_Ewald_pw::rgen to generate neighbor shells + int nrm = 0; + H_Ewald_pw::rgen(dtau_vec, static_cast(rmax), irr, latvec_matrix, G_matrix, r_flat, r2, nrm); + + // Process all generated r vectors + for (int nr = 0; nr < nrm; nr++) + { + const FPTYPE rr = sqrt(r2[nr]) * lat0; + + FPTYPE factor; + if (use_paw) + { + factor = zv_values[T1] * zv_values[T2] * e2 / (rr * rr) + * (erfc(sqa * rr) / rr + sq8a_2pi * exp(-alpha * rr * rr)) + * lat0; + } + else + { + factor = zv_values[T1] * zv_values[T2] * e2 / (rr * rr) + * (erfc(sqa * rr) / rr + sq8a_2pi * exp(-alpha * rr * rr)) + * lat0; + } + + force[iat1 * 3 + 0] -= factor * static_cast(r_flat[nr].x); + force[iat1 * 3 + 1] -= factor * static_cast(r_flat[nr].y); + force[iat1 * 3 + 2] -= factor * static_cast(r_flat[nr].z); + } + } + } + } + + delete[] r_flat; + delete[] r2; + delete[] irr; + } + +#ifdef _OPENMP + } +#endif + } +}; + template struct cal_vkb1_nl_op; template struct cal_force_nl_op; +template struct cal_force_loc_sincos_op; +template struct cal_force_ew_parallel_op; template struct cal_vkb1_nl_op; template struct cal_force_nl_op; +template struct cal_force_loc_sincos_op; +template struct cal_force_ew_parallel_op; } // namespace hamilt diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h index 3aa5d4f87e..152f41e310 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h @@ -146,6 +146,83 @@ struct cal_force_nl_op const FPTYPE* lambda, const std::complex* becp, const std::complex* dbecp, + FPTYPE* force); +}; + +template +struct cal_force_loc_sincos_op +{ + /// @brief Calculate local pseudopotential forces (sincos loop only) + /// + /// Input Parameters + /// @param ctx - which device this function runs on + /// @param nat - number of atoms + /// @param npw - number of plane waves + /// @param ntype - number of atom types + /// @param gcar - G-vector Cartesian coordinates [npw * 3] + /// @param tau - atomic positions [nat * 3] + /// @param iat2it - atom to type mapping [nat] + /// @param vloc_factors - precomputed vloc factors [ntype * npw] + /// @param aux - charge density in G-space [npw] + /// + /// Output Parameters + /// @param force - output forces [nat * 3] + void operator()(const Device* ctx, + const int& nat, + const int& npw, + const int& ntype, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_factors, + const std::complex* aux, + FPTYPE* force); +}; + +template +struct cal_force_ew_parallel_op +{ + /// @brief Calculate Ewald forces (parallel region only) + /// + /// Input Parameters + /// @param ctx - which device this function runs on + /// @param nat - number of atoms + /// @param npw - number of plane waves + /// @param ntype - number of atom types + /// @param ig_gge0 - index of G=0 vector (-1 if not present) + /// @param gcar - G-vector Cartesian coordinates [npw * 3] + /// @param tau - atomic positions [nat * 3] + /// @param iat2it - atom to type mapping [nat] + /// @param it_facts - precomputed it_fact for each atom [nat] + /// @param atom_na - number of atoms of each type [ntype] + /// @param zv_values - valence charge for each type [ntype] + /// @param aux - structure factor related array [npw] + /// @param alpha - Ewald parameter + /// @param fact - scaling factor + /// @param lat0 - lattice constant + /// @param latvec - lattice vectors [3 * 3] + /// @param G - reciprocal lattice vectors [3 * 3] + /// @param use_pwa - whether PAW is used + /// Output Parameters + /// @param force - output forces [nat * 3] + void operator()(const Device* ctx, + const int& nat, + const int& npw, + const int& ntype, + const int& ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const int* atom_na, + const FPTYPE* zv_values, + const std::complex* aux, + const FPTYPE& alpha, + const FPTYPE& fact, + const FPTYPE& lat0, + const FPTYPE* latvec, + const FPTYPE* G, + const bool& use_paw, FPTYPE* force); }; @@ -248,6 +325,36 @@ struct cal_force_nl_op FPTYPE* force); }; +template +struct cal_force_loc_sincos_op +{ + void operator()(const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const int& ntype, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_factors, + const std::complex* aux, + FPTYPE* force); +}; + +template +struct cal_force_ew_parallel_op +{ + void operator()(const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const int& ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const std::complex* aux, + FPTYPE* force); +}; + /** * @brief revert the vkb values for force_nl calculation */ From 06b2e2505aaee5010b9147905bc18bf6790c93b3 Mon Sep 17 00:00:00 2001 From: Jie Date: Fri, 30 May 2025 15:06:08 +0800 Subject: [PATCH 3/6] fix sincos op for gpu&cpu --- .../module_hamilt_pw/hamilt_pwdft/forces.cpp | 614 ++++++++++-------- .../hamilt_pwdft/kernels/cuda/force_op.cu | 184 ++++++ .../kernels/cuda/force_sincos_op.cu | 204 ------ .../hamilt_pwdft/kernels/force_op.cpp | 204 +----- .../hamilt_pwdft/kernels/force_op.h | 34 +- .../hamilt_pwdft/kernels/force_sincos_op.cpp | 123 ---- .../hamilt_pwdft/kernels/force_sincos_op.h | 106 --- .../hamilt_pwdft/kernels/rocm/force_op.hip.cu | 185 ++++++ .../kernels/rocm/force_sincos_op.hip.cu | 203 ------ 9 files changed, 751 insertions(+), 1106 deletions(-) delete mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu delete mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp delete mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h delete mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp index 52ddf7da63..5e1d3cd6a2 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp @@ -15,8 +15,7 @@ #include "module_hamilt_general/module_ewald/H_Ewald_pw.h" #include "module_hamilt_general/module_surchem/surchem.h" #include "module_hamilt_general/module_vdw/vdw.h" -#include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h" -#include +#include "kernels/force_op.h" #ifdef _OPENMP #include @@ -25,208 +24,6 @@ #include "module_cell/module_paw/paw_cell.h" #endif -// Data preparation function for Ewald parallel operator -template -void prepare_ew_parallel_data(const UnitCell& ucell, - ModulePW::PW_Basis* rho_basis, - const std::complex* aux, - const double alpha, - const double fact, - std::vector& gcar_flat, - std::vector& tau_flat, - std::vector& iat2it_array, - std::vector& it_facts, - std::vector& atom_na, - std::vector& zv_values, - std::vector& latvec_flat, - std::vector& G_flat) -{ - const int nat = ucell.nat; - const int npw = rho_basis->npw; - const int ntype = ucell.ntype; - - // Prepare gcar_flat - gcar_flat.resize(npw * 3); - for (int ig = 0; ig < npw; ig++) - { - gcar_flat[ig * 3 + 0] = static_cast(rho_basis->gcar[ig][0]); - gcar_flat[ig * 3 + 1] = static_cast(rho_basis->gcar[ig][1]); - gcar_flat[ig * 3 + 2] = static_cast(rho_basis->gcar[ig][2]); - } - - // Prepare tau_flat and iat2it_array - tau_flat.resize(nat * 3); - iat2it_array.resize(nat); - int iat = 0; - for (int it = 0; it < ntype; it++) - { - for (int ia = 0; ia < ucell.atoms[it].na; ia++) - { - tau_flat[iat * 3 + 0] = static_cast(ucell.atoms[it].tau[ia].x); - tau_flat[iat * 3 + 1] = static_cast(ucell.atoms[it].tau[ia].y); - tau_flat[iat * 3 + 2] = static_cast(ucell.atoms[it].tau[ia].z); - iat2it_array[iat] = it; - iat++; - } - } - - // Prepare it_facts - it_facts.resize(nat); - iat = 0; - for (int it = 0; it < ntype; it++) - { - double zv; - if (PARAM.inp.use_paw) - { -#ifdef USE_PAW - zv = GlobalC::paw_cell.get_val(it); -#else - zv = ucell.atoms[it].ncpp.zv; -#endif - } - else - { - zv = ucell.atoms[it].ncpp.zv; - } - - const FPTYPE it_fact = static_cast(zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact); - - for (int ia = 0; ia < ucell.atoms[it].na; ia++) - { - it_facts[iat] = it_fact; - iat++; - } - } - - // Prepare atom_na - atom_na.resize(ntype); - for (int it = 0; it < ntype; it++) - { - atom_na[it] = ucell.atoms[it].na; - } - - // Prepare zv_values - zv_values.resize(ntype); - for (int it = 0; it < ntype; it++) - { - if (PARAM.inp.use_paw) - { -#ifdef USE_PAW - zv_values[it] = static_cast(GlobalC::paw_cell.get_val(it)); -#else - zv_values[it] = static_cast(ucell.atoms[it].ncpp.zv); -#endif - } - else - { - zv_values[it] = static_cast(ucell.atoms[it].ncpp.zv); - } - } - - // Prepare latvec_flat (row-major order) - latvec_flat.resize(9); - latvec_flat[0] = static_cast(ucell.latvec.e11); - latvec_flat[1] = static_cast(ucell.latvec.e12); - latvec_flat[2] = static_cast(ucell.latvec.e13); - latvec_flat[3] = static_cast(ucell.latvec.e21); - latvec_flat[4] = static_cast(ucell.latvec.e22); - latvec_flat[5] = static_cast(ucell.latvec.e23); - latvec_flat[6] = static_cast(ucell.latvec.e31); - latvec_flat[7] = static_cast(ucell.latvec.e32); - latvec_flat[8] = static_cast(ucell.latvec.e33); - - // Prepare G_flat (row-major order) - G_flat.resize(9); - G_flat[0] = static_cast(ucell.G.e11); - G_flat[1] = static_cast(ucell.G.e12); - G_flat[2] = static_cast(ucell.G.e13); - G_flat[3] = static_cast(ucell.G.e21); - G_flat[4] = static_cast(ucell.G.e22); - G_flat[5] = static_cast(ucell.G.e23); - G_flat[6] = static_cast(ucell.G.e31); - G_flat[7] = static_cast(ucell.G.e32); - G_flat[8] = static_cast(ucell.G.e33); -} - -// Data preparation function for local force sincos operator -template -void prepare_loc_sincos_data(const UnitCell& ucell, - ModulePW::PW_Basis* rho_basis, - const ModuleBase::matrix& vloc, - const std::complex* aux, - std::vector& gcar_flat, - std::vector& tau_flat, - std::vector& iat2it_array, - std::vector& vloc_factors) -{ - const int nat = ucell.nat; - const int npw = rho_basis->npw; - - // Prepare gcar_flat - gcar_flat.resize(npw * 3); - for (int ig = 0; ig < npw; ig++) - { - gcar_flat[ig * 3 + 0] = static_cast(rho_basis->gcar[ig][0]); - gcar_flat[ig * 3 + 1] = static_cast(rho_basis->gcar[ig][1]); - gcar_flat[ig * 3 + 2] = static_cast(rho_basis->gcar[ig][2]); - } - - // Prepare tau_flat and iat2it_array - tau_flat.resize(nat * 3); - iat2it_array.resize(nat); - int iat = 0; - for (int it = 0; it < ucell.ntype; it++) - { - for (int ia = 0; ia < ucell.atoms[it].na; ia++) - { - tau_flat[iat * 3 + 0] = static_cast(ucell.atoms[it].tau[ia].x); - tau_flat[iat * 3 + 1] = static_cast(ucell.atoms[it].tau[ia].y); - tau_flat[iat * 3 + 2] = static_cast(ucell.atoms[it].tau[ia].z); - iat2it_array[iat] = it; - iat++; - } - } - - // Prepare vloc_factors: vloc(it, ig2igg[ig]) for each G-vector and atom type - vloc_factors.resize(npw * ucell.ntype); - for (int it = 0; it < ucell.ntype; it++) - { - for (int ig = 0; ig < npw; ig++) - { - vloc_factors[it * npw + ig] = static_cast(vloc(it, rho_basis->ig2igg[ig])); - } - } -} - -// Data copy back function for local force sincos operator -template -void copy_back_loc_force_data(const FPTYPE* force, - const int nat, - const FPTYPE& scale_factor, - ModuleBase::matrix& forcelc) -{ - for (int iat = 0; iat < nat; iat++) - { - forcelc(iat, 0) = static_cast(force[iat * 3 + 0] * scale_factor); - forcelc(iat, 1) = static_cast(force[iat * 3 + 1] * scale_factor); - forcelc(iat, 2) = static_cast(force[iat * 3 + 2] * scale_factor); - } -} - -// Data copy back function for Ewald parallel operator -template -void copy_back_ew_force_data(const FPTYPE* force, - const int nat, - ModuleBase::matrix& forceion) -{ - for (int iat = 0; iat < nat; iat++) - { - forceion(iat, 0) = static_cast(force[iat * 3 + 0]); - forceion(iat, 1) = static_cast(force[iat * 3 + 1]); - forceion(iat, 2) = static_cast(force[iat * 3 + 2]); - } -} - template void Forces::cal_force(UnitCell& ucell, ModuleBase::matrix& force, @@ -735,39 +532,125 @@ void Forces::cal_force_loc(const UnitCell& ucell, // to G space. maybe need fftw with OpenMP rho_basis->real2recip(aux, aux); - // Prepare data for the sincos operator - std::vector gcar_flat, tau_flat, vloc_factors; - std::vector iat2it_array; - prepare_loc_sincos_data(ucell, rho_basis, vloc, aux, - gcar_flat, tau_flat, iat2it_array, vloc_factors); - - // Prepare force array for the operator - std::vector force_flat(this->nat * 3, 0.0); - - // Convert aux to FPTYPE complex array + // =============== GPU/CPU异构优化:使用sincos op替换原循环 =============== + + + // 准备原子相关数据:按照全局原子索引iat顺序 + std::vector tau_flat(this->nat * 3); + std::vector iat2it_host(this->nat); + std::vector gcar_flat(rho_basis->npw * 3); + + // 按照原始代码逻辑:遍历全局原子索引iat,通过查表获取(it,ia) + for (int iat = 0; iat < this->nat; iat++) { + int it = ucell.iat2it[iat]; // 查表获取原子类型 + int ia = ucell.iat2ia[iat]; // 查表获取该类型内的原子索引 + + tau_flat[iat * 3 + 0] = static_cast(ucell.atoms[it].tau[ia][0]); + tau_flat[iat * 3 + 1] = static_cast(ucell.atoms[it].tau[ia][1]); + tau_flat[iat * 3 + 2] = static_cast(ucell.atoms[it].tau[ia][2]); + iat2it_host[iat] = it; + } + + for (int ig = 0; ig < rho_basis->npw; ig++) { + gcar_flat[ig * 3 + 0] = static_cast(rho_basis->gcar[ig][0]); + gcar_flat[ig * 3 + 1] = static_cast(rho_basis->gcar[ig][1]); + gcar_flat[ig * 3 + 2] = static_cast(rho_basis->gcar[ig][2]); + } + + // 重新计算vloc_factors考虑所有原子类型 + std::vector vloc_per_type_host(ucell.ntype * rho_basis->npw); + for (int it = 0; it < ucell.ntype; it++) { + for (int ig = 0; ig < rho_basis->npw; ig++) { + vloc_per_type_host[it * rho_basis->npw + ig] = static_cast(vloc(it, rho_basis->ig2igg[ig])); + } + } + + // 转换aux到FPTYPE类型 std::vector> aux_fptype(rho_basis->npw); - for (int ig = 0; ig < rho_basis->npw; ig++) + for (int ig = 0; ig < rho_basis->npw; ig++) { + aux_fptype[ig] = static_cast>(aux[ig]); + } + + // 设备端内存和指针设置(根据设备类型分支) + FPTYPE* d_gcar = gcar_flat.data(); + FPTYPE* d_tau = tau_flat.data(); + int* d_iat2it = iat2it_host.data(); + FPTYPE* d_vloc_per_type = vloc_per_type_host.data(); + std::complex* d_aux = aux_fptype.data(); + FPTYPE* d_force = nullptr; + std::vector force_host(this->nat * 3); + + if (this->device == base_device::GpuDevice) + { + d_gcar = nullptr; + d_tau = nullptr; + d_iat2it = nullptr; + d_vloc_per_type = nullptr; + d_aux = nullptr; + + resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3); + resmem_var_op()(this->ctx, d_tau, this->nat * 3); + resmem_int_op()(this->ctx, d_iat2it, this->nat); + resmem_var_op()(this->ctx, d_vloc_per_type, ucell.ntype * rho_basis->npw); + resmem_complex_op()(this->ctx, d_aux, rho_basis->npw); + resmem_var_op()(this->ctx, d_force, this->nat * 3); + + // 数据传输到设备 + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3); + syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_iat2it, iat2it_host.data(), this->nat); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vloc_per_type, vloc_per_type_host.data(), ucell.ntype * rho_basis->npw); + syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw); + + // 初始化force为0 + base_device::memory::set_memory_op()(this->ctx, d_force, 0.0, this->nat * 3); + } + else { - aux_fptype[ig] = std::complex(static_cast(aux[ig].real()), - static_cast(aux[ig].imag())); - } - - // Call the sincos operator - hamilt::cal_force_loc_sincos_op sincos_op; - sincos_op(this->ctx, - this->nat, - rho_basis->npw, - ucell.ntype, - gcar_flat.data(), - tau_flat.data(), - iat2it_array.data(), - vloc_factors.data(), - aux_fptype.data(), - force_flat.data()); - - // Copy back the results with scaling + d_force = force_host.data(); + // 对于CPU,直接初始化为0 + std::fill(force_host.begin(), force_host.end(), static_cast(0.0)); + } + + // 计算缩放因子 const FPTYPE scale_factor = static_cast(ucell.tpiba * ucell.omega); - copy_back_loc_force_data(force_flat.data(), this->nat, scale_factor, forcelc); + + // 调用op进行sincos计算 + hamilt::cal_force_loc_sincos_op()( + this->ctx, + this->nat, + rho_basis->npw, + ucell.ntype, + d_gcar, + d_tau, + d_iat2it, + d_vloc_per_type, + d_aux, + scale_factor, + d_force + ); + + // 根据设备类型处理结果 + if (this->device == base_device::GpuDevice) + { + // 结果传回CPU + syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_host.data(), d_force, this->nat * 3); + + // 清理设备内存 + delmem_var_op()(this->ctx, d_gcar); + delmem_var_op()(this->ctx, d_tau); + delmem_int_op()(this->ctx, d_iat2it); + delmem_var_op()(this->ctx, d_vloc_per_type); + delmem_complex_op()(this->ctx, d_aux); + delmem_var_op()(this->ctx, d_force); + } + + // 将结果写入forcelc矩阵 + for (int iat = 0; iat < this->nat; iat++) { + forcelc(iat, 0) = static_cast(force_host[iat * 3 + 0]); + forcelc(iat, 1) = static_cast(force_host[iat * 3 + 1]); + forcelc(iat, 2) = static_cast(force_host[iat * 3 + 2]); + } // this->print(GlobalV::ofs_running, "local forces", forcelc); Parallel_Reduce::reduce_pool(forcelc.c, forcelc.nr * forcelc.nc); @@ -879,48 +762,233 @@ void Forces::cal_force_ew(const UnitCell& ucell, aux[rho_basis->ig_gge0] = std::complex(0.0, 0.0); } - // Prepare data for the parallel operator - std::vector gcar_flat, tau_flat, it_facts, zv_values, latvec_flat, G_flat; - std::vector iat2it_array, atom_na; - prepare_ew_parallel_data(ucell, rho_basis, aux, alpha, fact, - gcar_flat, tau_flat, iat2it_array, it_facts, - atom_na, zv_values, latvec_flat, G_flat); + // =============== 第一步:G空间sincos计算(在OpenMP区域外)=============== + + // 预计算每个原子的it_fact和相关数据:按照全局原子索引iat顺序 + std::vector it_facts_host(this->nat); + std::vector tau_flat(this->nat * 3); + std::vector iat2it_host(this->nat); + + // 按照原始代码逻辑:遍历全局原子索引iat,通过查表获取(it,ia) + for (int iat = 0; iat < this->nat; iat++) { + int it = ucell.iat2it[iat]; // 查表获取原子类型 + int ia = ucell.iat2ia[iat]; // 查表获取该类型内的原子索引 + + double zv; + if (PARAM.inp.use_paw) + { +#ifdef USE_PAW + zv = GlobalC::paw_cell.get_val(it); +#endif + } + else + { + zv = ucell.atoms[it].ncpp.zv; + } + + it_facts_host[iat] = static_cast(zv * ModuleBase::e2 * ucell.tpiba * + ModuleBase::TWO_PI / ucell.omega * fact); + + tau_flat[iat * 3 + 0] = static_cast(ucell.atoms[it].tau[ia][0]); + tau_flat[iat * 3 + 1] = static_cast(ucell.atoms[it].tau[ia][1]); + tau_flat[iat * 3 + 2] = static_cast(ucell.atoms[it].tau[ia][2]); + iat2it_host[iat] = it; + } + + // 准备设备端数据 + std::vector gcar_flat(rho_basis->npw * 3); + for (int ig = 0; ig < rho_basis->npw; ig++) { + gcar_flat[ig * 3 + 0] = static_cast(rho_basis->gcar[ig][0]); + gcar_flat[ig * 3 + 1] = static_cast(rho_basis->gcar[ig][1]); + gcar_flat[ig * 3 + 2] = static_cast(rho_basis->gcar[ig][2]); + } + + // 转换aux到FPTYPE类型 + std::vector> aux_fptype(rho_basis->npw); + for (int ig = 0; ig < rho_basis->npw; ig++) { + aux_fptype[ig] = static_cast>(aux[ig]); + } + + // 设备端内存和指针设置(根据设备类型分支) + FPTYPE* d_gcar = gcar_flat.data(); + FPTYPE* d_tau = tau_flat.data(); + int* d_iat2it = iat2it_host.data(); + FPTYPE* d_it_facts = it_facts_host.data(); + std::complex* d_aux = aux_fptype.data(); + FPTYPE* d_force_g = nullptr; + std::vector force_g_host(this->nat * 3); + + if (this->device == base_device::GpuDevice) + { + d_gcar = nullptr; + d_tau = nullptr; + d_iat2it = nullptr; + d_it_facts = nullptr; + d_aux = nullptr; + + resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3); + resmem_var_op()(this->ctx, d_tau, this->nat * 3); + resmem_int_op()(this->ctx, d_iat2it, this->nat); + resmem_var_op()(this->ctx, d_it_facts, this->nat); + resmem_complex_op()(this->ctx, d_aux, rho_basis->npw); + resmem_var_op()(this->ctx, d_force_g, this->nat * 3); + + // 数据传输 + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3); + syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_iat2it, iat2it_host.data(), this->nat); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_it_facts, it_facts_host.data(), this->nat); + syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw); + + // 初始化force为0 + base_device::memory::set_memory_op()(this->ctx, d_force_g, 0.0, this->nat * 3); + } + else + { + d_force_g = force_g_host.data(); + // 对于CPU,直接初始化为0 + std::fill(force_g_host.begin(), force_g_host.end(), static_cast(0.0)); + } + + // 调用op处理G空间sincos计算(在OpenMP区域外,无冲突) + hamilt::cal_force_ew_sincos_op()( + this->ctx, + this->nat, + rho_basis->npw, + rho_basis->ig_gge0, // G=0项索引,op内部会自动跳过 + d_gcar, + d_tau, + d_iat2it, + d_it_facts, + d_aux, + d_force_g + ); + + // 根据设备类型处理结果 + if (this->device == base_device::GpuDevice) + { + // 将G空间结果传回CPU + syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_g_host.data(), d_force_g, this->nat * 3); + + // 清理设备内存 + delmem_var_op()(this->ctx, d_gcar); + delmem_var_op()(this->ctx, d_tau); + delmem_int_op()(this->ctx, d_iat2it); + delmem_var_op()(this->ctx, d_it_facts); + delmem_complex_op()(this->ctx, d_aux); + delmem_var_op()(this->ctx, d_force_g); + } + + // 累加到forceion + for (int iat = 0; iat < this->nat; iat++) { + forceion(iat, 0) += static_cast(force_g_host[iat * 3 + 0]); + forceion(iat, 1) += static_cast(force_g_host[iat * 3 + 1]); + forceion(iat, 2) += static_cast(force_g_host[iat * 3 + 2]); + } - // Prepare force array for the operator - std::vector force_flat(this->nat * 3, 0.0); + // =============== 第二步:实空间计算(保留原OpenMP结构)=============== - // Convert aux to FPTYPE complex array - std::vector> aux_fptype(rho_basis->npw); - for (int ig = 0; ig < rho_basis->npw; ig++) +#ifdef _OPENMP +#pragma omp parallel { - aux_fptype[ig] = std::complex(static_cast(aux[ig].real()), - static_cast(aux[ig].imag())); - } - - // Call the parallel operator - hamilt::cal_force_ew_parallel_op parallel_op; - parallel_op(this->ctx, - this->nat, - rho_basis->npw, - ucell.ntype, - rho_basis->ig_gge0, - gcar_flat.data(), - tau_flat.data(), - iat2it_array.data(), - it_facts.data(), - atom_na.data(), - zv_values.data(), - aux_fptype.data(), - static_cast(alpha), - static_cast(fact), - static_cast(ucell.lat0), - latvec_flat.data(), - G_flat.data(), - PARAM.inp.use_paw, - force_flat.data()); - - // Copy back the results - copy_back_ew_force_data(force_flat.data(), this->nat, forceion); + int num_threads = omp_get_num_threads(); + int thread_id = omp_get_thread_num(); +#else + int num_threads = 1; + int thread_id = 0; +#endif + + /* Here is task distribution for multi-thread, + 0. atom will be iterated both in main nat loop and the loop in `if (rho_basis->ig_gge0 >= 0)`. + To avoid syncing, we must calculate work range of each thread by our self + 1. Calculate the iat range [iat_beg, iat_end) by each thread + a. when it is single thread stage, [iat_beg, iat_end) will be [0, nat) + 2. each thread iterate atoms form `iat_beg` to `iat_end-1` + */ + int iat_beg, iat_end; + int it_beg, ia_beg; + ModuleBase::TASK_DIST_1D(num_threads, thread_id, this->nat, iat_beg, iat_end); + iat_end = iat_beg + iat_end; + ucell.iat2iait(iat_beg, &ia_beg, &it_beg); + + // 只保留实空间相互作用计算(means that the processor contains G=0 term) + if (rho_basis->ig_gge0 >= 0) + { + double rmax = 5.0 / (sqrt(alpha) * ucell.lat0); + int nrm = 0; + + // output of rgen: the number of vectors in the sphere + const int mxr = 200; + // the maximum number of R vectors included in r + ModuleBase::Vector3* r = new ModuleBase::Vector3[mxr]; + double* r2 = new double[mxr]; + ModuleBase::GlobalFunc::ZEROS(r2, mxr); + int* irr = new int[mxr]; + ModuleBase::GlobalFunc::ZEROS(irr, mxr); + // the square modulus of R_j-tau_s-tau_s' + + int iat1 = iat_beg; + int T1 = it_beg; + int I1 = ia_beg; + const double sqa = sqrt(alpha); + const double sq8a_2pi = sqrt(8.0 * alpha / ModuleBase::TWO_PI); + + // iterating atoms. + // do not need to sync threads because task range of each thread is isolated + while (iat1 < iat_end) + { + int iat2 = 0; // mohan fix bug 2011-06-07 + int I2 = 0; + int T2 = 0; + while (iat2 < this->nat) + { + if (iat1 != iat2 && ucell.atoms[T2].na != 0 && ucell.atoms[T1].na != 0) + { + ModuleBase::Vector3 d_tau + = ucell.atoms[T1].tau[I1] - ucell.atoms[T2].tau[I2]; + H_Ewald_pw::rgen(d_tau, rmax, irr, ucell.latvec, ucell.G, r, r2, nrm); + + for (int n = 0; n < nrm; n++) + { + const double rr = sqrt(r2[n]) * ucell.lat0; + + double factor; + if (PARAM.inp.use_paw) + { +#ifdef USE_PAW + factor = GlobalC::paw_cell.get_val(T1) * GlobalC::paw_cell.get_val(T2) * ModuleBase::e2 + / (rr * rr) + * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr)) + * ucell.lat0; +#endif + } + else + { + factor = ucell.atoms[T1].ncpp.zv * ucell.atoms[T2].ncpp.zv + * ModuleBase::e2 / (rr * rr) + * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr)) + * ucell.lat0; + } + + forceion(iat1, 0) -= factor * r[n].x; + forceion(iat1, 1) -= factor * r[n].y; + forceion(iat1, 2) -= factor * r[n].z; + } + } + ++iat2; + ucell.step_iait(&I2, &T2); + } // atom b + ++iat1; + ucell.step_iait(&I1, &T1); + } // atom a + + delete[] r; + delete[] r2; + delete[] irr; + } +#ifdef _OPENMP + } +#endif Parallel_Reduce::reduce_pool(forceion.c, forceion.nr * forceion.nc); diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu index 5d0656d105..077864caff 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu @@ -13,6 +13,125 @@ namespace hamilt { +// CUDA kernels for sincos loops +template +__global__ void cal_force_loc_sincos_kernel( + const int nat, + const int npw, + const int ntype, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_per_type, + const thrust::complex* aux, + const FPTYPE scale_factor, + FPTYPE* force) +{ + const FPTYPE TWO_PI = 2.0 * M_PI; + + const int iat = blockIdx.y; + const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; + const int ig_stride = gridDim.x * blockDim.x; + + if (iat >= nat) return; + + // Load atom information to registers + const int it = iat2it[iat]; + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + + // Local accumulation variables + FPTYPE local_force_x = 0.0; + FPTYPE local_force_y = 0.0; + FPTYPE local_force_z = 0.0; + + // Grid-stride loop over G-vectors + for (int ig = ig_start; ig < npw; ig += ig_stride) { + // Calculate phase: 2π * (G · τ) + const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z); + + // Use CUDA intrinsic for sincos + FPTYPE sinp, cosp; + sincos(phase, &sinp, &cosp); + + // Calculate force factor + const FPTYPE vloc_factor = vloc_per_type[it * npw + ig]; + const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()); + + // Accumulate force contributions + local_force_x += gcar[ig * 3 + 0] * factor; + local_force_y += gcar[ig * 3 + 1] * factor; + local_force_z += gcar[ig * 3 + 2] * factor; + } + + // Atomic add to global memory + atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor); + atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor); + atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor); +} + +template +__global__ void cal_force_ew_sincos_kernel( + const int nat, + const int npw, + const int ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const thrust::complex* aux, + FPTYPE* force) +{ + const FPTYPE TWO_PI = 2.0 * M_PI; + + const int iat = blockIdx.y; + const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; + const int ig_stride = gridDim.x * blockDim.x; + + if (iat >= nat) return; + + // Load atom information to registers + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + const FPTYPE it_fact = it_facts[iat]; + + // Local accumulation variables + FPTYPE local_force_x = 0.0; + FPTYPE local_force_y = 0.0; + FPTYPE local_force_z = 0.0; + + // Grid-stride loop over G-vectors + for (int ig = ig_start; ig < npw; ig += ig_stride) { + // Skip G=0 term + if (ig == ig_gge0) continue; + + // Calculate phase: 2π * (G · τ) + const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z); + + // Use CUDA intrinsic for sincos + FPTYPE sinp, cosp; + sincos(phase, &sinp, &cosp); + + // Calculate Ewald sum contribution + const FPTYPE factor = it_fact * (cosp * aux[ig].imag() + sinp * aux[ig].real()); + + // Accumulate force contributions + local_force_x += gcar[ig * 3 + 0] * factor; + local_force_y += gcar[ig * 3 + 1] * factor; + local_force_z += gcar[ig * 3 + 2] * factor; + } + + // Atomic add to global memory + atomicAdd(&force[iat * 3 + 0], local_force_x); + atomicAdd(&force[iat * 3 + 1], local_force_y); + atomicAdd(&force[iat * 3 + 2], local_force_z); +} template __global__ void cal_vkb1_nl( @@ -188,6 +307,67 @@ void cal_force_nl_op::operator()(const base_dev cudaCheckOnDebug(); } +// GPU operators +template +void cal_force_loc_sincos_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const int& ntype, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_per_type, + const std::complex* aux, + const FPTYPE& scale_factor, + FPTYPE* force) +{ + // Calculate optimal grid configuration for GPU load balancing + const int threads_per_block = THREADS_PER_BLOCK; + const int max_blocks_per_sm = 32; // Configurable per GPU architecture + const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block); + + dim3 grid(max_blocks_x, nat); + dim3 block(threads_per_block); + + cal_force_loc_sincos_kernel<<>>( + nat, npw, ntype, gcar, tau, iat2it, vloc_per_type, + reinterpret_cast*>(aux), + scale_factor, force + ); + + cudaCheckOnDebug(); +} + +template +void cal_force_ew_sincos_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const int& ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const std::complex* aux, + FPTYPE* force) +{ + // Calculate optimal grid configuration for GPU load balancing + const int threads_per_block = THREADS_PER_BLOCK; + const int max_blocks_per_sm = 32; // Configurable per GPU architecture + const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block); + + dim3 grid(max_blocks_x, nat); + dim3 block(threads_per_block); + + cal_force_ew_sincos_kernel<<>>( + nat, npw, ig_gge0, gcar, tau, iat2it, it_facts, + reinterpret_cast*>(aux), force + ); + + cudaCheckOnDebug(); +} + template __global__ void cal_force_nl( const int ntype, @@ -613,8 +793,12 @@ template void saveVkbValues(const int *gcar_zero_ptrs, const std::comple template struct cal_vkb1_nl_op; template struct cal_force_nl_op; +template struct cal_force_loc_sincos_op; +template struct cal_force_ew_sincos_op; template struct cal_vkb1_nl_op; template struct cal_force_nl_op; +template struct cal_force_loc_sincos_op; +template struct cal_force_ew_sincos_op; } // namespace hamilt diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu deleted file mode 100644 index f97214893e..0000000000 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu +++ /dev/null @@ -1,204 +0,0 @@ -#include "module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h" -#include "module_base/module_device/types.h" - -#include -#include -#include -#include - -#define THREADS_PER_BLOCK 256 - -namespace hamilt { - -// CUDA kernels for sincos loops -template -__global__ void cal_force_loc_sincos_kernel( - const int nat, - const int npw, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* vloc_factors, - const thrust::complex* aux, - const FPTYPE scale_factor, - FPTYPE* force) -{ - const FPTYPE TWO_PI = 2.0 * M_PI; - - const int iat = blockIdx.y; - const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; - const int ig_stride = gridDim.x * blockDim.x; - - if (iat >= nat) return; - - // Load atom information to registers - const int it = iat2it[iat]; - const FPTYPE tau_x = tau[iat * 3 + 0]; - const FPTYPE tau_y = tau[iat * 3 + 1]; - const FPTYPE tau_z = tau[iat * 3 + 2]; - - // Local accumulation variables - FPTYPE local_force_x = 0.0; - FPTYPE local_force_y = 0.0; - FPTYPE local_force_z = 0.0; - - // Grid-stride loop over G-vectors - for (int ig = ig_start; ig < npw; ig += ig_stride) { - // Calculate phase: 2π * (G · τ) - const FPTYPE phase = TWO_PI * ( - gcar[ig * 3 + 0] * tau_x + - gcar[ig * 3 + 1] * tau_y + - gcar[ig * 3 + 2] * tau_z - ); - - // Use CUDA intrinsic for sincos - FPTYPE sinp, cosp; - __sincos(phase, &sinp, &cosp); - - // Calculate force factor - const FPTYPE factor = vloc_factors[ig] * - (cosp * aux[ig].imag() + sinp * aux[ig].real()); - - // Accumulate force contributions - local_force_x += gcar[ig * 3 + 0] * factor; - local_force_y += gcar[ig * 3 + 1] * factor; - local_force_z += gcar[ig * 3 + 2] * factor; - } - - // Atomic add to global memory - atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor); - atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor); - atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor); -} - -template -__global__ void cal_force_ew_sincos_kernel( - const int nat, - const int npw, - const int ig_gge0, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* it_facts, - const thrust::complex* aux, - FPTYPE* force) -{ - const FPTYPE TWO_PI = 2.0 * M_PI; - - const int iat = blockIdx.y; - const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; - const int ig_stride = gridDim.x * blockDim.x; - - if (iat >= nat) return; - - // Load atom information to registers - const int it = iat2it[iat]; - const FPTYPE tau_x = tau[iat * 3 + 0]; - const FPTYPE tau_y = tau[iat * 3 + 1]; - const FPTYPE tau_z = tau[iat * 3 + 2]; - const FPTYPE it_fact = it_facts[iat]; - - // Local accumulation variables - FPTYPE local_force_x = 0.0; - FPTYPE local_force_y = 0.0; - FPTYPE local_force_z = 0.0; - - // Grid-stride loop over G-vectors - for (int ig = ig_start; ig < npw; ig += ig_stride) { - // Skip G=0 term - if (ig == ig_gge0) continue; - - // Calculate phase: 2π * (G · τ) - const FPTYPE arg = TWO_PI * ( - gcar[ig * 3 + 0] * tau_x + - gcar[ig * 3 + 1] * tau_y + - gcar[ig * 3 + 2] * tau_z - ); - - // Use CUDA intrinsic for sincos - FPTYPE sinp, cosp; - __sincos(arg, &sinp, &cosp); - - // Calculate Ewald sum contribution - const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real(); - - // Accumulate force contributions - local_force_x += gcar[ig * 3 + 0] * sumnb; - local_force_y += gcar[ig * 3 + 1] * sumnb; - local_force_z += gcar[ig * 3 + 2] * sumnb; - } - - // Atomic add to global memory - atomicAdd(&force[iat * 3 + 0], local_force_x * it_fact); - atomicAdd(&force[iat * 3 + 1], local_force_y * it_fact); - atomicAdd(&force[iat * 3 + 2], local_force_z * it_fact); -} - -// GPU operator implementations -template -void cal_force_loc_sincos_op::operator()( - const base_device::DEVICE_GPU* ctx, - const int& nat, - const int& npw, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* vloc_factors, - const std::complex* aux, - const FPTYPE& scale_factor, - FPTYPE* force) -{ - // Calculate optimal grid configuration - const int threads_per_block = THREADS_PER_BLOCK; - const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block); - - dim3 grid(max_blocks_x, nat); - dim3 block(threads_per_block); - - cal_force_loc_sincos_kernel<<>>( - nat, npw, - gcar, tau, iat2it, vloc_factors, - reinterpret_cast*>(aux), - scale_factor, force - ); - - cudaCheckOnDebug(); -} - -template -void cal_force_ew_sincos_op::operator()( - const base_device::DEVICE_GPU* ctx, - const int& nat, - const int& npw, - const int& ig_gge0, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* it_facts, - const std::complex* aux, - FPTYPE* force) -{ - // Calculate optimal grid configuration - const int threads_per_block = THREADS_PER_BLOCK; - const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block); - - dim3 grid(max_blocks_x, nat); - dim3 block(threads_per_block); - - cal_force_ew_sincos_kernel<<>>( - nat, npw, ig_gge0, - gcar, tau, iat2it, it_facts, - reinterpret_cast*>(aux), - force - ); - - cudaCheckOnDebug(); -} - -template struct cal_force_loc_sincos_op; -template struct cal_force_loc_sincos_op; - -template struct cal_force_ew_sincos_op; -template struct cal_force_ew_sincos_op; - -} // namespace hamilt \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp index b5745f2ac5..e0dc91a70c 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp @@ -439,8 +439,9 @@ struct cal_force_loc_sincos_op const FPTYPE* gcar, const FPTYPE* tau, const int* iat2it, - const FPTYPE* vloc_factors, + const FPTYPE* vloc_per_type, const std::complex* aux, + const FPTYPE& scale_factor, FPTYPE* force) { const FPTYPE TWO_PI = 2.0 * M_PI; @@ -465,8 +466,8 @@ struct cal_force_loc_sincos_op FPTYPE sinp, cosp; ModuleBase::libm::sincos(phase, &sinp, &cosp); - const FPTYPE vloc_factor = vloc_factors[it * npw + ig]; - const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()); + const FPTYPE vloc_factor = vloc_per_type[it * npw + ig]; + const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()) * scale_factor; local_force[0] += gcar[ig * 3 + 0] * factor; local_force[1] += gcar[ig * 3 + 1] * factor; @@ -480,209 +481,66 @@ struct cal_force_loc_sincos_op } }; -// CPU implementation of Ewald parallel operator +// CPU implementation of Ewald force sincos operator template -struct cal_force_ew_parallel_op +struct cal_force_ew_sincos_op { void operator()(const base_device::DEVICE_CPU* ctx, const int& nat, const int& npw, - const int& ntype, const int& ig_gge0, const FPTYPE* gcar, const FPTYPE* tau, const int* iat2it, const FPTYPE* it_facts, - const int* atom_na, - const FPTYPE* zv_values, const std::complex* aux, - const FPTYPE& alpha, - const FPTYPE& fact, - const FPTYPE& lat0, - const FPTYPE* latvec, - const FPTYPE* G, - const bool& use_paw, FPTYPE* force) { const FPTYPE TWO_PI = 2.0 * M_PI; - const FPTYPE e2 = 2.0; // ModuleBase::e2 equivalent #ifdef _OPENMP -#pragma omp parallel - { - int num_threads = omp_get_num_threads(); - int thread_id = omp_get_thread_num(); -#else - int num_threads = 1; - int thread_id = 0; +#pragma omp parallel for #endif + for (int iat = 0; iat < nat; ++iat) + { + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + const FPTYPE it_fact = it_facts[iat]; - // Task distribution for multi-thread - int iat_beg, iat_end; - ModuleBase::TASK_DIST_1D(num_threads, thread_id, nat, iat_beg, iat_end); - iat_end = iat_beg + iat_end; - - // Preprocess ig_gap for skipping the ig point - int ig_gap = (ig_gge0 >= 0 && ig_gge0 < npw) ? ig_gge0 : -1; + FPTYPE local_force[3] = {0.0, 0.0, 0.0}; - // Part 1: sincos calculation (replacing original while loop with for loop) - for (int iat = iat_beg; iat < iat_end; ++iat) + for (int ig = 0; ig < npw; ig++) { - const int it = iat2it[iat]; - - // Check if this atom type has atoms - if (atom_na[it] == 0) continue; - - const FPTYPE tau_x = tau[iat * 3 + 0]; - const FPTYPE tau_y = tau[iat * 3 + 1]; - const FPTYPE tau_z = tau[iat * 3 + 2]; - const FPTYPE it_fact = it_facts[iat]; + // Skip G=0 term + if (ig == ig_gge0) continue; - FPTYPE local_force[3] = {0.0, 0.0, 0.0}; - - // G-vector loop with ig_gap handling (using for loops for GPU compatibility) - for (int ig = 0; ig < npw; ig++) - { - // Skip G=0 term - if (ig == ig_gap) continue; - - const FPTYPE arg = TWO_PI * (gcar[ig * 3 + 0] * tau_x + - gcar[ig * 3 + 1] * tau_y + - gcar[ig * 3 + 2] * tau_z); - FPTYPE sinp, cosp; - ModuleBase::libm::sincos(arg, &sinp, &cosp); - - const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real(); - - local_force[0] += gcar[ig * 3 + 0] * sumnb; - local_force[1] += gcar[ig * 3 + 1] * sumnb; - local_force[2] += gcar[ig * 3 + 2] * sumnb; - } - - // Apply it_fact scaling - force[iat * 3 + 0] += local_force[0] * it_fact; - force[iat * 3 + 1] += local_force[1] * it_fact; - force[iat * 3 + 2] += local_force[2] * it_fact; - } - - // Part 2: Real space Ewald calculation (when processor contains G=0 term) - if (ig_gge0 >= 0) - { - const FPTYPE rmax = 5.0 / (sqrt(alpha) * lat0); - const int mxr = 200; + const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z); + FPTYPE sinp, cosp; + ModuleBase::libm::sincos(phase, &sinp, &cosp); - // Allocate temporary arrays for each thread - ModuleBase::Vector3* r_flat = new ModuleBase::Vector3[mxr]; - double* r2 = new double[mxr]; - int* irr = new int[mxr]; + const FPTYPE factor = it_fact * (cosp * aux[ig].imag() + sinp * aux[ig].real()); - // Initialize arrays - for (int i = 0; i < mxr; i++) { - r2[i] = 0.0; - irr[i] = 0; - r_flat[i].x = 0.0; - r_flat[i].y = 0.0; - r_flat[i].z = 0.0; - } - - const FPTYPE sqa = sqrt(alpha); - const FPTYPE sq8a_2pi = sqrt(8.0 * alpha / TWO_PI); - - // Iterate over atoms in assigned range - for (int iat1 = iat_beg; iat1 < iat_end; ++iat1) - { - const int T1 = iat2it[iat1]; - if (atom_na[T1] == 0) continue; - - const FPTYPE tau1_x = tau[iat1 * 3 + 0]; - const FPTYPE tau1_y = tau[iat1 * 3 + 1]; - const FPTYPE tau1_z = tau[iat1 * 3 + 2]; - - // Iterate over all other atoms - for (int iat2 = 0; iat2 < nat; ++iat2) - { - const int T2 = iat2it[iat2]; - - if (iat1 != iat2 && atom_na[T2] != 0 && atom_na[T1] != 0) - { - const FPTYPE tau2_x = tau[iat2 * 3 + 0]; - const FPTYPE tau2_y = tau[iat2 * 3 + 1]; - const FPTYPE tau2_z = tau[iat2 * 3 + 2]; - - // Calculate d_tau - const FPTYPE d_tau_x = tau1_x - tau2_x; - const FPTYPE d_tau_y = tau1_y - tau2_y; - const FPTYPE d_tau_z = tau1_z - tau2_z; - - // Reconstruct Matrix3 objects from flat arrays for rgen call - ModuleBase::Matrix3 latvec_matrix( - static_cast(latvec[0]), static_cast(latvec[1]), static_cast(latvec[2]), - static_cast(latvec[3]), static_cast(latvec[4]), static_cast(latvec[5]), - static_cast(latvec[6]), static_cast(latvec[7]), static_cast(latvec[8]) - ); - - ModuleBase::Matrix3 G_matrix( - static_cast(G[0]), static_cast(G[1]), static_cast(G[2]), - static_cast(G[3]), static_cast(G[4]), static_cast(G[5]), - static_cast(G[6]), static_cast(G[7]), static_cast(G[8]) - ); - - ModuleBase::Vector3 dtau_vec( - static_cast(d_tau_x), - static_cast(d_tau_y), - static_cast(d_tau_z) - ); - - // Call H_Ewald_pw::rgen to generate neighbor shells - int nrm = 0; - H_Ewald_pw::rgen(dtau_vec, static_cast(rmax), irr, latvec_matrix, G_matrix, r_flat, r2, nrm); - - // Process all generated r vectors - for (int nr = 0; nr < nrm; nr++) - { - const FPTYPE rr = sqrt(r2[nr]) * lat0; - - FPTYPE factor; - if (use_paw) - { - factor = zv_values[T1] * zv_values[T2] * e2 / (rr * rr) - * (erfc(sqa * rr) / rr + sq8a_2pi * exp(-alpha * rr * rr)) - * lat0; - } - else - { - factor = zv_values[T1] * zv_values[T2] * e2 / (rr * rr) - * (erfc(sqa * rr) / rr + sq8a_2pi * exp(-alpha * rr * rr)) - * lat0; - } - - force[iat1 * 3 + 0] -= factor * static_cast(r_flat[nr].x); - force[iat1 * 3 + 1] -= factor * static_cast(r_flat[nr].y); - force[iat1 * 3 + 2] -= factor * static_cast(r_flat[nr].z); - } - } - } - } - - delete[] r_flat; - delete[] r2; - delete[] irr; + local_force[0] += gcar[ig * 3 + 0] * factor; + local_force[1] += gcar[ig * 3 + 1] * factor; + local_force[2] += gcar[ig * 3 + 2] * factor; } -#ifdef _OPENMP + force[iat * 3 + 0] = local_force[0]; + force[iat * 3 + 1] = local_force[1]; + force[iat * 3 + 2] = local_force[2]; } -#endif } }; template struct cal_vkb1_nl_op; template struct cal_force_nl_op; template struct cal_force_loc_sincos_op; -template struct cal_force_ew_parallel_op; - +template struct cal_force_ew_sincos_op; template struct cal_vkb1_nl_op; template struct cal_force_nl_op; template struct cal_force_loc_sincos_op; -template struct cal_force_ew_parallel_op; - +template struct cal_force_ew_sincos_op; } // namespace hamilt diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h index 152f41e310..3fc5d92212 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h @@ -162,8 +162,9 @@ struct cal_force_loc_sincos_op /// @param gcar - G-vector Cartesian coordinates [npw * 3] /// @param tau - atomic positions [nat * 3] /// @param iat2it - atom to type mapping [nat] - /// @param vloc_factors - precomputed vloc factors [ntype * npw] + /// @param vloc_per_type - precomputed vloc factors per type [ntype * npw] /// @param aux - charge density in G-space [npw] + /// @param scale_factor - tpiba * omega /// /// Output Parameters /// @param force - output forces [nat * 3] @@ -174,55 +175,39 @@ struct cal_force_loc_sincos_op const FPTYPE* gcar, const FPTYPE* tau, const int* iat2it, - const FPTYPE* vloc_factors, + const FPTYPE* vloc_per_type, const std::complex* aux, + const FPTYPE& scale_factor, FPTYPE* force); }; template -struct cal_force_ew_parallel_op +struct cal_force_ew_sincos_op { - /// @brief Calculate Ewald forces (parallel region only) + /// @brief Calculate Ewald forces (sincos loop only) /// /// Input Parameters /// @param ctx - which device this function runs on /// @param nat - number of atoms /// @param npw - number of plane waves - /// @param ntype - number of atom types /// @param ig_gge0 - index of G=0 vector (-1 if not present) /// @param gcar - G-vector Cartesian coordinates [npw * 3] /// @param tau - atomic positions [nat * 3] /// @param iat2it - atom to type mapping [nat] /// @param it_facts - precomputed it_fact for each atom [nat] - /// @param atom_na - number of atoms of each type [ntype] - /// @param zv_values - valence charge for each type [ntype] /// @param aux - structure factor related array [npw] - /// @param alpha - Ewald parameter - /// @param fact - scaling factor - /// @param lat0 - lattice constant - /// @param latvec - lattice vectors [3 * 3] - /// @param G - reciprocal lattice vectors [3 * 3] - /// @param use_pwa - whether PAW is used + /// /// Output Parameters /// @param force - output forces [nat * 3] void operator()(const Device* ctx, const int& nat, const int& npw, - const int& ntype, const int& ig_gge0, const FPTYPE* gcar, const FPTYPE* tau, const int* iat2it, const FPTYPE* it_facts, - const int* atom_na, - const FPTYPE* zv_values, const std::complex* aux, - const FPTYPE& alpha, - const FPTYPE& fact, - const FPTYPE& lat0, - const FPTYPE* latvec, - const FPTYPE* G, - const bool& use_paw, FPTYPE* force); }; @@ -335,13 +320,14 @@ struct cal_force_loc_sincos_op const FPTYPE* gcar, const FPTYPE* tau, const int* iat2it, - const FPTYPE* vloc_factors, + const FPTYPE* vloc_per_type, const std::complex* aux, + const FPTYPE& scale_factor, FPTYPE* force); }; template -struct cal_force_ew_parallel_op +struct cal_force_ew_sincos_op { void operator()(const base_device::DEVICE_GPU* ctx, const int& nat, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp deleted file mode 100644 index e7ce8554cc..0000000000 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp +++ /dev/null @@ -1,123 +0,0 @@ -#include "force_sincos_op.h" -#include "module_base/libm/libm.h" - -#ifdef _OPENMP -#include -#endif - -namespace hamilt -{ - -template -struct cal_force_loc_sincos_op -{ - void operator()(const base_device::DEVICE_CPU* ctx, - const int& nat, - const int& npw, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* vloc_factors, - const std::complex* aux, - const FPTYPE& scale_factor, - FPTYPE* force) - { - const FPTYPE TWO_PI = 2.0 * M_PI; - -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (int iat = 0; iat < nat; ++iat) - { - const int it = iat2it[iat]; - const FPTYPE tau_x = tau[iat * 3 + 0]; - const FPTYPE tau_y = tau[iat * 3 + 1]; - const FPTYPE tau_z = tau[iat * 3 + 2]; - - FPTYPE local_force[3] = {0.0, 0.0, 0.0}; - - for (int ig = 0; ig < npw; ig++) - { - const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + - gcar[ig * 3 + 1] * tau_y + - gcar[ig * 3 + 2] * tau_z); - FPTYPE sinp, cosp; - ModuleBase::libm::sincos(phase, &sinp, &cosp); - - const FPTYPE factor = vloc_factors[ig] * - (cosp * aux[ig].imag() + sinp * aux[ig].real()); - - local_force[0] += gcar[ig * 3 + 0] * factor; - local_force[1] += gcar[ig * 3 + 1] * factor; - local_force[2] += gcar[ig * 3 + 2] * factor; - } - - force[iat * 3 + 0] += local_force[0] * scale_factor; - force[iat * 3 + 1] += local_force[1] * scale_factor; - force[iat * 3 + 2] += local_force[2] * scale_factor; - } - } -}; - -template -struct cal_force_ew_sincos_op -{ - void operator()(const base_device::DEVICE_CPU* ctx, - const int& nat, - const int& npw, - const int& ig_gge0, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* it_facts, - const std::complex* aux, - FPTYPE* force) - { - const FPTYPE TWO_PI = 2.0 * M_PI; - -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (int iat = 0; iat < nat; ++iat) - { - const int it = iat2it[iat]; - const FPTYPE tau_x = tau[iat * 3 + 0]; - const FPTYPE tau_y = tau[iat * 3 + 1]; - const FPTYPE tau_z = tau[iat * 3 + 2]; - const FPTYPE it_fact = it_facts[iat]; - - FPTYPE local_force[3] = {0.0, 0.0, 0.0}; - - for (int ig = 0; ig < npw; ig++) - { - // Skip G=0 term - if (ig == ig_gge0) continue; - - const FPTYPE arg = TWO_PI * (gcar[ig * 3 + 0] * tau_x + - gcar[ig * 3 + 1] * tau_y + - gcar[ig * 3 + 2] * tau_z); - FPTYPE sinp, cosp; - ModuleBase::libm::sincos(arg, &sinp, &cosp); - - const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real(); - - local_force[0] += gcar[ig * 3 + 0] * sumnb; - local_force[1] += gcar[ig * 3 + 1] * sumnb; - local_force[2] += gcar[ig * 3 + 2] * sumnb; - } - - force[iat * 3 + 0] += local_force[0] * it_fact; - force[iat * 3 + 1] += local_force[1] * it_fact; - force[iat * 3 + 2] += local_force[2] * it_fact; - } - } -}; - -// Template instantiations -template struct cal_force_loc_sincos_op; -template struct cal_force_loc_sincos_op; - -template struct cal_force_ew_sincos_op; -template struct cal_force_ew_sincos_op; - -} // namespace hamilt \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h deleted file mode 100644 index 455b8425a7..0000000000 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h +++ /dev/null @@ -1,106 +0,0 @@ -#ifndef FORCE_SINCOS_OP_H -#define FORCE_SINCOS_OP_H - -#include "module_base/module_device/types.h" -#include - -namespace hamilt -{ - -template -struct cal_force_loc_sincos_op -{ - /// @brief Calculate local pseudopotential forces (sincos loop only) - /// - /// Input Parameters - /// @param ctx - which device this function runs on - /// @param nat - number of atoms - /// @param npw - number of plane waves - /// @param gcar - G-vector Cartesian coordinates [npw * 3] - /// @param tau - atomic positions [nat * 3] - /// @param iat2it - atom to type mapping [nat] - /// @param vloc_factors - precomputed vloc factors [npw] - /// @param aux - charge density in G-space [npw] - /// @param scale_factor - tpiba * omega - /// - /// Output Parameters - /// @param force - output forces [nat * 3] - void operator()(const Device* ctx, - const int& nat, - const int& npw, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* vloc_factors, - const std::complex* aux, - const FPTYPE& scale_factor, - FPTYPE* force); -}; - -template -struct cal_force_ew_sincos_op -{ - /// @brief Calculate Ewald forces (sincos loop only) - /// - /// Input Parameters - /// @param ctx - which device this function runs on - /// @param nat - number of atoms - /// @param npw - number of plane waves - /// @param ig_gge0 - index of G=0 vector (-1 if not present) - /// @param gcar - G-vector Cartesian coordinates [npw * 3] - /// @param tau - atomic positions [nat * 3] - /// @param iat2it - atom to type mapping [nat] - /// @param it_facts - precomputed it_fact for each atom [nat] - /// @param aux - structure factor related array [npw] - /// - /// Output Parameters - /// @param force - output forces [nat * 3] - void operator()(const Device* ctx, - const int& nat, - const int& npw, - const int& ig_gge0, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* it_facts, - const std::complex* aux, - FPTYPE* force); -}; - -#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM - -template -struct cal_force_loc_sincos_op -{ - void operator()(const base_device::DEVICE_GPU* ctx, - const int& nat, - const int& npw, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* vloc_factors, - const std::complex* aux, - const FPTYPE& scale_factor, - FPTYPE* force); -}; - -template -struct cal_force_ew_sincos_op -{ - void operator()(const base_device::DEVICE_GPU* ctx, - const int& nat, - const int& npw, - const int& ig_gge0, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* it_facts, - const std::complex* aux, - FPTYPE* force); -}; - -#endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM - -} // namespace hamilt - -#endif // FORCE_SINCOS_OP_H \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu index c78b333b86..9feeb76fed 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu @@ -618,10 +618,195 @@ template void revertVkbValues(const int *gcar_zero_ptrs, std::complex(const int *gcar_zero_ptrs, const std::complex *vkb_ptr, std::complex *vkb_save_ptr, int nkb, int gcar_zero_count, int npw, int ipol, int npwx); +// HIP kernels for sincos loops +template +__global__ void cal_force_loc_sincos_kernel( + const int nat, + const int npw, + const int ntype, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_per_type, + const thrust::complex* aux, + const FPTYPE scale_factor, + FPTYPE* force) +{ + const FPTYPE TWO_PI = 2.0 * M_PI; + + const int iat = blockIdx.y; + const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; + const int ig_stride = gridDim.x * blockDim.x; + + if (iat >= nat) return; + + // Load atom information to registers + const int it = iat2it[iat]; + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + + // Local accumulation variables + FPTYPE local_force_x = 0.0; + FPTYPE local_force_y = 0.0; + FPTYPE local_force_z = 0.0; + + // Grid-stride loop over G-vectors + for (int ig = ig_start; ig < npw; ig += ig_stride) { + // Calculate phase: 2π * (G · τ) + const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z); + + // Use HIP intrinsic for sincos + FPTYPE sinp, cosp; + sincos(phase, &sinp, &cosp); + + // Calculate force factor + const FPTYPE vloc_factor = vloc_per_type[it * npw + ig]; + const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()); + + // Accumulate force contributions + local_force_x += gcar[ig * 3 + 0] * factor; + local_force_y += gcar[ig * 3 + 1] * factor; + local_force_z += gcar[ig * 3 + 2] * factor; + } + + // Atomic add to global memory + atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor); + atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor); + atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor); +} + +template +__global__ void cal_force_ew_sincos_kernel( + const int nat, + const int npw, + const int ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const thrust::complex* aux, + FPTYPE* force) +{ + const FPTYPE TWO_PI = 2.0 * M_PI; + + const int iat = blockIdx.y; + const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; + const int ig_stride = gridDim.x * blockDim.x; + + if (iat >= nat) return; + + // Load atom information to registers + const FPTYPE tau_x = tau[iat * 3 + 0]; + const FPTYPE tau_y = tau[iat * 3 + 1]; + const FPTYPE tau_z = tau[iat * 3 + 2]; + const FPTYPE it_fact = it_facts[iat]; + + // Local accumulation variables + FPTYPE local_force_x = 0.0; + FPTYPE local_force_y = 0.0; + FPTYPE local_force_z = 0.0; + + // Grid-stride loop over G-vectors + for (int ig = ig_start; ig < npw; ig += ig_stride) { + // Skip G=0 term + if (ig == ig_gge0) continue; + + // Calculate phase: 2π * (G · τ) + const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + + gcar[ig * 3 + 1] * tau_y + + gcar[ig * 3 + 2] * tau_z); + + // Use HIP intrinsic for sincos + FPTYPE sinp, cosp; + sincos(phase, &sinp, &cosp); + + // Calculate Ewald sum contribution + const FPTYPE factor = it_fact * (cosp * aux[ig].imag() + sinp * aux[ig].real()); + + // Accumulate force contributions + local_force_x += gcar[ig * 3 + 0] * factor; + local_force_y += gcar[ig * 3 + 1] * factor; + local_force_z += gcar[ig * 3 + 2] * factor; + } + + // Atomic add to global memory + atomicAdd(&force[iat * 3 + 0], local_force_x); + atomicAdd(&force[iat * 3 + 1], local_force_y); + atomicAdd(&force[iat * 3 + 2], local_force_z); +} + +// GPU operators +template +void cal_force_loc_sincos_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const int& ntype, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* vloc_per_type, + const std::complex* aux, + const FPTYPE& scale_factor, + FPTYPE* force) +{ + // Calculate optimal grid configuration for GPU load balancing + const int threads_per_block = THREADS_PER_BLOCK; + const int max_blocks_per_sm = 32; // Configurable per GPU architecture + const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block); + + dim3 grid(max_blocks_x, nat); + dim3 block(threads_per_block); + + hipLaunchKernelGGL(cal_force_loc_sincos_kernel, + grid, block, 0, 0, + nat, npw, ntype, gcar, tau, iat2it, vloc_per_type, + reinterpret_cast*>(aux), + scale_factor, force); + + hipCheckOnDebug(); +} + +template +void cal_force_ew_sincos_op::operator()( + const base_device::DEVICE_GPU* ctx, + const int& nat, + const int& npw, + const int& ig_gge0, + const FPTYPE* gcar, + const FPTYPE* tau, + const int* iat2it, + const FPTYPE* it_facts, + const std::complex* aux, + FPTYPE* force) +{ + // Calculate optimal grid configuration for GPU load balancing + const int threads_per_block = THREADS_PER_BLOCK; + const int max_blocks_per_sm = 32; // Configurable per GPU architecture + const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block); + + dim3 grid(max_blocks_x, nat); + dim3 block(threads_per_block); + + hipLaunchKernelGGL(cal_force_ew_sincos_kernel, + grid, block, 0, 0, + nat, npw, ig_gge0, gcar, tau, iat2it, it_facts, + reinterpret_cast*>(aux), force); + + hipCheckOnDebug(); +} + template struct cal_vkb1_nl_op; template struct cal_force_nl_op; +template struct cal_force_loc_sincos_op; +template struct cal_force_ew_sincos_op; template struct cal_vkb1_nl_op; template struct cal_force_nl_op; +template struct cal_force_loc_sincos_op; +template struct cal_force_ew_sincos_op; } // namespace hamilt diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu deleted file mode 100644 index a09cd28bed..0000000000 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu +++ /dev/null @@ -1,203 +0,0 @@ -#include "module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h" - -#include -#include -#include -#include - -#define THREADS_PER_BLOCK 256 - -namespace hamilt { - -// HIP kernels for sincos loops -template -__global__ void cal_force_loc_sincos_kernel( - const int nat, - const int npw, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* vloc_factors, - const thrust::complex* aux, - const FPTYPE scale_factor, - FPTYPE* force) -{ - const FPTYPE TWO_PI = 2.0 * M_PI; - - const int iat = blockIdx.y; - const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; - const int ig_stride = gridDim.x * blockDim.x; - - if (iat >= nat) return; - - // Load atom information to registers - const int it = iat2it[iat]; - const FPTYPE tau_x = tau[iat * 3 + 0]; - const FPTYPE tau_y = tau[iat * 3 + 1]; - const FPTYPE tau_z = tau[iat * 3 + 2]; - - // Local accumulation variables - FPTYPE local_force_x = 0.0; - FPTYPE local_force_y = 0.0; - FPTYPE local_force_z = 0.0; - - // Grid-stride loop over G-vectors - for (int ig = ig_start; ig < npw; ig += ig_stride) { - // Calculate phase: 2π * (G · τ) - const FPTYPE phase = TWO_PI * ( - gcar[ig * 3 + 0] * tau_x + - gcar[ig * 3 + 1] * tau_y + - gcar[ig * 3 + 2] * tau_z - ); - - // Use HIP intrinsic for sincos - FPTYPE sinp, cosp; - __sincos(phase, &sinp, &cosp); - - // Calculate force factor - const FPTYPE factor = vloc_factors[ig] * - (cosp * aux[ig].imag() + sinp * aux[ig].real()); - - // Accumulate force contributions - local_force_x += gcar[ig * 3 + 0] * factor; - local_force_y += gcar[ig * 3 + 1] * factor; - local_force_z += gcar[ig * 3 + 2] * factor; - } - - // Atomic add to global memory - atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor); - atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor); - atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor); -} - -template -__global__ void cal_force_ew_sincos_kernel( - const int nat, - const int npw, - const int ig_gge0, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* it_facts, - const thrust::complex* aux, - FPTYPE* force) -{ - const FPTYPE TWO_PI = 2.0 * M_PI; - - const int iat = blockIdx.y; - const int ig_start = blockIdx.x * blockDim.x + threadIdx.x; - const int ig_stride = gridDim.x * blockDim.x; - - if (iat >= nat) return; - - // Load atom information to registers - const int it = iat2it[iat]; - const FPTYPE tau_x = tau[iat * 3 + 0]; - const FPTYPE tau_y = tau[iat * 3 + 1]; - const FPTYPE tau_z = tau[iat * 3 + 2]; - const FPTYPE it_fact = it_facts[iat]; - - // Local accumulation variables - FPTYPE local_force_x = 0.0; - FPTYPE local_force_y = 0.0; - FPTYPE local_force_z = 0.0; - - // Grid-stride loop over G-vectors - for (int ig = ig_start; ig < npw; ig += ig_stride) { - // Skip G=0 term - if (ig == ig_gge0) continue; - - // Calculate phase: 2π * (G · τ) - const FPTYPE arg = TWO_PI * ( - gcar[ig * 3 + 0] * tau_x + - gcar[ig * 3 + 1] * tau_y + - gcar[ig * 3 + 2] * tau_z - ); - - // Use HIP intrinsic for sincos - FPTYPE sinp, cosp; - __sincos(arg, &sinp, &cosp); - - // Calculate Ewald sum contribution - const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real(); - - // Accumulate force contributions - local_force_x += gcar[ig * 3 + 0] * sumnb; - local_force_y += gcar[ig * 3 + 1] * sumnb; - local_force_z += gcar[ig * 3 + 2] * sumnb; - } - - // Atomic add to global memory - atomicAdd(&force[iat * 3 + 0], local_force_x * it_fact); - atomicAdd(&force[iat * 3 + 1], local_force_y * it_fact); - atomicAdd(&force[iat * 3 + 2], local_force_z * it_fact); -} - -// GPU operator implementations -template -void cal_force_loc_sincos_op::operator()( - const base_device::DEVICE_GPU* ctx, - const int& nat, - const int& npw, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* vloc_factors, - const std::complex* aux, - const FPTYPE& scale_factor, - FPTYPE* force) -{ - // Calculate optimal grid configuration - const int threads_per_block = THREADS_PER_BLOCK; - const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block); - - dim3 grid(max_blocks_x, nat); - dim3 block(threads_per_block); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_loc_sincos_kernel), grid, block, 0, 0, - nat, npw, - gcar, tau, iat2it, vloc_factors, - reinterpret_cast*>(aux), - scale_factor, force - ); - - hipCheckOnDebug(); -} - -template -void cal_force_ew_sincos_op::operator()( - const base_device::DEVICE_GPU* ctx, - const int& nat, - const int& npw, - const int& ig_gge0, - const FPTYPE* gcar, - const FPTYPE* tau, - const int* iat2it, - const FPTYPE* it_facts, - const std::complex* aux, - FPTYPE* force) -{ - // Calculate optimal grid configuration - const int threads_per_block = THREADS_PER_BLOCK; - const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block); - - dim3 grid(max_blocks_x, nat); - dim3 block(threads_per_block); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_ew_sincos_kernel), grid, block, 0, 0, - nat, npw, ig_gge0, - gcar, tau, iat2it, it_facts, - reinterpret_cast*>(aux), - force - ); - - hipCheckOnDebug(); -} - -template struct cal_force_loc_sincos_op; -template struct cal_force_loc_sincos_op; - -template struct cal_force_ew_sincos_op; -template struct cal_force_ew_sincos_op; - -} // namespace hamilt \ No newline at end of file From 48e91dbe125e5efb81ff5d982877f68ce7092f7d Mon Sep 17 00:00:00 2001 From: Jie Date: Tue, 3 Jun 2025 11:26:23 +0800 Subject: [PATCH 4/6] fix vloc computation in cal_force_loc_sincos_op --- source/module_hamilt_pw/hamilt_pwdft/forces.cpp | 15 ++++----------- .../hamilt_pwdft/kernels/cuda/force_op.cu | 7 ++----- .../hamilt_pwdft/kernels/force_op.cpp | 4 +--- .../hamilt_pwdft/kernels/force_op.h | 5 +---- .../hamilt_pwdft/kernels/rocm/force_op.hip.cu | 7 ++----- 5 files changed, 10 insertions(+), 28 deletions(-) diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp index 5e1d3cd6a2..d50288eef8 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp @@ -537,7 +537,6 @@ void Forces::cal_force_loc(const UnitCell& ucell, // 准备原子相关数据:按照全局原子索引iat顺序 std::vector tau_flat(this->nat * 3); - std::vector iat2it_host(this->nat); std::vector gcar_flat(rho_basis->npw * 3); // 按照原始代码逻辑:遍历全局原子索引iat,通过查表获取(it,ia) @@ -548,7 +547,6 @@ void Forces::cal_force_loc(const UnitCell& ucell, tau_flat[iat * 3 + 0] = static_cast(ucell.atoms[it].tau[ia][0]); tau_flat[iat * 3 + 1] = static_cast(ucell.atoms[it].tau[ia][1]); tau_flat[iat * 3 + 2] = static_cast(ucell.atoms[it].tau[ia][2]); - iat2it_host[iat] = it; } for (int ig = 0; ig < rho_basis->npw; ig++) { @@ -559,9 +557,10 @@ void Forces::cal_force_loc(const UnitCell& ucell, // 重新计算vloc_factors考虑所有原子类型 std::vector vloc_per_type_host(ucell.ntype * rho_basis->npw); - for (int it = 0; it < ucell.ntype; it++) { + for (int iat = 0; iat < this->nat; iat++) { + int it = ucell.iat2it[iat]; for (int ig = 0; ig < rho_basis->npw; ig++) { - vloc_per_type_host[it * rho_basis->npw + ig] = static_cast(vloc(it, rho_basis->ig2igg[ig])); + vloc_per_type_host[iat * rho_basis->npw + ig] = static_cast(vloc(it, rho_basis->ig2igg[ig])); } } @@ -574,7 +573,6 @@ void Forces::cal_force_loc(const UnitCell& ucell, // 设备端内存和指针设置(根据设备类型分支) FPTYPE* d_gcar = gcar_flat.data(); FPTYPE* d_tau = tau_flat.data(); - int* d_iat2it = iat2it_host.data(); FPTYPE* d_vloc_per_type = vloc_per_type_host.data(); std::complex* d_aux = aux_fptype.data(); FPTYPE* d_force = nullptr; @@ -584,13 +582,11 @@ void Forces::cal_force_loc(const UnitCell& ucell, { d_gcar = nullptr; d_tau = nullptr; - d_iat2it = nullptr; d_vloc_per_type = nullptr; d_aux = nullptr; resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3); resmem_var_op()(this->ctx, d_tau, this->nat * 3); - resmem_int_op()(this->ctx, d_iat2it, this->nat); resmem_var_op()(this->ctx, d_vloc_per_type, ucell.ntype * rho_basis->npw); resmem_complex_op()(this->ctx, d_aux, rho_basis->npw); resmem_var_op()(this->ctx, d_force, this->nat * 3); @@ -598,7 +594,6 @@ void Forces::cal_force_loc(const UnitCell& ucell, // 数据传输到设备 syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3); syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3); - syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_iat2it, iat2it_host.data(), this->nat); syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vloc_per_type, vloc_per_type_host.data(), ucell.ntype * rho_basis->npw); syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw); @@ -623,7 +618,6 @@ void Forces::cal_force_loc(const UnitCell& ucell, ucell.ntype, d_gcar, d_tau, - d_iat2it, d_vloc_per_type, d_aux, scale_factor, @@ -639,7 +633,6 @@ void Forces::cal_force_loc(const UnitCell& ucell, // 清理设备内存 delmem_var_op()(this->ctx, d_gcar); delmem_var_op()(this->ctx, d_tau); - delmem_int_op()(this->ctx, d_iat2it); delmem_var_op()(this->ctx, d_vloc_per_type); delmem_complex_op()(this->ctx, d_aux); delmem_var_op()(this->ctx, d_force); @@ -652,7 +645,7 @@ void Forces::cal_force_loc(const UnitCell& ucell, forcelc(iat, 2) = static_cast(force_host[iat * 3 + 2]); } - // this->print(GlobalV::ofs_running, "local forces", forcelc); + // this->print(GlobalV: :ofs_running, "local forces", forcelc); Parallel_Reduce::reduce_pool(forcelc.c, forcelc.nr * forcelc.nc); delete[] aux; ModuleBase::timer::tick("Forces", "cal_force_loc"); diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu index 077864caff..a82bd1ee0b 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu @@ -21,7 +21,6 @@ __global__ void cal_force_loc_sincos_kernel( const int ntype, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* vloc_per_type, const thrust::complex* aux, const FPTYPE scale_factor, @@ -36,7 +35,6 @@ __global__ void cal_force_loc_sincos_kernel( if (iat >= nat) return; // Load atom information to registers - const int it = iat2it[iat]; const FPTYPE tau_x = tau[iat * 3 + 0]; const FPTYPE tau_y = tau[iat * 3 + 1]; const FPTYPE tau_z = tau[iat * 3 + 2]; @@ -58,7 +56,7 @@ __global__ void cal_force_loc_sincos_kernel( sincos(phase, &sinp, &cosp); // Calculate force factor - const FPTYPE vloc_factor = vloc_per_type[it * npw + ig]; + const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig]; const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()); // Accumulate force contributions @@ -316,7 +314,6 @@ void cal_force_loc_sincos_op::operator()( const int& ntype, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* vloc_per_type, const std::complex* aux, const FPTYPE& scale_factor, @@ -331,7 +328,7 @@ void cal_force_loc_sincos_op::operator()( dim3 block(threads_per_block); cal_force_loc_sincos_kernel<<>>( - nat, npw, ntype, gcar, tau, iat2it, vloc_per_type, + nat, npw, ntype, gcar, tau, vloc_per_type, reinterpret_cast*>(aux), scale_factor, force ); diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp index e0dc91a70c..49576abbfa 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp @@ -438,7 +438,6 @@ struct cal_force_loc_sincos_op const int& ntype, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* vloc_per_type, const std::complex* aux, const FPTYPE& scale_factor, @@ -451,7 +450,6 @@ struct cal_force_loc_sincos_op #endif for (int iat = 0; iat < nat; ++iat) { - const int it = iat2it[iat]; const FPTYPE tau_x = tau[iat * 3 + 0]; const FPTYPE tau_y = tau[iat * 3 + 1]; const FPTYPE tau_z = tau[iat * 3 + 2]; @@ -466,7 +464,7 @@ struct cal_force_loc_sincos_op FPTYPE sinp, cosp; ModuleBase::libm::sincos(phase, &sinp, &cosp); - const FPTYPE vloc_factor = vloc_per_type[it * npw + ig]; + const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig]; const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()) * scale_factor; local_force[0] += gcar[ig * 3 + 0] * factor; diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h index 3fc5d92212..58fcd842ef 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h @@ -161,8 +161,7 @@ struct cal_force_loc_sincos_op /// @param ntype - number of atom types /// @param gcar - G-vector Cartesian coordinates [npw * 3] /// @param tau - atomic positions [nat * 3] - /// @param iat2it - atom to type mapping [nat] - /// @param vloc_per_type - precomputed vloc factors per type [ntype * npw] + /// @param vloc_per_type - precomputed vloc factors per atom [nat * npw] /// @param aux - charge density in G-space [npw] /// @param scale_factor - tpiba * omega /// @@ -174,7 +173,6 @@ struct cal_force_loc_sincos_op const int& ntype, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* vloc_per_type, const std::complex* aux, const FPTYPE& scale_factor, @@ -319,7 +317,6 @@ struct cal_force_loc_sincos_op const int& ntype, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* vloc_per_type, const std::complex* aux, const FPTYPE& scale_factor, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu index 9feeb76fed..7f0d4b099e 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu @@ -626,7 +626,6 @@ __global__ void cal_force_loc_sincos_kernel( const int ntype, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* vloc_per_type, const thrust::complex* aux, const FPTYPE scale_factor, @@ -641,7 +640,6 @@ __global__ void cal_force_loc_sincos_kernel( if (iat >= nat) return; // Load atom information to registers - const int it = iat2it[iat]; const FPTYPE tau_x = tau[iat * 3 + 0]; const FPTYPE tau_y = tau[iat * 3 + 1]; const FPTYPE tau_z = tau[iat * 3 + 2]; @@ -663,7 +661,7 @@ __global__ void cal_force_loc_sincos_kernel( sincos(phase, &sinp, &cosp); // Calculate force factor - const FPTYPE vloc_factor = vloc_per_type[it * npw + ig]; + const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig]; const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()); // Accumulate force contributions @@ -747,7 +745,6 @@ void cal_force_loc_sincos_op::operator()( const int& ntype, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* vloc_per_type, const std::complex* aux, const FPTYPE& scale_factor, @@ -763,7 +760,7 @@ void cal_force_loc_sincos_op::operator()( hipLaunchKernelGGL(cal_force_loc_sincos_kernel, grid, block, 0, 0, - nat, npw, ntype, gcar, tau, iat2it, vloc_per_type, + nat, npw, ntype, gcar, tau, vloc_per_type, reinterpret_cast*>(aux), scale_factor, force); From 6638968aa21136591aafeac52dc3a24fc91fe12c Mon Sep 17 00:00:00 2001 From: Jie Date: Tue, 3 Jun 2025 20:35:20 +0800 Subject: [PATCH 5/6] fix cal_force_ew --- .../module_hamilt_pw/hamilt_pwdft/forces.cpp | 67 ++++++------------- .../hamilt_pwdft/kernels/cuda/force_op.cu | 8 +-- .../hamilt_pwdft/kernels/force_op.cpp | 3 +- .../hamilt_pwdft/kernels/force_op.h | 3 - .../hamilt_pwdft/kernels/rocm/force_op.hip.cu | 8 +-- 5 files changed, 29 insertions(+), 60 deletions(-) diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp index d50288eef8..4cade5d1ac 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp @@ -532,17 +532,17 @@ void Forces::cal_force_loc(const UnitCell& ucell, // to G space. maybe need fftw with OpenMP rho_basis->real2recip(aux, aux); - // =============== GPU/CPU异构优化:使用sincos op替换原循环 =============== + // sincos op for G space - // 准备原子相关数据:按照全局原子索引iat顺序 + // data preparation std::vector tau_flat(this->nat * 3); std::vector gcar_flat(rho_basis->npw * 3); - // 按照原始代码逻辑:遍历全局原子索引iat,通过查表获取(it,ia) + for (int iat = 0; iat < this->nat; iat++) { - int it = ucell.iat2it[iat]; // 查表获取原子类型 - int ia = ucell.iat2ia[iat]; // 查表获取该类型内的原子索引 + int it = ucell.iat2it[iat]; + int ia = ucell.iat2ia[iat]; tau_flat[iat * 3 + 0] = static_cast(ucell.atoms[it].tau[ia][0]); tau_flat[iat * 3 + 1] = static_cast(ucell.atoms[it].tau[ia][1]); @@ -555,7 +555,7 @@ void Forces::cal_force_loc(const UnitCell& ucell, gcar_flat[ig * 3 + 2] = static_cast(rho_basis->gcar[ig][2]); } - // 重新计算vloc_factors考虑所有原子类型 + // calculate vloc_factors for all atom types std::vector vloc_per_type_host(ucell.ntype * rho_basis->npw); for (int iat = 0; iat < this->nat; iat++) { int it = ucell.iat2it[iat]; @@ -564,13 +564,11 @@ void Forces::cal_force_loc(const UnitCell& ucell, } } - // 转换aux到FPTYPE类型 std::vector> aux_fptype(rho_basis->npw); for (int ig = 0; ig < rho_basis->npw; ig++) { aux_fptype[ig] = static_cast>(aux[ig]); } - // 设备端内存和指针设置(根据设备类型分支) FPTYPE* d_gcar = gcar_flat.data(); FPTYPE* d_tau = tau_flat.data(); FPTYPE* d_vloc_per_type = vloc_per_type_host.data(); @@ -591,26 +589,22 @@ void Forces::cal_force_loc(const UnitCell& ucell, resmem_complex_op()(this->ctx, d_aux, rho_basis->npw); resmem_var_op()(this->ctx, d_force, this->nat * 3); - // 数据传输到设备 syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3); syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3); syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vloc_per_type, vloc_per_type_host.data(), ucell.ntype * rho_basis->npw); syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw); - // 初始化force为0 base_device::memory::set_memory_op()(this->ctx, d_force, 0.0, this->nat * 3); } else { d_force = force_host.data(); - // 对于CPU,直接初始化为0 std::fill(force_host.begin(), force_host.end(), static_cast(0.0)); } - // 计算缩放因子 const FPTYPE scale_factor = static_cast(ucell.tpiba * ucell.omega); - // 调用op进行sincos计算 + // call op for sincos calculation hamilt::cal_force_loc_sincos_op()( this->ctx, this->nat, @@ -624,13 +618,10 @@ void Forces::cal_force_loc(const UnitCell& ucell, d_force ); - // 根据设备类型处理结果 if (this->device == base_device::GpuDevice) { - // 结果传回CPU syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_host.data(), d_force, this->nat * 3); - // 清理设备内存 delmem_var_op()(this->ctx, d_gcar); delmem_var_op()(this->ctx, d_tau); delmem_var_op()(this->ctx, d_vloc_per_type); @@ -638,7 +629,6 @@ void Forces::cal_force_loc(const UnitCell& ucell, delmem_var_op()(this->ctx, d_force); } - // 将结果写入forcelc矩阵 for (int iat = 0; iat < this->nat; iat++) { forcelc(iat, 0) = static_cast(force_host[iat * 3 + 0]); forcelc(iat, 1) = static_cast(force_host[iat * 3 + 1]); @@ -755,17 +745,15 @@ void Forces::cal_force_ew(const UnitCell& ucell, aux[rho_basis->ig_gge0] = std::complex(0.0, 0.0); } - // =============== 第一步:G空间sincos计算(在OpenMP区域外)=============== + // sincos op for cal_force_ew - // 预计算每个原子的it_fact和相关数据:按照全局原子索引iat顺序 std::vector it_facts_host(this->nat); std::vector tau_flat(this->nat * 3); - std::vector iat2it_host(this->nat); - // 按照原始代码逻辑:遍历全局原子索引iat,通过查表获取(it,ia) + // iterate over by lookup table for (int iat = 0; iat < this->nat; iat++) { - int it = ucell.iat2it[iat]; // 查表获取原子类型 - int ia = ucell.iat2ia[iat]; // 查表获取该类型内的原子索引 + int it = ucell.iat2it[iat]; + int ia = ucell.iat2ia[iat]; double zv; if (PARAM.inp.use_paw) @@ -785,10 +773,8 @@ void Forces::cal_force_ew(const UnitCell& ucell, tau_flat[iat * 3 + 0] = static_cast(ucell.atoms[it].tau[ia][0]); tau_flat[iat * 3 + 1] = static_cast(ucell.atoms[it].tau[ia][1]); tau_flat[iat * 3 + 2] = static_cast(ucell.atoms[it].tau[ia][2]); - iat2it_host[iat] = it; } - // 准备设备端数据 std::vector gcar_flat(rho_basis->npw * 3); for (int ig = 0; ig < rho_basis->npw; ig++) { gcar_flat[ig * 3 + 0] = static_cast(rho_basis->gcar[ig][0]); @@ -796,16 +782,13 @@ void Forces::cal_force_ew(const UnitCell& ucell, gcar_flat[ig * 3 + 2] = static_cast(rho_basis->gcar[ig][2]); } - // 转换aux到FPTYPE类型 std::vector> aux_fptype(rho_basis->npw); for (int ig = 0; ig < rho_basis->npw; ig++) { aux_fptype[ig] = static_cast>(aux[ig]); } - // 设备端内存和指针设置(根据设备类型分支) FPTYPE* d_gcar = gcar_flat.data(); FPTYPE* d_tau = tau_flat.data(); - int* d_iat2it = iat2it_host.data(); FPTYPE* d_it_facts = it_facts_host.data(); std::complex* d_aux = aux_fptype.data(); FPTYPE* d_force_g = nullptr; @@ -815,72 +798,66 @@ void Forces::cal_force_ew(const UnitCell& ucell, { d_gcar = nullptr; d_tau = nullptr; - d_iat2it = nullptr; d_it_facts = nullptr; d_aux = nullptr; resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3); resmem_var_op()(this->ctx, d_tau, this->nat * 3); - resmem_int_op()(this->ctx, d_iat2it, this->nat); resmem_var_op()(this->ctx, d_it_facts, this->nat); resmem_complex_op()(this->ctx, d_aux, rho_basis->npw); resmem_var_op()(this->ctx, d_force_g, this->nat * 3); - // 数据传输 + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3); syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3); - syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_iat2it, iat2it_host.data(), this->nat); syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_it_facts, it_facts_host.data(), this->nat); syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw); - // 初始化force为0 + base_device::memory::set_memory_op()(this->ctx, d_force_g, 0.0, this->nat * 3); } else { d_force_g = force_g_host.data(); - // 对于CPU,直接初始化为0 std::fill(force_g_host.begin(), force_g_host.end(), static_cast(0.0)); } - // 调用op处理G空间sincos计算(在OpenMP区域外,无冲突) + // call op for sincos calculation hamilt::cal_force_ew_sincos_op()( this->ctx, this->nat, rho_basis->npw, - rho_basis->ig_gge0, // G=0项索引,op内部会自动跳过 + rho_basis->ig_gge0, d_gcar, d_tau, - d_iat2it, d_it_facts, d_aux, d_force_g ); - // 根据设备类型处理结果 + if (this->device == base_device::GpuDevice) { - // 将G空间结果传回CPU + syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_g_host.data(), d_force_g, this->nat * 3); - // 清理设备内存 + delmem_var_op()(this->ctx, d_gcar); delmem_var_op()(this->ctx, d_tau); - delmem_int_op()(this->ctx, d_iat2it); delmem_var_op()(this->ctx, d_it_facts); delmem_complex_op()(this->ctx, d_aux); delmem_var_op()(this->ctx, d_force_g); } - // 累加到forceion + for (int iat = 0; iat < this->nat; iat++) { forceion(iat, 0) += static_cast(force_g_host[iat * 3 + 0]); forceion(iat, 1) += static_cast(force_g_host[iat * 3 + 1]); forceion(iat, 2) += static_cast(force_g_host[iat * 3 + 2]); } - // =============== 第二步:实空间计算(保留原OpenMP结构)=============== - + +// calculate real space force #ifdef _OPENMP #pragma omp parallel { @@ -904,7 +881,7 @@ void Forces::cal_force_ew(const UnitCell& ucell, iat_end = iat_beg + iat_end; ucell.iat2iait(iat_beg, &ia_beg, &it_beg); - // 只保留实空间相互作用计算(means that the processor contains G=0 term) + if (rho_basis->ig_gge0 >= 0) { double rmax = 5.0 / (sqrt(alpha) * ucell.lat0); diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu index a82bd1ee0b..5ff3ddf2b1 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu @@ -78,7 +78,6 @@ __global__ void cal_force_ew_sincos_kernel( const int ig_gge0, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* it_facts, const thrust::complex* aux, FPTYPE* force) @@ -116,8 +115,8 @@ __global__ void cal_force_ew_sincos_kernel( FPTYPE sinp, cosp; sincos(phase, &sinp, &cosp); - // Calculate Ewald sum contribution - const FPTYPE factor = it_fact * (cosp * aux[ig].imag() + sinp * aux[ig].real()); + // Calculate Ewald sum contribution (fixed sign error) + const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real()); // Accumulate force contributions local_force_x += gcar[ig * 3 + 0] * factor; @@ -344,7 +343,6 @@ void cal_force_ew_sincos_op::operator()( const int& ig_gge0, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* it_facts, const std::complex* aux, FPTYPE* force) @@ -358,7 +356,7 @@ void cal_force_ew_sincos_op::operator()( dim3 block(threads_per_block); cal_force_ew_sincos_kernel<<>>( - nat, npw, ig_gge0, gcar, tau, iat2it, it_facts, + nat, npw, ig_gge0, gcar, tau, it_facts, reinterpret_cast*>(aux), force ); diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp index 49576abbfa..109321d6c3 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp @@ -489,7 +489,6 @@ struct cal_force_ew_sincos_op const int& ig_gge0, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* it_facts, const std::complex* aux, FPTYPE* force) @@ -519,7 +518,7 @@ struct cal_force_ew_sincos_op FPTYPE sinp, cosp; ModuleBase::libm::sincos(phase, &sinp, &cosp); - const FPTYPE factor = it_fact * (cosp * aux[ig].imag() + sinp * aux[ig].real()); + const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real()); local_force[0] += gcar[ig * 3 + 0] * factor; local_force[1] += gcar[ig * 3 + 1] * factor; diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h index 58fcd842ef..acf490b278 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h @@ -191,7 +191,6 @@ struct cal_force_ew_sincos_op /// @param ig_gge0 - index of G=0 vector (-1 if not present) /// @param gcar - G-vector Cartesian coordinates [npw * 3] /// @param tau - atomic positions [nat * 3] - /// @param iat2it - atom to type mapping [nat] /// @param it_facts - precomputed it_fact for each atom [nat] /// @param aux - structure factor related array [npw] /// @@ -203,7 +202,6 @@ struct cal_force_ew_sincos_op const int& ig_gge0, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* it_facts, const std::complex* aux, FPTYPE* force); @@ -332,7 +330,6 @@ struct cal_force_ew_sincos_op const int& ig_gge0, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* it_facts, const std::complex* aux, FPTYPE* force); diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu index 7f0d4b099e..6bb3a84e7e 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu @@ -683,7 +683,6 @@ __global__ void cal_force_ew_sincos_kernel( const int ig_gge0, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* it_facts, const thrust::complex* aux, FPTYPE* force) @@ -721,8 +720,8 @@ __global__ void cal_force_ew_sincos_kernel( FPTYPE sinp, cosp; sincos(phase, &sinp, &cosp); - // Calculate Ewald sum contribution - const FPTYPE factor = it_fact * (cosp * aux[ig].imag() + sinp * aux[ig].real()); + // Calculate Ewald sum contribution (fixed sign error) + const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real()); // Accumulate force contributions local_force_x += gcar[ig * 3 + 0] * factor; @@ -775,7 +774,6 @@ void cal_force_ew_sincos_op::operator()( const int& ig_gge0, const FPTYPE* gcar, const FPTYPE* tau, - const int* iat2it, const FPTYPE* it_facts, const std::complex* aux, FPTYPE* force) @@ -790,7 +788,7 @@ void cal_force_ew_sincos_op::operator()( hipLaunchKernelGGL(cal_force_ew_sincos_kernel, grid, block, 0, 0, - nat, npw, ig_gge0, gcar, tau, iat2it, it_facts, + nat, npw, ig_gge0, gcar, tau, it_facts, reinterpret_cast*>(aux), force); hipCheckOnDebug(); From f3eafd0cb6438646e14bbe0f92dd0c9ac825e83b Mon Sep 17 00:00:00 2001 From: Jie Date: Wed, 4 Jun 2025 15:51:29 +0800 Subject: [PATCH 6/6] fix malloc error --- source/module_hamilt_pw/hamilt_pwdft/forces.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp index 4cade5d1ac..a4d779366b 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp @@ -556,7 +556,7 @@ void Forces::cal_force_loc(const UnitCell& ucell, } // calculate vloc_factors for all atom types - std::vector vloc_per_type_host(ucell.ntype * rho_basis->npw); + std::vector vloc_per_type_host(this->nat * rho_basis->npw); for (int iat = 0; iat < this->nat; iat++) { int it = ucell.iat2it[iat]; for (int ig = 0; ig < rho_basis->npw; ig++) { @@ -585,13 +585,13 @@ void Forces::cal_force_loc(const UnitCell& ucell, resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3); resmem_var_op()(this->ctx, d_tau, this->nat * 3); - resmem_var_op()(this->ctx, d_vloc_per_type, ucell.ntype * rho_basis->npw); + resmem_var_op()(this->ctx, d_vloc_per_type, this->nat * rho_basis->npw); resmem_complex_op()(this->ctx, d_aux, rho_basis->npw); resmem_var_op()(this->ctx, d_force, this->nat * 3); syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3); syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vloc_per_type, vloc_per_type_host.data(), ucell.ntype * rho_basis->npw); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vloc_per_type, vloc_per_type_host.data(), this->nat * rho_basis->npw); syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw); base_device::memory::set_memory_op()(this->ctx, d_force, 0.0, this->nat * 3); @@ -609,7 +609,7 @@ void Forces::cal_force_loc(const UnitCell& ucell, this->ctx, this->nat, rho_basis->npw, - ucell.ntype, + this->nat, d_gcar, d_tau, d_vloc_per_type,