diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
index dfca5c4e8e..115abfa7c7 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
@@ -1136,6 +1136,97 @@
 template struct cal_force_npw_op<double, base_device::DEVICE_GPU>;
 template struct cal_multi_dot_op<float, base_device::DEVICE_GPU>;
 template struct cal_multi_dot_op<double, base_device::DEVICE_GPU>;
+// CUDA kernel for stress Ewald sincos calculation
+// Fixed: Parallelize over G-vectors instead of atoms to avoid race conditions
+template <typename FPTYPE>
+__global__ void cal_stress_ewa_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* zv_facts,
+    FPTYPE* rhostar_real,
+    FPTYPE* rhostar_imag)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+
+    // Parallelize over G-vectors (ig) instead of atoms (iat)
+    const int ig = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (ig >= npw) return;
+
+    // Skip G=0 term
+    if (ig == ig_gge0) return;
+
+    // Local accumulation variables for this G-vector
+    FPTYPE local_rhostar_real = 0.0;
+    FPTYPE local_rhostar_imag = 0.0;
+
+    // Load G-vector components to registers
+    const FPTYPE gcar_x = gcar[ig * 3 + 0];
+    const FPTYPE gcar_y = gcar[ig * 3 + 1];
+    const FPTYPE gcar_z = gcar[ig * 3 + 2];
+
+    // Loop over all atoms for this G-vector (matches CPU implementation)
+    for (int iat = 0; iat < nat; iat++) {
+        // Load atom information to registers
+        const FPTYPE tau_x = tau[iat * 3 + 0];
+        const FPTYPE tau_y = tau[iat * 3 + 1];
+        const FPTYPE tau_z = tau[iat * 3 + 2];
+        const FPTYPE zv = zv_facts[iat];
+
+        // Calculate phase: 2π * (G · τ) - same as CPU implementation
+        const FPTYPE phase = TWO_PI * (gcar_x * tau_x +
+                                       gcar_y * tau_y +
+                                       gcar_z * tau_z);
+
+        // Use CUDA intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+
+        // Accumulate structure factor locally (no race conditions)
+        local_rhostar_real += zv * cosp;
+        local_rhostar_imag += zv * sinp;
+    }
+
+    // Store results - each thread writes to unique memory location
+    rhostar_real[ig] = local_rhostar_real;
+    rhostar_imag[ig] = local_rhostar_imag;
+}
+
+// GPU operators
+template <typename FPTYPE>
+void cal_stress_ewa_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* zv_facts,
+    FPTYPE* rhostar_real,
+    FPTYPE* rhostar_imag)
+{
+
+    // Calculate grid configuration for G-vector parallelization
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int num_blocks = (npw + threads_per_block - 1) / threads_per_block;
+
+    dim3 grid(num_blocks);
+    dim3 block(threads_per_block);
+
+    cal_stress_ewa_sincos_kernel<FPTYPE><<<grid, block>>>(
+        nat, npw, ig_gge0, gcar, tau, zv_facts,
+        rhostar_real, rhostar_imag
+    );
+
+    cudaCheckOnDebug();
+}
+
+template struct cal_stress_ewa_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_stress_ewa_sincos_op<double, base_device::DEVICE_GPU>;
+
 // template struct prepare_vkb_deri_ptr_op<float, base_device::DEVICE_GPU>;
 // template struct prepare_vkb_deri_ptr_op<double, base_device::DEVICE_GPU>;
 } // namespace hamilt
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
index ef138c04cc..85e9744470 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
@@ -1117,4 +1117,96 @@
 template struct cal_force_npw_op<double, base_device::DEVICE_GPU>;
 template struct cal_multi_dot_op<float, base_device::DEVICE_GPU>;
 template struct cal_multi_dot_op<double, base_device::DEVICE_GPU>;
+
+// HIP kernel for stress Ewald sincos calculation
+// Fixed: Parallelize over G-vectors instead of atoms to avoid race conditions
+template <typename FPTYPE>
+__global__ void cal_stress_ewa_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* zv_facts,
+    FPTYPE* rhostar_real,
+    FPTYPE* rhostar_imag)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+
+    // Parallelize over G-vectors (ig) instead of atoms (iat)
+    const int ig = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (ig >= npw) return;
+
+    // Skip G=0 term
+    if (ig == ig_gge0) return;
+
+    // Local accumulation variables for this G-vector
+    FPTYPE local_rhostar_real = 0.0;
+    FPTYPE local_rhostar_imag = 0.0;
+
+    // Load G-vector components to registers
+    const FPTYPE gcar_x = gcar[ig * 3 + 0];
+    const FPTYPE gcar_y = gcar[ig * 3 + 1];
+    const FPTYPE gcar_z = gcar[ig * 3 + 2];
+
+    // Loop over all atoms for this G-vector (matches CPU implementation)
+    for (int iat = 0; iat < nat; iat++) {
+        // Load atom information to registers
+        const FPTYPE tau_x = tau[iat * 3 + 0];
+        const FPTYPE tau_y = tau[iat * 3 + 1];
+        const FPTYPE tau_z = tau[iat * 3 + 2];
+        const FPTYPE zv = zv_facts[iat];
+
+        // Calculate phase: 2π * (G · τ) - same as CPU implementation
+        const FPTYPE phase = TWO_PI * (gcar_x * tau_x +
+                                       gcar_y * tau_y +
+                                       gcar_z * tau_z);
+
+        // Use HIP intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+
+        // Accumulate structure factor locally (no race conditions)
+        local_rhostar_real += zv * cosp;
+        local_rhostar_imag += zv * sinp;
+    }
+
+    // Store results - each thread writes to unique memory location
+    rhostar_real[ig] = local_rhostar_real;
+    rhostar_imag[ig] = local_rhostar_imag;
+}
+
+// GPU operators
+template <typename FPTYPE>
+void cal_stress_ewa_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* zv_facts,
+    FPTYPE* rhostar_real,
+    FPTYPE* rhostar_imag)
+{
+    // Calculate grid configuration for G-vector parallelization
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int num_blocks = (npw + threads_per_block - 1) / threads_per_block;
+
+    // Use 1D grid since we're parallelizing over G-vectors only
+    dim3 grid(num_blocks);
+    dim3 block(threads_per_block);
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_ewa_sincos_kernel<FPTYPE>), grid, block, 0, 0,
+        nat, npw, ig_gge0, gcar, tau, zv_facts,
+        rhostar_real, rhostar_imag
+    );
+
+    hipCheckOnDebug();
+}
+
+template struct cal_stress_ewa_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_stress_ewa_sincos_op<double, base_device::DEVICE_GPU>;
+
 } // namespace hamilt
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
index 0cd0e1ab96..8477973174 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
@@ -759,6 +759,64 @@
 template struct cal_force_npw_op<double, base_device::DEVICE_CPU>;
 template struct cal_multi_dot_op<float, base_device::DEVICE_CPU>;
 template struct cal_multi_dot_op<double, base_device::DEVICE_CPU>;
 
+// CPU implementation of Ewald stress sincos operator
+template <typename FPTYPE>
+struct cal_stress_ewa_sincos_op<FPTYPE, base_device::DEVICE_CPU>
+{
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* zv_facts,
+                    FPTYPE* rhostar_real,
+                    FPTYPE* rhostar_imag)
+    {
+        const FPTYPE TWO_PI = 2.0 * M_PI;
+
+        // Note: Arrays are already initialized to zero in the calling function
+        // No need to initialize again here to avoid redundant operations
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+        for (int ig = 0; ig < npw; ig++) {
+            if (ig == ig_gge0) continue; // Skip G=0
+
+            FPTYPE local_rhostar_real = 0.0;
+            FPTYPE local_rhostar_imag = 0.0;
+
+            // Double loop: iat -> ig (as requested)
+            for (int iat = 0; iat < nat; iat++) {
+                const FPTYPE tau_x = tau[iat * 3 + 0];
+                const FPTYPE tau_y = tau[iat * 3 + 1];
+                const FPTYPE tau_z = tau[iat * 3 + 2];
+                const FPTYPE zv = zv_facts[iat];
+
+                // Calculate phase: 2π * (G · τ) - similar to cal_force_ewa phase
+                const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x +
+                                               gcar[ig * 3 + 1] * tau_y +
+                                               gcar[ig * 3 + 2] * tau_z);
+
+                // Calculate sincos
+                FPTYPE sinp, cosp;
+                ModuleBase::libm::sincos(phase, &sinp, &cosp);
+
+                // Accumulate structure factor
+                local_rhostar_real += zv * cosp;
+                local_rhostar_imag += zv * sinp;
+            }
+
+            // Store results
+            rhostar_real[ig] = local_rhostar_real;
+            rhostar_imag[ig] = local_rhostar_imag;
+        }
+    }
+};
+
+template struct cal_stress_ewa_sincos_op<float, base_device::DEVICE_CPU>;
+template struct cal_stress_ewa_sincos_op<double, base_device::DEVICE_CPU>;
 // template struct prepare_vkb_deri_ptr_op<float, base_device::DEVICE_CPU>;
 // template struct prepare_vkb_deri_ptr_op<double, base_device::DEVICE_CPU>;
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
index 7fecd96d75..1154c3c79f 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
@@ -260,6 +260,35 @@ struct cal_multi_dot_op{
                       const std::complex<FPTYPE>* psi);
 };
 
+template <typename FPTYPE, typename Device>
+struct cal_stress_ewa_sincos_op
+{
+    /// @brief Calculate Ewald stress sincos computation
+    /// Only computes rhostar, other calculations remain in original function
+    ///
+    /// Input Parameters
+    /// @param ctx - which device this function runs on
+    /// @param nat - total number of atoms
+    /// @param npw - number of plane waves
+    /// @param ig_gge0 - index of G=0 vector (-1 if not present)
+    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
+    /// @param tau - atomic positions [nat * 3]
+    /// @param zv_facts - precomputed zv factors for each atom [nat]
+    ///
+    /// Output Parameters
+    /// @param rhostar_real - real part of structure factor [npw]
+    /// @param rhostar_imag - imaginary part of structure factor [npw]
+    void operator()(const Device* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* zv_facts,
+                    FPTYPE* rhostar_real,
+                    FPTYPE* rhostar_imag);
+};
+
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 
 template <typename FPTYPE>
@@ -434,6 +463,20 @@ struct cal_multi_dot_op{
                       const std::complex<FPTYPE>* psi);
 };
 
+template <typename FPTYPE>
+struct cal_stress_ewa_sincos_op<FPTYPE, base_device::DEVICE_GPU>
+{
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* zv_facts,
+                    FPTYPE* rhostar_real,
+                    FPTYPE* rhostar_imag);
+};
+
 /**
  * The operator is used to compute the auxiliary amount of stress /force
  * in parallel on the GPU. They identify type with the type provided and
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_ewa.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_ewa.cpp
index 5dbe2cd6b4..20cb98226e 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_ewa.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_ewa.cpp
@@ -4,6 +4,7 @@
 #include "module_base/tool_threading.h"
 #include "module_base/libm/libm.h"
 #include "module_hamilt_pw/hamilt_pwdft/global.h"
+#include "kernels/stress_op.h"
 
 #ifdef _OPENMP
 #include <omp.h>
@@ -56,6 +57,96 @@ void Stress_Func<FPTYPE, Device>::stress_ewa(const UnitCell& ucell,
     if (PARAM.globalv.gamma_only_pw && is_pw) fact=2.0;
     // else fact=1.0;
 
+    // Prepare data for the sincos op
+    std::vector<FPTYPE> zv_facts_host(ucell.nat);
+    std::vector<FPTYPE> tau_flat(ucell.nat * 3);
+
+    for (int iat = 0; iat < ucell.nat; iat++) {
+        int it = ucell.iat2it[iat];
+        int ia = ucell.iat2ia[iat];
+
+        zv_facts_host[iat] = static_cast<FPTYPE>(ucell.atoms[it].ncpp.zv);
+
+        tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][0]);
+        tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][1]);
+        tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][2]);
+    }
+
+    std::vector<FPTYPE> gcar_flat(rho_basis->npw * 3);
+    for (int ig = 0; ig < rho_basis->npw; ig++) {
+        gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
+        gcar_flat[ig * 3 + 1] = static_cast<FPTYPE>(rho_basis->gcar[ig][1]);
+        gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
+    }
+
+    // Allocate result arrays
+    std::vector<FPTYPE> rhostar_real_host(rho_basis->npw);
+    std::vector<FPTYPE> rhostar_imag_host(rho_basis->npw);
+
+    // Device pointers for GPU data transfer
+    FPTYPE* d_gcar = nullptr;
+    FPTYPE* d_tau = nullptr;
+    FPTYPE* d_zv_facts = nullptr;
+    FPTYPE* d_rhostar_real = nullptr;
+    FPTYPE* d_rhostar_imag = nullptr;
+
+    // GPU memory management and data transfer
+    if (this->device == base_device::GpuDevice) {
+        // Allocate GPU memory
+        resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3);
+        resmem_var_op()(this->ctx, d_tau, ucell.nat * 3);
+        resmem_var_op()(this->ctx, d_zv_facts, ucell.nat);
+        resmem_var_op()(this->ctx, d_rhostar_real, rho_basis->npw);
+        resmem_var_op()(this->ctx, d_rhostar_imag, rho_basis->npw);
+
+        // Data transfer H2D
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), ucell.nat * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_zv_facts, zv_facts_host.data(), ucell.nat);
+
+        // Initialize output arrays to zero on GPU
+        base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_rhostar_real, 0.0, rho_basis->npw);
+        base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_rhostar_imag, 0.0, rho_basis->npw);
+    } else {
+        // CPU case: use host pointers directly and initialize arrays to zero
+        d_gcar = gcar_flat.data();
+        d_tau = tau_flat.data();
+        d_zv_facts = zv_facts_host.data();
+        d_rhostar_real = rhostar_real_host.data();
+        d_rhostar_imag = rhostar_imag_host.data();
+
+        // Initialize output arrays to zero for CPU case
+        std::fill(rhostar_real_host.begin(), rhostar_real_host.end(), static_cast<FPTYPE>(0.0));
+        std::fill(rhostar_imag_host.begin(), rhostar_imag_host.end(), static_cast<FPTYPE>(0.0));
+    }
+
+    // Call sincos op (outside OpenMP parallel region, op has its own parallelization)
+    hamilt::cal_stress_ewa_sincos_op<FPTYPE, Device>()(
+        this->ctx,
+        ucell.nat,
+        rho_basis->npw,
+        rho_basis->ig_gge0,
+        d_gcar,
+        d_tau,
+        d_zv_facts,
+        d_rhostar_real,
+        d_rhostar_imag
+    );
+
+    // GPU data transfer D2H and memory cleanup
+    if (this->device == base_device::GpuDevice) {
+        // Transfer results back to host
+        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, rhostar_real_host.data(), d_rhostar_real, rho_basis->npw);
+        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, rhostar_imag_host.data(), d_rhostar_imag, rho_basis->npw);
+
+        // Free GPU memory
+        delmem_var_op()(this->ctx, d_gcar);
+        delmem_var_op()(this->ctx, d_tau);
+        delmem_var_op()(this->ctx, d_zv_facts);
+        delmem_var_op()(this->ctx, d_rhostar_real);
+        delmem_var_op()(this->ctx, d_rhostar_imag);
+    }
+
 #ifdef _OPENMP
 #pragma omp parallel
 {
@@ -76,27 +167,21 @@
         ig_end = ig + ig_end;
 
         FPTYPE g2,g2a;
-        FPTYPE arg;
-        std::complex<FPTYPE> rhostar;
         FPTYPE sewald;
         for(; ig < ig_end; ig++)
         {
             if(ig == ig0) continue;
             g2 = rho_basis->gg[ig]* ucell.tpiba2;
             g2a = g2 /4.0/alpha;
-            rhostar=std::complex<FPTYPE>(0.0,0.0);
-            for(int it=0; it < ucell.ntype; it++)
-            {
-                for(int i=0; i<ucell.atoms[it].na; i++)
-                {
-                    arg = (rho_basis->gcar[ig] * ucell.atoms[it].tau[i]) * (ModuleBase::TWO_PI);
-                    FPTYPE sinp, cosp;
-                    ModuleBase::libm::sincos(arg, &sinp, &cosp);
-                    rhostar = rhostar + std::complex<FPTYPE>(ucell.atoms[it].ncpp.zv * cosp,ucell.atoms[it].ncpp.zv * sinp);
-                }
-            }
-            rhostar /= ucell.omega;
-            sewald = fact* (ModuleBase::TWO_PI) * ModuleBase::e2 * ModuleBase::libm::exp(-g2a) / g2 * pow(std::abs(rhostar),2);
+
+            // Use precomputed rhostar values
+            FPTYPE rhostar_real = rhostar_real_host[ig] / ucell.omega;
+            FPTYPE rhostar_imag = rhostar_imag_host[ig] / ucell.omega;
+
+            // Calculate |rhostar|² - mathematically equivalent to pow(std::abs(rhostar), 2)
+            FPTYPE rhostar_abs2 = rhostar_real * rhostar_real + rhostar_imag * rhostar_imag;
+
+            sewald = fact* (ModuleBase::TWO_PI) * ModuleBase::e2 * ModuleBase::libm::exp(-g2a) / g2 * rhostar_abs2;
             local_sdewald -= sewald;
             for(int l=0;l<3;l++)
             {