From 94bfa42dfaf27b80dc059e2ed6ac589b86ed2e69 Mon Sep 17 00:00:00 2001
From: Jie <li_j20@fudan.edu.cn>
Date: Mon, 26 May 2025 15:56:44 +0800
Subject: [PATCH 1/6] implement gpu op for sincos loops

---
 .../kernels/cuda/force_sincos_op.cu           | 204 ++++++++++++++++++
 .../hamilt_pwdft/kernels/force_sincos_op.cpp  | 123 +++++++++++
 .../hamilt_pwdft/kernels/force_sincos_op.h    | 106 +++++++++
 .../kernels/rocm/force_sincos_op.hip.cu       | 203 +++++++++++++++++
 4 files changed, 636 insertions(+)
 create mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu
 create mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp
 create mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h
 create mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu
new file mode 100644
index 0000000000..f97214893e
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu
@@ -0,0 +1,204 @@
+#include "module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h"
+#include "module_base/module_device/types.h"
+
+#include <complex>
+#include <thrust/complex.h>
+#include <cuda_runtime.h>
+#include <base/macros/macros.h>
+
+#define THREADS_PER_BLOCK 256
+
+namespace hamilt {
+
+// CUDA kernels for sincos loops
+template <typename FPTYPE>
+__global__ void cal_force_loc_sincos_kernel(
+    const int nat,
+    const int npw,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* vloc_factors,
+    const thrust::complex<FPTYPE>* aux,
+    const FPTYPE scale_factor,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+    
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+    
+    if (iat >= nat) return;
+    
+    // Load atom information to registers
+    const int it = iat2it[iat];
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+    
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (
+            gcar[ig * 3 + 0] * tau_x + 
+            gcar[ig * 3 + 1] * tau_y + 
+            gcar[ig * 3 + 2] * tau_z
+        );
+        
+        // Use CUDA intrinsic for sincos
+        FPTYPE sinp, cosp;
+        __sincos(phase, &sinp, &cosp);
+        
+        // Calculate force factor
+        const FPTYPE factor = vloc_factors[ig] * 
+                              (cosp * aux[ig].imag() + sinp * aux[ig].real());
+        
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+    
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor);
+    atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor);
+    atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor);
+}
+
+template <typename FPTYPE>
+__global__ void cal_force_ew_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* it_facts,
+    const thrust::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+    
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+    
+    if (iat >= nat) return;
+    
+    // Load atom information to registers
+    const int it = iat2it[iat];
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    const FPTYPE it_fact = it_facts[iat];
+    
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+    
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Skip G=0 term
+        if (ig == ig_gge0) continue;
+        
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE arg = TWO_PI * (
+            gcar[ig * 3 + 0] * tau_x + 
+            gcar[ig * 3 + 1] * tau_y + 
+            gcar[ig * 3 + 2] * tau_z
+        );
+        
+        // Use CUDA intrinsic for sincos
+        FPTYPE sinp, cosp;
+        __sincos(arg, &sinp, &cosp);
+        
+        // Calculate Ewald sum contribution
+        const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real();
+        
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * sumnb;
+        local_force_y += gcar[ig * 3 + 1] * sumnb;
+        local_force_z += gcar[ig * 3 + 2] * sumnb;
+    }
+    
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x * it_fact);
+    atomicAdd(&force[iat * 3 + 1], local_force_y * it_fact);
+    atomicAdd(&force[iat * 3 + 2], local_force_z * it_fact);
+}
+
+// GPU operator implementations
+template <typename FPTYPE>
+void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* vloc_factors,
+    const std::complex<FPTYPE>* aux,
+    const FPTYPE& scale_factor,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block);
+    
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+    
+    cal_force_loc_sincos_kernel<FPTYPE><<<grid, block>>>(
+        nat, npw,
+        gcar, tau, iat2it, vloc_factors,
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
+        scale_factor, force
+    );
+    
+    cudaCheckOnDebug();
+}
+
+template <typename FPTYPE>
+void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* it_facts,
+    const std::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block);
+    
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+    
+    cal_force_ew_sincos_kernel<FPTYPE><<<grid, block>>>(
+        nat, npw, ig_gge0,
+        gcar, tau, iat2it, it_facts,
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
+        force
+    );
+    
+    cudaCheckOnDebug();
+}
+
+template struct cal_force_loc_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_loc_sincos_op<double, base_device::DEVICE_GPU>;
+
+template struct cal_force_ew_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_ew_sincos_op<double, base_device::DEVICE_GPU>;
+
+}  // namespace hamilt 
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp
new file mode 100644
index 0000000000..e7ce8554cc
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp
@@ -0,0 +1,123 @@
+#include "force_sincos_op.h"
+#include "module_base/libm/libm.h"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+namespace hamilt
+{
+
+template <typename FPTYPE>
+struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU>
+{
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const int* iat2it,
+                    const FPTYPE* vloc_factors,
+                    const std::complex<FPTYPE>* aux,
+                    const FPTYPE& scale_factor,
+                    FPTYPE* force)
+    {
+        const FPTYPE TWO_PI = 2.0 * M_PI;
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+        for (int iat = 0; iat < nat; ++iat)
+        {
+            const int it = iat2it[iat];
+            const FPTYPE tau_x = tau[iat * 3 + 0];
+            const FPTYPE tau_y = tau[iat * 3 + 1];
+            const FPTYPE tau_z = tau[iat * 3 + 2];
+
+            FPTYPE local_force[3] = {0.0, 0.0, 0.0};
+
+            for (int ig = 0; ig < npw; ig++)
+            {
+                const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                               gcar[ig * 3 + 1] * tau_y + 
+                                               gcar[ig * 3 + 2] * tau_z);
+                FPTYPE sinp, cosp;
+                ModuleBase::libm::sincos(phase, &sinp, &cosp);
+                
+                const FPTYPE factor = vloc_factors[ig] * 
+                                     (cosp * aux[ig].imag() + sinp * aux[ig].real());
+                
+                local_force[0] += gcar[ig * 3 + 0] * factor;
+                local_force[1] += gcar[ig * 3 + 1] * factor;
+                local_force[2] += gcar[ig * 3 + 2] * factor;
+            }
+
+            force[iat * 3 + 0] += local_force[0] * scale_factor;
+            force[iat * 3 + 1] += local_force[1] * scale_factor;
+            force[iat * 3 + 2] += local_force[2] * scale_factor;
+        }
+    }
+};
+
+template <typename FPTYPE>
+struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_CPU>
+{
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const int* iat2it,
+                    const FPTYPE* it_facts,
+                    const std::complex<FPTYPE>* aux,
+                    FPTYPE* force)
+    {
+        const FPTYPE TWO_PI = 2.0 * M_PI;
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+        for (int iat = 0; iat < nat; ++iat)
+        {
+            const int it = iat2it[iat];
+            const FPTYPE tau_x = tau[iat * 3 + 0];
+            const FPTYPE tau_y = tau[iat * 3 + 1];
+            const FPTYPE tau_z = tau[iat * 3 + 2];
+            const FPTYPE it_fact = it_facts[iat];
+
+            FPTYPE local_force[3] = {0.0, 0.0, 0.0};
+
+            for (int ig = 0; ig < npw; ig++)
+            {
+                // Skip G=0 term
+                if (ig == ig_gge0) continue;
+
+                const FPTYPE arg = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                             gcar[ig * 3 + 1] * tau_y + 
+                                             gcar[ig * 3 + 2] * tau_z);
+                FPTYPE sinp, cosp;
+                ModuleBase::libm::sincos(arg, &sinp, &cosp);
+                
+                const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real();
+                
+                local_force[0] += gcar[ig * 3 + 0] * sumnb;
+                local_force[1] += gcar[ig * 3 + 1] * sumnb;
+                local_force[2] += gcar[ig * 3 + 2] * sumnb;
+            }
+
+            force[iat * 3 + 0] += local_force[0] * it_fact;
+            force[iat * 3 + 1] += local_force[1] * it_fact;
+            force[iat * 3 + 2] += local_force[2] * it_fact;
+        }
+    }
+};
+
+// Template instantiations
+template struct cal_force_loc_sincos_op<float, base_device::DEVICE_CPU>;
+template struct cal_force_loc_sincos_op<double, base_device::DEVICE_CPU>;
+
+template struct cal_force_ew_sincos_op<float, base_device::DEVICE_CPU>;
+template struct cal_force_ew_sincos_op<double, base_device::DEVICE_CPU>;
+
+} // namespace hamilt 
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h
new file mode 100644
index 0000000000..455b8425a7
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h
@@ -0,0 +1,106 @@
+#ifndef FORCE_SINCOS_OP_H
+#define FORCE_SINCOS_OP_H
+
+#include "module_base/module_device/types.h"
+#include <complex>
+
+namespace hamilt
+{
+
+template <typename FPTYPE, typename Device>
+struct cal_force_loc_sincos_op
+{
+    /// @brief Calculate local pseudopotential forces (sincos loop only)
+    ///
+    /// Input Parameters
+    /// @param ctx - which device this function runs on
+    /// @param nat - number of atoms
+    /// @param npw - number of plane waves
+    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
+    /// @param tau - atomic positions [nat * 3]
+    /// @param iat2it - atom to type mapping [nat]
+    /// @param vloc_factors - precomputed vloc factors [npw]
+    /// @param aux - charge density in G-space [npw]
+    /// @param scale_factor - tpiba * omega
+    ///
+    /// Output Parameters
+    /// @param force - output forces [nat * 3]
+    void operator()(const Device* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const int* iat2it,
+                    const FPTYPE* vloc_factors,
+                    const std::complex<FPTYPE>* aux,
+                    const FPTYPE& scale_factor,
+                    FPTYPE* force);
+};
+
+template <typename FPTYPE, typename Device>
+struct cal_force_ew_sincos_op
+{
+    /// @brief Calculate Ewald forces (sincos loop only)
+    ///
+    /// Input Parameters
+    /// @param ctx - which device this function runs on
+    /// @param nat - number of atoms
+    /// @param npw - number of plane waves
+    /// @param ig_gge0 - index of G=0 vector (-1 if not present)
+    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
+    /// @param tau - atomic positions [nat * 3]
+    /// @param iat2it - atom to type mapping [nat]
+    /// @param it_facts - precomputed it_fact for each atom [nat]
+    /// @param aux - structure factor related array [npw]
+    ///
+    /// Output Parameters
+    /// @param force - output forces [nat * 3]
+    void operator()(const Device* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const int* iat2it,
+                    const FPTYPE* it_facts,
+                    const std::complex<FPTYPE>* aux,
+                    FPTYPE* force);
+};
+
+#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
+
+template <typename FPTYPE>
+struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>
+{
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const int* iat2it,
+                    const FPTYPE* vloc_factors,
+                    const std::complex<FPTYPE>* aux,
+                    const FPTYPE& scale_factor,
+                    FPTYPE* force);
+};
+
+template <typename FPTYPE>
+struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>
+{
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const int* iat2it,
+                    const FPTYPE* it_facts,
+                    const std::complex<FPTYPE>* aux,
+                    FPTYPE* force);
+};
+
+#endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
+
+} // namespace hamilt
+
+#endif // FORCE_SINCOS_OP_H 
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu
new file mode 100644
index 0000000000..a09cd28bed
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu
@@ -0,0 +1,203 @@
+#include "module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h"
+
+#include <complex>
+#include <thrust/complex.h>
+#include <hip/hip_runtime.h>
+#include <base/macros/macros.h>
+
+#define THREADS_PER_BLOCK 256
+
+namespace hamilt {
+
+// HIP kernels for sincos loops
+template <typename FPTYPE>
+__global__ void cal_force_loc_sincos_kernel(
+    const int nat,
+    const int npw,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* vloc_factors,
+    const thrust::complex<FPTYPE>* aux,
+    const FPTYPE scale_factor,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+    
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+    
+    if (iat >= nat) return;
+    
+    // Load atom information to registers
+    const int it = iat2it[iat];
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+    
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (
+            gcar[ig * 3 + 0] * tau_x + 
+            gcar[ig * 3 + 1] * tau_y + 
+            gcar[ig * 3 + 2] * tau_z
+        );
+        
+        // Use HIP intrinsic for sincos
+        FPTYPE sinp, cosp;
+        __sincos(phase, &sinp, &cosp);
+        
+        // Calculate force factor
+        const FPTYPE factor = vloc_factors[ig] * 
+                              (cosp * aux[ig].imag() + sinp * aux[ig].real());
+        
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+    
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor);
+    atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor);
+    atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor);
+}
+
+template <typename FPTYPE>
+__global__ void cal_force_ew_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* it_facts,
+    const thrust::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+    
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+    
+    if (iat >= nat) return;
+    
+    // Load atom information to registers
+    const int it = iat2it[iat];
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    const FPTYPE it_fact = it_facts[iat];
+    
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+    
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Skip G=0 term
+        if (ig == ig_gge0) continue;
+        
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE arg = TWO_PI * (
+            gcar[ig * 3 + 0] * tau_x + 
+            gcar[ig * 3 + 1] * tau_y + 
+            gcar[ig * 3 + 2] * tau_z
+        );
+        
+        // Use HIP intrinsic for sincos
+        FPTYPE sinp, cosp;
+        __sincos(arg, &sinp, &cosp);
+        
+        // Calculate Ewald sum contribution
+        const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real();
+        
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * sumnb;
+        local_force_y += gcar[ig * 3 + 1] * sumnb;
+        local_force_z += gcar[ig * 3 + 2] * sumnb;
+    }
+    
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x * it_fact);
+    atomicAdd(&force[iat * 3 + 1], local_force_y * it_fact);
+    atomicAdd(&force[iat * 3 + 2], local_force_z * it_fact);
+}
+
+// GPU operator implementations
+template <typename FPTYPE>
+void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* vloc_factors,
+    const std::complex<FPTYPE>* aux,
+    const FPTYPE& scale_factor,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block);
+    
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+    
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_loc_sincos_kernel<FPTYPE>), grid, block, 0, 0,
+        nat, npw,
+        gcar, tau, iat2it, vloc_factors,
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
+        scale_factor, force
+    );
+    
+    hipCheckOnDebug();
+}
+
+template <typename FPTYPE>
+void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* it_facts,
+    const std::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block);
+    
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+    
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_ew_sincos_kernel<FPTYPE>), grid, block, 0, 0,
+        nat, npw, ig_gge0,
+        gcar, tau, iat2it, it_facts,
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
+        force
+    );
+    
+    hipCheckOnDebug();
+}
+
+template struct cal_force_loc_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_loc_sincos_op<double, base_device::DEVICE_GPU>;
+
+template struct cal_force_ew_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_ew_sincos_op<double, base_device::DEVICE_GPU>;
+
+}  // namespace hamilt 
\ No newline at end of file

From 69aa10063376e7d5a4b4408e76c4a8451da8c65e Mon Sep 17 00:00:00 2001
From: Jie <li_j20@fudan.edu.cn>
Date: Tue, 27 May 2025 18:35:51 +0800
Subject: [PATCH 2/6] add cpu kernel for cal_force_loc & cal_force_ew

---
 .../module_hamilt_pw/hamilt_pwdft/forces.cpp  | 452 +++++++++++-------
 .../hamilt_pwdft/kernels/force_op.cpp         | 255 ++++++++++
 .../hamilt_pwdft/kernels/force_op.h           | 107 +++++
 3 files changed, 636 insertions(+), 178 deletions(-)

diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
index df29f88989..52ddf7da63 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
@@ -15,6 +15,8 @@
 #include "module_hamilt_general/module_ewald/H_Ewald_pw.h"
 #include "module_hamilt_general/module_surchem/surchem.h"
 #include "module_hamilt_general/module_vdw/vdw.h"
+#include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h"
+#include <vector>
 
 #ifdef _OPENMP
 #include <omp.h>
@@ -23,6 +25,208 @@
 #include "module_cell/module_paw/paw_cell.h"
 #endif
 
+// Data preparation function for Ewald parallel operator
+template <typename FPTYPE>
+void prepare_ew_parallel_data(const UnitCell& ucell,
+                              ModulePW::PW_Basis* rho_basis,
+                              const std::complex<double>* aux,
+                              const double alpha,
+                              const double fact,
+                              std::vector<FPTYPE>& gcar_flat,
+                              std::vector<FPTYPE>& tau_flat,
+                              std::vector<int>& iat2it_array,
+                              std::vector<FPTYPE>& it_facts,
+                              std::vector<int>& atom_na,
+                              std::vector<FPTYPE>& zv_values,
+                              std::vector<FPTYPE>& latvec_flat,
+                              std::vector<FPTYPE>& G_flat)
+{
+    const int nat = ucell.nat;
+    const int npw = rho_basis->npw;
+    const int ntype = ucell.ntype;
+    
+    // Prepare gcar_flat
+    gcar_flat.resize(npw * 3);
+    for (int ig = 0; ig < npw; ig++)
+    {
+        gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
+        gcar_flat[ig * 3 + 1] = static_cast<FPTYPE>(rho_basis->gcar[ig][1]);
+        gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
+    }
+    
+    // Prepare tau_flat and iat2it_array
+    tau_flat.resize(nat * 3);
+    iat2it_array.resize(nat);
+    int iat = 0;
+    for (int it = 0; it < ntype; it++)
+    {
+        for (int ia = 0; ia < ucell.atoms[it].na; ia++)
+        {
+            tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia].x);
+            tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia].y);
+            tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia].z);
+            iat2it_array[iat] = it;
+            iat++;
+        }
+    }
+    
+    // Prepare it_facts
+    it_facts.resize(nat);
+    iat = 0;
+    for (int it = 0; it < ntype; it++)
+    {
+        double zv;
+        if (PARAM.inp.use_paw)
+        {
+#ifdef USE_PAW
+            zv = GlobalC::paw_cell.get_val(it);
+#else
+            zv = ucell.atoms[it].ncpp.zv;
+#endif
+        }
+        else
+        {
+            zv = ucell.atoms[it].ncpp.zv;
+        }
+        
+        const FPTYPE it_fact = static_cast<FPTYPE>(zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact);
+        
+        for (int ia = 0; ia < ucell.atoms[it].na; ia++)
+        {
+            it_facts[iat] = it_fact;
+            iat++;
+        }
+    }
+    
+    // Prepare atom_na
+    atom_na.resize(ntype);
+    for (int it = 0; it < ntype; it++)
+    {
+        atom_na[it] = ucell.atoms[it].na;
+    }
+    
+    // Prepare zv_values
+    zv_values.resize(ntype);
+    for (int it = 0; it < ntype; it++)
+    {
+        if (PARAM.inp.use_paw)
+        {
+#ifdef USE_PAW
+            zv_values[it] = static_cast<FPTYPE>(GlobalC::paw_cell.get_val(it));
+#else
+            zv_values[it] = static_cast<FPTYPE>(ucell.atoms[it].ncpp.zv);
+#endif
+        }
+        else
+        {
+            zv_values[it] = static_cast<FPTYPE>(ucell.atoms[it].ncpp.zv);
+        }
+    }
+    
+    // Prepare latvec_flat (row-major order)
+    latvec_flat.resize(9);
+    latvec_flat[0] = static_cast<FPTYPE>(ucell.latvec.e11);
+    latvec_flat[1] = static_cast<FPTYPE>(ucell.latvec.e12);
+    latvec_flat[2] = static_cast<FPTYPE>(ucell.latvec.e13);
+    latvec_flat[3] = static_cast<FPTYPE>(ucell.latvec.e21);
+    latvec_flat[4] = static_cast<FPTYPE>(ucell.latvec.e22);
+    latvec_flat[5] = static_cast<FPTYPE>(ucell.latvec.e23);
+    latvec_flat[6] = static_cast<FPTYPE>(ucell.latvec.e31);
+    latvec_flat[7] = static_cast<FPTYPE>(ucell.latvec.e32);
+    latvec_flat[8] = static_cast<FPTYPE>(ucell.latvec.e33);
+    
+    // Prepare G_flat (row-major order)
+    G_flat.resize(9);
+    G_flat[0] = static_cast<FPTYPE>(ucell.G.e11);
+    G_flat[1] = static_cast<FPTYPE>(ucell.G.e12);
+    G_flat[2] = static_cast<FPTYPE>(ucell.G.e13);
+    G_flat[3] = static_cast<FPTYPE>(ucell.G.e21);
+    G_flat[4] = static_cast<FPTYPE>(ucell.G.e22);
+    G_flat[5] = static_cast<FPTYPE>(ucell.G.e23);
+    G_flat[6] = static_cast<FPTYPE>(ucell.G.e31);
+    G_flat[7] = static_cast<FPTYPE>(ucell.G.e32);
+    G_flat[8] = static_cast<FPTYPE>(ucell.G.e33);
+}
+
+// Data preparation function for local force sincos operator
+template <typename FPTYPE>
+void prepare_loc_sincos_data(const UnitCell& ucell,
+                             ModulePW::PW_Basis* rho_basis,
+                             const ModuleBase::matrix& vloc,
+                             const std::complex<double>* aux,
+                             std::vector<FPTYPE>& gcar_flat,
+                             std::vector<FPTYPE>& tau_flat,
+                             std::vector<int>& iat2it_array,
+                             std::vector<FPTYPE>& vloc_factors)
+{
+    const int nat = ucell.nat;
+    const int npw = rho_basis->npw;
+    
+    // Prepare gcar_flat
+    gcar_flat.resize(npw * 3);
+    for (int ig = 0; ig < npw; ig++)
+    {
+        gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
+        gcar_flat[ig * 3 + 1] = static_cast<FPTYPE>(rho_basis->gcar[ig][1]);
+        gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
+    }
+    
+    // Prepare tau_flat and iat2it_array
+    tau_flat.resize(nat * 3);
+    iat2it_array.resize(nat);
+    int iat = 0;
+    for (int it = 0; it < ucell.ntype; it++)
+    {
+        for (int ia = 0; ia < ucell.atoms[it].na; ia++)
+        {
+            tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia].x);
+            tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia].y);
+            tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia].z);
+            iat2it_array[iat] = it;
+            iat++;
+        }
+    }
+    
+    // Prepare vloc_factors: vloc(it, ig2igg[ig]) for each G-vector and atom type
+    vloc_factors.resize(npw * ucell.ntype);
+    for (int it = 0; it < ucell.ntype; it++)
+    {
+        for (int ig = 0; ig < npw; ig++)
+        {
+            vloc_factors[it * npw + ig] = static_cast<FPTYPE>(vloc(it, rho_basis->ig2igg[ig]));
+        }
+    }
+}
+
+// Data copy back function for local force sincos operator
+template <typename FPTYPE>
+void copy_back_loc_force_data(const FPTYPE* force,
+                              const int nat,
+                              const FPTYPE& scale_factor,
+                              ModuleBase::matrix& forcelc)
+{
+    for (int iat = 0; iat < nat; iat++)
+    {
+        forcelc(iat, 0) = static_cast<double>(force[iat * 3 + 0] * scale_factor);
+        forcelc(iat, 1) = static_cast<double>(force[iat * 3 + 1] * scale_factor);
+        forcelc(iat, 2) = static_cast<double>(force[iat * 3 + 2] * scale_factor);
+    }
+}
+
+// Data copy back function for Ewald parallel operator
+template <typename FPTYPE>
+void copy_back_ew_force_data(const FPTYPE* force,
+                             const int nat,
+                             ModuleBase::matrix& forceion)
+{
+    for (int iat = 0; iat < nat; iat++)
+    {
+        forceion(iat, 0) = static_cast<double>(force[iat * 3 + 0]);
+        forceion(iat, 1) = static_cast<double>(force[iat * 3 + 1]);
+        forceion(iat, 2) = static_cast<double>(force[iat * 3 + 2]);
+    }
+}
+
 template <typename FPTYPE, typename Device>
 void Forces<FPTYPE, Device>::cal_force(UnitCell& ucell,
                                        ModuleBase::matrix& force,
@@ -531,30 +735,40 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
     // to G space. maybe need fftw with OpenMP
     rho_basis->real2recip(aux, aux);
 
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-    for (int iat = 0; iat < this->nat; ++iat)
+    // Prepare data for the sincos operator
+    std::vector<FPTYPE> gcar_flat, tau_flat, vloc_factors;
+    std::vector<int> iat2it_array;
+    prepare_loc_sincos_data<FPTYPE>(ucell, rho_basis, vloc, aux,
+                                    gcar_flat, tau_flat, iat2it_array, vloc_factors);
+
+    // Prepare force array for the operator
+    std::vector<FPTYPE> force_flat(this->nat * 3, 0.0);
+
+    // Convert aux to FPTYPE complex array
+    std::vector<std::complex<FPTYPE>> aux_fptype(rho_basis->npw);
+    for (int ig = 0; ig < rho_basis->npw; ig++)
     {
-        // read `it` `ia` from the table
-        int it = ucell.iat2it[iat];
-        int ia = ucell.iat2ia[iat];
-        for (int ig = 0; ig < rho_basis->npw; ig++)
-        {
-            const double phase = ModuleBase::TWO_PI * (rho_basis->gcar[ig] * ucell.atoms[it].tau[ia]);
-            double sinp, cosp;
-            ModuleBase::libm::sincos(phase, &sinp, &cosp);
-            const double factor
-                = vloc(it, rho_basis->ig2igg[ig]) * (cosp * aux[ig].imag() + sinp * aux[ig].real());
-            forcelc(iat, 0) += rho_basis->gcar[ig][0] * factor;
-            forcelc(iat, 1) += rho_basis->gcar[ig][1] * factor;
-            forcelc(iat, 2) += rho_basis->gcar[ig][2] * factor;
-        }
-        forcelc(iat, 0) *= (ucell.tpiba * ucell.omega);
-        forcelc(iat, 1) *= (ucell.tpiba * ucell.omega);
-        forcelc(iat, 2) *= (ucell.tpiba * ucell.omega);
+        aux_fptype[ig] = std::complex<FPTYPE>(static_cast<FPTYPE>(aux[ig].real()),
+                                              static_cast<FPTYPE>(aux[ig].imag()));
     }
 
+    // Call the sincos operator
+    hamilt::cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU> sincos_op;
+    sincos_op(this->ctx,
+              this->nat,
+              rho_basis->npw,
+              ucell.ntype,
+              gcar_flat.data(),
+              tau_flat.data(),
+              iat2it_array.data(),
+              vloc_factors.data(),
+              aux_fptype.data(),
+              force_flat.data());
+
+    // Copy back the results with scaling
+    const FPTYPE scale_factor = static_cast<FPTYPE>(ucell.tpiba * ucell.omega);
+    copy_back_loc_force_data<FPTYPE>(force_flat.data(), this->nat, scale_factor, forcelc);
+
     // this->print(GlobalV::ofs_running, "local forces", forcelc);
     Parallel_Reduce::reduce_pool(forcelc.c, forcelc.nr * forcelc.nc);
     delete[] aux;
@@ -665,166 +879,48 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         aux[rho_basis->ig_gge0] = std::complex<double>(0.0, 0.0);
     }
 
-#ifdef _OPENMP
-#pragma omp parallel
-    {
-        int num_threads = omp_get_num_threads();
-        int thread_id = omp_get_thread_num();
-#else
-    int num_threads = 1;
-    int thread_id = 0;
-#endif
+    // Prepare data for the parallel operator
+    std::vector<FPTYPE> gcar_flat, tau_flat, it_facts, zv_values, latvec_flat, G_flat;
+    std::vector<int> iat2it_array, atom_na;
+    prepare_ew_parallel_data<FPTYPE>(ucell, rho_basis, aux, alpha, fact,
+                                     gcar_flat, tau_flat, iat2it_array, it_facts,
+                                     atom_na, zv_values, latvec_flat, G_flat);
 
-        /* Here is task distribution for multi-thread,
-            0. atom will be iterated both in main nat loop and the loop in `if (rho_basis->ig_gge0 >= 0)`.
-                To avoid syncing, we must calculate work range of each thread by our self
-            1. Calculate the iat range [iat_beg, iat_end) by each thread
-                a. when it is single thread stage, [iat_beg, iat_end) will be [0, nat)
-            2. each thread iterate atoms form `iat_beg` to `iat_end-1`
-        */
-        int iat_beg, iat_end;
-        int it_beg, ia_beg;
-        ModuleBase::TASK_DIST_1D(num_threads, thread_id, this->nat, iat_beg, iat_end);
-        iat_end = iat_beg + iat_end;
-        ucell.iat2iait(iat_beg, &ia_beg, &it_beg);
-
-        int iat = iat_beg;
-        int it = it_beg;
-        int ia = ia_beg;
-
-        // preprocess ig_gap for skipping the ig point
-        int ig_gap = (rho_basis->ig_gge0 >= 0 && rho_basis->ig_gge0 < rho_basis->npw) ? rho_basis->ig_gge0 : -1;
-
-        double it_fact = 0.;
-        int last_it = -1;
-
-        // iterating atoms
-        while (iat < iat_end)
-        {
-            if (it != last_it)
-            { // calculate it_tact when it is changed
-                double zv;
-                if (PARAM.inp.use_paw)
-                {
-#ifdef USE_PAW
-                    zv = GlobalC::paw_cell.get_val(it);
-#endif
-                }
-                else
-                {
-                    zv = ucell.atoms[it].ncpp.zv;
-                }
-                it_fact = zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact;
-                last_it = it;
-            }
-
-            if (ucell.atoms[it].na != 0)
-            {
-                const auto ig_loop = [&](int ig_beg, int ig_end) {
-                    for (int ig = ig_beg; ig < ig_end; ig++)
-                    {
-                        const ModuleBase::Vector3<double> gcar = rho_basis->gcar[ig];
-                        const double arg = ModuleBase::TWO_PI * (gcar * ucell.atoms[it].tau[ia]);
-                        double sinp, cosp;
-                        ModuleBase::libm::sincos(arg, &sinp, &cosp);
-                        double sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real();
-                        forceion(iat, 0) += gcar[0] * sumnb;
-                        forceion(iat, 1) += gcar[1] * sumnb;
-                        forceion(iat, 2) += gcar[2] * sumnb;
-                    }
-                };
-
-                // skip ig_gge0 point by separating ig loop into two part
-                ig_loop(0, ig_gap);
-                ig_loop(ig_gap + 1, rho_basis->npw);
-
-                forceion(iat, 0) *= it_fact;
-                forceion(iat, 1) *= it_fact;
-                forceion(iat, 2) *= it_fact;
-
-                ++iat;
-                ucell.step_iait(&ia, &it);
-            }
-        }
-
-        // means that the processor contains G=0 term.
-        if (rho_basis->ig_gge0 >= 0)
-        {
-            double rmax = 5.0 / (sqrt(alpha) * ucell.lat0);
-            int nrm = 0;
+    // Prepare force array for the operator
+    std::vector<FPTYPE> force_flat(this->nat * 3, 0.0);
 
-            // output of rgen: the number of vectors in the sphere
-            const int mxr = 200;
-            // the maximum number of R vectors included in r
-            ModuleBase::Vector3<double>* r = new ModuleBase::Vector3<double>[mxr];
-            double* r2 = new double[mxr];
-            ModuleBase::GlobalFunc::ZEROS(r2, mxr);
-            int* irr = new int[mxr];
-            ModuleBase::GlobalFunc::ZEROS(irr, mxr);
-            // the square modulus of R_j-tau_s-tau_s'
-
-            int iat1 = iat_beg;
-            int T1 = it_beg;
-            int I1 = ia_beg;
-            const double sqa = sqrt(alpha);
-            const double sq8a_2pi = sqrt(8.0 * alpha / ModuleBase::TWO_PI);
-
-            // iterating atoms.
-            // do not need to sync threads because task range of each thread is isolated
-            while (iat1 < iat_end)
-            {
-                int iat2 = 0; // mohan fix bug 2011-06-07
-                int I2 = 0;
-                int T2 = 0;
-                while (iat2 < this->nat)
-                {
-                    if (iat1 != iat2 && ucell.atoms[T2].na != 0 && ucell.atoms[T1].na != 0)
-                    {
-                        ModuleBase::Vector3<double> d_tau
-                            = ucell.atoms[T1].tau[I1] - ucell.atoms[T2].tau[I2];
-                        H_Ewald_pw::rgen(d_tau, rmax, irr, ucell.latvec, ucell.G, r, r2, nrm);
-
-                        for (int n = 0; n < nrm; n++)
-                        {
-                            const double rr = sqrt(r2[n]) * ucell.lat0;
-
-                            double factor;
-                            if (PARAM.inp.use_paw)
-                            {
-#ifdef USE_PAW
-                                factor = GlobalC::paw_cell.get_val(T1) * GlobalC::paw_cell.get_val(T2) * ModuleBase::e2
-                                         / (rr * rr)
-                                         * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr))
-                                         * ucell.lat0;
-#endif
-                            }
-                            else
-                            {
-                                factor = ucell.atoms[T1].ncpp.zv * ucell.atoms[T2].ncpp.zv
-                                         * ModuleBase::e2 / (rr * rr)
-                                         * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr))
-                                         * ucell.lat0;
-                            }
-
-                            forceion(iat1, 0) -= factor * r[n].x;
-                            forceion(iat1, 1) -= factor * r[n].y;
-                            forceion(iat1, 2) -= factor * r[n].z;
-                        }
-                    }
-                    ++iat2;
-                    ucell.step_iait(&I2, &T2);
-                } // atom b
-                ++iat1;
-                ucell.step_iait(&I1, &T1);
-            } // atom a
-
-            delete[] r;
-            delete[] r2;
-            delete[] irr;
-        }
-#ifdef _OPENMP
+    // Convert aux to FPTYPE complex array
+    std::vector<std::complex<FPTYPE>> aux_fptype(rho_basis->npw);
+    for (int ig = 0; ig < rho_basis->npw; ig++)
+    {
+        aux_fptype[ig] = std::complex<FPTYPE>(static_cast<FPTYPE>(aux[ig].real()),
+                                              static_cast<FPTYPE>(aux[ig].imag()));
     }
-#endif
+
+    // Call the parallel operator
+    hamilt::cal_force_ew_parallel_op<FPTYPE, base_device::DEVICE_CPU> parallel_op;
+    parallel_op(this->ctx,
+                this->nat,
+                rho_basis->npw,
+                ucell.ntype,
+                rho_basis->ig_gge0,
+                gcar_flat.data(),
+                tau_flat.data(),
+                iat2it_array.data(),
+                it_facts.data(),
+                atom_na.data(),
+                zv_values.data(),
+                aux_fptype.data(),
+                static_cast<FPTYPE>(alpha),
+                static_cast<FPTYPE>(fact),
+                static_cast<FPTYPE>(ucell.lat0),
+                latvec_flat.data(),
+                G_flat.data(),
+                PARAM.inp.use_paw,
+                force_flat.data());
+
+    // Copy back the results
+    copy_back_ew_force_data<FPTYPE>(force_flat.data(), this->nat, forceion);
 
     Parallel_Reduce::reduce_pool(forceion.c, forceion.nr * forceion.nc);
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
index 6d797e147d..b5745f2ac5 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
@@ -1,4 +1,8 @@
 #include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h"
+#include "module_base/libm/libm.h"
+#include "module_base/tool_threading.h"
+#include "module_hamilt_general/module_ewald/H_Ewald_pw.h"
+#include <cmath>
 
 #ifdef _OPENMP
 #include <omp.h>
@@ -424,10 +428,261 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
     }
 };
 
+// CPU implementation of local force sincos operator
+template <typename FPTYPE>
+struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU>
+{
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ntype,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const int* iat2it,
+                    const FPTYPE* vloc_factors,
+                    const std::complex<FPTYPE>* aux,
+                    FPTYPE* force)
+    {
+        const FPTYPE TWO_PI = 2.0 * M_PI;
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+        for (int iat = 0; iat < nat; ++iat)
+        {
+            const int it = iat2it[iat];
+            const FPTYPE tau_x = tau[iat * 3 + 0];
+            const FPTYPE tau_y = tau[iat * 3 + 1];
+            const FPTYPE tau_z = tau[iat * 3 + 2];
+
+            FPTYPE local_force[3] = {0.0, 0.0, 0.0};
+
+            for (int ig = 0; ig < npw; ig++)
+            {
+                const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                               gcar[ig * 3 + 1] * tau_y + 
+                                               gcar[ig * 3 + 2] * tau_z);
+                FPTYPE sinp, cosp;
+                ModuleBase::libm::sincos(phase, &sinp, &cosp);
+                
+                const FPTYPE vloc_factor = vloc_factors[it * npw + ig];
+                const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+                
+                local_force[0] += gcar[ig * 3 + 0] * factor;
+                local_force[1] += gcar[ig * 3 + 1] * factor;
+                local_force[2] += gcar[ig * 3 + 2] * factor;
+            }
+
+            force[iat * 3 + 0] = local_force[0];
+            force[iat * 3 + 1] = local_force[1];
+            force[iat * 3 + 2] = local_force[2];
+        }
+    }
+};
+
+// CPU implementation of Ewald parallel operator
+template <typename FPTYPE>
+struct cal_force_ew_parallel_op<FPTYPE, base_device::DEVICE_CPU>
+{
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ntype,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const int* iat2it,
+                    const FPTYPE* it_facts,
+                    const int* atom_na,
+                    const FPTYPE* zv_values,
+                    const std::complex<FPTYPE>* aux,
+                    const FPTYPE& alpha,
+                    const FPTYPE& fact,
+                    const FPTYPE& lat0,
+                    const FPTYPE* latvec,
+                    const FPTYPE* G,
+                    const bool& use_paw,
+                    FPTYPE* force)
+    {
+        const FPTYPE TWO_PI = 2.0 * M_PI;
+        const FPTYPE e2 = 2.0; // ModuleBase::e2 equivalent
+
+#ifdef _OPENMP
+#pragma omp parallel
+        {
+            int num_threads = omp_get_num_threads();
+            int thread_id = omp_get_thread_num();
+#else
+        int num_threads = 1;
+        int thread_id = 0;
+#endif
+
+            // Task distribution for multi-thread
+            int iat_beg, iat_end;
+            ModuleBase::TASK_DIST_1D(num_threads, thread_id, nat, iat_beg, iat_end);
+            iat_end = iat_beg + iat_end;
+
+            // Preprocess ig_gap for skipping the ig point
+            int ig_gap = (ig_gge0 >= 0 && ig_gge0 < npw) ? ig_gge0 : -1;
+
+            // Part 1: sincos calculation (replacing original while loop with for loop)
+            for (int iat = iat_beg; iat < iat_end; ++iat)
+            {
+                const int it = iat2it[iat];
+                
+                // Check if this atom type has atoms
+                if (atom_na[it] == 0) continue;
+
+                const FPTYPE tau_x = tau[iat * 3 + 0];
+                const FPTYPE tau_y = tau[iat * 3 + 1];
+                const FPTYPE tau_z = tau[iat * 3 + 2];
+                const FPTYPE it_fact = it_facts[iat];
+
+                FPTYPE local_force[3] = {0.0, 0.0, 0.0};
+
+                // G-vector loop with ig_gap handling (using for loops for GPU compatibility)
+                for (int ig = 0; ig < npw; ig++)
+                {
+                    // Skip G=0 term
+                    if (ig == ig_gap) continue;
+
+                    const FPTYPE arg = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                                 gcar[ig * 3 + 1] * tau_y + 
+                                                 gcar[ig * 3 + 2] * tau_z);
+                    FPTYPE sinp, cosp;
+                    ModuleBase::libm::sincos(arg, &sinp, &cosp);
+                    
+                    const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real();
+                    
+                    local_force[0] += gcar[ig * 3 + 0] * sumnb;
+                    local_force[1] += gcar[ig * 3 + 1] * sumnb;
+                    local_force[2] += gcar[ig * 3 + 2] * sumnb;
+                }
+
+                // Apply it_fact scaling
+                force[iat * 3 + 0] += local_force[0] * it_fact;
+                force[iat * 3 + 1] += local_force[1] * it_fact;
+                force[iat * 3 + 2] += local_force[2] * it_fact;
+            }
+
+            // Part 2: Real space Ewald calculation (when processor contains G=0 term)
+            if (ig_gge0 >= 0)
+            {
+                const FPTYPE rmax = 5.0 / (sqrt(alpha) * lat0);
+                const int mxr = 200;
+                
+                // Allocate temporary arrays for each thread
+                ModuleBase::Vector3<double>* r_flat = new ModuleBase::Vector3<double>[mxr];
+                double* r2 = new double[mxr];
+                int* irr = new int[mxr];
+                
+                // Initialize arrays
+                for (int i = 0; i < mxr; i++) {
+                    r2[i] = 0.0;
+                    irr[i] = 0;
+                    r_flat[i].x = 0.0;
+                    r_flat[i].y = 0.0;
+                    r_flat[i].z = 0.0;
+                }
+
+                const FPTYPE sqa = sqrt(alpha);
+                const FPTYPE sq8a_2pi = sqrt(8.0 * alpha / TWO_PI);
+
+                // Iterate over atoms in assigned range
+                for (int iat1 = iat_beg; iat1 < iat_end; ++iat1)
+                {
+                    const int T1 = iat2it[iat1];
+                    if (atom_na[T1] == 0) continue;
+
+                    const FPTYPE tau1_x = tau[iat1 * 3 + 0];
+                    const FPTYPE tau1_y = tau[iat1 * 3 + 1];
+                    const FPTYPE tau1_z = tau[iat1 * 3 + 2];
+
+                    // Iterate over all other atoms
+                    for (int iat2 = 0; iat2 < nat; ++iat2)
+                    {
+                        const int T2 = iat2it[iat2];
+                        
+                        if (iat1 != iat2 && atom_na[T2] != 0 && atom_na[T1] != 0)
+                        {
+                            const FPTYPE tau2_x = tau[iat2 * 3 + 0];
+                            const FPTYPE tau2_y = tau[iat2 * 3 + 1];
+                            const FPTYPE tau2_z = tau[iat2 * 3 + 2];
+                            
+                            // Calculate d_tau
+                            const FPTYPE d_tau_x = tau1_x - tau2_x;
+                            const FPTYPE d_tau_y = tau1_y - tau2_y;
+                            const FPTYPE d_tau_z = tau1_z - tau2_z;
+
+                            // Reconstruct Matrix3 objects from flat arrays for rgen call
+                            ModuleBase::Matrix3 latvec_matrix(
+                                static_cast<double>(latvec[0]), static_cast<double>(latvec[1]), static_cast<double>(latvec[2]),
+                                static_cast<double>(latvec[3]), static_cast<double>(latvec[4]), static_cast<double>(latvec[5]),
+                                static_cast<double>(latvec[6]), static_cast<double>(latvec[7]), static_cast<double>(latvec[8])
+                            );
+                            
+                            ModuleBase::Matrix3 G_matrix(
+                                static_cast<double>(G[0]), static_cast<double>(G[1]), static_cast<double>(G[2]),
+                                static_cast<double>(G[3]), static_cast<double>(G[4]), static_cast<double>(G[5]),
+                                static_cast<double>(G[6]), static_cast<double>(G[7]), static_cast<double>(G[8])
+                            );
+                            
+                            ModuleBase::Vector3<double> dtau_vec(
+                                static_cast<double>(d_tau_x),
+                                static_cast<double>(d_tau_y),
+                                static_cast<double>(d_tau_z)
+                            );
+
+                            // Call H_Ewald_pw::rgen to generate neighbor shells
+                            int nrm = 0;
+                            H_Ewald_pw::rgen(dtau_vec, static_cast<double>(rmax), irr, latvec_matrix, G_matrix, r_flat, r2, nrm);
+                            
+                            // Process all generated r vectors
+                            for (int nr = 0; nr < nrm; nr++)
+                            {
+                                const FPTYPE rr = sqrt(r2[nr]) * lat0;
+                                
+                                FPTYPE factor;
+                                if (use_paw)
+                                {
+                                    factor = zv_values[T1] * zv_values[T2] * e2 / (rr * rr)
+                                             * (erfc(sqa * rr) / rr + sq8a_2pi * exp(-alpha * rr * rr))
+                                             * lat0;
+                                }
+                                else
+                                {
+                                    factor = zv_values[T1] * zv_values[T2] * e2 / (rr * rr)
+                                             * (erfc(sqa * rr) / rr + sq8a_2pi * exp(-alpha * rr * rr))
+                                             * lat0;
+                                }
+
+                                force[iat1 * 3 + 0] -= factor * static_cast<FPTYPE>(r_flat[nr].x);
+                                force[iat1 * 3 + 1] -= factor * static_cast<FPTYPE>(r_flat[nr].y);
+                                force[iat1 * 3 + 2] -= factor * static_cast<FPTYPE>(r_flat[nr].z);
+                            }
+                        }
+                    }
+                }
+
+                delete[] r_flat;
+                delete[] r2;
+                delete[] irr;
+            }
+
+#ifdef _OPENMP
+        }
+#endif
+    }
+};
+
 template struct cal_vkb1_nl_op<float, base_device::DEVICE_CPU>;
 template struct cal_force_nl_op<float, base_device::DEVICE_CPU>;
+template struct cal_force_loc_sincos_op<float, base_device::DEVICE_CPU>;
+template struct cal_force_ew_parallel_op<float, base_device::DEVICE_CPU>;
 
 template struct cal_vkb1_nl_op<double, base_device::DEVICE_CPU>;
 template struct cal_force_nl_op<double, base_device::DEVICE_CPU>;
+template struct cal_force_loc_sincos_op<double, base_device::DEVICE_CPU>;
+template struct cal_force_ew_parallel_op<double, base_device::DEVICE_CPU>;
 
 } // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
index 3aa5d4f87e..152f41e310 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
@@ -146,6 +146,83 @@ struct cal_force_nl_op
                     const FPTYPE* lambda,
                     const std::complex<FPTYPE>* becp,
                     const std::complex<FPTYPE>* dbecp,
+                                          FPTYPE* force);
+};
+
+template <typename FPTYPE, typename Device>
+struct cal_force_loc_sincos_op
+{
+    /// @brief Calculate local pseudopotential forces (sincos loop only)
+    ///
+    /// Input Parameters
+    /// @param ctx - which device this function runs on
+    /// @param nat - number of atoms
+    /// @param npw - number of plane waves
+    /// @param ntype - number of atom types
+    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
+    /// @param tau - atomic positions [nat * 3]
+    /// @param iat2it - atom to type mapping [nat]
+    /// @param vloc_factors - precomputed vloc factors [ntype * npw]
+    /// @param aux - charge density in G-space [npw]
+    ///
+    /// Output Parameters
+    /// @param force - output forces [nat * 3]
+    void operator()(const Device* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ntype,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const int* iat2it,
+                    const FPTYPE* vloc_factors,
+                    const std::complex<FPTYPE>* aux,
+                    FPTYPE* force);
+};
+
+template <typename FPTYPE, typename Device>
+struct cal_force_ew_parallel_op
+{
+    /// @brief Calculate Ewald forces (parallel region only)
+    ///
+    /// Input Parameters
+    /// @param ctx - which device this function runs on
+    /// @param nat - number of atoms
+    /// @param npw - number of plane waves
+    /// @param ntype - number of atom types
+    /// @param ig_gge0 - index of G=0 vector (-1 if not present)
+    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
+    /// @param tau - atomic positions [nat * 3]
+    /// @param iat2it - atom to type mapping [nat]
+    /// @param it_facts - precomputed it_fact for each atom [nat]
+    /// @param atom_na - number of atoms of each type [ntype]
+    /// @param zv_values - valence charge for each type [ntype]
+    /// @param aux - structure factor related array [npw]
+    /// @param alpha - Ewald parameter
+    /// @param fact - scaling factor
+    /// @param lat0 - lattice constant
+    /// @param latvec - lattice vectors [3 * 3]
+    /// @param G - reciprocal lattice vectors [3 * 3]
+    /// @param use_pwa - whether PAW is used
+    /// Output Parameters
+    /// @param force - output forces [nat * 3]
+    void operator()(const Device* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ntype,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const int* iat2it,
+                    const FPTYPE* it_facts,
+                    const int* atom_na,
+                    const FPTYPE* zv_values,
+                    const std::complex<FPTYPE>* aux,
+                    const FPTYPE& alpha,
+                    const FPTYPE& fact,
+                    const FPTYPE& lat0,
+                    const FPTYPE* latvec,
+                    const FPTYPE* G,
+                    const bool& use_paw,
                     FPTYPE* force);
 };
 
@@ -248,6 +325,36 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     FPTYPE* force);
 };
 
+template <typename FPTYPE>
+struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>
+{
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ntype,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const int* iat2it,
+                    const FPTYPE* vloc_factors,
+                    const std::complex<FPTYPE>* aux,
+                    FPTYPE* force);
+};
+
+template <typename FPTYPE>
+struct cal_force_ew_parallel_op<FPTYPE, base_device::DEVICE_GPU>
+{
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const int* iat2it,
+                    const FPTYPE* it_facts,
+                    const std::complex<FPTYPE>* aux,
+                    FPTYPE* force);
+};
+
 /**
  * @brief revert the vkb values for force_nl calculation
  */

From 06b2e2505aaee5010b9147905bc18bf6790c93b3 Mon Sep 17 00:00:00 2001
From: Jie <li_j20@fudan.edu.cn>
Date: Fri, 30 May 2025 15:06:08 +0800
Subject: [PATCH 3/6] fix sincos op for gpu&cpu

---
 .../module_hamilt_pw/hamilt_pwdft/forces.cpp  | 614 ++++++++++--------
 .../hamilt_pwdft/kernels/cuda/force_op.cu     | 184 ++++++
 .../kernels/cuda/force_sincos_op.cu           | 204 ------
 .../hamilt_pwdft/kernels/force_op.cpp         | 204 +-----
 .../hamilt_pwdft/kernels/force_op.h           |  34 +-
 .../hamilt_pwdft/kernels/force_sincos_op.cpp  | 123 ----
 .../hamilt_pwdft/kernels/force_sincos_op.h    | 106 ---
 .../hamilt_pwdft/kernels/rocm/force_op.hip.cu | 185 ++++++
 .../kernels/rocm/force_sincos_op.hip.cu       | 203 ------
 9 files changed, 751 insertions(+), 1106 deletions(-)
 delete mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu
 delete mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp
 delete mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h
 delete mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu

diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
index 52ddf7da63..5e1d3cd6a2 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
@@ -15,8 +15,7 @@
 #include "module_hamilt_general/module_ewald/H_Ewald_pw.h"
 #include "module_hamilt_general/module_surchem/surchem.h"
 #include "module_hamilt_general/module_vdw/vdw.h"
-#include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h"
-#include <vector>
+#include "kernels/force_op.h"
 
 #ifdef _OPENMP
 #include <omp.h>
@@ -25,208 +24,6 @@
 #include "module_cell/module_paw/paw_cell.h"
 #endif
 
-// Data preparation function for Ewald parallel operator
-template <typename FPTYPE>
-void prepare_ew_parallel_data(const UnitCell& ucell,
-                              ModulePW::PW_Basis* rho_basis,
-                              const std::complex<double>* aux,
-                              const double alpha,
-                              const double fact,
-                              std::vector<FPTYPE>& gcar_flat,
-                              std::vector<FPTYPE>& tau_flat,
-                              std::vector<int>& iat2it_array,
-                              std::vector<FPTYPE>& it_facts,
-                              std::vector<int>& atom_na,
-                              std::vector<FPTYPE>& zv_values,
-                              std::vector<FPTYPE>& latvec_flat,
-                              std::vector<FPTYPE>& G_flat)
-{
-    const int nat = ucell.nat;
-    const int npw = rho_basis->npw;
-    const int ntype = ucell.ntype;
-    
-    // Prepare gcar_flat
-    gcar_flat.resize(npw * 3);
-    for (int ig = 0; ig < npw; ig++)
-    {
-        gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
-        gcar_flat[ig * 3 + 1] = static_cast<FPTYPE>(rho_basis->gcar[ig][1]);
-        gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
-    }
-    
-    // Prepare tau_flat and iat2it_array
-    tau_flat.resize(nat * 3);
-    iat2it_array.resize(nat);
-    int iat = 0;
-    for (int it = 0; it < ntype; it++)
-    {
-        for (int ia = 0; ia < ucell.atoms[it].na; ia++)
-        {
-            tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia].x);
-            tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia].y);
-            tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia].z);
-            iat2it_array[iat] = it;
-            iat++;
-        }
-    }
-    
-    // Prepare it_facts
-    it_facts.resize(nat);
-    iat = 0;
-    for (int it = 0; it < ntype; it++)
-    {
-        double zv;
-        if (PARAM.inp.use_paw)
-        {
-#ifdef USE_PAW
-            zv = GlobalC::paw_cell.get_val(it);
-#else
-            zv = ucell.atoms[it].ncpp.zv;
-#endif
-        }
-        else
-        {
-            zv = ucell.atoms[it].ncpp.zv;
-        }
-        
-        const FPTYPE it_fact = static_cast<FPTYPE>(zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact);
-        
-        for (int ia = 0; ia < ucell.atoms[it].na; ia++)
-        {
-            it_facts[iat] = it_fact;
-            iat++;
-        }
-    }
-    
-    // Prepare atom_na
-    atom_na.resize(ntype);
-    for (int it = 0; it < ntype; it++)
-    {
-        atom_na[it] = ucell.atoms[it].na;
-    }
-    
-    // Prepare zv_values
-    zv_values.resize(ntype);
-    for (int it = 0; it < ntype; it++)
-    {
-        if (PARAM.inp.use_paw)
-        {
-#ifdef USE_PAW
-            zv_values[it] = static_cast<FPTYPE>(GlobalC::paw_cell.get_val(it));
-#else
-            zv_values[it] = static_cast<FPTYPE>(ucell.atoms[it].ncpp.zv);
-#endif
-        }
-        else
-        {
-            zv_values[it] = static_cast<FPTYPE>(ucell.atoms[it].ncpp.zv);
-        }
-    }
-    
-    // Prepare latvec_flat (row-major order)
-    latvec_flat.resize(9);
-    latvec_flat[0] = static_cast<FPTYPE>(ucell.latvec.e11);
-    latvec_flat[1] = static_cast<FPTYPE>(ucell.latvec.e12);
-    latvec_flat[2] = static_cast<FPTYPE>(ucell.latvec.e13);
-    latvec_flat[3] = static_cast<FPTYPE>(ucell.latvec.e21);
-    latvec_flat[4] = static_cast<FPTYPE>(ucell.latvec.e22);
-    latvec_flat[5] = static_cast<FPTYPE>(ucell.latvec.e23);
-    latvec_flat[6] = static_cast<FPTYPE>(ucell.latvec.e31);
-    latvec_flat[7] = static_cast<FPTYPE>(ucell.latvec.e32);
-    latvec_flat[8] = static_cast<FPTYPE>(ucell.latvec.e33);
-    
-    // Prepare G_flat (row-major order)
-    G_flat.resize(9);
-    G_flat[0] = static_cast<FPTYPE>(ucell.G.e11);
-    G_flat[1] = static_cast<FPTYPE>(ucell.G.e12);
-    G_flat[2] = static_cast<FPTYPE>(ucell.G.e13);
-    G_flat[3] = static_cast<FPTYPE>(ucell.G.e21);
-    G_flat[4] = static_cast<FPTYPE>(ucell.G.e22);
-    G_flat[5] = static_cast<FPTYPE>(ucell.G.e23);
-    G_flat[6] = static_cast<FPTYPE>(ucell.G.e31);
-    G_flat[7] = static_cast<FPTYPE>(ucell.G.e32);
-    G_flat[8] = static_cast<FPTYPE>(ucell.G.e33);
-}
-
-// Data preparation function for local force sincos operator
-template <typename FPTYPE>
-void prepare_loc_sincos_data(const UnitCell& ucell,
-                             ModulePW::PW_Basis* rho_basis,
-                             const ModuleBase::matrix& vloc,
-                             const std::complex<double>* aux,
-                             std::vector<FPTYPE>& gcar_flat,
-                             std::vector<FPTYPE>& tau_flat,
-                             std::vector<int>& iat2it_array,
-                             std::vector<FPTYPE>& vloc_factors)
-{
-    const int nat = ucell.nat;
-    const int npw = rho_basis->npw;
-    
-    // Prepare gcar_flat
-    gcar_flat.resize(npw * 3);
-    for (int ig = 0; ig < npw; ig++)
-    {
-        gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
-        gcar_flat[ig * 3 + 1] = static_cast<FPTYPE>(rho_basis->gcar[ig][1]);
-        gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
-    }
-    
-    // Prepare tau_flat and iat2it_array
-    tau_flat.resize(nat * 3);
-    iat2it_array.resize(nat);
-    int iat = 0;
-    for (int it = 0; it < ucell.ntype; it++)
-    {
-        for (int ia = 0; ia < ucell.atoms[it].na; ia++)
-        {
-            tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia].x);
-            tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia].y);
-            tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia].z);
-            iat2it_array[iat] = it;
-            iat++;
-        }
-    }
-    
-    // Prepare vloc_factors: vloc(it, ig2igg[ig]) for each G-vector and atom type
-    vloc_factors.resize(npw * ucell.ntype);
-    for (int it = 0; it < ucell.ntype; it++)
-    {
-        for (int ig = 0; ig < npw; ig++)
-        {
-            vloc_factors[it * npw + ig] = static_cast<FPTYPE>(vloc(it, rho_basis->ig2igg[ig]));
-        }
-    }
-}
-
-// Data copy back function for local force sincos operator
-template <typename FPTYPE>
-void copy_back_loc_force_data(const FPTYPE* force,
-                              const int nat,
-                              const FPTYPE& scale_factor,
-                              ModuleBase::matrix& forcelc)
-{
-    for (int iat = 0; iat < nat; iat++)
-    {
-        forcelc(iat, 0) = static_cast<double>(force[iat * 3 + 0] * scale_factor);
-        forcelc(iat, 1) = static_cast<double>(force[iat * 3 + 1] * scale_factor);
-        forcelc(iat, 2) = static_cast<double>(force[iat * 3 + 2] * scale_factor);
-    }
-}
-
-// Data copy back function for Ewald parallel operator
-template <typename FPTYPE>
-void copy_back_ew_force_data(const FPTYPE* force,
-                             const int nat,
-                             ModuleBase::matrix& forceion)
-{
-    for (int iat = 0; iat < nat; iat++)
-    {
-        forceion(iat, 0) = static_cast<double>(force[iat * 3 + 0]);
-        forceion(iat, 1) = static_cast<double>(force[iat * 3 + 1]);
-        forceion(iat, 2) = static_cast<double>(force[iat * 3 + 2]);
-    }
-}
-
 template <typename FPTYPE, typename Device>
 void Forces<FPTYPE, Device>::cal_force(UnitCell& ucell,
                                        ModuleBase::matrix& force,
@@ -735,39 +532,125 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
     // to G space. maybe need fftw with OpenMP
     rho_basis->real2recip(aux, aux);
 
-    // Prepare data for the sincos operator
-    std::vector<FPTYPE> gcar_flat, tau_flat, vloc_factors;
-    std::vector<int> iat2it_array;
-    prepare_loc_sincos_data<FPTYPE>(ucell, rho_basis, vloc, aux,
-                                    gcar_flat, tau_flat, iat2it_array, vloc_factors);
-
-    // Prepare force array for the operator
-    std::vector<FPTYPE> force_flat(this->nat * 3, 0.0);
-
-    // Convert aux to FPTYPE complex array
+    // =============== GPU/CPU异构优化：使用sincos op替换原循环 ===============
+    
+    
+    // 准备原子相关数据：按照全局原子索引iat顺序
+    std::vector<FPTYPE> tau_flat(this->nat * 3);
+    std::vector<int> iat2it_host(this->nat);
+    std::vector<FPTYPE> gcar_flat(rho_basis->npw * 3);
+    
+    // 按照原始代码逻辑：遍历全局原子索引iat，通过查表获取(it,ia)
+    for (int iat = 0; iat < this->nat; iat++) {
+        int it = ucell.iat2it[iat];  // 查表获取原子类型
+        int ia = ucell.iat2ia[iat];  // 查表获取该类型内的原子索引
+        
+        tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][0]);
+        tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][1]);
+        tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][2]);
+        iat2it_host[iat] = it;
+    }
+    
+    for (int ig = 0; ig < rho_basis->npw; ig++) {
+        gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
+        gcar_flat[ig * 3 + 1] = static_cast<FPTYPE>(rho_basis->gcar[ig][1]);
+        gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
+    }
+    
+    // 重新计算vloc_factors考虑所有原子类型
+    std::vector<FPTYPE> vloc_per_type_host(ucell.ntype * rho_basis->npw);
+    for (int it = 0; it < ucell.ntype; it++) {
+        for (int ig = 0; ig < rho_basis->npw; ig++) {
+            vloc_per_type_host[it * rho_basis->npw + ig] = static_cast<FPTYPE>(vloc(it, rho_basis->ig2igg[ig]));
+        }
+    }
+    
+    // 转换aux到FPTYPE类型
     std::vector<std::complex<FPTYPE>> aux_fptype(rho_basis->npw);
-    for (int ig = 0; ig < rho_basis->npw; ig++)
+    for (int ig = 0; ig < rho_basis->npw; ig++) {
+        aux_fptype[ig] = static_cast<std::complex<FPTYPE>>(aux[ig]);
+    }
+    
+    // 设备端内存和指针设置（根据设备类型分支）
+    FPTYPE* d_gcar = gcar_flat.data();
+    FPTYPE* d_tau = tau_flat.data();
+    int* d_iat2it = iat2it_host.data();
+    FPTYPE* d_vloc_per_type = vloc_per_type_host.data();
+    std::complex<FPTYPE>* d_aux = aux_fptype.data();
+    FPTYPE* d_force = nullptr;
+    std::vector<FPTYPE> force_host(this->nat * 3);
+    
+    if (this->device == base_device::GpuDevice)
+    {
+        d_gcar = nullptr;
+        d_tau = nullptr;
+        d_iat2it = nullptr;
+        d_vloc_per_type = nullptr;
+        d_aux = nullptr;
+        
+        resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3);
+        resmem_var_op()(this->ctx, d_tau, this->nat * 3);
+        resmem_int_op()(this->ctx, d_iat2it, this->nat);
+        resmem_var_op()(this->ctx, d_vloc_per_type, ucell.ntype * rho_basis->npw);
+        resmem_complex_op()(this->ctx, d_aux, rho_basis->npw);
+        resmem_var_op()(this->ctx, d_force, this->nat * 3);
+        
+        // 数据传输到设备
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3);
+        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_iat2it, iat2it_host.data(), this->nat);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vloc_per_type, vloc_per_type_host.data(), ucell.ntype * rho_basis->npw);
+        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw);
+        
+        // 初始化force为0
+        base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_force, 0.0, this->nat * 3);
+    }
+    else
     {
-        aux_fptype[ig] = std::complex<FPTYPE>(static_cast<FPTYPE>(aux[ig].real()),
-                                              static_cast<FPTYPE>(aux[ig].imag()));
-    }
-
-    // Call the sincos operator
-    hamilt::cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU> sincos_op;
-    sincos_op(this->ctx,
-              this->nat,
-              rho_basis->npw,
-              ucell.ntype,
-              gcar_flat.data(),
-              tau_flat.data(),
-              iat2it_array.data(),
-              vloc_factors.data(),
-              aux_fptype.data(),
-              force_flat.data());
-
-    // Copy back the results with scaling
+        d_force = force_host.data();
+        // 对于CPU，直接初始化为0
+        std::fill(force_host.begin(), force_host.end(), static_cast<FPTYPE>(0.0));
+    }
+    
+    // 计算缩放因子
     const FPTYPE scale_factor = static_cast<FPTYPE>(ucell.tpiba * ucell.omega);
-    copy_back_loc_force_data<FPTYPE>(force_flat.data(), this->nat, scale_factor, forcelc);
+    
+    // 调用op进行sincos计算
+    hamilt::cal_force_loc_sincos_op<FPTYPE, Device>()(
+        this->ctx,
+        this->nat,
+        rho_basis->npw,
+        ucell.ntype,
+        d_gcar,
+        d_tau,
+        d_iat2it,
+        d_vloc_per_type,
+        d_aux,
+        scale_factor,
+        d_force
+    );
+    
+    // 根据设备类型处理结果
+    if (this->device == base_device::GpuDevice)
+    {
+        // 结果传回CPU
+        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_host.data(), d_force, this->nat * 3);
+        
+        // 清理设备内存
+        delmem_var_op()(this->ctx, d_gcar);
+        delmem_var_op()(this->ctx, d_tau);
+        delmem_int_op()(this->ctx, d_iat2it);
+        delmem_var_op()(this->ctx, d_vloc_per_type);
+        delmem_complex_op()(this->ctx, d_aux);
+        delmem_var_op()(this->ctx, d_force);
+    }
+    
+    // 将结果写入forcelc矩阵
+    for (int iat = 0; iat < this->nat; iat++) {
+        forcelc(iat, 0) = static_cast<double>(force_host[iat * 3 + 0]);
+        forcelc(iat, 1) = static_cast<double>(force_host[iat * 3 + 1]);
+        forcelc(iat, 2) = static_cast<double>(force_host[iat * 3 + 2]);
+    }
 
     // this->print(GlobalV::ofs_running, "local forces", forcelc);
     Parallel_Reduce::reduce_pool(forcelc.c, forcelc.nr * forcelc.nc);
@@ -879,48 +762,233 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         aux[rho_basis->ig_gge0] = std::complex<double>(0.0, 0.0);
     }
 
-    // Prepare data for the parallel operator
-    std::vector<FPTYPE> gcar_flat, tau_flat, it_facts, zv_values, latvec_flat, G_flat;
-    std::vector<int> iat2it_array, atom_na;
-    prepare_ew_parallel_data<FPTYPE>(ucell, rho_basis, aux, alpha, fact,
-                                     gcar_flat, tau_flat, iat2it_array, it_facts,
-                                     atom_na, zv_values, latvec_flat, G_flat);
+    // =============== 第一步：G空间sincos计算（在OpenMP区域外）===============
+    
+    // 预计算每个原子的it_fact和相关数据：按照全局原子索引iat顺序
+    std::vector<FPTYPE> it_facts_host(this->nat);
+    std::vector<FPTYPE> tau_flat(this->nat * 3);
+    std::vector<int> iat2it_host(this->nat);
+    
+    // 按照原始代码逻辑：遍历全局原子索引iat，通过查表获取(it,ia)
+    for (int iat = 0; iat < this->nat; iat++) {
+        int it = ucell.iat2it[iat];  // 查表获取原子类型
+        int ia = ucell.iat2ia[iat];  // 查表获取该类型内的原子索引
+        
+        double zv;
+        if (PARAM.inp.use_paw)
+        {
+#ifdef USE_PAW
+            zv = GlobalC::paw_cell.get_val(it);
+#endif
+        }
+        else
+        {
+            zv = ucell.atoms[it].ncpp.zv;
+        }
+        
+        it_facts_host[iat] = static_cast<FPTYPE>(zv * ModuleBase::e2 * ucell.tpiba * 
+                                                ModuleBase::TWO_PI / ucell.omega * fact);
+        
+        tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][0]);
+        tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][1]); 
+        tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][2]);
+        iat2it_host[iat] = it;
+    }
+    
+    // 准备设备端数据
+    std::vector<FPTYPE> gcar_flat(rho_basis->npw * 3);
+    for (int ig = 0; ig < rho_basis->npw; ig++) {
+        gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
+        gcar_flat[ig * 3 + 1] = static_cast<FPTYPE>(rho_basis->gcar[ig][1]);
+        gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
+    }
+    
+    // 转换aux到FPTYPE类型
+    std::vector<std::complex<FPTYPE>> aux_fptype(rho_basis->npw);
+    for (int ig = 0; ig < rho_basis->npw; ig++) {
+        aux_fptype[ig] = static_cast<std::complex<FPTYPE>>(aux[ig]);
+    }
+    
+    // 设备端内存和指针设置（根据设备类型分支）
+    FPTYPE* d_gcar = gcar_flat.data();
+    FPTYPE* d_tau = tau_flat.data();
+    int* d_iat2it = iat2it_host.data();
+    FPTYPE* d_it_facts = it_facts_host.data();
+    std::complex<FPTYPE>* d_aux = aux_fptype.data();
+    FPTYPE* d_force_g = nullptr;
+    std::vector<FPTYPE> force_g_host(this->nat * 3);
+    
+    if (this->device == base_device::GpuDevice)
+    {
+        d_gcar = nullptr;
+        d_tau = nullptr;
+        d_iat2it = nullptr;
+        d_it_facts = nullptr;
+        d_aux = nullptr;
+        
+        resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3);
+        resmem_var_op()(this->ctx, d_tau, this->nat * 3);
+        resmem_int_op()(this->ctx, d_iat2it, this->nat);
+        resmem_var_op()(this->ctx, d_it_facts, this->nat);
+        resmem_complex_op()(this->ctx, d_aux, rho_basis->npw);
+        resmem_var_op()(this->ctx, d_force_g, this->nat * 3);
+        
+        // 数据传输
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3);
+        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_iat2it, iat2it_host.data(), this->nat);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_it_facts, it_facts_host.data(), this->nat);
+        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw);
+        
+        // 初始化force为0
+        base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_force_g, 0.0, this->nat * 3);
+    }
+    else
+    {
+        d_force_g = force_g_host.data();
+        // 对于CPU，直接初始化为0
+        std::fill(force_g_host.begin(), force_g_host.end(), static_cast<FPTYPE>(0.0));
+    }
+    
+    // 调用op处理G空间sincos计算（在OpenMP区域外，无冲突）
+    hamilt::cal_force_ew_sincos_op<FPTYPE, Device>()(
+        this->ctx,
+        this->nat,
+        rho_basis->npw,
+        rho_basis->ig_gge0,  // G=0项索引，op内部会自动跳过
+        d_gcar,
+        d_tau,
+        d_iat2it,
+        d_it_facts,
+        d_aux,
+        d_force_g
+    );
+    
+    // 根据设备类型处理结果
+    if (this->device == base_device::GpuDevice)
+    {
+        // 将G空间结果传回CPU
+        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_g_host.data(), d_force_g, this->nat * 3);
+        
+        // 清理设备内存
+        delmem_var_op()(this->ctx, d_gcar);
+        delmem_var_op()(this->ctx, d_tau);
+        delmem_int_op()(this->ctx, d_iat2it);
+        delmem_var_op()(this->ctx, d_it_facts);
+        delmem_complex_op()(this->ctx, d_aux);
+        delmem_var_op()(this->ctx, d_force_g);
+    }
+    
+    // 累加到forceion
+    for (int iat = 0; iat < this->nat; iat++) {
+        forceion(iat, 0) += static_cast<double>(force_g_host[iat * 3 + 0]);
+        forceion(iat, 1) += static_cast<double>(force_g_host[iat * 3 + 1]); 
+        forceion(iat, 2) += static_cast<double>(force_g_host[iat * 3 + 2]);
+    }
 
-    // Prepare force array for the operator
-    std::vector<FPTYPE> force_flat(this->nat * 3, 0.0);
+    // =============== 第二步：实空间计算（保留原OpenMP结构）===============
 
-    // Convert aux to FPTYPE complex array
-    std::vector<std::complex<FPTYPE>> aux_fptype(rho_basis->npw);
-    for (int ig = 0; ig < rho_basis->npw; ig++)
+#ifdef _OPENMP
+#pragma omp parallel
     {
-        aux_fptype[ig] = std::complex<FPTYPE>(static_cast<FPTYPE>(aux[ig].real()),
-                                              static_cast<FPTYPE>(aux[ig].imag()));
-    }
-
-    // Call the parallel operator
-    hamilt::cal_force_ew_parallel_op<FPTYPE, base_device::DEVICE_CPU> parallel_op;
-    parallel_op(this->ctx,
-                this->nat,
-                rho_basis->npw,
-                ucell.ntype,
-                rho_basis->ig_gge0,
-                gcar_flat.data(),
-                tau_flat.data(),
-                iat2it_array.data(),
-                it_facts.data(),
-                atom_na.data(),
-                zv_values.data(),
-                aux_fptype.data(),
-                static_cast<FPTYPE>(alpha),
-                static_cast<FPTYPE>(fact),
-                static_cast<FPTYPE>(ucell.lat0),
-                latvec_flat.data(),
-                G_flat.data(),
-                PARAM.inp.use_paw,
-                force_flat.data());
-
-    // Copy back the results
-    copy_back_ew_force_data<FPTYPE>(force_flat.data(), this->nat, forceion);
+        int num_threads = omp_get_num_threads();
+        int thread_id = omp_get_thread_num();
+#else
+    int num_threads = 1;
+    int thread_id = 0;
+#endif
+
+        /* Here is task distribution for multi-thread,
+            0. atom will be iterated both in main nat loop and the loop in `if (rho_basis->ig_gge0 >= 0)`.
+                To avoid syncing, we must calculate work range of each thread by our self
+            1. Calculate the iat range [iat_beg, iat_end) by each thread
+                a. when it is single thread stage, [iat_beg, iat_end) will be [0, nat)
+            2. each thread iterate atoms form `iat_beg` to `iat_end-1`
+        */
+        int iat_beg, iat_end;
+        int it_beg, ia_beg;
+        ModuleBase::TASK_DIST_1D(num_threads, thread_id, this->nat, iat_beg, iat_end);
+        iat_end = iat_beg + iat_end;
+        ucell.iat2iait(iat_beg, &ia_beg, &it_beg);
+
+        // 只保留实空间相互作用计算（means that the processor contains G=0 term）
+        if (rho_basis->ig_gge0 >= 0)
+        {
+            double rmax = 5.0 / (sqrt(alpha) * ucell.lat0);
+            int nrm = 0;
+
+            // output of rgen: the number of vectors in the sphere
+            const int mxr = 200;
+            // the maximum number of R vectors included in r
+            ModuleBase::Vector3<double>* r = new ModuleBase::Vector3<double>[mxr];
+            double* r2 = new double[mxr];
+            ModuleBase::GlobalFunc::ZEROS(r2, mxr);
+            int* irr = new int[mxr];
+            ModuleBase::GlobalFunc::ZEROS(irr, mxr);
+            // the square modulus of R_j-tau_s-tau_s'
+
+            int iat1 = iat_beg;
+            int T1 = it_beg;
+            int I1 = ia_beg;
+            const double sqa = sqrt(alpha);
+            const double sq8a_2pi = sqrt(8.0 * alpha / ModuleBase::TWO_PI);
+
+            // iterating atoms.
+            // do not need to sync threads because task range of each thread is isolated
+            while (iat1 < iat_end)
+            {
+                int iat2 = 0; // mohan fix bug 2011-06-07
+                int I2 = 0;
+                int T2 = 0;
+                while (iat2 < this->nat)
+                {
+                    if (iat1 != iat2 && ucell.atoms[T2].na != 0 && ucell.atoms[T1].na != 0)
+                    {
+                        ModuleBase::Vector3<double> d_tau
+                            = ucell.atoms[T1].tau[I1] - ucell.atoms[T2].tau[I2];
+                        H_Ewald_pw::rgen(d_tau, rmax, irr, ucell.latvec, ucell.G, r, r2, nrm);
+
+                        for (int n = 0; n < nrm; n++)
+                        {
+                            const double rr = sqrt(r2[n]) * ucell.lat0;
+
+                            double factor;
+                            if (PARAM.inp.use_paw)
+                            {
+#ifdef USE_PAW
+                                factor = GlobalC::paw_cell.get_val(T1) * GlobalC::paw_cell.get_val(T2) * ModuleBase::e2
+                                         / (rr * rr)
+                                         * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr))
+                                         * ucell.lat0;
+#endif
+                            }
+                            else
+                            {
+                                factor = ucell.atoms[T1].ncpp.zv * ucell.atoms[T2].ncpp.zv
+                                         * ModuleBase::e2 / (rr * rr)
+                                         * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr))
+                                         * ucell.lat0;
+                            }
+
+                            forceion(iat1, 0) -= factor * r[n].x;
+                            forceion(iat1, 1) -= factor * r[n].y;
+                            forceion(iat1, 2) -= factor * r[n].z;
+                        }
+                    }
+                    ++iat2;
+                    ucell.step_iait(&I2, &T2);
+                } // atom b
+                ++iat1;
+                ucell.step_iait(&I1, &T1);
+            } // atom a
+
+            delete[] r;
+            delete[] r2;
+            delete[] irr;
+        }
+#ifdef _OPENMP
+    }
+#endif
 
     Parallel_Reduce::reduce_pool(forceion.c, forceion.nr * forceion.nc);
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
index 5d0656d105..077864caff 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
@@ -13,6 +13,125 @@
 
 namespace hamilt {
 
+// CUDA kernels for sincos loops
+template <typename FPTYPE>
+__global__ void cal_force_loc_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ntype,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* vloc_per_type,
+    const thrust::complex<FPTYPE>* aux,
+    const FPTYPE scale_factor,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+    
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+    
+    if (iat >= nat) return;
+    
+    // Load atom information to registers
+    const int it = iat2it[iat];
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+    
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                       gcar[ig * 3 + 1] * tau_y + 
+                                       gcar[ig * 3 + 2] * tau_z);
+        
+        // Use CUDA intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+        
+        // Calculate force factor
+        const FPTYPE vloc_factor = vloc_per_type[it * npw + ig];
+        const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+        
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+    
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor);
+    atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor);
+    atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor);
+}
+
+template <typename FPTYPE>
+__global__ void cal_force_ew_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* it_facts,
+    const thrust::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+    
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+    
+    if (iat >= nat) return;
+    
+    // Load atom information to registers
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    const FPTYPE it_fact = it_facts[iat];
+    
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+    
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Skip G=0 term
+        if (ig == ig_gge0) continue;
+        
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                       gcar[ig * 3 + 1] * tau_y + 
+                                       gcar[ig * 3 + 2] * tau_z);
+        
+        // Use CUDA intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+        
+        // Calculate Ewald sum contribution
+        const FPTYPE factor = it_fact * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+        
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+    
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x);
+    atomicAdd(&force[iat * 3 + 1], local_force_y);
+    atomicAdd(&force[iat * 3 + 2], local_force_z);
+}
 
 template <typename FPTYPE>
 __global__ void cal_vkb1_nl(
@@ -188,6 +307,67 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
     cudaCheckOnDebug();
 }
 
+// GPU operators
+template <typename FPTYPE>
+void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ntype,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* vloc_per_type,
+    const std::complex<FPTYPE>* aux,
+    const FPTYPE& scale_factor,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration for GPU load balancing
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
+    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
+    
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+    
+    cal_force_loc_sincos_kernel<FPTYPE><<<grid, block>>>(
+        nat, npw, ntype, gcar, tau, iat2it, vloc_per_type,
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
+        scale_factor, force
+    );
+    
+    cudaCheckOnDebug();
+}
+
+template <typename FPTYPE>
+void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* it_facts,
+    const std::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration for GPU load balancing
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
+    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
+    
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+    
+    cal_force_ew_sincos_kernel<FPTYPE><<<grid, block>>>(
+        nat, npw, ig_gge0, gcar, tau, iat2it, it_facts,
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux), force
+    );
+    
+    cudaCheckOnDebug();
+}
+
 template <typename FPTYPE>
 __global__ void cal_force_nl(
         const int ntype,
@@ -613,8 +793,12 @@ template void saveVkbValues<double>(const int *gcar_zero_ptrs, const std::comple
 
 template struct cal_vkb1_nl_op<float, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_loc_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_ew_sincos_op<float, base_device::DEVICE_GPU>;
 
 template struct cal_vkb1_nl_op<double, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<double, base_device::DEVICE_GPU>;
+template struct cal_force_loc_sincos_op<double, base_device::DEVICE_GPU>;
+template struct cal_force_ew_sincos_op<double, base_device::DEVICE_GPU>;
 
 }  // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu
deleted file mode 100644
index f97214893e..0000000000
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_sincos_op.cu
+++ /dev/null
@@ -1,204 +0,0 @@
-#include "module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h"
-#include "module_base/module_device/types.h"
-
-#include <complex>
-#include <thrust/complex.h>
-#include <cuda_runtime.h>
-#include <base/macros/macros.h>
-
-#define THREADS_PER_BLOCK 256
-
-namespace hamilt {
-
-// CUDA kernels for sincos loops
-template <typename FPTYPE>
-__global__ void cal_force_loc_sincos_kernel(
-    const int nat,
-    const int npw,
-    const FPTYPE* gcar,
-    const FPTYPE* tau,
-    const int* iat2it,
-    const FPTYPE* vloc_factors,
-    const thrust::complex<FPTYPE>* aux,
-    const FPTYPE scale_factor,
-    FPTYPE* force)
-{
-    const FPTYPE TWO_PI = 2.0 * M_PI;
-    
-    const int iat = blockIdx.y;
-    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
-    const int ig_stride = gridDim.x * blockDim.x;
-    
-    if (iat >= nat) return;
-    
-    // Load atom information to registers
-    const int it = iat2it[iat];
-    const FPTYPE tau_x = tau[iat * 3 + 0];
-    const FPTYPE tau_y = tau[iat * 3 + 1];
-    const FPTYPE tau_z = tau[iat * 3 + 2];
-    
-    // Local accumulation variables
-    FPTYPE local_force_x = 0.0;
-    FPTYPE local_force_y = 0.0;
-    FPTYPE local_force_z = 0.0;
-    
-    // Grid-stride loop over G-vectors
-    for (int ig = ig_start; ig < npw; ig += ig_stride) {
-        // Calculate phase: 2π * (G · τ)
-        const FPTYPE phase = TWO_PI * (
-            gcar[ig * 3 + 0] * tau_x + 
-            gcar[ig * 3 + 1] * tau_y + 
-            gcar[ig * 3 + 2] * tau_z
-        );
-        
-        // Use CUDA intrinsic for sincos
-        FPTYPE sinp, cosp;
-        __sincos(phase, &sinp, &cosp);
-        
-        // Calculate force factor
-        const FPTYPE factor = vloc_factors[ig] * 
-                              (cosp * aux[ig].imag() + sinp * aux[ig].real());
-        
-        // Accumulate force contributions
-        local_force_x += gcar[ig * 3 + 0] * factor;
-        local_force_y += gcar[ig * 3 + 1] * factor;
-        local_force_z += gcar[ig * 3 + 2] * factor;
-    }
-    
-    // Atomic add to global memory
-    atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor);
-    atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor);
-    atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor);
-}
-
-template <typename FPTYPE>
-__global__ void cal_force_ew_sincos_kernel(
-    const int nat,
-    const int npw,
-    const int ig_gge0,
-    const FPTYPE* gcar,
-    const FPTYPE* tau,
-    const int* iat2it,
-    const FPTYPE* it_facts,
-    const thrust::complex<FPTYPE>* aux,
-    FPTYPE* force)
-{
-    const FPTYPE TWO_PI = 2.0 * M_PI;
-    
-    const int iat = blockIdx.y;
-    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
-    const int ig_stride = gridDim.x * blockDim.x;
-    
-    if (iat >= nat) return;
-    
-    // Load atom information to registers
-    const int it = iat2it[iat];
-    const FPTYPE tau_x = tau[iat * 3 + 0];
-    const FPTYPE tau_y = tau[iat * 3 + 1];
-    const FPTYPE tau_z = tau[iat * 3 + 2];
-    const FPTYPE it_fact = it_facts[iat];
-    
-    // Local accumulation variables
-    FPTYPE local_force_x = 0.0;
-    FPTYPE local_force_y = 0.0;
-    FPTYPE local_force_z = 0.0;
-    
-    // Grid-stride loop over G-vectors
-    for (int ig = ig_start; ig < npw; ig += ig_stride) {
-        // Skip G=0 term
-        if (ig == ig_gge0) continue;
-        
-        // Calculate phase: 2π * (G · τ)
-        const FPTYPE arg = TWO_PI * (
-            gcar[ig * 3 + 0] * tau_x + 
-            gcar[ig * 3 + 1] * tau_y + 
-            gcar[ig * 3 + 2] * tau_z
-        );
-        
-        // Use CUDA intrinsic for sincos
-        FPTYPE sinp, cosp;
-        __sincos(arg, &sinp, &cosp);
-        
-        // Calculate Ewald sum contribution
-        const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real();
-        
-        // Accumulate force contributions
-        local_force_x += gcar[ig * 3 + 0] * sumnb;
-        local_force_y += gcar[ig * 3 + 1] * sumnb;
-        local_force_z += gcar[ig * 3 + 2] * sumnb;
-    }
-    
-    // Atomic add to global memory
-    atomicAdd(&force[iat * 3 + 0], local_force_x * it_fact);
-    atomicAdd(&force[iat * 3 + 1], local_force_y * it_fact);
-    atomicAdd(&force[iat * 3 + 2], local_force_z * it_fact);
-}
-
-// GPU operator implementations
-template <typename FPTYPE>
-void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
-    const base_device::DEVICE_GPU* ctx,
-    const int& nat,
-    const int& npw,
-    const FPTYPE* gcar,
-    const FPTYPE* tau,
-    const int* iat2it,
-    const FPTYPE* vloc_factors,
-    const std::complex<FPTYPE>* aux,
-    const FPTYPE& scale_factor,
-    FPTYPE* force)
-{
-    // Calculate optimal grid configuration
-    const int threads_per_block = THREADS_PER_BLOCK;
-    const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block);
-    
-    dim3 grid(max_blocks_x, nat);
-    dim3 block(threads_per_block);
-    
-    cal_force_loc_sincos_kernel<FPTYPE><<<grid, block>>>(
-        nat, npw,
-        gcar, tau, iat2it, vloc_factors,
-        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
-        scale_factor, force
-    );
-    
-    cudaCheckOnDebug();
-}
-
-template <typename FPTYPE>
-void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
-    const base_device::DEVICE_GPU* ctx,
-    const int& nat,
-    const int& npw,
-    const int& ig_gge0,
-    const FPTYPE* gcar,
-    const FPTYPE* tau,
-    const int* iat2it,
-    const FPTYPE* it_facts,
-    const std::complex<FPTYPE>* aux,
-    FPTYPE* force)
-{
-    // Calculate optimal grid configuration
-    const int threads_per_block = THREADS_PER_BLOCK;
-    const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block);
-    
-    dim3 grid(max_blocks_x, nat);
-    dim3 block(threads_per_block);
-    
-    cal_force_ew_sincos_kernel<FPTYPE><<<grid, block>>>(
-        nat, npw, ig_gge0,
-        gcar, tau, iat2it, it_facts,
-        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
-        force
-    );
-    
-    cudaCheckOnDebug();
-}
-
-template struct cal_force_loc_sincos_op<float, base_device::DEVICE_GPU>;
-template struct cal_force_loc_sincos_op<double, base_device::DEVICE_GPU>;
-
-template struct cal_force_ew_sincos_op<float, base_device::DEVICE_GPU>;
-template struct cal_force_ew_sincos_op<double, base_device::DEVICE_GPU>;
-
-}  // namespace hamilt 
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
index b5745f2ac5..e0dc91a70c 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
@@ -439,8 +439,9 @@ struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU>
                     const FPTYPE* gcar,
                     const FPTYPE* tau,
                     const int* iat2it,
-                    const FPTYPE* vloc_factors,
+                    const FPTYPE* vloc_per_type,
                     const std::complex<FPTYPE>* aux,
+                    const FPTYPE& scale_factor,
                     FPTYPE* force)
     {
         const FPTYPE TWO_PI = 2.0 * M_PI;
@@ -465,8 +466,8 @@ struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU>
                 FPTYPE sinp, cosp;
                 ModuleBase::libm::sincos(phase, &sinp, &cosp);
                 
-                const FPTYPE vloc_factor = vloc_factors[it * npw + ig];
-                const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+                const FPTYPE vloc_factor = vloc_per_type[it * npw + ig];
+                const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()) * scale_factor;
                 
                 local_force[0] += gcar[ig * 3 + 0] * factor;
                 local_force[1] += gcar[ig * 3 + 1] * factor;
@@ -480,209 +481,66 @@ struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU>
     }
 };
 
-// CPU implementation of Ewald parallel operator
+// CPU implementation of Ewald force sincos operator
 template <typename FPTYPE>
-struct cal_force_ew_parallel_op<FPTYPE, base_device::DEVICE_CPU>
+struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_CPU>
 {
     void operator()(const base_device::DEVICE_CPU* ctx,
                     const int& nat,
                     const int& npw,
-                    const int& ntype,
                     const int& ig_gge0,
                     const FPTYPE* gcar,
                     const FPTYPE* tau,
                     const int* iat2it,
                     const FPTYPE* it_facts,
-                    const int* atom_na,
-                    const FPTYPE* zv_values,
                     const std::complex<FPTYPE>* aux,
-                    const FPTYPE& alpha,
-                    const FPTYPE& fact,
-                    const FPTYPE& lat0,
-                    const FPTYPE* latvec,
-                    const FPTYPE* G,
-                    const bool& use_paw,
                     FPTYPE* force)
     {
         const FPTYPE TWO_PI = 2.0 * M_PI;
-        const FPTYPE e2 = 2.0; // ModuleBase::e2 equivalent
 
 #ifdef _OPENMP
-#pragma omp parallel
-        {
-            int num_threads = omp_get_num_threads();
-            int thread_id = omp_get_thread_num();
-#else
-        int num_threads = 1;
-        int thread_id = 0;
+#pragma omp parallel for
 #endif
+        for (int iat = 0; iat < nat; ++iat)
+        {
+            const FPTYPE tau_x = tau[iat * 3 + 0];
+            const FPTYPE tau_y = tau[iat * 3 + 1];
+            const FPTYPE tau_z = tau[iat * 3 + 2];
+            const FPTYPE it_fact = it_facts[iat];
 
-            // Task distribution for multi-thread
-            int iat_beg, iat_end;
-            ModuleBase::TASK_DIST_1D(num_threads, thread_id, nat, iat_beg, iat_end);
-            iat_end = iat_beg + iat_end;
-
-            // Preprocess ig_gap for skipping the ig point
-            int ig_gap = (ig_gge0 >= 0 && ig_gge0 < npw) ? ig_gge0 : -1;
+            FPTYPE local_force[3] = {0.0, 0.0, 0.0};
 
-            // Part 1: sincos calculation (replacing original while loop with for loop)
-            for (int iat = iat_beg; iat < iat_end; ++iat)
+            for (int ig = 0; ig < npw; ig++)
             {
-                const int it = iat2it[iat];
-                
-                // Check if this atom type has atoms
-                if (atom_na[it] == 0) continue;
-
-                const FPTYPE tau_x = tau[iat * 3 + 0];
-                const FPTYPE tau_y = tau[iat * 3 + 1];
-                const FPTYPE tau_z = tau[iat * 3 + 2];
-                const FPTYPE it_fact = it_facts[iat];
+                // Skip G=0 term
+                if (ig == ig_gge0) continue;
 
-                FPTYPE local_force[3] = {0.0, 0.0, 0.0};
-
-                // G-vector loop with ig_gap handling (using for loops for GPU compatibility)
-                for (int ig = 0; ig < npw; ig++)
-                {
-                    // Skip G=0 term
-                    if (ig == ig_gap) continue;
-
-                    const FPTYPE arg = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
-                                                 gcar[ig * 3 + 1] * tau_y + 
-                                                 gcar[ig * 3 + 2] * tau_z);
-                    FPTYPE sinp, cosp;
-                    ModuleBase::libm::sincos(arg, &sinp, &cosp);
-                    
-                    const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real();
-                    
-                    local_force[0] += gcar[ig * 3 + 0] * sumnb;
-                    local_force[1] += gcar[ig * 3 + 1] * sumnb;
-                    local_force[2] += gcar[ig * 3 + 2] * sumnb;
-                }
-
-                // Apply it_fact scaling
-                force[iat * 3 + 0] += local_force[0] * it_fact;
-                force[iat * 3 + 1] += local_force[1] * it_fact;
-                force[iat * 3 + 2] += local_force[2] * it_fact;
-            }
-
-            // Part 2: Real space Ewald calculation (when processor contains G=0 term)
-            if (ig_gge0 >= 0)
-            {
-                const FPTYPE rmax = 5.0 / (sqrt(alpha) * lat0);
-                const int mxr = 200;
+                const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                               gcar[ig * 3 + 1] * tau_y + 
+                                               gcar[ig * 3 + 2] * tau_z);
+                FPTYPE sinp, cosp;
+                ModuleBase::libm::sincos(phase, &sinp, &cosp);
                 
-                // Allocate temporary arrays for each thread
-                ModuleBase::Vector3<double>* r_flat = new ModuleBase::Vector3<double>[mxr];
-                double* r2 = new double[mxr];
-                int* irr = new int[mxr];
+                const FPTYPE factor = it_fact * (cosp * aux[ig].imag() + sinp * aux[ig].real());
                 
-                // Initialize arrays
-                for (int i = 0; i < mxr; i++) {
-                    r2[i] = 0.0;
-                    irr[i] = 0;
-                    r_flat[i].x = 0.0;
-                    r_flat[i].y = 0.0;
-                    r_flat[i].z = 0.0;
-                }
-
-                const FPTYPE sqa = sqrt(alpha);
-                const FPTYPE sq8a_2pi = sqrt(8.0 * alpha / TWO_PI);
-
-                // Iterate over atoms in assigned range
-                for (int iat1 = iat_beg; iat1 < iat_end; ++iat1)
-                {
-                    const int T1 = iat2it[iat1];
-                    if (atom_na[T1] == 0) continue;
-
-                    const FPTYPE tau1_x = tau[iat1 * 3 + 0];
-                    const FPTYPE tau1_y = tau[iat1 * 3 + 1];
-                    const FPTYPE tau1_z = tau[iat1 * 3 + 2];
-
-                    // Iterate over all other atoms
-                    for (int iat2 = 0; iat2 < nat; ++iat2)
-                    {
-                        const int T2 = iat2it[iat2];
-                        
-                        if (iat1 != iat2 && atom_na[T2] != 0 && atom_na[T1] != 0)
-                        {
-                            const FPTYPE tau2_x = tau[iat2 * 3 + 0];
-                            const FPTYPE tau2_y = tau[iat2 * 3 + 1];
-                            const FPTYPE tau2_z = tau[iat2 * 3 + 2];
-                            
-                            // Calculate d_tau
-                            const FPTYPE d_tau_x = tau1_x - tau2_x;
-                            const FPTYPE d_tau_y = tau1_y - tau2_y;
-                            const FPTYPE d_tau_z = tau1_z - tau2_z;
-
-                            // Reconstruct Matrix3 objects from flat arrays for rgen call
-                            ModuleBase::Matrix3 latvec_matrix(
-                                static_cast<double>(latvec[0]), static_cast<double>(latvec[1]), static_cast<double>(latvec[2]),
-                                static_cast<double>(latvec[3]), static_cast<double>(latvec[4]), static_cast<double>(latvec[5]),
-                                static_cast<double>(latvec[6]), static_cast<double>(latvec[7]), static_cast<double>(latvec[8])
-                            );
-                            
-                            ModuleBase::Matrix3 G_matrix(
-                                static_cast<double>(G[0]), static_cast<double>(G[1]), static_cast<double>(G[2]),
-                                static_cast<double>(G[3]), static_cast<double>(G[4]), static_cast<double>(G[5]),
-                                static_cast<double>(G[6]), static_cast<double>(G[7]), static_cast<double>(G[8])
-                            );
-                            
-                            ModuleBase::Vector3<double> dtau_vec(
-                                static_cast<double>(d_tau_x),
-                                static_cast<double>(d_tau_y),
-                                static_cast<double>(d_tau_z)
-                            );
-
-                            // Call H_Ewald_pw::rgen to generate neighbor shells
-                            int nrm = 0;
-                            H_Ewald_pw::rgen(dtau_vec, static_cast<double>(rmax), irr, latvec_matrix, G_matrix, r_flat, r2, nrm);
-                            
-                            // Process all generated r vectors
-                            for (int nr = 0; nr < nrm; nr++)
-                            {
-                                const FPTYPE rr = sqrt(r2[nr]) * lat0;
-                                
-                                FPTYPE factor;
-                                if (use_paw)
-                                {
-                                    factor = zv_values[T1] * zv_values[T2] * e2 / (rr * rr)
-                                             * (erfc(sqa * rr) / rr + sq8a_2pi * exp(-alpha * rr * rr))
-                                             * lat0;
-                                }
-                                else
-                                {
-                                    factor = zv_values[T1] * zv_values[T2] * e2 / (rr * rr)
-                                             * (erfc(sqa * rr) / rr + sq8a_2pi * exp(-alpha * rr * rr))
-                                             * lat0;
-                                }
-
-                                force[iat1 * 3 + 0] -= factor * static_cast<FPTYPE>(r_flat[nr].x);
-                                force[iat1 * 3 + 1] -= factor * static_cast<FPTYPE>(r_flat[nr].y);
-                                force[iat1 * 3 + 2] -= factor * static_cast<FPTYPE>(r_flat[nr].z);
-                            }
-                        }
-                    }
-                }
-
-                delete[] r_flat;
-                delete[] r2;
-                delete[] irr;
+                local_force[0] += gcar[ig * 3 + 0] * factor;
+                local_force[1] += gcar[ig * 3 + 1] * factor;
+                local_force[2] += gcar[ig * 3 + 2] * factor;
             }
 
-#ifdef _OPENMP
+            force[iat * 3 + 0] = local_force[0];
+            force[iat * 3 + 1] = local_force[1];
+            force[iat * 3 + 2] = local_force[2];
         }
-#endif
     }
 };
 
 template struct cal_vkb1_nl_op<float, base_device::DEVICE_CPU>;
 template struct cal_force_nl_op<float, base_device::DEVICE_CPU>;
 template struct cal_force_loc_sincos_op<float, base_device::DEVICE_CPU>;
-template struct cal_force_ew_parallel_op<float, base_device::DEVICE_CPU>;
-
+template struct cal_force_ew_sincos_op<float, base_device::DEVICE_CPU>;
 template struct cal_vkb1_nl_op<double, base_device::DEVICE_CPU>;
 template struct cal_force_nl_op<double, base_device::DEVICE_CPU>;
 template struct cal_force_loc_sincos_op<double, base_device::DEVICE_CPU>;
-template struct cal_force_ew_parallel_op<double, base_device::DEVICE_CPU>;
-
+template struct cal_force_ew_sincos_op<double, base_device::DEVICE_CPU>;
 } // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
index 152f41e310..3fc5d92212 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
@@ -162,8 +162,9 @@ struct cal_force_loc_sincos_op
     /// @param gcar - G-vector Cartesian coordinates [npw * 3]
     /// @param tau - atomic positions [nat * 3]
     /// @param iat2it - atom to type mapping [nat]
-    /// @param vloc_factors - precomputed vloc factors [ntype * npw]
+    /// @param vloc_per_type - precomputed vloc factors per type [ntype * npw]
     /// @param aux - charge density in G-space [npw]
+    /// @param scale_factor - tpiba * omega
     ///
     /// Output Parameters
     /// @param force - output forces [nat * 3]
@@ -174,55 +175,39 @@ struct cal_force_loc_sincos_op
                     const FPTYPE* gcar,
                     const FPTYPE* tau,
                     const int* iat2it,
-                    const FPTYPE* vloc_factors,
+                    const FPTYPE* vloc_per_type,
                     const std::complex<FPTYPE>* aux,
+                    const FPTYPE& scale_factor,
                     FPTYPE* force);
 };
 
 template <typename FPTYPE, typename Device>
-struct cal_force_ew_parallel_op
+struct cal_force_ew_sincos_op
 {
-    /// @brief Calculate Ewald forces (parallel region only)
+    /// @brief Calculate Ewald forces (sincos loop only)
     ///
     /// Input Parameters
     /// @param ctx - which device this function runs on
     /// @param nat - number of atoms
     /// @param npw - number of plane waves
-    /// @param ntype - number of atom types
     /// @param ig_gge0 - index of G=0 vector (-1 if not present)
     /// @param gcar - G-vector Cartesian coordinates [npw * 3]
     /// @param tau - atomic positions [nat * 3]
     /// @param iat2it - atom to type mapping [nat]
     /// @param it_facts - precomputed it_fact for each atom [nat]
-    /// @param atom_na - number of atoms of each type [ntype]
-    /// @param zv_values - valence charge for each type [ntype]
     /// @param aux - structure factor related array [npw]
-    /// @param alpha - Ewald parameter
-    /// @param fact - scaling factor
-    /// @param lat0 - lattice constant
-    /// @param latvec - lattice vectors [3 * 3]
-    /// @param G - reciprocal lattice vectors [3 * 3]
-    /// @param use_pwa - whether PAW is used
+    ///
     /// Output Parameters
     /// @param force - output forces [nat * 3]
     void operator()(const Device* ctx,
                     const int& nat,
                     const int& npw,
-                    const int& ntype,
                     const int& ig_gge0,
                     const FPTYPE* gcar,
                     const FPTYPE* tau,
                     const int* iat2it,
                     const FPTYPE* it_facts,
-                    const int* atom_na,
-                    const FPTYPE* zv_values,
                     const std::complex<FPTYPE>* aux,
-                    const FPTYPE& alpha,
-                    const FPTYPE& fact,
-                    const FPTYPE& lat0,
-                    const FPTYPE* latvec,
-                    const FPTYPE* G,
-                    const bool& use_paw,
                     FPTYPE* force);
 };
 
@@ -335,13 +320,14 @@ struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>
                     const FPTYPE* gcar,
                     const FPTYPE* tau,
                     const int* iat2it,
-                    const FPTYPE* vloc_factors,
+                    const FPTYPE* vloc_per_type,
                     const std::complex<FPTYPE>* aux,
+                    const FPTYPE& scale_factor,
                     FPTYPE* force);
 };
 
 template <typename FPTYPE>
-struct cal_force_ew_parallel_op<FPTYPE, base_device::DEVICE_GPU>
+struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>
 {
     void operator()(const base_device::DEVICE_GPU* ctx,
                     const int& nat,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp
deleted file mode 100644
index e7ce8554cc..0000000000
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-#include "force_sincos_op.h"
-#include "module_base/libm/libm.h"
-
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-namespace hamilt
-{
-
-template <typename FPTYPE>
-struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU>
-{
-    void operator()(const base_device::DEVICE_CPU* ctx,
-                    const int& nat,
-                    const int& npw,
-                    const FPTYPE* gcar,
-                    const FPTYPE* tau,
-                    const int* iat2it,
-                    const FPTYPE* vloc_factors,
-                    const std::complex<FPTYPE>* aux,
-                    const FPTYPE& scale_factor,
-                    FPTYPE* force)
-    {
-        const FPTYPE TWO_PI = 2.0 * M_PI;
-
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-        for (int iat = 0; iat < nat; ++iat)
-        {
-            const int it = iat2it[iat];
-            const FPTYPE tau_x = tau[iat * 3 + 0];
-            const FPTYPE tau_y = tau[iat * 3 + 1];
-            const FPTYPE tau_z = tau[iat * 3 + 2];
-
-            FPTYPE local_force[3] = {0.0, 0.0, 0.0};
-
-            for (int ig = 0; ig < npw; ig++)
-            {
-                const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
-                                               gcar[ig * 3 + 1] * tau_y + 
-                                               gcar[ig * 3 + 2] * tau_z);
-                FPTYPE sinp, cosp;
-                ModuleBase::libm::sincos(phase, &sinp, &cosp);
-                
-                const FPTYPE factor = vloc_factors[ig] * 
-                                     (cosp * aux[ig].imag() + sinp * aux[ig].real());
-                
-                local_force[0] += gcar[ig * 3 + 0] * factor;
-                local_force[1] += gcar[ig * 3 + 1] * factor;
-                local_force[2] += gcar[ig * 3 + 2] * factor;
-            }
-
-            force[iat * 3 + 0] += local_force[0] * scale_factor;
-            force[iat * 3 + 1] += local_force[1] * scale_factor;
-            force[iat * 3 + 2] += local_force[2] * scale_factor;
-        }
-    }
-};
-
-template <typename FPTYPE>
-struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_CPU>
-{
-    void operator()(const base_device::DEVICE_CPU* ctx,
-                    const int& nat,
-                    const int& npw,
-                    const int& ig_gge0,
-                    const FPTYPE* gcar,
-                    const FPTYPE* tau,
-                    const int* iat2it,
-                    const FPTYPE* it_facts,
-                    const std::complex<FPTYPE>* aux,
-                    FPTYPE* force)
-    {
-        const FPTYPE TWO_PI = 2.0 * M_PI;
-
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-        for (int iat = 0; iat < nat; ++iat)
-        {
-            const int it = iat2it[iat];
-            const FPTYPE tau_x = tau[iat * 3 + 0];
-            const FPTYPE tau_y = tau[iat * 3 + 1];
-            const FPTYPE tau_z = tau[iat * 3 + 2];
-            const FPTYPE it_fact = it_facts[iat];
-
-            FPTYPE local_force[3] = {0.0, 0.0, 0.0};
-
-            for (int ig = 0; ig < npw; ig++)
-            {
-                // Skip G=0 term
-                if (ig == ig_gge0) continue;
-
-                const FPTYPE arg = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
-                                             gcar[ig * 3 + 1] * tau_y + 
-                                             gcar[ig * 3 + 2] * tau_z);
-                FPTYPE sinp, cosp;
-                ModuleBase::libm::sincos(arg, &sinp, &cosp);
-                
-                const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real();
-                
-                local_force[0] += gcar[ig * 3 + 0] * sumnb;
-                local_force[1] += gcar[ig * 3 + 1] * sumnb;
-                local_force[2] += gcar[ig * 3 + 2] * sumnb;
-            }
-
-            force[iat * 3 + 0] += local_force[0] * it_fact;
-            force[iat * 3 + 1] += local_force[1] * it_fact;
-            force[iat * 3 + 2] += local_force[2] * it_fact;
-        }
-    }
-};
-
-// Template instantiations
-template struct cal_force_loc_sincos_op<float, base_device::DEVICE_CPU>;
-template struct cal_force_loc_sincos_op<double, base_device::DEVICE_CPU>;
-
-template struct cal_force_ew_sincos_op<float, base_device::DEVICE_CPU>;
-template struct cal_force_ew_sincos_op<double, base_device::DEVICE_CPU>;
-
-} // namespace hamilt 
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h
deleted file mode 100644
index 455b8425a7..0000000000
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h
+++ /dev/null
@@ -1,106 +0,0 @@
-#ifndef FORCE_SINCOS_OP_H
-#define FORCE_SINCOS_OP_H
-
-#include "module_base/module_device/types.h"
-#include <complex>
-
-namespace hamilt
-{
-
-template <typename FPTYPE, typename Device>
-struct cal_force_loc_sincos_op
-{
-    /// @brief Calculate local pseudopotential forces (sincos loop only)
-    ///
-    /// Input Parameters
-    /// @param ctx - which device this function runs on
-    /// @param nat - number of atoms
-    /// @param npw - number of plane waves
-    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
-    /// @param tau - atomic positions [nat * 3]
-    /// @param iat2it - atom to type mapping [nat]
-    /// @param vloc_factors - precomputed vloc factors [npw]
-    /// @param aux - charge density in G-space [npw]
-    /// @param scale_factor - tpiba * omega
-    ///
-    /// Output Parameters
-    /// @param force - output forces [nat * 3]
-    void operator()(const Device* ctx,
-                    const int& nat,
-                    const int& npw,
-                    const FPTYPE* gcar,
-                    const FPTYPE* tau,
-                    const int* iat2it,
-                    const FPTYPE* vloc_factors,
-                    const std::complex<FPTYPE>* aux,
-                    const FPTYPE& scale_factor,
-                    FPTYPE* force);
-};
-
-template <typename FPTYPE, typename Device>
-struct cal_force_ew_sincos_op
-{
-    /// @brief Calculate Ewald forces (sincos loop only)
-    ///
-    /// Input Parameters
-    /// @param ctx - which device this function runs on
-    /// @param nat - number of atoms
-    /// @param npw - number of plane waves
-    /// @param ig_gge0 - index of G=0 vector (-1 if not present)
-    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
-    /// @param tau - atomic positions [nat * 3]
-    /// @param iat2it - atom to type mapping [nat]
-    /// @param it_facts - precomputed it_fact for each atom [nat]
-    /// @param aux - structure factor related array [npw]
-    ///
-    /// Output Parameters
-    /// @param force - output forces [nat * 3]
-    void operator()(const Device* ctx,
-                    const int& nat,
-                    const int& npw,
-                    const int& ig_gge0,
-                    const FPTYPE* gcar,
-                    const FPTYPE* tau,
-                    const int* iat2it,
-                    const FPTYPE* it_facts,
-                    const std::complex<FPTYPE>* aux,
-                    FPTYPE* force);
-};
-
-#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
-
-template <typename FPTYPE>
-struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>
-{
-    void operator()(const base_device::DEVICE_GPU* ctx,
-                    const int& nat,
-                    const int& npw,
-                    const FPTYPE* gcar,
-                    const FPTYPE* tau,
-                    const int* iat2it,
-                    const FPTYPE* vloc_factors,
-                    const std::complex<FPTYPE>* aux,
-                    const FPTYPE& scale_factor,
-                    FPTYPE* force);
-};
-
-template <typename FPTYPE>
-struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>
-{
-    void operator()(const base_device::DEVICE_GPU* ctx,
-                    const int& nat,
-                    const int& npw,
-                    const int& ig_gge0,
-                    const FPTYPE* gcar,
-                    const FPTYPE* tau,
-                    const int* iat2it,
-                    const FPTYPE* it_facts,
-                    const std::complex<FPTYPE>* aux,
-                    FPTYPE* force);
-};
-
-#endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
-
-} // namespace hamilt
-
-#endif // FORCE_SINCOS_OP_H 
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
index c78b333b86..9feeb76fed 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
@@ -618,10 +618,195 @@ template void revertVkbValues<double>(const int *gcar_zero_ptrs, std::complex<do
 // for saveVkbValues functions instantiation
 template void saveVkbValues<double>(const int *gcar_zero_ptrs, const std::complex<double> *vkb_ptr, std::complex<double> *vkb_save_ptr, int nkb, int gcar_zero_count, int npw, int ipol, int npwx);
 
+// HIP kernels for sincos loops
+template <typename FPTYPE>
+__global__ void cal_force_loc_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ntype,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* vloc_per_type,
+    const thrust::complex<FPTYPE>* aux,
+    const FPTYPE scale_factor,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+    
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+    
+    if (iat >= nat) return;
+    
+    // Load atom information to registers
+    const int it = iat2it[iat];
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+    
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                       gcar[ig * 3 + 1] * tau_y + 
+                                       gcar[ig * 3 + 2] * tau_z);
+        
+        // Use HIP intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+        
+        // Calculate force factor
+        const FPTYPE vloc_factor = vloc_per_type[it * npw + ig];
+        const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+        
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+    
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor);
+    atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor);
+    atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor);
+}
+
+template <typename FPTYPE>
+__global__ void cal_force_ew_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* it_facts,
+    const thrust::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+    
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+    
+    if (iat >= nat) return;
+    
+    // Load atom information to registers
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    const FPTYPE it_fact = it_facts[iat];
+    
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+    
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Skip G=0 term
+        if (ig == ig_gge0) continue;
+        
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
+                                       gcar[ig * 3 + 1] * tau_y + 
+                                       gcar[ig * 3 + 2] * tau_z);
+        
+        // Use HIP intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+        
+        // Calculate Ewald sum contribution
+        const FPTYPE factor = it_fact * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+        
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+    
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x);
+    atomicAdd(&force[iat * 3 + 1], local_force_y);
+    atomicAdd(&force[iat * 3 + 2], local_force_z);
+}
+
+// GPU operators
+template <typename FPTYPE>
+void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ntype,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* vloc_per_type,
+    const std::complex<FPTYPE>* aux,
+    const FPTYPE& scale_factor,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration for GPU load balancing
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
+    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
+    
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+    
+    hipLaunchKernelGGL(cal_force_loc_sincos_kernel<FPTYPE>,
+                       grid, block, 0, 0,
+                       nat, npw, ntype, gcar, tau, iat2it, vloc_per_type,
+                       reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
+                       scale_factor, force);
+    
+    hipCheckOnDebug();
+}
+
+template <typename FPTYPE>
+void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const int* iat2it,
+    const FPTYPE* it_facts,
+    const std::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration for GPU load balancing
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
+    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
+    
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+    
+    hipLaunchKernelGGL(cal_force_ew_sincos_kernel<FPTYPE>,
+                       grid, block, 0, 0,
+                       nat, npw, ig_gge0, gcar, tau, iat2it, it_facts,
+                       reinterpret_cast<const thrust::complex<FPTYPE>*>(aux), force);
+    
+    hipCheckOnDebug();
+}
+
 template struct cal_vkb1_nl_op<float, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_loc_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_ew_sincos_op<float, base_device::DEVICE_GPU>;
 
 template struct cal_vkb1_nl_op<double, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<double, base_device::DEVICE_GPU>;
+template struct cal_force_loc_sincos_op<double, base_device::DEVICE_GPU>;
+template struct cal_force_ew_sincos_op<double, base_device::DEVICE_GPU>;
 
 }  // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu
deleted file mode 100644
index a09cd28bed..0000000000
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_sincos_op.hip.cu
+++ /dev/null
@@ -1,203 +0,0 @@
-#include "module_hamilt_pw/hamilt_pwdft/kernels/force_sincos_op.h"
-
-#include <complex>
-#include <thrust/complex.h>
-#include <hip/hip_runtime.h>
-#include <base/macros/macros.h>
-
-#define THREADS_PER_BLOCK 256
-
-namespace hamilt {
-
-// HIP kernels for sincos loops
-template <typename FPTYPE>
-__global__ void cal_force_loc_sincos_kernel(
-    const int nat,
-    const int npw,
-    const FPTYPE* gcar,
-    const FPTYPE* tau,
-    const int* iat2it,
-    const FPTYPE* vloc_factors,
-    const thrust::complex<FPTYPE>* aux,
-    const FPTYPE scale_factor,
-    FPTYPE* force)
-{
-    const FPTYPE TWO_PI = 2.0 * M_PI;
-    
-    const int iat = blockIdx.y;
-    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
-    const int ig_stride = gridDim.x * blockDim.x;
-    
-    if (iat >= nat) return;
-    
-    // Load atom information to registers
-    const int it = iat2it[iat];
-    const FPTYPE tau_x = tau[iat * 3 + 0];
-    const FPTYPE tau_y = tau[iat * 3 + 1];
-    const FPTYPE tau_z = tau[iat * 3 + 2];
-    
-    // Local accumulation variables
-    FPTYPE local_force_x = 0.0;
-    FPTYPE local_force_y = 0.0;
-    FPTYPE local_force_z = 0.0;
-    
-    // Grid-stride loop over G-vectors
-    for (int ig = ig_start; ig < npw; ig += ig_stride) {
-        // Calculate phase: 2π * (G · τ)
-        const FPTYPE phase = TWO_PI * (
-            gcar[ig * 3 + 0] * tau_x + 
-            gcar[ig * 3 + 1] * tau_y + 
-            gcar[ig * 3 + 2] * tau_z
-        );
-        
-        // Use HIP intrinsic for sincos
-        FPTYPE sinp, cosp;
-        __sincos(phase, &sinp, &cosp);
-        
-        // Calculate force factor
-        const FPTYPE factor = vloc_factors[ig] * 
-                              (cosp * aux[ig].imag() + sinp * aux[ig].real());
-        
-        // Accumulate force contributions
-        local_force_x += gcar[ig * 3 + 0] * factor;
-        local_force_y += gcar[ig * 3 + 1] * factor;
-        local_force_z += gcar[ig * 3 + 2] * factor;
-    }
-    
-    // Atomic add to global memory
-    atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor);
-    atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor);
-    atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor);
-}
-
-template <typename FPTYPE>
-__global__ void cal_force_ew_sincos_kernel(
-    const int nat,
-    const int npw,
-    const int ig_gge0,
-    const FPTYPE* gcar,
-    const FPTYPE* tau,
-    const int* iat2it,
-    const FPTYPE* it_facts,
-    const thrust::complex<FPTYPE>* aux,
-    FPTYPE* force)
-{
-    const FPTYPE TWO_PI = 2.0 * M_PI;
-    
-    const int iat = blockIdx.y;
-    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
-    const int ig_stride = gridDim.x * blockDim.x;
-    
-    if (iat >= nat) return;
-    
-    // Load atom information to registers
-    const int it = iat2it[iat];
-    const FPTYPE tau_x = tau[iat * 3 + 0];
-    const FPTYPE tau_y = tau[iat * 3 + 1];
-    const FPTYPE tau_z = tau[iat * 3 + 2];
-    const FPTYPE it_fact = it_facts[iat];
-    
-    // Local accumulation variables
-    FPTYPE local_force_x = 0.0;
-    FPTYPE local_force_y = 0.0;
-    FPTYPE local_force_z = 0.0;
-    
-    // Grid-stride loop over G-vectors
-    for (int ig = ig_start; ig < npw; ig += ig_stride) {
-        // Skip G=0 term
-        if (ig == ig_gge0) continue;
-        
-        // Calculate phase: 2π * (G · τ)
-        const FPTYPE arg = TWO_PI * (
-            gcar[ig * 3 + 0] * tau_x + 
-            gcar[ig * 3 + 1] * tau_y + 
-            gcar[ig * 3 + 2] * tau_z
-        );
-        
-        // Use HIP intrinsic for sincos
-        FPTYPE sinp, cosp;
-        __sincos(arg, &sinp, &cosp);
-        
-        // Calculate Ewald sum contribution
-        const FPTYPE sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real();
-        
-        // Accumulate force contributions
-        local_force_x += gcar[ig * 3 + 0] * sumnb;
-        local_force_y += gcar[ig * 3 + 1] * sumnb;
-        local_force_z += gcar[ig * 3 + 2] * sumnb;
-    }
-    
-    // Atomic add to global memory
-    atomicAdd(&force[iat * 3 + 0], local_force_x * it_fact);
-    atomicAdd(&force[iat * 3 + 1], local_force_y * it_fact);
-    atomicAdd(&force[iat * 3 + 2], local_force_z * it_fact);
-}
-
-// GPU operator implementations
-template <typename FPTYPE>
-void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
-    const base_device::DEVICE_GPU* ctx,
-    const int& nat,
-    const int& npw,
-    const FPTYPE* gcar,
-    const FPTYPE* tau,
-    const int* iat2it,
-    const FPTYPE* vloc_factors,
-    const std::complex<FPTYPE>* aux,
-    const FPTYPE& scale_factor,
-    FPTYPE* force)
-{
-    // Calculate optimal grid configuration
-    const int threads_per_block = THREADS_PER_BLOCK;
-    const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block);
-    
-    dim3 grid(max_blocks_x, nat);
-    dim3 block(threads_per_block);
-    
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_loc_sincos_kernel<FPTYPE>), grid, block, 0, 0,
-        nat, npw,
-        gcar, tau, iat2it, vloc_factors,
-        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
-        scale_factor, force
-    );
-    
-    hipCheckOnDebug();
-}
-
-template <typename FPTYPE>
-void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
-    const base_device::DEVICE_GPU* ctx,
-    const int& nat,
-    const int& npw,
-    const int& ig_gge0,
-    const FPTYPE* gcar,
-    const FPTYPE* tau,
-    const int* iat2it,
-    const FPTYPE* it_facts,
-    const std::complex<FPTYPE>* aux,
-    FPTYPE* force)
-{
-    // Calculate optimal grid configuration
-    const int threads_per_block = THREADS_PER_BLOCK;
-    const int max_blocks_x = min(32, (npw + threads_per_block - 1) / threads_per_block);
-    
-    dim3 grid(max_blocks_x, nat);
-    dim3 block(threads_per_block);
-    
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_ew_sincos_kernel<FPTYPE>), grid, block, 0, 0,
-        nat, npw, ig_gge0,
-        gcar, tau, iat2it, it_facts,
-        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
-        force
-    );
-    
-    hipCheckOnDebug();
-}
-
-template struct cal_force_loc_sincos_op<float, base_device::DEVICE_GPU>;
-template struct cal_force_loc_sincos_op<double, base_device::DEVICE_GPU>;
-
-template struct cal_force_ew_sincos_op<float, base_device::DEVICE_GPU>;
-template struct cal_force_ew_sincos_op<double, base_device::DEVICE_GPU>;
-
-}  // namespace hamilt 
\ No newline at end of file

From 48e91dbe125e5efb81ff5d982877f68ce7092f7d Mon Sep 17 00:00:00 2001
From: Jie <li_j20@fudan.edu.cn>
Date: Tue, 3 Jun 2025 11:26:23 +0800
Subject: [PATCH 4/6] fix vloc computation in cal_force_loc_sincos_op

---
 source/module_hamilt_pw/hamilt_pwdft/forces.cpp   | 15 ++++-----------
 .../hamilt_pwdft/kernels/cuda/force_op.cu         |  7 ++-----
 .../hamilt_pwdft/kernels/force_op.cpp             |  4 +---
 .../hamilt_pwdft/kernels/force_op.h               |  5 +----
 .../hamilt_pwdft/kernels/rocm/force_op.hip.cu     |  7 ++-----
 5 files changed, 10 insertions(+), 28 deletions(-)

diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
index 5e1d3cd6a2..d50288eef8 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
@@ -537,7 +537,6 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
     
     // 准备原子相关数据：按照全局原子索引iat顺序
     std::vector<FPTYPE> tau_flat(this->nat * 3);
-    std::vector<int> iat2it_host(this->nat);
     std::vector<FPTYPE> gcar_flat(rho_basis->npw * 3);
     
     // 按照原始代码逻辑：遍历全局原子索引iat，通过查表获取(it,ia)
@@ -548,7 +547,6 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
         tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][0]);
         tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][1]);
         tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][2]);
-        iat2it_host[iat] = it;
     }
     
     for (int ig = 0; ig < rho_basis->npw; ig++) {
@@ -559,9 +557,10 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
     
     // 重新计算vloc_factors考虑所有原子类型
     std::vector<FPTYPE> vloc_per_type_host(ucell.ntype * rho_basis->npw);
-    for (int it = 0; it < ucell.ntype; it++) {
+    for (int iat = 0; iat < this->nat; iat++) {
+        int it = ucell.iat2it[iat];
         for (int ig = 0; ig < rho_basis->npw; ig++) {
-            vloc_per_type_host[it * rho_basis->npw + ig] = static_cast<FPTYPE>(vloc(it, rho_basis->ig2igg[ig]));
+            vloc_per_type_host[iat * rho_basis->npw + ig] = static_cast<FPTYPE>(vloc(it, rho_basis->ig2igg[ig]));
         }
     }
     
@@ -574,7 +573,6 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
     // 设备端内存和指针设置（根据设备类型分支）
     FPTYPE* d_gcar = gcar_flat.data();
     FPTYPE* d_tau = tau_flat.data();
-    int* d_iat2it = iat2it_host.data();
     FPTYPE* d_vloc_per_type = vloc_per_type_host.data();
     std::complex<FPTYPE>* d_aux = aux_fptype.data();
     FPTYPE* d_force = nullptr;
@@ -584,13 +582,11 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
     {
         d_gcar = nullptr;
         d_tau = nullptr;
-        d_iat2it = nullptr;
         d_vloc_per_type = nullptr;
         d_aux = nullptr;
         
         resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3);
         resmem_var_op()(this->ctx, d_tau, this->nat * 3);
-        resmem_int_op()(this->ctx, d_iat2it, this->nat);
         resmem_var_op()(this->ctx, d_vloc_per_type, ucell.ntype * rho_basis->npw);
         resmem_complex_op()(this->ctx, d_aux, rho_basis->npw);
         resmem_var_op()(this->ctx, d_force, this->nat * 3);
@@ -598,7 +594,6 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
         // 数据传输到设备
         syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
         syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3);
-        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_iat2it, iat2it_host.data(), this->nat);
         syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vloc_per_type, vloc_per_type_host.data(), ucell.ntype * rho_basis->npw);
         syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw);
         
@@ -623,7 +618,6 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
         ucell.ntype,
         d_gcar,
         d_tau,
-        d_iat2it,
         d_vloc_per_type,
         d_aux,
         scale_factor,
@@ -639,7 +633,6 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
         // 清理设备内存
         delmem_var_op()(this->ctx, d_gcar);
         delmem_var_op()(this->ctx, d_tau);
-        delmem_int_op()(this->ctx, d_iat2it);
         delmem_var_op()(this->ctx, d_vloc_per_type);
         delmem_complex_op()(this->ctx, d_aux);
         delmem_var_op()(this->ctx, d_force);
@@ -652,7 +645,7 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
         forcelc(iat, 2) = static_cast<double>(force_host[iat * 3 + 2]);
     }
 
-    // this->print(GlobalV::ofs_running, "local forces", forcelc);
+    // this->print(GlobalV: :ofs_running, "local forces", forcelc);
     Parallel_Reduce::reduce_pool(forcelc.c, forcelc.nr * forcelc.nc);
     delete[] aux;
     ModuleBase::timer::tick("Forces", "cal_force_loc");
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
index 077864caff..a82bd1ee0b 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
@@ -21,7 +21,6 @@ __global__ void cal_force_loc_sincos_kernel(
     const int ntype,
     const FPTYPE* gcar,
     const FPTYPE* tau,
-    const int* iat2it,
     const FPTYPE* vloc_per_type,
     const thrust::complex<FPTYPE>* aux,
     const FPTYPE scale_factor,
@@ -36,7 +35,6 @@ __global__ void cal_force_loc_sincos_kernel(
     if (iat >= nat) return;
     
     // Load atom information to registers
-    const int it = iat2it[iat];
     const FPTYPE tau_x = tau[iat * 3 + 0];
     const FPTYPE tau_y = tau[iat * 3 + 1];
     const FPTYPE tau_z = tau[iat * 3 + 2];
@@ -58,7 +56,7 @@ __global__ void cal_force_loc_sincos_kernel(
         sincos(phase, &sinp, &cosp);
         
         // Calculate force factor
-        const FPTYPE vloc_factor = vloc_per_type[it * npw + ig];
+        const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig];
         const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real());
         
         // Accumulate force contributions
@@ -316,7 +314,6 @@ void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
     const int& ntype,
     const FPTYPE* gcar,
     const FPTYPE* tau,
-    const int* iat2it,
     const FPTYPE* vloc_per_type,
     const std::complex<FPTYPE>* aux,
     const FPTYPE& scale_factor,
@@ -331,7 +328,7 @@ void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
     dim3 block(threads_per_block);
     
     cal_force_loc_sincos_kernel<FPTYPE><<<grid, block>>>(
-        nat, npw, ntype, gcar, tau, iat2it, vloc_per_type,
+        nat, npw, ntype, gcar, tau, vloc_per_type,
         reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
         scale_factor, force
     );
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
index e0dc91a70c..49576abbfa 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
@@ -438,7 +438,6 @@ struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU>
                     const int& ntype,
                     const FPTYPE* gcar,
                     const FPTYPE* tau,
-                    const int* iat2it,
                     const FPTYPE* vloc_per_type,
                     const std::complex<FPTYPE>* aux,
                     const FPTYPE& scale_factor,
@@ -451,7 +450,6 @@ struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU>
 #endif
         for (int iat = 0; iat < nat; ++iat)
         {
-            const int it = iat2it[iat];
             const FPTYPE tau_x = tau[iat * 3 + 0];
             const FPTYPE tau_y = tau[iat * 3 + 1];
             const FPTYPE tau_z = tau[iat * 3 + 2];
@@ -466,7 +464,7 @@ struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU>
                 FPTYPE sinp, cosp;
                 ModuleBase::libm::sincos(phase, &sinp, &cosp);
                 
-                const FPTYPE vloc_factor = vloc_per_type[it * npw + ig];
+                const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig];
                 const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()) * scale_factor;
                 
                 local_force[0] += gcar[ig * 3 + 0] * factor;
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
index 3fc5d92212..58fcd842ef 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
@@ -161,8 +161,7 @@ struct cal_force_loc_sincos_op
     /// @param ntype - number of atom types
     /// @param gcar - G-vector Cartesian coordinates [npw * 3]
     /// @param tau - atomic positions [nat * 3]
-    /// @param iat2it - atom to type mapping [nat]
-    /// @param vloc_per_type - precomputed vloc factors per type [ntype * npw]
+    /// @param vloc_per_type - precomputed vloc factors per atom [nat * npw]
     /// @param aux - charge density in G-space [npw]
     /// @param scale_factor - tpiba * omega
     ///
@@ -174,7 +173,6 @@ struct cal_force_loc_sincos_op
                     const int& ntype,
                     const FPTYPE* gcar,
                     const FPTYPE* tau,
-                    const int* iat2it,
                     const FPTYPE* vloc_per_type,
                     const std::complex<FPTYPE>* aux,
                     const FPTYPE& scale_factor,
@@ -319,7 +317,6 @@ struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>
                     const int& ntype,
                     const FPTYPE* gcar,
                     const FPTYPE* tau,
-                    const int* iat2it,
                     const FPTYPE* vloc_per_type,
                     const std::complex<FPTYPE>* aux,
                     const FPTYPE& scale_factor,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
index 9feeb76fed..7f0d4b099e 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
@@ -626,7 +626,6 @@ __global__ void cal_force_loc_sincos_kernel(
     const int ntype,
     const FPTYPE* gcar,
     const FPTYPE* tau,
-    const int* iat2it,
     const FPTYPE* vloc_per_type,
     const thrust::complex<FPTYPE>* aux,
     const FPTYPE scale_factor,
@@ -641,7 +640,6 @@ __global__ void cal_force_loc_sincos_kernel(
     if (iat >= nat) return;
     
     // Load atom information to registers
-    const int it = iat2it[iat];
     const FPTYPE tau_x = tau[iat * 3 + 0];
     const FPTYPE tau_y = tau[iat * 3 + 1];
     const FPTYPE tau_z = tau[iat * 3 + 2];
@@ -663,7 +661,7 @@ __global__ void cal_force_loc_sincos_kernel(
         sincos(phase, &sinp, &cosp);
         
         // Calculate force factor
-        const FPTYPE vloc_factor = vloc_per_type[it * npw + ig];
+        const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig];
         const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real());
         
         // Accumulate force contributions
@@ -747,7 +745,6 @@ void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
     const int& ntype,
     const FPTYPE* gcar,
     const FPTYPE* tau,
-    const int* iat2it,
     const FPTYPE* vloc_per_type,
     const std::complex<FPTYPE>* aux,
     const FPTYPE& scale_factor,
@@ -763,7 +760,7 @@ void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
     
     hipLaunchKernelGGL(cal_force_loc_sincos_kernel<FPTYPE>,
                        grid, block, 0, 0,
-                       nat, npw, ntype, gcar, tau, iat2it, vloc_per_type,
+                       nat, npw, ntype, gcar, tau, vloc_per_type,
                        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
                        scale_factor, force);
     

From 6638968aa21136591aafeac52dc3a24fc91fe12c Mon Sep 17 00:00:00 2001
From: Jie <li_j20@fudan.edu.cn>
Date: Tue, 3 Jun 2025 20:35:20 +0800
Subject: [PATCH 5/6] fix cal_force_ew

---
 .../module_hamilt_pw/hamilt_pwdft/forces.cpp  | 67 ++++++-------------
 .../hamilt_pwdft/kernels/cuda/force_op.cu     |  8 +--
 .../hamilt_pwdft/kernels/force_op.cpp         |  3 +-
 .../hamilt_pwdft/kernels/force_op.h           |  3 -
 .../hamilt_pwdft/kernels/rocm/force_op.hip.cu |  8 +--
 5 files changed, 29 insertions(+), 60 deletions(-)

diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
index d50288eef8..4cade5d1ac 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
@@ -532,17 +532,17 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
     // to G space. maybe need fftw with OpenMP
     rho_basis->real2recip(aux, aux);
 
-    // =============== GPU/CPU异构优化：使用sincos op替换原循环 ===============
+    // sincos op for G space
     
     
-    // 准备原子相关数据：按照全局原子索引iat顺序
+    // data preparation
     std::vector<FPTYPE> tau_flat(this->nat * 3);
     std::vector<FPTYPE> gcar_flat(rho_basis->npw * 3);
     
-    // 按照原始代码逻辑：遍历全局原子索引iat，通过查表获取(it,ia)
+    
     for (int iat = 0; iat < this->nat; iat++) {
-        int it = ucell.iat2it[iat];  // 查表获取原子类型
-        int ia = ucell.iat2ia[iat];  // 查表获取该类型内的原子索引
+        int it = ucell.iat2it[iat];  
+        int ia = ucell.iat2ia[iat];  
         
         tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][0]);
         tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][1]);
@@ -555,7 +555,7 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
         gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
     }
     
-    // 重新计算vloc_factors考虑所有原子类型
+    // calculate vloc_factors for all atom types
     std::vector<FPTYPE> vloc_per_type_host(ucell.ntype * rho_basis->npw);
     for (int iat = 0; iat < this->nat; iat++) {
         int it = ucell.iat2it[iat];
@@ -564,13 +564,11 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
         }
     }
     
-    // 转换aux到FPTYPE类型
     std::vector<std::complex<FPTYPE>> aux_fptype(rho_basis->npw);
     for (int ig = 0; ig < rho_basis->npw; ig++) {
         aux_fptype[ig] = static_cast<std::complex<FPTYPE>>(aux[ig]);
     }
     
-    // 设备端内存和指针设置（根据设备类型分支）
     FPTYPE* d_gcar = gcar_flat.data();
     FPTYPE* d_tau = tau_flat.data();
     FPTYPE* d_vloc_per_type = vloc_per_type_host.data();
@@ -591,26 +589,22 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
         resmem_complex_op()(this->ctx, d_aux, rho_basis->npw);
         resmem_var_op()(this->ctx, d_force, this->nat * 3);
         
-        // 数据传输到设备
         syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
         syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3);
         syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vloc_per_type, vloc_per_type_host.data(), ucell.ntype * rho_basis->npw);
         syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw);
         
-        // 初始化force为0
         base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_force, 0.0, this->nat * 3);
     }
     else
     {
         d_force = force_host.data();
-        // 对于CPU，直接初始化为0
         std::fill(force_host.begin(), force_host.end(), static_cast<FPTYPE>(0.0));
     }
     
-    // 计算缩放因子
     const FPTYPE scale_factor = static_cast<FPTYPE>(ucell.tpiba * ucell.omega);
     
-    // 调用op进行sincos计算
+    // call op for sincos calculation
     hamilt::cal_force_loc_sincos_op<FPTYPE, Device>()(
         this->ctx,
         this->nat,
@@ -624,13 +618,10 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
         d_force
     );
     
-    // 根据设备类型处理结果
     if (this->device == base_device::GpuDevice)
     {
-        // 结果传回CPU
         syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_host.data(), d_force, this->nat * 3);
         
-        // 清理设备内存
         delmem_var_op()(this->ctx, d_gcar);
         delmem_var_op()(this->ctx, d_tau);
         delmem_var_op()(this->ctx, d_vloc_per_type);
@@ -638,7 +629,6 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
         delmem_var_op()(this->ctx, d_force);
     }
     
-    // 将结果写入forcelc矩阵
     for (int iat = 0; iat < this->nat; iat++) {
         forcelc(iat, 0) = static_cast<double>(force_host[iat * 3 + 0]);
         forcelc(iat, 1) = static_cast<double>(force_host[iat * 3 + 1]);
@@ -755,17 +745,15 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         aux[rho_basis->ig_gge0] = std::complex<double>(0.0, 0.0);
     }
 
-    // =============== 第一步：G空间sincos计算（在OpenMP区域外）===============
+    // sincos op for cal_force_ew
     
-    // 预计算每个原子的it_fact和相关数据：按照全局原子索引iat顺序
     std::vector<FPTYPE> it_facts_host(this->nat);
     std::vector<FPTYPE> tau_flat(this->nat * 3);
-    std::vector<int> iat2it_host(this->nat);
     
-    // 按照原始代码逻辑：遍历全局原子索引iat，通过查表获取(it,ia)
+    // iterate over by lookup table
     for (int iat = 0; iat < this->nat; iat++) {
-        int it = ucell.iat2it[iat];  // 查表获取原子类型
-        int ia = ucell.iat2ia[iat];  // 查表获取该类型内的原子索引
+        int it = ucell.iat2it[iat];  
+        int ia = ucell.iat2ia[iat];  
         
         double zv;
         if (PARAM.inp.use_paw)
@@ -785,10 +773,8 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][0]);
         tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][1]); 
         tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][2]);
-        iat2it_host[iat] = it;
     }
     
-    // 准备设备端数据
     std::vector<FPTYPE> gcar_flat(rho_basis->npw * 3);
     for (int ig = 0; ig < rho_basis->npw; ig++) {
         gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
@@ -796,16 +782,13 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
     }
     
-    // 转换aux到FPTYPE类型
     std::vector<std::complex<FPTYPE>> aux_fptype(rho_basis->npw);
     for (int ig = 0; ig < rho_basis->npw; ig++) {
         aux_fptype[ig] = static_cast<std::complex<FPTYPE>>(aux[ig]);
     }
     
-    // 设备端内存和指针设置（根据设备类型分支）
     FPTYPE* d_gcar = gcar_flat.data();
     FPTYPE* d_tau = tau_flat.data();
-    int* d_iat2it = iat2it_host.data();
     FPTYPE* d_it_facts = it_facts_host.data();
     std::complex<FPTYPE>* d_aux = aux_fptype.data();
     FPTYPE* d_force_g = nullptr;
@@ -815,72 +798,66 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
     {
         d_gcar = nullptr;
         d_tau = nullptr;
-        d_iat2it = nullptr;
         d_it_facts = nullptr;
         d_aux = nullptr;
         
         resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3);
         resmem_var_op()(this->ctx, d_tau, this->nat * 3);
-        resmem_int_op()(this->ctx, d_iat2it, this->nat);
         resmem_var_op()(this->ctx, d_it_facts, this->nat);
         resmem_complex_op()(this->ctx, d_aux, rho_basis->npw);
         resmem_var_op()(this->ctx, d_force_g, this->nat * 3);
         
-        // 数据传输
+        
         syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
         syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3);
-        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_iat2it, iat2it_host.data(), this->nat);
         syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_it_facts, it_facts_host.data(), this->nat);
         syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw);
         
-        // 初始化force为0
+     
         base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_force_g, 0.0, this->nat * 3);
     }
     else
     {
         d_force_g = force_g_host.data();
-        // 对于CPU，直接初始化为0
         std::fill(force_g_host.begin(), force_g_host.end(), static_cast<FPTYPE>(0.0));
     }
     
-    // 调用op处理G空间sincos计算（在OpenMP区域外，无冲突）
+    // call op for sincos calculation
     hamilt::cal_force_ew_sincos_op<FPTYPE, Device>()(
         this->ctx,
         this->nat,
         rho_basis->npw,
-        rho_basis->ig_gge0,  // G=0项索引，op内部会自动跳过
+        rho_basis->ig_gge0,  
         d_gcar,
         d_tau,
-        d_iat2it,
         d_it_facts,
         d_aux,
         d_force_g
     );
     
-    // 根据设备类型处理结果
+   
     if (this->device == base_device::GpuDevice)
     {
-        // 将G空间结果传回CPU
+        
         syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_g_host.data(), d_force_g, this->nat * 3);
         
-        // 清理设备内存
+        
         delmem_var_op()(this->ctx, d_gcar);
         delmem_var_op()(this->ctx, d_tau);
-        delmem_int_op()(this->ctx, d_iat2it);
         delmem_var_op()(this->ctx, d_it_facts);
         delmem_complex_op()(this->ctx, d_aux);
         delmem_var_op()(this->ctx, d_force_g);
     }
     
-    // 累加到forceion
+
     for (int iat = 0; iat < this->nat; iat++) {
         forceion(iat, 0) += static_cast<double>(force_g_host[iat * 3 + 0]);
         forceion(iat, 1) += static_cast<double>(force_g_host[iat * 3 + 1]); 
         forceion(iat, 2) += static_cast<double>(force_g_host[iat * 3 + 2]);
     }
 
-    // =============== 第二步：实空间计算（保留原OpenMP结构）===============
-
+   
+// calculate real space force
 #ifdef _OPENMP
 #pragma omp parallel
     {
@@ -904,7 +881,7 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         iat_end = iat_beg + iat_end;
         ucell.iat2iait(iat_beg, &ia_beg, &it_beg);
 
-        // 只保留实空间相互作用计算（means that the processor contains G=0 term）
+        
         if (rho_basis->ig_gge0 >= 0)
         {
             double rmax = 5.0 / (sqrt(alpha) * ucell.lat0);
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
index a82bd1ee0b..5ff3ddf2b1 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
@@ -78,7 +78,6 @@ __global__ void cal_force_ew_sincos_kernel(
     const int ig_gge0,
     const FPTYPE* gcar,
     const FPTYPE* tau,
-    const int* iat2it,
     const FPTYPE* it_facts,
     const thrust::complex<FPTYPE>* aux,
     FPTYPE* force)
@@ -116,8 +115,8 @@ __global__ void cal_force_ew_sincos_kernel(
         FPTYPE sinp, cosp;
         sincos(phase, &sinp, &cosp);
         
-        // Calculate Ewald sum contribution
-        const FPTYPE factor = it_fact * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+        // Calculate Ewald sum contribution (fixed sign error)
+        const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real());
         
         // Accumulate force contributions
         local_force_x += gcar[ig * 3 + 0] * factor;
@@ -344,7 +343,6 @@ void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
     const int& ig_gge0,
     const FPTYPE* gcar,
     const FPTYPE* tau,
-    const int* iat2it,
     const FPTYPE* it_facts,
     const std::complex<FPTYPE>* aux,
     FPTYPE* force)
@@ -358,7 +356,7 @@ void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
     dim3 block(threads_per_block);
     
     cal_force_ew_sincos_kernel<FPTYPE><<<grid, block>>>(
-        nat, npw, ig_gge0, gcar, tau, iat2it, it_facts,
+        nat, npw, ig_gge0, gcar, tau, it_facts,
         reinterpret_cast<const thrust::complex<FPTYPE>*>(aux), force
     );
     
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
index 49576abbfa..109321d6c3 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
@@ -489,7 +489,6 @@ struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_CPU>
                     const int& ig_gge0,
                     const FPTYPE* gcar,
                     const FPTYPE* tau,
-                    const int* iat2it,
                     const FPTYPE* it_facts,
                     const std::complex<FPTYPE>* aux,
                     FPTYPE* force)
@@ -519,7 +518,7 @@ struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_CPU>
                 FPTYPE sinp, cosp;
                 ModuleBase::libm::sincos(phase, &sinp, &cosp);
                 
-                const FPTYPE factor = it_fact * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+                const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real());
                 
                 local_force[0] += gcar[ig * 3 + 0] * factor;
                 local_force[1] += gcar[ig * 3 + 1] * factor;
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
index 58fcd842ef..acf490b278 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
@@ -191,7 +191,6 @@ struct cal_force_ew_sincos_op
     /// @param ig_gge0 - index of G=0 vector (-1 if not present)
     /// @param gcar - G-vector Cartesian coordinates [npw * 3]
     /// @param tau - atomic positions [nat * 3]
-    /// @param iat2it - atom to type mapping [nat]
     /// @param it_facts - precomputed it_fact for each atom [nat]
     /// @param aux - structure factor related array [npw]
     ///
@@ -203,7 +202,6 @@ struct cal_force_ew_sincos_op
                     const int& ig_gge0,
                     const FPTYPE* gcar,
                     const FPTYPE* tau,
-                    const int* iat2it,
                     const FPTYPE* it_facts,
                     const std::complex<FPTYPE>* aux,
                     FPTYPE* force);
@@ -332,7 +330,6 @@ struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>
                     const int& ig_gge0,
                     const FPTYPE* gcar,
                     const FPTYPE* tau,
-                    const int* iat2it,
                     const FPTYPE* it_facts,
                     const std::complex<FPTYPE>* aux,
                     FPTYPE* force);
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
index 7f0d4b099e..6bb3a84e7e 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
@@ -683,7 +683,6 @@ __global__ void cal_force_ew_sincos_kernel(
     const int ig_gge0,
     const FPTYPE* gcar,
     const FPTYPE* tau,
-    const int* iat2it,
     const FPTYPE* it_facts,
     const thrust::complex<FPTYPE>* aux,
     FPTYPE* force)
@@ -721,8 +720,8 @@ __global__ void cal_force_ew_sincos_kernel(
         FPTYPE sinp, cosp;
         sincos(phase, &sinp, &cosp);
         
-        // Calculate Ewald sum contribution
-        const FPTYPE factor = it_fact * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+        // Calculate Ewald sum contribution (fixed sign error)
+        const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real());
         
         // Accumulate force contributions
         local_force_x += gcar[ig * 3 + 0] * factor;
@@ -775,7 +774,6 @@ void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
     const int& ig_gge0,
     const FPTYPE* gcar,
     const FPTYPE* tau,
-    const int* iat2it,
     const FPTYPE* it_facts,
     const std::complex<FPTYPE>* aux,
     FPTYPE* force)
@@ -790,7 +788,7 @@ void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
     
     hipLaunchKernelGGL(cal_force_ew_sincos_kernel<FPTYPE>,
                        grid, block, 0, 0,
-                       nat, npw, ig_gge0, gcar, tau, iat2it, it_facts,
+                       nat, npw, ig_gge0, gcar, tau, it_facts,
                        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux), force);
     
     hipCheckOnDebug();

From f3eafd0cb6438646e14bbe0f92dd0c9ac825e83b Mon Sep 17 00:00:00 2001
From: Jie <li_j20@fudan.edu.cn>
Date: Wed, 4 Jun 2025 15:51:29 +0800
Subject: [PATCH 6/6] fix malloc error

---
 source/module_hamilt_pw/hamilt_pwdft/forces.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
index 4cade5d1ac..a4d779366b 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
@@ -556,7 +556,7 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
     }
     
     // calculate vloc_factors for all atom types
-    std::vector<FPTYPE> vloc_per_type_host(ucell.ntype * rho_basis->npw);
+    std::vector<FPTYPE> vloc_per_type_host(this->nat * rho_basis->npw);
     for (int iat = 0; iat < this->nat; iat++) {
         int it = ucell.iat2it[iat];
         for (int ig = 0; ig < rho_basis->npw; ig++) {
@@ -585,13 +585,13 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
         
         resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3);
         resmem_var_op()(this->ctx, d_tau, this->nat * 3);
-        resmem_var_op()(this->ctx, d_vloc_per_type, ucell.ntype * rho_basis->npw);
+        resmem_var_op()(this->ctx, d_vloc_per_type, this->nat * rho_basis->npw);
         resmem_complex_op()(this->ctx, d_aux, rho_basis->npw);
         resmem_var_op()(this->ctx, d_force, this->nat * 3);
         
         syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
         syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vloc_per_type, vloc_per_type_host.data(), ucell.ntype * rho_basis->npw);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vloc_per_type, vloc_per_type_host.data(), this->nat * rho_basis->npw);
         syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw);
         
         base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_force, 0.0, this->nat * 3);
@@ -609,7 +609,7 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
         this->ctx,
         this->nat,
         rho_basis->npw,
-        ucell.ntype,
+        this->nat,
         d_gcar,
         d_tau,
         d_vloc_per_type,