diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
index df29f88989..a4d779366b 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
@@ -15,6 +15,7 @@
 #include "module_hamilt_general/module_ewald/H_Ewald_pw.h"
 #include "module_hamilt_general/module_surchem/surchem.h"
 #include "module_hamilt_general/module_vdw/vdw.h"
+#include "kernels/force_op.h"
 #ifdef _OPENMP
 #include <omp.h>
@@ -531,31 +532,110 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
     // to G space. maybe need fftw with OpenMP
     rho_basis->real2recip(aux, aux);
 
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-    for (int iat = 0; iat < this->nat; ++iat)
-    {
-        // read `it` `ia` from the table
+    // sincos op for G space
+
+    // data preparation
+    std::vector<FPTYPE> tau_flat(this->nat * 3);
+    std::vector<FPTYPE> gcar_flat(rho_basis->npw * 3);
+
+    for (int iat = 0; iat < this->nat; iat++) {
+        int it = ucell.iat2it[iat];
+        int ia = ucell.iat2ia[iat];
+
+        tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][0]);
+        tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][1]);
+        tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][2]);
+    }
+
+    for (int ig = 0; ig < rho_basis->npw; ig++) {
+        gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
+        gcar_flat[ig * 3 + 1] = static_cast<FPTYPE>(rho_basis->gcar[ig][1]);
+        gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
+    }
+
+    // calculate vloc_factors for all atom types
+    std::vector<FPTYPE> vloc_per_type_host(this->nat * rho_basis->npw);
+    for (int iat = 0; iat < this->nat; iat++) {
         int it = ucell.iat2it[iat];
-        int ia = ucell.iat2ia[iat];
-        for (int ig = 0; ig < rho_basis->npw; ig++)
-        {
-            const double phase = ModuleBase::TWO_PI * (rho_basis->gcar[ig] * ucell.atoms[it].tau[ia]);
-            double sinp, cosp;
-            ModuleBase::libm::sincos(phase, &sinp, &cosp);
-            const double factor
-                = vloc(it, rho_basis->ig2igg[ig]) * (cosp * aux[ig].imag() + sinp * aux[ig].real());
-            forcelc(iat, 0) += rho_basis->gcar[ig][0] * factor;
-            forcelc(iat, 1) += rho_basis->gcar[ig][1] * factor;
-            forcelc(iat, 2) += rho_basis->gcar[ig][2] * factor;
+        for (int ig = 0; ig < rho_basis->npw; ig++) {
+            vloc_per_type_host[iat * rho_basis->npw + ig] = static_cast<FPTYPE>(vloc(it, rho_basis->ig2igg[ig]));
         }
-        forcelc(iat, 0) *= (ucell.tpiba * ucell.omega);
-        forcelc(iat, 1) *= (ucell.tpiba * ucell.omega);
-        forcelc(iat, 2) *= (ucell.tpiba * ucell.omega);
+    }
+
+    std::vector<std::complex<FPTYPE>> aux_fptype(rho_basis->npw);
+    for (int ig = 0; ig < rho_basis->npw; ig++) {
+        aux_fptype[ig] = static_cast<std::complex<FPTYPE>>(aux[ig]);
+    }
+
+    FPTYPE* d_gcar = gcar_flat.data();
+    FPTYPE* d_tau = tau_flat.data();
+    FPTYPE* d_vloc_per_type = vloc_per_type_host.data();
+    std::complex<FPTYPE>* d_aux = aux_fptype.data();
+    FPTYPE* d_force = nullptr;
+    std::vector<FPTYPE> force_host(this->nat * 3);
+
+    if (this->device == base_device::GpuDevice)
+    {
+        d_gcar = nullptr;
+        d_tau = nullptr;
+        d_vloc_per_type = nullptr;
+        d_aux = nullptr;
+
+        resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3);
+        resmem_var_op()(this->ctx, d_tau, this->nat * 3);
+        resmem_var_op()(this->ctx, d_vloc_per_type, this->nat * rho_basis->npw);
+        resmem_complex_op()(this->ctx, d_aux, rho_basis->npw);
+        resmem_var_op()(this->ctx, d_force, this->nat * 3);
+
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vloc_per_type, vloc_per_type_host.data(), this->nat * rho_basis->npw);
+        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw);
+
+        base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_force, 0.0, this->nat * 3);
+    }
+    else
+    {
+        d_force = force_host.data();
+        std::fill(force_host.begin(), force_host.end(), static_cast<FPTYPE>(0.0));
+    }
+
+    const FPTYPE scale_factor = static_cast<FPTYPE>(ucell.tpiba * ucell.omega);
+
+    // call op for sincos calculation
+    hamilt::cal_force_loc_sincos_op<FPTYPE, Device>()(
+        this->ctx,
+        this->nat,
+        rho_basis->npw,
+        this->nat,
+        d_gcar,
+        d_tau,
+        d_vloc_per_type,
+        d_aux,
+        scale_factor,
+        d_force
+    );
+
+    if (this->device == base_device::GpuDevice)
+    {
+        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_host.data(), d_force, this->nat * 3);
+
+        delmem_var_op()(this->ctx, d_gcar);
+        delmem_var_op()(this->ctx, d_tau);
+        delmem_var_op()(this->ctx, d_vloc_per_type);
+        delmem_complex_op()(this->ctx, d_aux);
+        delmem_var_op()(this->ctx, d_force);
+    }
+
+    for (int iat = 0; iat < this->nat; iat++) {
+        forcelc(iat, 0) = static_cast<double>(force_host[iat * 3 + 0]);
+        forcelc(iat, 1) = static_cast<double>(force_host[iat * 3 + 1]);
+        forcelc(iat, 2) = static_cast<double>(force_host[iat * 3 + 2]);
     }
-    // this->print(GlobalV::ofs_running, "local forces", forcelc);
     Parallel_Reduce::reduce_pool(forcelc.c, forcelc.nr * forcelc.nc);
     delete[] aux;
     ModuleBase::timer::tick("Forces", "cal_force_loc");
@@ -665,6 +745,119 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         aux[rho_basis->ig_gge0] = std::complex<double>(0.0, 0.0);
     }
 
+    // sincos op for cal_force_ew
+
+    std::vector<FPTYPE> it_facts_host(this->nat);
+    std::vector<FPTYPE> tau_flat(this->nat * 3);
+
+    // iterate over by lookup table
+    for (int iat = 0; iat < this->nat; iat++) {
+        int it = ucell.iat2it[iat];
+        int ia = ucell.iat2ia[iat];
+
+        double zv;
+        if (PARAM.inp.use_paw)
+        {
+#ifdef USE_PAW
+            zv = GlobalC::paw_cell.get_val(it);
+#endif
+        }
+        else
+        {
+            zv = ucell.atoms[it].ncpp.zv;
+        }
+
+        it_facts_host[iat] = static_cast<FPTYPE>(zv * ModuleBase::e2 * ucell.tpiba *
+                                                 ModuleBase::TWO_PI / ucell.omega * fact);
+
+        tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][0]);
+        tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][1]);
+        tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][2]);
+    }
+
+    std::vector<FPTYPE> gcar_flat(rho_basis->npw * 3);
+    for (int ig = 0; ig < rho_basis->npw; ig++) {
+        gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
+        gcar_flat[ig * 3 + 1] = static_cast<FPTYPE>(rho_basis->gcar[ig][1]);
+        gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
+    }
+
+    std::vector<std::complex<FPTYPE>> aux_fptype(rho_basis->npw);
+    for (int ig = 0; ig < rho_basis->npw; ig++) {
+        aux_fptype[ig] = static_cast<std::complex<FPTYPE>>(aux[ig]);
+    }
+
+    FPTYPE* d_gcar = gcar_flat.data();
+    FPTYPE* d_tau = tau_flat.data();
+    FPTYPE* d_it_facts = it_facts_host.data();
+    std::complex<FPTYPE>* d_aux = aux_fptype.data();
+    FPTYPE* d_force_g = nullptr;
+    std::vector<FPTYPE> force_g_host(this->nat * 3);
+
+    if (this->device == base_device::GpuDevice)
+    {
+        d_gcar = nullptr;
+        d_tau = nullptr;
+        d_it_facts = nullptr;
+        d_aux = nullptr;
+
+        resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3);
+        resmem_var_op()(this->ctx, d_tau, this->nat * 3);
+        resmem_var_op()(this->ctx, d_it_facts, this->nat);
+        resmem_complex_op()(this->ctx, d_aux, rho_basis->npw);
+        resmem_var_op()(this->ctx, d_force_g, this->nat * 3);
+
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_it_facts, it_facts_host.data(), this->nat);
+        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw);
+
+        base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_force_g, 0.0, this->nat * 3);
+    }
+    else
+    {
+        d_force_g = force_g_host.data();
+        std::fill(force_g_host.begin(), force_g_host.end(), static_cast<FPTYPE>(0.0));
+    }
+
+    // call op for sincos calculation
+    hamilt::cal_force_ew_sincos_op<FPTYPE, Device>()(
+        this->ctx,
+        this->nat,
+        rho_basis->npw,
+        rho_basis->ig_gge0,
+        d_gcar,
+        d_tau,
+        d_it_facts,
+        d_aux,
+        d_force_g
+    );
+
+    if (this->device == base_device::GpuDevice)
+    {
+        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_g_host.data(), d_force_g, this->nat * 3);
+
+        delmem_var_op()(this->ctx, d_gcar);
+        delmem_var_op()(this->ctx, d_tau);
+        delmem_var_op()(this->ctx, d_it_facts);
+        delmem_complex_op()(this->ctx, d_aux);
+        delmem_var_op()(this->ctx, d_force_g);
+    }
+
+    for (int iat = 0; iat < this->nat; iat++) {
+        forceion(iat, 0) += static_cast<double>(force_g_host[iat * 3 + 0]);
+        forceion(iat, 1) += static_cast<double>(force_g_host[iat * 3 + 1]);
+        forceion(iat, 2) += static_cast<double>(force_g_host[iat * 3 + 2]);
+    }
+
+// calculate real space force
 #ifdef _OPENMP
 #pragma omp parallel
 {
@@ -688,66 +881,7 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         iat_end = iat_beg + iat_end;
         ucell.iat2iait(iat_beg, &ia_beg, &it_beg);
 
-        int iat = iat_beg;
-        int it = it_beg;
-        int ia = ia_beg;
-
-        // preprocess ig_gap for skipping the ig point
-        int ig_gap = (rho_basis->ig_gge0 >= 0 && rho_basis->ig_gge0 < rho_basis->npw) ? rho_basis->ig_gge0 : -1;
-
-        double it_fact = 0.;
-        int last_it = -1;
-
-        // iterating atoms
-        while (iat < iat_end)
-        {
-            if (it != last_it)
-            { // calculate it_tact when it is changed
-                double zv;
-                if (PARAM.inp.use_paw)
-                {
-#ifdef USE_PAW
-                    zv = GlobalC::paw_cell.get_val(it);
-#endif
-                }
-                else
-                {
-                    zv = ucell.atoms[it].ncpp.zv;
-                }
-                it_fact = zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact;
-                last_it = it;
-            }
-
-            if (ucell.atoms[it].na != 0)
-            {
-                const auto ig_loop = [&](int ig_beg, int ig_end) {
-                    for (int ig = ig_beg; ig < ig_end; ig++)
-                    {
-                        const ModuleBase::Vector3<double> gcar = rho_basis->gcar[ig];
-                        const double arg = ModuleBase::TWO_PI * (gcar * ucell.atoms[it].tau[ia]);
-                        double sinp, cosp;
-                        ModuleBase::libm::sincos(arg, &sinp, &cosp);
-                        double sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real();
-                        forceion(iat, 0) += gcar[0] * sumnb;
-                        forceion(iat, 1) += gcar[1] * sumnb;
-                        forceion(iat, 2) += gcar[2] * sumnb;
-                    }
-                };
-
-                // skip ig_gge0 point by separating ig loop into two part
-                ig_loop(0, ig_gap);
-                ig_loop(ig_gap + 1, rho_basis->npw);
-
-                forceion(iat, 0) *= it_fact;
-                forceion(iat, 1) *= it_fact;
-                forceion(iat, 2) *= it_fact;
-
-                ++iat;
-                ucell.step_iait(&ia, &it);
-            }
-        }
-
-        // means that the processor contains G=0 term.
+        // means that the processor contains G=0 term.
         if (rho_basis->ig_gge0 >= 0)
         {
             double rmax = 5.0 / (sqrt(alpha) * ucell.lat0);
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
index 5d0656d105..5ff3ddf2b1 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
@@ -13,6 +13,122 @@
 namespace hamilt
 {
 
+// CUDA kernels for sincos loops
+template <typename FPTYPE>
+__global__ void cal_force_loc_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ntype,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* vloc_per_type,
+    const thrust::complex<FPTYPE>* aux,
+    const FPTYPE scale_factor,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+
+    if (iat >= nat) return;
+
+    // Load atom information to registers
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x +
+                                       gcar[ig * 3 + 1] * tau_y +
+                                       gcar[ig * 3 + 2] * tau_z);
+
+        // Use CUDA intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+
+        // Calculate force factor
+        const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig];
+        const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor);
+    atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor);
+    atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor);
+}
+
+template <typename FPTYPE>
+__global__ void cal_force_ew_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* it_facts,
+    const thrust::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+
+    if (iat >= nat) return;
+
+    // Load atom information to registers
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    const FPTYPE it_fact = it_facts[iat];
+
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Skip G=0 term
+        if (ig == ig_gge0) continue;
+
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x +
+                                       gcar[ig * 3 + 1] * tau_y +
+                                       gcar[ig * 3 + 2] * tau_z);
+
+        // Use CUDA intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+
+        // Calculate Ewald sum contribution (fixed sign error)
+        const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real());
+
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x);
+    atomicAdd(&force[iat * 3 + 1], local_force_y);
+    atomicAdd(&force[iat * 3 + 2], local_force_z);
+}
 
 template <typename FPTYPE>
 __global__ void cal_vkb1_nl(
@@ -188,6 +304,65 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
     cudaCheckOnDebug();
 }
 
+// GPU operators
+template <typename FPTYPE>
+void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ntype,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* vloc_per_type,
+    const std::complex<FPTYPE>* aux,
+    const FPTYPE& scale_factor,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration for GPU load balancing
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
+    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
+
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+
+    cal_force_loc_sincos_kernel<FPTYPE><<<grid, block>>>(
+        nat, npw, ntype, gcar, tau, vloc_per_type,
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
+        scale_factor, force
+    );
+
+    cudaCheckOnDebug();
+}
+
+template <typename FPTYPE>
+void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* it_facts,
+    const std::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration for GPU load balancing
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
+    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
+
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+
+    cal_force_ew_sincos_kernel<FPTYPE><<<grid, block>>>(
+        nat, npw, ig_gge0, gcar, tau, it_facts,
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux), force
+    );
+
+    cudaCheckOnDebug();
+}
+
 template <typename FPTYPE>
 __global__ void cal_force_nl(
     const int ntype,
@@ -613,8 +788,12 @@ template void saveVkbValues<double>(const int *gcar_zero_ptrs, const std::complex<double> *vkb_ptr, std::complex<double> *vkb_save_ptr, int nkb, int gcar_zero_count, int npw, int ipol, int npwx);
 
 template struct cal_vkb1_nl_op<float, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_loc_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_ew_sincos_op<float, base_device::DEVICE_GPU>;
 template struct cal_vkb1_nl_op<double, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<double, base_device::DEVICE_GPU>;
+template struct cal_force_loc_sincos_op<double, base_device::DEVICE_GPU>;
+template struct cal_force_ew_sincos_op<double, base_device::DEVICE_GPU>;
 
 } // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
index 6d797e147d..109321d6c3 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
@@ -1,4 +1,8 @@
 #include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h"
+#include "module_base/libm/libm.h"
+#include "module_base/tool_threading.h"
+#include "module_hamilt_general/module_ewald/H_Ewald_pw.h"
+#include <cmath>
 
 #ifdef _OPENMP
 #include <omp.h>
@@ -424,10 +428,116 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
     }
 };
 
+// CPU implementation of local force sincos operator
+template <typename FPTYPE>
+struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU>
+{
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ntype,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* vloc_per_type,
+                    const std::complex<FPTYPE>* aux,
+                    const FPTYPE& scale_factor,
+                    FPTYPE* force)
+    {
+        const FPTYPE TWO_PI = 2.0 * M_PI;
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+        for (int iat = 0; iat < nat; ++iat)
+        {
+            const FPTYPE tau_x = tau[iat * 3 + 0];
+            const FPTYPE tau_y = tau[iat * 3 + 1];
+            const FPTYPE tau_z = tau[iat * 3 + 2];
+
+            FPTYPE local_force[3] = {0.0, 0.0, 0.0};
+
+            for (int ig = 0; ig < npw; ig++)
+            {
+                const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x +
+                                               gcar[ig * 3 + 1] * tau_y +
+                                               gcar[ig * 3 + 2] * tau_z);
+                FPTYPE sinp, cosp;
+                ModuleBase::libm::sincos(phase, &sinp, &cosp);
+
+                const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig];
+                const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()) * scale_factor;
+
+                local_force[0] += gcar[ig * 3 + 0] * factor;
+                local_force[1] += gcar[ig * 3 + 1] * factor;
+                local_force[2] += gcar[ig * 3 + 2] * factor;
+            }
+
+            force[iat * 3 + 0] = local_force[0];
+            force[iat * 3 + 1] = local_force[1];
+            force[iat * 3 + 2] = local_force[2];
+        }
+    }
+};
+
+// CPU implementation of Ewald force sincos operator
+template <typename FPTYPE>
+struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_CPU>
+{
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* it_facts,
+                    const std::complex<FPTYPE>* aux,
+                    FPTYPE* force)
+    {
+        const FPTYPE TWO_PI = 2.0 * M_PI;
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+        for (int iat = 0; iat < nat; ++iat)
+        {
+            const FPTYPE tau_x = tau[iat * 3 + 0];
+            const FPTYPE tau_y = tau[iat * 3 + 1];
+            const FPTYPE tau_z = tau[iat * 3 + 2];
+            const FPTYPE it_fact = it_facts[iat];
+
+            FPTYPE local_force[3] = {0.0, 0.0, 0.0};
+
+            for (int ig = 0; ig < npw; ig++)
+            {
+                // Skip G=0 term
+                if (ig == ig_gge0) continue;
+
+                const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x +
+                                               gcar[ig * 3 + 1] * tau_y +
+                                               gcar[ig * 3 + 2] * tau_z);
+                FPTYPE sinp, cosp;
+                ModuleBase::libm::sincos(phase, &sinp, &cosp);
+
+                const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real());
+
+                local_force[0] += gcar[ig * 3 + 0] * factor;
+                local_force[1] += gcar[ig * 3 + 1] * factor;
+                local_force[2] += gcar[ig * 3 + 2] * factor;
+            }
+
+            force[iat * 3 + 0] = local_force[0];
+            force[iat * 3 + 1] = local_force[1];
+            force[iat * 3 + 2] = local_force[2];
+        }
+    }
+};
+
 template struct cal_vkb1_nl_op<float, base_device::DEVICE_CPU>;
 template struct cal_force_nl_op<float, base_device::DEVICE_CPU>;
-
+template struct cal_force_loc_sincos_op<float, base_device::DEVICE_CPU>;
+template struct cal_force_ew_sincos_op<float, base_device::DEVICE_CPU>;
 template struct cal_vkb1_nl_op<double, base_device::DEVICE_CPU>;
 template struct cal_force_nl_op<double, base_device::DEVICE_CPU>;
-
+template struct cal_force_loc_sincos_op<double, base_device::DEVICE_CPU>;
+template struct cal_force_ew_sincos_op<double, base_device::DEVICE_CPU>;
 } // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
index 3aa5d4f87e..acf490b278 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
@@ -146,6 +146,64 @@ struct cal_force_nl_op
         const FPTYPE* lambda,
         const std::complex<FPTYPE>* becp,
         const std::complex<FPTYPE>* dbecp,
+        FPTYPE* force);
+};
+
+template <typename FPTYPE, typename Device>
+struct cal_force_loc_sincos_op
+{
+    /// @brief Calculate local pseudopotential forces (sincos loop only)
+    ///
+    /// Input Parameters
+    /// @param ctx - which device this function runs on
+    /// @param nat - number of atoms
+    /// @param npw - number of plane waves
+    /// @param ntype - number of atom types
+    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
+    /// @param tau - atomic positions [nat * 3]
+    /// @param vloc_per_type - precomputed vloc factors per atom [nat * npw]
+    /// @param aux - charge density in G-space [npw]
+    /// @param scale_factor - tpiba * omega
+    ///
+    /// Output Parameters
+    /// @param force - output forces [nat * 3]
+    void operator()(const Device* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ntype,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* vloc_per_type,
+                    const std::complex<FPTYPE>* aux,
+                    const FPTYPE& scale_factor,
+                    FPTYPE* force);
+};
+
+template <typename FPTYPE, typename Device>
+struct cal_force_ew_sincos_op
+{
+    /// @brief Calculate Ewald forces (sincos loop only)
+    ///
+    /// Input Parameters
+    /// @param ctx - which device this function runs on
+    /// @param nat - number of atoms
+    /// @param npw - number of plane waves
+    /// @param ig_gge0 - index of G=0 vector (-1 if not present)
+    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
+    /// @param tau - atomic positions [nat * 3]
+    /// @param it_facts - precomputed it_fact for each atom [nat]
+    /// @param aux - structure factor related array [npw]
+    ///
+    /// Output Parameters
+    /// @param force - output forces [nat * 3]
+    void operator()(const Device* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* it_facts,
+                    const std::complex<FPTYPE>* aux,
         FPTYPE* force);
 };
 
@@ -248,6 +306,35 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>
         FPTYPE* force);
 };
 
+template <typename FPTYPE>
+struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>
+{
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ntype,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* vloc_per_type,
+                    const std::complex<FPTYPE>* aux,
+                    const FPTYPE& scale_factor,
+                    FPTYPE* force);
+};
+
+template <typename FPTYPE>
+struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>
+{
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* it_facts,
+                    const std::complex<FPTYPE>* aux,
+                    FPTYPE* force);
+};
+
 /**
  * @brief revert the vkb values for force_nl calculation
  */
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
index c78b333b86..6bb3a84e7e 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
@@ -618,10 +618,190 @@ template void revertVkbValues<double>(const int *gcar_zero_ptrs, std::complex<double> *vkb_ptr,
 template void saveVkbValues<double>(const int *gcar_zero_ptrs, const std::complex<double> *vkb_ptr, std::complex<double> *vkb_save_ptr, int nkb, int gcar_zero_count, int npw, int ipol, int npwx);
 
+// HIP kernels for sincos loops
+template <typename FPTYPE>
+__global__ void cal_force_loc_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ntype,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* vloc_per_type,
+    const thrust::complex<FPTYPE>* aux,
+    const FPTYPE scale_factor,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+
+    if (iat >= nat) return;
+
+    // Load atom information to registers
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x +
+                                       gcar[ig * 3 + 1] * tau_y +
+                                       gcar[ig * 3 + 2] * tau_z);
+
+        // Use HIP intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+
+        // Calculate force factor
+        const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig];
+        const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor);
+    atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor);
+    atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor);
+}
+
+template <typename FPTYPE>
+__global__ void cal_force_ew_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* it_facts,
+    const thrust::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+
+    const int iat = blockIdx.y;
+    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
+    const int ig_stride = gridDim.x * blockDim.x;
+
+    if (iat >= nat) return;
+
+    // Load atom information to registers
+    const FPTYPE tau_x = tau[iat * 3 + 0];
+    const FPTYPE tau_y = tau[iat * 3 + 1];
+    const FPTYPE tau_z = tau[iat * 3 + 2];
+    const FPTYPE it_fact = it_facts[iat];
+
+    // Local accumulation variables
+    FPTYPE local_force_x = 0.0;
+    FPTYPE local_force_y = 0.0;
+    FPTYPE local_force_z = 0.0;
+
+    // Grid-stride loop over G-vectors
+    for (int ig = ig_start; ig < npw; ig += ig_stride) {
+        // Skip G=0 term
+        if (ig == ig_gge0) continue;
+
+        // Calculate phase: 2π * (G · τ)
+        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x +
+                                       gcar[ig * 3 + 1] * tau_y +
+                                       gcar[ig * 3 + 2] * tau_z);
+
+        // Use HIP intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+
+        // Calculate Ewald sum contribution (fixed sign error)
+        const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real());
+
+        // Accumulate force contributions
+        local_force_x += gcar[ig * 3 + 0] * factor;
+        local_force_y += gcar[ig * 3 + 1] * factor;
+        local_force_z += gcar[ig * 3 + 2] * factor;
+    }
+
+    // Atomic add to global memory
+    atomicAdd(&force[iat * 3 + 0], local_force_x);
+    atomicAdd(&force[iat * 3 + 1], local_force_y);
+    atomicAdd(&force[iat * 3 + 2], local_force_z);
+}
+
+// GPU operators
+template <typename FPTYPE>
+void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ntype,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* vloc_per_type,
+    const std::complex<FPTYPE>* aux,
+    const FPTYPE& scale_factor,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration for GPU load balancing
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
+    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
+
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+
+    hipLaunchKernelGGL(cal_force_loc_sincos_kernel<FPTYPE>,
+                       grid, block, 0, 0,
+                       nat, npw, ntype, gcar, tau, vloc_per_type,
+                       reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
+                       scale_factor, force);
+
+    hipCheckOnDebug();
+}
+
+template <typename FPTYPE>
+void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* it_facts,
+    const std::complex<FPTYPE>* aux,
+    FPTYPE* force)
+{
+    // Calculate optimal grid configuration for GPU load balancing
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
+    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
+
+    dim3 grid(max_blocks_x, nat);
+    dim3 block(threads_per_block);
+
+    hipLaunchKernelGGL(cal_force_ew_sincos_kernel<FPTYPE>,
+                       grid, block, 0, 0,
+                       nat, npw, ig_gge0, gcar, tau, it_facts,
+                       reinterpret_cast<const thrust::complex<FPTYPE>*>(aux), force);
+
+    hipCheckOnDebug();
+}
+
 template struct cal_vkb1_nl_op<float, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_loc_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_ew_sincos_op<float, base_device::DEVICE_GPU>;
 template struct cal_vkb1_nl_op<double, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<double, base_device::DEVICE_GPU>;
+template struct cal_force_loc_sincos_op<double, base_device::DEVICE_GPU>;
+template struct cal_force_ew_sincos_op<double, base_device::DEVICE_GPU>;
 
 } // namespace hamilt