diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
index 532bf7e561..3454a65963 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
@@ -22,7 +22,7 @@
 #include "module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.h"
 
 template <typename T>
-Force_Stress_LCAO<T>::Force_Stress_LCAO(Record_adj& ra, const int nat_in) : RA(&ra), f_pw(nat_in), nat(nat_in)
+Force_Stress_LCAO<T>::Force_Stress_LCAO(Record_adj& ra, const int nat_in) : RA(&ra), nat(nat_in)
 {
 }
 template <typename T>
@@ -861,24 +861,39 @@ void Force_Stress_LCAO<T>::calForcePwPart(UnitCell& ucell,
                                           const Structure_Factor& sf)
 {
     ModuleBase::TITLE("Force_Stress_LCAO", "calForcePwPart");
-    //--------------------------------------------------------
-    // local pseudopotential force:
-    // use charge density; plane wave; local pseudopotential;
-    //--------------------------------------------------------
-    f_pw.cal_force_loc(ucell, fvl_dvl, rhopw, locpp.vloc, chr);
-    //--------------------------------------------------------
-    // ewald force: use plane wave only.
-    //--------------------------------------------------------
-    f_pw.cal_force_ew(ucell, fewalds, rhopw, &sf); // remain problem
+#ifdef __CUDA
+    if(PARAM.inp.device == "gpu")
+    {
+        Forces<double, base_device::DEVICE_GPU> f_pw(nat);
+
+        //--------------------------------------------------------
+        // local pseudopotential force:
+        // use charge density; plane wave; local pseudopotential;
+        //--------------------------------------------------------
+        f_pw.cal_force_loc(ucell, fvl_dvl, rhopw, locpp.vloc, chr);
+        //--------------------------------------------------------
+        // ewald force: use plane wave only.
+        //--------------------------------------------------------
+        f_pw.cal_force_ew(ucell, fewalds, rhopw, &sf); // remain problem
+
+        //--------------------------------------------------------
+        // force due to core correlation.
+        //--------------------------------------------------------
+        f_pw.cal_force_cc(fcc, rhopw, chr, locpp.numeric, ucell);
+        //--------------------------------------------------------
+        // force due to self-consistent charge.
+        //--------------------------------------------------------
+        f_pw.cal_force_scc(fscc, rhopw, vnew, vnew_exist, locpp.numeric, ucell);
+    } else
+#endif
+    {
+        Forces<double, base_device::DEVICE_CPU> f_pw(nat);
+        f_pw.cal_force_loc(ucell, fvl_dvl, rhopw, locpp.vloc, chr);
+        f_pw.cal_force_ew(ucell, fewalds, rhopw, &sf); // remain problem
+        f_pw.cal_force_cc(fcc, rhopw, chr, locpp.numeric, ucell);
+        f_pw.cal_force_scc(fscc, rhopw, vnew, vnew_exist, locpp.numeric, ucell);
+    }
 
-    //--------------------------------------------------------
-    // force due to core correlation.
-    //--------------------------------------------------------
-    f_pw.cal_force_cc(fcc, rhopw, chr, locpp.numeric, ucell);
-    //--------------------------------------------------------
-    // force due to self-consistent charge.
-    //--------------------------------------------------------
-    f_pw.cal_force_scc(fscc, rhopw, vnew, vnew_exist, locpp.numeric, ucell);
     return;
 }
 
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.h b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.h
index a11add920e..b55caa7935 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.h
@@ -60,7 +60,6 @@ class Force_Stress_LCAO
     Record_adj* RA;
     Force_LCAO<T> flk;
     Stress_Func<double> sc_pw;
-    Forces<double> f_pw;
 
     void forceSymmetry(const UnitCell& ucell, 
                        ModuleBase::matrix& fcs, 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
index a4d779366b..ecdf746019 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
@@ -20,15 +20,13 @@
 #ifdef _OPENMP
 #include <omp.h>
 #endif
-#ifdef USE_PAW
-#include "module_cell/module_paw/paw_cell.h"
-#endif
+
 
 template <typename FPTYPE, typename Device>
 void Forces<FPTYPE, Device>::cal_force(UnitCell& ucell,
                                        ModuleBase::matrix& force,
                                        const elecstate::ElecState& elec,
-                                       ModulePW::PW_Basis* rho_basis,
+                                       const ModulePW::PW_Basis* const rho_basis,
                                        ModuleSymmetry::Symmetry* p_symm,
                                        Structure_Factor* p_sf,
                                        surchem& solvent,
@@ -55,15 +53,7 @@ void Forces<FPTYPE, Device>::cal_force(UnitCell& ucell,
     ModuleBase::matrix forceonsite(nat, 3);
 
     // Force due to local ionic potential
-    // For PAW, calculated together in paw_cell.calculate_force
-    if (!PARAM.inp.use_paw)
-    {
-        this->cal_force_loc(ucell,forcelc, rho_basis, locpp->vloc, chr);
-    }
-    else
-    {
-        forcelc.zero_out();
-    }
+    this->cal_force_loc(ucell,forcelc, rho_basis, locpp->vloc, chr);
 
     // Ewald
     this->cal_force_ew(ucell,forceion, rho_basis, p_sf);
@@ -71,94 +61,15 @@ void Forces<FPTYPE, Device>::cal_force(UnitCell& ucell,
     // Force due to nonlocal part of pseudopotential
     if (wfc_basis != nullptr)
     {
-        if (!PARAM.inp.use_paw)
-        {
-            this->npwx = wfc_basis->npwk_max;
-            Forces::cal_force_nl(forcenl, wg, ekb, pkv, wfc_basis, p_sf, *p_nlpp, ucell, psi_in);
 
-            if (PARAM.globalv.use_uspp)
-            {
-                this->cal_force_us(forcenl, rho_basis, *p_nlpp, elec, ucell);
-            }
-        }
-        else
-        {
-#ifdef USE_PAW
-            for (int ik = 0; ik < wfc_basis->nks; ik++)
-            {
-                const int npw = wfc_basis->npwk[ik];
-                ModuleBase::Vector3<double>* _gk = new ModuleBase::Vector3<double>[npw];
-                for (int ig = 0; ig < npw; ig++)
-                {
-                    _gk[ig] = wfc_basis->getgpluskcar(ik, ig);
-                }
+        this->npwx = wfc_basis->npwk_max;
+        Forces::cal_force_nl(forcenl, wg, ekb, pkv, wfc_basis, p_sf, *p_nlpp, ucell, psi_in);
 
-                double* kpt;
-                kpt = new double[3];
-                kpt[0] = wfc_basis->kvec_c[ik].x;
-                kpt[1] = wfc_basis->kvec_c[ik].y;
-                kpt[2] = wfc_basis->kvec_c[ik].z;
-
-                double** kpg;
-                double** gcar;
-                kpg = new double*[npw];
-                gcar = new double*[npw];
-                for (int ipw = 0; ipw < npw; ipw++)
-                {
-                    kpg[ipw] = new double[3];
-                    kpg[ipw][0] = _gk[ipw].x;
-                    kpg[ipw][1] = _gk[ipw].y;
-                    kpg[ipw][2] = _gk[ipw].z;
-
-                    gcar[ipw] = new double[3];
-                    gcar[ipw][0] = wfc_basis->getgcar(ik, ipw).x;
-                    gcar[ipw][1] = wfc_basis->getgcar(ik, ipw).y;
-                    gcar[ipw][2] = wfc_basis->getgcar(ik, ipw).z;
-                }
-
-                GlobalC::paw_cell.set_paw_k(npw,
-                                            wfc_basis->npwk_max,
-                                            kpt,
-                                            wfc_basis->get_ig2ix(ik).data(),
-                                            wfc_basis->get_ig2iy(ik).data(),
-                                            wfc_basis->get_ig2iz(ik).data(),
-                                            (const double**)kpg,
-                                            ucell.tpiba,
-                                            (const double**)gcar);
-
-                delete[] kpt;
-                for (int ipw = 0; ipw < npw; ipw++)
-                {
-                    delete[] kpg[ipw];
-                    delete[] gcar[ipw];
-                }
-                delete[] kpg;
-                delete[] gcar;
-
-                GlobalC::paw_cell.get_vkb();
-
-                GlobalC::paw_cell.set_currentk(ik);
-
-                psi_in[0].fix_k(ik);
-                double *weight, *epsilon;
-                weight = new double[PARAM.inp.nbands];
-                epsilon = new double[PARAM.inp.nbands];
-                for (int ib = 0; ib < PARAM.inp.nbands; ib++)
-                {
-                    weight[ib] = wg(ik, ib);
-                    epsilon[ib] = ekb(ik, ib);
-                }
-                GlobalC::paw_cell.paw_nl_force(reinterpret_cast<std::complex<double>*>(psi_in[0].get_pointer()),
-                                               epsilon,
-                                               weight,
-                                               PARAM.inp.nbands,
-                                               forcenl.c);
-
-                delete[] weight;
-                delete[] epsilon;
-            }
-#endif
+        if (PARAM.globalv.use_uspp)
+        {
+            this->cal_force_us(forcenl, rho_basis, *p_nlpp, elec, ucell);
         }
+
         // DFT+U and DeltaSpin
         if(PARAM.inp.dft_plus_u || PARAM.inp.sc_mag_switch)
         {
@@ -167,26 +78,10 @@ void Forces<FPTYPE, Device>::cal_force(UnitCell& ucell,
     }
 
     // non-linear core correction
-    // not relevant for PAW
-    if (!PARAM.inp.use_paw)
-    {
-        Forces::cal_force_cc(forcecc, rho_basis, chr, locpp->numeric, ucell);
-    }
-    else
-    {
-        forcecc.zero_out();
-    }
+    Forces::cal_force_cc(forcecc, rho_basis, chr, locpp->numeric, ucell);
 
     // force due to core charge
-    // For PAW, calculated together in paw_cell.calculate_force
-    if (!PARAM.inp.use_paw)
-    {
-        this->cal_force_scc(forcescc, rho_basis, elec.vnew, elec.vnew_exist, locpp->numeric, ucell);
-    }
-    else
-    {
-        forcescc.zero_out();
-    }
+    this->cal_force_scc(forcescc, rho_basis, elec.vnew, elec.vnew_exist, locpp->numeric, ucell);
 
     ModuleBase::matrix stress_vdw_pw; //.create(3,3);
     ModuleBase::matrix force_vdw;
@@ -240,52 +135,6 @@ void Forces<FPTYPE, Device>::cal_force(UnitCell& ucell,
         }
     }
 
-#ifdef USE_PAW
-    if (PARAM.inp.use_paw)
-    {
-        double* force_paw;
-        double* rhor;
-        rhor = new double[rho_basis->nrxx];
-        for (int ir = 0; ir < rho_basis->nrxx; ir++)
-        {
-            rhor[ir] = 0.0;
-        }
-        for (int is = 0; is < PARAM.inp.nspin; is++)
-        {
-            for (int ir = 0; ir < rho_basis->nrxx; ir++)
-            {
-                rhor[ir] += chr->rho[is][ir] + chr->nhat[is][ir];
-            }
-        }
-
-        force_paw = new double[3 * this->nat];
-        ModuleBase::matrix v_xc, v_effective;
-        v_effective.create(PARAM.inp.nspin, rho_basis->nrxx);
-        v_effective.zero_out();
-        elec.pot->update_from_charge(elec.charge, &ucell);
-        v_effective = elec.pot->get_effective_v();
-
-        v_xc.create(PARAM.inp.nspin, rho_basis->nrxx);
-        v_xc.zero_out();
-        const std::tuple<double, double, ModuleBase::matrix> etxc_vtxc_v
-            = XC_Functional::v_xc(rho_basis->nrxx, elec.charge, &ucell);
-        v_xc = std::get<2>(etxc_vtxc_v);
-
-        GlobalC::paw_cell.calculate_force(v_effective.c, v_xc.c, rhor, force_paw);
-
-        for (int iat = 0; iat < this->nat; iat++)
-        {
-            // Ha to Ry
-            forcepaw(iat, 0) = force_paw[3 * iat] * 2.0;
-            forcepaw(iat, 1) = force_paw[3 * iat + 1] * 2.0;
-            forcepaw(iat, 2) = force_paw[3 * iat + 2] * 2.0;
-        }
-
-        delete[] force_paw;
-        delete[] rhor;
-    }
-#endif
-
     // impose total force = 0
     int iat = 0;
     for (int ipol = 0; ipol < 3; ipol++)
@@ -300,11 +149,6 @@ void Forces<FPTYPE, Device>::cal_force(UnitCell& ucell,
                 force(iat, ipol) = forcelc(iat, ipol) + forceion(iat, ipol) + forcenl(iat, ipol) + forcecc(iat, ipol)
                                    + forcescc(iat, ipol);
 
-                if (PARAM.inp.use_paw)
-                {
-                    force(iat, ipol) += forcepaw(iat, ipol);
-                }
-
                 if (vdw_solver != nullptr) // linpz and jiyy added vdw force, modified by zhengdy
                 {
                     force(iat, ipol) += force_vdw(iat, ipol);
@@ -442,14 +286,7 @@ void Forces<FPTYPE, Device>::cal_force(UnitCell& ucell,
         ModuleIO::print_force(GlobalV::ofs_running, ucell, "NLCC     FORCE (eV/Angstrom)", forcecc, false);
         ModuleIO::print_force(GlobalV::ofs_running, ucell, "ION      FORCE (eV/Angstrom)", forceion, false);
         ModuleIO::print_force(GlobalV::ofs_running, ucell, "SCC      FORCE (eV/Angstrom)", forcescc, false);
-        if (PARAM.inp.use_paw)
-        {
-            ModuleIO::print_force(GlobalV::ofs_running,
-                                  ucell,
-                                  "PAW      FORCE (eV/Angstrom)",
-                                  forcepaw,
-                                  false);
-        }
+
         if (PARAM.inp.efield_flag)
         {
             ModuleIO::print_force(GlobalV::ofs_running, ucell, "EFIELD   FORCE (eV/Angstrom)", force_e, false);
@@ -487,13 +324,13 @@ void Forces<FPTYPE, Device>::cal_force(UnitCell& ucell,
 template <typename FPTYPE, typename Device>
 void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
                                            ModuleBase::matrix& forcelc,
-                                           ModulePW::PW_Basis* rho_basis,
+                                           const ModulePW::PW_Basis* const rho_basis,
                                            const ModuleBase::matrix& vloc,
                                            const Charge* const chr)
 {
     ModuleBase::TITLE("Forces", "cal_force_loc");
     ModuleBase::timer::tick("Forces", "cal_force_loc");
-
+    this->device = base_device::get_device_type<Device>(this->ctx);
     std::complex<double>* aux = new std::complex<double>[rho_basis->nmaxgr];
     // now, in all pools , the charge are the same,
     // so, the force calculated by each pool is equal.
@@ -532,110 +369,100 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
     // to G space. maybe need fftw with OpenMP
     rho_basis->real2recip(aux, aux);
 
-    // sincos op for G space
-    
-    
-    // data preparation
-    std::vector<FPTYPE> tau_flat(this->nat * 3);
-    std::vector<FPTYPE> gcar_flat(rho_basis->npw * 3);
-    
-    
-    for (int iat = 0; iat < this->nat; iat++) {
-        int it = ucell.iat2it[iat];  
-        int ia = ucell.iat2ia[iat];  
-        
-        tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][0]);
-        tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][1]);
-        tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][2]);
-    }
-    
-    for (int ig = 0; ig < rho_basis->npw; ig++) {
-        gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
-        gcar_flat[ig * 3 + 1] = static_cast<FPTYPE>(rho_basis->gcar[ig][1]);
-        gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
-    }
-    
-    // calculate vloc_factors for all atom types
-    std::vector<FPTYPE> vloc_per_type_host(this->nat * rho_basis->npw);
-    for (int iat = 0; iat < this->nat; iat++) {
-        int it = ucell.iat2it[iat];
-        for (int ig = 0; ig < rho_basis->npw; ig++) {
-            vloc_per_type_host[iat * rho_basis->npw + ig] = static_cast<FPTYPE>(vloc(it, rho_basis->ig2igg[ig]));
-        }
-    }
-    
-    std::vector<std::complex<FPTYPE>> aux_fptype(rho_basis->npw);
-    for (int ig = 0; ig < rho_basis->npw; ig++) {
-        aux_fptype[ig] = static_cast<std::complex<FPTYPE>>(aux[ig]);
-    }
-    
-    FPTYPE* d_gcar = gcar_flat.data();
-    FPTYPE* d_tau = tau_flat.data();
-    FPTYPE* d_vloc_per_type = vloc_per_type_host.data();
-    std::complex<FPTYPE>* d_aux = aux_fptype.data();
-    FPTYPE* d_force = nullptr;
-    std::vector<FPTYPE> force_host(this->nat * 3);
-    
-    if (this->device == base_device::GpuDevice)
-    {
-        d_gcar = nullptr;
-        d_tau = nullptr;
-        d_vloc_per_type = nullptr;
-        d_aux = nullptr;
-        
-        resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3);
-        resmem_var_op()(this->ctx, d_tau, this->nat * 3);
-        resmem_var_op()(this->ctx, d_vloc_per_type, this->nat * rho_basis->npw);
-        resmem_complex_op()(this->ctx, d_aux, rho_basis->npw);
-        resmem_var_op()(this->ctx, d_force, this->nat * 3);
-        
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vloc_per_type, vloc_per_type_host.data(), this->nat * rho_basis->npw);
-        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw);
-        
-        base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_force, 0.0, this->nat * 3);
-    }
-    else
+    if(this->device == base_device::GpuDevice)
     {
-        d_force = force_host.data();
-        std::fill(force_host.begin(), force_host.end(), static_cast<FPTYPE>(0.0));
-    }
-    
-    const FPTYPE scale_factor = static_cast<FPTYPE>(ucell.tpiba * ucell.omega);
-    
-    // call op for sincos calculation
-    hamilt::cal_force_loc_sincos_op<FPTYPE, Device>()(
-        this->ctx,
-        this->nat,
-        rho_basis->npw,
-        this->nat,
-        d_gcar,
-        d_tau,
-        d_vloc_per_type,
-        d_aux,
-        scale_factor,
-        d_force
-    );
-    
-    if (this->device == base_device::GpuDevice)
-    {
-        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_host.data(), d_force, this->nat * 3);
-        
-        delmem_var_op()(this->ctx, d_gcar);
-        delmem_var_op()(this->ctx, d_tau);
-        delmem_var_op()(this->ctx, d_vloc_per_type);
-        delmem_complex_op()(this->ctx, d_aux);
-        delmem_var_op()(this->ctx, d_force);
-    }
-    
-    for (int iat = 0; iat < this->nat; iat++) {
-        forcelc(iat, 0) = static_cast<double>(force_host[iat * 3 + 0]);
-        forcelc(iat, 1) = static_cast<double>(force_host[iat * 3 + 1]);
-        forcelc(iat, 2) = static_cast<double>(force_host[iat * 3 + 2]);
-    }
+        std::vector<double> tau_h;
+        std::vector<double> gcar_h;
+        tau_h.resize(this->nat * 3);
+        for(int iat = 0; iat < this->nat; ++iat)
+        {
+            int it = ucell.iat2it[iat];
+            int ia = ucell.iat2ia[iat];
+            tau_h[iat * 3] = ucell.atoms[it].tau[ia].x;
+            tau_h[iat * 3 + 1] = ucell.atoms[it].tau[ia].y;
+            tau_h[iat * 3 + 2] = ucell.atoms[it].tau[ia].z;
+        }
 
-    // this->print(GlobalV: :ofs_running, "local forces", forcelc);
+        gcar_h.resize(rho_basis->npw * 3);
+        for(int ig = 0; ig < rho_basis->npw; ++ig)
+        {
+            gcar_h[ig * 3] = rho_basis->gcar[ig].x;
+            gcar_h[ig * 3 + 1] = rho_basis->gcar[ig].y;
+            gcar_h[ig * 3 + 2] = rho_basis->gcar[ig].z;
+        }
+
+        int* iat2it_d = nullptr;
+        int* ig2gg_d = nullptr;
+        double* gcar_d = nullptr;
+        double* tau_d = nullptr;
+        std::complex<double>* aux_d = nullptr;
+        double* forcelc_d  = nullptr;
+        double* vloc_d = nullptr;
+
+        resmem_int_op()(this->ctx,iat2it_d, this->nat);
+        resmem_int_op()(this->ctx,ig2gg_d, rho_basis->npw);
+        resmem_var_op()(this->ctx,gcar_d, rho_basis->npw * 3);
+        resmem_var_op()(this->ctx,tau_d, this->nat * 3);
+        resmem_complex_op()(this->ctx,aux_d, rho_basis->npw);
+        resmem_var_op()(this->ctx,forcelc_d, this->nat * 3);
+        resmem_var_op()(this->ctx,vloc_d, vloc.nr * vloc.nc);
+
+        syncmem_int_h2d_op()( this->ctx, this->cpu_ctx,iat2it_d, ucell.iat2it, this->nat);
+        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, ig2gg_d, rho_basis->ig2igg, rho_basis->npw);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gcar_d, gcar_h.data(), rho_basis->npw * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, tau_d, tau_h.data(), this->nat * 3);
+        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, aux_d, aux, rho_basis->npw);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, forcelc_d, forcelc.c, this->nat * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, vloc_d, vloc.c, vloc.nr * vloc.nc);
+
+        hamilt::cal_force_loc_op<FPTYPE, Device>()(
+            this->nat,
+            rho_basis->npw,
+            ucell.tpiba * ucell.omega,
+            iat2it_d,
+            ig2gg_d,
+            gcar_d,
+            tau_d,
+            aux_d,
+            vloc_d,
+            vloc.nc,
+            forcelc_d);
+        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, forcelc.c, forcelc_d, this->nat * 3);
+
+        delmem_int_op()(this->ctx,iat2it_d);
+        delmem_int_op()(this->ctx,ig2gg_d);
+        delmem_var_op()(this->ctx,gcar_d);
+        delmem_var_op()(this->ctx,tau_d);
+        delmem_complex_op()(this->ctx,aux_d);
+        delmem_var_op()(this->ctx,forcelc_d);
+        delmem_var_op()(this->ctx,vloc_d);
+    }
+    else{  // calculate forces on CPU
+        #ifdef _OPENMP
+        #pragma omp parallel for
+        #endif
+        for (int iat = 0; iat < this->nat; ++iat)
+        {
+            // read `it` `ia` from the table
+            int it = ucell.iat2it[iat];
+            int ia = ucell.iat2ia[iat];
+            for (int ig = 0; ig < rho_basis->npw; ig++)
+            {
+                const double phase = ModuleBase::TWO_PI * (rho_basis->gcar[ig] * ucell.atoms[it].tau[ia]);
+                double sinp, cosp;
+                ModuleBase::libm::sincos(phase, &sinp, &cosp);
+                const double factor
+                    = vloc(it, rho_basis->ig2igg[ig]) * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+                forcelc(iat, 0) += rho_basis->gcar[ig][0] * factor;
+                forcelc(iat, 1) += rho_basis->gcar[ig][1] * factor;
+                forcelc(iat, 2) += rho_basis->gcar[ig][2] * factor;
+            }
+            forcelc(iat, 0) *= (ucell.tpiba * ucell.omega);
+            forcelc(iat, 1) *= (ucell.tpiba * ucell.omega);
+            forcelc(iat, 2) *= (ucell.tpiba * ucell.omega);
+        }
+    }
+    // this->print(GlobalV::ofs_running, "local forces", forcelc);
     Parallel_Reduce::reduce_pool(forcelc.c, forcelc.nr * forcelc.nc);
     delete[] aux;
     ModuleBase::timer::tick("Forces", "cal_force_loc");
@@ -645,14 +472,14 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
 template <typename FPTYPE, typename Device>
 void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
                                           ModuleBase::matrix& forceion,
-                                          ModulePW::PW_Basis* rho_basis,
+                                          const ModulePW::PW_Basis* const rho_basis,
                                           const Structure_Factor* p_sf)
 {
     ModuleBase::TITLE("Forces", "cal_force_ew");
     ModuleBase::timer::tick("Forces", "cal_force_ew");
-
+    this->device = base_device::get_device_type<Device>(this->ctx);
     double fact = 2.0;
-    std::complex<double>* aux = new std::complex<double>[rho_basis->npw];
+    std::vector<std::complex<double>> aux(rho_basis->npw);
 
     /*
         blocking rho_basis->nrxnpwx for data locality.
@@ -662,9 +489,7 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         performance will be better when number of atom is quite huge
     */
     const int block_ig = 1024;
-#ifdef _OPENMP
 #pragma omp parallel for
-#endif
     for (int igb = 0; igb < rho_basis->npw; igb += block_ig)
     {
         // calculate the actual task length of this block
@@ -679,13 +504,6 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
             if (ucell.atoms[it].na != 0)
             {
                 double dzv;
-                if (PARAM.inp.use_paw)
-                {
-    #ifdef USE_PAW
-                    dzv = GlobalC::paw_cell.get_val(it);
-    #endif
-                }
-                else
                 {
                     dzv = ucell.atoms[it].ncpp.zv;
                 }
@@ -702,13 +520,6 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
     double charge = 0.0;
     for (int it = 0; it < ucell.ntype; it++)
     {
-        if (PARAM.inp.use_paw)
-        {
-#ifdef USE_PAW
-            charge += ucell.atoms[it].na * GlobalC::paw_cell.get_val(it);
-#endif
-        }
-        else
         {
             charge += ucell.atoms[it].na * ucell.atoms[it].ncpp.zv; // mohan modify 2007-11-7
         }
@@ -729,14 +540,16 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         upperbound = 2.0 * charge * charge * sqrt(2.0 * alpha / ModuleBase::TWO_PI)
                      * erfc(sqrt(ucell.tpiba2 * rho_basis->ggecut / 4.0 / alpha));
     } while (upperbound > 1.0e-6);
-
-#ifdef _OPENMP
+    const int ig0 = rho_basis->ig_gge0;
 #pragma omp parallel for
-#endif
     for (int ig = 0; ig < rho_basis->npw; ig++)
     {
+        if (ig== ig0)
+        {
+            continue; // skip G=0
+        }
         aux[ig] *= ModuleBase::libm::exp(-1.0 * rho_basis->gg[ig] * ucell.tpiba2 / alpha / 4.0)
-                   / (rho_basis->gg[ig] * ucell.tpiba2);
+                / (rho_basis->gg[ig] * ucell.tpiba2);
     }
 
     // set pos rho_basis->ig_gge0 to zero
@@ -744,144 +557,99 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
     {
         aux[rho_basis->ig_gge0] = std::complex<double>(0.0, 0.0);
     }
-
-    // sincos op for cal_force_ew
-    
-    std::vector<FPTYPE> it_facts_host(this->nat);
-    std::vector<FPTYPE> tau_flat(this->nat * 3);
-    
-    // iterate over by lookup table
-    for (int iat = 0; iat < this->nat; iat++) {
-        int it = ucell.iat2it[iat];  
-        int ia = ucell.iat2ia[iat];  
-        
-        double zv;
-        if (PARAM.inp.use_paw)
+    if(this->device == base_device::GpuDevice)
+    {
+        std::vector<double> tau_h(this->nat * 3);
+        std::vector<double> gcar_h(rho_basis->npw * 3);
+        for(int iat = 0; iat < this->nat; ++iat)
         {
-#ifdef USE_PAW
-            zv = GlobalC::paw_cell.get_val(it);
-#endif
+            int it = ucell.iat2it[iat];
+            int ia = ucell.iat2ia[iat];
+            tau_h[iat * 3] = ucell.atoms[it].tau[ia].x;
+            tau_h[iat * 3 + 1] = ucell.atoms[it].tau[ia].y;
+            tau_h[iat * 3 + 2] = ucell.atoms[it].tau[ia].z;
         }
-        else
+        for(int ig = 0; ig < rho_basis->npw; ++ig)
         {
-            zv = ucell.atoms[it].ncpp.zv;
+            gcar_h[ig * 3] = rho_basis->gcar[ig].x;
+            gcar_h[ig * 3 + 1] = rho_basis->gcar[ig].y;
+            gcar_h[ig * 3 + 2] = rho_basis->gcar[ig].z;
         }
+        std::vector<double> it_fact_h(ucell.ntype);
+        for(int it = 0; it < ucell.ntype; ++it)
+        {
+            it_fact_h[it] = ucell.atoms[it].ncpp.zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact;
+        }
+
+        int* iat2it_d = nullptr;
+        double* gcar_d = nullptr;
+        double* tau_d = nullptr;
+        double* it_fact_d = nullptr;
+        std::complex<double>* aux_d = nullptr;
+        double* forceion_d  = nullptr;
+        resmem_int_op()(this->ctx, iat2it_d, this->nat);
+        resmem_var_op()(this->ctx, gcar_d, rho_basis->npw * 3);
+        resmem_var_op()(this->ctx, tau_d, this->nat * 3);
+        resmem_var_op()(this->ctx, it_fact_d, ucell.ntype);
+        resmem_complex_op()(this->ctx, aux_d, rho_basis->npw);
+        resmem_var_op()(this->ctx, forceion_d, this->nat * 3);
+
+        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, iat2it_d, ucell.iat2it, this->nat);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gcar_d, gcar_h.data(), rho_basis->npw * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, tau_d, tau_h.data(), this->nat * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, it_fact_d, it_fact_h.data(), ucell.ntype);
+        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, aux_d, aux.data(), rho_basis->npw);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, forceion_d, forceion.c, this->nat * 3);
+
+        hamilt::cal_force_ew_op<FPTYPE, Device>()(
+            this->nat,
+            rho_basis->npw,
+            rho_basis->ig_gge0,
+            iat2it_d,
+            gcar_d,
+            tau_d,
+            it_fact_d,
+            aux_d,
+            forceion_d);
         
-        it_facts_host[iat] = static_cast<FPTYPE>(zv * ModuleBase::e2 * ucell.tpiba * 
-                                                ModuleBase::TWO_PI / ucell.omega * fact);
-        
-        tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][0]);
-        tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][1]); 
-        tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][2]);
-    }
-    
-    std::vector<FPTYPE> gcar_flat(rho_basis->npw * 3);
-    for (int ig = 0; ig < rho_basis->npw; ig++) {
-        gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
-        gcar_flat[ig * 3 + 1] = static_cast<FPTYPE>(rho_basis->gcar[ig][1]);
-        gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
-    }
-    
-    std::vector<std::complex<FPTYPE>> aux_fptype(rho_basis->npw);
-    for (int ig = 0; ig < rho_basis->npw; ig++) {
-        aux_fptype[ig] = static_cast<std::complex<FPTYPE>>(aux[ig]);
-    }
-    
-    FPTYPE* d_gcar = gcar_flat.data();
-    FPTYPE* d_tau = tau_flat.data();
-    FPTYPE* d_it_facts = it_facts_host.data();
-    std::complex<FPTYPE>* d_aux = aux_fptype.data();
-    FPTYPE* d_force_g = nullptr;
-    std::vector<FPTYPE> force_g_host(this->nat * 3);
-    
-    if (this->device == base_device::GpuDevice)
-    {
-        d_gcar = nullptr;
-        d_tau = nullptr;
-        d_it_facts = nullptr;
-        d_aux = nullptr;
-        
-        resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3);
-        resmem_var_op()(this->ctx, d_tau, this->nat * 3);
-        resmem_var_op()(this->ctx, d_it_facts, this->nat);
-        resmem_complex_op()(this->ctx, d_aux, rho_basis->npw);
-        resmem_var_op()(this->ctx, d_force_g, this->nat * 3);
-        
-        
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_it_facts, it_facts_host.data(), this->nat);
-        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw);
-        
-     
-        base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_force_g, 0.0, this->nat * 3);
-    }
-    else
-    {
-        d_force_g = force_g_host.data();
-        std::fill(force_g_host.begin(), force_g_host.end(), static_cast<FPTYPE>(0.0));
-    }
-    
-    // call op for sincos calculation
-    hamilt::cal_force_ew_sincos_op<FPTYPE, Device>()(
-        this->ctx,
-        this->nat,
-        rho_basis->npw,
-        rho_basis->ig_gge0,  
-        d_gcar,
-        d_tau,
-        d_it_facts,
-        d_aux,
-        d_force_g
-    );
-    
-   
-    if (this->device == base_device::GpuDevice)
+        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, forceion.c, forceion_d, this->nat * 3);
+        delmem_int_op()(this->ctx,iat2it_d);
+        delmem_var_op()(this->ctx,gcar_d);
+        delmem_var_op()(this->ctx,tau_d);
+        delmem_var_op()(this->ctx,it_fact_d);
+        delmem_complex_op()(this->ctx,aux_d);
+        delmem_var_op()(this->ctx,forceion_d);
+    } else // calculate forces on CPU
     {
-        
-        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_g_host.data(), d_force_g, this->nat * 3);
-        
-        
-        delmem_var_op()(this->ctx, d_gcar);
-        delmem_var_op()(this->ctx, d_tau);
-        delmem_var_op()(this->ctx, d_it_facts);
-        delmem_complex_op()(this->ctx, d_aux);
-        delmem_var_op()(this->ctx, d_force_g);
-    }
-    
+    #pragma omp parallel for
+        for(int iat = 0; iat < this->nat; ++iat)
+        {
+            const int it = ucell.iat2it[iat];
+            const int ia = ucell.iat2ia[iat];
+            double it_fact = ucell.atoms[it].ncpp.zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact;
 
-    for (int iat = 0; iat < this->nat; iat++) {
-        forceion(iat, 0) += static_cast<double>(force_g_host[iat * 3 + 0]);
-        forceion(iat, 1) += static_cast<double>(force_g_host[iat * 3 + 1]); 
-        forceion(iat, 2) += static_cast<double>(force_g_host[iat * 3 + 2]);
+            for(int ig = 0; ig < rho_basis->npw; ++ig)
+            {
+                if(ig != rho_basis->ig_gge0) // skip G=0
+                {
+                    const ModuleBase::Vector3<double> gcar = rho_basis->gcar[ig];
+                    const double arg = ModuleBase::TWO_PI * (gcar * ucell.atoms[it].tau[ia]);
+                    double sinp, cosp;
+                    ModuleBase::libm::sincos(arg, &sinp, &cosp);
+                    double sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real();
+                    forceion(iat, 0) += gcar[0] * sumnb;
+                    forceion(iat, 1) += gcar[1] * sumnb;
+                    forceion(iat, 2) += gcar[2] * sumnb;
+                }
+            }
+            forceion(iat, 0) *= it_fact;
+            forceion(iat, 1) *= it_fact;
+            forceion(iat, 2) *= it_fact;
+        }
     }
-
-   
-// calculate real space force
-#ifdef _OPENMP
-#pragma omp parallel
+    // means that the processor contains G=0 term.
+    #pragma omp parallel
     {
-        int num_threads = omp_get_num_threads();
-        int thread_id = omp_get_thread_num();
-#else
-    int num_threads = 1;
-    int thread_id = 0;
-#endif
-
-        /* Here is task distribution for multi-thread,
-            0. atom will be iterated both in main nat loop and the loop in `if (rho_basis->ig_gge0 >= 0)`.
-                To avoid syncing, we must calculate work range of each thread by our self
-            1. Calculate the iat range [iat_beg, iat_end) by each thread
-                a. when it is single thread stage, [iat_beg, iat_end) will be [0, nat)
-            2. each thread iterate atoms form `iat_beg` to `iat_end-1`
-        */
-        int iat_beg, iat_end;
-        int it_beg, ia_beg;
-        ModuleBase::TASK_DIST_1D(num_threads, thread_id, this->nat, iat_beg, iat_end);
-        iat_end = iat_beg + iat_end;
-        ucell.iat2iait(iat_beg, &ia_beg, &it_beg);
-
-        
         if (rho_basis->ig_gge0 >= 0)
         {
             double rmax = 5.0 / (sqrt(alpha) * ucell.lat0);
@@ -890,84 +658,55 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
             // output of rgen: the number of vectors in the sphere
             const int mxr = 200;
             // the maximum number of R vectors included in r
-            ModuleBase::Vector3<double>* r = new ModuleBase::Vector3<double>[mxr];
-            double* r2 = new double[mxr];
-            ModuleBase::GlobalFunc::ZEROS(r2, mxr);
-            int* irr = new int[mxr];
-            ModuleBase::GlobalFunc::ZEROS(irr, mxr);
+            std::vector<ModuleBase::Vector3<double>> r(mxr);
+            std::vector<double> r2(mxr);
+            std::vector<int> irr(mxr);
             // the square modulus of R_j-tau_s-tau_s'
 
-            int iat1 = iat_beg;
-            int T1 = it_beg;
-            int I1 = ia_beg;
             const double sqa = sqrt(alpha);
             const double sq8a_2pi = sqrt(8.0 * alpha / ModuleBase::TWO_PI);
 
             // iterating atoms.
-            // do not need to sync threads because task range of each thread is isolated
-            while (iat1 < iat_end)
+            #pragma omp for
+            for(int iat1 = 0; iat1 < this->nat; iat1++)
             {
-                int iat2 = 0; // mohan fix bug 2011-06-07
-                int I2 = 0;
-                int T2 = 0;
-                while (iat2 < this->nat)
+                int T1 = ucell.iat2it[iat1];
+                int I1 = ucell.iat2ia[iat1];
+                for(int iat2 = 0; iat2 < this->nat; iat2++)
                 {
-                    if (iat1 != iat2 && ucell.atoms[T2].na != 0 && ucell.atoms[T1].na != 0)
+                    int T2 = ucell.iat2it[iat2];
+                    int I2 = ucell.iat2ia[iat2];
+                    if (iat1 != iat2)
                     {
                         ModuleBase::Vector3<double> d_tau
                             = ucell.atoms[T1].tau[I1] - ucell.atoms[T2].tau[I2];
-                        H_Ewald_pw::rgen(d_tau, rmax, irr, ucell.latvec, ucell.G, r, r2, nrm);
+                        H_Ewald_pw::rgen(d_tau, rmax, irr.data(), ucell.latvec, ucell.G, r.data(), r2.data(), nrm);
 
                         for (int n = 0; n < nrm; n++)
                         {
                             const double rr = sqrt(r2[n]) * ucell.lat0;
 
                             double factor;
-                            if (PARAM.inp.use_paw)
-                            {
-#ifdef USE_PAW
-                                factor = GlobalC::paw_cell.get_val(T1) * GlobalC::paw_cell.get_val(T2) * ModuleBase::e2
-                                         / (rr * rr)
-                                         * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr))
-                                         * ucell.lat0;
-#endif
-                            }
-                            else
                             {
                                 factor = ucell.atoms[T1].ncpp.zv * ucell.atoms[T2].ncpp.zv
-                                         * ModuleBase::e2 / (rr * rr)
-                                         * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr))
-                                         * ucell.lat0;
+                                            * ModuleBase::e2 / (rr * rr)
+                                            * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr))
+                                            * ucell.lat0;
                             }
-
                             forceion(iat1, 0) -= factor * r[n].x;
                             forceion(iat1, 1) -= factor * r[n].y;
                             forceion(iat1, 2) -= factor * r[n].z;
                         }
                     }
-                    ++iat2;
-                    ucell.step_iait(&I2, &T2);
                 } // atom b
-                ++iat1;
-                ucell.step_iait(&I1, &T1);
             } // atom a
-
-            delete[] r;
-            delete[] r2;
-            delete[] irr;
         }
-#ifdef _OPENMP
     }
-#endif
-
     Parallel_Reduce::reduce_pool(forceion.c, forceion.nr * forceion.nc);
-
     // this->print(GlobalV::ofs_running, "ewald forces", forceion);
 
     ModuleBase::timer::tick("Forces", "cal_force_ew");
 
-    delete[] aux;
-
     return;
 }
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.h b/source/module_hamilt_pw/hamilt_pwdft/forces.h
index 695520dceb..370c6a2eb8 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces.h
@@ -37,7 +37,7 @@ class Forces
     void cal_force(UnitCell& ucell,
                    ModuleBase::matrix& force,
                    const elecstate::ElecState& elec,
-                   ModulePW::PW_Basis* rho_basis,
+                   const ModulePW::PW_Basis* rho_basis,
                    ModuleSymmetry::Symmetry* p_symm,
                    Structure_Factor* p_sf,
                    surchem& solvent,
@@ -53,15 +53,15 @@ class Forces
 
     void cal_force_loc(const UnitCell& ucell,
                        ModuleBase::matrix& forcelc,
-                       ModulePW::PW_Basis* rho_basis,
+                       const ModulePW::PW_Basis* rho_basis,
                        const ModuleBase::matrix& vloc,
                        const Charge* const chr);
     void cal_force_ew(const UnitCell& ucell,
                       ModuleBase::matrix& forceion, 
-                      ModulePW::PW_Basis* rho_basis, 
+                      const ModulePW::PW_Basis* rho_basis, 
                       const Structure_Factor* p_sf);
     void cal_force_cc(ModuleBase::matrix& forcecc,
-                      ModulePW::PW_Basis* rho_basis,
+                      const ModulePW::PW_Basis* rho_basis,
                       const Charge* const chr,
                       const bool* numeric,
                       UnitCell& ucell_in);
@@ -96,13 +96,13 @@ class Forces
                       const UnitCell& ucell_in,
                       const psi::Psi<complex<FPTYPE>, Device>* psi_in = nullptr);
     void cal_force_scc(ModuleBase::matrix& forcescc,
-                       ModulePW::PW_Basis* rho_basis,
+                       const ModulePW::PW_Basis* rho_basis,
                        const ModuleBase::matrix& v_current,
                        const bool vnew_exist,
                        const bool* numeric,
                        const UnitCell& ucell_in);
     void cal_force_us(ModuleBase::matrix& forcenl,
-                      ModulePW::PW_Basis* rho_basis,
+                      const ModulePW::PW_Basis* rho_basis,
                       const pseudopot_cell_vnl& ppcell_in,
                       const elecstate::ElecState& elec,
                       const UnitCell& ucell);
@@ -113,7 +113,7 @@ class Forces
                      const FPTYPE* rab,
                      const FPTYPE* rhoc,
                      FPTYPE* drhocg,
-                     ModulePW::PW_Basis* rho_basis,
+                     const ModulePW::PW_Basis* rho_basis,
                      int type,
                      const UnitCell& ucell_in); // used in nonlinear core correction stress
     void deriv_drhoc_scc(const bool& numeric,
@@ -122,7 +122,7 @@ class Forces
                      const FPTYPE* rab,
                      const FPTYPE* rhoc,
                      FPTYPE* drhocg,
-                     ModulePW::PW_Basis* rho_basis,
+                     const ModulePW::PW_Basis* rho_basis,
                      const UnitCell& ucell_in); // used in nonlinear core correction stress
   protected:
     Device* ctx = {};
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp
index 3346724deb..f3bc7469a3 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp
@@ -31,7 +31,7 @@
 
 template <typename FPTYPE, typename Device>
 void Forces<FPTYPE, Device>::cal_force_cc(ModuleBase::matrix& forcecc,
-                                          ModulePW::PW_Basis* rho_basis,
+                                          const ModulePW::PW_Basis* rho_basis,
                                           const Charge* const chr,
                                           const bool* numeric,
                                            UnitCell& ucell_in)
@@ -239,7 +239,7 @@ void Forces<FPTYPE, Device>::deriv_drhoc
 	const FPTYPE *rab,
 	const FPTYPE *rhoc,
 	FPTYPE *drhocg,
-	ModulePW::PW_Basis* rho_basis,
+	const ModulePW::PW_Basis* rho_basis,
 	int type,
     const UnitCell& ucell_in
 )
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_scc.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_scc.cpp
index f670ad9b27..4b089d1d26 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces_scc.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces_scc.cpp
@@ -24,7 +24,7 @@
 
 template <typename FPTYPE, typename Device>
 void Forces<FPTYPE, Device>::cal_force_scc(ModuleBase::matrix& forcescc,
-                                           ModulePW::PW_Basis* rho_basis,
+                                           const ModulePW::PW_Basis* rho_basis,
                                            const ModuleBase::matrix& vnew,
                                            const bool vnew_exist,
                                            const bool* numeric,
@@ -147,7 +147,7 @@ void Forces<FPTYPE, Device>::deriv_drhoc_scc(const bool& numeric,
                                               const FPTYPE* rab,
                                               const FPTYPE* rhoc,
                                               FPTYPE* drhocg,
-                                              ModulePW::PW_Basis* rho_basis,
+                                              const ModulePW::PW_Basis* rho_basis,
                                               const UnitCell& ucell_in) {
     int igl0 = 0;
     double gx = 0;
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_us.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_us.cpp
index 4323ee6c4f..8879c89c35 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces_us.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces_us.cpp
@@ -15,7 +15,7 @@
 // On output: the contribution is added to \(\text{forcenl}\).
 template <typename FPTYPE, typename Device>
 void Forces<FPTYPE, Device>::cal_force_us(ModuleBase::matrix& forcenl,
-                                          ModulePW::PW_Basis* rho_basis,
+                                          const ModulePW::PW_Basis* rho_basis,
                                           const pseudopot_cell_vnl& nlpp,
                                           const elecstate::ElecState& elec,
                                           const UnitCell& ucell)
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
index 5ff3ddf2b1..b8bca46302 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
@@ -1,6 +1,7 @@
 #include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h"
-// #include "module_psi/kernels/device.h"
+// #include "source_psi/kernels/device.h"
 #include "module_base/module_device/types.h"
+#include "module_base/constants.h"
 
 #include <complex>
 
@@ -10,124 +11,18 @@
 #include <module_base/module_device/device.h>
 
 #define THREADS_PER_BLOCK 256
+#define FULL_MASK 0xffffffff
+#define WARP_SIZE 32
 
 namespace hamilt {
 
-// CUDA kernels for sincos loops
 template <typename FPTYPE>
-__global__ void cal_force_loc_sincos_kernel(
-    const int nat,
-    const int npw,
-    const int ntype,
-    const FPTYPE* gcar,
-    const FPTYPE* tau,
-    const FPTYPE* vloc_per_type,
-    const thrust::complex<FPTYPE>* aux,
-    const FPTYPE scale_factor,
-    FPTYPE* force)
-{
-    const FPTYPE TWO_PI = 2.0 * M_PI;
-    
-    const int iat = blockIdx.y;
-    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
-    const int ig_stride = gridDim.x * blockDim.x;
-    
-    if (iat >= nat) return;
-    
-    // Load atom information to registers
-    const FPTYPE tau_x = tau[iat * 3 + 0];
-    const FPTYPE tau_y = tau[iat * 3 + 1];
-    const FPTYPE tau_z = tau[iat * 3 + 2];
-    
-    // Local accumulation variables
-    FPTYPE local_force_x = 0.0;
-    FPTYPE local_force_y = 0.0;
-    FPTYPE local_force_z = 0.0;
-    
-    // Grid-stride loop over G-vectors
-    for (int ig = ig_start; ig < npw; ig += ig_stride) {
-        // Calculate phase: 2π * (G · τ)
-        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
-                                       gcar[ig * 3 + 1] * tau_y + 
-                                       gcar[ig * 3 + 2] * tau_z);
-        
-        // Use CUDA intrinsic for sincos
-        FPTYPE sinp, cosp;
-        sincos(phase, &sinp, &cosp);
-        
-        // Calculate force factor
-        const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig];
-        const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real());
-        
-        // Accumulate force contributions
-        local_force_x += gcar[ig * 3 + 0] * factor;
-        local_force_y += gcar[ig * 3 + 1] * factor;
-        local_force_z += gcar[ig * 3 + 2] * factor;
-    }
-    
-    // Atomic add to global memory
-    atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor);
-    atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor);
-    atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor);
-}
-
-template <typename FPTYPE>
-__global__ void cal_force_ew_sincos_kernel(
-    const int nat,
-    const int npw,
-    const int ig_gge0,
-    const FPTYPE* gcar,
-    const FPTYPE* tau,
-    const FPTYPE* it_facts,
-    const thrust::complex<FPTYPE>* aux,
-    FPTYPE* force)
-{
-    const FPTYPE TWO_PI = 2.0 * M_PI;
-    
-    const int iat = blockIdx.y;
-    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
-    const int ig_stride = gridDim.x * blockDim.x;
-    
-    if (iat >= nat) return;
-    
-    // Load atom information to registers
-    const FPTYPE tau_x = tau[iat * 3 + 0];
-    const FPTYPE tau_y = tau[iat * 3 + 1];
-    const FPTYPE tau_z = tau[iat * 3 + 2];
-    const FPTYPE it_fact = it_facts[iat];
-    
-    // Local accumulation variables
-    FPTYPE local_force_x = 0.0;
-    FPTYPE local_force_y = 0.0;
-    FPTYPE local_force_z = 0.0;
-    
-    // Grid-stride loop over G-vectors
-    for (int ig = ig_start; ig < npw; ig += ig_stride) {
-        // Skip G=0 term
-        if (ig == ig_gge0) continue;
-        
-        // Calculate phase: 2π * (G · τ)
-        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
-                                       gcar[ig * 3 + 1] * tau_y + 
-                                       gcar[ig * 3 + 2] * tau_z);
-        
-        // Use CUDA intrinsic for sincos
-        FPTYPE sinp, cosp;
-        sincos(phase, &sinp, &cosp);
-        
-        // Calculate Ewald sum contribution (fixed sign error)
-        const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real());
-        
-        // Accumulate force contributions
-        local_force_x += gcar[ig * 3 + 0] * factor;
-        local_force_y += gcar[ig * 3 + 1] * factor;
-        local_force_z += gcar[ig * 3 + 2] * factor;
+__forceinline__
+__device__
+void warp_reduce(FPTYPE & val) {
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        val += __shfl_down_sync(FULL_MASK, val, offset);
     }
-    
-    // Atomic add to global memory
-    atomicAdd(&force[iat * 3 + 0], local_force_x);
-    atomicAdd(&force[iat * 3 + 1], local_force_y);
-    atomicAdd(&force[iat * 3 + 2], local_force_z);
 }
 
 template <typename FPTYPE>
@@ -304,65 +199,6 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
     cudaCheckOnDebug();
 }
 
-// GPU operators
-template <typename FPTYPE>
-void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
-    const base_device::DEVICE_GPU* ctx,
-    const int& nat,
-    const int& npw,
-    const int& ntype,
-    const FPTYPE* gcar,
-    const FPTYPE* tau,
-    const FPTYPE* vloc_per_type,
-    const std::complex<FPTYPE>* aux,
-    const FPTYPE& scale_factor,
-    FPTYPE* force)
-{
-    // Calculate optimal grid configuration for GPU load balancing
-    const int threads_per_block = THREADS_PER_BLOCK;
-    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
-    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
-    
-    dim3 grid(max_blocks_x, nat);
-    dim3 block(threads_per_block);
-    
-    cal_force_loc_sincos_kernel<FPTYPE><<<grid, block>>>(
-        nat, npw, ntype, gcar, tau, vloc_per_type,
-        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
-        scale_factor, force
-    );
-    
-    cudaCheckOnDebug();
-}
-
-template <typename FPTYPE>
-void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
-    const base_device::DEVICE_GPU* ctx,
-    const int& nat,
-    const int& npw,
-    const int& ig_gge0,
-    const FPTYPE* gcar,
-    const FPTYPE* tau,
-    const FPTYPE* it_facts,
-    const std::complex<FPTYPE>* aux,
-    FPTYPE* force)
-{
-    // Calculate optimal grid configuration for GPU load balancing
-    const int threads_per_block = THREADS_PER_BLOCK;
-    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
-    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
-    
-    dim3 grid(max_blocks_x, nat);
-    dim3 block(threads_per_block);
-    
-    cal_force_ew_sincos_kernel<FPTYPE><<<grid, block>>>(
-        nat, npw, ig_gge0, gcar, tau, it_facts,
-        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux), force
-    );
-    
-    cudaCheckOnDebug();
-}
-
 template <typename FPTYPE>
 __global__ void cal_force_nl(
         const int ntype,
@@ -781,6 +617,242 @@ void revertVkbValues(
     cudaCheckOnDebug();
 }
 
+template <typename FPTYPE>
+__global__ void force_loc_kernel(
+    const int nat,
+    const int npw,
+    const FPTYPE tpiba_omega,
+    const int* iat2it,
+    const int* ig2gg_d,
+    const FPTYPE* gcar_d,
+    const FPTYPE* tau_d,
+    const thrust::complex<FPTYPE>* aux_d,
+    const FPTYPE* vloc_d,
+    const int vloc_nc,
+    FPTYPE* forcelc_d)
+{
+    const int iat = blockIdx.x;
+    const int tid = threadIdx.x;
+    const int warp_id = tid / WARP_SIZE;
+    const int lane_id = tid % WARP_SIZE;
+    
+    if (iat >= nat) return;
+    const int it = iat2it[iat]; // get the type of atom
+
+    // Initialize force components
+    FPTYPE force_x = 0.0;
+    FPTYPE force_y = 0.0;
+    FPTYPE force_z = 0.0;
+    
+    const auto tau_x = tau_d[iat * 3 + 0];
+    const auto tau_y = tau_d[iat * 3 + 1];
+    const auto tau_z = tau_d[iat * 3 + 2];
+
+    // Process all plane waves in chunks of blockDim.x
+    for (int ig = tid; ig < npw; ig += blockDim.x) {
+        const auto gcar_x = gcar_d[ig * 3 + 0];
+        const auto gcar_y = gcar_d[ig * 3 + 1];
+        const auto gcar_z = gcar_d[ig * 3 + 2];
+
+        // Calculate phase factor
+        const FPTYPE phase = ModuleBase::TWO_PI * (gcar_x * tau_x +
+                                                   gcar_y * tau_y +
+                                                   gcar_z * tau_z);
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+        
+        // Get vloc value
+        const FPTYPE vloc_val = vloc_d[it * vloc_nc + ig2gg_d[ig]];
+        
+        // Calculate factor
+        const auto aux_val = aux_d[ig];
+        const FPTYPE factor = vloc_val * (cosp * aux_val.imag() + sinp * aux_val.real());
+        
+        // Multiply by gcar components
+        force_x += gcar_x * factor;
+        force_y += gcar_y * factor;
+        force_z += gcar_z * factor;
+    }
+    
+    // Warp-level reduction
+    warp_reduce<FPTYPE>(force_x);
+    warp_reduce<FPTYPE>(force_y);
+    warp_reduce<FPTYPE>(force_z);
+    
+    // First thread in each warp writes to shared memory
+    __shared__ FPTYPE warp_sums_x[THREADS_PER_BLOCK / WARP_SIZE]; // 256 threads / 32 = 8 warps
+    __shared__ FPTYPE warp_sums_y[THREADS_PER_BLOCK / WARP_SIZE];
+    __shared__ FPTYPE warp_sums_z[THREADS_PER_BLOCK / WARP_SIZE];
+    
+    if (lane_id == 0) {
+        warp_sums_x[warp_id] = force_x;
+        warp_sums_y[warp_id] = force_y;
+        warp_sums_z[warp_id] = force_z;
+    }
+    
+    __syncthreads();
+    
+    // Final reduction by first warp
+    if (warp_id == 0) {
+        FPTYPE final_x = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_x[lane_id] : 0.0;
+        FPTYPE final_y = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_y[lane_id] : 0.0;
+        FPTYPE final_z = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_z[lane_id] : 0.0;
+        
+        warp_reduce<FPTYPE>(final_x);
+        warp_reduce<FPTYPE>(final_y);
+        warp_reduce<FPTYPE>(final_z);
+        
+        if (lane_id == 0) {
+            forcelc_d[iat * 3 + 0] = final_x * tpiba_omega;
+            forcelc_d[iat * 3 + 1] = final_y * tpiba_omega;
+            forcelc_d[iat * 3 + 2] = final_z * tpiba_omega;
+        }
+    }
+}
+
+template <typename FPTYPE>
+__global__ void force_ew_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const int* iat2it,
+    const FPTYPE* gcar_d,
+    const FPTYPE* tau_d,
+    const FPTYPE* it_fact_d,
+    const thrust::complex<FPTYPE>* aux_d,
+    FPTYPE* forceion_d)
+{
+    const int iat = blockIdx.x;
+    const int tid = threadIdx.x;
+    const int warp_id = tid / WARP_SIZE;
+    const int lane_id = tid % WARP_SIZE;
+
+    if( iat >= nat) return;
+    const int it = iat2it[iat]; // get the type of atom
+    const FPTYPE it_fact_val = it_fact_d[it]; // Get it_fact value
+
+    // Initialize force components
+    FPTYPE force_x = 0.0;
+    FPTYPE force_y = 0.0;
+    FPTYPE force_z = 0.0;
+
+    const auto tau_x = tau_d[iat * 3 + 0];
+    const auto tau_y = tau_d[iat * 3 + 1];
+    const auto tau_z = tau_d[iat * 3 + 2];
+
+    for (int ig = tid; ig < npw; ig += blockDim.x) {
+        if(ig == ig_gge0)
+        { continue; }
+        const auto gcar_x = gcar_d[ig * 3 + 0];
+        const auto gcar_y = gcar_d[ig * 3 + 1];
+        const auto gcar_z = gcar_d[ig * 3 + 2];
+
+        // Calculate phase factor
+        const FPTYPE phase = ModuleBase::TWO_PI * (gcar_x * tau_x +
+                                                   gcar_y * tau_y +
+                                                   gcar_z * tau_z);
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+
+        // Calculate force contribution
+        const FPTYPE sumnb = -cosp * aux_d[ig].imag() + sinp * aux_d[ig].real();
+        
+        // Multiply by gcar components
+        force_x += gcar_x * sumnb;
+        force_y += gcar_y * sumnb;
+        force_z += gcar_z * sumnb;
+    }
+
+    // Warp-level reduction
+    warp_reduce<FPTYPE>(force_x);
+    warp_reduce<FPTYPE>(force_y);
+    warp_reduce<FPTYPE>(force_z);
+
+    // First thread in each warp writes to shared memory
+    __shared__ FPTYPE warp_sums_x[THREADS_PER_BLOCK / WARP_SIZE]; // 256 threads / 32 = 8 warps
+    __shared__ FPTYPE warp_sums_y[THREADS_PER_BLOCK / WARP_SIZE];
+    __shared__ FPTYPE warp_sums_z[THREADS_PER_BLOCK / WARP_SIZE];
+
+    if (lane_id == 0) {
+        warp_sums_x[warp_id] = force_x;
+        warp_sums_y[warp_id] = force_y;
+        warp_sums_z[warp_id] = force_z;
+    }
+
+    __syncthreads();
+
+    // Final reduction by first warp
+    if (warp_id == 0) {
+        FPTYPE final_x = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_x[lane_id] : 0.0;
+        FPTYPE final_y = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_y[lane_id] : 0.0;
+        FPTYPE final_z = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_z[lane_id] : 0.0;
+
+        warp_reduce<FPTYPE>(final_x);
+        warp_reduce<FPTYPE>(final_y);
+        warp_reduce<FPTYPE>(final_z);
+
+        if (lane_id == 0) {
+            forceion_d[iat * 3 + 0] = final_x * it_fact_val;
+            forceion_d[iat * 3 + 1] = final_y * it_fact_val;
+            forceion_d[iat * 3 + 2] = final_z * it_fact_val;
+        }
+    }
+}
+
+template <typename FPTYPE>
+void cal_force_loc_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const int nat,
+    const int npw,
+    const FPTYPE tpiba_omega,
+    const int* iat2it,
+    const int* ig2igg,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const std::complex<FPTYPE>* aux,
+    const FPTYPE* vloc,
+    const int vloc_nc,
+    FPTYPE* forcelc)
+{
+    force_loc_kernel<FPTYPE>
+        <<<nat, THREADS_PER_BLOCK>>>(nat,
+                                     npw,
+                                     tpiba_omega,
+                                     iat2it,
+                                     ig2igg,
+                                     gcar,
+                                     tau,
+                                     reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
+                                     vloc,
+                                     vloc_nc,
+                                     forcelc); // array of data
+
+}
+
+template <typename FPTYPE>
+void cal_force_ew_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const int* iat2it,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* it_fact,
+    const std::complex<FPTYPE>* aux,
+    FPTYPE* forceion)
+{
+    force_ew_kernel<FPTYPE>
+        <<<nat, THREADS_PER_BLOCK>>>(nat,
+                                     npw,
+                                     ig_gge0,
+                                     iat2it,
+                                     gcar,
+                                     tau,
+                                     it_fact,
+                                     reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
+                                     forceion); // array of data
+}
+
+
 // for revertVkbValues functions instantiation
 template void revertVkbValues<double>(const int *gcar_zero_ptrs, std::complex<double> *vkb_ptr, const std::complex<double> *vkb_save_ptr, int nkb, int gcar_zero_count, int npw, int ipol, int npwx, const std::complex<double> coeff);
 // for saveVkbValues functions instantiation
@@ -788,12 +860,11 @@ template void saveVkbValues<double>(const int *gcar_zero_ptrs, const std::comple
 
 template struct cal_vkb1_nl_op<float, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<float, base_device::DEVICE_GPU>;
-template struct cal_force_loc_sincos_op<float, base_device::DEVICE_GPU>;
-template struct cal_force_ew_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_loc_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_ew_op<float, base_device::DEVICE_GPU>;
 
 template struct cal_vkb1_nl_op<double, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<double, base_device::DEVICE_GPU>;
-template struct cal_force_loc_sincos_op<double, base_device::DEVICE_GPU>;
-template struct cal_force_ew_sincos_op<double, base_device::DEVICE_GPU>;
-
+template struct cal_force_loc_op<double, base_device::DEVICE_GPU>;
+template struct cal_force_ew_op<double, base_device::DEVICE_GPU>;
 }  // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
index 109321d6c3..367415d76b 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
@@ -428,116 +428,10 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
     }
 };
 
-// CPU implementation of local force sincos operator
-template <typename FPTYPE>
-struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU>
-{
-    void operator()(const base_device::DEVICE_CPU* ctx,
-                    const int& nat,
-                    const int& npw,
-                    const int& ntype,
-                    const FPTYPE* gcar,
-                    const FPTYPE* tau,
-                    const FPTYPE* vloc_per_type,
-                    const std::complex<FPTYPE>* aux,
-                    const FPTYPE& scale_factor,
-                    FPTYPE* force)
-    {
-        const FPTYPE TWO_PI = 2.0 * M_PI;
-
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-        for (int iat = 0; iat < nat; ++iat)
-        {
-            const FPTYPE tau_x = tau[iat * 3 + 0];
-            const FPTYPE tau_y = tau[iat * 3 + 1];
-            const FPTYPE tau_z = tau[iat * 3 + 2];
-
-            FPTYPE local_force[3] = {0.0, 0.0, 0.0};
-
-            for (int ig = 0; ig < npw; ig++)
-            {
-                const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
-                                               gcar[ig * 3 + 1] * tau_y + 
-                                               gcar[ig * 3 + 2] * tau_z);
-                FPTYPE sinp, cosp;
-                ModuleBase::libm::sincos(phase, &sinp, &cosp);
-                
-                const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig];
-                const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()) * scale_factor;
-                
-                local_force[0] += gcar[ig * 3 + 0] * factor;
-                local_force[1] += gcar[ig * 3 + 1] * factor;
-                local_force[2] += gcar[ig * 3 + 2] * factor;
-            }
-
-            force[iat * 3 + 0] = local_force[0];
-            force[iat * 3 + 1] = local_force[1];
-            force[iat * 3 + 2] = local_force[2];
-        }
-    }
-};
-
-// CPU implementation of Ewald force sincos operator
-template <typename FPTYPE>
-struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_CPU>
-{
-    void operator()(const base_device::DEVICE_CPU* ctx,
-                    const int& nat,
-                    const int& npw,
-                    const int& ig_gge0,
-                    const FPTYPE* gcar,
-                    const FPTYPE* tau,
-                    const FPTYPE* it_facts,
-                    const std::complex<FPTYPE>* aux,
-                    FPTYPE* force)
-    {
-        const FPTYPE TWO_PI = 2.0 * M_PI;
-
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-        for (int iat = 0; iat < nat; ++iat)
-        {
-            const FPTYPE tau_x = tau[iat * 3 + 0];
-            const FPTYPE tau_y = tau[iat * 3 + 1];
-            const FPTYPE tau_z = tau[iat * 3 + 2];
-            const FPTYPE it_fact = it_facts[iat];
-
-            FPTYPE local_force[3] = {0.0, 0.0, 0.0};
-
-            for (int ig = 0; ig < npw; ig++)
-            {
-                // Skip G=0 term
-                if (ig == ig_gge0) continue;
-
-                const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
-                                               gcar[ig * 3 + 1] * tau_y + 
-                                               gcar[ig * 3 + 2] * tau_z);
-                FPTYPE sinp, cosp;
-                ModuleBase::libm::sincos(phase, &sinp, &cosp);
-                
-                const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real());
-                
-                local_force[0] += gcar[ig * 3 + 0] * factor;
-                local_force[1] += gcar[ig * 3 + 1] * factor;
-                local_force[2] += gcar[ig * 3 + 2] * factor;
-            }
-
-            force[iat * 3 + 0] = local_force[0];
-            force[iat * 3 + 1] = local_force[1];
-            force[iat * 3 + 2] = local_force[2];
-        }
-    }
-};
 
 template struct cal_vkb1_nl_op<float, base_device::DEVICE_CPU>;
 template struct cal_force_nl_op<float, base_device::DEVICE_CPU>;
-template struct cal_force_loc_sincos_op<float, base_device::DEVICE_CPU>;
-template struct cal_force_ew_sincos_op<float, base_device::DEVICE_CPU>;
+
 template struct cal_vkb1_nl_op<double, base_device::DEVICE_CPU>;
 template struct cal_force_nl_op<double, base_device::DEVICE_CPU>;
-template struct cal_force_loc_sincos_op<double, base_device::DEVICE_CPU>;
-template struct cal_force_ew_sincos_op<double, base_device::DEVICE_CPU>;
 } // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
index acf490b278..ca2c6694cf 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
@@ -146,67 +146,39 @@ struct cal_force_nl_op
                     const FPTYPE* lambda,
                     const std::complex<FPTYPE>* becp,
                     const std::complex<FPTYPE>* dbecp,
-                                          FPTYPE* force);
+                    FPTYPE* force);
 };
 
 template <typename FPTYPE, typename Device>
-struct cal_force_loc_sincos_op
-{
-    /// @brief Calculate local pseudopotential forces (sincos loop only)
-    ///
-    /// Input Parameters
-    /// @param ctx - which device this function runs on
-    /// @param nat - number of atoms
-    /// @param npw - number of plane waves
-    /// @param ntype - number of atom types
-    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
-    /// @param tau - atomic positions [nat * 3]
-    /// @param vloc_per_type - precomputed vloc factors per atom [nat * npw]
-    /// @param aux - charge density in G-space [npw]
-    /// @param scale_factor - tpiba * omega
-    ///
-    /// Output Parameters
-    /// @param force - output forces [nat * 3]
-    void operator()(const Device* ctx,
-                    const int& nat,
-                    const int& npw,
-                    const int& ntype,
-                    const FPTYPE* gcar,
-                    const FPTYPE* tau,
-                    const FPTYPE* vloc_per_type,
-                    const std::complex<FPTYPE>* aux,
-                    const FPTYPE& scale_factor,
-                    FPTYPE* force);
+struct cal_force_loc_op{
+    void operator()(
+        const int nat,
+        const int npw,
+        const FPTYPE tpiba_omega,
+        const int* iat2it,
+        const int* ig2igg,
+        const FPTYPE* gcar,
+        const FPTYPE* tau,
+        const std::complex<FPTYPE>* aux,
+        const FPTYPE* vloc,
+        const int vloc_nr,
+        FPTYPE* forcelc) {};
 };
 
 template <typename FPTYPE, typename Device>
-struct cal_force_ew_sincos_op
-{
-    /// @brief Calculate Ewald forces (sincos loop only)
-    ///
-    /// Input Parameters
-    /// @param ctx - which device this function runs on
-    /// @param nat - number of atoms
-    /// @param npw - number of plane waves
-    /// @param ig_gge0 - index of G=0 vector (-1 if not present)
-    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
-    /// @param tau - atomic positions [nat * 3]
-    /// @param it_facts - precomputed it_fact for each atom [nat]
-    /// @param aux - structure factor related array [npw]
-    ///
-    /// Output Parameters
-    /// @param force - output forces [nat * 3]
-    void operator()(const Device* ctx,
-                    const int& nat,
-                    const int& npw,
-                    const int& ig_gge0,
-                    const FPTYPE* gcar,
-                    const FPTYPE* tau,
-                    const FPTYPE* it_facts,
-                    const std::complex<FPTYPE>* aux,
-                    FPTYPE* force);
+struct cal_force_ew_op{
+    void operator()(
+        const int nat,
+        const int npw,
+        const int ig_gge0,
+        const int* iat2it,
+        const FPTYPE* gcar,
+        const FPTYPE* tau,
+        const FPTYPE* it_fact,
+        const std::complex<FPTYPE>* aux,
+        FPTYPE* forceion
+    ) {};
 };
-
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 template <typename FPTYPE>
 struct cal_vkb1_nl_op<FPTYPE, base_device::DEVICE_GPU>
@@ -306,35 +278,6 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     FPTYPE* force);
 };
 
-template <typename FPTYPE>
-struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>
-{
-    void operator()(const base_device::DEVICE_GPU* ctx,
-                    const int& nat,
-                    const int& npw,
-                    const int& ntype,
-                    const FPTYPE* gcar,
-                    const FPTYPE* tau,
-                    const FPTYPE* vloc_per_type,
-                    const std::complex<FPTYPE>* aux,
-                    const FPTYPE& scale_factor,
-                    FPTYPE* force);
-};
-
-template <typename FPTYPE>
-struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>
-{
-    void operator()(const base_device::DEVICE_GPU* ctx,
-                    const int& nat,
-                    const int& npw,
-                    const int& ig_gge0,
-                    const FPTYPE* gcar,
-                    const FPTYPE* tau,
-                    const FPTYPE* it_facts,
-                    const std::complex<FPTYPE>* aux,
-                    FPTYPE* force);
-};
-
 /**
  * @brief revert the vkb values for force_nl calculation
  */
@@ -362,6 +305,36 @@ void saveVkbValues(const int* gcar_zero_ptrs,
                    int ipol,
                    int npwx);
 
+template <typename FPTYPE>
+struct cal_force_loc_op<FPTYPE, base_device::DEVICE_GPU>{
+    void operator()(
+        const int nat,
+        const int npw,
+        const FPTYPE tpiba_omega,
+        const int* iat2it,
+        const int* ig2igg,
+        const FPTYPE* gcar,
+        const FPTYPE* tau,
+        const std::complex<FPTYPE>* aux,
+        const FPTYPE* vloc,
+        const int vloc_nr,
+        FPTYPE* forcelc);
+};
+
+template <typename FPTYPE>
+struct cal_force_ew_op<FPTYPE, base_device::DEVICE_GPU>{
+    void operator()(
+        const int nat,
+        const int npw,
+        const int ig_gge0,
+        const int* iat2it,
+        const FPTYPE* gcar,
+        const FPTYPE* tau,
+        const FPTYPE* it_fact,
+        const std::complex<FPTYPE>* aux,
+        FPTYPE* forceion
+    );
+};
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 } // namespace hamilt
-#endif // W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_HAMILT_PW_HAMILT_PWDFT_KERNELS_FORCE_OP_H
\ No newline at end of file
+#endif // W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_source_pw_HAMILT_PWDFT_KERNELS_FORCE_OP_H
\ No newline at end of file