From 44b21367652663ba1496a76a180b9af66a2f49ba Mon Sep 17 00:00:00 2001
From: dyzheng <zhengdy@bjaisi.com>
Date: Wed, 13 Aug 2025 18:30:48 +0800
Subject: [PATCH 1/9] Fixed the bug in memory statistics

---
 source/module_hamilt_lcao/module_gint/gint.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/source/module_hamilt_lcao/module_gint/gint.cpp b/source/module_hamilt_lcao/module_gint/gint.cpp
index 6ee4c238d1..1222bb82ed 100644
--- a/source/module_hamilt_lcao/module_gint/gint.cpp
+++ b/source/module_hamilt_lcao/module_gint/gint.cpp
@@ -199,6 +199,8 @@ void Gint::initialize_pvpR(const UnitCell& ucell_in, const Grid_Driver* gd, cons
     } else {
         this->hRGintCd->insert_ijrs(this->gridt->get_ijr_info(), ucell_in, npol);
         this->hRGintCd->allocate(nullptr, true);
+        ModuleBase::Memory::record("Gint::hRGintCd",
+                                   this->hRGintCd->get_memory_size());
         for(int is = 0; is < nspin; is++) {
             this->hRGint_tmp[is]->insert_ijrs(this->gridt->get_ijr_info(), ucell_in);
             this->DMRGint[is]->insert_ijrs(this->gridt->get_ijr_info(), ucell_in);
@@ -208,8 +210,7 @@ void Gint::initialize_pvpR(const UnitCell& ucell_in, const Grid_Driver* gd, cons
         ModuleBase::Memory::record("Gint::hRGint_tmp",
                                        this->hRGint_tmp[0]->get_memory_size()*nspin);
         ModuleBase::Memory::record("Gint::DMRGint",
-                                       this->DMRGint[0]->get_memory_size()
-                                           * this->DMRGint.size()*nspin);
+                                       this->DMRGint[0]->get_memory_size()*nspin);
 #ifdef __MPI
         this->DMRGint_full->insert_ijrs(this->gridt->get_ijr_info(), ucell_in, npol);
         this->DMRGint_full->allocate(nullptr, true);

From 07a9a23530437959c6a2e78a3d85492905cd3f8d Mon Sep 17 00:00:00 2001
From: hn <3022939753@qq.com>
Date: Thu, 21 Aug 2025 20:19:32 +0800
Subject: [PATCH 2/9] delete tem Hcontainer to reduce memory usage

---
 .../module_hamilt_lcao/module_gint/gint.cpp   | 105 +++++++++++-------
 source/module_hamilt_lcao/module_gint/gint.h  |   2 +-
 source/module_lr/utils/gint_move.hpp          |   4 +-
 3 files changed, 67 insertions(+), 44 deletions(-)

diff --git a/source/module_hamilt_lcao/module_gint/gint.cpp b/source/module_hamilt_lcao/module_gint/gint.cpp
index 1222bb82ed..6690446bed 100644
--- a/source/module_hamilt_lcao/module_gint/gint.cpp
+++ b/source/module_hamilt_lcao/module_gint/gint.cpp
@@ -33,7 +33,7 @@ Gint::~Gint() {
         delete this->hRGint_tmp[is];
     }
 #ifdef __MPI
-    delete this->DMRGint_full;
+    delete this->DM2D_tmp;
 #endif
 }
 
@@ -171,10 +171,9 @@ void Gint::initialize_pvpR(const UnitCell& ucell_in, const Grid_Driver* gd, cons
             this->hRGint_tmp[is] = new hamilt::HContainer<double>(ucell_in.nat);
         }
 #ifdef __MPI
-        if (this->DMRGint_full != nullptr) {
-            delete this->DMRGint_full;
+        if (this->DM2D_tmp != nullptr) {
+            delete this->DM2D_tmp;
         }
-        this->DMRGint_full = new hamilt::HContainer<double>(ucell_in.nat);
 #endif
     }
 
@@ -211,12 +210,6 @@ void Gint::initialize_pvpR(const UnitCell& ucell_in, const Grid_Driver* gd, cons
                                        this->hRGint_tmp[0]->get_memory_size()*nspin);
         ModuleBase::Memory::record("Gint::DMRGint",
                                        this->DMRGint[0]->get_memory_size()*nspin);
-#ifdef __MPI
-        this->DMRGint_full->insert_ijrs(this->gridt->get_ijr_info(), ucell_in, npol);
-        this->DMRGint_full->allocate(nullptr, true);
-        ModuleBase::Memory::record("Gint::DMRGint_full",
-                                   this->DMRGint_full->get_memory_size());
-#endif
     }
 }
 
@@ -232,10 +225,8 @@ void Gint::reset_DMRGint(const int& nspin)
         {
             for (auto& d : this->DMRGint) { d->allocate(nullptr, false); }
 #ifdef __MPI
-            delete this->DMRGint_full;
-            this->DMRGint_full = new hamilt::HContainer<double>(*this->hRGint);
-            this->DMRGint_full->allocate(nullptr, false);
-#endif
+            delete this->DM2D_tmp;
+#endif       
         }
     }
 }
@@ -263,37 +254,69 @@ void Gint::transfer_DM2DtoGrid(std::vector<hamilt::HContainer<double>*> DM2D) {
     } else // NSPIN=4 case
     {
 #ifdef __MPI
-        hamilt::transferParallels2Serials(*DM2D[0], this->DMRGint_full);
-#else
-        this->DMRGint_full = DM2D[0];
-#endif
-        std::vector<double*> tmp_pointer(4, nullptr);
-        for (int iap = 0; iap < this->DMRGint_full->size_atom_pairs(); ++iap) {
-            auto& ap = this->DMRGint_full->get_atom_pair(iap);
-            int iat1 = ap.get_atom_i();
-            int iat2 = ap.get_atom_j();
-            for (int ir = 0; ir < ap.get_R_size(); ++ir) {
-                const ModuleBase::Vector3<int> r_index = ap.get_R_index(ir);
-                for (int is = 0; is < 4; is++) {
-                    tmp_pointer[is] = this->DMRGint[is]
-                                          ->find_matrix(iat1, iat2, r_index)
-                                          ->get_pointer();
-                }
-                double* data_full = ap.get_pointer(ir);
-                for (int irow = 0; irow < ap.get_row_size(); irow += 2) {
-                    for (int icol = 0; icol < ap.get_col_size(); icol += 2) {
-                        *(tmp_pointer[0])++ = data_full[icol];
-                        *(tmp_pointer[1])++ = data_full[icol + 1];
-                    }
-                    data_full += ap.get_col_size();
-                    for (int icol = 0; icol < ap.get_col_size(); icol += 2) {
-                        *(tmp_pointer[2])++ = data_full[icol];
-                        *(tmp_pointer[3])++ = data_full[icol + 1];
+        int mg = DM2D[0]->get_paraV()->get_global_row_size()/2;
+        int ng = DM2D[0]->get_paraV()->get_global_col_size()/2;
+        int nb = DM2D[0]->get_paraV()->get_block_size()/2;
+        int blacs_ctxt = DM2D[0]->get_paraV()->blacs_ctxt;
+        int *iat2iwt = new int[ucell->nat];
+        for (int iat = 0; iat < ucell->nat; iat++) {
+            iat2iwt[iat] = ucell->get_iat2iwt()[iat]/2;
+        }
+        Parallel_Orbitals *pv = new Parallel_Orbitals();
+        pv->set(mg, ng, nb, blacs_ctxt);
+        pv->set_atomic_trace(iat2iwt, ucell->nat, mg);
+        auto ijr_info = DM2D[0]->get_ijr_info();
+        this-> DM2D_tmp = new hamilt::HContainer<double>(pv, nullptr, &ijr_info);
+        ModuleBase::Memory::record("Gint::DM2D_tmp", this->DM2D_tmp->get_memory_size());
+        for (int is = 0; is < 4; is++){
+            for (int iap = 0; iap < DM2D[0]->size_atom_pairs(); ++iap) {
+                auto& ap = DM2D[0]->get_atom_pair(iap);
+                int iat1 = ap.get_atom_i();
+                int iat2 = ap.get_atom_j();
+                for (int ir = 0; ir < ap.get_R_size(); ++ir) {
+                    const ModuleBase::Vector3<int> r_index = ap.get_R_index(ir);
+                    double* tmp_pointer = this -> DM2D_tmp -> find_matrix(iat1, iat2, r_index)->get_pointer();
+                    double* data_full = ap.get_pointer(ir);
+                    for (int irow = 0; irow < ap.get_row_size(); irow += 2) {
+                        switch (is) {//todo: It can be written more compactly
+                            case 0:
+                                for (int icol = 0; icol < ap.get_col_size(); icol += 2) {
+                                    *(tmp_pointer)++ = data_full[icol];
+                                }
+                                data_full += ap.get_col_size() * 2;
+                                break;
+                            case 1:
+                                for (int icol = 0; icol < ap.get_col_size(); icol += 2) {
+                                    *(tmp_pointer)++ = data_full[icol + 1];
+                                }
+                                data_full += ap.get_col_size() * 2;
+                                break;
+                            case 2:
+                                data_full += ap.get_col_size();
+                                for (int icol = 0; icol < ap.get_col_size(); icol += 2) {
+                                    *(tmp_pointer)++ = data_full[icol];
+                                }
+                                data_full += ap.get_col_size();
+                                break;
+                            case 3:
+                                data_full += ap.get_col_size();
+                                for (int icol = 0; icol < ap.get_col_size(); icol += 2) {
+                                    *(tmp_pointer)++ = data_full[icol + 1];
+                                }
+                                data_full += ap.get_col_size();
+                                break;           
+                        }
                     }
-                    data_full += ap.get_col_size();
                 }
             }
+            hamilt::transferParallels2Serials( *(this->DM2D_tmp), this->DMRGint[is]);
         }
+        // delete iat2iwt;
+        // delete pv;
+        // delete this-> DM2D_tmp;
+#else
+        //this->DMRGint_full = DM2D[0];
+#endif
     }
     ModuleBase::timer::tick("Gint", "transfer_DMR");
 }
\ No newline at end of file
diff --git a/source/module_hamilt_lcao/module_gint/gint.h b/source/module_hamilt_lcao/module_gint/gint.h
index 22d75e650b..ad1c7b72fd 100644
--- a/source/module_hamilt_lcao/module_gint/gint.h
+++ b/source/module_hamilt_lcao/module_gint/gint.h
@@ -264,7 +264,7 @@ class Gint {
     std::vector<hamilt::HContainer<double>*> DMRGint; 
 
     //! tmp tools used in transfer_DM2DtoGrid 
-    hamilt::HContainer<double>* DMRGint_full = nullptr;
+    hamilt::HContainer<double>* DM2D_tmp = nullptr;
 
     std::vector<hamilt::HContainer<double>> pvdpRx_reduced;
     std::vector<hamilt::HContainer<double>> pvdpRy_reduced;
diff --git a/source/module_lr/utils/gint_move.hpp b/source/module_lr/utils/gint_move.hpp
index 6b532073a2..2d610cc77e 100644
--- a/source/module_lr/utils/gint_move.hpp
+++ b/source/module_lr/utils/gint_move.hpp
@@ -60,8 +60,8 @@ Gint& Gint::operator=(Gint&& rhs)
     this->pvdpRz_reduced = std::move(rhs.pvdpRz_reduced);
     this->DMRGint = std::move(rhs.DMRGint);
     this->hRGint_tmp = std::move(rhs.hRGint_tmp);
-    this->DMRGint_full = rhs.DMRGint_full;
-    rhs.DMRGint_full = nullptr;
+    this->DM2D_tmp = rhs.DM2D_tmp;
+    rhs.DM2D_tmp = nullptr;
 
     return *this;
 }

From 0a799ba06acc8be690aebaf99cf98cadef8270e9 Mon Sep 17 00:00:00 2001
From: hn <3022939753@qq.com>
Date: Tue, 26 Aug 2025 11:49:57 +0800
Subject: [PATCH 3/9] delete tem hRGintCd to reduce memory usage

---
 .../module_hamilt_lcao/module_gint/gint.cpp   |  20 +-
 source/module_hamilt_lcao/module_gint/gint.h  |   2 +-
 .../module_gint/gint_k_pvpr.cpp               | 206 +++++++++++++-----
 source/module_lr/utils/gint_move.hpp          |   4 +-
 4 files changed, 161 insertions(+), 71 deletions(-)

diff --git a/source/module_hamilt_lcao/module_gint/gint.cpp b/source/module_hamilt_lcao/module_gint/gint.cpp
index 6690446bed..40d7ef4f57 100644
--- a/source/module_hamilt_lcao/module_gint/gint.cpp
+++ b/source/module_hamilt_lcao/module_gint/gint.cpp
@@ -23,7 +23,7 @@
 Gint::~Gint() {
 
     delete this->hRGint;
-    delete this->hRGintCd;
+    //delete this->hRGintCd;
     // in gamma_only case, DMRGint.size()=0, 
     // in multi-k case, DMRGint.size()=nspin
     for (int is = 0; is < this->DMRGint.size(); is++) {
@@ -155,11 +155,11 @@ void Gint::initialize_pvpR(const UnitCell& ucell_in, const Grid_Driver* gd, cons
         this->hRGint = new hamilt::HContainer<double>(ucell_in.nat);
     } else {
         npol = 2;
-        if (this->hRGintCd != nullptr) {
-            delete this->hRGintCd;
-        }
-        this->hRGintCd
-            = new hamilt::HContainer<std::complex<double>>(ucell_in.nat);
+        // if (this->hRGintCd != nullptr) {
+        //     delete this->hRGintCd;
+        // }
+        // this->hRGintCd
+         //   = new hamilt::HContainer<std::complex<double>>(ucell_in.nat);
         for (int is = 0; is < nspin; is++) {
             if (this->DMRGint[is] != nullptr) {
                 delete this->DMRGint[is];
@@ -196,10 +196,10 @@ void Gint::initialize_pvpR(const UnitCell& ucell_in, const Grid_Driver* gd, cons
                                    this->DMRGint[0]->get_memory_size()
                                        * this->DMRGint.size());
     } else {
-        this->hRGintCd->insert_ijrs(this->gridt->get_ijr_info(), ucell_in, npol);
-        this->hRGintCd->allocate(nullptr, true);
-        ModuleBase::Memory::record("Gint::hRGintCd",
-                                   this->hRGintCd->get_memory_size());
+        // this->hRGintCd->insert_ijrs(this->gridt->get_ijr_info(), ucell_in, npol);
+        // this->hRGintCd->allocate(nullptr, true);
+        // ModuleBase::Memory::record("Gint::hRGintCd",
+        //                            this->hRGintCd->get_memory_size());
         for(int is = 0; is < nspin; is++) {
             this->hRGint_tmp[is]->insert_ijrs(this->gridt->get_ijr_info(), ucell_in);
             this->DMRGint[is]->insert_ijrs(this->gridt->get_ijr_info(), ucell_in);
diff --git a/source/module_hamilt_lcao/module_gint/gint.h b/source/module_hamilt_lcao/module_gint/gint.h
index ad1c7b72fd..1803a4241c 100644
--- a/source/module_hamilt_lcao/module_gint/gint.h
+++ b/source/module_hamilt_lcao/module_gint/gint.h
@@ -258,7 +258,7 @@ class Gint {
     std::vector<hamilt::HContainer<double>*> hRGint_tmp; 
 
     //! stores Hamiltonian in sparse format
-    hamilt::HContainer<std::complex<double>>* hRGintCd = nullptr; 
+    //hamilt::HContainer<std::complex<double>>* hRGintCd = nullptr; 
 
     //! stores DMR in sparse format
     std::vector<hamilt::HContainer<double>*> DMRGint; 
diff --git a/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp b/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp
index 9a6cca47f7..443f940626 100644
--- a/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp
+++ b/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp
@@ -78,86 +78,176 @@ void Gint_k::transfer_pvpR(hamilt::HContainer<std::complex<double>>* hR,
     ModuleBase::TITLE("Gint_k", "transfer_pvpR");
     ModuleBase::timer::tick("Gint_k", "transfer_pvpR");
 
-    this->hRGintCd->set_zero();
+    //this->hRGintCd->set_zero();
     
-    for (int iap = 0; iap < this->hRGintCd->size_atom_pairs(); iap++)
-    {
-        auto* ap = &this->hRGintCd->get_atom_pair(iap);
-        const int iat1 = ap->get_atom_i();
-        const int iat2 = ap->get_atom_j();
-        if (iat1 <= iat2)
-        {
-            hamilt::AtomPair<std::complex<double>>* upper_ap = ap;
-            hamilt::AtomPair<std::complex<double>>* lower_ap = this->hRGintCd->find_pair(iat2, iat1);
-            const hamilt::AtomPair<double>* ap_nspin_0 = this->hRGint_tmp[0]->find_pair(iat1, iat2);
-            const hamilt::AtomPair<double>* ap_nspin_3 = this->hRGint_tmp[3]->find_pair(iat1, iat2);
-            for (int ir = 0; ir < upper_ap->get_R_size(); ir++)
-            {   
-                const auto R_index = upper_ap->get_R_index(ir);
-                auto upper_mat = upper_ap->find_matrix(R_index);
-                auto mat_nspin_0 = ap_nspin_0->find_matrix(R_index);
-                auto mat_nspin_3 = ap_nspin_3->find_matrix(R_index);
+#ifdef __MPI
+    int mg = hR->get_paraV()->get_global_row_size()/2;
+    int ng = hR->get_paraV()->get_global_col_size()/2;
+    int nb = hR->get_paraV()->get_block_size()/2;
+    int blacs_ctxt = hR->get_paraV()->blacs_ctxt;
+    int *iat2iwt = new int[ucell_in->nat];
+    for (int iat = 0; iat < ucell_in->nat; iat++) {
+        iat2iwt[iat] = ucell_in->get_iat2iwt()[iat]/2;
+    }
+    Parallel_Orbitals *pv = new Parallel_Orbitals();
+    pv->set(mg, ng, nb, blacs_ctxt);
+    pv->set_atomic_trace(iat2iwt, ucell_in->nat, mg);
+    auto ijr_info = hR->get_ijr_info();
+    
+
+    hamilt::HContainer<double>* hR_tmp = new hamilt::HContainer<double>(pv, nullptr, &ijr_info);
+    ModuleBase::Memory::record("Gint::hRGintCd", hR_tmp->get_memory_size());
+    // std::cout<<"test1"<<std::endl;
+
+    // std::cout<<hRGint_tmp[0]->size_atom_pairs()<<std::endl;
+    // std::cout<<hR_tmp->size_atom_pairs()<<std::endl;
+    // for(int i =0; i < hRGint_tmp[0]->size_atom_pairs(); i++){
+    //     std::cout<<"hRGint_tmp"<<hRGint_tmp[0]->get_atom_pair(i).get_row_size()<<' '<<hRGint_tmp[0]->get_atom_pair(i).get_row_size()<<std::endl;
+    //     std::cout<<"hR_tmp"<<hR_tmp->get_atom_pair(i).get_row_size()<<' '<<hR_tmp->get_atom_pair(i).get_row_size()<<std::endl;
+    //     std::cout<<"pv1:" << pv->get_indexes_col(0).size()<< ' '<<pv->get_indexes_row(0).size()<<std::endl;
+    //     std::cout<<"pv2:" << pv->get_indexes_col().size()<< ' '<<pv->get_indexes_row().size()<<std::endl;
+    // }
+
 
-                // The row size and the col size of upper_matrix is double that of matrix_nspin_0
-                for (int irow = 0; irow < mat_nspin_0->get_row_size(); ++irow)
+
+    for (int is = 0; is < 4; is++){
+        hR_tmp->set_zero();
+        //std::cout<<"is: "<<is<<std::endl;
+        hamilt::transferSerials2Parallels( *(this->hRGint_tmp[is]), hR_tmp);
+        for (int iap = 0; iap < hR->size_atom_pairs(); iap++)
+        {
+            //std::cout<<"iap: "<<iap<<std::endl;
+            auto* ap = &hR->get_atom_pair(iap);
+            const int iat1 = ap->get_atom_i();
+            const int iat2 = ap->get_atom_j();
+            const hamilt::AtomPair<double>* ap_nspin = nullptr;
+            if (iat1 <= iat2)
+            {
+                hamilt::AtomPair<std::complex<double>>* upper_ap = ap;
+                hamilt::AtomPair<std::complex<double>>* lower_ap = hR->find_pair(iat2, iat1);
+                switch (is)
                 {
-                    for (int icol = 0; icol < mat_nspin_0->get_col_size(); ++icol)
-                    {
-                        upper_mat->get_value(2*irow, 2*icol) = mat_nspin_0->get_value(irow, icol) + mat_nspin_3->get_value(irow, icol);
-                        upper_mat->get_value(2*irow+1, 2*icol+1) = mat_nspin_0->get_value(irow, icol) - mat_nspin_3->get_value(irow, icol);
-                    }
+                case 0:
+                    ap_nspin = hR_tmp->find_pair(iat1, iat2);
+                    break;
+                case 3:
+                    ap_nspin = hR_tmp->find_pair(iat1, iat2);
+                    break;
                 }
+                if(ap_nspin == nullptr) break;
+                //const hamilt::AtomPair<double>* ap_nspin_0 = this->hR_tmp[0]->find_pair(iat1, iat2);
+                //const hamilt::AtomPair<double>* ap_nspin_3 = this->hRGint_tmp[3]->find_pair(iat1, iat2);
+                for (int ir = 0; ir < upper_ap->get_R_size(); ir++)
+                {   
+                    const auto R_index = upper_ap->get_R_index(ir);
+                    auto upper_mat = upper_ap->find_matrix(R_index);
+                    //auto mat_nspin_0 = ap_nspin_0->find_matrix(R_index);
+                    //auto mat_nspin_3 = ap_nspin_3->find_matrix(R_index);
+                    auto mat_nspin = ap_nspin->find_matrix(R_index);
 
-                if (PARAM.globalv.domag)
-                {
-                    const hamilt::AtomPair<double>* ap_nspin_1 = this->hRGint_tmp[1]->find_pair(iat1, iat2);
-                    const hamilt::AtomPair<double>* ap_nspin_2 = this->hRGint_tmp[2]->find_pair(iat1, iat2);
-                    const auto mat_nspin_1 = ap_nspin_1->find_matrix(R_index);
-                    const auto mat_nspin_2 = ap_nspin_2->find_matrix(R_index);
-                    for (int irow = 0; irow < mat_nspin_1->get_row_size(); ++irow)
+                    // The row size and the col size of upper_matrix is double that of matrix_nspin_0
+                    for (int irow = 0; irow < mat_nspin->get_row_size(); ++irow)
                     {
-                        for (int icol = 0; icol < mat_nspin_1->get_col_size(); ++icol)
+                        for (int icol = 0; icol < mat_nspin->get_col_size(); ++icol)
                         {
-                            upper_mat->get_value(2*irow, 2*icol+1) = mat_nspin_1->get_value(irow, icol) +  std::complex<double>(0.0, 1.0) * mat_nspin_2->get_value(irow, icol);
-                            upper_mat->get_value(2*irow+1, 2*icol) = mat_nspin_1->get_value(irow, icol) -  std::complex<double>(0.0, 1.0) * mat_nspin_2->get_value(irow, icol);
+                            //upper_mat->get_value(2*irow, 2*icol) = mat_nspin_0->get_value(irow, icol) + mat_nspin_3->get_value(irow, icol);
+                            //upper_mat->get_value(2*irow+1, 2*icol+1) = mat_nspin_0->get_value(irow, icol) - mat_nspin_3->get_value(irow, icol);
+                            switch (is)
+                            {
+                            case 0:
+                                upper_mat->get_value(2*irow, 2*icol) = mat_nspin->get_value(irow, icol);
+                                upper_mat->get_value(2*irow+1, 2*icol+1) = mat_nspin->get_value(irow, icol);
+                                break;
+                            case 3:
+                                upper_mat->get_value(2*irow, 2*icol) += mat_nspin->get_value(irow, icol);
+                                upper_mat->get_value(2*irow+1, 2*icol+1) -= mat_nspin->get_value(irow, icol);
+                                break;
+                            }
                         }
                     }
-                }
 
-                // fill the lower triangle matrix
-                if (iat1 < iat2)
-                {
-                    auto lower_mat = lower_ap->find_matrix(-R_index);
-                    for (int irow = 0; irow < upper_mat->get_row_size(); ++irow)
+                    if (PARAM.globalv.domag)
                     {
-                        for (int icol = 0; icol < upper_mat->get_col_size(); ++icol)
+                        const hamilt::AtomPair<double>* ap_nspin = nullptr;
+                        switch (is)
+                        {
+                        case 1:
+                            ap_nspin = hR_tmp->find_pair(iat1, iat2);
+                            break;
+                        case 2:
+                            ap_nspin = hR_tmp->find_pair(iat1, iat2);
+                            break;
+                        }
+                        //const hamilt::AtomPair<double>* ap_nspin_1 = this->hRGint_tmp[1]->find_pair(iat1, iat2);
+                        //const hamilt::AtomPair<double>* ap_nspin_2 = this->hRGint_tmp[2]->find_pair(iat1, iat2);
+                        //const auto mat_nspin_1 = ap_nspin_1->find_matrix(R_index);
+                        //const auto mat_nspin_2 = ap_nspin_2->find_matrix(R_index);
+                        const auto mat_nspin = ap_nspin->find_matrix(R_index);
+                        for (int irow = 0; irow < mat_nspin->get_row_size(); ++irow)
                         {
-                            lower_mat->get_value(icol, irow) = conj(upper_mat->get_value(irow, icol));
+                            for (int icol = 0; icol < mat_nspin->get_col_size(); ++icol)
+                            {
+                                switch(is)
+                                {
+                                    case 1:
+                                        upper_mat->get_value(2*irow, 2*icol+1) = mat_nspin->get_value(irow, icol);
+                                        upper_mat->get_value(2*irow+1, 2*icol) = mat_nspin->get_value(irow, icol);
+                                        break;
+                                    case 2:
+                                        upper_mat->get_value(2*irow, 2*icol+1) += std::complex<double>(0.0, 1.0) * mat_nspin->get_value(irow, icol);
+                                        upper_mat->get_value(2*irow+1, 2*icol) -= std::complex<double>(0.0, 1.0) * mat_nspin->get_value(irow, icol);
+                                        break;
+                                }
+                                //upper_mat->get_value(2*irow, 2*icol+1) = mat_nspin->get_value(irow, icol) +  std::complex<double>(0.0, 1.0) * mat_nspin_2->get_value(irow, icol);
+                                //upper_mat->get_value(2*irow+1, 2*icol) = mat_nspin->get_value(irow, icol) -  std::complex<double>(0.0, 1.0) * mat_nspin_2->get_value(irow, icol);
+                            }
                         }
                     }
+                    
+                    // fill the lower triangle matrix
+                    if(is == 3){
+                        if (iat1 < iat2)
+                        {
+                            auto lower_mat = lower_ap->find_matrix(-R_index);
+                            for (int irow = 0; irow < upper_mat->get_row_size(); ++irow)
+                            {
+                                for (int icol = 0; icol < upper_mat->get_col_size(); ++icol)
+                                {
+                                    lower_mat->get_value(icol, irow) = conj(upper_mat->get_value(irow, icol));
+                                }
+                            }
+                        }
+                    }
+                        // std::cout<<"end"<<std::endl;
                 }
             }
         }
+        
     }
+    delete[] iat2iwt;
+    delete pv;
+    delete hR_tmp;
+#else
+
+#endif
 
     // ===================================
     // transfer HR from Gint to Veff<OperatorLCAO<std::complex<double>, std::complex<double>>>
     // ===================================
-#ifdef __MPI
-    int size;
-    MPI_Comm_size(MPI_COMM_WORLD, &size);
-    if (size == 1)
-    {
-        hR->add(*this->hRGintCd);
-    }
-    else
-    {
-        hamilt::transferSerials2Parallels<std::complex<double>>(*this->hRGintCd, hR);
-    }
-#else
-    hR->add(*this->hRGintCd);
-#endif
-
+// #ifdef __MPI
+//     int size;
+//     MPI_Comm_size(MPI_COMM_WORLD, &size);
+//     if (size == 1)
+//     {
+//         hR->add(*this->hRGintCd);
+//     }
+//     else
+//     {
+//         hamilt::transferSerials2Parallels<std::complex<double>>(*this->hRGintCd, hR);
+//     }
+// #else
+//     hR->add(*this->hRGintCd);
+// #endif
     ModuleBase::timer::tick("Gint_k", "transfer_pvpR");
     return;
 }
diff --git a/source/module_lr/utils/gint_move.hpp b/source/module_lr/utils/gint_move.hpp
index 2d610cc77e..c3ee35f566 100644
--- a/source/module_lr/utils/gint_move.hpp
+++ b/source/module_lr/utils/gint_move.hpp
@@ -45,8 +45,8 @@ Gint& Gint::operator=(Gint&& rhs)
     // move hR after refactor
     this->hRGint = rhs.hRGint;
     rhs.hRGint = nullptr;
-    this->hRGintCd = rhs.hRGintCd;
-    rhs.hRGintCd = nullptr;
+    // this->hRGintCd = rhs.hRGintCd;
+    // rhs.hRGintCd = nullptr;
     for (int i = 0; i < this->DMRGint.size(); i++)
     {
         delete this->DMRGint[i];

From 4640fbfba3973407ceef302a123d76e8e8262a47 Mon Sep 17 00:00:00 2001
From: hn <3022939753@qq.com>
Date: Sat, 30 Aug 2025 12:28:30 +0800
Subject: [PATCH 4/9] fix parallel bug

---
 .../module_hamilt_lcao/module_gint/gint.cpp   |  10 +-
 source/module_hamilt_lcao/module_gint/gint.h  |   2 +-
 .../module_gint/gint_k_pvpr.cpp               | 170 ++++++++----------
 source/module_lr/utils/gint_move.hpp          |   4 +-
 4 files changed, 82 insertions(+), 104 deletions(-)

diff --git a/source/module_hamilt_lcao/module_gint/gint.cpp b/source/module_hamilt_lcao/module_gint/gint.cpp
index 40d7ef4f57..f1a8805f97 100644
--- a/source/module_hamilt_lcao/module_gint/gint.cpp
+++ b/source/module_hamilt_lcao/module_gint/gint.cpp
@@ -23,7 +23,7 @@
 Gint::~Gint() {
 
     delete this->hRGint;
-    //delete this->hRGintCd;
+    delete this->hR_tmp;
     // in gamma_only case, DMRGint.size()=0, 
     // in multi-k case, DMRGint.size()=nspin
     for (int is = 0; is < this->DMRGint.size(); is++) {
@@ -155,11 +155,9 @@ void Gint::initialize_pvpR(const UnitCell& ucell_in, const Grid_Driver* gd, cons
         this->hRGint = new hamilt::HContainer<double>(ucell_in.nat);
     } else {
         npol = 2;
-        // if (this->hRGintCd != nullptr) {
-        //     delete this->hRGintCd;
-        // }
-        // this->hRGintCd
-         //   = new hamilt::HContainer<std::complex<double>>(ucell_in.nat);
+        if (this->hR_tmp != nullptr) {
+            delete this->hR_tmp;
+        }
         for (int is = 0; is < nspin; is++) {
             if (this->DMRGint[is] != nullptr) {
                 delete this->DMRGint[is];
diff --git a/source/module_hamilt_lcao/module_gint/gint.h b/source/module_hamilt_lcao/module_gint/gint.h
index 1803a4241c..d8090df191 100644
--- a/source/module_hamilt_lcao/module_gint/gint.h
+++ b/source/module_hamilt_lcao/module_gint/gint.h
@@ -258,7 +258,7 @@ class Gint {
     std::vector<hamilt::HContainer<double>*> hRGint_tmp; 
 
     //! stores Hamiltonian in sparse format
-    //hamilt::HContainer<std::complex<double>>* hRGintCd = nullptr; 
+    hamilt::HContainer<std::complex<double>>* hR_tmp = nullptr; 
 
     //! stores DMR in sparse format
     std::vector<hamilt::HContainer<double>*> DMRGint; 
diff --git a/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp b/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp
index 443f940626..f0a3bc6518 100644
--- a/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp
+++ b/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp
@@ -78,12 +78,10 @@ void Gint_k::transfer_pvpR(hamilt::HContainer<std::complex<double>>* hR,
     ModuleBase::TITLE("Gint_k", "transfer_pvpR");
     ModuleBase::timer::tick("Gint_k", "transfer_pvpR");
 
-    //this->hRGintCd->set_zero();
-    
-#ifdef __MPI
     int mg = hR->get_paraV()->get_global_row_size()/2;
     int ng = hR->get_paraV()->get_global_col_size()/2;
     int nb = hR->get_paraV()->get_block_size()/2;
+#ifdef __MPI
     int blacs_ctxt = hR->get_paraV()->blacs_ctxt;
     int *iat2iwt = new int[ucell_in->nat];
     for (int iat = 0; iat < ucell_in->nat; iat++) {
@@ -93,119 +91,64 @@ void Gint_k::transfer_pvpR(hamilt::HContainer<std::complex<double>>* hR,
     pv->set(mg, ng, nb, blacs_ctxt);
     pv->set_atomic_trace(iat2iwt, ucell_in->nat, mg);
     auto ijr_info = hR->get_ijr_info();
-    
-
-    hamilt::HContainer<double>* hR_tmp = new hamilt::HContainer<double>(pv, nullptr, &ijr_info);
-    ModuleBase::Memory::record("Gint::hRGintCd", hR_tmp->get_memory_size());
-    // std::cout<<"test1"<<std::endl;
-
-    // std::cout<<hRGint_tmp[0]->size_atom_pairs()<<std::endl;
-    // std::cout<<hR_tmp->size_atom_pairs()<<std::endl;
-    // for(int i =0; i < hRGint_tmp[0]->size_atom_pairs(); i++){
-    //     std::cout<<"hRGint_tmp"<<hRGint_tmp[0]->get_atom_pair(i).get_row_size()<<' '<<hRGint_tmp[0]->get_atom_pair(i).get_row_size()<<std::endl;
-    //     std::cout<<"hR_tmp"<<hR_tmp->get_atom_pair(i).get_row_size()<<' '<<hR_tmp->get_atom_pair(i).get_row_size()<<std::endl;
-    //     std::cout<<"pv1:" << pv->get_indexes_col(0).size()<< ' '<<pv->get_indexes_row(0).size()<<std::endl;
-    //     std::cout<<"pv2:" << pv->get_indexes_col().size()<< ' '<<pv->get_indexes_row().size()<<std::endl;
-    // }
-
 
+    this->hR_tmp = new hamilt::HContainer<std::complex<double>>(pv, nullptr, &ijr_info);
+    ModuleBase::Memory::record("Gint::hRGintCd", this->hR_tmp->get_memory_size());
 
+    //0,3;1,2;1,2;0,3
+    std::vector<int> first = {0, 1, 1, 0};
+    std::vector<int> second= {3, 2, 2, 3};
     for (int is = 0; is < 4; is++){
-        hR_tmp->set_zero();
-        //std::cout<<"is: "<<is<<std::endl;
-        hamilt::transferSerials2Parallels( *(this->hRGint_tmp[is]), hR_tmp);
-        for (int iap = 0; iap < hR->size_atom_pairs(); iap++)
+        this->hR_tmp->set_zero();
+        hamilt::HContainer<std::complex<double>>* hRGint_tmpCd = new hamilt::HContainer<std::complex<double>>(this->ucell->nat);
+        hRGint_tmpCd->insert_ijrs(this->gridt->get_ijr_info(), *(this->ucell));
+        hRGint_tmpCd->allocate(nullptr, true);
+        hRGint_tmpCd->set_zero();
+        for (int iap = 0; iap < hRGint_tmpCd->size_atom_pairs(); iap++)
         {
             //std::cout<<"iap: "<<iap<<std::endl;
-            auto* ap = &hR->get_atom_pair(iap);
+            auto* ap = &hRGint_tmpCd->get_atom_pair(iap);
             const int iat1 = ap->get_atom_i();
             const int iat2 = ap->get_atom_j();
-            const hamilt::AtomPair<double>* ap_nspin = nullptr;
             if (iat1 <= iat2)
             {
                 hamilt::AtomPair<std::complex<double>>* upper_ap = ap;
-                hamilt::AtomPair<std::complex<double>>* lower_ap = hR->find_pair(iat2, iat1);
-                switch (is)
-                {
-                case 0:
-                    ap_nspin = hR_tmp->find_pair(iat1, iat2);
-                    break;
-                case 3:
-                    ap_nspin = hR_tmp->find_pair(iat1, iat2);
-                    break;
-                }
-                if(ap_nspin == nullptr) break;
-                //const hamilt::AtomPair<double>* ap_nspin_0 = this->hR_tmp[0]->find_pair(iat1, iat2);
-                //const hamilt::AtomPair<double>* ap_nspin_3 = this->hRGint_tmp[3]->find_pair(iat1, iat2);
+                hamilt::AtomPair<std::complex<double>>* lower_ap = hRGint_tmpCd->find_pair(iat2, iat1);
+                const hamilt::AtomPair<double>* ap_nspin1 = this->hRGint_tmp[first[is]] ->find_pair(iat1, iat2);
+                const hamilt::AtomPair<double>* ap_nspin2 = this->hRGint_tmp[second[is]] ->find_pair(iat1, iat2);
                 for (int ir = 0; ir < upper_ap->get_R_size(); ir++)
                 {   
+                    //std::cout<<"ir"<<ir<<std::endl;
                     const auto R_index = upper_ap->get_R_index(ir);
                     auto upper_mat = upper_ap->find_matrix(R_index);
-                    //auto mat_nspin_0 = ap_nspin_0->find_matrix(R_index);
-                    //auto mat_nspin_3 = ap_nspin_3->find_matrix(R_index);
-                    auto mat_nspin = ap_nspin->find_matrix(R_index);
-
+                    auto mat_nspin1 = ap_nspin1->find_matrix(R_index);
+                    auto mat_nspin2 = ap_nspin2->find_matrix(R_index);
                     // The row size and the col size of upper_matrix is double that of matrix_nspin_0
-                    for (int irow = 0; irow < mat_nspin->get_row_size(); ++irow)
+                    for (int irow = 0; irow < mat_nspin1->get_row_size(); ++irow)
                     {
-                        for (int icol = 0; icol < mat_nspin->get_col_size(); ++icol)
+                        for (int icol = 0; icol < mat_nspin1->get_col_size(); ++icol)
                         {
-                            //upper_mat->get_value(2*irow, 2*icol) = mat_nspin_0->get_value(irow, icol) + mat_nspin_3->get_value(irow, icol);
-                            //upper_mat->get_value(2*irow+1, 2*icol+1) = mat_nspin_0->get_value(irow, icol) - mat_nspin_3->get_value(irow, icol);
                             switch (is)
                             {
                             case 0:
-                                upper_mat->get_value(2*irow, 2*icol) = mat_nspin->get_value(irow, icol);
-                                upper_mat->get_value(2*irow+1, 2*icol+1) = mat_nspin->get_value(irow, icol);
+                                upper_mat->get_value(irow, icol) = mat_nspin1->get_value(irow, icol) + mat_nspin2->get_value(irow, icol);
+                                break;
+                            case 1:
+                                upper_mat->get_value(irow, icol) = mat_nspin1->get_value(irow, icol) + 
+                                                                        std::complex<double>(0.0, 1.0) * mat_nspin2->get_value(irow, icol);
+                                break;
+                            case 2:
+                                upper_mat->get_value(irow, icol) = mat_nspin1->get_value(irow, icol) - 
+                                                                        std::complex<double>(0.0, 1.0) * mat_nspin2->get_value(irow, icol);
                                 break;
                             case 3:
-                                upper_mat->get_value(2*irow, 2*icol) += mat_nspin->get_value(irow, icol);
-                                upper_mat->get_value(2*irow+1, 2*icol+1) -= mat_nspin->get_value(irow, icol);
+                                upper_mat->get_value(irow, icol) = mat_nspin1->get_value(irow, icol) - mat_nspin2->get_value(irow, icol);
                                 break;
                             }
                         }
                     }
-
-                    if (PARAM.globalv.domag)
-                    {
-                        const hamilt::AtomPair<double>* ap_nspin = nullptr;
-                        switch (is)
-                        {
-                        case 1:
-                            ap_nspin = hR_tmp->find_pair(iat1, iat2);
-                            break;
-                        case 2:
-                            ap_nspin = hR_tmp->find_pair(iat1, iat2);
-                            break;
-                        }
-                        //const hamilt::AtomPair<double>* ap_nspin_1 = this->hRGint_tmp[1]->find_pair(iat1, iat2);
-                        //const hamilt::AtomPair<double>* ap_nspin_2 = this->hRGint_tmp[2]->find_pair(iat1, iat2);
-                        //const auto mat_nspin_1 = ap_nspin_1->find_matrix(R_index);
-                        //const auto mat_nspin_2 = ap_nspin_2->find_matrix(R_index);
-                        const auto mat_nspin = ap_nspin->find_matrix(R_index);
-                        for (int irow = 0; irow < mat_nspin->get_row_size(); ++irow)
-                        {
-                            for (int icol = 0; icol < mat_nspin->get_col_size(); ++icol)
-                            {
-                                switch(is)
-                                {
-                                    case 1:
-                                        upper_mat->get_value(2*irow, 2*icol+1) = mat_nspin->get_value(irow, icol);
-                                        upper_mat->get_value(2*irow+1, 2*icol) = mat_nspin->get_value(irow, icol);
-                                        break;
-                                    case 2:
-                                        upper_mat->get_value(2*irow, 2*icol+1) += std::complex<double>(0.0, 1.0) * mat_nspin->get_value(irow, icol);
-                                        upper_mat->get_value(2*irow+1, 2*icol) -= std::complex<double>(0.0, 1.0) * mat_nspin->get_value(irow, icol);
-                                        break;
-                                }
-                                //upper_mat->get_value(2*irow, 2*icol+1) = mat_nspin->get_value(irow, icol) +  std::complex<double>(0.0, 1.0) * mat_nspin_2->get_value(irow, icol);
-                                //upper_mat->get_value(2*irow+1, 2*icol) = mat_nspin->get_value(irow, icol) -  std::complex<double>(0.0, 1.0) * mat_nspin_2->get_value(irow, icol);
-                            }
-                        }
-                    }
-                    
-                    // fill the lower triangle matrix
-                    if(is == 3){
+                    //fill the lower triangle matrix
+                    if (PARAM.globalv.domag){
                         if (iat1 < iat2)
                         {
                             auto lower_mat = lower_ap->find_matrix(-R_index);
@@ -218,15 +161,52 @@ void Gint_k::transfer_pvpR(hamilt::HContainer<std::complex<double>>* hR,
                             }
                         }
                     }
-                        // std::cout<<"end"<<std::endl;
                 }
             }
         }
-        
+
+        //std::cout<<"success"<<std::endl;
+
+        hamilt::transferSerials2Parallels( *hRGint_tmpCd, this->hR_tmp);
+        for (int iap = 0; iap < hR->size_atom_pairs(); iap++)
+        {
+            //std::cout<<"iap: "<<iap<<std::endl;
+            auto* ap = &hR->get_atom_pair(iap);
+            const int iat1 = ap->get_atom_i();
+            const int iat2 = ap->get_atom_j();
+            auto* ap_nspin = this->hR_tmp ->find_pair(iat1, iat2);
+            for (int ir = 0; ir < ap->get_R_size(); ir++)
+            {   
+                const auto R_index = ap->get_R_index(ir);
+                auto upper_mat = ap->find_matrix(R_index);
+                auto mat_nspin = ap_nspin->find_matrix(R_index);
+
+                // The row size and the col size of upper_matrix is double that of matrix_nspin_0
+                for (int irow = 0; irow < mat_nspin->get_row_size(); ++irow)
+                {
+                    for (int icol = 0; icol < mat_nspin->get_col_size(); ++icol)
+                    {
+                        switch (is)
+                        {
+                        case 0:
+                            upper_mat->get_value(2*irow, 2*icol) += mat_nspin->get_value(irow, icol);
+                            break;
+                        case 1:
+                            upper_mat->get_value(2*irow, 2*icol+1) += mat_nspin->get_value(irow, icol);
+                            break;
+                        case 2:
+                            upper_mat->get_value(2*irow+1, 2*icol) += mat_nspin->get_value(irow, icol);
+                            break;
+                        case 3:
+                            upper_mat->get_value(2*irow+1, 2*icol+1) += mat_nspin->get_value(irow, icol);
+                            break;
+                        }
+                    }
+                }
+            }
+        }
     }
     delete[] iat2iwt;
-    delete pv;
-    delete hR_tmp;
 #else
 
 #endif
@@ -246,7 +226,7 @@ void Gint_k::transfer_pvpR(hamilt::HContainer<std::complex<double>>* hR,
 //         hamilt::transferSerials2Parallels<std::complex<double>>(*this->hRGintCd, hR);
 //     }
 // #else
-//     hR->add(*this->hRGintCd);
+    //  hR->add(*this->hRGintCd);
 // #endif
     ModuleBase::timer::tick("Gint_k", "transfer_pvpR");
     return;
diff --git a/source/module_lr/utils/gint_move.hpp b/source/module_lr/utils/gint_move.hpp
index c3ee35f566..8062ff895d 100644
--- a/source/module_lr/utils/gint_move.hpp
+++ b/source/module_lr/utils/gint_move.hpp
@@ -45,8 +45,8 @@ Gint& Gint::operator=(Gint&& rhs)
     // move hR after refactor
     this->hRGint = rhs.hRGint;
     rhs.hRGint = nullptr;
-    // this->hRGintCd = rhs.hRGintCd;
-    // rhs.hRGintCd = nullptr;
+    this->hR_tmp = rhs.hR_tmp;
+    rhs.hR_tmp = nullptr;
     for (int i = 0; i < this->DMRGint.size(); i++)
     {
         delete this->DMRGint[i];

From e8afb27681284eda291dd444f9d9c7c5a82ff870 Mon Sep 17 00:00:00 2001
From: hn <3022939753@qq.com>
Date: Sun, 31 Aug 2025 15:29:22 +0800
Subject: [PATCH 5/9] Simplify the computational code

---
 .../module_hamilt_lcao/module_gint/gint.cpp   | 42 ++++---------
 .../module_gint/gint_k_pvpr.cpp               | 62 ++++---------------
 2 files changed, 23 insertions(+), 81 deletions(-)

diff --git a/source/module_hamilt_lcao/module_gint/gint.cpp b/source/module_hamilt_lcao/module_gint/gint.cpp
index f1a8805f97..dee289c12d 100644
--- a/source/module_hamilt_lcao/module_gint/gint.cpp
+++ b/source/module_hamilt_lcao/module_gint/gint.cpp
@@ -252,6 +252,10 @@ void Gint::transfer_DM2DtoGrid(std::vector<hamilt::HContainer<double>*> DM2D) {
     } else // NSPIN=4 case
     {
 #ifdef __MPI
+        // is=0:↑↑, 1:↑↓, 2:↓↑, 3:↓↓
+        const int row_set[4] = {0, 0, 1, 1};
+        const int col_set[4] = {0, 1, 0, 1};
+
         int mg = DM2D[0]->get_paraV()->get_global_row_size()/2;
         int ng = DM2D[0]->get_paraV()->get_global_col_size()/2;
         int nb = DM2D[0]->get_paraV()->get_block_size()/2;
@@ -267,42 +271,20 @@ void Gint::transfer_DM2DtoGrid(std::vector<hamilt::HContainer<double>*> DM2D) {
         this-> DM2D_tmp = new hamilt::HContainer<double>(pv, nullptr, &ijr_info);
         ModuleBase::Memory::record("Gint::DM2D_tmp", this->DM2D_tmp->get_memory_size());
         for (int is = 0; is < 4; is++){
+            this->DM2D_tmp->set_zero();
             for (int iap = 0; iap < DM2D[0]->size_atom_pairs(); ++iap) {
                 auto& ap = DM2D[0]->get_atom_pair(iap);
                 int iat1 = ap.get_atom_i();
                 int iat2 = ap.get_atom_j();
                 for (int ir = 0; ir < ap.get_R_size(); ++ir) {
                     const ModuleBase::Vector3<int> r_index = ap.get_R_index(ir);
-                    double* tmp_pointer = this -> DM2D_tmp -> find_matrix(iat1, iat2, r_index)->get_pointer();
-                    double* data_full = ap.get_pointer(ir);
-                    for (int irow = 0; irow < ap.get_row_size(); irow += 2) {
-                        switch (is) {//todo: It can be written more compactly
-                            case 0:
-                                for (int icol = 0; icol < ap.get_col_size(); icol += 2) {
-                                    *(tmp_pointer)++ = data_full[icol];
-                                }
-                                data_full += ap.get_col_size() * 2;
-                                break;
-                            case 1:
-                                for (int icol = 0; icol < ap.get_col_size(); icol += 2) {
-                                    *(tmp_pointer)++ = data_full[icol + 1];
-                                }
-                                data_full += ap.get_col_size() * 2;
-                                break;
-                            case 2:
-                                data_full += ap.get_col_size();
-                                for (int icol = 0; icol < ap.get_col_size(); icol += 2) {
-                                    *(tmp_pointer)++ = data_full[icol];
-                                }
-                                data_full += ap.get_col_size();
-                                break;
-                            case 3:
-                                data_full += ap.get_col_size();
-                                for (int icol = 0; icol < ap.get_col_size(); icol += 2) {
-                                    *(tmp_pointer)++ = data_full[icol + 1];
-                                }
-                                data_full += ap.get_col_size();
-                                break;           
+                    double* matrix_out = this -> DM2D_tmp -> find_matrix(iat1, iat2, r_index)->get_pointer();
+                    double* matrix_in = ap.get_pointer(ir);
+                    for (int irow = 0; irow < ap.get_row_size()/2; irow ++) {
+                        for (int icol = 0; icol < ap.get_col_size()/2; icol++){
+                            int index_i = irow* ap.get_col_size()/2 + icol;
+                            int index_j = (irow*2+row_set[is]) * ap.get_col_size() + icol*2+col_set[is];
+                            matrix_out[index_i] = matrix_in[index_j];
                         }
                     }
                 }
diff --git a/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp b/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp
index f0a3bc6518..9e80ea22c1 100644
--- a/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp
+++ b/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp
@@ -81,6 +81,7 @@ void Gint_k::transfer_pvpR(hamilt::HContainer<std::complex<double>>* hR,
     int mg = hR->get_paraV()->get_global_row_size()/2;
     int ng = hR->get_paraV()->get_global_col_size()/2;
     int nb = hR->get_paraV()->get_block_size()/2;
+    hR->set_zero();
 #ifdef __MPI
     int blacs_ctxt = hR->get_paraV()->blacs_ctxt;
     int *iat2iwt = new int[ucell_in->nat];
@@ -98,8 +99,11 @@ void Gint_k::transfer_pvpR(hamilt::HContainer<std::complex<double>>* hR,
     //0,3;1,2;1,2;0,3
     std::vector<int> first = {0, 1, 1, 0};
     std::vector<int> second= {3, 2, 2, 3};
+    std::vector<int> row_set = {0, 0, 1, 1};
+    std::vector<int> col_set = {0, 1, 0, 1};
+    std::vector<int> clx_i = {1, 0, 0, -1};
+    std::vector<int> clx_j = {0, 1, -1, 0};
     for (int is = 0; is < 4; is++){
-        this->hR_tmp->set_zero();
         hamilt::HContainer<std::complex<double>>* hRGint_tmpCd = new hamilt::HContainer<std::complex<double>>(this->ucell->nat);
         hRGint_tmpCd->insert_ijrs(this->gridt->get_ijr_info(), *(this->ucell));
         hRGint_tmpCd->allocate(nullptr, true);
@@ -128,23 +132,8 @@ void Gint_k::transfer_pvpR(hamilt::HContainer<std::complex<double>>* hR,
                     {
                         for (int icol = 0; icol < mat_nspin1->get_col_size(); ++icol)
                         {
-                            switch (is)
-                            {
-                            case 0:
-                                upper_mat->get_value(irow, icol) = mat_nspin1->get_value(irow, icol) + mat_nspin2->get_value(irow, icol);
-                                break;
-                            case 1:
-                                upper_mat->get_value(irow, icol) = mat_nspin1->get_value(irow, icol) + 
-                                                                        std::complex<double>(0.0, 1.0) * mat_nspin2->get_value(irow, icol);
-                                break;
-                            case 2:
-                                upper_mat->get_value(irow, icol) = mat_nspin1->get_value(irow, icol) - 
-                                                                        std::complex<double>(0.0, 1.0) * mat_nspin2->get_value(irow, icol);
-                                break;
-                            case 3:
-                                upper_mat->get_value(irow, icol) = mat_nspin1->get_value(irow, icol) - mat_nspin2->get_value(irow, icol);
-                                break;
-                            }
+                            upper_mat->get_value(irow, icol) = mat_nspin1->get_value(irow, icol) 
+                            + std::complex<double>(clx_i[is], clx_j[is]) * mat_nspin2->get_value(irow, icol);
                         }
                     }
                     //fill the lower triangle matrix
@@ -166,7 +155,7 @@ void Gint_k::transfer_pvpR(hamilt::HContainer<std::complex<double>>* hR,
         }
 
         //std::cout<<"success"<<std::endl;
-
+        this->hR_tmp->set_zero();
         hamilt::transferSerials2Parallels( *hRGint_tmpCd, this->hR_tmp);
         for (int iap = 0; iap < hR->size_atom_pairs(); iap++)
         {
@@ -186,48 +175,19 @@ void Gint_k::transfer_pvpR(hamilt::HContainer<std::complex<double>>* hR,
                 {
                     for (int icol = 0; icol < mat_nspin->get_col_size(); ++icol)
                     {
-                        switch (is)
-                        {
-                        case 0:
-                            upper_mat->get_value(2*irow, 2*icol) += mat_nspin->get_value(irow, icol);
-                            break;
-                        case 1:
-                            upper_mat->get_value(2*irow, 2*icol+1) += mat_nspin->get_value(irow, icol);
-                            break;
-                        case 2:
-                            upper_mat->get_value(2*irow+1, 2*icol) += mat_nspin->get_value(irow, icol);
-                            break;
-                        case 3:
-                            upper_mat->get_value(2*irow+1, 2*icol+1) += mat_nspin->get_value(irow, icol);
-                            break;
-                        }
+                        upper_mat->get_value(2*irow+row_set[is], 2*icol+col_set[is]) = 
+                        mat_nspin->get_value(irow, icol);
                     }
                 }
             }
         }
+        delete hRGint_tmpCd;
     }
     delete[] iat2iwt;
 #else
 
 #endif
 
-    // ===================================
-    // transfer HR from Gint to Veff<OperatorLCAO<std::complex<double>, std::complex<double>>>
-    // ===================================
-// #ifdef __MPI
-//     int size;
-//     MPI_Comm_size(MPI_COMM_WORLD, &size);
-//     if (size == 1)
-//     {
-//         hR->add(*this->hRGintCd);
-//     }
-//     else
-//     {
-//         hamilt::transferSerials2Parallels<std::complex<double>>(*this->hRGintCd, hR);
-//     }
-// #else
-    //  hR->add(*this->hRGintCd);
-// #endif
     ModuleBase::timer::tick("Gint_k", "transfer_pvpR");
     return;
 }

From 67d93e789eb4626b30aa06f5b1bd6b97289ca56a Mon Sep 17 00:00:00 2001
From: hn <3022939753@qq.com>
Date: Tue, 16 Sep 2025 11:40:57 +0800
Subject: [PATCH 6/9] improve the serial code

---
 source/module_hamilt_lcao/module_gint/gint.cpp   | 12 ++++++++----
 .../module_gint/gint_k_pvpr.cpp                  | 16 +++++++++-------
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/source/module_hamilt_lcao/module_gint/gint.cpp b/source/module_hamilt_lcao/module_gint/gint.cpp
index 490228a2f1..4279ab5c30 100644
--- a/source/module_hamilt_lcao/module_gint/gint.cpp
+++ b/source/module_hamilt_lcao/module_gint/gint.cpp
@@ -247,15 +247,15 @@ void Gint::transfer_DM2DtoGrid(std::vector<hamilt::HContainer<double>*> DM2D) {
         }
     } else // NSPIN=4 case
     {
-#ifdef __MPI
+
         // is=0:↑↑, 1:↑↓, 2:↓↑, 3:↓↓
         const int row_set[4] = {0, 0, 1, 1};
         const int col_set[4] = {0, 1, 0, 1};
         int mg = DM2D[0]->get_paraV()->get_global_row_size()/2;
         int ng = DM2D[0]->get_paraV()->get_global_col_size()/2;
         int nb = DM2D[0]->get_paraV()->get_block_size()/2;
+#ifdef __MPI
         int blacs_ctxt = DM2D[0]->get_paraV()->blacs_ctxt;
-
         std::vector<int> iat2iwt(ucell->nat);
         for (int iat = 0; iat < ucell->nat; iat++) {
             iat2iwt[iat] = ucell->get_iat2iwt()[iat]/2;
@@ -266,6 +266,7 @@ void Gint::transfer_DM2DtoGrid(std::vector<hamilt::HContainer<double>*> DM2D) {
         auto ijr_info = DM2D[0]->get_ijr_info();
         this-> DM2D_tmp = new hamilt::HContainer<double>(pv, nullptr, &ijr_info);
         this-> DM2D_tmp->set_zero();
+#endif
         ModuleBase::Memory::record("Gint::DM2D_tmp", this->DM2D_tmp->get_memory_size());
         for (int is = 0; is < 4; is++){
             for (int iap = 0; iap < DM2D[0]->size_atom_pairs(); ++iap) {
@@ -285,11 +286,14 @@ void Gint::transfer_DM2DtoGrid(std::vector<hamilt::HContainer<double>*> DM2D) {
                     }
                 }
             }
+#ifdef __MPI
             hamilt::transferParallels2Serials( *(this->DM2D_tmp), this->DMRGint[is]);
-        }
 #else
-        //this->DMRGint_full = DM2D[0];
+            this->DMRGint[is]->set_zero();
+            this->DMRGint[is]->add(*(this->DM2D_tmp));
 #endif
+        }
+
     }
     ModuleBase::timer::tick("Gint", "transfer_DMR");
 }
\ No newline at end of file
diff --git a/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp b/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp
index 9a70c8e3f1..c93effbf25 100644
--- a/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp
+++ b/source/module_hamilt_lcao/module_gint/gint_k_pvpr.cpp
@@ -78,10 +78,12 @@ void Gint_k::transfer_pvpR(hamilt::HContainer<std::complex<double>>* hR,
     ModuleBase::TITLE("Gint_k", "transfer_pvpR");
     ModuleBase::timer::tick("Gint_k", "transfer_pvpR");
 
+    auto ijr_info = hR->get_ijr_info();
+
+#ifdef __MPI
     int mg = hR->get_paraV()->get_global_row_size()/2;
     int ng = hR->get_paraV()->get_global_col_size()/2;
     int nb = hR->get_paraV()->get_block_size()/2;
-#ifdef __MPI
     int blacs_ctxt = hR->get_paraV()->blacs_ctxt;
     std::vector<int> iat2iwt(ucell_in->nat);
     for (int iat = 0; iat < ucell_in->nat; iat++) {
@@ -90,9 +92,9 @@ void Gint_k::transfer_pvpR(hamilt::HContainer<std::complex<double>>* hR,
     Parallel_Orbitals *pv = new Parallel_Orbitals();
     pv->set(mg, ng, nb, blacs_ctxt);
     pv->set_atomic_trace(iat2iwt.data(), ucell_in->nat, mg);
-    auto ijr_info = hR->get_ijr_info();
-
     this->hR_tmp = new hamilt::HContainer<std::complex<double>>(pv, nullptr, &ijr_info);
+#endif
+    
     ModuleBase::Memory::record("Gint::hRGintCd", this->hR_tmp->get_memory_size());
 
     //select hRGint_tmp
@@ -155,8 +157,12 @@ void Gint_k::transfer_pvpR(hamilt::HContainer<std::complex<double>>* hR,
                 }
             }
         }
+#ifdef __MPI
         // transfer hRGint_tmpCd to parallel hR_tmp
         hamilt::transferSerials2Parallels( *hRGint_tmpCd, this->hR_tmp);
+#else
+        this->hR_tmp = hRGint_tmpCd;
+#endif
         // merge hR_tmp to hR
         for (int iap = 0; iap < hR->size_atom_pairs(); iap++)
         {
@@ -182,10 +188,6 @@ void Gint_k::transfer_pvpR(hamilt::HContainer<std::complex<double>>* hR,
         }
         delete hRGint_tmpCd;
     }
-    delete[] iat2iwt;
-#else
-
-#endif
     ModuleBase::timer::tick("Gint_k", "transfer_pvpR");
     return;
 }

From e04dce324b3e24229c8747bc53aadb69ab0a0b46 Mon Sep 17 00:00:00 2001
From: hn <3022939753@qq.com>
Date: Tue, 16 Sep 2025 12:14:04 +0800
Subject: [PATCH 7/9] fix bug

---
 source/module_hamilt_lcao/module_gint/gint.cpp | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/source/module_hamilt_lcao/module_gint/gint.cpp b/source/module_hamilt_lcao/module_gint/gint.cpp
index 4279ab5c30..e59839dbf6 100644
--- a/source/module_hamilt_lcao/module_gint/gint.cpp
+++ b/source/module_hamilt_lcao/module_gint/gint.cpp
@@ -254,6 +254,7 @@ void Gint::transfer_DM2DtoGrid(std::vector<hamilt::HContainer<double>*> DM2D) {
         int mg = DM2D[0]->get_paraV()->get_global_row_size()/2;
         int ng = DM2D[0]->get_paraV()->get_global_col_size()/2;
         int nb = DM2D[0]->get_paraV()->get_block_size()/2;
+        auto ijr_info = DM2D[0]->get_ijr_info();
 #ifdef __MPI
         int blacs_ctxt = DM2D[0]->get_paraV()->blacs_ctxt;
         std::vector<int> iat2iwt(ucell->nat);
@@ -263,9 +264,15 @@ void Gint::transfer_DM2DtoGrid(std::vector<hamilt::HContainer<double>*> DM2D) {
         Parallel_Orbitals *pv = new Parallel_Orbitals();
         pv->set(mg, ng, nb, blacs_ctxt);
         pv->set_atomic_trace(iat2iwt.data(), ucell->nat, mg);
-        auto ijr_info = DM2D[0]->get_ijr_info();
         this-> DM2D_tmp = new hamilt::HContainer<double>(pv, nullptr, &ijr_info);
         this-> DM2D_tmp->set_zero();
+#else
+        if (this->DM2D_tmp != nullptr) {
+            delete this->DM2D_tmp;
+        }
+        this-> DM2D_tmp = new hamilt::HContainer<double>(*this->hRGint);
+        this-> DM2D_tmp -> insert_ijrs(this->gridt->get_ijr_info(), *(this->ucell));
+        this-> DM2D_tmp -> allocate(nullptr, true);
 #endif
         ModuleBase::Memory::record("Gint::DM2D_tmp", this->DM2D_tmp->get_memory_size());
         for (int is = 0; is < 4; is++){
@@ -275,7 +282,7 @@ void Gint::transfer_DM2DtoGrid(std::vector<hamilt::HContainer<double>*> DM2D) {
                 int iat2 = ap.get_atom_j();
                 for (int ir = 0; ir < ap.get_R_size(); ++ir) {
                     const ModuleBase::Vector3<int> r_index = ap.get_R_index(ir);
-                    double* matrix_out = DM2D_tmp -> find_matrix(iat1, iat2, r_index)->get_pointer();
+                    double* matrix_out = this-> DM2D_tmp -> find_matrix(iat1, iat2, r_index)->get_pointer();
                     double* matrix_in = ap.get_pointer(ir);
                     for (int irow = 0; irow < ap.get_row_size()/2; irow ++) {
                         for (int icol = 0; icol < ap.get_col_size()/2; icol ++) {

From 4350e3d5682b81fafedfd2922cd8e4f221cd3990 Mon Sep 17 00:00:00 2001
From: dzzz2001 <153698752+dzzz2001@users.noreply.github.com>
Date: Sat, 19 Jul 2025 15:17:42 +0800
Subject: [PATCH 8/9] Perf: support GPU version of cal_force_cc with LCAO basis
 (#6392)

* support GPU version of cal_force_cc with LCAO basis

* fix a bug
---
 .../hamilt_lcaodft/FORCE_STRESS.cpp           | 51 ++++++++++++-------
 .../hamilt_lcaodft/FORCE_STRESS.h             |  1 -
 2 files changed, 33 insertions(+), 19 deletions(-)

diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
index 532bf7e561..3454a65963 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
@@ -22,7 +22,7 @@
 #include "module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.h"
 
 template <typename T>
-Force_Stress_LCAO<T>::Force_Stress_LCAO(Record_adj& ra, const int nat_in) : RA(&ra), f_pw(nat_in), nat(nat_in)
+Force_Stress_LCAO<T>::Force_Stress_LCAO(Record_adj& ra, const int nat_in) : RA(&ra), nat(nat_in)
 {
 }
 template <typename T>
@@ -861,24 +861,39 @@ void Force_Stress_LCAO<T>::calForcePwPart(UnitCell& ucell,
                                           const Structure_Factor& sf)
 {
     ModuleBase::TITLE("Force_Stress_LCAO", "calForcePwPart");
-    //--------------------------------------------------------
-    // local pseudopotential force:
-    // use charge density; plane wave; local pseudopotential;
-    //--------------------------------------------------------
-    f_pw.cal_force_loc(ucell, fvl_dvl, rhopw, locpp.vloc, chr);
-    //--------------------------------------------------------
-    // ewald force: use plane wave only.
-    //--------------------------------------------------------
-    f_pw.cal_force_ew(ucell, fewalds, rhopw, &sf); // remain problem
+#ifdef __CUDA
+    if(PARAM.inp.device == "gpu")
+    {
+        Forces<double, base_device::DEVICE_GPU> f_pw(nat);
+
+        //--------------------------------------------------------
+        // local pseudopotential force:
+        // use charge density; plane wave; local pseudopotential;
+        //--------------------------------------------------------
+        f_pw.cal_force_loc(ucell, fvl_dvl, rhopw, locpp.vloc, chr);
+        //--------------------------------------------------------
+        // ewald force: use plane wave only.
+        //--------------------------------------------------------
+        f_pw.cal_force_ew(ucell, fewalds, rhopw, &sf); // remain problem
+
+        //--------------------------------------------------------
+        // force due to core correlation.
+        //--------------------------------------------------------
+        f_pw.cal_force_cc(fcc, rhopw, chr, locpp.numeric, ucell);
+        //--------------------------------------------------------
+        // force due to self-consistent charge.
+        //--------------------------------------------------------
+        f_pw.cal_force_scc(fscc, rhopw, vnew, vnew_exist, locpp.numeric, ucell);
+    } else
+#endif
+    {
+        Forces<double, base_device::DEVICE_CPU> f_pw(nat);
+        f_pw.cal_force_loc(ucell, fvl_dvl, rhopw, locpp.vloc, chr);
+        f_pw.cal_force_ew(ucell, fewalds, rhopw, &sf); // remain problem
+        f_pw.cal_force_cc(fcc, rhopw, chr, locpp.numeric, ucell);
+        f_pw.cal_force_scc(fscc, rhopw, vnew, vnew_exist, locpp.numeric, ucell);
+    }
 
-    //--------------------------------------------------------
-    // force due to core correlation.
-    //--------------------------------------------------------
-    f_pw.cal_force_cc(fcc, rhopw, chr, locpp.numeric, ucell);
-    //--------------------------------------------------------
-    // force due to self-consistent charge.
-    //--------------------------------------------------------
-    f_pw.cal_force_scc(fscc, rhopw, vnew, vnew_exist, locpp.numeric, ucell);
     return;
 }
 
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.h b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.h
index a11add920e..b55caa7935 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.h
@@ -60,7 +60,6 @@ class Force_Stress_LCAO
     Record_adj* RA;
     Force_LCAO<T> flk;
     Stress_Func<double> sc_pw;
-    Forces<double> f_pw;
 
     void forceSymmetry(const UnitCell& ucell, 
                        ModuleBase::matrix& fcs, 

From 639f19d9095d4a1bcd87fa019418c61fa71f7bd3 Mon Sep 17 00:00:00 2001
From: hn <3022939753@qq.com>
Date: Fri, 10 Oct 2025 21:11:28 +0800
Subject: [PATCH 9/9] Apply the changes related to force calculation from the
 develop branch, fixing the memory error bug that occurs during direct
 cherry-pick.

---
 .../module_hamilt_pw/hamilt_pwdft/forces.cpp  | 689 ++++++------------
 source/module_hamilt_pw/hamilt_pwdft/forces.h |  16 +-
 .../hamilt_pwdft/forces_cc.cpp                |   4 +-
 .../hamilt_pwdft/forces_scc.cpp               |   4 +-
 .../hamilt_pwdft/forces_us.cpp                |   2 +-
 .../hamilt_pwdft/kernels/cuda/force_op.cu     | 427 ++++++-----
 .../hamilt_pwdft/kernels/force_op.cpp         | 108 +--
 .../hamilt_pwdft/kernels/force_op.h           | 141 ++--
 8 files changed, 534 insertions(+), 857 deletions(-)

diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
index a4d779366b..ecdf746019 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
@@ -20,15 +20,13 @@
 #ifdef _OPENMP
 #include <omp.h>
 #endif
-#ifdef USE_PAW
-#include "module_cell/module_paw/paw_cell.h"
-#endif
+
 
 template <typename FPTYPE, typename Device>
 void Forces<FPTYPE, Device>::cal_force(UnitCell& ucell,
                                        ModuleBase::matrix& force,
                                        const elecstate::ElecState& elec,
-                                       ModulePW::PW_Basis* rho_basis,
+                                       const ModulePW::PW_Basis* const rho_basis,
                                        ModuleSymmetry::Symmetry* p_symm,
                                        Structure_Factor* p_sf,
                                        surchem& solvent,
@@ -55,15 +53,7 @@ void Forces<FPTYPE, Device>::cal_force(UnitCell& ucell,
     ModuleBase::matrix forceonsite(nat, 3);
 
     // Force due to local ionic potential
-    // For PAW, calculated together in paw_cell.calculate_force
-    if (!PARAM.inp.use_paw)
-    {
-        this->cal_force_loc(ucell,forcelc, rho_basis, locpp->vloc, chr);
-    }
-    else
-    {
-        forcelc.zero_out();
-    }
+    this->cal_force_loc(ucell,forcelc, rho_basis, locpp->vloc, chr);
 
     // Ewald
     this->cal_force_ew(ucell,forceion, rho_basis, p_sf);
@@ -71,94 +61,15 @@ void Forces<FPTYPE, Device>::cal_force(UnitCell& ucell,
     // Force due to nonlocal part of pseudopotential
     if (wfc_basis != nullptr)
     {
-        if (!PARAM.inp.use_paw)
-        {
-            this->npwx = wfc_basis->npwk_max;
-            Forces::cal_force_nl(forcenl, wg, ekb, pkv, wfc_basis, p_sf, *p_nlpp, ucell, psi_in);
 
-            if (PARAM.globalv.use_uspp)
-            {
-                this->cal_force_us(forcenl, rho_basis, *p_nlpp, elec, ucell);
-            }
-        }
-        else
-        {
-#ifdef USE_PAW
-            for (int ik = 0; ik < wfc_basis->nks; ik++)
-            {
-                const int npw = wfc_basis->npwk[ik];
-                ModuleBase::Vector3<double>* _gk = new ModuleBase::Vector3<double>[npw];
-                for (int ig = 0; ig < npw; ig++)
-                {
-                    _gk[ig] = wfc_basis->getgpluskcar(ik, ig);
-                }
+        this->npwx = wfc_basis->npwk_max;
+        Forces::cal_force_nl(forcenl, wg, ekb, pkv, wfc_basis, p_sf, *p_nlpp, ucell, psi_in);
 
-                double* kpt;
-                kpt = new double[3];
-                kpt[0] = wfc_basis->kvec_c[ik].x;
-                kpt[1] = wfc_basis->kvec_c[ik].y;
-                kpt[2] = wfc_basis->kvec_c[ik].z;
-
-                double** kpg;
-                double** gcar;
-                kpg = new double*[npw];
-                gcar = new double*[npw];
-                for (int ipw = 0; ipw < npw; ipw++)
-                {
-                    kpg[ipw] = new double[3];
-                    kpg[ipw][0] = _gk[ipw].x;
-                    kpg[ipw][1] = _gk[ipw].y;
-                    kpg[ipw][2] = _gk[ipw].z;
-
-                    gcar[ipw] = new double[3];
-                    gcar[ipw][0] = wfc_basis->getgcar(ik, ipw).x;
-                    gcar[ipw][1] = wfc_basis->getgcar(ik, ipw).y;
-                    gcar[ipw][2] = wfc_basis->getgcar(ik, ipw).z;
-                }
-
-                GlobalC::paw_cell.set_paw_k(npw,
-                                            wfc_basis->npwk_max,
-                                            kpt,
-                                            wfc_basis->get_ig2ix(ik).data(),
-                                            wfc_basis->get_ig2iy(ik).data(),
-                                            wfc_basis->get_ig2iz(ik).data(),
-                                            (const double**)kpg,
-                                            ucell.tpiba,
-                                            (const double**)gcar);
-
-                delete[] kpt;
-                for (int ipw = 0; ipw < npw; ipw++)
-                {
-                    delete[] kpg[ipw];
-                    delete[] gcar[ipw];
-                }
-                delete[] kpg;
-                delete[] gcar;
-
-                GlobalC::paw_cell.get_vkb();
-
-                GlobalC::paw_cell.set_currentk(ik);
-
-                psi_in[0].fix_k(ik);
-                double *weight, *epsilon;
-                weight = new double[PARAM.inp.nbands];
-                epsilon = new double[PARAM.inp.nbands];
-                for (int ib = 0; ib < PARAM.inp.nbands; ib++)
-                {
-                    weight[ib] = wg(ik, ib);
-                    epsilon[ib] = ekb(ik, ib);
-                }
-                GlobalC::paw_cell.paw_nl_force(reinterpret_cast<std::complex<double>*>(psi_in[0].get_pointer()),
-                                               epsilon,
-                                               weight,
-                                               PARAM.inp.nbands,
-                                               forcenl.c);
-
-                delete[] weight;
-                delete[] epsilon;
-            }
-#endif
+        if (PARAM.globalv.use_uspp)
+        {
+            this->cal_force_us(forcenl, rho_basis, *p_nlpp, elec, ucell);
         }
+
         // DFT+U and DeltaSpin
         if(PARAM.inp.dft_plus_u || PARAM.inp.sc_mag_switch)
         {
@@ -167,26 +78,10 @@ void Forces<FPTYPE, Device>::cal_force(UnitCell& ucell,
     }
 
     // non-linear core correction
-    // not relevant for PAW
-    if (!PARAM.inp.use_paw)
-    {
-        Forces::cal_force_cc(forcecc, rho_basis, chr, locpp->numeric, ucell);
-    }
-    else
-    {
-        forcecc.zero_out();
-    }
+    Forces::cal_force_cc(forcecc, rho_basis, chr, locpp->numeric, ucell);
 
     // force due to core charge
-    // For PAW, calculated together in paw_cell.calculate_force
-    if (!PARAM.inp.use_paw)
-    {
-        this->cal_force_scc(forcescc, rho_basis, elec.vnew, elec.vnew_exist, locpp->numeric, ucell);
-    }
-    else
-    {
-        forcescc.zero_out();
-    }
+    this->cal_force_scc(forcescc, rho_basis, elec.vnew, elec.vnew_exist, locpp->numeric, ucell);
 
     ModuleBase::matrix stress_vdw_pw; //.create(3,3);
     ModuleBase::matrix force_vdw;
@@ -240,52 +135,6 @@ void Forces<FPTYPE, Device>::cal_force(UnitCell& ucell,
         }
     }
 
-#ifdef USE_PAW
-    if (PARAM.inp.use_paw)
-    {
-        double* force_paw;
-        double* rhor;
-        rhor = new double[rho_basis->nrxx];
-        for (int ir = 0; ir < rho_basis->nrxx; ir++)
-        {
-            rhor[ir] = 0.0;
-        }
-        for (int is = 0; is < PARAM.inp.nspin; is++)
-        {
-            for (int ir = 0; ir < rho_basis->nrxx; ir++)
-            {
-                rhor[ir] += chr->rho[is][ir] + chr->nhat[is][ir];
-            }
-        }
-
-        force_paw = new double[3 * this->nat];
-        ModuleBase::matrix v_xc, v_effective;
-        v_effective.create(PARAM.inp.nspin, rho_basis->nrxx);
-        v_effective.zero_out();
-        elec.pot->update_from_charge(elec.charge, &ucell);
-        v_effective = elec.pot->get_effective_v();
-
-        v_xc.create(PARAM.inp.nspin, rho_basis->nrxx);
-        v_xc.zero_out();
-        const std::tuple<double, double, ModuleBase::matrix> etxc_vtxc_v
-            = XC_Functional::v_xc(rho_basis->nrxx, elec.charge, &ucell);
-        v_xc = std::get<2>(etxc_vtxc_v);
-
-        GlobalC::paw_cell.calculate_force(v_effective.c, v_xc.c, rhor, force_paw);
-
-        for (int iat = 0; iat < this->nat; iat++)
-        {
-            // Ha to Ry
-            forcepaw(iat, 0) = force_paw[3 * iat] * 2.0;
-            forcepaw(iat, 1) = force_paw[3 * iat + 1] * 2.0;
-            forcepaw(iat, 2) = force_paw[3 * iat + 2] * 2.0;
-        }
-
-        delete[] force_paw;
-        delete[] rhor;
-    }
-#endif
-
     // impose total force = 0
     int iat = 0;
     for (int ipol = 0; ipol < 3; ipol++)
@@ -300,11 +149,6 @@ void Forces<FPTYPE, Device>::cal_force(UnitCell& ucell,
                 force(iat, ipol) = forcelc(iat, ipol) + forceion(iat, ipol) + forcenl(iat, ipol) + forcecc(iat, ipol)
                                    + forcescc(iat, ipol);
 
-                if (PARAM.inp.use_paw)
-                {
-                    force(iat, ipol) += forcepaw(iat, ipol);
-                }
-
                 if (vdw_solver != nullptr) // linpz and jiyy added vdw force, modified by zhengdy
                 {
                     force(iat, ipol) += force_vdw(iat, ipol);
@@ -442,14 +286,7 @@ void Forces<FPTYPE, Device>::cal_force(UnitCell& ucell,
         ModuleIO::print_force(GlobalV::ofs_running, ucell, "NLCC     FORCE (eV/Angstrom)", forcecc, false);
         ModuleIO::print_force(GlobalV::ofs_running, ucell, "ION      FORCE (eV/Angstrom)", forceion, false);
         ModuleIO::print_force(GlobalV::ofs_running, ucell, "SCC      FORCE (eV/Angstrom)", forcescc, false);
-        if (PARAM.inp.use_paw)
-        {
-            ModuleIO::print_force(GlobalV::ofs_running,
-                                  ucell,
-                                  "PAW      FORCE (eV/Angstrom)",
-                                  forcepaw,
-                                  false);
-        }
+
         if (PARAM.inp.efield_flag)
         {
             ModuleIO::print_force(GlobalV::ofs_running, ucell, "EFIELD   FORCE (eV/Angstrom)", force_e, false);
@@ -487,13 +324,13 @@ void Forces<FPTYPE, Device>::cal_force(UnitCell& ucell,
 template <typename FPTYPE, typename Device>
 void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
                                            ModuleBase::matrix& forcelc,
-                                           ModulePW::PW_Basis* rho_basis,
+                                           const ModulePW::PW_Basis* const rho_basis,
                                            const ModuleBase::matrix& vloc,
                                            const Charge* const chr)
 {
     ModuleBase::TITLE("Forces", "cal_force_loc");
     ModuleBase::timer::tick("Forces", "cal_force_loc");
-
+    this->device = base_device::get_device_type<Device>(this->ctx);
     std::complex<double>* aux = new std::complex<double>[rho_basis->nmaxgr];
     // now, in all pools , the charge are the same,
     // so, the force calculated by each pool is equal.
@@ -532,110 +369,100 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
     // to G space. maybe need fftw with OpenMP
     rho_basis->real2recip(aux, aux);
 
-    // sincos op for G space
-    
-    
-    // data preparation
-    std::vector<FPTYPE> tau_flat(this->nat * 3);
-    std::vector<FPTYPE> gcar_flat(rho_basis->npw * 3);
-    
-    
-    for (int iat = 0; iat < this->nat; iat++) {
-        int it = ucell.iat2it[iat];  
-        int ia = ucell.iat2ia[iat];  
-        
-        tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][0]);
-        tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][1]);
-        tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][2]);
-    }
-    
-    for (int ig = 0; ig < rho_basis->npw; ig++) {
-        gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
-        gcar_flat[ig * 3 + 1] = static_cast<FPTYPE>(rho_basis->gcar[ig][1]);
-        gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
-    }
-    
-    // calculate vloc_factors for all atom types
-    std::vector<FPTYPE> vloc_per_type_host(this->nat * rho_basis->npw);
-    for (int iat = 0; iat < this->nat; iat++) {
-        int it = ucell.iat2it[iat];
-        for (int ig = 0; ig < rho_basis->npw; ig++) {
-            vloc_per_type_host[iat * rho_basis->npw + ig] = static_cast<FPTYPE>(vloc(it, rho_basis->ig2igg[ig]));
-        }
-    }
-    
-    std::vector<std::complex<FPTYPE>> aux_fptype(rho_basis->npw);
-    for (int ig = 0; ig < rho_basis->npw; ig++) {
-        aux_fptype[ig] = static_cast<std::complex<FPTYPE>>(aux[ig]);
-    }
-    
-    FPTYPE* d_gcar = gcar_flat.data();
-    FPTYPE* d_tau = tau_flat.data();
-    FPTYPE* d_vloc_per_type = vloc_per_type_host.data();
-    std::complex<FPTYPE>* d_aux = aux_fptype.data();
-    FPTYPE* d_force = nullptr;
-    std::vector<FPTYPE> force_host(this->nat * 3);
-    
-    if (this->device == base_device::GpuDevice)
-    {
-        d_gcar = nullptr;
-        d_tau = nullptr;
-        d_vloc_per_type = nullptr;
-        d_aux = nullptr;
-        
-        resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3);
-        resmem_var_op()(this->ctx, d_tau, this->nat * 3);
-        resmem_var_op()(this->ctx, d_vloc_per_type, this->nat * rho_basis->npw);
-        resmem_complex_op()(this->ctx, d_aux, rho_basis->npw);
-        resmem_var_op()(this->ctx, d_force, this->nat * 3);
-        
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vloc_per_type, vloc_per_type_host.data(), this->nat * rho_basis->npw);
-        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw);
-        
-        base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_force, 0.0, this->nat * 3);
-    }
-    else
+    if(this->device == base_device::GpuDevice)
     {
-        d_force = force_host.data();
-        std::fill(force_host.begin(), force_host.end(), static_cast<FPTYPE>(0.0));
-    }
-    
-    const FPTYPE scale_factor = static_cast<FPTYPE>(ucell.tpiba * ucell.omega);
-    
-    // call op for sincos calculation
-    hamilt::cal_force_loc_sincos_op<FPTYPE, Device>()(
-        this->ctx,
-        this->nat,
-        rho_basis->npw,
-        this->nat,
-        d_gcar,
-        d_tau,
-        d_vloc_per_type,
-        d_aux,
-        scale_factor,
-        d_force
-    );
-    
-    if (this->device == base_device::GpuDevice)
-    {
-        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_host.data(), d_force, this->nat * 3);
-        
-        delmem_var_op()(this->ctx, d_gcar);
-        delmem_var_op()(this->ctx, d_tau);
-        delmem_var_op()(this->ctx, d_vloc_per_type);
-        delmem_complex_op()(this->ctx, d_aux);
-        delmem_var_op()(this->ctx, d_force);
-    }
-    
-    for (int iat = 0; iat < this->nat; iat++) {
-        forcelc(iat, 0) = static_cast<double>(force_host[iat * 3 + 0]);
-        forcelc(iat, 1) = static_cast<double>(force_host[iat * 3 + 1]);
-        forcelc(iat, 2) = static_cast<double>(force_host[iat * 3 + 2]);
-    }
+        std::vector<double> tau_h;
+        std::vector<double> gcar_h;
+        tau_h.resize(this->nat * 3);
+        for(int iat = 0; iat < this->nat; ++iat)
+        {
+            int it = ucell.iat2it[iat];
+            int ia = ucell.iat2ia[iat];
+            tau_h[iat * 3] = ucell.atoms[it].tau[ia].x;
+            tau_h[iat * 3 + 1] = ucell.atoms[it].tau[ia].y;
+            tau_h[iat * 3 + 2] = ucell.atoms[it].tau[ia].z;
+        }
 
-    // this->print(GlobalV: :ofs_running, "local forces", forcelc);
+        gcar_h.resize(rho_basis->npw * 3);
+        for(int ig = 0; ig < rho_basis->npw; ++ig)
+        {
+            gcar_h[ig * 3] = rho_basis->gcar[ig].x;
+            gcar_h[ig * 3 + 1] = rho_basis->gcar[ig].y;
+            gcar_h[ig * 3 + 2] = rho_basis->gcar[ig].z;
+        }
+
+        int* iat2it_d = nullptr;
+        int* ig2gg_d = nullptr;
+        double* gcar_d = nullptr;
+        double* tau_d = nullptr;
+        std::complex<double>* aux_d = nullptr;
+        double* forcelc_d  = nullptr;
+        double* vloc_d = nullptr;
+
+        resmem_int_op()(this->ctx,iat2it_d, this->nat);
+        resmem_int_op()(this->ctx,ig2gg_d, rho_basis->npw);
+        resmem_var_op()(this->ctx,gcar_d, rho_basis->npw * 3);
+        resmem_var_op()(this->ctx,tau_d, this->nat * 3);
+        resmem_complex_op()(this->ctx,aux_d, rho_basis->npw);
+        resmem_var_op()(this->ctx,forcelc_d, this->nat * 3);
+        resmem_var_op()(this->ctx,vloc_d, vloc.nr * vloc.nc);
+
+        syncmem_int_h2d_op()( this->ctx, this->cpu_ctx,iat2it_d, ucell.iat2it, this->nat);
+        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, ig2gg_d, rho_basis->ig2igg, rho_basis->npw);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gcar_d, gcar_h.data(), rho_basis->npw * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, tau_d, tau_h.data(), this->nat * 3);
+        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, aux_d, aux, rho_basis->npw);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, forcelc_d, forcelc.c, this->nat * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, vloc_d, vloc.c, vloc.nr * vloc.nc);
+
+        hamilt::cal_force_loc_op<FPTYPE, Device>()(
+            this->nat,
+            rho_basis->npw,
+            ucell.tpiba * ucell.omega,
+            iat2it_d,
+            ig2gg_d,
+            gcar_d,
+            tau_d,
+            aux_d,
+            vloc_d,
+            vloc.nc,
+            forcelc_d);
+        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, forcelc.c, forcelc_d, this->nat * 3);
+
+        delmem_int_op()(this->ctx,iat2it_d);
+        delmem_int_op()(this->ctx,ig2gg_d);
+        delmem_var_op()(this->ctx,gcar_d);
+        delmem_var_op()(this->ctx,tau_d);
+        delmem_complex_op()(this->ctx,aux_d);
+        delmem_var_op()(this->ctx,forcelc_d);
+        delmem_var_op()(this->ctx,vloc_d);
+    }
+    else{  // calculate forces on CPU
+        #ifdef _OPENMP
+        #pragma omp parallel for
+        #endif
+        for (int iat = 0; iat < this->nat; ++iat)
+        {
+            // read `it` `ia` from the table
+            int it = ucell.iat2it[iat];
+            int ia = ucell.iat2ia[iat];
+            for (int ig = 0; ig < rho_basis->npw; ig++)
+            {
+                const double phase = ModuleBase::TWO_PI * (rho_basis->gcar[ig] * ucell.atoms[it].tau[ia]);
+                double sinp, cosp;
+                ModuleBase::libm::sincos(phase, &sinp, &cosp);
+                const double factor
+                    = vloc(it, rho_basis->ig2igg[ig]) * (cosp * aux[ig].imag() + sinp * aux[ig].real());
+                forcelc(iat, 0) += rho_basis->gcar[ig][0] * factor;
+                forcelc(iat, 1) += rho_basis->gcar[ig][1] * factor;
+                forcelc(iat, 2) += rho_basis->gcar[ig][2] * factor;
+            }
+            forcelc(iat, 0) *= (ucell.tpiba * ucell.omega);
+            forcelc(iat, 1) *= (ucell.tpiba * ucell.omega);
+            forcelc(iat, 2) *= (ucell.tpiba * ucell.omega);
+        }
+    }
+    // this->print(GlobalV::ofs_running, "local forces", forcelc);
     Parallel_Reduce::reduce_pool(forcelc.c, forcelc.nr * forcelc.nc);
     delete[] aux;
     ModuleBase::timer::tick("Forces", "cal_force_loc");
@@ -645,14 +472,14 @@ void Forces<FPTYPE, Device>::cal_force_loc(const UnitCell& ucell,
 template <typename FPTYPE, typename Device>
 void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
                                           ModuleBase::matrix& forceion,
-                                          ModulePW::PW_Basis* rho_basis,
+                                          const ModulePW::PW_Basis* const rho_basis,
                                           const Structure_Factor* p_sf)
 {
     ModuleBase::TITLE("Forces", "cal_force_ew");
     ModuleBase::timer::tick("Forces", "cal_force_ew");
-
+    this->device = base_device::get_device_type<Device>(this->ctx);
     double fact = 2.0;
-    std::complex<double>* aux = new std::complex<double>[rho_basis->npw];
+    std::vector<std::complex<double>> aux(rho_basis->npw);
 
     /*
         blocking rho_basis->nrxnpwx for data locality.
@@ -662,9 +489,7 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         performance will be better when number of atom is quite huge
     */
     const int block_ig = 1024;
-#ifdef _OPENMP
 #pragma omp parallel for
-#endif
     for (int igb = 0; igb < rho_basis->npw; igb += block_ig)
     {
         // calculate the actual task length of this block
@@ -679,13 +504,6 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
             if (ucell.atoms[it].na != 0)
             {
                 double dzv;
-                if (PARAM.inp.use_paw)
-                {
-    #ifdef USE_PAW
-                    dzv = GlobalC::paw_cell.get_val(it);
-    #endif
-                }
-                else
                 {
                     dzv = ucell.atoms[it].ncpp.zv;
                 }
@@ -702,13 +520,6 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
     double charge = 0.0;
     for (int it = 0; it < ucell.ntype; it++)
     {
-        if (PARAM.inp.use_paw)
-        {
-#ifdef USE_PAW
-            charge += ucell.atoms[it].na * GlobalC::paw_cell.get_val(it);
-#endif
-        }
-        else
         {
             charge += ucell.atoms[it].na * ucell.atoms[it].ncpp.zv; // mohan modify 2007-11-7
         }
@@ -729,14 +540,16 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
         upperbound = 2.0 * charge * charge * sqrt(2.0 * alpha / ModuleBase::TWO_PI)
                      * erfc(sqrt(ucell.tpiba2 * rho_basis->ggecut / 4.0 / alpha));
     } while (upperbound > 1.0e-6);
-
-#ifdef _OPENMP
+    const int ig0 = rho_basis->ig_gge0;
 #pragma omp parallel for
-#endif
     for (int ig = 0; ig < rho_basis->npw; ig++)
     {
+        if (ig== ig0)
+        {
+            continue; // skip G=0
+        }
         aux[ig] *= ModuleBase::libm::exp(-1.0 * rho_basis->gg[ig] * ucell.tpiba2 / alpha / 4.0)
-                   / (rho_basis->gg[ig] * ucell.tpiba2);
+                / (rho_basis->gg[ig] * ucell.tpiba2);
     }
 
     // set pos rho_basis->ig_gge0 to zero
@@ -744,144 +557,99 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
     {
         aux[rho_basis->ig_gge0] = std::complex<double>(0.0, 0.0);
     }
-
-    // sincos op for cal_force_ew
-    
-    std::vector<FPTYPE> it_facts_host(this->nat);
-    std::vector<FPTYPE> tau_flat(this->nat * 3);
-    
-    // iterate over by lookup table
-    for (int iat = 0; iat < this->nat; iat++) {
-        int it = ucell.iat2it[iat];  
-        int ia = ucell.iat2ia[iat];  
-        
-        double zv;
-        if (PARAM.inp.use_paw)
+    if(this->device == base_device::GpuDevice)
+    {
+        std::vector<double> tau_h(this->nat * 3);
+        std::vector<double> gcar_h(rho_basis->npw * 3);
+        for(int iat = 0; iat < this->nat; ++iat)
         {
-#ifdef USE_PAW
-            zv = GlobalC::paw_cell.get_val(it);
-#endif
+            int it = ucell.iat2it[iat];
+            int ia = ucell.iat2ia[iat];
+            tau_h[iat * 3] = ucell.atoms[it].tau[ia].x;
+            tau_h[iat * 3 + 1] = ucell.atoms[it].tau[ia].y;
+            tau_h[iat * 3 + 2] = ucell.atoms[it].tau[ia].z;
         }
-        else
+        for(int ig = 0; ig < rho_basis->npw; ++ig)
         {
-            zv = ucell.atoms[it].ncpp.zv;
+            gcar_h[ig * 3] = rho_basis->gcar[ig].x;
+            gcar_h[ig * 3 + 1] = rho_basis->gcar[ig].y;
+            gcar_h[ig * 3 + 2] = rho_basis->gcar[ig].z;
         }
+        std::vector<double> it_fact_h(ucell.ntype);
+        for(int it = 0; it < ucell.ntype; ++it)
+        {
+            it_fact_h[it] = ucell.atoms[it].ncpp.zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact;
+        }
+
+        int* iat2it_d = nullptr;
+        double* gcar_d = nullptr;
+        double* tau_d = nullptr;
+        double* it_fact_d = nullptr;
+        std::complex<double>* aux_d = nullptr;
+        double* forceion_d  = nullptr;
+        resmem_int_op()(this->ctx, iat2it_d, this->nat);
+        resmem_var_op()(this->ctx, gcar_d, rho_basis->npw * 3);
+        resmem_var_op()(this->ctx, tau_d, this->nat * 3);
+        resmem_var_op()(this->ctx, it_fact_d, ucell.ntype);
+        resmem_complex_op()(this->ctx, aux_d, rho_basis->npw);
+        resmem_var_op()(this->ctx, forceion_d, this->nat * 3);
+
+        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, iat2it_d, ucell.iat2it, this->nat);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gcar_d, gcar_h.data(), rho_basis->npw * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, tau_d, tau_h.data(), this->nat * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, it_fact_d, it_fact_h.data(), ucell.ntype);
+        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, aux_d, aux.data(), rho_basis->npw);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, forceion_d, forceion.c, this->nat * 3);
+
+        hamilt::cal_force_ew_op<FPTYPE, Device>()(
+            this->nat,
+            rho_basis->npw,
+            rho_basis->ig_gge0,
+            iat2it_d,
+            gcar_d,
+            tau_d,
+            it_fact_d,
+            aux_d,
+            forceion_d);
         
-        it_facts_host[iat] = static_cast<FPTYPE>(zv * ModuleBase::e2 * ucell.tpiba * 
-                                                ModuleBase::TWO_PI / ucell.omega * fact);
-        
-        tau_flat[iat * 3 + 0] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][0]);
-        tau_flat[iat * 3 + 1] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][1]); 
-        tau_flat[iat * 3 + 2] = static_cast<FPTYPE>(ucell.atoms[it].tau[ia][2]);
-    }
-    
-    std::vector<FPTYPE> gcar_flat(rho_basis->npw * 3);
-    for (int ig = 0; ig < rho_basis->npw; ig++) {
-        gcar_flat[ig * 3 + 0] = static_cast<FPTYPE>(rho_basis->gcar[ig][0]);
-        gcar_flat[ig * 3 + 1] = static_cast<FPTYPE>(rho_basis->gcar[ig][1]);
-        gcar_flat[ig * 3 + 2] = static_cast<FPTYPE>(rho_basis->gcar[ig][2]);
-    }
-    
-    std::vector<std::complex<FPTYPE>> aux_fptype(rho_basis->npw);
-    for (int ig = 0; ig < rho_basis->npw; ig++) {
-        aux_fptype[ig] = static_cast<std::complex<FPTYPE>>(aux[ig]);
-    }
-    
-    FPTYPE* d_gcar = gcar_flat.data();
-    FPTYPE* d_tau = tau_flat.data();
-    FPTYPE* d_it_facts = it_facts_host.data();
-    std::complex<FPTYPE>* d_aux = aux_fptype.data();
-    FPTYPE* d_force_g = nullptr;
-    std::vector<FPTYPE> force_g_host(this->nat * 3);
-    
-    if (this->device == base_device::GpuDevice)
-    {
-        d_gcar = nullptr;
-        d_tau = nullptr;
-        d_it_facts = nullptr;
-        d_aux = nullptr;
-        
-        resmem_var_op()(this->ctx, d_gcar, rho_basis->npw * 3);
-        resmem_var_op()(this->ctx, d_tau, this->nat * 3);
-        resmem_var_op()(this->ctx, d_it_facts, this->nat);
-        resmem_complex_op()(this->ctx, d_aux, rho_basis->npw);
-        resmem_var_op()(this->ctx, d_force_g, this->nat * 3);
-        
-        
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gcar, gcar_flat.data(), rho_basis->npw * 3);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_tau, tau_flat.data(), this->nat * 3);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_it_facts, it_facts_host.data(), this->nat);
-        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_aux, aux_fptype.data(), rho_basis->npw);
-        
-     
-        base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, d_force_g, 0.0, this->nat * 3);
-    }
-    else
-    {
-        d_force_g = force_g_host.data();
-        std::fill(force_g_host.begin(), force_g_host.end(), static_cast<FPTYPE>(0.0));
-    }
-    
-    // call op for sincos calculation
-    hamilt::cal_force_ew_sincos_op<FPTYPE, Device>()(
-        this->ctx,
-        this->nat,
-        rho_basis->npw,
-        rho_basis->ig_gge0,  
-        d_gcar,
-        d_tau,
-        d_it_facts,
-        d_aux,
-        d_force_g
-    );
-    
-   
-    if (this->device == base_device::GpuDevice)
+        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, forceion.c, forceion_d, this->nat * 3);
+        delmem_int_op()(this->ctx,iat2it_d);
+        delmem_var_op()(this->ctx,gcar_d);
+        delmem_var_op()(this->ctx,tau_d);
+        delmem_var_op()(this->ctx,it_fact_d);
+        delmem_complex_op()(this->ctx,aux_d);
+        delmem_var_op()(this->ctx,forceion_d);
+    } else // calculate forces on CPU
     {
-        
-        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_g_host.data(), d_force_g, this->nat * 3);
-        
-        
-        delmem_var_op()(this->ctx, d_gcar);
-        delmem_var_op()(this->ctx, d_tau);
-        delmem_var_op()(this->ctx, d_it_facts);
-        delmem_complex_op()(this->ctx, d_aux);
-        delmem_var_op()(this->ctx, d_force_g);
-    }
-    
+    #pragma omp parallel for
+        for(int iat = 0; iat < this->nat; ++iat)
+        {
+            const int it = ucell.iat2it[iat];
+            const int ia = ucell.iat2ia[iat];
+            double it_fact = ucell.atoms[it].ncpp.zv * ModuleBase::e2 * ucell.tpiba * ModuleBase::TWO_PI / ucell.omega * fact;
 
-    for (int iat = 0; iat < this->nat; iat++) {
-        forceion(iat, 0) += static_cast<double>(force_g_host[iat * 3 + 0]);
-        forceion(iat, 1) += static_cast<double>(force_g_host[iat * 3 + 1]); 
-        forceion(iat, 2) += static_cast<double>(force_g_host[iat * 3 + 2]);
+            for(int ig = 0; ig < rho_basis->npw; ++ig)
+            {
+                if(ig != rho_basis->ig_gge0) // skip G=0
+                {
+                    const ModuleBase::Vector3<double> gcar = rho_basis->gcar[ig];
+                    const double arg = ModuleBase::TWO_PI * (gcar * ucell.atoms[it].tau[ia]);
+                    double sinp, cosp;
+                    ModuleBase::libm::sincos(arg, &sinp, &cosp);
+                    double sumnb = -cosp * aux[ig].imag() + sinp * aux[ig].real();
+                    forceion(iat, 0) += gcar[0] * sumnb;
+                    forceion(iat, 1) += gcar[1] * sumnb;
+                    forceion(iat, 2) += gcar[2] * sumnb;
+                }
+            }
+            forceion(iat, 0) *= it_fact;
+            forceion(iat, 1) *= it_fact;
+            forceion(iat, 2) *= it_fact;
+        }
     }
-
-   
-// calculate real space force
-#ifdef _OPENMP
-#pragma omp parallel
+    // means that the processor contains G=0 term.
+    #pragma omp parallel
     {
-        int num_threads = omp_get_num_threads();
-        int thread_id = omp_get_thread_num();
-#else
-    int num_threads = 1;
-    int thread_id = 0;
-#endif
-
-        /* Here is task distribution for multi-thread,
-            0. atom will be iterated both in main nat loop and the loop in `if (rho_basis->ig_gge0 >= 0)`.
-                To avoid syncing, we must calculate work range of each thread by our self
-            1. Calculate the iat range [iat_beg, iat_end) by each thread
-                a. when it is single thread stage, [iat_beg, iat_end) will be [0, nat)
-            2. each thread iterate atoms form `iat_beg` to `iat_end-1`
-        */
-        int iat_beg, iat_end;
-        int it_beg, ia_beg;
-        ModuleBase::TASK_DIST_1D(num_threads, thread_id, this->nat, iat_beg, iat_end);
-        iat_end = iat_beg + iat_end;
-        ucell.iat2iait(iat_beg, &ia_beg, &it_beg);
-
-        
         if (rho_basis->ig_gge0 >= 0)
         {
             double rmax = 5.0 / (sqrt(alpha) * ucell.lat0);
@@ -890,84 +658,55 @@ void Forces<FPTYPE, Device>::cal_force_ew(const UnitCell& ucell,
             // output of rgen: the number of vectors in the sphere
             const int mxr = 200;
             // the maximum number of R vectors included in r
-            ModuleBase::Vector3<double>* r = new ModuleBase::Vector3<double>[mxr];
-            double* r2 = new double[mxr];
-            ModuleBase::GlobalFunc::ZEROS(r2, mxr);
-            int* irr = new int[mxr];
-            ModuleBase::GlobalFunc::ZEROS(irr, mxr);
+            std::vector<ModuleBase::Vector3<double>> r(mxr);
+            std::vector<double> r2(mxr);
+            std::vector<int> irr(mxr);
             // the square modulus of R_j-tau_s-tau_s'
 
-            int iat1 = iat_beg;
-            int T1 = it_beg;
-            int I1 = ia_beg;
             const double sqa = sqrt(alpha);
             const double sq8a_2pi = sqrt(8.0 * alpha / ModuleBase::TWO_PI);
 
             // iterating atoms.
-            // do not need to sync threads because task range of each thread is isolated
-            while (iat1 < iat_end)
+            #pragma omp for
+            for(int iat1 = 0; iat1 < this->nat; iat1++)
             {
-                int iat2 = 0; // mohan fix bug 2011-06-07
-                int I2 = 0;
-                int T2 = 0;
-                while (iat2 < this->nat)
+                int T1 = ucell.iat2it[iat1];
+                int I1 = ucell.iat2ia[iat1];
+                for(int iat2 = 0; iat2 < this->nat; iat2++)
                 {
-                    if (iat1 != iat2 && ucell.atoms[T2].na != 0 && ucell.atoms[T1].na != 0)
+                    int T2 = ucell.iat2it[iat2];
+                    int I2 = ucell.iat2ia[iat2];
+                    if (iat1 != iat2)
                     {
                         ModuleBase::Vector3<double> d_tau
                             = ucell.atoms[T1].tau[I1] - ucell.atoms[T2].tau[I2];
-                        H_Ewald_pw::rgen(d_tau, rmax, irr, ucell.latvec, ucell.G, r, r2, nrm);
+                        H_Ewald_pw::rgen(d_tau, rmax, irr.data(), ucell.latvec, ucell.G, r.data(), r2.data(), nrm);
 
                         for (int n = 0; n < nrm; n++)
                         {
                             const double rr = sqrt(r2[n]) * ucell.lat0;
 
                             double factor;
-                            if (PARAM.inp.use_paw)
-                            {
-#ifdef USE_PAW
-                                factor = GlobalC::paw_cell.get_val(T1) * GlobalC::paw_cell.get_val(T2) * ModuleBase::e2
-                                         / (rr * rr)
-                                         * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr))
-                                         * ucell.lat0;
-#endif
-                            }
-                            else
                             {
                                 factor = ucell.atoms[T1].ncpp.zv * ucell.atoms[T2].ncpp.zv
-                                         * ModuleBase::e2 / (rr * rr)
-                                         * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr))
-                                         * ucell.lat0;
+                                            * ModuleBase::e2 / (rr * rr)
+                                            * (erfc(sqa * rr) / rr + sq8a_2pi * ModuleBase::libm::exp(-alpha * rr * rr))
+                                            * ucell.lat0;
                             }
-
                             forceion(iat1, 0) -= factor * r[n].x;
                             forceion(iat1, 1) -= factor * r[n].y;
                             forceion(iat1, 2) -= factor * r[n].z;
                         }
                     }
-                    ++iat2;
-                    ucell.step_iait(&I2, &T2);
                 } // atom b
-                ++iat1;
-                ucell.step_iait(&I1, &T1);
             } // atom a
-
-            delete[] r;
-            delete[] r2;
-            delete[] irr;
         }
-#ifdef _OPENMP
     }
-#endif
-
     Parallel_Reduce::reduce_pool(forceion.c, forceion.nr * forceion.nc);
-
     // this->print(GlobalV::ofs_running, "ewald forces", forceion);
 
     ModuleBase::timer::tick("Forces", "cal_force_ew");
 
-    delete[] aux;
-
     return;
 }
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.h b/source/module_hamilt_pw/hamilt_pwdft/forces.h
index 695520dceb..370c6a2eb8 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces.h
@@ -37,7 +37,7 @@ class Forces
     void cal_force(UnitCell& ucell,
                    ModuleBase::matrix& force,
                    const elecstate::ElecState& elec,
-                   ModulePW::PW_Basis* rho_basis,
+                   const ModulePW::PW_Basis* rho_basis,
                    ModuleSymmetry::Symmetry* p_symm,
                    Structure_Factor* p_sf,
                    surchem& solvent,
@@ -53,15 +53,15 @@ class Forces
 
     void cal_force_loc(const UnitCell& ucell,
                        ModuleBase::matrix& forcelc,
-                       ModulePW::PW_Basis* rho_basis,
+                       const ModulePW::PW_Basis* rho_basis,
                        const ModuleBase::matrix& vloc,
                        const Charge* const chr);
     void cal_force_ew(const UnitCell& ucell,
                       ModuleBase::matrix& forceion, 
-                      ModulePW::PW_Basis* rho_basis, 
+                      const ModulePW::PW_Basis* rho_basis, 
                       const Structure_Factor* p_sf);
     void cal_force_cc(ModuleBase::matrix& forcecc,
-                      ModulePW::PW_Basis* rho_basis,
+                      const ModulePW::PW_Basis* rho_basis,
                       const Charge* const chr,
                       const bool* numeric,
                       UnitCell& ucell_in);
@@ -96,13 +96,13 @@ class Forces
                       const UnitCell& ucell_in,
                       const psi::Psi<complex<FPTYPE>, Device>* psi_in = nullptr);
     void cal_force_scc(ModuleBase::matrix& forcescc,
-                       ModulePW::PW_Basis* rho_basis,
+                       const ModulePW::PW_Basis* rho_basis,
                        const ModuleBase::matrix& v_current,
                        const bool vnew_exist,
                        const bool* numeric,
                        const UnitCell& ucell_in);
     void cal_force_us(ModuleBase::matrix& forcenl,
-                      ModulePW::PW_Basis* rho_basis,
+                      const ModulePW::PW_Basis* rho_basis,
                       const pseudopot_cell_vnl& ppcell_in,
                       const elecstate::ElecState& elec,
                       const UnitCell& ucell);
@@ -113,7 +113,7 @@ class Forces
                      const FPTYPE* rab,
                      const FPTYPE* rhoc,
                      FPTYPE* drhocg,
-                     ModulePW::PW_Basis* rho_basis,
+                     const ModulePW::PW_Basis* rho_basis,
                      int type,
                      const UnitCell& ucell_in); // used in nonlinear core correction stress
     void deriv_drhoc_scc(const bool& numeric,
@@ -122,7 +122,7 @@ class Forces
                      const FPTYPE* rab,
                      const FPTYPE* rhoc,
                      FPTYPE* drhocg,
-                     ModulePW::PW_Basis* rho_basis,
+                     const ModulePW::PW_Basis* rho_basis,
                      const UnitCell& ucell_in); // used in nonlinear core correction stress
   protected:
     Device* ctx = {};
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp
index 3346724deb..f3bc7469a3 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp
@@ -31,7 +31,7 @@
 
 template <typename FPTYPE, typename Device>
 void Forces<FPTYPE, Device>::cal_force_cc(ModuleBase::matrix& forcecc,
-                                          ModulePW::PW_Basis* rho_basis,
+                                          const ModulePW::PW_Basis* rho_basis,
                                           const Charge* const chr,
                                           const bool* numeric,
                                            UnitCell& ucell_in)
@@ -239,7 +239,7 @@ void Forces<FPTYPE, Device>::deriv_drhoc
 	const FPTYPE *rab,
 	const FPTYPE *rhoc,
 	FPTYPE *drhocg,
-	ModulePW::PW_Basis* rho_basis,
+	const ModulePW::PW_Basis* rho_basis,
 	int type,
     const UnitCell& ucell_in
 )
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_scc.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_scc.cpp
index f670ad9b27..4b089d1d26 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces_scc.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces_scc.cpp
@@ -24,7 +24,7 @@
 
 template <typename FPTYPE, typename Device>
 void Forces<FPTYPE, Device>::cal_force_scc(ModuleBase::matrix& forcescc,
-                                           ModulePW::PW_Basis* rho_basis,
+                                           const ModulePW::PW_Basis* rho_basis,
                                            const ModuleBase::matrix& vnew,
                                            const bool vnew_exist,
                                            const bool* numeric,
@@ -147,7 +147,7 @@ void Forces<FPTYPE, Device>::deriv_drhoc_scc(const bool& numeric,
                                               const FPTYPE* rab,
                                               const FPTYPE* rhoc,
                                               FPTYPE* drhocg,
-                                              ModulePW::PW_Basis* rho_basis,
+                                              const ModulePW::PW_Basis* rho_basis,
                                               const UnitCell& ucell_in) {
     int igl0 = 0;
     double gx = 0;
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_us.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_us.cpp
index 4323ee6c4f..8879c89c35 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces_us.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces_us.cpp
@@ -15,7 +15,7 @@
 // On output: the contribution is added to \(\text{forcenl}\).
 template <typename FPTYPE, typename Device>
 void Forces<FPTYPE, Device>::cal_force_us(ModuleBase::matrix& forcenl,
-                                          ModulePW::PW_Basis* rho_basis,
+                                          const ModulePW::PW_Basis* rho_basis,
                                           const pseudopot_cell_vnl& nlpp,
                                           const elecstate::ElecState& elec,
                                           const UnitCell& ucell)
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
index 5ff3ddf2b1..b8bca46302 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
@@ -1,6 +1,7 @@
 #include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h"
-// #include "module_psi/kernels/device.h"
+// #include "source_psi/kernels/device.h"
 #include "module_base/module_device/types.h"
+#include "module_base/constants.h"
 
 #include <complex>
 
@@ -10,124 +11,18 @@
 #include <module_base/module_device/device.h>
 
 #define THREADS_PER_BLOCK 256
+#define FULL_MASK 0xffffffff
+#define WARP_SIZE 32
 
 namespace hamilt {
 
-// CUDA kernels for sincos loops
 template <typename FPTYPE>
-__global__ void cal_force_loc_sincos_kernel(
-    const int nat,
-    const int npw,
-    const int ntype,
-    const FPTYPE* gcar,
-    const FPTYPE* tau,
-    const FPTYPE* vloc_per_type,
-    const thrust::complex<FPTYPE>* aux,
-    const FPTYPE scale_factor,
-    FPTYPE* force)
-{
-    const FPTYPE TWO_PI = 2.0 * M_PI;
-    
-    const int iat = blockIdx.y;
-    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
-    const int ig_stride = gridDim.x * blockDim.x;
-    
-    if (iat >= nat) return;
-    
-    // Load atom information to registers
-    const FPTYPE tau_x = tau[iat * 3 + 0];
-    const FPTYPE tau_y = tau[iat * 3 + 1];
-    const FPTYPE tau_z = tau[iat * 3 + 2];
-    
-    // Local accumulation variables
-    FPTYPE local_force_x = 0.0;
-    FPTYPE local_force_y = 0.0;
-    FPTYPE local_force_z = 0.0;
-    
-    // Grid-stride loop over G-vectors
-    for (int ig = ig_start; ig < npw; ig += ig_stride) {
-        // Calculate phase: 2π * (G · τ)
-        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
-                                       gcar[ig * 3 + 1] * tau_y + 
-                                       gcar[ig * 3 + 2] * tau_z);
-        
-        // Use CUDA intrinsic for sincos
-        FPTYPE sinp, cosp;
-        sincos(phase, &sinp, &cosp);
-        
-        // Calculate force factor
-        const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig];
-        const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real());
-        
-        // Accumulate force contributions
-        local_force_x += gcar[ig * 3 + 0] * factor;
-        local_force_y += gcar[ig * 3 + 1] * factor;
-        local_force_z += gcar[ig * 3 + 2] * factor;
-    }
-    
-    // Atomic add to global memory
-    atomicAdd(&force[iat * 3 + 0], local_force_x * scale_factor);
-    atomicAdd(&force[iat * 3 + 1], local_force_y * scale_factor);
-    atomicAdd(&force[iat * 3 + 2], local_force_z * scale_factor);
-}
-
-template <typename FPTYPE>
-__global__ void cal_force_ew_sincos_kernel(
-    const int nat,
-    const int npw,
-    const int ig_gge0,
-    const FPTYPE* gcar,
-    const FPTYPE* tau,
-    const FPTYPE* it_facts,
-    const thrust::complex<FPTYPE>* aux,
-    FPTYPE* force)
-{
-    const FPTYPE TWO_PI = 2.0 * M_PI;
-    
-    const int iat = blockIdx.y;
-    const int ig_start = blockIdx.x * blockDim.x + threadIdx.x;
-    const int ig_stride = gridDim.x * blockDim.x;
-    
-    if (iat >= nat) return;
-    
-    // Load atom information to registers
-    const FPTYPE tau_x = tau[iat * 3 + 0];
-    const FPTYPE tau_y = tau[iat * 3 + 1];
-    const FPTYPE tau_z = tau[iat * 3 + 2];
-    const FPTYPE it_fact = it_facts[iat];
-    
-    // Local accumulation variables
-    FPTYPE local_force_x = 0.0;
-    FPTYPE local_force_y = 0.0;
-    FPTYPE local_force_z = 0.0;
-    
-    // Grid-stride loop over G-vectors
-    for (int ig = ig_start; ig < npw; ig += ig_stride) {
-        // Skip G=0 term
-        if (ig == ig_gge0) continue;
-        
-        // Calculate phase: 2π * (G · τ)
-        const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
-                                       gcar[ig * 3 + 1] * tau_y + 
-                                       gcar[ig * 3 + 2] * tau_z);
-        
-        // Use CUDA intrinsic for sincos
-        FPTYPE sinp, cosp;
-        sincos(phase, &sinp, &cosp);
-        
-        // Calculate Ewald sum contribution (fixed sign error)
-        const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real());
-        
-        // Accumulate force contributions
-        local_force_x += gcar[ig * 3 + 0] * factor;
-        local_force_y += gcar[ig * 3 + 1] * factor;
-        local_force_z += gcar[ig * 3 + 2] * factor;
+__forceinline__
+__device__
+void warp_reduce(FPTYPE & val) {
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        val += __shfl_down_sync(FULL_MASK, val, offset);
     }
-    
-    // Atomic add to global memory
-    atomicAdd(&force[iat * 3 + 0], local_force_x);
-    atomicAdd(&force[iat * 3 + 1], local_force_y);
-    atomicAdd(&force[iat * 3 + 2], local_force_z);
 }
 
 template <typename FPTYPE>
@@ -304,65 +199,6 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
     cudaCheckOnDebug();
 }
 
-// GPU operators
-template <typename FPTYPE>
-void cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
-    const base_device::DEVICE_GPU* ctx,
-    const int& nat,
-    const int& npw,
-    const int& ntype,
-    const FPTYPE* gcar,
-    const FPTYPE* tau,
-    const FPTYPE* vloc_per_type,
-    const std::complex<FPTYPE>* aux,
-    const FPTYPE& scale_factor,
-    FPTYPE* force)
-{
-    // Calculate optimal grid configuration for GPU load balancing
-    const int threads_per_block = THREADS_PER_BLOCK;
-    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
-    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
-    
-    dim3 grid(max_blocks_x, nat);
-    dim3 block(threads_per_block);
-    
-    cal_force_loc_sincos_kernel<FPTYPE><<<grid, block>>>(
-        nat, npw, ntype, gcar, tau, vloc_per_type,
-        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
-        scale_factor, force
-    );
-    
-    cudaCheckOnDebug();
-}
-
-template <typename FPTYPE>
-void cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
-    const base_device::DEVICE_GPU* ctx,
-    const int& nat,
-    const int& npw,
-    const int& ig_gge0,
-    const FPTYPE* gcar,
-    const FPTYPE* tau,
-    const FPTYPE* it_facts,
-    const std::complex<FPTYPE>* aux,
-    FPTYPE* force)
-{
-    // Calculate optimal grid configuration for GPU load balancing
-    const int threads_per_block = THREADS_PER_BLOCK;
-    const int max_blocks_per_sm = 32; // Configurable per GPU architecture
-    const int max_blocks_x = std::min(max_blocks_per_sm, (npw + threads_per_block - 1) / threads_per_block);
-    
-    dim3 grid(max_blocks_x, nat);
-    dim3 block(threads_per_block);
-    
-    cal_force_ew_sincos_kernel<FPTYPE><<<grid, block>>>(
-        nat, npw, ig_gge0, gcar, tau, it_facts,
-        reinterpret_cast<const thrust::complex<FPTYPE>*>(aux), force
-    );
-    
-    cudaCheckOnDebug();
-}
-
 template <typename FPTYPE>
 __global__ void cal_force_nl(
         const int ntype,
@@ -781,6 +617,242 @@ void revertVkbValues(
     cudaCheckOnDebug();
 }
 
+template <typename FPTYPE>
+__global__ void force_loc_kernel(
+    const int nat,
+    const int npw,
+    const FPTYPE tpiba_omega,
+    const int* iat2it,
+    const int* ig2gg_d,
+    const FPTYPE* gcar_d,
+    const FPTYPE* tau_d,
+    const thrust::complex<FPTYPE>* aux_d,
+    const FPTYPE* vloc_d,
+    const int vloc_nc,
+    FPTYPE* forcelc_d)
+{
+    const int iat = blockIdx.x;
+    const int tid = threadIdx.x;
+    const int warp_id = tid / WARP_SIZE;
+    const int lane_id = tid % WARP_SIZE;
+    
+    if (iat >= nat) return;
+    const int it = iat2it[iat]; // get the type of atom
+
+    // Initialize force components
+    FPTYPE force_x = 0.0;
+    FPTYPE force_y = 0.0;
+    FPTYPE force_z = 0.0;
+    
+    const auto tau_x = tau_d[iat * 3 + 0];
+    const auto tau_y = tau_d[iat * 3 + 1];
+    const auto tau_z = tau_d[iat * 3 + 2];
+
+    // Process all plane waves in chunks of blockDim.x
+    for (int ig = tid; ig < npw; ig += blockDim.x) {
+        const auto gcar_x = gcar_d[ig * 3 + 0];
+        const auto gcar_y = gcar_d[ig * 3 + 1];
+        const auto gcar_z = gcar_d[ig * 3 + 2];
+
+        // Calculate phase factor
+        const FPTYPE phase = ModuleBase::TWO_PI * (gcar_x * tau_x +
+                                                   gcar_y * tau_y +
+                                                   gcar_z * tau_z);
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+        
+        // Get vloc value
+        const FPTYPE vloc_val = vloc_d[it * vloc_nc + ig2gg_d[ig]];
+        
+        // Calculate factor
+        const auto aux_val = aux_d[ig];
+        const FPTYPE factor = vloc_val * (cosp * aux_val.imag() + sinp * aux_val.real());
+        
+        // Multiply by gcar components
+        force_x += gcar_x * factor;
+        force_y += gcar_y * factor;
+        force_z += gcar_z * factor;
+    }
+    
+    // Warp-level reduction
+    warp_reduce<FPTYPE>(force_x);
+    warp_reduce<FPTYPE>(force_y);
+    warp_reduce<FPTYPE>(force_z);
+    
+    // First thread in each warp writes to shared memory
+    __shared__ FPTYPE warp_sums_x[THREADS_PER_BLOCK / WARP_SIZE]; // 256 threads / 32 = 8 warps
+    __shared__ FPTYPE warp_sums_y[THREADS_PER_BLOCK / WARP_SIZE];
+    __shared__ FPTYPE warp_sums_z[THREADS_PER_BLOCK / WARP_SIZE];
+    
+    if (lane_id == 0) {
+        warp_sums_x[warp_id] = force_x;
+        warp_sums_y[warp_id] = force_y;
+        warp_sums_z[warp_id] = force_z;
+    }
+    
+    __syncthreads();
+    
+    // Final reduction by first warp
+    if (warp_id == 0) {
+        FPTYPE final_x = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_x[lane_id] : 0.0;
+        FPTYPE final_y = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_y[lane_id] : 0.0;
+        FPTYPE final_z = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_z[lane_id] : 0.0;
+        
+        warp_reduce<FPTYPE>(final_x);
+        warp_reduce<FPTYPE>(final_y);
+        warp_reduce<FPTYPE>(final_z);
+        
+        if (lane_id == 0) {
+            forcelc_d[iat * 3 + 0] = final_x * tpiba_omega;
+            forcelc_d[iat * 3 + 1] = final_y * tpiba_omega;
+            forcelc_d[iat * 3 + 2] = final_z * tpiba_omega;
+        }
+    }
+}
+
+template <typename FPTYPE>
+__global__ void force_ew_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const int* iat2it,
+    const FPTYPE* gcar_d,
+    const FPTYPE* tau_d,
+    const FPTYPE* it_fact_d,
+    const thrust::complex<FPTYPE>* aux_d,
+    FPTYPE* forceion_d)
+{
+    const int iat = blockIdx.x;
+    const int tid = threadIdx.x;
+    const int warp_id = tid / WARP_SIZE;
+    const int lane_id = tid % WARP_SIZE;
+
+    if( iat >= nat) return;
+    const int it = iat2it[iat]; // get the type of atom
+    const FPTYPE it_fact_val = it_fact_d[it]; // Get it_fact value
+
+    // Initialize force components
+    FPTYPE force_x = 0.0;
+    FPTYPE force_y = 0.0;
+    FPTYPE force_z = 0.0;
+
+    const auto tau_x = tau_d[iat * 3 + 0];
+    const auto tau_y = tau_d[iat * 3 + 1];
+    const auto tau_z = tau_d[iat * 3 + 2];
+
+    for (int ig = tid; ig < npw; ig += blockDim.x) {
+        if(ig == ig_gge0)
+        { continue; }
+        const auto gcar_x = gcar_d[ig * 3 + 0];
+        const auto gcar_y = gcar_d[ig * 3 + 1];
+        const auto gcar_z = gcar_d[ig * 3 + 2];
+
+        // Calculate phase factor
+        const FPTYPE phase = ModuleBase::TWO_PI * (gcar_x * tau_x +
+                                                   gcar_y * tau_y +
+                                                   gcar_z * tau_z);
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+
+        // Calculate force contribution
+        const FPTYPE sumnb = -cosp * aux_d[ig].imag() + sinp * aux_d[ig].real();
+        
+        // Multiply by gcar components
+        force_x += gcar_x * sumnb;
+        force_y += gcar_y * sumnb;
+        force_z += gcar_z * sumnb;
+    }
+
+    // Warp-level reduction
+    warp_reduce<FPTYPE>(force_x);
+    warp_reduce<FPTYPE>(force_y);
+    warp_reduce<FPTYPE>(force_z);
+
+    // First thread in each warp writes to shared memory
+    __shared__ FPTYPE warp_sums_x[THREADS_PER_BLOCK / WARP_SIZE]; // 256 threads / 32 = 8 warps
+    __shared__ FPTYPE warp_sums_y[THREADS_PER_BLOCK / WARP_SIZE];
+    __shared__ FPTYPE warp_sums_z[THREADS_PER_BLOCK / WARP_SIZE];
+
+    if (lane_id == 0) {
+        warp_sums_x[warp_id] = force_x;
+        warp_sums_y[warp_id] = force_y;
+        warp_sums_z[warp_id] = force_z;
+    }
+
+    __syncthreads();
+
+    // Final reduction by first warp
+    if (warp_id == 0) {
+        FPTYPE final_x = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_x[lane_id] : 0.0;
+        FPTYPE final_y = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_y[lane_id] : 0.0;
+        FPTYPE final_z = (lane_id < blockDim.x/WARP_SIZE) ? warp_sums_z[lane_id] : 0.0;
+
+        warp_reduce<FPTYPE>(final_x);
+        warp_reduce<FPTYPE>(final_y);
+        warp_reduce<FPTYPE>(final_z);
+
+        if (lane_id == 0) {
+            forceion_d[iat * 3 + 0] = final_x * it_fact_val;
+            forceion_d[iat * 3 + 1] = final_y * it_fact_val;
+            forceion_d[iat * 3 + 2] = final_z * it_fact_val;
+        }
+    }
+}
+
+template <typename FPTYPE>
+void cal_force_loc_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const int nat,
+    const int npw,
+    const FPTYPE tpiba_omega,
+    const int* iat2it,
+    const int* ig2igg,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const std::complex<FPTYPE>* aux,
+    const FPTYPE* vloc,
+    const int vloc_nc,
+    FPTYPE* forcelc)
+{
+    force_loc_kernel<FPTYPE>
+        <<<nat, THREADS_PER_BLOCK>>>(nat,
+                                     npw,
+                                     tpiba_omega,
+                                     iat2it,
+                                     ig2igg,
+                                     gcar,
+                                     tau,
+                                     reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
+                                     vloc,
+                                     vloc_nc,
+                                     forcelc); // array of data
+
+}
+
+template <typename FPTYPE>
+void cal_force_ew_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const int* iat2it,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* it_fact,
+    const std::complex<FPTYPE>* aux,
+    FPTYPE* forceion)
+{
+    force_ew_kernel<FPTYPE>
+        <<<nat, THREADS_PER_BLOCK>>>(nat,
+                                     npw,
+                                     ig_gge0,
+                                     iat2it,
+                                     gcar,
+                                     tau,
+                                     it_fact,
+                                     reinterpret_cast<const thrust::complex<FPTYPE>*>(aux),
+                                     forceion); // array of data
+}
+
+
 // for revertVkbValues functions instantiation
 template void revertVkbValues<double>(const int *gcar_zero_ptrs, std::complex<double> *vkb_ptr, const std::complex<double> *vkb_save_ptr, int nkb, int gcar_zero_count, int npw, int ipol, int npwx, const std::complex<double> coeff);
 // for saveVkbValues functions instantiation
@@ -788,12 +860,11 @@ template void saveVkbValues<double>(const int *gcar_zero_ptrs, const std::comple
 
 template struct cal_vkb1_nl_op<float, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<float, base_device::DEVICE_GPU>;
-template struct cal_force_loc_sincos_op<float, base_device::DEVICE_GPU>;
-template struct cal_force_ew_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_loc_op<float, base_device::DEVICE_GPU>;
+template struct cal_force_ew_op<float, base_device::DEVICE_GPU>;
 
 template struct cal_vkb1_nl_op<double, base_device::DEVICE_GPU>;
 template struct cal_force_nl_op<double, base_device::DEVICE_GPU>;
-template struct cal_force_loc_sincos_op<double, base_device::DEVICE_GPU>;
-template struct cal_force_ew_sincos_op<double, base_device::DEVICE_GPU>;
-
+template struct cal_force_loc_op<double, base_device::DEVICE_GPU>;
+template struct cal_force_ew_op<double, base_device::DEVICE_GPU>;
 }  // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
index 109321d6c3..367415d76b 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
@@ -428,116 +428,10 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
     }
 };
 
-// CPU implementation of local force sincos operator
-template <typename FPTYPE>
-struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_CPU>
-{
-    void operator()(const base_device::DEVICE_CPU* ctx,
-                    const int& nat,
-                    const int& npw,
-                    const int& ntype,
-                    const FPTYPE* gcar,
-                    const FPTYPE* tau,
-                    const FPTYPE* vloc_per_type,
-                    const std::complex<FPTYPE>* aux,
-                    const FPTYPE& scale_factor,
-                    FPTYPE* force)
-    {
-        const FPTYPE TWO_PI = 2.0 * M_PI;
-
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-        for (int iat = 0; iat < nat; ++iat)
-        {
-            const FPTYPE tau_x = tau[iat * 3 + 0];
-            const FPTYPE tau_y = tau[iat * 3 + 1];
-            const FPTYPE tau_z = tau[iat * 3 + 2];
-
-            FPTYPE local_force[3] = {0.0, 0.0, 0.0};
-
-            for (int ig = 0; ig < npw; ig++)
-            {
-                const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
-                                               gcar[ig * 3 + 1] * tau_y + 
-                                               gcar[ig * 3 + 2] * tau_z);
-                FPTYPE sinp, cosp;
-                ModuleBase::libm::sincos(phase, &sinp, &cosp);
-                
-                const FPTYPE vloc_factor = vloc_per_type[iat * npw + ig];
-                const FPTYPE factor = vloc_factor * (cosp * aux[ig].imag() + sinp * aux[ig].real()) * scale_factor;
-                
-                local_force[0] += gcar[ig * 3 + 0] * factor;
-                local_force[1] += gcar[ig * 3 + 1] * factor;
-                local_force[2] += gcar[ig * 3 + 2] * factor;
-            }
-
-            force[iat * 3 + 0] = local_force[0];
-            force[iat * 3 + 1] = local_force[1];
-            force[iat * 3 + 2] = local_force[2];
-        }
-    }
-};
-
-// CPU implementation of Ewald force sincos operator
-template <typename FPTYPE>
-struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_CPU>
-{
-    void operator()(const base_device::DEVICE_CPU* ctx,
-                    const int& nat,
-                    const int& npw,
-                    const int& ig_gge0,
-                    const FPTYPE* gcar,
-                    const FPTYPE* tau,
-                    const FPTYPE* it_facts,
-                    const std::complex<FPTYPE>* aux,
-                    FPTYPE* force)
-    {
-        const FPTYPE TWO_PI = 2.0 * M_PI;
-
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-        for (int iat = 0; iat < nat; ++iat)
-        {
-            const FPTYPE tau_x = tau[iat * 3 + 0];
-            const FPTYPE tau_y = tau[iat * 3 + 1];
-            const FPTYPE tau_z = tau[iat * 3 + 2];
-            const FPTYPE it_fact = it_facts[iat];
-
-            FPTYPE local_force[3] = {0.0, 0.0, 0.0};
-
-            for (int ig = 0; ig < npw; ig++)
-            {
-                // Skip G=0 term
-                if (ig == ig_gge0) continue;
-
-                const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x + 
-                                               gcar[ig * 3 + 1] * tau_y + 
-                                               gcar[ig * 3 + 2] * tau_z);
-                FPTYPE sinp, cosp;
-                ModuleBase::libm::sincos(phase, &sinp, &cosp);
-                
-                const FPTYPE factor = it_fact * (-cosp * aux[ig].imag() + sinp * aux[ig].real());
-                
-                local_force[0] += gcar[ig * 3 + 0] * factor;
-                local_force[1] += gcar[ig * 3 + 1] * factor;
-                local_force[2] += gcar[ig * 3 + 2] * factor;
-            }
-
-            force[iat * 3 + 0] = local_force[0];
-            force[iat * 3 + 1] = local_force[1];
-            force[iat * 3 + 2] = local_force[2];
-        }
-    }
-};
 
 template struct cal_vkb1_nl_op<float, base_device::DEVICE_CPU>;
 template struct cal_force_nl_op<float, base_device::DEVICE_CPU>;
-template struct cal_force_loc_sincos_op<float, base_device::DEVICE_CPU>;
-template struct cal_force_ew_sincos_op<float, base_device::DEVICE_CPU>;
+
 template struct cal_vkb1_nl_op<double, base_device::DEVICE_CPU>;
 template struct cal_force_nl_op<double, base_device::DEVICE_CPU>;
-template struct cal_force_loc_sincos_op<double, base_device::DEVICE_CPU>;
-template struct cal_force_ew_sincos_op<double, base_device::DEVICE_CPU>;
 } // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
index acf490b278..ca2c6694cf 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
@@ -146,67 +146,39 @@ struct cal_force_nl_op
                     const FPTYPE* lambda,
                     const std::complex<FPTYPE>* becp,
                     const std::complex<FPTYPE>* dbecp,
-                                          FPTYPE* force);
+                    FPTYPE* force);
 };
 
 template <typename FPTYPE, typename Device>
-struct cal_force_loc_sincos_op
-{
-    /// @brief Calculate local pseudopotential forces (sincos loop only)
-    ///
-    /// Input Parameters
-    /// @param ctx - which device this function runs on
-    /// @param nat - number of atoms
-    /// @param npw - number of plane waves
-    /// @param ntype - number of atom types
-    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
-    /// @param tau - atomic positions [nat * 3]
-    /// @param vloc_per_type - precomputed vloc factors per atom [nat * npw]
-    /// @param aux - charge density in G-space [npw]
-    /// @param scale_factor - tpiba * omega
-    ///
-    /// Output Parameters
-    /// @param force - output forces [nat * 3]
-    void operator()(const Device* ctx,
-                    const int& nat,
-                    const int& npw,
-                    const int& ntype,
-                    const FPTYPE* gcar,
-                    const FPTYPE* tau,
-                    const FPTYPE* vloc_per_type,
-                    const std::complex<FPTYPE>* aux,
-                    const FPTYPE& scale_factor,
-                    FPTYPE* force);
+struct cal_force_loc_op{
+    void operator()(
+        const int nat,
+        const int npw,
+        const FPTYPE tpiba_omega,
+        const int* iat2it,
+        const int* ig2igg,
+        const FPTYPE* gcar,
+        const FPTYPE* tau,
+        const std::complex<FPTYPE>* aux,
+        const FPTYPE* vloc,
+        const int vloc_nr,
+        FPTYPE* forcelc) {};
 };
 
 template <typename FPTYPE, typename Device>
-struct cal_force_ew_sincos_op
-{
-    /// @brief Calculate Ewald forces (sincos loop only)
-    ///
-    /// Input Parameters
-    /// @param ctx - which device this function runs on
-    /// @param nat - number of atoms
-    /// @param npw - number of plane waves
-    /// @param ig_gge0 - index of G=0 vector (-1 if not present)
-    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
-    /// @param tau - atomic positions [nat * 3]
-    /// @param it_facts - precomputed it_fact for each atom [nat]
-    /// @param aux - structure factor related array [npw]
-    ///
-    /// Output Parameters
-    /// @param force - output forces [nat * 3]
-    void operator()(const Device* ctx,
-                    const int& nat,
-                    const int& npw,
-                    const int& ig_gge0,
-                    const FPTYPE* gcar,
-                    const FPTYPE* tau,
-                    const FPTYPE* it_facts,
-                    const std::complex<FPTYPE>* aux,
-                    FPTYPE* force);
+struct cal_force_ew_op{
+    void operator()(
+        const int nat,
+        const int npw,
+        const int ig_gge0,
+        const int* iat2it,
+        const FPTYPE* gcar,
+        const FPTYPE* tau,
+        const FPTYPE* it_fact,
+        const std::complex<FPTYPE>* aux,
+        FPTYPE* forceion
+    ) {};
 };
-
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 template <typename FPTYPE>
 struct cal_vkb1_nl_op<FPTYPE, base_device::DEVICE_GPU>
@@ -306,35 +278,6 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     FPTYPE* force);
 };
 
-template <typename FPTYPE>
-struct cal_force_loc_sincos_op<FPTYPE, base_device::DEVICE_GPU>
-{
-    void operator()(const base_device::DEVICE_GPU* ctx,
-                    const int& nat,
-                    const int& npw,
-                    const int& ntype,
-                    const FPTYPE* gcar,
-                    const FPTYPE* tau,
-                    const FPTYPE* vloc_per_type,
-                    const std::complex<FPTYPE>* aux,
-                    const FPTYPE& scale_factor,
-                    FPTYPE* force);
-};
-
-template <typename FPTYPE>
-struct cal_force_ew_sincos_op<FPTYPE, base_device::DEVICE_GPU>
-{
-    void operator()(const base_device::DEVICE_GPU* ctx,
-                    const int& nat,
-                    const int& npw,
-                    const int& ig_gge0,
-                    const FPTYPE* gcar,
-                    const FPTYPE* tau,
-                    const FPTYPE* it_facts,
-                    const std::complex<FPTYPE>* aux,
-                    FPTYPE* force);
-};
-
 /**
  * @brief revert the vkb values for force_nl calculation
  */
@@ -362,6 +305,36 @@ void saveVkbValues(const int* gcar_zero_ptrs,
                    int ipol,
                    int npwx);
 
+template <typename FPTYPE>
+struct cal_force_loc_op<FPTYPE, base_device::DEVICE_GPU>{
+    void operator()(
+        const int nat,
+        const int npw,
+        const FPTYPE tpiba_omega,
+        const int* iat2it,
+        const int* ig2igg,
+        const FPTYPE* gcar,
+        const FPTYPE* tau,
+        const std::complex<FPTYPE>* aux,
+        const FPTYPE* vloc,
+        const int vloc_nr,
+        FPTYPE* forcelc);
+};
+
+template <typename FPTYPE>
+struct cal_force_ew_op<FPTYPE, base_device::DEVICE_GPU>{
+    void operator()(
+        const int nat,
+        const int npw,
+        const int ig_gge0,
+        const int* iat2it,
+        const FPTYPE* gcar,
+        const FPTYPE* tau,
+        const FPTYPE* it_fact,
+        const std::complex<FPTYPE>* aux,
+        FPTYPE* forceion
+    );
+};
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 } // namespace hamilt
-#endif // W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_HAMILT_PW_HAMILT_PWDFT_KERNELS_FORCE_OP_H
\ No newline at end of file
+#endif // W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_source_pw_HAMILT_PWDFT_KERNELS_FORCE_OP_H
\ No newline at end of file