diff --git a/source/Makefile.Objects b/source/Makefile.Objects
index 764cf0ce51..48179be83a 100644
--- a/source/Makefile.Objects
+++ b/source/Makefile.Objects
@@ -334,6 +334,8 @@ OBJS_HAMILT=hamilt_pw.o\
     op_exx_pw.o\
     ekinetic_pw.o\
     ekinetic_op.o\
+    exx_pw_ace.o\
+    exx_pw_pot.o\
     hpsi_norm_op.o\
     veff_pw.o\
     veff_op.o\
diff --git a/source/source_cell/pseudo.cpp b/source/source_cell/pseudo.cpp
index 233632069d..c6dc9de2f7 100644
--- a/source/source_cell/pseudo.cpp
+++ b/source/source_cell/pseudo.cpp
@@ -1,5 +1,6 @@
 #include "pseudo.h"
 #include "source_base/tool_title.h"
+#include <cstdint>
 
 pseudo::pseudo()
 {
diff --git a/source/source_esolver/esolver_ks_pw.cpp b/source/source_esolver/esolver_ks_pw.cpp
index 1f3c23da5a..66f0ca97d1 100644
--- a/source/source_esolver/esolver_ks_pw.cpp
+++ b/source/source_esolver/esolver_ks_pw.cpp
@@ -589,10 +589,20 @@ void ESolver_KS_PW<T, Device>::iter_finish(UnitCell& ucell, const int istep, int
                 auto start = std::chrono::high_resolution_clock::now();
                 exx_helper.set_firstiter(false);
                 exx_helper.op_exx->first_iter = false;
+                double dexx = 0.0;
+                if (PARAM.inp.exx_thr_type == "energy")
+                {
+                    dexx = exx_helper.cal_exx_energy(this->kspw_psi);
+                }
                 exx_helper.set_psi(this->kspw_psi);
+                if (PARAM.inp.exx_thr_type == "energy")
+                {
+                    dexx -= exx_helper.cal_exx_energy(this->kspw_psi);
+                    // std::cout << "dexx = " << dexx << std::endl;
+                }
+                bool conv_ene = std::abs(dexx) < PARAM.inp.exx_ene_thr;
 
-                conv_esolver = exx_helper.exx_after_converge(iter);
-
+                conv_esolver = exx_helper.exx_after_converge(iter, conv_ene);
                 if (!conv_esolver)
                 {
                     auto duration = std::chrono::high_resolution_clock::now() - start;
@@ -602,6 +612,7 @@ void ESolver_KS_PW<T, Device>::iter_finish(UnitCell& ucell, const int istep, int
                     exx_helper.op_exx->first_iter = false;
                     XC_Functional::set_xc_type(ucell.atoms[0].ncpp.xc_func);
                     update_pot(ucell, istep, iter, conv_esolver);
+                    exx_helper.iter_inc();
                 }
             }
         }
@@ -614,8 +625,9 @@ void ESolver_KS_PW<T, Device>::iter_finish(UnitCell& ucell, const int istep, int
     //----------------------------------------------------------
     // 3) Print out electronic wavefunctions in pw basis
     //----------------------------------------------------------
-    if (iter % PARAM.inp.out_freq_elec == 0 || iter == PARAM.inp.scf_nmax || conv_esolver)
+    if (iter % PARAM.inp.out_freq_elec == 0 || iter == PARAM.inp.scf_nmax)
     {
+        // conv_esolver == true has already been dealt with in after_scf
         ModuleIO::write_wfc_pw(GlobalV::KPAR,
                                GlobalV::MY_POOL,
                                GlobalV::MY_RANK,
diff --git a/source/source_io/module_parameter/input_parameter.h b/source/source_io/module_parameter/input_parameter.h
index db2b5d5a72..520db8d100 100644
--- a/source/source_io/module_parameter/input_parameter.h
+++ b/source/source_io/module_parameter/input_parameter.h
@@ -654,6 +654,8 @@ struct Input_para
     // EXX for planewave basis, rhx0820 2025-03-10
     bool exxace = true; // exxace, exact exchange for planewave basis, https://doi.org/10.1021/acs.jctc.6b00092
     bool exx_gamma_extrapolation = true; // gamma point extrapolation for exx, https://doi.org/10.1103/PhysRevB.79.205114
+    std::string exx_thr_type = "density"; // threshold type for exx outer loop, energy or density
+    double exx_ene_thr = 1e-5; // threshold for exx outer loop when exx_thr_type = energy
 
     // ====   #Parameters (23.XC external parameterization) ========
     /*
diff --git a/source/source_io/read_input_item_other.cpp b/source/source_io/read_input_item_other.cpp
index 5694c7b33b..cdd23df1c9 100644
--- a/source/source_io/read_input_item_other.cpp
+++ b/source/source_io/read_input_item_other.cpp
@@ -545,6 +545,32 @@ void ReadInput::item_others()
         read_sync_bool(input.exx_gamma_extrapolation);
         this->add_item(item);
     }
+    {
+        Input_Item item("exx_thr_type");
+        item.annotation = "threshold type for exx outer loop, energy or density";
+        read_sync_string(input.exx_thr_type);
+        item.check_value = [](const Input_Item& item, const Parameter& para) {
+            std::string thr_type = para.input.exx_thr_type;
+            std::transform(thr_type.begin(), thr_type.end(), thr_type.begin(), ::tolower);
+            if (thr_type != "energy" && thr_type != "density")
+            {
+                ModuleBase::WARNING_QUIT("ReadInput", "exx_thr_type should be energy or density");
+            }
+        };
+        this->add_item(item);
+    }
+    {
+        Input_Item item("exx_ene_thr");
+        item.annotation = "threshold for exx outer loop when exx_thr_type = energy";
+        read_sync_double(input.exx_ene_thr);
+        item.check_value = [](const Input_Item& item, const Parameter& para) {
+            if (para.input.exx_ene_thr <= 0)
+            {
+                ModuleBase::WARNING_QUIT("ReadInput", "exx_ene_thr must > 0");
+            }
+        };
+        this->add_item(item);
+    }
 
 }
 } // namespace ModuleIO
\ No newline at end of file
diff --git a/source/source_pw/module_pwdft/CMakeLists.txt b/source/source_pw/module_pwdft/CMakeLists.txt
index 03e808f6e6..81ad6ddc2c 100644
--- a/source/source_pw/module_pwdft/CMakeLists.txt
+++ b/source/source_pw/module_pwdft/CMakeLists.txt
@@ -10,6 +10,8 @@ list(APPEND objects
     operator_pw/operator_pw.cpp
     operator_pw/onsite_proj_pw.cpp
     operator_pw/op_exx_pw.cpp
+    operator_pw/exx_pw_ace.cpp
+    operator_pw/exx_pw_pot.cpp
     forces_nl.cpp
     forces_cc.cpp
     forces_scc.cpp
diff --git a/source/source_pw/module_pwdft/module_exx_helper/exx_helper.cpp b/source/source_pw/module_pwdft/module_exx_helper/exx_helper.cpp
index ffa671c42d..1b41163967 100644
--- a/source/source_pw/module_pwdft/module_exx_helper/exx_helper.cpp
+++ b/source/source_pw/module_pwdft/module_exx_helper/exx_helper.cpp
@@ -8,7 +8,7 @@ double Exx_Helper<T, Device>::cal_exx_energy(psi::Psi<T, Device> *psi_)
 }
 
 template <typename T, typename Device>
-bool Exx_Helper<T, Device>::exx_after_converge(int &iter)
+bool Exx_Helper<T, Device>::exx_after_converge(int &iter, bool ene_conv)
 {
     if (op_exx->first_iter)
     {
@@ -18,10 +18,20 @@ bool Exx_Helper<T, Device>::exx_after_converge(int &iter)
     {
         return true;
     }
-    else if (iter == 1)
+    else if (PARAM.inp.exx_thr_type == "energy" && ene_conv)
     {
         return true;
     }
+    else if (PARAM.inp.exx_thr_type == "density" && iter == 1)
+    {
+        return true;
+    }
+    else if (iter >= PARAM.inp.exx_hybrid_step)
+    {
+        GlobalV::ofs_running << " !!EXX IS NOT CONVERGED!!" << std::endl;
+        std::cout << " !!EXX IS NOT CONVERGED!!" << std::endl;
+        return true;
+    }
     GlobalV::ofs_running << "Updating EXX and rerun SCF" << std::endl;
     iter = 0;
     return false;
diff --git a/source/source_pw/module_pwdft/module_exx_helper/exx_helper.h b/source/source_pw/module_pwdft/module_exx_helper/exx_helper.h
index a67afbc80e..77c6c84295 100644
--- a/source/source_pw/module_pwdft/module_exx_helper/exx_helper.h
+++ b/source/source_pw/module_pwdft/module_exx_helper/exx_helper.h
@@ -21,6 +21,7 @@ struct Exx_Helper
     void set_firstiter(bool flag = true) { first_iter = flag; }
     void set_wg(const ModuleBase::matrix *wg_) { wg = wg_; }
     void set_psi(psi::Psi<T, Device> *psi_);
+    void iter_inc() { exx_iter++; }
 
     void set_op()
     {
@@ -29,7 +30,7 @@ struct Exx_Helper
         op_exx->set_wg(wg);
     }
 
-    bool exx_after_converge(int &iter);
+    bool exx_after_converge(int &iter, bool ene_conv);
 
     double cal_exx_energy(psi::Psi<T, Device> *psi_);
 
@@ -37,6 +38,7 @@ struct Exx_Helper
     bool first_iter = false;
     psi::Psi<T, Device> *psi = nullptr;
     const ModuleBase::matrix *wg = nullptr;
+    int exx_iter = 0;
 
 };
 #endif // EXX_HELPER_H
diff --git a/source/source_pw/module_pwdft/operator_pw/CMakeLists.txt b/source/source_pw/module_pwdft/operator_pw/CMakeLists.txt
index e4f2ef4f10..0a0e923ee0 100644
--- a/source/source_pw/module_pwdft/operator_pw/CMakeLists.txt
+++ b/source/source_pw/module_pwdft/operator_pw/CMakeLists.txt
@@ -6,6 +6,9 @@ list(APPEND operator_ks_pw_srcs
     meta_pw.cpp
     velocity_pw.cpp
     onsite_proj_pw.cpp
+    op_exx_pw.cpp
+    exx_pw_ace.cpp
+    exx_pw_pot.cpp
 )
 
 # this library is included in module_pwdft now
diff --git a/source/source_pw/module_pwdft/operator_pw/exx_pw_ace.cpp b/source/source_pw/module_pwdft/operator_pw/exx_pw_ace.cpp
new file mode 100644
index 0000000000..80c4a0dc6a
--- /dev/null
+++ b/source/source_pw/module_pwdft/operator_pw/exx_pw_ace.cpp
@@ -0,0 +1,247 @@
+#include "op_exx_pw.h"
+
+namespace hamilt
+{
+template <typename T, typename Device>
+void OperatorEXXPW<T, Device>::act_op_ace(const int nbands,
+                                          const int nbasis,
+                                          const int npol,
+                                          const T *tmpsi_in,
+                                          T *tmhpsi,
+                                          const int ngk_ik,
+                                          const bool is_first_node) const
+{
+    ModuleBase::timer::tick("OperatorEXXPW", "act_op_ace");
+    //    std::cout << "act_op_ace" << std::endl;
+    // hpsi += -Xi^\dagger * Xi * psi
+    T* Xi_ace = Xi_ace_k[this->ik];
+    int nbands_tot = psi.get_nbands();
+    int nbasis_max = psi.get_nbasis();
+    //    T* hpsi = nullptr;
+    //    resmem_complex_op()(hpsi, nbands_tot * nbasis);
+    //    setmem_complex_op()(hpsi, 0, nbands_tot * nbasis);
+    T* Xi_psi = nullptr;
+    resmem_complex_op()(Xi_psi, nbands_tot * nbands);
+    setmem_complex_op()(Xi_psi, 0, nbands_tot * nbands);
+
+    char trans_N = 'N', trans_T = 'T', trans_C = 'C';
+    T intermediate_one = 1.0, intermediate_zero = 0.0, intermediate_minus_one = -1.0;
+    // Xi * psi
+    gemm_complex_op()(trans_N,
+                      trans_N,
+                      nbands_tot,
+                      nbands,
+                      nbasis,
+                      &intermediate_one,
+                      Xi_ace,
+                      nbands_tot,
+                      tmpsi_in,
+                      nbasis,
+                      &intermediate_zero,
+                      Xi_psi,
+                      nbands_tot
+    );
+
+    Parallel_Reduce::reduce_pool(Xi_psi, nbands_tot * nbands);
+
+    // Xi^\dagger * (Xi * psi)
+    gemm_complex_op()(trans_C,
+                      trans_N,
+                      nbasis,
+                      nbands,
+                      nbands_tot,
+                      &intermediate_minus_one,
+                      Xi_ace,
+                      nbands_tot,
+                      Xi_psi,
+                      nbands_tot,
+                      &intermediate_one,
+                      tmhpsi,
+                      nbasis
+    );
+
+
+    //    // negative sign, add to hpsi
+    //    vec_add_vec_complex_op()(this->ctx, nbands * nbasis, tmhpsi, hpsi, -1, tmhpsi, 1);
+    //    delmem_complex_op()(hpsi);
+    delmem_complex_op()(Xi_psi);
+    ModuleBase::timer::tick("OperatorEXXPW", "act_op_ace");
+
+}
+
+template <typename T, typename Device>
+void OperatorEXXPW<T, Device>::construct_ace() const
+{
+    //    int nkb = p_exx_helper->psi.get_nbands() * p_exx_helper->psi.get_nk();
+    int nbands = psi.get_nbands();
+    int nbasis = psi.get_nbasis();
+    int nk = psi.get_nk();
+
+    int ik_save = this->ik;
+    int * ik_ = const_cast<int*>(&this->ik);
+
+    T intermediate_one = 1.0, intermediate_zero = 0.0;
+
+    if (h_psi_ace == nullptr)
+    {
+        resmem_complex_op()(h_psi_ace, nbands * nbasis);
+        setmem_complex_op()(h_psi_ace, 0, nbands * nbasis);
+    }
+
+    if (Xi_ace_k.size() != nk)
+    {
+        Xi_ace_k.resize(nk);
+        for (int i = 0; i < nk; i++)
+        {
+            resmem_complex_op()(Xi_ace_k[i], nbands * nbasis);
+        }
+    }
+
+    for (int i = 0; i < nk; i++)
+    {
+        setmem_complex_op()(Xi_ace_k[i], 0, nbands * nbasis);
+    }
+
+    if (L_ace == nullptr)
+    {
+        resmem_complex_op()(L_ace, nbands * nbands);
+        setmem_complex_op()(L_ace, 0, nbands * nbands);
+    }
+
+    if (psi_h_psi_ace == nullptr)
+    {
+        resmem_complex_op()(psi_h_psi_ace, nbands * nbands);
+    }
+
+    if (first_iter) return;
+    ModuleBase::timer::tick("OperatorEXXPW", "construct_ace");
+
+    for (int ik = 0; ik < nk; ik++)
+    {
+        int npwk = wfcpw->npwk[ik];
+
+        T* Xi_ace = Xi_ace_k[ik];
+        psi.fix_kb(ik, 0);
+        T* p_psi = psi.get_pointer();
+
+        setmem_complex_op()(h_psi_ace, 0, nbands * nbasis);
+
+        *ik_ = ik;
+
+        act_op(
+            nbands,
+            nbasis,
+            1,
+            p_psi,
+            h_psi_ace,
+            nbasis,
+            false
+        );
+
+        // psi_h_psi_ace = psi^\dagger * h_psi_ace
+        // p_exx_helper->psi.fix_kb(0, 0);
+        gemm_complex_op()('C',
+                          'N',
+                          nbands,
+                          nbands,
+                          npwk,
+                          &intermediate_one,
+                          p_psi,
+                          nbasis,
+                          h_psi_ace,
+                          nbasis,
+                          &intermediate_zero,
+                          psi_h_psi_ace,
+                          nbands);
+
+        // reduction of psi_h_psi_ace, due to distributed memory
+        Parallel_Reduce::reduce_pool(psi_h_psi_ace, nbands * nbands);
+
+        T intermediate_minus_one = -1.0;
+        axpy_complex_op()(nbands * nbands,
+                          &intermediate_minus_one,
+                          psi_h_psi_ace,
+                          1,
+                          L_ace,
+                          1);
+
+
+        int info = 0;
+        char up = 'U', lo = 'L';
+
+        lapack_potrf()(lo, nbands, L_ace, nbands);
+
+        // expand for-loop
+        for (int i = 0; i < nbands; ++i) {
+            setmem_complex_op()(L_ace + i * nbands, 0, i);
+        }
+
+        // L_ace inv in place
+        char non = 'N';
+        lapack_trtri()(lo, non, nbands, L_ace, nbands);
+
+        // Xi_ace = L_ace^-1 * h_psi_ace^dagger
+        gemm_complex_op()('N',
+                          'C',
+                          nbands,
+                          npwk,
+                          nbands,
+                          &intermediate_one,
+                          L_ace,
+                          nbands,
+                          h_psi_ace,
+                          nbasis,
+                          &intermediate_zero,
+                          Xi_ace,
+                          nbands);
+
+        // clear mem
+        setmem_complex_op()(h_psi_ace, 0, nbands * nbasis);
+        setmem_complex_op()(psi_h_psi_ace, 0, nbands * nbands);
+        setmem_complex_op()(L_ace, 0, nbands * nbands);
+
+    }
+
+    *ik_ = ik_save;
+    ModuleBase::timer::tick("OperatorEXXPW", "construct_ace");
+
+}
+
+template <typename T, typename Device>
+double OperatorEXXPW<T, Device>::cal_exx_energy_ace(psi::Psi<T, Device>* ppsi_) const
+{
+    double Eexx = 0;
+
+    psi::Psi<T, Device> psi_ = *ppsi_;
+    int* ik_ = const_cast<int*>(&this->ik);
+    int ik_save = this->ik;
+    for (int i = 0; i < wfcpw->nks; i++)
+    {
+        setmem_complex_op()(h_psi_ace, 0, psi_.get_nbands() * psi_.get_nbasis());
+        *ik_ = i;
+        psi_.fix_kb(i, 0);
+        T* psi_i = psi_.get_pointer();
+        act_op_ace(psi_.get_nbands(), psi_.get_nbasis(), 1, psi_i, h_psi_ace, 0, true);
+
+        for (int nband = 0; nband < psi_.get_nbands(); nband++)
+        {
+            psi_.fix_kb(i, nband);
+            T* psi_i_n = psi_.get_pointer();
+            T* hpsi_i_n = h_psi_ace + nband * psi_.get_nbasis();
+            double wg_i_n = (*wg)(i, nband);
+            // Eexx += dot(psi_i_n, h_psi_i_n)
+            Eexx += dot_op()(psi_.get_nbasis(), psi_i_n, hpsi_i_n, false) * wg_i_n * 2;
+        }
+    }
+
+    Parallel_Reduce::reduce_pool(Eexx);
+    *ik_ = ik_save;
+    return Eexx;
+}
+template class OperatorEXXPW<std::complex<float>, base_device::DEVICE_CPU>;
+template class OperatorEXXPW<std::complex<double>, base_device::DEVICE_CPU>;
+#if ((defined __CUDA) || (defined __ROCM))
+template class OperatorEXXPW<std::complex<float>, base_device::DEVICE_GPU>;
+template class OperatorEXXPW<std::complex<double>, base_device::DEVICE_GPU>;
+#endif
+}
\ No newline at end of file
diff --git a/source/source_pw/module_pwdft/operator_pw/exx_pw_pot.cpp b/source/source_pw/module_pwdft/operator_pw/exx_pw_pot.cpp
new file mode 100644
index 0000000000..6019ac110c
--- /dev/null
+++ b/source/source_pw/module_pwdft/operator_pw/exx_pw_pot.cpp
@@ -0,0 +1,552 @@
+#include "op_exx_pw.h"
+#include "source_io/module_parameter/parameter.h"
+#include "source_pw/module_pwdft/global.h"
+
+namespace hamilt
+{
+template <typename Real, typename Device>
+void get_exx_potential(const K_Vectors* kv,
+                       const ModulePW::PW_Basis_K* wfcpw,
+                       ModulePW::PW_Basis* rhopw_dev,
+                       Real* pot,
+                       double tpiba,
+                       bool gamma_extrapolation,
+                       double ucell_omega)
+{
+    using setmem_real_cpu_op = base_device::memory::set_memory_op<Real, base_device::DEVICE_CPU>;
+    using syncmem_real_c2d_op = base_device::memory::synchronize_memory_op<Real, base_device::DEVICE_CPU, Device>;
+
+    Real nqs_half1 = 0.5 * kv->nmp[0];
+    Real nqs_half2 = 0.5 * kv->nmp[1];
+    Real nqs_half3 = 0.5 * kv->nmp[2];
+
+    Real* pot_cpu = nullptr;
+    int nks = wfcpw->nks, npw = rhopw_dev->npw;
+    double tpiba2 = tpiba * tpiba;
+    pot_cpu = new Real[npw * wfcpw->nks * wfcpw->nks];
+    // fill zero
+    setmem_real_cpu_op()(pot_cpu, 0, npw * nks * nks);
+
+    // calculate Fock pot
+    auto param_fock = GlobalC::exx_info.info_global.coulomb_param[Conv_Coulomb_Pot_K::Coulomb_Type::Fock];
+    for (auto param: param_fock)
+    {
+        double exx_div = exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type::Fock,
+                                        0.0,
+                                        kv,
+                                        wfcpw,
+                                        rhopw_dev,
+                                        tpiba,
+                                        gamma_extrapolation,
+                                        ucell_omega);
+        double alpha = std::stod(param["alpha"]);
+        for (int ik = 0; ik < nks; ik++)
+        {
+            for (int iq = 0; iq < nks; iq++)
+            {
+                const ModuleBase::Vector3<double> k_c = wfcpw->kvec_c[ik];
+                const ModuleBase::Vector3<double> k_d = wfcpw->kvec_d[ik];
+                const ModuleBase::Vector3<double> q_c = wfcpw->kvec_c[iq];
+                const ModuleBase::Vector3<double> q_d = wfcpw->kvec_d[iq];
+
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static)
+#endif
+                for (int ig = 0; ig < rhopw_dev->npw; ig++)
+                {
+                    const ModuleBase::Vector3<double> g_d = rhopw_dev->gdirect[ig];
+                    const ModuleBase::Vector3<double> kqg_d = k_d - q_d + g_d;
+                    // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114)
+                    // 7/8 of the points in the grid are "activated" and 1/8 are disabled.
+                    // grid_factor is designed for the 7/8 of the grid to function like all of the points
+                    Real grid_factor = 1;
+                    double extrapolate_grid = 8.0 / 7.0;
+                    if (gamma_extrapolation)
+                    {
+                        // if isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3)
+                        auto isint = [](double x) {
+                            double epsilon = 1e-6; // this follows the isint judgement in q-e
+                            return std::abs(x - std::round(x)) < epsilon;
+                        };
+                        if (isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3))
+                        {
+                            grid_factor = 0;
+                        }
+                        else
+                        {
+                            grid_factor = extrapolate_grid;
+                        }
+                    }
+
+                    const int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1;
+                    const int nk = nks / nk_fac;
+                    const int ig_kq = ik * nks * npw + iq * npw + ig;
+
+                    Real gg = (k_c - q_c + rhopw_dev->gcar[ig]).norm2() * tpiba2;
+                    // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2
+                    if (gg >= 1e-8)
+                    {
+                        Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg;
+                        pot_cpu[ig_kq] += fac * grid_factor * alpha;
+                    }
+                    // }
+                    else
+                    {
+                        pot_cpu[ig_kq] += exx_div * alpha;
+                    }
+                    // assert(is_finite(density_recip[ig]));
+                }
+            }
+        }
+    }
+
+    // calculate erfc pot
+    auto param_erfc = GlobalC::exx_info.info_global.coulomb_param[Conv_Coulomb_Pot_K::Coulomb_Type::Erfc];
+    for (auto param: param_erfc)
+    {
+        double erfc_omega = std::stod(param["omega"]);
+        double erfc_omega2 = erfc_omega * erfc_omega;
+        double alpha = std::stod(param["alpha"]);
+        double exx_div = exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type::Erfc,
+                                        erfc_omega,
+                                        kv,
+                                        wfcpw,
+                                        rhopw_dev,
+                                        tpiba,
+                                        gamma_extrapolation,
+                                        ucell_omega);
+        for (int ik = 0; ik < nks; ik++)
+        {
+            for (int iq = 0; iq < nks; iq++)
+            {
+                const ModuleBase::Vector3<double> k_c = wfcpw->kvec_c[ik];
+                const ModuleBase::Vector3<double> k_d = wfcpw->kvec_d[ik];
+                const ModuleBase::Vector3<double> q_c = wfcpw->kvec_c[iq];
+                const ModuleBase::Vector3<double> q_d = wfcpw->kvec_d[iq];
+
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static)
+#endif
+                for (int ig = 0; ig < rhopw_dev->npw; ig++)
+                {
+                    const ModuleBase::Vector3<double> g_d = rhopw_dev->gdirect[ig];
+                    const ModuleBase::Vector3<double> kqg_d = k_d - q_d + g_d;
+                    // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114)
+                    // 7/8 of the points in the grid are "activated" and 1/8 are disabled.
+                    // grid_factor is designed for the 7/8 of the grid to function like all of the points
+                    Real grid_factor = 1;
+                    double extrapolate_grid = 8.0 / 7.0;
+                    if (gamma_extrapolation)
+                    {
+                        // if isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3)
+                        auto isint = [](double x) {
+                            double epsilon = 1e-6; // this follows the isint judgement in q-e
+                            return std::abs(x - std::round(x)) < epsilon;
+                        };
+                        if (isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3))
+                        {
+                            grid_factor = 0;
+                        }
+                        else
+                        {
+                            grid_factor = extrapolate_grid;
+                        }
+                    }
+
+                    const int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1;
+                    const int nk = nks / nk_fac;
+                    const int ig_kq = ik * nks * npw + iq * npw + ig;
+
+                    Real gg = (k_c - q_c + rhopw_dev->gcar[ig]).norm2() * tpiba2;
+                    // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2
+                    if (gg >= 1e-8)
+                    {
+                        Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg;
+                        pot_cpu[ig_kq] += fac * (1.0 - std::exp(-gg / 4.0 / erfc_omega2)) * grid_factor * alpha;
+                    }
+                    // }
+                    else
+                    {
+                        // if (PARAM.inp.dft_functional == "hse")
+                        if (!gamma_extrapolation)
+                        {
+                            pot_cpu[ig_kq] += (exx_div - ModuleBase::PI * ModuleBase::e2 / erfc_omega2) * alpha;
+                        }
+                        else
+                        {
+                            pot_cpu[ig_kq] += exx_div * alpha;
+                        }
+                    }
+                    // assert(is_finite(density_recip[ig]));
+                }
+            }
+        }
+    }
+
+    // copy the potential to the device memory
+    syncmem_real_c2d_op()(pot, pot_cpu, rhopw_dev->npw * wfcpw->nks * wfcpw->nks);
+
+    delete pot_cpu;
+}
+
+template <typename Real, typename Device>
+void get_exx_stress_potential(const K_Vectors* kv,
+                       const ModulePW::PW_Basis_K* wfcpw,
+                       ModulePW::PW_Basis* rhopw_dev,
+                       Real* pot,
+                       double tpiba,
+                       bool gamma_extrapolation,
+                       double ucell_omega)
+{
+    using setmem_real_cpu_op = base_device::memory::set_memory_op<Real, base_device::DEVICE_CPU>;
+    using syncmem_real_c2d_op = base_device::memory::synchronize_memory_op<Real, base_device::DEVICE_CPU, Device>;
+
+    Real nqs_half1 = 0.5 * kv->nmp[0];
+    Real nqs_half2 = 0.5 * kv->nmp[1];
+    Real nqs_half3 = 0.5 * kv->nmp[2];
+
+    Real* pot_cpu = nullptr;
+    int nks = wfcpw->nks, npw = rhopw_dev->npw;
+    double tpiba2 = tpiba * tpiba;
+    pot_cpu = new Real[npw * wfcpw->nks * wfcpw->nks];
+    // fill zero
+    setmem_real_cpu_op()(pot_cpu, 0, npw * nks * nks);
+
+    // calculate Fock pot
+    auto param_fock = GlobalC::exx_info.info_global.coulomb_param[Conv_Coulomb_Pot_K::Coulomb_Type::Fock];
+    for (auto param: param_fock)
+    {
+        double exx_div = exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type::Fock,
+                                        0.0,
+                                        kv,
+                                        wfcpw,
+                                        rhopw_dev,
+                                        tpiba,
+                                        gamma_extrapolation,
+                                        ucell_omega);
+        double alpha = std::stod(param["alpha"]);
+        for (int ik = 0; ik < nks; ik++)
+        {
+            for (int iq = 0; iq < nks; iq++)
+            {
+                const ModuleBase::Vector3<double> k_c = wfcpw->kvec_c[ik];
+                const ModuleBase::Vector3<double> k_d = wfcpw->kvec_d[ik];
+                const ModuleBase::Vector3<double> q_c = wfcpw->kvec_c[iq];
+                const ModuleBase::Vector3<double> q_d = wfcpw->kvec_d[iq];
+
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static)
+#endif
+                for (int ig = 0; ig < rhopw_dev->npw; ig++)
+                {
+                    const ModuleBase::Vector3<double> g_d = rhopw_dev->gdirect[ig];
+                    const ModuleBase::Vector3<double> kqg_d = k_d - q_d + g_d;
+                    // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114)
+                    // 7/8 of the points in the grid are "activated" and 1/8 are disabled.
+                    // grid_factor is designed for the 7/8 of the grid to function like all of the points
+                    Real grid_factor = 1;
+                    double extrapolate_grid = 8.0 / 7.0;
+                    if (gamma_extrapolation)
+                    {
+                        // if isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3)
+                        auto isint = [](double x) {
+                            double epsilon = 1e-6; // this follows the isint judgement in q-e
+                            return std::abs(x - std::round(x)) < epsilon;
+                        };
+                        if (isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3))
+                        {
+                            grid_factor = 0;
+                        }
+                        else
+                        {
+                            grid_factor = extrapolate_grid;
+                        }
+                    }
+
+                    const int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1;
+                    const int nk = nks / nk_fac;
+                    const int ig_kq = ik * nks * npw + iq * npw + ig;
+
+                    Real gg = (k_c - q_c + rhopw_dev->gcar[ig]).norm2() * tpiba2;
+                    // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2
+                    if (gg >= 1e-8)
+                    {
+                        Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg;
+                        pot_cpu[ig_kq] += 1.0 / gg * grid_factor * alpha;
+                    }
+                }
+            }
+        }
+    }
+
+    // calculate erfc pot
+    auto param_erfc = GlobalC::exx_info.info_global.coulomb_param[Conv_Coulomb_Pot_K::Coulomb_Type::Erfc];
+    for (auto param: param_erfc)
+    {
+        double erfc_omega = std::stod(param["omega"]);
+        double erfc_omega2 = erfc_omega * erfc_omega;
+        double alpha = std::stod(param["alpha"]);
+        double exx_div = exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type::Erfc,
+                                        erfc_omega,
+                                        kv,
+                                        wfcpw,
+                                        rhopw_dev,
+                                        tpiba,
+                                        gamma_extrapolation,
+                                        ucell_omega);
+        for (int ik = 0; ik < nks; ik++)
+        {
+            for (int iq = 0; iq < nks; iq++)
+            {
+                const ModuleBase::Vector3<double> k_c = wfcpw->kvec_c[ik];
+                const ModuleBase::Vector3<double> k_d = wfcpw->kvec_d[ik];
+                const ModuleBase::Vector3<double> q_c = wfcpw->kvec_c[iq];
+                const ModuleBase::Vector3<double> q_d = wfcpw->kvec_d[iq];
+
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static)
+#endif
+                for (int ig = 0; ig < rhopw_dev->npw; ig++)
+                {
+                    const ModuleBase::Vector3<double> g_d = rhopw_dev->gdirect[ig];
+                    const ModuleBase::Vector3<double> kqg_d = k_d - q_d + g_d;
+                    // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114)
+                    // 7/8 of the points in the grid are "activated" and 1/8 are disabled.
+                    // grid_factor is designed for the 7/8 of the grid to function like all of the points
+                    Real grid_factor = 1;
+                    double extrapolate_grid = 8.0 / 7.0;
+                    if (gamma_extrapolation)
+                    {
+                        // if isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3)
+                        auto isint = [](double x) {
+                            double epsilon = 1e-6; // this follows the isint judgement in q-e
+                            return std::abs(x - std::round(x)) < epsilon;
+                        };
+                        if (isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3))
+                        {
+                            grid_factor = 0;
+                        }
+                        else
+                        {
+                            grid_factor = extrapolate_grid;
+                        }
+                    }
+
+                    const int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1;
+                    const int nk = nks / nk_fac;
+                    const int ig_kq = ik * nks * npw + iq * npw + ig;
+
+                    Real gg = (k_c - q_c + rhopw_dev->gcar[ig]).norm2() * tpiba2;
+                    // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2
+                    if (gg >= 1e-8)
+                    {
+                        Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg;
+                        pot_cpu[ig_kq] += (1.0 - (1.0 + gg / 4.0 / erfc_omega2) * std::exp(-gg / 4.0 / erfc_omega2)) / (1.0 - std::exp(-gg / 4.0 / erfc_omega2)) / gg * grid_factor * alpha;
+                    }
+                    // }
+                    else
+                    {
+                        // if (PARAM.inp.dft_functional == "hse")
+                        if (!gamma_extrapolation)
+                        {
+                            pot_cpu[ig_kq] += 1.0 / 4.0 / erfc_omega2 * alpha;
+                        }
+                    }
+                    // assert(is_finite(density_recip[ig]));
+                }
+            }
+        }
+    }
+
+    // copy the potential to the device memory
+    syncmem_real_c2d_op()(pot, pot_cpu, rhopw_dev->npw * wfcpw->nks * wfcpw->nks);
+
+    delete pot_cpu;
+}
+
+double exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type coulomb_type,
+                      double erfc_omega,
+                      const K_Vectors* kv,
+                      const ModulePW::PW_Basis_K* wfcpw,
+                      ModulePW::PW_Basis* rhopw_dev,
+                      double tpiba,
+                      bool gamma_extrapolation,
+                      double ucell_omega)
+{
+    double exx_div = 0;
+
+    double nqs_half1 = 0.5 * kv->nmp[0];
+    double nqs_half2 = 0.5 * kv->nmp[1];
+    double nqs_half3 = 0.5 * kv->nmp[2];
+
+    int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1;
+
+    // here we follow the exx_divergence subroutine in q-e (PW/src/exx_base.f90)
+    double alpha = 10.0 / wfcpw->gk_ecut;
+    double tpiba2 = tpiba * tpiba;
+    double div = 0;
+
+    // this is the \sum_q F(q) part
+    // temporarily for all k points, should be replaced to q points later
+    for (int ik = 0; ik < wfcpw->nks; ik++)
+    {
+        const ModuleBase::Vector3<double> k_c = wfcpw->kvec_c[ik];
+        const ModuleBase::Vector3<double> k_d = wfcpw->kvec_d[ik];
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+ : div)
+#endif
+        for (int ig = 0; ig < rhopw_dev->npw; ig++)
+        {
+            const ModuleBase::Vector3<double> q_c = k_c + rhopw_dev->gcar[ig];
+            const ModuleBase::Vector3<double> q_d = k_d + rhopw_dev->gdirect[ig];
+            double qq = q_c.norm2();
+            // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114)
+            // 7/8 of the points in the grid are "activated" and 1/8 are disabled.
+            // grid_factor is designed for the 7/8 of the grid to function like all of the points
+            double grid_factor = 1;
+            double extrapolate_grid = 8.0 / 7.0;
+            if (gamma_extrapolation)
+            {
+                auto isint = [](double x) {
+                    double epsilon = 1e-6; // this follows the isint judgement in q-e
+                    return std::abs(x - std::round(x)) < epsilon;
+                };
+                if (isint(q_d[0] * nqs_half1) && isint(q_d[1] * nqs_half2) && isint(q_d[2] * nqs_half3))
+                {
+                    grid_factor = 0;
+                }
+                else
+                {
+                    grid_factor = extrapolate_grid;
+                }
+            }
+
+            if (qq <= 1e-8)
+                continue;
+            // else if (PARAM.inp.dft_functional == "hse")
+            else if (coulomb_type == Conv_Coulomb_Pot_K::Coulomb_Type::Erfc)
+            {
+                double omega = erfc_omega;
+                double omega2 = omega * omega;
+                div += std::exp(-alpha * qq) / qq * (1.0 - std::exp(-qq * tpiba2 / 4.0 / omega2)) * grid_factor;
+            }
+            else
+            {
+                div += std::exp(-alpha * qq) / qq * grid_factor;
+            }
+        }
+    }
+
+    Parallel_Reduce::reduce_pool(div);
+    // std::cout << "EXX div: " << div << std::endl;
+
+    // if (PARAM.inp.dft_functional == "hse")
+    if (!gamma_extrapolation)
+    {
+        if (coulomb_type == Conv_Coulomb_Pot_K::Coulomb_Type::Erfc)
+        {
+            double omega = erfc_omega;
+            div += tpiba2 / 4.0 / omega / omega; // compensate for the finite value when qq = 0
+        }
+        else
+        {
+            div -= alpha;
+        }
+    }
+
+    div *= ModuleBase::e2 * ModuleBase::FOUR_PI / tpiba2 / wfcpw->nks;
+    //    std::cout << "div: " << div << std::endl;
+
+    // numerically value the mean value of F(q) in the reciprocal space
+    // This means we need to calculate the average of F(q) in the first brillouin zone
+    alpha /= tpiba2;
+    int nqq = 100000;
+    double dq = 5.0 / std::sqrt(alpha) / nqq;
+    double aa = 0.0;
+    // if (PARAM.inp.dft_functional == "hse")
+    if (coulomb_type == Conv_Coulomb_Pot_K::Coulomb_Type::Erfc)
+    {
+        double omega = erfc_omega;
+        double omega2 = omega * omega;
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+ : aa)
+#endif
+        for (int i = 0; i < nqq; i++)
+        {
+            double q = dq * (i + 0.5);
+            aa -= exp(-alpha * q * q) * exp(-q * q / 4.0 / omega2) * dq;
+        }
+    }
+    aa *= 8 / ModuleBase::FOUR_PI;
+    aa += 1.0 / std::sqrt(alpha * ModuleBase::PI);
+
+    div -= ModuleBase::e2 * ucell_omega * aa;
+    exx_div = div * wfcpw->nks / nk_fac;
+    //    exx_div = 0;
+    //    std::cout << "EXX divergence: " << exx_div << std::endl;
+
+    return exx_div;
+}
+template class OperatorEXXPW<std::complex<float>, base_device::DEVICE_CPU>;
+template class OperatorEXXPW<std::complex<double>, base_device::DEVICE_CPU>;
+template void get_exx_potential<float, base_device::DEVICE_CPU>(const K_Vectors*,
+                                                                const ModulePW::PW_Basis_K*,
+                                                                ModulePW::PW_Basis*,
+                                                                float*,
+                                                                double,
+                                                                bool,
+                                                                double);
+template void get_exx_potential<double, base_device::DEVICE_CPU>(const K_Vectors*,
+                                                                 const ModulePW::PW_Basis_K*,
+                                                                 ModulePW::PW_Basis*,
+                                                                 double*,
+                                                                 double,
+                                                                 bool,
+                                                                 double);
+template void get_exx_stress_potential<float, base_device::DEVICE_CPU>(const K_Vectors*,
+                                                                const ModulePW::PW_Basis_K*,
+                                                                ModulePW::PW_Basis*,
+                                                                float*,
+                                                                double,
+                                                                bool,
+                                                                double);
+template void get_exx_stress_potential<double, base_device::DEVICE_CPU>(const K_Vectors*,
+                                                                 const ModulePW::PW_Basis_K*,
+                                                                 ModulePW::PW_Basis*,
+                                                                 double*,
+                                                                 double,
+                                                                 bool,
+                                                                 double);
+#if ((defined __CUDA) || (defined __ROCM))
+template class OperatorEXXPW<std::complex<float>, base_device::DEVICE_GPU>;
+template class OperatorEXXPW<std::complex<double>, base_device::DEVICE_GPU>;
+template void get_exx_potential<float, base_device::DEVICE_GPU>(const K_Vectors*,
+                                                                const ModulePW::PW_Basis_K*,
+                                                                ModulePW::PW_Basis*,
+                                                                float*,
+                                                                double,
+                                                                bool,
+                                                                double);
+template void get_exx_potential<double, base_device::DEVICE_GPU>(const K_Vectors*,
+                                                                 const ModulePW::PW_Basis_K*,
+                                                                 ModulePW::PW_Basis*,
+                                                                 double*,
+                                                                 double,
+                                                                 bool,
+                                                                 double);
+template void get_exx_stress_potential<float, base_device::DEVICE_GPU>(const K_Vectors*,
+                                                                const ModulePW::PW_Basis_K*,
+                                                                ModulePW::PW_Basis*,
+                                                                float*,
+                                                                double,
+                                                                bool,
+                                                                double);
+template void get_exx_stress_potential<double, base_device::DEVICE_GPU>(const K_Vectors*,
+                                                                 const ModulePW::PW_Basis_K*,
+                                                                 ModulePW::PW_Basis*,
+                                                                 double*,
+                                                                 double,
+                                                                 bool,
+                                                                 double);
+#endif
+} // namespace hamilt
\ No newline at end of file
diff --git a/source/source_pw/module_pwdft/operator_pw/op_exx_pw.cpp b/source/source_pw/module_pwdft/operator_pw/op_exx_pw.cpp
index 68bfca3158..83879d634f 100644
--- a/source/source_pw/module_pwdft/operator_pw/op_exx_pw.cpp
+++ b/source/source_pw/module_pwdft/operator_pw/op_exx_pw.cpp
@@ -170,7 +170,7 @@ void OperatorEXXPW<T, Device>::act_op(const int nbands,
 //              << std::endl;
     if (!potential_got)
     {
-        get_potential();
+        get_exx_potential<Real, Device>(kv, wfcpw, rhopw_dev, pot, tpiba, gamma_extrapolation, ucell->omega);
         potential_got = true;
     }
 
@@ -263,211 +263,6 @@ void OperatorEXXPW<T, Device>::act_op(const int nbands,
 
 }
 
-template <typename T, typename Device>
-void OperatorEXXPW<T, Device>::act_op_ace(const int nbands,
-                                          const int nbasis,
-                                          const int npol,
-                                          const T *tmpsi_in,
-                                          T *tmhpsi,
-                                          const int ngk_ik,
-                                          const bool is_first_node) const
-{
-    ModuleBase::timer::tick("OperatorEXXPW", "act_op_ace");
-//    std::cout << "act_op_ace" << std::endl;
-    // hpsi += -Xi^\dagger * Xi * psi
-    T* Xi_ace = Xi_ace_k[this->ik];
-    int nbands_tot = psi.get_nbands();
-    int nbasis_max = psi.get_nbasis();
-//    T* hpsi = nullptr;
-//    resmem_complex_op()(hpsi, nbands_tot * nbasis);
-//    setmem_complex_op()(hpsi, 0, nbands_tot * nbasis);
-    T* Xi_psi = nullptr;
-    resmem_complex_op()(Xi_psi, nbands_tot * nbands);
-    setmem_complex_op()(Xi_psi, 0, nbands_tot * nbands);
-
-    char trans_N = 'N', trans_T = 'T', trans_C = 'C';
-    T intermediate_one = 1.0, intermediate_zero = 0.0, intermediate_minus_one = -1.0;
-    // Xi * psi
-    gemm_complex_op()(trans_N,
-                      trans_N,
-                      nbands_tot,
-                      nbands,
-                      nbasis,
-                      &intermediate_one,
-                      Xi_ace,
-                      nbands_tot,
-                      tmpsi_in,
-                      nbasis,
-                      &intermediate_zero,
-                      Xi_psi,
-                      nbands_tot
-        );
-
-    Parallel_Reduce::reduce_pool(Xi_psi, nbands_tot * nbands);
-
-    // Xi^\dagger * (Xi * psi)
-    gemm_complex_op()(trans_C,
-                      trans_N,
-                      nbasis,
-                      nbands,
-                      nbands_tot,
-                      &intermediate_minus_one,
-                      Xi_ace,
-                      nbands_tot,
-                      Xi_psi,
-                      nbands_tot,
-                      &intermediate_one,
-                      tmhpsi,
-                      nbasis
-        );
-
-
-//    // negative sign, add to hpsi
-//    vec_add_vec_complex_op()(this->ctx, nbands * nbasis, tmhpsi, hpsi, -1, tmhpsi, 1);
-//    delmem_complex_op()(hpsi);
-    delmem_complex_op()(Xi_psi);
-    ModuleBase::timer::tick("OperatorEXXPW", "act_op_ace");
-
-}
-
-template <typename T, typename Device>
-void OperatorEXXPW<T, Device>::construct_ace() const
-{
-    ModuleBase::timer::tick("OperatorEXXPW", "construct_ace");
-//    int nkb = p_exx_helper->psi.get_nbands() * p_exx_helper->psi.get_nk();
-    int nbands = psi.get_nbands();
-    int nbasis = psi.get_nbasis();
-    int nk = psi.get_nk();
-
-    int ik_save = this->ik;
-    int * ik_ = const_cast<int*>(&this->ik);
-
-    T intermediate_one = 1.0, intermediate_zero = 0.0;
-
-    if (h_psi_ace == nullptr)
-    {
-        resmem_complex_op()(h_psi_ace, nbands * nbasis);
-        setmem_complex_op()(h_psi_ace, 0, nbands * nbasis);
-    }
-
-    if (Xi_ace_k.size() != nk)
-    {
-        Xi_ace_k.resize(nk);
-        for (int i = 0; i < nk; i++)
-        {
-            resmem_complex_op()(Xi_ace_k[i], nbands * nbasis);
-        }
-    }
-
-    for (int i = 0; i < nk; i++)
-    {
-        setmem_complex_op()(Xi_ace_k[i], 0, nbands * nbasis);
-    }
-
-    if (L_ace == nullptr)
-    {
-        resmem_complex_op()(L_ace, nbands * nbands);
-        setmem_complex_op()(L_ace, 0, nbands * nbands);
-    }
-
-    if (psi_h_psi_ace == nullptr)
-    {
-        resmem_complex_op()(psi_h_psi_ace, nbands * nbands);
-    }
-
-    if (first_iter) return;
-
-    for (int ik = 0; ik < nk; ik++)
-    {
-        int npwk = wfcpw->npwk[ik];
-
-        T* Xi_ace = Xi_ace_k[ik];
-        psi.fix_kb(ik, 0);
-        T* p_psi = psi.get_pointer();
-
-        setmem_complex_op()(h_psi_ace, 0, nbands * nbasis);
-
-        *ik_ = ik;
-
-        act_op(
-            nbands,
-            nbasis,
-            1,
-            p_psi,
-            h_psi_ace,
-            nbasis,
-            false
-            );
-
-        // psi_h_psi_ace = psi^\dagger * h_psi_ace
-        // p_exx_helper->psi.fix_kb(0, 0);
-        gemm_complex_op()('C',
-                          'N',
-                          nbands,
-                          nbands,
-                          npwk,
-                          &intermediate_one,
-                          p_psi,
-                          nbasis,
-                          h_psi_ace,
-                          nbasis,
-                          &intermediate_zero,
-                          psi_h_psi_ace,
-                          nbands);
-
-        // reduction of psi_h_psi_ace, due to distributed memory
-        Parallel_Reduce::reduce_pool(psi_h_psi_ace, nbands * nbands);
-
-        T intermediate_minus_one = -1.0;
-        axpy_complex_op()(nbands * nbands,
-                          &intermediate_minus_one,
-                          psi_h_psi_ace,
-                          1,
-                          L_ace,
-                          1);
-
-
-        int info = 0;
-        char up = 'U', lo = 'L';
-
-        lapack_potrf()(lo, nbands, L_ace, nbands);
-
-        // expand for-loop
-        for (int i = 0; i < nbands; ++i) {
-            setmem_complex_op()(L_ace + i * nbands, 0, i);
-        }
-
-        // L_ace inv in place
-        char non = 'N';
-        lapack_trtri()(lo, non, nbands, L_ace, nbands);
-
-        // Xi_ace = L_ace^-1 * h_psi_ace^dagger
-        gemm_complex_op()('N',
-                          'C',
-                          nbands,
-                          npwk,
-                          nbands,
-                          &intermediate_one,
-                          L_ace,
-                          nbands,
-                          h_psi_ace,
-                          nbasis,
-                          &intermediate_zero,
-                          Xi_ace,
-                          nbands);
-
-        // clear mem
-        setmem_complex_op()(h_psi_ace, 0, nbands * nbasis);
-        setmem_complex_op()(psi_h_psi_ace, 0, nbands * nbands);
-        setmem_complex_op()(L_ace, 0, nbands * nbands);
-
-    }
-
-    *ik_ = ik_save;
-    ModuleBase::timer::tick("OperatorEXXPW", "construct_ace");
-
-}
-
 template <typename T, typename Device>
 std::vector<int> OperatorEXXPW<T, Device>::get_q_points(const int ik) const
 {
@@ -560,291 +355,6 @@ OperatorEXXPW<T, Device>::OperatorEXXPW(const OperatorEXXPW<T_in, Device_in> *op
 
 }
 
-template <typename T, typename Device>
-void OperatorEXXPW<T, Device>::get_potential() const
-{
-    Real nqs_half1 = 0.5 * kv->nmp[0];
-    Real nqs_half2 = 0.5 * kv->nmp[1];
-    Real nqs_half3 = 0.5 * kv->nmp[2];
-
-    Real* pot_cpu = nullptr;
-    int nks = wfcpw->nks, npw = rhopw_dev->npw;
-    double tpiba2 = tpiba * tpiba;
-    pot_cpu = new Real[npw * wfcpw->nks * wfcpw->nks];
-    // fill zero
-    setmem_real_cpu_op()(pot_cpu, 0, npw * nks * nks);
-
-    // calculate Fock pot
-    auto param_fock = GlobalC::exx_info.info_global.coulomb_param[Conv_Coulomb_Pot_K::Coulomb_Type::Fock];
-    for (auto param : param_fock)
-    {
-        double exx_div = exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type::Fock);
-        double alpha   = std::stod(param["alpha"]);
-        for (int ik = 0; ik < nks; ik++)
-        {
-            for (int iq = 0; iq < nks; iq++)
-            {
-                const ModuleBase::Vector3<double> k_c = wfcpw->kvec_c[ik];
-                const ModuleBase::Vector3<double> k_d = wfcpw->kvec_d[ik];
-                const ModuleBase::Vector3<double> q_c = wfcpw->kvec_c[iq];
-                const ModuleBase::Vector3<double> q_d = wfcpw->kvec_d[iq];
-
-#ifdef _OPENMP
-#pragma omp parallel for schedule(static)
-#endif
-                for (int ig = 0; ig < rhopw_dev->npw; ig++)
-                {
-                    const ModuleBase::Vector3<double> g_d = rhopw_dev->gdirect[ig];
-                    const ModuleBase::Vector3<double> kqg_d = k_d - q_d + g_d;
-                    // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114)
-                    // 7/8 of the points in the grid are "activated" and 1/8 are disabled.
-                    // grid_factor is designed for the 7/8 of the grid to function like all of the points
-                    Real grid_factor = 1;
-                    double extrapolate_grid = 8.0 / 7.0;
-                    if (gamma_extrapolation)
-                    {
-                        // if isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3)
-                        auto isint = [](double x) {
-                            double epsilon = 1e-6; // this follows the isint judgement in q-e
-                            return std::abs(x - std::round(x)) < epsilon;
-                        };
-                        if (isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3))
-                        {
-                            grid_factor = 0;
-                        }
-                        else
-                        {
-                            grid_factor = extrapolate_grid;
-                        }
-                    }
-
-                    const int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1;
-                    const int nk = nks / nk_fac;
-                    const int ig_kq = ik * nks * npw + iq * npw + ig;
-
-                    Real gg = (k_c - q_c + rhopw_dev->gcar[ig]).norm2() * tpiba2;
-                    // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2
-                    if (gg >= 1e-8)
-                    {
-                        Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg;
-                        pot_cpu[ig_kq] += fac * grid_factor * alpha;
-                    }
-                    // }
-                    else
-                    {
-                        pot_cpu[ig_kq] += exx_div * alpha;
-                    }
-                    // assert(is_finite(density_recip[ig]));
-                }
-            }
-        }
-    }
-
-    // calculate erfc pot
-    auto param_erfc = GlobalC::exx_info.info_global.coulomb_param[Conv_Coulomb_Pot_K::Coulomb_Type::Erfc];
-    for (auto param : param_erfc)
-    {
-        double erfc_omega = std::stod(param["omega"]);
-        double erfc_omega2 = erfc_omega * erfc_omega;
-        double alpha = std::stod(param["alpha"]);
-        double exx_div = exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type::Erfc, erfc_omega);
-        for (int ik = 0; ik < nks; ik++)
-        {
-            for (int iq = 0; iq < nks; iq++)
-            {
-                const ModuleBase::Vector3<double> k_c = wfcpw->kvec_c[ik];
-                const ModuleBase::Vector3<double> k_d = wfcpw->kvec_d[ik];
-                const ModuleBase::Vector3<double> q_c = wfcpw->kvec_c[iq];
-                const ModuleBase::Vector3<double> q_d = wfcpw->kvec_d[iq];
-
-#ifdef _OPENMP
-#pragma omp parallel for schedule(static)
-#endif
-                for (int ig = 0; ig < rhopw_dev->npw; ig++)
-                {
-                    const ModuleBase::Vector3<double> g_d = rhopw_dev->gdirect[ig];
-                    const ModuleBase::Vector3<double> kqg_d = k_d - q_d + g_d;
-                    // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114)
-                    // 7/8 of the points in the grid are "activated" and 1/8 are disabled.
-                    // grid_factor is designed for the 7/8 of the grid to function like all of the points
-                    Real grid_factor = 1;
-                    double extrapolate_grid = 8.0 / 7.0;
-                    if (gamma_extrapolation)
-                    {
-                        // if isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3)
-                        auto isint = [](double x) {
-                            double epsilon = 1e-6; // this follows the isint judgement in q-e
-                            return std::abs(x - std::round(x)) < epsilon;
-                        };
-                        if (isint(kqg_d[0] * nqs_half1) && isint(kqg_d[1] * nqs_half2) && isint(kqg_d[2] * nqs_half3))
-                        {
-                            grid_factor = 0;
-                        }
-                        else
-                        {
-                            grid_factor = extrapolate_grid;
-                        }
-                    }
-
-                    const int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1;
-                    const int nk = nks / nk_fac;
-                    const int ig_kq = ik * nks * npw + iq * npw + ig;
-
-                    Real gg = (k_c - q_c + rhopw_dev->gcar[ig]).norm2() * tpiba2;
-                    // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2
-                    if (gg >= 1e-8)
-                    {
-                        Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg;
-                        pot_cpu[ig_kq] += fac * (1.0 - std::exp(-gg / 4.0 / erfc_omega2)) * grid_factor * alpha;
-                    }
-                    // }
-                    else
-                    {
-                        // if (PARAM.inp.dft_functional == "hse")
-                        if (!gamma_extrapolation)
-                        {
-                            pot_cpu[ig_kq] += (exx_div - ModuleBase::PI * ModuleBase::e2 / erfc_omega2) * alpha;
-                        }
-                        else
-                        {
-                            pot_cpu[ig_kq] += exx_div * alpha;
-                        }
-                    }
-                    // assert(is_finite(density_recip[ig]));
-                }
-            }
-        }
-    }
-
-    // copy the potential to the device memory
-    syncmem_real_c2d_op()(pot, pot_cpu, rhopw_dev->npw * wfcpw->nks * wfcpw->nks);
-
-    delete pot_cpu;
-}
-
-template <typename T, typename Device>
-double OperatorEXXPW<T, Device>::exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type coulomb_type, double erfc_omega) const
-{
-    double exx_div = 0;
-
-    Real nqs_half1 = 0.5 * kv->nmp[0];
-    Real nqs_half2 = 0.5 * kv->nmp[1];
-    Real nqs_half3 = 0.5 * kv->nmp[2];
-
-    int nk_fac = PARAM.inp.nspin == 2 ? 2 : 1;
-
-    // here we follow the exx_divergence subroutine in q-e (PW/src/exx_base.f90)
-    double alpha = 10.0 / wfcpw->gk_ecut;
-    double tpiba2 = tpiba * tpiba;
-    double div = 0;
-
-    // this is the \sum_q F(q) part
-    // temporarily for all k points, should be replaced to q points later
-    for (int ik = 0; ik < wfcpw->nks; ik++)
-    {
-        const ModuleBase::Vector3<double> k_c = wfcpw->kvec_c[ik];
-        const ModuleBase::Vector3<double> k_d = wfcpw->kvec_d[ik];
-#ifdef _OPENMP
-#pragma omp parallel for reduction(+:div)
-#endif
-        for (int ig = 0; ig < rhopw_dev->npw; ig++)
-        {
-            const ModuleBase::Vector3<double> q_c = k_c + rhopw_dev->gcar[ig];
-            const ModuleBase::Vector3<double> q_d = k_d + rhopw_dev->gdirect[ig];
-            double qq = q_c.norm2();
-            // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114)
-            // 7/8 of the points in the grid are "activated" and 1/8 are disabled.
-            // grid_factor is designed for the 7/8 of the grid to function like all of the points
-            Real grid_factor = 1;
-            double extrapolate_grid = 8.0/7.0;
-            if (gamma_extrapolation)
-            {
-                auto isint = [](double x)
-                {
-                    double epsilon = 1e-6; // this follows the isint judgement in q-e
-                    return std::abs(x - std::round(x)) < epsilon;
-                };
-                if (isint(q_d[0] * nqs_half1) &&
-                    isint(q_d[1] * nqs_half2) &&
-                    isint(q_d[2] * nqs_half3))
-                {
-                    grid_factor = 0;
-                }
-                else
-                {
-                    grid_factor = extrapolate_grid;
-                }
-            }
-
-            if (qq <= 1e-8) continue;
-            // else if (PARAM.inp.dft_functional == "hse")
-            else if (coulomb_type == Conv_Coulomb_Pot_K::Coulomb_Type::Erfc)
-            {
-                double omega = erfc_omega;
-                double omega2 = omega * omega;
-                div += std::exp(-alpha * qq) / qq * (1.0 - std::exp(-qq*tpiba2 / 4.0 / omega2)) * grid_factor;
-            }
-            else
-            {
-                div += std::exp(-alpha * qq) / qq * grid_factor;
-            }
-        }
-    }
-
-    Parallel_Reduce::reduce_pool(div);
-    // std::cout << "EXX div: " << div << std::endl;
-
-    // if (PARAM.inp.dft_functional == "hse")
-    if (!gamma_extrapolation)
-    {
-        if (coulomb_type == Conv_Coulomb_Pot_K::Coulomb_Type::Erfc)
-        {
-            double omega = erfc_omega;
-            div += tpiba2 / 4.0 / omega / omega; // compensate for the finite value when qq = 0
-        }
-        else
-        {
-            div -= alpha;
-        }
-
-    }
-
-    div *= ModuleBase::e2 * ModuleBase::FOUR_PI / tpiba2 / wfcpw->nks;
-//    std::cout << "div: " << div << std::endl;
-
-    // numerically value the mean value of F(q) in the reciprocal space
-    // This means we need to calculate the average of F(q) in the first brillouin zone
-    alpha /= tpiba2;
-    int nqq = 100000;
-    double dq = 5.0 / std::sqrt(alpha) / nqq;
-    double aa = 0.0;
-    // if (PARAM.inp.dft_functional == "hse")
-    if (coulomb_type == Conv_Coulomb_Pot_K::Coulomb_Type::Erfc)
-    {
-        double omega = erfc_omega;
-        double omega2 = omega * omega;
-#ifdef _OPENMP
-#pragma omp parallel for reduction(+:aa)
-#endif
-        for (int i = 0; i < nqq; i++)
-        {
-            double q = dq * (i+0.5);
-            aa -= exp(-alpha * q * q) * exp(-q*q / 4.0 / omega2) * dq;
-        }
-    }
-    aa *= 8 / ModuleBase::FOUR_PI;
-    aa += 1.0 / std::sqrt(alpha * ModuleBase::PI);
-
-    //    printf("ucell: %p\n", ucell);
-    double omega = ucell->omega;
-    div -= ModuleBase::e2 * omega * aa;
-    exx_div = div * wfcpw->nks / nk_fac;
-//    exx_div = 0;
-//    std::cout << "EXX divergence: " << exx_div << std::endl;
-
-    return exx_div;
-}
-
 template <typename T, typename Device>
 double OperatorEXXPW<T, Device>::cal_exx_energy(psi::Psi<T, Device> *psi_) const
 {
@@ -858,41 +368,6 @@ double OperatorEXXPW<T, Device>::cal_exx_energy(psi::Psi<T, Device> *psi_) const
     }
 }
 
-template <typename T, typename Device>
-double OperatorEXXPW<T, Device>::cal_exx_energy_ace(psi::Psi<T, Device> *ppsi_) const
-{
-    double Eexx = 0;
-
-    psi::Psi<T, Device> psi_ = *ppsi_;
-    int *ik_ = const_cast<int*>(&this->ik);
-    int ik_save = this->ik;
-    for (int i = 0; i < wfcpw->nks; i++)
-    {
-        setmem_complex_op()(h_psi_ace, 0, psi_.get_nbands() * psi_.get_nbasis());
-        *ik_ = i;
-        psi_.fix_kb(i, 0);
-        T* psi_i = psi_.get_pointer();
-        act_op_ace(psi_.get_nbands(), psi_.get_nbasis(), 1, psi_i, h_psi_ace, 0, true);
-
-        for (int nband = 0; nband < psi_.get_nbands(); nband++)
-        {
-            psi_.fix_kb(i, nband);
-            T* psi_i_n = psi_.get_pointer();
-            T* hpsi_i_n = h_psi_ace + nband * psi_.get_nbasis();
-            double wg_i_n = (*wg)(i, nband);
-            // Eexx += dot(psi_i_n, h_psi_i_n)
-            Eexx += dot_op()(psi_.get_nbasis(), psi_i_n, hpsi_i_n, false) * wg_i_n * 2;
-
-        }
-
-
-    }
-
-    Parallel_Reduce::reduce_pool(Eexx);
-    *ik_ = ik_save;
-    return Eexx;
-}
-
 template <typename T, typename Device>
 double OperatorEXXPW<T, Device>::cal_exx_energy_op(psi::Psi<T, Device> *ppsi_) const
 {
diff --git a/source/source_pw/module_pwdft/operator_pw/op_exx_pw.h b/source/source_pw/module_pwdft/operator_pw/op_exx_pw.h
index 51b69b13a9..eebd9ff607 100644
--- a/source/source_pw/module_pwdft/operator_pw/op_exx_pw.h
+++ b/source/source_pw/module_pwdft/operator_pw/op_exx_pw.h
@@ -69,10 +69,6 @@ class OperatorEXXPW : public OperatorPW<T, Device>
 
     void multiply_potential(T *density_recip, int ik, int iq) const;
 
-    double exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type coulomb_type, double erfc_omega = 0) const;
-
-    void get_potential() const;
-
     void act_op(const int nbands,
                 const int nbasis,
                 const int npol,
@@ -164,6 +160,33 @@ class OperatorEXXPW : public OperatorPW<T, Device>
 
 };
 
+template <typename Real, typename Device>
+void get_exx_potential(const K_Vectors* kv,
+                       const ModulePW::PW_Basis_K* wfcpw,
+                       ModulePW::PW_Basis* rhopw_dev,
+                       Real* pot,
+                       double tpiba,
+                       bool gamma_extrapolation,
+                       double ucell_omega);
+
+template <typename Real, typename Device>
+void get_exx_stress_potential(const K_Vectors* kv,
+                              const ModulePW::PW_Basis_K* wfcpw,
+                              ModulePW::PW_Basis* rhopw_dev,
+                              Real* pot,
+                              double tpiba,
+                              bool gamma_extrapolation,
+                              double ucell_omega);
+
+double exx_divergence(Conv_Coulomb_Pot_K::Coulomb_Type coulomb_type,
+                      double erfc_omega,
+                      const K_Vectors* kv,
+                      const ModulePW::PW_Basis_K* wfcpw,
+                      ModulePW::PW_Basis* rhopw_dev,
+                      double tpiba,
+                      bool gamma_extrapolation,
+                      double ucell_omega);
+
 } // namespace hamilt
 
 #endif // OPEXXPW_H
\ No newline at end of file
diff --git a/source/source_pw/module_pwdft/stress_func_exx.cpp b/source/source_pw/module_pwdft/stress_func_exx.cpp
index 1885bb16ee..4aef3c872f 100644
--- a/source/source_pw/module_pwdft/stress_func_exx.cpp
+++ b/source/source_pw/module_pwdft/stress_func_exx.cpp
@@ -1,5 +1,6 @@
-#include "stress_pw.h"
 #include "global.h"
+#include "operator_pw/op_exx_pw.h"
+#include "stress_pw.h"
 
 template <typename FPTYPE, typename Device>
 void Stress_PW<FPTYPE, Device>::stress_exx(ModuleBase::matrix& sigma,
@@ -56,198 +57,8 @@ void Stress_PW<FPTYPE, Device>::stress_exx(ModuleBase::matrix& sigma,
     resmem_real_op()(pot, rhopw->npw * nks * nks);
     resmem_real_op()(pot_stress, rhopw->npw * nks * nks);
 
-    // prepare the coefficients
-    double exx_div = 0;
-
-    // pasted from op_exx_pw.cpp
-    {
-        if (GlobalC::exx_info.info_lip.lambda == 0.0)
-        {
-            return;
-        }
-
-        // here we follow the exx_divergence subroutine in q-e (PW/src/exx_base.f90)
-        double alpha = 10.0 / wfcpw->gk_ecut;
-        double div = 0;
-
-        // this is the \sum_q F(q) part
-        // temporarily for all k points, should be replaced to q points later
-        for (int ik = 0; ik < wfcpw->nks; ik++)
-        {
-            const ModuleBase::Vector3<double> k_c = wfcpw->kvec_c[ik];
-            const ModuleBase::Vector3<double> k_d = wfcpw->kvec_d[ik];
-#ifdef _OPENMP
-#pragma omp parallel for reduction(+:div)
-#endif
-            for (int ig = 0; ig < rhopw->npw; ig++)
-            {
-                const ModuleBase::Vector3<double> q_c = k_c + rhopw->gcar[ig];
-                const ModuleBase::Vector3<double> q_d = k_d + rhopw->gdirect[ig];
-                double qq = q_c.norm2();
-                // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114)
-                // 7/8 of the points in the grid are "activated" and 1/8 are disabled.
-                // grid_factor is designed for the 7/8 of the grid to function like all of the points
-                double grid_factor = 1;
-                double extrapolate_grid = 8.0/7.0;
-                if (gamma_extrapolation)
-                {
-                    if (isint(q_d[0] * nqs_half1) &&
-                        isint(q_d[1] * nqs_half2) &&
-                        isint(q_d[2] * nqs_half3))
-                    {
-                        grid_factor = 0;
-                    }
-                    else
-                    {
-                        grid_factor = extrapolate_grid;
-                    }
-                }
-
-
-                if (qq <= 1e-8) continue;
-                else if (GlobalC::exx_info.info_global.ccp_type == Conv_Coulomb_Pot_K::Ccp_Type::Erfc)
-                {
-                    double hse_omega = GlobalC::exx_info.info_global.hse_omega;
-                    double omega2 = hse_omega * hse_omega;
-                    div += std::exp(-alpha * qq) / qq * (1.0 - std::exp(-qq*tpiba2 / 4.0 / omega2)) * grid_factor;
-                }
-                else
-                {
-                    div += std::exp(-alpha * qq) / qq * grid_factor;
-                }
-            }
-        }
-
-        Parallel_Reduce::reduce_pool(div);
-        // std::cout << "EXX div: " << div << std::endl;
-
-        // if (PARAM.inp.dft_functional == "hse")
-        if (!gamma_extrapolation)
-        {
-            if (GlobalC::exx_info.info_global.ccp_type == Conv_Coulomb_Pot_K::Ccp_Type::Erfc)
-            {
-                double omega = GlobalC::exx_info.info_global.hse_omega;
-                div += tpiba2 / 4.0 / omega / omega; // compensate for the finite value when qq = 0
-            }
-            else
-            {
-                div -= alpha;
-            }
-
-        }
-
-        div *= ModuleBase::e2 * ModuleBase::FOUR_PI / tpiba2 / wfcpw->nks;
-
-        // numerically value the mean value of F(q) in the reciprocal space
-        // This means we need to calculate the average of F(q) in the first brillouin zone
-        alpha /= tpiba2;
-        int nqq = 100000;
-        double dq = 5.0 / std::sqrt(alpha) / nqq;
-        double aa = 0.0;
-        // if (PARAM.inp.dft_functional == "hse")
-        if (GlobalC::exx_info.info_global.ccp_type == Conv_Coulomb_Pot_K::Ccp_Type::Erfc)
-        {
-            double hse_omega = GlobalC::exx_info.info_global.hse_omega;
-            double omega2 = hse_omega * hse_omega;
-            #ifdef _OPENMP
-            #pragma omp parallel for reduction(+:aa)
-            #endif
-            for (int i = 0; i < nqq; i++)
-            {
-                double q = dq * (i+0.5);
-                aa -= exp(-alpha * q * q) * exp(-q*q / 4.0 / omega2) * dq;
-            }
-        }
-        aa *= 8 / ModuleBase::FOUR_PI;
-        aa += 1.0 / std::sqrt(alpha * ModuleBase::PI);
-        div -= ModuleBase::e2 * omega * aa;
-        exx_div = div * wfcpw->nks;
-        // std::cout << "EXX divergence: " << exx_div << std::endl;
-
-    }
-
-    // prepare for the potential
-    for (int ik = 0; ik < nks; ik++)
-    {
-        for (int iq = 0; iq < nks; iq++)
-        {
-            const ModuleBase::Vector3<double> k_c = wfcpw->kvec_c[ik];
-            const ModuleBase::Vector3<double> k_d = wfcpw->kvec_d[ik];
-            const ModuleBase::Vector3<double> q_c = wfcpw->kvec_c[iq];
-            const ModuleBase::Vector3<double> q_d = wfcpw->kvec_d[iq];
-
-            #ifdef _OPENMP
-            #pragma omp parallel for schedule(static)
-            #endif
-            for (int ig = 0; ig < rhopw->npw; ig++)
-            {
-                const ModuleBase::Vector3<double> g_d = rhopw->gdirect[ig];
-                const ModuleBase::Vector3<double> kqg_d = k_d - q_d + g_d;
-
-                // For gamma_extrapolation (https://doi.org/10.1103/PhysRevB.79.205114)
-                // 7/8 of the points in the grid are "activated" and 1/8 are disabled.
-                // grid_factor is designed for the 7/8 of the grid to function like all of the points
-                Real grid_factor = 1;
-                if (gamma_extrapolation)
-                {
-                    double extrapolate_grid = 8.0/7.0;
-                    if (isint(kqg_d[0] * nqs_half1) &&
-                        isint(kqg_d[1] * nqs_half2) &&
-                        isint(kqg_d[2] * nqs_half3))
-                    {
-                        grid_factor = 0;
-                    }
-                    else
-                    {
-                        grid_factor = extrapolate_grid;
-                    }
-                }
-
-                const int ig_kq = ik * nks * rhopw->npw + iq * rhopw->npw + ig;
-
-                Real gg = (k_c - q_c + rhopw->gcar[ig]).norm2() * tpiba2;
-                Real hse_omega2 = GlobalC::exx_info.info_global.hse_omega * GlobalC::exx_info.info_global.hse_omega;
-                // if (kqgcar2 > 1e-12) // vasp uses 1/40 of the smallest (k spacing)**2
-                if (gg >= 1e-8)
-                {
-                    Real fac = -ModuleBase::FOUR_PI * ModuleBase::e2 / gg;
-                    // if (PARAM.inp.dft_functional == "hse")
-                    if (GlobalC::exx_info.info_global.ccp_type == Conv_Coulomb_Pot_K::Ccp_Type::Erfc)
-                    {
-                        pot[ig_kq] = fac * (1.0 - std::exp(-gg / 4.0 / hse_omega2)) * grid_factor;
-                        pot_stress[ig_kq] = (1.0 - (1.0 + gg / 4.0 / hse_omega2) * std::exp(-gg / 4.0 / hse_omega2)) / (1.0 - std::exp(-gg / 4.0 / hse_omega2)) / gg;
-                    }
-                    else if (GlobalC::exx_info.info_global.ccp_type == Conv_Coulomb_Pot_K::Ccp_Type::Erf)
-                    {
-                        ModuleBase::WARNING("Stress_PW", "Stress for Erf is not implemented yet");
-                        pot[ig_kq] = fac * grid_factor;
-                        pot_stress[ig_kq] = 1.0 / gg;
-                    }
-                    else if (GlobalC::exx_info.info_global.ccp_type == Conv_Coulomb_Pot_K::Ccp_Type::Hf)
-                    {
-                        pot[ig_kq] = fac * grid_factor;
-                        pot_stress[ig_kq] = 1.0 / gg;
-                    }
-                }
-                // }
-                else
-                {
-                    // if (PARAM.inp.dft_functional == "hse")
-                    if (GlobalC::exx_info.info_global.ccp_type == Conv_Coulomb_Pot_K::Ccp_Type::Erfc && !gamma_extrapolation)
-                    {
-                        pot[ig_kq] = - ModuleBase::PI * ModuleBase::e2 / hse_omega2; // maybe we should add a exx_div here, but q-e does not do that
-                        pot_stress[ig_kq] = 1 / 4.0 / hse_omega2;
-                    }
-                    else
-                    {
-                        pot[ig_kq] = exx_div;
-                        pot_stress[ig_kq] = 0;
-                    }
-                }
-                // assert(is_finite(density_recip[ig]));
-            }
-        }
-    }
+    hamilt::get_exx_potential<Real, Device>(p_kv, wfcpw, rhopw, pot, tpiba, gamma_extrapolation, omega);
+    hamilt::get_exx_stress_potential<Real, Device>(p_kv, wfcpw, rhopw, pot_stress, tpiba, gamma_extrapolation, omega);
 
     // calculate the stress