From c5322c88f89b4a9e75855582752029b82d8e5305 Mon Sep 17 00:00:00 2001
From: Jie Li <76780849+jieli-matrix@users.noreply.github.com>
Date: Tue, 3 Jun 2025 17:27:20 +0800
Subject: [PATCH 1/4] Feature: add use_k_continuity method for initialize psi

---
 docs/advanced/input_files/input-main.md       |  13 +
 source/module_esolver/esolver_ks_pw.cpp       |   3 +-
 source/module_hsolver/diago_dav_subspace.cpp  | 119 ++++----
 source/module_hsolver/hsolver_pw.cpp          | 209 ++++++++++++--
 source/module_hsolver/hsolver_pw.h            |  24 +-
 .../kernels/cuda/math_kernel_op.cu            | 207 ++++++++++++++
 .../module_hsolver/kernels/math_kernel_op.cpp |  93 +++++++
 .../module_hsolver/kernels/math_kernel_op.h   |  68 +++++
 .../kernels/rocm/dngvd_op.hip.cu              | 262 ++++++++++++++----
 .../kernels/rocm/math_kernel_op.hip.cu        | 224 +++++++++++++++
 .../module_hsolver/test/test_hsolver_pw.cpp   |  62 +++++
 .../module_hsolver/test/test_hsolver_sdft.cpp |  60 ++++
 .../module_io/read_input_item_elec_stru.cpp   |  20 ++
 source/module_parameter/input_parameter.h     |   1 +
 source/module_psi/psi_init.cpp                |   4 +-
 .../101_PW_upf100_Al_pseudopots/INPUT         |   2 +
 tests/integrate/102_PW_BPCG/INPUT             |   3 +
 17 files changed, 1234 insertions(+), 140 deletions(-)

diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md
index 7f5065d3aa..82ba7a5a1c 100644
--- a/docs/advanced/input_files/input-main.md
+++ b/docs/advanced/input_files/input-main.md
@@ -37,6 +37,7 @@
     - [pw\_seed](#pw_seed)
     - [pw\_diag\_thr](#pw_diag_thr)
     - [diago\_smooth\_ethr](#diago_smooth_ethr)
+    - [use\_k\_continuity](#use_k_continuity)
     - [pw\_diag\_nmax](#pw_diag_nmax)
     - [pw\_diag\_ndim](#pw_diag_ndim)
     - [erf\_ecut](#erf_ecut)
@@ -774,6 +775,18 @@ These variables are used to control the plane wave related parameters.
 - **Description**: If `TRUE`, the smooth threshold strategy, which applies a larger threshold (10e-5) for the empty states, will be implemented in the diagonalization methods. (This strategy should not affect total energy, forces, and other ground-state properties, but computational efficiency will be improved.) If `FALSE`, the smooth threshold strategy will not be applied.
 - **Default**: false
 
+### use_k_continuity
+
+- **Type**: Boolean
+- **Availability**: Used only for plane wave basis set.
+- **Description**: Whether to use k-point continuity for initializing wave functions. When enabled, this strategy exploits the similarity between wavefunctions at neighboring k-points by propagating the wavefunction from a previously initialized k-point to a new k-point, significantly reducing the computational cost of the initial guess.
+
+  **Important constraints:**
+  - Must be used together with `diago_smooth_ethr = 1` for optimal performance
+
+  This feature is particularly useful for calculations with dense k-point sampling where the computational cost of wavefunction initialization becomes significant.
+- **Default**: false
+
 ### pw_diag_nmax
 
 - **Type**: Integer
diff --git a/source/module_esolver/esolver_ks_pw.cpp b/source/module_esolver/esolver_ks_pw.cpp
index 8f41d10312..03d1bd468a 100644
--- a/source/module_esolver/esolver_ks_pw.cpp
+++ b/source/module_esolver/esolver_ks_pw.cpp
@@ -520,7 +520,8 @@ void ESolver_KS_PW<T, Device>::hamilt2density_single(UnitCell& ucell,
                                                      hsolver::DiagoIterAssist<T, Device>::SCF_ITER,
                                                      hsolver::DiagoIterAssist<T, Device>::PW_DIAG_NMAX,
                                                      hsolver::DiagoIterAssist<T, Device>::PW_DIAG_THR,
-                                                     hsolver::DiagoIterAssist<T, Device>::need_subspace);
+                                                     hsolver::DiagoIterAssist<T, Device>::need_subspace,
+                                                     PARAM.inp.use_k_continuity);
 
         hsolver_pw_obj.solve(this->p_hamilt,
                              this->kspw_psi[0],
diff --git a/source/module_hsolver/diago_dav_subspace.cpp b/source/module_hsolver/diago_dav_subspace.cpp
index b180d72c13..5808ebe7ca 100644
--- a/source/module_hsolver/diago_dav_subspace.cpp
+++ b/source/module_hsolver/diago_dav_subspace.cpp
@@ -288,27 +288,28 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
                          psi_iter + (nbase) * this->dim,
                          this->dim);
 
-    std::vector<Real> e_temp_cpu(nbase, 0);
+    // Eigenvalues operation section
+    std::vector<Real> e_temp_cpu(this->notconv, 0);
     Real* e_temp_hd = e_temp_cpu.data();
-    if(this->device == base_device::GpuDevice)
+    if (this->device == base_device::GpuDevice)
     {
         e_temp_hd = nullptr;
         resmem_real_op()(this->ctx, e_temp_hd, nbase);
     }
-    for (int m = 0; m < notconv; m++)
+
+    for (int m = 0; m < this->notconv; m++)
     {
-        e_temp_cpu.assign(nbase, (-1.0 * (*eigenvalue_iter)[m]));
-        if (this->device == base_device::GpuDevice)
-        {
-            syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, e_temp_hd, e_temp_cpu.data(), nbase);
-        }
-        vector_mul_vector_op<T, Device>()(this->ctx,
-                                                nbase,
-                                                vcc + m * this->nbase_x,
-                                                vcc + m * this->nbase_x,
-                                                e_temp_hd);
+        e_temp_cpu[m] = -(*eigenvalue_iter)[m];
+    }
+
+    if (this->device == base_device::GpuDevice)
+    {
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, e_temp_hd, e_temp_cpu.data(), this->notconv);
     }
-    if(this->device == base_device::GpuDevice)
+    
+    apply_eigenvalues_op<T, Device>()(nbase, this->nbase_x, this->notconv, this->vcc, this->vcc, e_temp_hd);
+
+    if (this->device == base_device::GpuDevice)
     {
         delmem_real_op()(this->ctx, e_temp_hd);
     }
@@ -333,54 +334,58 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
                          psi_iter + nbase * this->dim,
                          this->dim);
 
-    // "precondition!!!"
-    std::vector<Real> pre(this->dim, 0.0);
-    for (int m = 0; m < notconv; m++)
-    {
-        for (size_t i = 0; i < this->dim; i++)
-        {
-            // pre[i] = std::abs(this->precondition[i] - (*eigenvalue_iter)[m]);
-            double x = std::abs(this->precondition[i] - (*eigenvalue_iter)[m]);
-            pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
-        }
+    // Precondition section
 #if defined(__CUDA) || defined(__ROCM)
-        if (this->device == base_device::GpuDevice)
-        {
-            syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, pre.data(), this->dim);
-            vector_div_vector_op<T, Device>()(this->ctx,
-                                              this->dim,
-                                              psi_iter + (nbase + m) * this->dim,
-                                              psi_iter + (nbase + m) * this->dim,
-                                              this->d_precondition);
-        }
-        else
+    if (this->device == base_device::GpuDevice)
+    {
+        Real* eigenvalues_gpu = nullptr;
+        resmem_real_op()(this->ctx, eigenvalues_gpu, notconv);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, eigenvalues_gpu, (*eigenvalue_iter).data(), notconv);
+        
+        precondition_op<T, Device>()(this->dim,
+                                    psi_iter,
+                                    nbase,
+                                    notconv,
+                                    d_precondition,
+                                    eigenvalues_gpu);
+        delmem_real_op()(this->ctx, eigenvalues_gpu);
+    }
+    else
 #endif
-        {
-            vector_div_vector_op<T, Device>()(this->ctx,
-                                              this->dim,
-                                              psi_iter + (nbase + m) * this->dim,
-                                              psi_iter + (nbase + m) * this->dim,
-                                              pre.data());
-        }
+    {
+        precondition_op<T, Device>()(this->dim,
+                                    psi_iter,
+                                    nbase,
+                                    notconv,
+                                    this->precondition.data(),
+                                    (*eigenvalue_iter).data());
     }
 
-    // "normalize!!!" in order to improve numerical stability of subspace diagonalization
-    std::vector<Real> psi_norm(notconv, 0.0);
-    for (size_t i = 0; i < notconv; i++)
+    // Normalize section
+#if defined(__CUDA) || defined(__ROCM)
+    if (this->device == base_device::GpuDevice)
+    {
+        Real* psi_norm = nullptr;
+        resmem_real_op()(this->ctx, psi_norm, notconv);
+        using setmem_real_op = base_device::memory::set_memory_op<Real, Device>;
+        setmem_real_op()(this->ctx, psi_norm, 0.0, notconv);
+        
+        normalize_op<T, Device>()(this->dim,
+                                psi_iter,
+                                nbase,
+                                notconv,
+                                psi_norm);
+        delmem_real_op()(this->ctx, psi_norm);
+    }
+    else
+#endif
     {
-        psi_norm[i] = dot_real_op<T, Device>()(this->ctx,
-                                               this->dim,
-                                               psi_iter + (nbase + i) * this->dim,
-                                               psi_iter + (nbase + i) * this->dim,
-                                               true);
-        assert(psi_norm[i] > 0.0);
-        psi_norm[i] = sqrt(psi_norm[i]);
-
-        vector_div_constant_op<T, Device>()(this->ctx,
-                                            this->dim,
-                                            psi_iter + (nbase + i) * this->dim,
-                                            psi_iter + (nbase + i) * this->dim,
-                                            psi_norm[i]);
+        Real* psi_norm = nullptr;
+        normalize_op<T, Device>()(this->dim,
+                                psi_iter,
+                                nbase,
+                                notconv,
+                                psi_norm);
     }
 
     // update hpsi[:, nbase:nbase+notconv]
diff --git a/source/module_hsolver/hsolver_pw.cpp b/source/module_hsolver/hsolver_pw.cpp
index 0c1ad2e8b8..26161e19df 100644
--- a/source/module_hsolver/hsolver_pw.cpp
+++ b/source/module_hsolver/hsolver_pw.cpp
@@ -280,47 +280,107 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
     std::vector<Real> eigenvalues(this->wfc_basis->nks * psi.get_nbands(), 0.0);
     ethr_band.resize(psi.get_nbands(), this->diag_thr);
 
-    /// Loop over k points for solve Hamiltonian to charge density
-    for (int ik = 0; ik < this->wfc_basis->nks; ++ik)
-    {
-        /// update H(k) for each k point
-        pHamilt->updateHk(ik);
+    // Initialize k-point continuity if enabled
+    static int count = 0;
+    if (use_k_continuity) {
+        build_k_neighbors();
+    }
+
+    // Loop over k points for solve Hamiltonian to charge density
+    if (use_k_continuity) {
+        // K-point continuity case
+        for (int i = 0; i < this->wfc_basis->nks; ++i)
+        {
+            const int ik = k_order[i];
+            
+            // update H(k) for each k point
+            pHamilt->updateHk(ik);
 
 #ifdef USE_PAW
-        this->paw_func_in_kloop(ik, tpiba);
+            this->paw_func_in_kloop(ik, tpiba);
 #endif
 
-        /// update psi pointer for each k point
-        psi.fix_k(ik);
+            // update psi pointer for each k point
+            psi.fix_k(ik);
+
+            // If using k-point continuity and not first k-point, propagate from parent
+            if (ik > 0 && count == 0 && k_parent.find(ik) != k_parent.end()) {
+                propagate_psi(psi, k_parent[ik], ik);
+            }
 
-        // template add precondition calculating here
-        update_precondition(precondition, ik, this->wfc_basis->npwk[ik], Real(pes->pot->get_vl_of_0()));
+            // template add precondition calculating here
+            update_precondition(precondition, ik, this->wfc_basis->npwk[ik], Real(pes->pot->get_vl_of_0()));
 
-        // use smooth threshold for all iter methods
-        if (PARAM.inp.diago_smooth_ethr == true)
-        {
-            this->cal_smooth_ethr(pes->klist->wk[ik],
-                                  &pes->wg(ik, 0),
-                                  DiagoIterAssist<T, Device>::PW_DIAG_THR,
-                                  ethr_band);
-        }
+            // use smooth threshold for all iter methods
+            if (PARAM.inp.diago_smooth_ethr == true)
+            {
+                this->cal_smooth_ethr(pes->klist->wk[ik],
+                                    &pes->wg(ik, 0),
+                                    DiagoIterAssist<T, Device>::PW_DIAG_THR,
+                                    ethr_band);
+            }
 
 #ifdef USE_PAW
-        this->call_paw_cell_set_currentk(ik);
+            this->call_paw_cell_set_currentk(ik);
 #endif
 
-        /// solve eigenvector and eigenvalue for H(k)
-        this->hamiltSolvePsiK(pHamilt, psi, precondition, eigenvalues.data() + ik * psi.get_nbands(), this->wfc_basis->nks);
+            // solve eigenvector and eigenvalue for H(k)
+            this->hamiltSolvePsiK(pHamilt, psi, precondition, eigenvalues.data() + ik * psi.get_nbands(), this->wfc_basis->nks);
 
-        if (skip_charge)
+            if (skip_charge)
+            {
+                GlobalV::ofs_running << "Average iterative diagonalization steps for k-points " << ik
+                                    << " is: " << DiagoIterAssist<T, Device>::avg_iter
+                                    << " ; where current threshold is: " << this->diag_thr << " . " << std::endl;
+                DiagoIterAssist<T, Device>::avg_iter = 0.0;
+            }
+        }
+    }
+    else {
+        // Original code without k-point continuity
+        for (int ik = 0; ik < this->wfc_basis->nks; ++ik)
         {
-            GlobalV::ofs_running << "Average iterative diagonalization steps for k-points " << ik
-                                 << " is: " << DiagoIterAssist<T, Device>::avg_iter
-                                 << " ; where current threshold is: " << this->diag_thr << " . " << std::endl;
-            DiagoIterAssist<T, Device>::avg_iter = 0.0;
+            // update H(k) for each k point
+            pHamilt->updateHk(ik);
+
+#ifdef USE_PAW
+            this->paw_func_in_kloop(ik, tpiba);
+#endif
+
+            // update psi pointer for each k point
+            psi.fix_k(ik);
+
+            // template add precondition calculating here
+            update_precondition(precondition, ik, this->wfc_basis->npwk[ik], Real(pes->pot->get_vl_of_0()));
+
+            // use smooth threshold for all iter methods
+            if (PARAM.inp.diago_smooth_ethr == true)
+            {
+                this->cal_smooth_ethr(pes->klist->wk[ik],
+                                    &pes->wg(ik, 0),
+                                    DiagoIterAssist<T, Device>::PW_DIAG_THR,
+                                    ethr_band);
+            }
+
+#ifdef USE_PAW
+            this->call_paw_cell_set_currentk(ik);
+#endif
+
+            // solve eigenvector and eigenvalue for H(k)
+            this->hamiltSolvePsiK(pHamilt, psi, precondition, eigenvalues.data() + ik * psi.get_nbands(), this->wfc_basis->nks);
+
+            if (skip_charge)
+            {
+                GlobalV::ofs_running << " k(" << ik+1 << "/" << pes->klist->get_nkstot()
+                                     << ") Iter steps (avg)=" << DiagoIterAssist<T, Device>::avg_iter
+                                     << " threshold=" << this->diag_thr << std::endl;
+                DiagoIterAssist<T, Device>::avg_iter = 0.0;
+            }
+            /// calculate the contribution of Psi for charge density rho
         }
-        /// calculate the contribution of Psi for charge density rho
     }
+    
+    count++;
     // END Loop over k points
 
     // copy eigenvalues to ekb in ElecState
@@ -666,6 +726,101 @@ void HSolverPW<T, Device>::output_iterInfo()
     }
 }
 
+template <typename T, typename Device>
+void HSolverPW<T, Device>::build_k_neighbors() {
+    const int nk = this->wfc_basis->nks;
+    kvecs_c.resize(nk);
+    k_order.clear();
+    k_order.reserve(nk);
+    
+    // Store k-points and corresponding indices
+    struct KPoint {
+        ModuleBase::Vector3<double> kvec;
+        int index;
+        double norm;
+        
+        KPoint(const ModuleBase::Vector3<double>& v, int i) : 
+            kvec(v), index(i), norm(v.norm()) {}
+    };
+    
+    // Build k-point list
+    std::vector<KPoint> klist;
+    for (int ik = 0; ik < nk; ++ik) {
+        kvecs_c[ik] = this->wfc_basis->kvec_c[ik];
+        klist.push_back(KPoint(kvecs_c[ik], ik));
+    }
+    
+    // Sort k-points by distance from origin
+    std::sort(klist.begin(), klist.end(),
+        [](const KPoint& a, const KPoint& b) {
+            return a.norm < b.norm;
+        });
+    
+    // Build parent-child relationships
+    k_order.push_back(klist[0].index);
+    
+    // Find nearest processed k-point as parent for each k-point
+    for (int i = 1; i < nk; ++i) {
+        int current_k = klist[i].index;
+        double min_dist = 1e10;
+        int parent = -1;
+        
+        // find the nearest k-point as parent
+        for (int j = 0; j < k_order.size(); ++j) {
+            int processed_k = k_order[j];
+            double dist = (kvecs_c[current_k] - kvecs_c[processed_k]).norm2();
+            if (dist < min_dist) {
+                min_dist = dist;
+                parent = processed_k;
+            }
+        }
+        
+        k_parent[current_k] = parent;
+        k_order.push_back(current_k);
+    }
+}
+
+template <typename T, typename Device>
+void HSolverPW<T, Device>::propagate_psi(psi::Psi<T, Device>& psi, const int from_ik, const int to_ik) {
+    const int nbands = psi.get_nbands();
+    const int npwk = this->wfc_basis->npwk[to_ik];
+    
+    // Get k-point difference
+    ModuleBase::Vector3<double> dk = kvecs_c[to_ik] - kvecs_c[from_ik];
+    
+    // Allocate porter locally
+    T* porter = nullptr;
+    resmem_complex_op()(this->ctx, porter, this->wfc_basis->nmaxgr, "HSolverPW::porter");
+    
+    // Process each band
+    for (int ib = 0; ib < nbands; ib++)
+    {
+        // Fix current k-point and band
+        // psi.fix_k(from_ik);
+        
+        // FFT to real space
+        // this->wfc_basis->recip_to_real(this->ctx, psi.get_pointer(ib), porter, from_ik);
+        this->wfc_basis->recip_to_real(this->ctx, &psi(from_ik, ib, 0), porter, from_ik);
+        
+        // Apply phase factor
+        //     // TODO: Check how to get the r vector
+        //     ModuleBase::Vector3<double> r = this->wfc_basis->get_ir2r(ir);
+        //     double phase = this->wfc_basis->tpiba * (dk.x * r.x + dk.y * r.y + dk.z * r.z);
+        //     psi_real[ir] *= std::exp(std::complex<double>(0.0, phase));
+        // }
+        
+        // Fix k-point for target
+        // psi.fix_k(to_ik);
+        
+        // FFT back to reciprocal space
+        // this->wfc_basis->real_to_recip(this->ctx, porter, psi.get_pointer(ib), to_ik, true);
+        this->wfc_basis->real_to_recip(this->ctx, porter, &psi(to_ik, ib, 0), to_ik);
+    }
+
+    // Clean up porter
+    delmem_complex_op()(this->ctx, porter);
+}
+
 template class HSolverPW<std::complex<float>, base_device::DEVICE_CPU>;
 template class HSolverPW<std::complex<double>, base_device::DEVICE_CPU>;
 #if ((defined __CUDA) || (defined __ROCM))
diff --git a/source/module_hsolver/hsolver_pw.h b/source/module_hsolver/hsolver_pw.h
index 5f44bc10a4..3a52293fb2 100644
--- a/source/module_hsolver/hsolver_pw.h
+++ b/source/module_hsolver/hsolver_pw.h
@@ -5,6 +5,8 @@
 #include "module_hamilt_general/hamilt.h"
 #include "module_base/macros.h"
 #include "module_basis/module_pw/pw_basis_k.h"
+#include <unordered_map>
+#include "module_base/memory.h"
 
 namespace hsolver
 {
@@ -17,6 +19,9 @@ class HSolverPW
     // return T if T is real type(float, double),
     // otherwise return the real type of T(complex<float>, complex<double>)
     using Real = typename GetTypeReal<T>::type;
+    using resmem_complex_op = base_device::memory::resize_memory_op<T, Device>;
+    using delmem_complex_op = base_device::memory::delete_memory_op<T, Device>;
+    using setmem_complex_op = base_device::memory::set_memory_op<T, Device>;
 
   public:
     HSolverPW(ModulePW::PW_Basis_K* wfc_basis_in,
@@ -29,10 +34,12 @@ class HSolverPW
               const int scf_iter_in,
               const int diag_iter_max_in,
               const double diag_thr_in,
-              const bool need_subspace_in)
+              const bool need_subspace_in,
+              const bool use_k_continuity_in = false)
         : wfc_basis(wfc_basis_in), calculation_type(calculation_type_in), basis_type(basis_type_in), method(method_in),
           use_paw(use_paw_in), use_uspp(use_uspp_in), nspin(nspin_in), scf_iter(scf_iter_in),
-          diag_iter_max(diag_iter_max_in), diag_thr(diag_thr_in), need_subspace(need_subspace_in){};
+          diag_iter_max(diag_iter_max_in), diag_thr(diag_thr_in), need_subspace(need_subspace_in),
+          use_k_continuity(use_k_continuity_in) {};
 
     /// @brief solve function for pw
     /// @param pHamilt interface to hamilt
@@ -50,6 +57,7 @@ class HSolverPW
                const double tpiba,
                const int nat);
 
+
   protected:
     // diago caller
     void hamiltSolvePsiK(hamilt::Hamilt<T, Device>* hm,
@@ -78,6 +86,8 @@ class HSolverPW
 
     const bool need_subspace; // for cg or dav_subspace
 
+    const bool use_k_continuity;
+
   protected:
     Device* ctx = {};
 
@@ -98,8 +108,16 @@ class HSolverPW
 
     void paw_func_after_kloop(psi::Psi<T, Device>& psi, elecstate::ElecState* pes,const double tpiba,const int nat);
 #endif
+
+    // K-point continuity related members
+    std::vector<int> k_order;
+    std::unordered_map<int, int> k_parent;
+    std::vector<ModuleBase::Vector3<double>> kvecs_c;
+    
+    void build_k_neighbors();
+    void propagate_psi(psi::Psi<T, Device>& psi, const int from_ik, const int to_ik);
 };
 
 } // namespace hsolver
 
-#endif
\ No newline at end of file
+#endif
diff --git a/source/module_hsolver/kernels/cuda/math_kernel_op.cu b/source/module_hsolver/kernels/cuda/math_kernel_op.cu
index 149b9ce389..ff3655aec6 100644
--- a/source/module_hsolver/kernels/cuda/math_kernel_op.cu
+++ b/source/module_hsolver/kernels/cuda/math_kernel_op.cu
@@ -338,6 +338,55 @@ __global__ void vector_mul_vector_kernel(
     }
 }
 
+// vector operator: result[i] = vector[i] / constant
+template <>
+void vector_div_constant_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                                     double* result,
+                                                                     const double* vector,
+                                                                     const double constant)
+{
+    // In small cases, 1024 threads per block will only utilize 17 blocks, much less than 40
+    int thread = thread_per_block;
+    int block = (dim + thread - 1) / thread;
+    vector_div_constant_kernel<double><<<block, thread>>>(dim, result, vector, constant);
+
+    cudaCheckOnDebug();
+}
+
+template <typename FPTYPE>
+inline void vector_div_constant_wrapper(const int& dim,
+                                    std::complex<FPTYPE>* result,
+                                    const std::complex<FPTYPE>* vector,
+                                    const FPTYPE constant)
+{
+    thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
+    const thrust::complex<FPTYPE>* vector_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector);
+
+    int thread = thread_per_block;
+    int block = (dim + thread - 1) / thread;
+    vector_div_constant_kernel<thrust::complex<FPTYPE>><<<block, thread>>>(dim, result_tmp, vector_tmp, constant);
+
+    cudaCheckOnDebug();
+}
+
+template <>
+void vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                                                  std::complex<float>* result,
+                                                                                  const std::complex<float>* vector,
+                                                                                  const float constant)
+{
+    vector_div_constant_wrapper(dim, result, vector, constant);
+}
+
+template <>
+void vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                                                   std::complex<double>* result,
+                                                                                   const std::complex<double>* vector,
+                                                                                   const double constant)
+{
+    vector_div_constant_wrapper(dim, result, vector, constant);
+}
+
 template <typename T>
 __global__ void vector_div_vector_kernel(
     const int size,
@@ -1035,6 +1084,154 @@ void matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>::operator
     cudaCheckOnDebug();
 }
 
+template <typename Real>
+__global__ void apply_eigenvalues_kernel(
+        const thrust::complex<Real>* vectors,
+        thrust::complex<Real>* result,
+        const Real* eigenvalues,
+        const int nbase,
+        const int nbase_x,
+        const int notconv)
+{
+    int m = blockIdx.x;
+    int idx = threadIdx.x + blockIdx.y * blockDim.x;
+    
+    if (m < notconv && idx < nbase) {
+        result[m * nbase_x + idx] = eigenvalues[m] * vectors[m * nbase_x + idx];
+    }
+}
+
+template <typename Real>
+__global__ void precondition_kernel(
+        thrust::complex<Real>* psi_iter,
+        const Real* precondition,
+        const Real* eigenvalues,
+        const int dim,
+        const int nbase,
+        const int notconv)
+{
+    int m = blockIdx.x;
+    int i = threadIdx.x + blockIdx.y * blockDim.x;
+    
+    if (m < notconv && i < dim) {
+        Real x = abs(precondition[i] - eigenvalues[m]);
+        Real pre = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
+        psi_iter[(nbase + m) * dim + i] = psi_iter[(nbase + m) * dim + i] / pre;
+    }
+}
+
+template <typename Real>
+__global__ void normalize_kernel(
+        thrust::complex<Real>* psi_iter,
+        Real* psi_norm,
+        const int dim,
+        const int nbase,
+        const int notconv)
+{
+    int m = blockIdx.x;
+    int tid = threadIdx.x;
+    __shared__ Real sum[thread_per_block];
+    
+    sum[tid] = 0.0;
+    
+    // Calculate the sum for normalization
+    for (int i = tid; i < dim; i += thread_per_block) {
+        auto val = psi_iter[(nbase + m) * dim + i];
+        sum[tid] += (val * thrust::conj(val)).real();
+    }
+    
+    __syncthreads();
+    
+    // Parallel reduction in shared memory
+    for (int s = thread_per_block/2; s > warp_size; s >>= 1) {
+        if (tid < s) {
+            sum[tid] += sum[tid + s];
+        }
+        __syncthreads();
+    }
+    
+    if (tid < warp_size) {
+        sum[tid] += sum[tid + 32]; __syncwarp();
+        sum[tid] += sum[tid + 16]; __syncwarp();
+        sum[tid] += sum[tid + 8]; __syncwarp();
+        sum[tid] += sum[tid + 4]; __syncwarp();
+        sum[tid] += sum[tid + 2]; __syncwarp();
+        sum[tid] += sum[tid + 1]; __syncwarp();
+    }
+    
+    __syncthreads();
+    
+    Real norm = sqrt(sum[0]);
+    
+    // Normalize the vector
+    for (int i = tid; i < dim; i += thread_per_block) {
+        psi_iter[(nbase + m) * dim + i] /= norm;
+    }
+    
+    // Store the norm if needed
+    if (tid == 0 && psi_norm != nullptr) {
+        psi_norm[m] = norm;
+    }
+}
+
+template <typename T>
+void apply_eigenvalues_op<T, base_device::DEVICE_GPU>::operator()(const int& nbase,
+                                                                const int& nbase_x,
+                                                                const int& notconv,
+                                                                T* result,
+                                                                const T* vectors,
+                                                                const Real* eigenvalues)
+{
+    const int threads_per_block = 256;
+    const int blocks_per_grid_y = (nbase + threads_per_block - 1) / threads_per_block;
+    
+    dim3 grid(notconv, blocks_per_grid_y);
+    
+    auto vec_complex = reinterpret_cast<const thrust::complex<Real>*>(vectors);
+    auto res_complex = reinterpret_cast<thrust::complex<Real>*>(result);
+    
+    apply_eigenvalues_kernel<Real><<<grid, threads_per_block>>>(
+        vec_complex, res_complex, eigenvalues, nbase, nbase_x, notconv);
+    
+    cudaCheckOnDebug();
+}
+
+template <typename T>
+void precondition_op<T, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                           T* psi_iter,
+                                                           const int& nbase,
+                                                           const int& notconv,
+                                                           const Real* precondition,
+                                                           const Real* eigenvalues)
+{
+    const int threads_per_block = 256;
+    const int blocks_per_grid_y = (dim + threads_per_block - 1) / threads_per_block;
+    
+    dim3 grid(notconv, blocks_per_grid_y);
+    
+    auto psi_complex = reinterpret_cast<thrust::complex<Real>*>(psi_iter);
+    
+    precondition_kernel<Real><<<grid, threads_per_block>>>(
+        psi_complex, precondition, eigenvalues, dim, nbase, notconv);
+    
+    cudaCheckOnDebug();
+}
+
+template <typename T>
+void normalize_op<T, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                        T* psi_iter,
+                                                        const int& nbase,
+                                                        const int& notconv,
+                                                        Real* psi_norm)
+{
+    auto psi_complex = reinterpret_cast<thrust::complex<Real>*>(psi_iter);
+    
+    normalize_kernel<Real><<<notconv, thread_per_block>>>(
+        psi_complex, psi_norm, dim, nbase, notconv);
+    
+    cudaCheckOnDebug();
+}
+
 
 // Explicitly instantiate functors for the types of functor registered.
 template struct dot_real_op<std::complex<float>, base_device::DEVICE_GPU>;
@@ -1055,6 +1252,16 @@ template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_G
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>;
 
+template struct apply_eigenvalues_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct apply_eigenvalues_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct apply_eigenvalues_op<double, base_device::DEVICE_GPU>;
+template struct precondition_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct precondition_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct precondition_op<double, base_device::DEVICE_GPU>;
+template struct normalize_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct normalize_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct normalize_op<double, base_device::DEVICE_GPU>;
+
 #ifdef __LCAO
 template struct dot_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
diff --git a/source/module_hsolver/kernels/math_kernel_op.cpp b/source/module_hsolver/kernels/math_kernel_op.cpp
index c3784949ab..745b64fa56 100644
--- a/source/module_hsolver/kernels/math_kernel_op.cpp
+++ b/source/module_hsolver/kernels/math_kernel_op.cpp
@@ -2,6 +2,7 @@
 
 #include <iomanip>
 #include <iostream>
+#include <vector>
 
 namespace hsolver
 {
@@ -355,6 +356,88 @@ struct matrixSetToAnother<T, base_device::DEVICE_CPU>
     }
 };
 
+template <typename T>
+struct apply_eigenvalues_op<T, base_device::DEVICE_CPU>
+{
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const int& nbase, const int& nbase_x, const int& notconv, T* result, const T* vectors, const Real* eigenvalues)
+    {
+        for (int m = 0; m < notconv; m++)
+        {
+            for (int idx = 0; idx < nbase; idx++)
+            {
+                result[m * nbase_x + idx] = eigenvalues[m] * vectors[m * nbase_x + idx];
+            }
+        }
+    }
+};
+
+template <typename T>
+struct precondition_op<T, base_device::DEVICE_CPU> {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(
+                   const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   const Real* precondition,
+                   const Real* eigenvalues)
+    {
+        std::vector<Real> pre(dim, 0.0);
+        for (int m = 0; m < notconv; m++)
+        {
+            for (size_t i = 0; i < dim; i++)
+            {
+                Real x = std::abs(precondition[i] - eigenvalues[m]);
+                pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
+            }
+            const base_device::DEVICE_CPU* d;
+            vector_div_vector_op<T, base_device::DEVICE_CPU>()(
+                                                             d,
+                                                             dim,
+                                                             psi_iter + (nbase + m) * dim,
+                                                             psi_iter + (nbase + m) * dim,
+                                                             pre.data());
+        }
+    }
+};
+
+template <typename T>
+struct normalize_op<T, base_device::DEVICE_CPU> {
+    void operator()(const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   typename GetTypeReal<T>::type* psi_norm)
+    {
+        using Real = typename GetTypeReal<T>::type;
+        for (int m = 0; m < notconv; m++)
+        {
+            // Calculate norm using dot_real_op
+            const base_device::DEVICE_CPU* d;
+            Real psi_m_norm = dot_real_op<T, base_device::DEVICE_CPU>()(
+                                                                d,
+                                                                dim,
+                                                                psi_iter + (nbase + m) * dim,
+                                                                psi_iter + (nbase + m) * dim,
+                                                                true);
+            assert(psi_m_norm > 0.0);
+            psi_m_norm = sqrt(psi_m_norm);
+
+            // Normalize using vector_div_constant_op
+            vector_div_constant_op<T, base_device::DEVICE_CPU>()(
+                                                              d,
+                                                              dim,
+                                                              psi_iter + (nbase + m) * dim,
+                                                              psi_iter + (nbase + m) * dim,
+                                                              psi_m_norm);
+            if (psi_norm) {
+                psi_norm[m] = psi_m_norm;
+            }
+        }
+    }
+};
+
 // Explicitly instantiate functors for the types of functor registered.
 template struct scal_op<float, base_device::DEVICE_CPU>;
 template struct axpy_op<std::complex<float>, base_device::DEVICE_CPU>;
@@ -388,6 +471,16 @@ template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_CPU
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct line_minimize_with_block_op<std::complex<double>, base_device::DEVICE_CPU>;
 
+template struct apply_eigenvalues_op<std::complex<float>, base_device::DEVICE_CPU>;
+template struct apply_eigenvalues_op<std::complex<double>, base_device::DEVICE_CPU>;
+template struct apply_eigenvalues_op<double, base_device::DEVICE_CPU>;
+template struct precondition_op<std::complex<float>, base_device::DEVICE_CPU>;
+template struct precondition_op<std::complex<double>, base_device::DEVICE_CPU>;
+template struct precondition_op<double, base_device::DEVICE_CPU>;
+template struct normalize_op<std::complex<float>, base_device::DEVICE_CPU>;
+template struct normalize_op<std::complex<double>, base_device::DEVICE_CPU>;
+template struct normalize_op<double, base_device::DEVICE_CPU>;
+
 #ifdef __LCAO
 template struct axpy_op<double, base_device::DEVICE_CPU>;
 template struct dot_real_op<double, base_device::DEVICE_CPU>;
diff --git a/source/module_hsolver/kernels/math_kernel_op.h b/source/module_hsolver/kernels/math_kernel_op.h
index 0daf0e5718..a6aec0e321 100644
--- a/source/module_hsolver/kernels/math_kernel_op.h
+++ b/source/module_hsolver/kernels/math_kernel_op.h
@@ -325,6 +325,40 @@ template <typename T, typename Device> struct matrixSetToAnother {
                   T *B, const int &LDB);
 };
 
+
+template <typename T, typename Device>
+struct apply_eigenvalues_op
+{
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const int& nbase, 
+                    const int& nbase_x, 
+                    const int& notconv, 
+                    T* result, 
+                    const T* vectors, 
+                    const Real* eigenvalues);
+};
+
+template <typename T, typename Device>
+struct precondition_op {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   const Real* precondition,
+                   const Real* eigenvalues);
+};
+
+template <typename T, typename Device>
+struct normalize_op {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const int& dim,
+                   T* psi_iter,
+                   const int& nbase,
+                   const int& notconv,
+                   Real* psi_norm = nullptr);
+};
+
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 
 template <typename T>
@@ -393,6 +427,40 @@ template <typename T> struct matrixSetToAnother<T, base_device::DEVICE_GPU> {
 void createGpuBlasHandle();
 void destoryBLAShandle();
 
+// vector operator: result[i] = -lambda[i] * vector[i]
+
+template <typename T>
+struct apply_eigenvalues_op<T, base_device::DEVICE_GPU> {
+  using Real = typename GetTypeReal<T>::type;
+  void operator()(const int& nbase, 
+                  const int& nbase_x, 
+                  const int& notconv, 
+                  T* result, 
+                  const T* vectors, 
+                  const Real* eigenvalues);
+};
+
+template <typename T>
+struct precondition_op<T, base_device::DEVICE_GPU> {
+  using Real = typename GetTypeReal<T>::type;
+  void operator()(const int& dim,
+                 T* psi_iter,
+                 const int& nbase,
+                 const int& notconv,
+                 const Real* precondition,
+                 const Real* eigenvalues);
+};
+
+template <typename T>
+struct normalize_op<T, base_device::DEVICE_GPU> {
+  using Real = typename GetTypeReal<T>::type;
+  void operator()(const int& dim,
+                 T* psi_iter,
+                 const int& nbase,
+                 const int& notconv,
+                 Real* psi_norm = nullptr);
+};
+
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 } // namespace hsolver
 
diff --git a/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu b/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu
index 5eeb1697e8..8cba06db1b 100644
--- a/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu
+++ b/source/module_hsolver/kernels/rocm/dngvd_op.hip.cu
@@ -5,12 +5,25 @@
 
 namespace hsolver {
 
+// NOTE: mimicked from ../cuda/dngvd_op.cu for three dngvd_op
+
+static hipsolverHandle_t hipsolver_H = nullptr;
+// Test on DCU platform. When nstart is greater than 234, code on DCU performs better. 
+const int N_DCU = 234;
+
 void createGpuSolverHandle() {
-    return;
+    if (hipsolver_H == nullptr)
+    {
+        hipsolverErrcheck(hipsolverCreate(&hipsolver_H));
+    }
 }
 
 void destroyGpuSolverHandle() {
-    return;
+    if (hipsolver_H != nullptr)
+    {
+        hipsolverErrcheck(hipsolverDestroy(hipsolver_H));
+        hipsolver_H = nullptr;
+    }
 }
 
 #ifdef __LCAO
@@ -23,22 +36,68 @@ void dngvd_op<double, base_device::DEVICE_GPU>::operator()(const base_device::DE
                                                            double* _eigenvalue,
                                                            double* _vcc)
 {
-    std::vector<double> hcc(nstart * nstart, 0.0);
-    std::vector<double> scc(nstart * nstart, 0.0);
-    std::vector<double> vcc(nstart * nstart, 0.0);
-    std::vector<double> eigenvalue(nstart, 0);
-    hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(double) * hcc.size(), hipMemcpyDeviceToHost));
-    hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(double) * scc.size(), hipMemcpyDeviceToHost));
-    base_device::DEVICE_CPU* cpu_ctx = {};
-    dngvd_op<double, base_device::DEVICE_CPU>()(cpu_ctx,
-                                                nstart,
-                                                ldh,
-                                                hcc.data(),
-                                                scc.data(),
-                                                eigenvalue.data(),
-                                                vcc.data());
-    hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(double) * vcc.size(), hipMemcpyHostToDevice));
-    hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));
+    // copied from ../cuda/dngvd_op.cu, "dngvd_op"
+    assert(nstart == ldh);
+
+    if (nstart > N_DCU){
+        hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(double) * ldh * nstart, hipMemcpyDeviceToDevice));
+        // now vcc contains hcc
+
+        // prepare some values for hipsolverDnZhegvd_bufferSize
+        int * devInfo = nullptr;
+        int lwork = 0, info_gpu = 0;
+        double * work = nullptr;
+        hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int)));
+        hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;
+
+        // calculate the sizes needed for pre-allocated buffer.
+        hipsolverErrcheck(hipsolverDnDsygvd_bufferSize(
+            hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
+            nstart,
+            _vcc, ldh,
+            _scc, ldh,
+            _eigenvalue,
+            &lwork));
+
+        // allocate memery
+        hipErrcheck(hipMalloc((void**)&work, sizeof(double) * lwork));
+
+        // compute eigenvalues and eigenvectors.
+        hipsolverErrcheck(hipsolverDnDsygvd(
+            hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
+            nstart,
+            _vcc, ldh,
+            const_cast<double *>(_scc), ldh,
+            _eigenvalue,
+            work, lwork, devInfo));
+
+        hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost));
+
+        // free the buffer
+        hipErrcheck(hipFree(work));
+        hipErrcheck(hipFree(devInfo));
+    }
+    // if(fail_info != nullptr) *fail_info = info_gpu;
+    else{
+        std::vector<double> hcc(nstart * nstart, 0.0);
+        std::vector<double> scc(nstart * nstart, 0.0);
+        std::vector<double> vcc(nstart * nstart, 0.0);
+        std::vector<double> eigenvalue(nstart, 0);
+        hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(double) * hcc.size(), hipMemcpyDeviceToHost));
+        hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(double) * scc.size(), hipMemcpyDeviceToHost));
+        base_device::DEVICE_CPU* cpu_ctx = {};
+        dngvd_op<double, base_device::DEVICE_CPU>()(cpu_ctx,
+                                                   nstart,
+                                                   ldh,
+                                                   hcc.data(),
+                                                   scc.data(),
+                                                   eigenvalue.data(),
+                                                   vcc.data());
+        hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(double) * vcc.size(), hipMemcpyHostToDevice));
+        hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));
+    }
+
+    
 }
 #endif // __LCAO
 
@@ -51,22 +110,67 @@ void dngvd_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const ba
                                                                         float* _eigenvalue,
                                                                         std::complex<float>* _vcc)
 {
-    std::vector<std::complex<float>> hcc(nstart * nstart, {0, 0});
-    std::vector<std::complex<float>> scc(nstart * nstart, {0, 0});
-    std::vector<std::complex<float>> vcc(nstart * nstart, {0, 0});
-    std::vector<float> eigenvalue(nstart, 0);
-    hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex<float>) * hcc.size(), hipMemcpyDeviceToHost));
-    hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<float>) * scc.size(), hipMemcpyDeviceToHost));
-    base_device::DEVICE_CPU* cpu_ctx = {};
-    dngvd_op<std::complex<float>, base_device::DEVICE_CPU>()(cpu_ctx,
-                                                             nstart,
-                                                             ldh,
-                                                             hcc.data(),
-                                                             scc.data(),
-                                                             eigenvalue.data(),
-                                                             vcc.data());
-    hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex<float>) * vcc.size(), hipMemcpyHostToDevice));
-    hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(float) * eigenvalue.size(), hipMemcpyHostToDevice));
+    // copied from ../cuda/dngvd_op.cu, "dngvd_op"
+    assert(nstart == ldh);
+    
+    if (nstart > N_DCU){
+        hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(std::complex<float>) * ldh * nstart, hipMemcpyDeviceToDevice));
+        // now vcc contains hcc
+
+        // prepare some values for hipsolverDnZhegvd_bufferSize
+        int * devInfo = nullptr;
+        int lwork = 0, info_gpu = 0;
+        float2 * work = nullptr;
+        hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int)));
+        hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;
+
+        // calculate the sizes needed for pre-allocated buffer.
+        hipsolverErrcheck(hipsolverDnChegvd_bufferSize(
+            hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
+            nstart,
+            reinterpret_cast<const float2 *>(_vcc), ldh,
+            reinterpret_cast<const float2 *>(_scc), ldh,
+            _eigenvalue,
+            &lwork));
+
+        // allocate memery
+        hipErrcheck(hipMalloc((void**)&work, sizeof(float2) * lwork));
+
+        // compute eigenvalues and eigenvectors.
+        hipsolverErrcheck(hipsolverDnChegvd(
+            hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
+            nstart,
+            reinterpret_cast<float2 *>(_vcc), ldh,
+            const_cast<float2 *>(reinterpret_cast<const float2 *>(_scc)), ldh,
+            _eigenvalue,
+            work, lwork, devInfo));
+
+        hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost));
+        // free the buffer
+        hipErrcheck(hipFree(work));
+        hipErrcheck(hipFree(devInfo));
+    }
+    // if(fail_info != nullptr) *fail_info = info_gpu;
+    else{
+        std::vector<std::complex<float>> hcc(nstart * nstart, {0, 0});
+        std::vector<std::complex<float>> scc(nstart * nstart, {0, 0});
+        std::vector<std::complex<float>> vcc(nstart * nstart, {0, 0});
+        std::vector<float> eigenvalue(nstart, 0);
+        hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex<float>) * hcc.size(), hipMemcpyDeviceToHost));
+        hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<float>) * scc.size(), hipMemcpyDeviceToHost));
+        base_device::DEVICE_CPU* cpu_ctx = {};
+        dngvd_op<std::complex<float>, base_device::DEVICE_CPU>()(cpu_ctx,
+                                                                nstart,
+                                                                ldh,
+                                                                hcc.data(),
+                                                                scc.data(),
+                                                                eigenvalue.data(),
+                                                                vcc.data());
+        hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex<float>) * vcc.size(), hipMemcpyHostToDevice));
+        hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(float) * eigenvalue.size(), hipMemcpyHostToDevice));
+    }
+
+    
 }
 
 template <>
@@ -76,24 +180,80 @@ void dngvd_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const b
                                                                          const std::complex<double>* _hcc,
                                                                          const std::complex<double>* _scc,
                                                                          double* _eigenvalue,
-                                                                         std::complex<double>* _vcc)
+                                                                         std::complex<double>* _vcc
+                                                                        )
 {
-    std::vector<std::complex<double>> hcc(nstart * nstart, {0, 0});
-    std::vector<std::complex<double>> scc(nstart * nstart, {0, 0});
-    std::vector<std::complex<double>> vcc(nstart * nstart, {0, 0});
-    std::vector<double> eigenvalue(nstart, 0);
-    hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex<double>) * hcc.size(), hipMemcpyDeviceToHost));
-    hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<double>) * scc.size(), hipMemcpyDeviceToHost));
-    base_device::DEVICE_CPU* cpu_ctx = {};
-    dngvd_op<std::complex<double>, base_device::DEVICE_CPU>()(cpu_ctx,
-                                                              nstart,
-                                                              ldh,
-                                                              hcc.data(),
-                                                              scc.data(),
-                                                              eigenvalue.data(),
-                                                              vcc.data());
-    hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex<double>) * vcc.size(), hipMemcpyHostToDevice));
-    hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));
+    // copied from ../cuda/dngvd_op.cu, "dngvd_op"
+    assert(nstart == ldh);
+
+    // save a copy of scc in case the diagonalization fails
+    if (nstart > N_DCU){
+        std::vector<std::complex<double>> scc(nstart * nstart, {0, 0});
+        hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<double>) * scc.size(), hipMemcpyDeviceToHost));
+
+        hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(std::complex<double>) * ldh * nstart, hipMemcpyDeviceToDevice));
+
+        // now vcc contains hcc
+
+        // prepare some values for hipsolverDnZhegvd_bufferSize
+        int * devInfo = nullptr;
+        int lwork = 0, info_gpu = 0;
+        double2 * work = nullptr;
+        hipErrcheck(hipMalloc((void**)&devInfo, sizeof(int)));
+        hipsolverFillMode_t uplo = HIPSOLVER_FILL_MODE_UPPER;
+
+        // calculate the sizes needed for pre-allocated buffer.
+        hipsolverErrcheck(hipsolverDnZhegvd_bufferSize(
+            hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
+            nstart,
+            reinterpret_cast<const double2 *>(_vcc), ldh,
+            reinterpret_cast<const double2 *>(_scc), ldh,
+            _eigenvalue,
+            &lwork));
+
+        // allocate memery
+        hipErrcheck(hipMalloc((void**)&work, sizeof(double2) * lwork));
+
+        // compute eigenvalues and eigenvectors.
+        hipsolverErrcheck(hipsolverDnZhegvd(
+            hipsolver_H, HIPSOLVER_EIG_TYPE_1, HIPSOLVER_EIG_MODE_VECTOR, uplo,
+            nstart,
+            reinterpret_cast<double2 *>(_vcc), ldh,
+            const_cast<double2 *>(reinterpret_cast<const double2 *>(_scc)), ldh,
+            _eigenvalue,
+            work, lwork, devInfo));
+
+        hipErrcheck(hipMemcpy(&info_gpu, devInfo, sizeof(int), hipMemcpyDeviceToHost));
+        // free the buffer
+        hipErrcheck(hipFree(work));
+        hipErrcheck(hipFree(devInfo));
+    }
+    // if(fail_info != nullptr) *fail_info = info_gpu;
+    else{
+        std::vector<std::complex<double>> hcc(nstart * nstart, {0, 0});
+        std::vector<std::complex<double>> scc(nstart * nstart, {0, 0});
+        std::vector<std::complex<double>> vcc(nstart * nstart, {0, 0});
+        std::vector<double> eigenvalue(nstart, 0);
+        hipErrcheck(hipMemcpy(hcc.data(), _hcc, sizeof(std::complex<double>) * hcc.size(), hipMemcpyDeviceToHost));
+        hipErrcheck(hipMemcpy(scc.data(), _scc, sizeof(std::complex<double>) * scc.size(), hipMemcpyDeviceToHost));
+        base_device::DEVICE_CPU* cpu_ctx = {};
+        dngvd_op<std::complex<double>, base_device::DEVICE_CPU>()(cpu_ctx,
+                                                                nstart,
+                                                                ldh,
+                                                                hcc.data(),
+                                                                scc.data(),
+                                                                eigenvalue.data(),
+                                                                vcc.data());
+        hipErrcheck(hipMemcpy(_vcc, vcc.data(), sizeof(std::complex<double>) * vcc.size(), hipMemcpyHostToDevice));
+        hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));
+    }
+
+
+
+
+
+
+    
 }
 
 #ifdef __LCAO
diff --git a/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu b/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu
index ef5a1c1ece..9717a06ea9 100644
--- a/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu
+++ b/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu
@@ -257,6 +257,83 @@ __global__ void vector_mul_vector_kernel(
     }
 }
 
+template <typename T>
+__launch_bounds__(1024) 
+__global__ void vector_div_constant_kernel(const int size,
+                                                                 T* result,
+                                                                 const T* vector,
+                                                                 const typename GetTypeReal<T>::type constant)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < size)
+    {
+        result[i] = vector[i] / constant;
+    }
+}
+
+// vector operator: result[i] = vector[i] / constant
+template <>
+void vector_div_constant_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                                     double* result,
+                                                                     const double* vector,
+                                                                     const double constant)
+{
+    int thread = 1024;
+    int block = (dim + thread - 1) / thread;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_constant_kernel<double>),
+                       dim3(block),
+                       dim3(thread),
+                       0,
+                       0,
+                       dim,
+                       result,
+                       vector,
+                       constant);
+
+    hipCheckOnDebug();
+}
+
+template <typename FPTYPE>
+inline void vector_div_constant_wrapper(const int& dim,
+                                    std::complex<FPTYPE>* result,
+                                    const std::complex<FPTYPE>* vector,
+                                    const FPTYPE constant)
+{
+    thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
+    const thrust::complex<FPTYPE>* vector_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector);
+    int thread = 1024;
+    int block = (dim + thread - 1) / thread;
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_constant_kernel<thrust::complex<FPTYPE>>),
+                       dim3(block),
+                       dim3(thread),
+                       0,
+                       0,
+                       dim,
+                       result_tmp,
+                       vector_tmp,
+                       constant);
+
+    hipCheckOnDebug();
+}
+
+template <>
+void vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                                                  std::complex<float>* result,
+                                                                                  const std::complex<float>* vector,
+                                                                                  const float constant)
+{
+    vector_div_constant_wrapper(dim, result, vector, constant);
+}
+
+template <>
+void vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                                                   std::complex<double>* result,
+                                                                                   const std::complex<double>* vector,
+                                                                                   const double constant)
+{
+    vector_div_constant_wrapper(dim, result, vector, constant);
+}
+
 template <typename T>
 __launch_bounds__(1024)
 __global__ void vector_div_vector_kernel(
@@ -943,6 +1020,143 @@ void matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>::operator
     hipCheckOnDebug();
 }
 
+template <typename Real>
+__global__ void apply_eigenvalues_kernel(
+        const thrust::complex<Real>* vectors,
+        thrust::complex<Real>* result,
+        const Real* eigenvalues,
+        const int nbase,
+        const int nbase_x,
+        const int notconv)
+{
+    int m = blockIdx.x;
+    int idx = threadIdx.x + blockIdx.y * blockDim.x;
+    
+    if (m < notconv && idx < nbase) {
+        result[m * nbase_x + idx] = eigenvalues[m] * vectors[m * nbase_x + idx];
+    }
+}
+
+template <typename Real>
+__global__ void precondition_kernel(
+        thrust::complex<Real>* psi_iter,
+        const Real* precondition,
+        const Real* eigenvalues,
+        const int dim,
+        const int nbase,
+        const int notconv)
+{
+    int m = blockIdx.x;
+    int i = threadIdx.x + blockIdx.y * blockDim.x;
+    
+    if (m < notconv && i < dim) {
+        Real x = abs(precondition[i] - eigenvalues[m]);
+        Real pre = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
+        psi_iter[(nbase + m) * dim + i] = psi_iter[(nbase + m) * dim + i] / pre;
+    }
+}
+
+template <typename Real>
+__global__ void normalize_kernel(
+        thrust::complex<Real>* psi_iter,
+        Real* psi_norm,
+        const int dim,
+        const int nbase,
+        const int notconv)
+{
+    int m = blockIdx.x;
+    int tid = threadIdx.x;
+    __shared__ Real sum[THREAD_PER_BLOCK];
+    
+    sum[tid] = 0.0;
+    
+    // Calculate the sum for normalization
+    for (int i = tid; i < dim; i += THREAD_PER_BLOCK) {
+        auto val = psi_iter[(nbase + m) * dim + i];
+        sum[tid] += (val * thrust::conj(val)).real();
+    }
+    
+    __syncthreads();
+    
+    // Parallel reduction in shared memory
+    for (int s = THREAD_PER_BLOCK/2; s > 0; s >>= 1) {
+        if (tid < s) {
+            sum[tid] += sum[tid + s];
+        }
+        __syncthreads();
+    }
+    
+    Real norm = sqrt(sum[0]);
+    
+    // Normalize the vector
+    for (int i = tid; i < dim; i += THREAD_PER_BLOCK) {
+        psi_iter[(nbase + m) * dim + i] /= norm;
+    }
+    
+    // Store the norm if needed
+    if (tid == 0 && psi_norm != nullptr) {
+        psi_norm[m] = norm;
+    }
+}
+
+template <typename T>
+void apply_eigenvalues_op<T, base_device::DEVICE_GPU>::operator()(const int& nbase,
+                                                                const int& nbase_x,
+                                                                const int& notconv,
+                                                                T* result,
+                                                                const T* vectors,
+                                                                const Real* eigenvalues)
+{
+    const int threads_per_block = 256;
+    const int blocks_per_grid_y = (nbase + threads_per_block - 1) / threads_per_block;
+    
+    dim3 grid(notconv, blocks_per_grid_y);
+    
+    auto vec_complex = reinterpret_cast<const thrust::complex<Real>*>(vectors);
+    auto res_complex = reinterpret_cast<thrust::complex<Real>*>(result);
+    
+    apply_eigenvalues_kernel<Real><<<grid, threads_per_block>>>(
+        vec_complex, res_complex, eigenvalues, nbase, nbase_x, notconv);
+    
+    hipCheckOnDebug();
+}
+
+template <typename T>
+void precondition_op<T, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                           T* psi_iter,
+                                                           const int& nbase,
+                                                           const int& notconv,
+                                                           const Real* precondition,
+                                                           const Real* eigenvalues)
+{
+    const int threads_per_block = 256;
+    const int blocks_per_grid_y = (dim + threads_per_block - 1) / threads_per_block;
+    
+    dim3 grid(notconv, blocks_per_grid_y);
+    
+    auto psi_complex = reinterpret_cast<thrust::complex<Real>*>(psi_iter);
+    
+    precondition_kernel<Real><<<grid, threads_per_block>>>(
+        psi_complex, precondition, eigenvalues, dim, nbase, notconv);
+    
+    hipCheckOnDebug();
+}
+
+template <typename T>
+void normalize_op<T, base_device::DEVICE_GPU>::operator()(const int& dim,
+                                                        T* psi_iter,
+                                                        const int& nbase,
+                                                        const int& notconv,
+                                                        Real* psi_norm)
+{
+    auto psi_complex = reinterpret_cast<thrust::complex<Real>*>(psi_iter);
+    
+    normalize_kernel<Real><<<notconv, THREAD_PER_BLOCK>>>(
+        psi_complex, psi_norm, dim, nbase, notconv);
+    
+    hipCheckOnDebug();
+}
+
 
 
 // Explicitly instantiate functors for the types of functor registered.
@@ -964,6 +1178,16 @@ template struct vector_div_vector_op<std::complex<double>, base_device::DEVICE_G
 template struct constantvector_addORsub_constantVector_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct matrixSetToAnother<std::complex<double>, base_device::DEVICE_GPU>;
 
+template struct apply_eigenvalues_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct apply_eigenvalues_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct apply_eigenvalues_op<double, base_device::DEVICE_GPU>;
+template struct precondition_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct precondition_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct precondition_op<double, base_device::DEVICE_GPU>;
+template struct normalize_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct normalize_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct normalize_op<double, base_device::DEVICE_GPU>;
+
 #ifdef __LCAO
 template struct dot_real_op<double, base_device::DEVICE_GPU>;
 template struct vector_div_constant_op<double, base_device::DEVICE_GPU>;
diff --git a/source/module_hsolver/test/test_hsolver_pw.cpp b/source/module_hsolver/test/test_hsolver_pw.cpp
index 5763f1b7d8..1345dfc99e 100644
--- a/source/module_hsolver/test/test_hsolver_pw.cpp
+++ b/source/module_hsolver/test/test_hsolver_pw.cpp
@@ -14,6 +14,68 @@
 #include "module_hsolver/hsolver_pw.h"
 #undef private
 #undef protected
+
+// Mock implementations for the template functions causing linking errors
+namespace ModulePW {
+    // Mock implementation for recip_to_real
+    template<typename FPTYPE, typename Device>
+    void PW_Basis_K::recip_to_real(const Device* ctx,
+                                  const std::complex<FPTYPE>* in,
+                                  std::complex<FPTYPE>* out,
+                                  const int ik,
+                                  const bool add,
+                                  const FPTYPE factor) const
+    {
+        // Simple mock implementation that does nothing
+        // In a real test, you might want to implement behavior that simulates the actual function
+    }
+
+    // Mock implementation for real_to_recip
+    template<typename FPTYPE, typename Device>
+    void PW_Basis_K::real_to_recip(const Device* ctx,
+                                  const std::complex<FPTYPE>* in,
+                                  std::complex<FPTYPE>* out,
+                                  const int ik,
+                                  const bool add,
+                                  const FPTYPE factor) const
+    {
+        // Simple mock implementation that does nothing
+    }
+
+    // Explicit template instantiations
+    template void PW_Basis_K::recip_to_real<float, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<float>* in,
+        std::complex<float>* out,
+        const int ik,
+        const bool add,
+        const float factor) const;
+
+    template void PW_Basis_K::recip_to_real<double, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<double>* in,
+        std::complex<double>* out,
+        const int ik,
+        const bool add,
+        const double factor) const;
+
+    template void PW_Basis_K::real_to_recip<float, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<float>* in,
+        std::complex<float>* out,
+        const int ik,
+        const bool add,
+        const float factor) const;
+
+    template void PW_Basis_K::real_to_recip<double, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<double>* in,
+        std::complex<double>* out,
+        const int ik,
+        const bool add,
+        const double factor) const;
+}
+
 /************************************************
  *  unit test of HSolverPW class
  ***********************************************/
diff --git a/source/module_hsolver/test/test_hsolver_sdft.cpp b/source/module_hsolver/test/test_hsolver_sdft.cpp
index bd0dbbcb42..ad768ed730 100644
--- a/source/module_hsolver/test/test_hsolver_sdft.cpp
+++ b/source/module_hsolver/test/test_hsolver_sdft.cpp
@@ -134,6 +134,66 @@ template class Stochastic_Iter<std::complex<double>, base_device::DEVICE_CPU>;
 Charge::Charge(){};
 Charge::~Charge(){};
 
+// Mock implementations for the template functions causing linking errors
+namespace ModulePW {
+    // Mock implementation for recip_to_real
+    template<typename FPTYPE, typename Device>
+    void PW_Basis_K::recip_to_real(const Device* ctx,
+                                  const std::complex<FPTYPE>* in,
+                                  std::complex<FPTYPE>* out,
+                                  const int ik,
+                                  const bool add,
+                                  const FPTYPE factor) const
+    {
+        // Simple mock implementation that does nothing
+        // In a real test, you might want to implement behavior that simulates the actual function
+    }
+
+    // Mock implementation for real_to_recip
+    template<typename FPTYPE, typename Device>
+    void PW_Basis_K::real_to_recip(const Device* ctx,
+                                  const std::complex<FPTYPE>* in,
+                                  std::complex<FPTYPE>* out,
+                                  const int ik,
+                                  const bool add,
+                                  const FPTYPE factor) const
+    {
+        // Simple mock implementation that does nothing
+    }
+
+    // Explicit template instantiations
+    template void PW_Basis_K::recip_to_real<float, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<float>* in,
+        std::complex<float>* out,
+        const int ik,
+        const bool add,
+        const float factor) const;
+
+    template void PW_Basis_K::recip_to_real<double, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<double>* in,
+        std::complex<double>* out,
+        const int ik,
+        const bool add,
+        const double factor) const;
+
+    template void PW_Basis_K::real_to_recip<float, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<float>* in,
+        std::complex<float>* out,
+        const int ik,
+        const bool add,
+        const float factor) const;
+
+    template void PW_Basis_K::real_to_recip<double, base_device::DEVICE_CPU>(
+        const base_device::DEVICE_CPU* ctx,
+        const std::complex<double>* in,
+        std::complex<double>* out,
+        const int ik,
+        const bool add,
+        const double factor) const;
+}
 /************************************************
  *  unit test of HSolverPW_SDFT class
  ***********************************************/
diff --git a/source/module_io/read_input_item_elec_stru.cpp b/source/module_io/read_input_item_elec_stru.cpp
index 73c741bb5f..19d5b2f1c2 100644
--- a/source/module_io/read_input_item_elec_stru.cpp
+++ b/source/module_io/read_input_item_elec_stru.cpp
@@ -346,6 +346,26 @@ void ReadInput::item_elec_stru()
         read_sync_bool(input.diago_smooth_ethr);
         this->add_item(item);
     }
+    {
+        Input_Item item("use_k_continuity");
+        item.annotation = "whether to use k-point continuity for initializing wave functions";
+        read_sync_bool(input.use_k_continuity);
+        item.check_value = [](const Input_Item& item, const Parameter& para) {
+            if (para.input.use_k_continuity && para.input.basis_type != "pw") {
+                ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity only works for PW basis");
+            }
+            if (para.input.use_k_continuity && para.input.calculation == "nscf") {
+                ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity cannot work for NSCF calculation");
+            }
+            if (para.input.use_k_continuity && para.input.nspin == 2) {
+                ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity cannot work for spin-polarized calculation");
+            }
+            if (para.input.use_k_continuity && para.input.esolver_type == "sdft") {
+                ModuleBase::WARNING_QUIT("ReadInput", "use_k_continuity cannot work for SDFT calculation");
+            }
+        };
+        this->add_item(item);
+    }
     {
         Input_Item item("pw_diag_ndim");
         item.annotation = "dimension of workspace for Davidson diagonalization";
diff --git a/source/module_parameter/input_parameter.h b/source/module_parameter/input_parameter.h
index d60d92f1b2..f04d569962 100644
--- a/source/module_parameter/input_parameter.h
+++ b/source/module_parameter/input_parameter.h
@@ -89,6 +89,7 @@ struct Input_para
     bool diago_smooth_ethr = false; ///< smooth ethr for iter methods
     int pw_diag_ndim = 4;      ///< dimension of workspace for Davidson diagonalization
     int diago_cg_prec = 1;     ///< mohan add 2012-03-31
+    bool use_k_continuity = false;   ///< whether to use k-point continuity for initializing wave functions
 
     std::string smearing_method = "gauss"; ///< "gauss",
                                            ///< "mp","methfessel-paxton"
diff --git a/source/module_psi/psi_init.cpp b/source/module_psi/psi_init.cpp
index f5a0fa6595..3ff2e70169 100644
--- a/source/module_psi/psi_init.cpp
+++ b/source/module_psi/psi_init.cpp
@@ -115,12 +115,14 @@ void PSIInit<T, Device>::initialize_psi(Psi<std::complex<double>>* psi,
     // like (1, nbands, npwx), in which npwx is the maximal npw of all kpoints
     for (int ik = 0; ik < this->pw_wfc.nks; ik++)
     {
+        if(PARAM.inp.use_k_continuity && ik > 0) continue;
         //! Fix the wavefunction to initialize at given kpoint
         psi->fix_k(ik);
         kspw_psi->fix_k(ik);
 
         //! Update Hamiltonian from other kpoint to the given one
         p_hamilt->updateHk(ik);
+                
 
         //! initialize psi_cpu
         this->psi_initer->init_psig(psi_cpu->get_pointer(), ik);
@@ -210,4 +212,4 @@ template class PSIInit<std::complex<double>, base_device::DEVICE_CPU>;
 template class PSIInit<std::complex<float>, base_device::DEVICE_GPU>;
 template class PSIInit<std::complex<double>, base_device::DEVICE_GPU>;
 #endif
-} // namespace psi
\ No newline at end of file
+} // namespace psi
diff --git a/tests/integrate/101_PW_upf100_Al_pseudopots/INPUT b/tests/integrate/101_PW_upf100_Al_pseudopots/INPUT
index 459347bea3..ded8029b34 100644
--- a/tests/integrate/101_PW_upf100_Al_pseudopots/INPUT
+++ b/tests/integrate/101_PW_upf100_Al_pseudopots/INPUT
@@ -23,3 +23,5 @@ smearing_sigma			0.002
 #Parameters (5.Mixing)
 mixing_type		broyden
 mixing_beta		0.7
+diago_smooth_ethr 1
+use_k_continuity 1
diff --git a/tests/integrate/102_PW_BPCG/INPUT b/tests/integrate/102_PW_BPCG/INPUT
index d02092bb4a..97f6ce7e75 100644
--- a/tests/integrate/102_PW_BPCG/INPUT
+++ b/tests/integrate/102_PW_BPCG/INPUT
@@ -32,3 +32,6 @@ cal_stress          1
 mixing_type         broyden
 mixing_beta         0.4
 mixing_gg0          1.5
+
+diago_smooth_ethr 1
+use_k_continuity 1

From 9263f55f8dc03baaed229f4aff9068082d92c846 Mon Sep 17 00:00:00 2001
From: dyzheng <zhengdy@bjaisi.com>
Date: Thu, 20 Nov 2025 13:23:48 +0800
Subject: [PATCH 2/4] Fix: cuda and rocm compiling error

---
 .../kernels/cuda/math_kernel_op.cu            | 49 ------------
 .../kernels/rocm/math_kernel_op.hip.cu        | 77 -------------------
 2 files changed, 126 deletions(-)

diff --git a/source/module_hsolver/kernels/cuda/math_kernel_op.cu b/source/module_hsolver/kernels/cuda/math_kernel_op.cu
index ff3655aec6..c580eba8f8 100644
--- a/source/module_hsolver/kernels/cuda/math_kernel_op.cu
+++ b/source/module_hsolver/kernels/cuda/math_kernel_op.cu
@@ -338,55 +338,6 @@ __global__ void vector_mul_vector_kernel(
     }
 }
 
-// vector operator: result[i] = vector[i] / constant
-template <>
-void vector_div_constant_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
-                                                                     double* result,
-                                                                     const double* vector,
-                                                                     const double constant)
-{
-    // In small cases, 1024 threads per block will only utilize 17 blocks, much less than 40
-    int thread = thread_per_block;
-    int block = (dim + thread - 1) / thread;
-    vector_div_constant_kernel<double><<<block, thread>>>(dim, result, vector, constant);
-
-    cudaCheckOnDebug();
-}
-
-template <typename FPTYPE>
-inline void vector_div_constant_wrapper(const int& dim,
-                                    std::complex<FPTYPE>* result,
-                                    const std::complex<FPTYPE>* vector,
-                                    const FPTYPE constant)
-{
-    thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
-    const thrust::complex<FPTYPE>* vector_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector);
-
-    int thread = thread_per_block;
-    int block = (dim + thread - 1) / thread;
-    vector_div_constant_kernel<thrust::complex<FPTYPE>><<<block, thread>>>(dim, result_tmp, vector_tmp, constant);
-
-    cudaCheckOnDebug();
-}
-
-template <>
-void vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& dim,
-                                                                                  std::complex<float>* result,
-                                                                                  const std::complex<float>* vector,
-                                                                                  const float constant)
-{
-    vector_div_constant_wrapper(dim, result, vector, constant);
-}
-
-template <>
-void vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& dim,
-                                                                                   std::complex<double>* result,
-                                                                                   const std::complex<double>* vector,
-                                                                                   const double constant)
-{
-    vector_div_constant_wrapper(dim, result, vector, constant);
-}
-
 template <typename T>
 __global__ void vector_div_vector_kernel(
     const int size,
diff --git a/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu b/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu
index 9717a06ea9..176f7282cf 100644
--- a/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu
+++ b/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu
@@ -257,83 +257,6 @@ __global__ void vector_mul_vector_kernel(
     }
 }
 
-template <typename T>
-__launch_bounds__(1024) 
-__global__ void vector_div_constant_kernel(const int size,
-                                                                 T* result,
-                                                                 const T* vector,
-                                                                 const typename GetTypeReal<T>::type constant)
-{
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i < size)
-    {
-        result[i] = vector[i] / constant;
-    }
-}
-
-// vector operator: result[i] = vector[i] / constant
-template <>
-void vector_div_constant_op<double, base_device::DEVICE_GPU>::operator()(const int& dim,
-                                                                     double* result,
-                                                                     const double* vector,
-                                                                     const double constant)
-{
-    int thread = 1024;
-    int block = (dim + thread - 1) / thread;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_constant_kernel<double>),
-                       dim3(block),
-                       dim3(thread),
-                       0,
-                       0,
-                       dim,
-                       result,
-                       vector,
-                       constant);
-
-    hipCheckOnDebug();
-}
-
-template <typename FPTYPE>
-inline void vector_div_constant_wrapper(const int& dim,
-                                    std::complex<FPTYPE>* result,
-                                    const std::complex<FPTYPE>* vector,
-                                    const FPTYPE constant)
-{
-    thrust::complex<FPTYPE>* result_tmp = reinterpret_cast<thrust::complex<FPTYPE>*>(result);
-    const thrust::complex<FPTYPE>* vector_tmp = reinterpret_cast<const thrust::complex<FPTYPE>*>(vector);
-    int thread = 1024;
-    int block = (dim + thread - 1) / thread;
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_div_constant_kernel<thrust::complex<FPTYPE>>),
-                       dim3(block),
-                       dim3(thread),
-                       0,
-                       0,
-                       dim,
-                       result_tmp,
-                       vector_tmp,
-                       constant);
-
-    hipCheckOnDebug();
-}
-
-template <>
-void vector_div_constant_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int& dim,
-                                                                                  std::complex<float>* result,
-                                                                                  const std::complex<float>* vector,
-                                                                                  const float constant)
-{
-    vector_div_constant_wrapper(dim, result, vector, constant);
-}
-
-template <>
-void vector_div_constant_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int& dim,
-                                                                                   std::complex<double>* result,
-                                                                                   const std::complex<double>* vector,
-                                                                                   const double constant)
-{
-    vector_div_constant_wrapper(dim, result, vector, constant);
-}
-
 template <typename T>
 __launch_bounds__(1024)
 __global__ void vector_div_vector_kernel(

From bdc08e47083e90c442b4b62d734078d3e696d37e Mon Sep 17 00:00:00 2001
From: dyzheng <zhengdy@bjaisi.com>
Date: Thu, 20 Nov 2025 18:15:46 +0800
Subject: [PATCH 3/4] Fix: kpar error with use_k_continuity

---
 source/module_hsolver/hsolver_pw.cpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/source/module_hsolver/hsolver_pw.cpp b/source/module_hsolver/hsolver_pw.cpp
index 26161e19df..a23c2834f6 100644
--- a/source/module_hsolver/hsolver_pw.cpp
+++ b/source/module_hsolver/hsolver_pw.cpp
@@ -751,10 +751,13 @@ void HSolverPW<T, Device>::build_k_neighbors() {
     }
     
     // Sort k-points by distance from origin
-    std::sort(klist.begin(), klist.end(),
-        [](const KPoint& a, const KPoint& b) {
-            return a.norm < b.norm;
-        });
+    if(klist.size() > 1)
+    {
+        std::sort(klist.begin()+1, klist.end(),
+            [](const KPoint& a, const KPoint& b) {
+                return a.norm < b.norm;
+            });
+    }
     
     // Build parent-child relationships
     k_order.push_back(klist[0].index);

From 83804a34a9e97af4068991f344c92192c9d6b386 Mon Sep 17 00:00:00 2001
From: dyzheng <zhengdy@bjaisi.com>
Date: Fri, 21 Nov 2025 13:49:06 +0800
Subject: [PATCH 4/4] Fix: test error of CI

---
 tests/integrate/102_PW_BPCG/INPUT | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/integrate/102_PW_BPCG/INPUT b/tests/integrate/102_PW_BPCG/INPUT
index 97f6ce7e75..94126f4ce8 100644
--- a/tests/integrate/102_PW_BPCG/INPUT
+++ b/tests/integrate/102_PW_BPCG/INPUT
@@ -33,5 +33,3 @@ mixing_type         broyden
 mixing_beta         0.4
 mixing_gg0          1.5
 
-diago_smooth_ethr 1
-use_k_continuity 1