From 2a86d0692800397436aba3597b767eccecde29aa Mon Sep 17 00:00:00 2001
From: "tianxiang.wang@metax-tech.com" <tianxiang.wang@metax-tech.com>
Date: Fri, 5 Sep 2025 06:04:09 +0000
Subject: [PATCH 1/5] =?UTF-8?q?Perf:=20Optimize=20Diago=5FDavSubspace=20wi?=
 =?UTF-8?q?th=20GPU=20operators=20by=20adding=20and=20fusing=20custom=20ke?=
 =?UTF-8?q?rnels.=20Signed-off-by=EF=BC=9ATianxiang=20Wang<tianxiang.wang@?=
 =?UTF-8?q?metax-tech.com>,=20Contributed=20under=20MetaX=20Integrated=20C?=
 =?UTF-8?q?ircuits=20(Shanghai)=20Co.,=20Ltd.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../kernels/cuda/math_kernel_op.cu            |  44 ++++++-
 source/source_base/kernels/math_kernel_op.cpp |  25 ++++
 source/source_base/kernels/math_kernel_op.h   |  40 +++++-
 .../kernels/rocm/math_kernel_op.hip.cu        |  42 ++++++-
 source/source_hsolver/diago_dav_subspace.cpp  |  74 ++---------
 source/source_hsolver/diago_dav_subspace.h    |   2 +
 .../source_hsolver/kernels/bpcg_kernel_op.cpp |  27 ++++
 .../source_hsolver/kernels/bpcg_kernel_op.h   |  55 +++++++--
 .../kernels/cuda/bpcg_kernel_op.cu            | 116 ++++++++++++++----
 .../kernels/rocm/bpcg_kernel_op.hip.cu        | 112 ++++++++++++++---
 10 files changed, 421 insertions(+), 116 deletions(-)
diff --git a/source/source_base/kernels/cuda/math_kernel_op.cu b/source/source_base/kernels/cuda/math_kernel_op.cu
index 415ec9f12a..562760ca7a 100644
--- a/source/source_base/kernels/cuda/math_kernel_op.cu
+++ b/source/source_base/kernels/cuda/math_kernel_op.cu
@@ -133,6 +133,14 @@ __global__ void matrix_copy_kernel(const int n1, const int n2, const T* A, const
     }
 }
 
+template <typename T, typename Real>
+__global__ void matrix_multiply_vector_kernel(const int m, const int n, T *a, const int lda, const Real *b, const Real alpha, T *c, const int ldc){
+    int row = blockIdx.x * blockDim.x + threadIdx.x;
+    int col = blockIdx.y * blockDim.y + threadIdx.y;
+    if (col >= n || row >= m) return;
+    c[col * ldc + row] = a[col * lda + row] * b[col] * alpha;
+}
+
 cublasOperation_t judge_trans_op(bool is_complex, const char& trans, const char* name)
 {
     if (trans == 'N')
@@ -147,7 +155,7 @@ cublasOperation_t judge_trans_op(bool is_complex, const char& trans, const char*
     {
         return CUBLAS_OP_C;
     }
-    else 
+    else
     {
         ModuleBase::WARNING_QUIT(name, std::string("Unknown trans type ") + trans + std::string(" !"));
     }
@@ -438,10 +446,44 @@ void matrixCopy<std::complex<double>, base_device::DEVICE_GPU>::operator()(const
     cudaCheckOnDebug();
 }
 
+template <>
+void matrix_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int &m, const int &n,
+                  double *a, const int &lda, const double *b, const double alpha, double *c, const int &ldc){
+    dim3 thread(16, 16, 1);
+    dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1);
+    matrix_multiply_vector_kernel<double, double> <<<block, thread >>>(m, n, a, lda,
+    b, alpha, c, ldc);
+    cudaCheckOnDebug();
+}
+
+template <>
+void matrix_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int &m, const int &n,
+                  std::complex<float> *a, const int &lda, const float *b, const float alpha, std::complex<float> *c, const int &ldc){
+    dim3 thread(16, 16, 1);
+    dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1);
+    matrix_multiply_vector_kernel<thrust::complex<float>, float> <<<block, thread >>>(m, n, reinterpret_cast<thrust::complex<float>*>(a), lda,
+    b, alpha, reinterpret_cast<thrust::complex<float>*>(c), ldc);
+    cudaCheckOnDebug();
+}
+
+template <>
+void matrix_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int &m, const int &n,
+                  std::complex<double> *a, const int &lda, const double *b, const double alpha, std::complex<double> *c, const int &ldc)
+{
+    dim3 thread(16, 16, 1);
+    dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1);
+    matrix_multiply_vector_kernel<thrust::complex<double>, double> <<<block, thread >>>(m, n, reinterpret_cast<thrust::complex<double>*>(a), lda,
+    b, alpha, reinterpret_cast<thrust::complex<double>*>(c), ldc);
+    cudaCheckOnDebug();
+}
 
 // Explicitly instantiate functors for the types of functor registered.
 
 template struct matrixCopy<std::complex<float>, base_device::DEVICE_GPU>;
 template struct matrixCopy<double, base_device::DEVICE_GPU>;
 template struct matrixCopy<std::complex<double>, base_device::DEVICE_GPU>;
+
+template struct matrix_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct matrix_mul_vector_op<double, base_device::DEVICE_GPU>;
+template struct matrix_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>;
 }  // namespace ModuleBase
diff --git a/source/source_base/kernels/math_kernel_op.cpp b/source/source_base/kernels/math_kernel_op.cpp
index dad790e463..343d342e00 100644
--- a/source/source_base/kernels/math_kernel_op.cpp
+++ b/source/source_base/kernels/math_kernel_op.cpp
@@ -119,12 +119,35 @@ struct matrixCopy<T, base_device::DEVICE_CPU>
     }
 };
 
+template <typename T>
+struct matrix_mul_vector_op<T, base_device::DEVICE_CPU> {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const int& m, const int &n,
+                  T *a,
+                  const int &lda,
+                  const Real *b,
+                  const Real alpha,
+                  T *c,
+                  const int &ldc){
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2) schedule(static, 8192 / sizeof(T))
+#endif
+        for (int j = 0; j < n; j++){
+            for (int i = 0; i < m; i++){
+                c[j * ldc + i] = a[j * lda + i] * b[j] * alpha;
+            }
+        }
+
+    }
+};
+
 template struct gemv_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct gemv_op<float, base_device::DEVICE_CPU>;
 template struct gemm_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct gemm_op<float, base_device::DEVICE_CPU>;
 template struct matrixTranspose_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct matrixCopy<std::complex<float>, base_device::DEVICE_CPU>;
+template struct matrix_mul_vector_op<std::complex<float>, base_device::DEVICE_CPU>;
 
 template struct gemv_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct gemv_op<double, base_device::DEVICE_CPU>;
@@ -133,6 +156,8 @@ template struct gemm_op<double, base_device::DEVICE_CPU>;
 template struct matrixTranspose_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct matrixCopy<double, base_device::DEVICE_CPU>;
 template struct matrixCopy<std::complex<double>, base_device::DEVICE_CPU>;
+template struct matrix_mul_vector_op<double, base_device::DEVICE_CPU>;
+template struct matrix_mul_vector_op<std::complex<double>, base_device::DEVICE_CPU>;
 
 #ifdef __LCAO
 template struct matrixTranspose_op<double, base_device::DEVICE_CPU>;
diff --git a/source/source_base/kernels/math_kernel_op.h b/source/source_base/kernels/math_kernel_op.h
index 9476d32fdc..f5c8e218df 100644
--- a/source/source_base/kernels/math_kernel_op.h
+++ b/source/source_base/kernels/math_kernel_op.h
@@ -104,7 +104,7 @@ template <typename T, typename Device> struct vector_div_constant_op {
   ///
   /// Input Parameters
   /// \param dim : array size
-  /// \param vector : input array 
+  /// \param vector : input array
   /// \param constant : input constant
   ///
   /// Output Parameters
@@ -298,6 +298,31 @@ template <typename T, typename Device> struct matrixCopy {
   void operator()(const int& n1, const int& n2, const T* A, const int& LDA, T* B, const int& LDB);
 };
 
+template <typename T, typename Device>
+struct matrix_mul_vector_op {
+    using Real = typename GetTypeReal<T>::type;
+  /// @brief a * b * beta by each column
+  ///
+  /// Input Parameters
+  /// \param m : row number
+  /// \param n : column number
+  /// \param a : input matrix
+  /// \param lda : leading dimension of matrix a
+  /// \param b : input vector
+  /// \param alpha : factor
+  /// \param ldc : leading dimension of matrix c
+  ///
+  /// Output Parameters
+  /// \param c : output matrix
+  void operator()(const int &m, const int &n,
+                  T *a,
+                  const int &lda,
+                  const Real *b,
+                  const Real alpha,
+                  T *c,
+                  const int &ldc);
+};
+
 template <typename T, typename Device>
 struct apply_eigenvalues_op {
     using Real = typename GetTypeReal<T>::type;
@@ -314,7 +339,7 @@ struct precondition_op {
                    T* psi_iter,
                    const int& nbase,
                    const int& notconv,
-                   const Real* precondition,  
+                   const Real* precondition,
                    const Real* eigenvalues);
 };
 
@@ -393,6 +418,17 @@ template <typename T> struct matrixCopy<T, base_device::DEVICE_GPU> {
                     const int& LDB);
 };
 
+template <typename T> struct matrix_mul_vector_op<T, base_device::DEVICE_GPU> {
+  using Real = typename GetTypeReal<T>::type;
+  void operator()(const int &m, const int &n,
+                  T *a,
+                  const int &lda,
+                  const Real *b,
+                  const Real alpha,
+                  T *c,
+                  const int &ldc);
+};
+
 void createGpuBlasHandle();
 void destoryBLAShandle();
 
diff --git a/source/source_base/kernels/rocm/math_kernel_op.hip.cu b/source/source_base/kernels/rocm/math_kernel_op.hip.cu
index 8eb87988c8..f60cdfc3b1 100644
--- a/source/source_base/kernels/rocm/math_kernel_op.hip.cu
+++ b/source/source_base/kernels/rocm/math_kernel_op.hip.cu
@@ -145,6 +145,15 @@ __launch_bounds__(1024) __global__
     }
 }
 
+template <typename T, typename Real>
+__launch_bounds__(1024) __global__
+void matrix_multiply_vector_kernel(const int m, const int n, T *a, const int lda, const Real *b, const Real alpha, T *c, const int ldc){
+    int row = blockIdx.x * blockDim.x + threadIdx.x;
+    int col = blockIdx.y * blockDim.y + threadIdx.y;
+    if (col >= n || row >= m) return;
+    c[col * ldc + row] = a[col * lda + row] * b[col] * alpha;
+}
+
 hipblasOperation_t judge_trans_op(bool is_complex, const char& trans, const char* name)
 {
     if (trans == 'N')
@@ -159,7 +168,7 @@ hipblasOperation_t judge_trans_op(bool is_complex, const char& trans, const char
     {
         return HIPBLAS_OP_C;
     }
-    else 
+    else
     {
         ModuleBase::WARNING_QUIT(name, std::string("Unknown trans type ") + trans + std::string(" !"));
     }
@@ -437,7 +446,38 @@ void matrixCopy<std::complex<double>, base_device::DEVICE_GPU>::operator()(const
     hipCheckOnDebug();
 }
 
+template <>
+void matrix_mul_vector_op<double, base_device::DEVICE_GPU>::operator()(const int &m, const int &n,
+                  double *a, const int &lda, const double *b, const double alpha, double *c, const int &ldc){
+    dim3 thread(16, 16, 1);
+    dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_multiply_vector_kernel<double, double>), dim3(block, thread),
+        m, n, a, lda, b, alpha, c, ldc);
+    hipCheckOnDebug();
+}
 
+template <>
+void matrix_mul_vector_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int &m, const int &n,
+                  std::complex<float> *a, const int &lda, const float *b, const float alpha, std::complex<float> *c, const int &ldc){
+    dim3 thread(16, 16, 1);
+    dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_multiply_vector_kernel<thrust::complex<float>, float>), dim3(block, thread),
+        m, n, reinterpret_cast<thrust::complex<float>*>(a), lda,
+        b, alpha, reinterpret_cast<thrust::complex<float>*>(c), ldc);
+    hipCheckOnDebug();
+}
+
+template <>
+void matrix_mul_vector_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int &m, const int &n,
+                  std::complex<double> *a, const int &lda, const double *b, const double alpha, std::complex<double> *c, const int &ldc)
+{
+    dim3 thread(16, 16, 1);
+    dim3 block((m + thread.x - 1) / thread.x, (n + thread.y - 1) / thread.y, 1);
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(matrix_multiply_vector_kernel<thrust::complex<double>, double>), dim3(block, thread),
+        m, n, reinterpret_cast<thrust::complex<double>*>(a), lda,
+        b, alpha, reinterpret_cast<thrust::complex<double>*>(c), ldc);
+    hipCheckOnDebug();
+}
 
 // Explicitly instantiate functors for the types of functor registered.
 template struct matrixCopy<double, base_device::DEVICE_GPU>;
diff --git a/source/source_hsolver/diago_dav_subspace.cpp b/source/source_hsolver/diago_dav_subspace.cpp
index 4402fad9f8..fe89b23377 100644
--- a/source/source_hsolver/diago_dav_subspace.cpp
+++ b/source/source_hsolver/diago_dav_subspace.cpp
@@ -76,6 +76,7 @@ Diago_DavSubspace<T, Device>::Diago_DavSubspace(const std::vector<Real>& precond
     {
         resmem_real_op()(this->d_precondition, nbasis_in);
         // syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, this->precondition.data(), nbasis_in);
+        resmem_real_op()(this->d_eigenvalue, this->nbase_x);
     }
 #endif
 }
@@ -94,6 +95,7 @@ Diago_DavSubspace<T, Device>::~Diago_DavSubspace()
     if (this->device == base_device::GpuDevice)
     {
         delmem_real_op()(this->d_precondition);
+        delmem_real_op()(this->d_eigenvalue);
     }
 #endif
 }
@@ -316,30 +318,15 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
                          this->dim);
 
     // Eigenvalues operation section
-    std::vector<Real> e_temp_cpu(this->notconv, 0);
-    Real* e_temp_hd = e_temp_cpu.data();
+    Real* e_temp_hd = eigenvalue_iter->data();
     if (this->device == base_device::GpuDevice)
     {
-        e_temp_hd = nullptr;
-        resmem_real_op()(e_temp_hd, nbase);
+        syncmem_var_h2d_op()(this->d_eigenvalue, eigenvalue_iter->data(), this->nbase_x);
+        e_temp_hd = this->d_eigenvalue;
     }
 
-    for (int m = 0; m < this->notconv; m++)
-    {
-        e_temp_cpu[m] = -(*eigenvalue_iter)[m];
-    }
-
-    if (this->device == base_device::GpuDevice)
-    {
-        syncmem_var_h2d_op()(e_temp_hd, e_temp_cpu.data(), this->notconv);
-    }
-    
-    apply_eigenvalues_op<T, Device>()(nbase, this->nbase_x, this->notconv, this->vcc, this->vcc, e_temp_hd);
-
-    if (this->device == base_device::GpuDevice)
-    {
-        delmem_real_op()(e_temp_hd);
-    }
+    // vcc = - vcc * eigenvalue
+    ModuleBase::matrix_mul_vector_op<T, Device>()(nbase, notconv, vcc, this->nbase_x, eigenvalue_iter->data(), -1.0, vcc, this->nbase_x);
 
 #ifdef __DSP
     ModuleBase::gemm_op_mt<T, Device>()
@@ -364,17 +351,12 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
 #if defined(__CUDA) || defined(__ROCM)
     if (this->device == base_device::GpuDevice)
     {
-        Real* eigenvalues_gpu = nullptr;
-        resmem_real_op()(eigenvalues_gpu, notconv);
-        syncmem_var_h2d_op()(eigenvalues_gpu, (*eigenvalue_iter).data(), notconv);
-        
         precondition_op<T, Device>()(this->dim,
                                     psi_iter,
                                     nbase,
                                     notconv,
                                     d_precondition,
-                                    eigenvalues_gpu);
-        delmem_real_op()(eigenvalues_gpu);
+                                    this->d_eigenvalue);
     }
     else
 #endif
@@ -395,7 +377,7 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
         resmem_real_op()(psi_norm, notconv);
         using setmem_real_op = base_device::memory::set_memory_op<Real, Device>;
         setmem_real_op()(psi_norm, 0.0, notconv);
-        
+
         normalize_op<T, Device>()(this->dim,
                                 psi_iter,
                                 nbase,
@@ -641,7 +623,7 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
         }
         else
         {
-#ifdef __MPI  
+#ifdef __MPI
             std::vector<T> h_diag;
             std::vector<T> s_diag;
             std::vector<T> vcc_tmp;
@@ -680,7 +662,7 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
             }
 #else
             std::cout << "Error: parallel diagonalization is not supported in serial mode." << std::endl;
-            exit(1);    
+            exit(1);
 #endif
         }
     }
@@ -772,39 +754,7 @@ void Diago_DavSubspace<T, Device>::refresh(const int& dim,
 
     if (this->device == base_device::GpuDevice)
     {
-#if defined(__CUDA) || defined(__ROCM)
-        T* hcc_cpu = nullptr;
-        T* scc_cpu = nullptr;
-        T* vcc_cpu = nullptr;
-        base_device::memory::resize_memory_op<T, base_device::DEVICE_CPU>()(hcc_cpu,
-                                                                            this->nbase_x * this->nbase_x,
-                                                                            "DAV::hcc");
-        base_device::memory::resize_memory_op<T, base_device::DEVICE_CPU>()(scc_cpu,
-                                                                            this->nbase_x * this->nbase_x,
-                                                                            "DAV::scc");
-        base_device::memory::resize_memory_op<T, base_device::DEVICE_CPU>()(vcc_cpu,
-                                                                            this->nbase_x * this->nbase_x,
-                                                                            "DAV::vcc");
-
-        syncmem_d2h_op()(hcc_cpu, hcc, this->nbase_x * this->nbase_x);
-        syncmem_d2h_op()(scc_cpu, scc, this->nbase_x * this->nbase_x);
-        syncmem_d2h_op()(vcc_cpu, vcc, this->nbase_x * this->nbase_x);
-
-        for (int i = 0; i < nbase; i++)
-        {
-            hcc_cpu[i * this->nbase_x + i] = eigenvalue_in_hsolver[i];
-            scc_cpu[i * this->nbase_x + i] = this->one[0];
-            vcc_cpu[i * this->nbase_x + i] = this->one[0];
-        }
-
-        syncmem_h2d_op()(hcc, hcc_cpu, this->nbase_x * this->nbase_x);
-        syncmem_h2d_op()(scc, scc_cpu, this->nbase_x * this->nbase_x);
-        syncmem_h2d_op()(vcc, vcc_cpu, this->nbase_x * this->nbase_x);
-
-        base_device::memory::delete_memory_op<T, base_device::DEVICE_CPU>()(hcc_cpu);
-        base_device::memory::delete_memory_op<T, base_device::DEVICE_CPU>()(scc_cpu);
-        base_device::memory::delete_memory_op<T, base_device::DEVICE_CPU>()(vcc_cpu);
-#endif
+        refresh_hcc_scc_vcc_op<T, Device>()(nbase, hcc, scc, vcc, this->nbase_x, this->d_eigenvalue, this->one_);
     }
     else
     {
diff --git a/source/source_hsolver/diago_dav_subspace.h b/source/source_hsolver/diago_dav_subspace.h
index 3b4c224ec8..3d76bbf225 100644
--- a/source/source_hsolver/diago_dav_subspace.h
+++ b/source/source_hsolver/diago_dav_subspace.h
@@ -94,6 +94,8 @@ class Diago_DavSubspace
     /// Eigenvectors on the reduced basis
     T* vcc = nullptr;
 
+    Real* d_eigenvalue = nullptr;
+
     /// device type of psi
     Device* ctx = {};
     base_device::DEVICE_CPU* cpu_ctx = {};
diff --git a/source/source_hsolver/kernels/bpcg_kernel_op.cpp b/source/source_hsolver/kernels/bpcg_kernel_op.cpp
index 19c51fc398..3d74591783 100644
--- a/source/source_hsolver/kernels/bpcg_kernel_op.cpp
+++ b/source/source_hsolver/kernels/bpcg_kernel_op.cpp
@@ -183,6 +183,30 @@ struct normalize_op<T, base_device::DEVICE_CPU> {
     }
 };
 
+template <typename T>
+struct refresh_hcc_scc_vcc_op<T, base_device::DEVICE_CPU>
+{
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(const int &n,
+                  T *hcc,
+                  T *scc,
+                  T *vcc,
+                  const int &ldh,
+                  const Real *eigenvalue,
+                  const T &one)
+    {
+#ifdef _OPENMP
+#pragma omp parallel for collapse(1) schedule(static, 8192 / sizeof(T))
+#endif
+        for (int i = 0; i < n; i++)
+        {
+            hcc[i * ldh + i] = eigenvalue[i];
+            scc[i * ldh + i] = one;
+            vcc[i * ldh + i] = one;
+        }
+    }
+};
+
 template struct calc_grad_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct line_minimize_with_block_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct calc_grad_with_block_op<std::complex<double>, base_device::DEVICE_CPU>;
@@ -196,4 +220,7 @@ template struct precondition_op<double, base_device::DEVICE_CPU>;
 template struct normalize_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct normalize_op<std::complex<double>, base_device::DEVICE_CPU>;
 template struct normalize_op<double, base_device::DEVICE_CPU>;
+template struct refresh_hcc_scc_vcc_op<std::complex<float>, base_device::DEVICE_CPU>;
+template struct refresh_hcc_scc_vcc_op<std::complex<double>, base_device::DEVICE_CPU>;
+template struct refresh_hcc_scc_vcc_op<double, base_device::DEVICE_CPU>;
 } // namespace hsolver
\ No newline at end of file
diff --git a/source/source_hsolver/kernels/bpcg_kernel_op.h b/source/source_hsolver/kernels/bpcg_kernel_op.h
index 802ee3c1e4..9ac7c5e2ce 100644
--- a/source/source_hsolver/kernels/bpcg_kernel_op.h
+++ b/source/source_hsolver/kernels/bpcg_kernel_op.h
@@ -63,11 +63,11 @@ template <typename T, typename Device>
 struct apply_eigenvalues_op
 {
     using Real = typename GetTypeReal<T>::type;
-    void operator()(const int& nbase, 
-                    const int& nbase_x, 
-                    const int& notconv, 
-                    T* result, 
-                    const T* vectors, 
+    void operator()(const int& nbase,
+                    const int& nbase_x,
+                    const int& notconv,
+                    T* result,
+                    const T* vectors,
                     const Real* eigenvalues);
 };
 
@@ -92,6 +92,30 @@ struct normalize_op {
                    Real* psi_norm = nullptr);
 };
 
+template <typename T, typename Device> struct refresh_hcc_scc_vcc_op {
+    using Real = typename GetTypeReal<T>::type;
+  /// @brief refresh hcc scc vcc
+  ///
+  /// Input Parameters
+  /// \param n : first dimension of matrix
+  /// \param ldh : leading dimension of hcc, scc, vcc
+  /// \param nbase : matrix size
+  /// \param eigenvalue : input eigenvalue
+  /// \param one : constant one
+  ///
+  /// Output Parameters
+  /// \param hcc : output matrix hcc
+  /// \param scc : output matrix scc
+  /// \param vcc : output matrix vcc
+  void operator()(const int &n,
+                  T *hcc,
+                  T *scc,
+                  T *vcc,
+                  const int &ldh,
+                  const Real *eigenvalue,
+                  const T& one);
+};
+
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 
 template <typename T>
@@ -114,11 +138,11 @@ struct calc_grad_with_block_op<T, base_device::DEVICE_GPU> {
 template <typename T>
 struct apply_eigenvalues_op<T, base_device::DEVICE_GPU> {
   using Real = typename GetTypeReal<T>::type;
-  void operator()(const int& nbase, 
-                  const int& nbase_x, 
-                  const int& notconv, 
-                  T* result, 
-                  const T* vectors, 
+  void operator()(const int& nbase,
+                  const int& nbase_x,
+                  const int& notconv,
+                  T* result,
+                  const T* vectors,
                   const Real* eigenvalues);
 };
 
@@ -143,6 +167,17 @@ struct normalize_op<T, base_device::DEVICE_GPU> {
                  Real* psi_norm = nullptr);
 };
 
+template <typename T>
+struct refresh_hcc_scc_vcc_op<T, base_device::DEVICE_GPU> {
+  using Real = typename GetTypeReal<T>::type;
+  void operator()(const int &n,
+                  T *hcc,
+                  T *scc,
+                  T *vcc,
+                  const int &ldh,
+                  const Real *eigenvalue,
+                  const T& one);
+};
 #endif
 } // namespace hsolver
 
diff --git a/source/source_hsolver/kernels/cuda/bpcg_kernel_op.cu b/source/source_hsolver/kernels/cuda/bpcg_kernel_op.cu
index e8af516274..a8f45ad886 100644
--- a/source/source_hsolver/kernels/cuda/bpcg_kernel_op.cu
+++ b/source/source_hsolver/kernels/cuda/bpcg_kernel_op.cu
@@ -257,7 +257,7 @@ __global__ void apply_eigenvalues_kernel(
 {
     int m = blockIdx.x;
     int idx = threadIdx.x + blockIdx.y * blockDim.x;
-    
+
     if (m < notconv && idx < nbase) {
         result[m * nbase_x + idx] = eigenvalues[m] * vectors[m * nbase_x + idx];
     }
@@ -274,7 +274,7 @@ __global__ void precondition_kernel(
 {
     int m = blockIdx.x;
     int i = threadIdx.x + blockIdx.y * blockDim.x;
-    
+
     if (m < notconv && i < dim) {
         Real x = abs(precondition[i] - eigenvalues[m]);
         Real pre = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
@@ -293,17 +293,17 @@ __global__ void normalize_kernel(
     int m = blockIdx.x;
     int tid = threadIdx.x;
     __shared__ Real sum[thread_per_block];
-    
+
     sum[tid] = 0.0;
-    
+
     // Calculate the sum for normalization
     for (int i = tid; i < dim; i += thread_per_block) {
         auto val = psi_iter[(nbase + m) * dim + i];
         sum[tid] += (val * thrust::conj(val)).real();
     }
-    
+
     __syncthreads();
-    
+
     // Parallel reduction in shared memory
     for (int s = thread_per_block/2; s > warp_size; s >>= 1) {
         if (tid < s) {
@@ -311,7 +311,7 @@ __global__ void normalize_kernel(
         }
         __syncthreads();
     }
-    
+
     if (tid < warp_size) {
         sum[tid] += sum[tid + 32]; __syncwarp();
         sum[tid] += sum[tid + 16]; __syncwarp();
@@ -320,22 +320,41 @@ __global__ void normalize_kernel(
         sum[tid] += sum[tid + 2]; __syncwarp();
         sum[tid] += sum[tid + 1]; __syncwarp();
     }
-    
+
     __syncthreads();
-    
+
     Real norm = sqrt(sum[0]);
-    
+
     // Normalize the vector
     for (int i = tid; i < dim; i += thread_per_block) {
         psi_iter[(nbase + m) * dim + i] /= norm;
     }
-    
+
     // Store the norm if needed
     if (tid == 0 && psi_norm != nullptr) {
         psi_norm[m] = norm;
     }
 }
 
+template <typename T, typename Real>
+__global__ void refresh_hcc_scc_vcc_kernel(
+        const int n,
+        T *hcc,
+        T *scc,
+        T *vcc,
+        const int ldh,
+        const Real *eigenvalue,
+        const T one)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n)
+    {
+        hcc[i * ldh + i] = eigenvalue[i];
+        scc[i * ldh + i] = one;
+        vcc[i * ldh + i] = one;
+    }
+}
+
 template <typename T>
 void line_minimize_with_block_op<T, base_device::DEVICE_GPU>::operator()(T* grad_out,
                                                                          T* hgrad_out,
@@ -392,15 +411,15 @@ void apply_eigenvalues_op<T, base_device::DEVICE_GPU>::operator()(const int& nba
 {
     const int threads_per_block = 256;
     const int blocks_per_grid_y = (nbase + threads_per_block - 1) / threads_per_block;
-    
+
     dim3 grid(notconv, blocks_per_grid_y);
-    
+
     auto vec_complex = reinterpret_cast<const thrust::complex<Real>*>(vectors);
     auto res_complex = reinterpret_cast<thrust::complex<Real>*>(result);
-    
+
     apply_eigenvalues_kernel<Real><<<grid, threads_per_block>>>(
         vec_complex, res_complex, eigenvalues, nbase, nbase_x, notconv);
-    
+
     cudaCheckOnDebug();
 }
 
@@ -414,14 +433,14 @@ void precondition_op<T, base_device::DEVICE_GPU>::operator()(const int& dim,
 {
     const int threads_per_block = 256;
     const int blocks_per_grid_y = (dim + threads_per_block - 1) / threads_per_block;
-    
+
     dim3 grid(notconv, blocks_per_grid_y);
-    
+
     auto psi_complex = reinterpret_cast<thrust::complex<Real>*>(psi_iter);
-    
+
     precondition_kernel<Real><<<grid, threads_per_block>>>(
         psi_complex, precondition, eigenvalues, dim, nbase, notconv);
-    
+
     cudaCheckOnDebug();
 }
 
@@ -433,10 +452,62 @@ void normalize_op<T, base_device::DEVICE_GPU>::operator()(const int& dim,
                                                         Real* psi_norm)
 {
     auto psi_complex = reinterpret_cast<thrust::complex<Real>*>(psi_iter);
-    
+
     normalize_kernel<Real><<<notconv, thread_per_block>>>(
         psi_complex, psi_norm, dim, nbase, notconv);
-    
+
+    cudaCheckOnDebug();
+}
+
+template <>
+void refresh_hcc_scc_vcc_op<double, base_device::DEVICE_GPU>::operator()(const int &n,
+                  double *hcc,
+                  double *scc,
+                  double *vcc,
+                  const int &ldh,
+                  const double *eigenvalue,
+                  const double& one)
+{
+    int thread = 512;
+    int block = (n + thread - 1) / thread;
+    refresh_hcc_scc_vcc_kernel<double, double> <<<block, thread >>> (n, hcc, scc, vcc, ldh, eigenvalue, one);
+
+    cudaCheckOnDebug();
+}
+
+template <>
+void refresh_hcc_scc_vcc_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int &n,
+                  std::complex<float> *hcc,
+                  std::complex<float> *scc,
+                  std::complex<float> *vcc,
+                  const int &ldh,
+                  const float *eigenvalue,
+                  const std::complex<float>& one)
+{
+    int thread = 512;
+    int block = (n + thread - 1) / thread;
+    refresh_hcc_scc_vcc_kernel<thrust::complex<float>, float> <<<block, thread >>> (n, reinterpret_cast<thrust::complex<float>*>(hcc),
+                    reinterpret_cast<thrust::complex<float>*>(scc), reinterpret_cast<thrust::complex<float>*>(vcc), ldh, eigenvalue,
+                    thrust::complex<float>(one));
+
+    cudaCheckOnDebug();
+}
+
+template <>
+void refresh_hcc_scc_vcc_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int &n,
+                  std::complex<double> *hcc,
+                  std::complex<double> *scc,
+                  std::complex<double> *vcc,
+                  const int &ldh,
+                  const double *eigenvalue,
+                  const std::complex<double>& one)
+{
+    int thread = 512;
+    int block = (n + thread - 1) / thread;
+    refresh_hcc_scc_vcc_kernel<thrust::complex<double>, double> <<<block, thread >>> (n, reinterpret_cast<thrust::complex<double>*>(hcc),
+                    reinterpret_cast<thrust::complex<double>*>(scc), reinterpret_cast<thrust::complex<double>*>(vcc), ldh, eigenvalue,
+                    thrust::complex<double>(one));
+
     cudaCheckOnDebug();
 }
 
@@ -453,4 +524,7 @@ template struct precondition_op<double, base_device::DEVICE_GPU>;
 template struct normalize_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct normalize_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct normalize_op<double, base_device::DEVICE_GPU>;
+template struct refresh_hcc_scc_vcc_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct refresh_hcc_scc_vcc_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct refresh_hcc_scc_vcc_op<double, base_device::DEVICE_GPU>;
 }
\ No newline at end of file
diff --git a/source/source_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu b/source/source_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu
index 095cd4deff..b7bbeb7494 100644
--- a/source/source_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu
+++ b/source/source_hsolver/kernels/rocm/bpcg_kernel_op.hip.cu
@@ -182,7 +182,7 @@ __global__ void apply_eigenvalues_kernel(
 {
     int m = blockIdx.x;
     int idx = threadIdx.x + blockIdx.y * blockDim.x;
-    
+
     if (m < notconv && idx < nbase) {
         result[m * nbase_x + idx] = eigenvalues[m] * vectors[m * nbase_x + idx];
     }
@@ -199,7 +199,7 @@ __global__ void precondition_kernel(
 {
     int m = blockIdx.x;
     int i = threadIdx.x + blockIdx.y * blockDim.x;
-    
+
     if (m < notconv && i < dim) {
         Real x = abs(precondition[i] - eigenvalues[m]);
         Real pre = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
@@ -218,17 +218,17 @@ __global__ void normalize_kernel(
     int m = blockIdx.x;
     int tid = threadIdx.x;
     __shared__ Real sum[THREAD_PER_BLOCK];
-    
+
     sum[tid] = 0.0;
-    
+
     // Calculate the sum for normalization
     for (int i = tid; i < dim; i += THREAD_PER_BLOCK) {
         auto val = psi_iter[(nbase + m) * dim + i];
         sum[tid] += (val * thrust::conj(val)).real();
     }
-    
+
     __syncthreads();
-    
+
     // Parallel reduction in shared memory
     for (int s = THREAD_PER_BLOCK/2; s > 0; s >>= 1) {
         if (tid < s) {
@@ -236,20 +236,39 @@ __global__ void normalize_kernel(
         }
         __syncthreads();
     }
-    
+
     Real norm = sqrt(sum[0]);
-    
+
     // Normalize the vector
     for (int i = tid; i < dim; i += THREAD_PER_BLOCK) {
         psi_iter[(nbase + m) * dim + i] /= norm;
     }
-    
+
     // Store the norm if needed
     if (tid == 0 && psi_norm != nullptr) {
         psi_norm[m] = norm;
     }
 }
 
+template <typename T, typename Real>
+__global__ void refresh_hcc_scc_vcc_kernel(
+        const int n,
+        T *hcc,
+        T *scc,
+        T *vcc,
+        const int ldh,
+        const Real *eigenvalue,
+        const T one)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n)
+    {
+        hcc[i * ldh + i] = eigenvalue[i];
+        scc[i * ldh + i] = one;
+        vcc[i * ldh + i] = one;
+    }
+}
+
 template <typename T>
 void line_minimize_with_block_op<T, base_device::DEVICE_GPU>::operator()(T* grad_out,
                                                                          T* hgrad_out,
@@ -306,15 +325,15 @@ void apply_eigenvalues_op<T, base_device::DEVICE_GPU>::operator()(const int& nba
 {
     const int threads_per_block = 256;
     const int blocks_per_grid_y = (nbase + threads_per_block - 1) / threads_per_block;
-    
+
     dim3 grid(notconv, blocks_per_grid_y);
-    
+
     auto vec_complex = reinterpret_cast<const thrust::complex<Real>*>(vectors);
     auto res_complex = reinterpret_cast<thrust::complex<Real>*>(result);
-    
+
     apply_eigenvalues_kernel<Real><<<grid, threads_per_block>>>(
         vec_complex, res_complex, eigenvalues, nbase, nbase_x, notconv);
-    
+
     hipCheckOnDebug();
 }
 
@@ -328,14 +347,14 @@ void precondition_op<T, base_device::DEVICE_GPU>::operator()(const int& dim,
 {
     const int threads_per_block = 256;
     const int blocks_per_grid_y = (dim + threads_per_block - 1) / threads_per_block;
-    
+
     dim3 grid(notconv, blocks_per_grid_y);
-    
+
     auto psi_complex = reinterpret_cast<thrust::complex<Real>*>(psi_iter);
-    
+
     precondition_kernel<Real><<<grid, threads_per_block>>>(
         psi_complex, precondition, eigenvalues, dim, nbase, notconv);
-    
+
     hipCheckOnDebug();
 }
 
@@ -347,10 +366,62 @@ void normalize_op<T, base_device::DEVICE_GPU>::operator()(const int& dim,
                                                         Real* psi_norm)
 {
     auto psi_complex = reinterpret_cast<thrust::complex<Real>*>(psi_iter);
-    
+
     normalize_kernel<Real><<<notconv, THREAD_PER_BLOCK>>>(
         psi_complex, psi_norm, dim, nbase, notconv);
-    
+
+    hipCheckOnDebug();
+}
+
+template <>
+void refresh_hcc_scc_vcc_op<double, base_device::DEVICE_GPU>::operator()(const int &n,
+                  double *hcc,
+                  double *scc,
+                  double *vcc,
+                  const int &ldh,
+                  const double *eigenvalue,
+                  const double& one)
+{
+    int thread = 512;
+    int block = (n + thread - 1) / thread;
+    refresh_hcc_scc_vcc_kernel<double, double> <<<block, thread >>> (n, hcc, scc, vcc, ldh, eigenvalue, one);
+
+    hipCheckOnDebug();
+}
+
+template <>
+void refresh_hcc_scc_vcc_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const int &n,
+                  std::complex<float> *hcc,
+                  std::complex<float> *scc,
+                  std::complex<float> *vcc,
+                  const int &ldh,
+                  const float *eigenvalue,
+                  const std::complex<float>& one)
+{
+    int thread = 512;
+    int block = (n + thread - 1) / thread;
+    refresh_hcc_scc_vcc_kernel<thrust::complex<float>, float> <<<block, thread >>> (n, reinterpret_cast<thrust::complex<float>*>(hcc),
+                    reinterpret_cast<thrust::complex<float>*>(scc), reinterpret_cast<thrust::complex<float>*>(vcc), ldh, eigenvalue,
+                    thrust::complex<float>(one));
+
+    hipCheckOnDebug();
+}
+
+template <>
+void refresh_hcc_scc_vcc_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const int &n,
+                  std::complex<double> *hcc,
+                  std::complex<double> *scc,
+                  std::complex<double> *vcc,
+                  const int &ldh,
+                  const double *eigenvalue,
+                  const std::complex<double>& one)
+{
+    int thread = 512;
+    int block = (n + thread - 1) / thread;
+    refresh_hcc_scc_vcc_kernel<thrust::complex<double>, double> <<<block, thread >>> (n, reinterpret_cast<thrust::complex<double>*>(hcc),
+                    reinterpret_cast<thrust::complex<double>*>(scc), reinterpret_cast<thrust::complex<double>*>(vcc), ldh, eigenvalue,
+                    thrust::complex<double>(one));
+
     hipCheckOnDebug();
 }
 
@@ -367,4 +438,7 @@ template struct precondition_op<double, base_device::DEVICE_GPU>;
 template struct normalize_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct normalize_op<std::complex<double>, base_device::DEVICE_GPU>;
 template struct normalize_op<double, base_device::DEVICE_GPU>;
+template struct refresh_hcc_scc_vcc_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct refresh_hcc_scc_vcc_op<std::complex<double>, base_device::DEVICE_GPU>;
+template struct refresh_hcc_scc_vcc_op<double, base_device::DEVICE_GPU>;
 }
\ No newline at end of file

From 7d0a07ecafeedc3e89a8dc26f66b6cb94e25daae Mon Sep 17 00:00:00 2001
From: "tianxiang.wang@metax-tech.com" <tianxiang.wang@metax-tech.com>
Date: Fri, 5 Sep 2025 06:40:09 +0000
Subject: [PATCH 2/5] =?UTF-8?q?Perf:=20reduce=20memory=20allocation=20and?=
 =?UTF-8?q?=20copy=20in=20Diago=5FDavSubspace::diag=5Fzhegvx=20Signed-off-?=
 =?UTF-8?q?by=EF=BC=9ATianxiang=20Wang<tianxiang.wang@metax-tech.com>,=20C?=
 =?UTF-8?q?ontributed=20under=20MetaX=20Integrated=20Circuits=20(Shanghai)?=
 =?UTF-8?q?=20Co.,=20Ltd.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 source/source_hsolver/diago_dav_subspace.cpp  | 33 +++----------------
 source/source_hsolver/diago_dav_subspace.h    |  1 +
 .../source_hsolver/kernels/cuda/dngvd_op.cu   |  2 +-
 .../kernels/rocm/dngvd_op.hip.cu              | 12 +++----
 4 files changed, 13 insertions(+), 35 deletions(-)

diff --git a/source/source_hsolver/diago_dav_subspace.cpp b/source/source_hsolver/diago_dav_subspace.cpp
index fe89b23377..7a78cef163 100644
--- a/source/source_hsolver/diago_dav_subspace.cpp
+++ b/source/source_hsolver/diago_dav_subspace.cpp
@@ -76,6 +76,7 @@ Diago_DavSubspace<T, Device>::Diago_DavSubspace(const std::vector<Real>& precond
     {
         resmem_real_op()(this->d_precondition, nbasis_in);
         // syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, this->precondition.data(), nbasis_in);
+        base_device::memory::resize_memory_op<T, Device>()(this->d_scc, this->nbase_x * this->nbase_x);
         resmem_real_op()(this->d_eigenvalue, this->nbase_x);
     }
 #endif
@@ -95,6 +96,7 @@ Diago_DavSubspace<T, Device>::~Diago_DavSubspace()
     if (this->device == base_device::GpuDevice)
     {
         delmem_real_op()(this->d_precondition);
+        delmem_complex_op()(this->d_scc);
         delmem_real_op()(this->d_eigenvalue);
     }
 #endif
@@ -546,34 +548,9 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
 #if defined(__CUDA) || defined(__ROCM)
         if (this->diag_comm.rank == 0)
         {
-            Real* eigenvalue_gpu = nullptr;
-            resmem_real_op()(eigenvalue_gpu, this->nbase_x);
-
-            syncmem_var_h2d_op()(eigenvalue_gpu, (*eigenvalue_iter).data(), this->nbase_x);
-
-            T* hcc_gpu = nullptr;
-            T* scc_gpu = nullptr;
-            T* vcc_gpu = nullptr;
-            base_device::memory::resize_memory_op<T, Device>()(hcc_gpu, nbase * nbase);
-            base_device::memory::resize_memory_op<T, Device>()(scc_gpu, nbase * nbase);
-            base_device::memory::resize_memory_op<T, Device>()(vcc_gpu, nbase * nbase);
-            for(int i=0;i<nbase;i++)
-            {
-                base_device::memory::synchronize_memory_op<T, Device, Device>()(hcc_gpu + i * nbase, hcc + i * nbase_x, nbase);
-                base_device::memory::synchronize_memory_op<T, Device, Device>()(scc_gpu + i * nbase, scc + i * nbase_x, nbase);
-            }
-            dngvd_op<T, Device>()(this->ctx, nbase, nbase, hcc_gpu, scc_gpu, eigenvalue_gpu, vcc_gpu);
-            for(int i=0;i<nbase;i++)
-            {
-                base_device::memory::synchronize_memory_op<T, Device, Device>()(vcc + i * nbase_x, vcc_gpu + i * nbase, nbase);
-            }
-            delmem_complex_op()(hcc_gpu);
-            delmem_complex_op()(scc_gpu);
-            delmem_complex_op()(vcc_gpu);
-
-            syncmem_var_d2h_op()((*eigenvalue_iter).data(), eigenvalue_gpu, this->nbase_x);
-
-            delmem_real_op()(eigenvalue_gpu);
+            base_device::memory::synchronize_memory_op<T, Device, Device>()(this->d_scc, scc, nbase * this->nbase_x);
+            dngvd_op<T, Device>()(this->ctx, nbase, this->nbase_x, this->hcc, this->d_scc, this->d_eigenvalue, this->vcc);
+            syncmem_var_d2h_op()((*eigenvalue_iter).data(), this->d_eigenvalue, this->nbase_x);
         }
 #endif
     }
diff --git a/source/source_hsolver/diago_dav_subspace.h b/source/source_hsolver/diago_dav_subspace.h
index 3d76bbf225..ba5cf34e50 100644
--- a/source/source_hsolver/diago_dav_subspace.h
+++ b/source/source_hsolver/diago_dav_subspace.h
@@ -94,6 +94,7 @@ class Diago_DavSubspace
     /// Eigenvectors on the reduced basis
     T* vcc = nullptr;
 
+    T* d_scc = nullptr;
     Real* d_eigenvalue = nullptr;
 
     /// device type of psi
diff --git a/source/source_hsolver/kernels/cuda/dngvd_op.cu b/source/source_hsolver/kernels/cuda/dngvd_op.cu
index 4ce3d9a1d0..5c1ef8ccab 100644
--- a/source/source_hsolver/kernels/cuda/dngvd_op.cu
+++ b/source/source_hsolver/kernels/cuda/dngvd_op.cu
@@ -216,7 +216,7 @@ struct dngvd_op<T, base_device::DEVICE_GPU>
                     Real* W,    // eigenvalue
                     T* V)
     {
-        assert(nstart == ldh);
+        // assert(nstart == ldh);
         // A to V
         cudaErrcheck(cudaMemcpy(V, A, sizeof(T) * ldh * nstart, cudaMemcpyDeviceToDevice));
         xhegvd_wrapper(CUBLAS_FILL_MODE_UPPER, nstart, V, ldh,
diff --git a/source/source_hsolver/kernels/rocm/dngvd_op.hip.cu b/source/source_hsolver/kernels/rocm/dngvd_op.hip.cu
index a359ccda87..6730e0f936 100644
--- a/source/source_hsolver/kernels/rocm/dngvd_op.hip.cu
+++ b/source/source_hsolver/kernels/rocm/dngvd_op.hip.cu
@@ -8,7 +8,7 @@ namespace hsolver {
 // NOTE: mimicked from ../cuda/dngvd_op.cu for three dngvd_op
 
 static hipsolverHandle_t hipsolver_H = nullptr;
-// Test on DCU platform. When nstart is greater than 234, code on DCU performs better. 
+// Test on DCU platform. When nstart is greater than 234, code on DCU performs better.
 const int N_DCU = 234;
 
 void createGpuSolverHandle() {
@@ -97,7 +97,7 @@ void dngvd_op<double, base_device::DEVICE_GPU>::operator()(const base_device::DE
         hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(double) * eigenvalue.size(), hipMemcpyHostToDevice));
     }
 
-    
+
 }
 #endif // __LCAO
 
@@ -112,7 +112,7 @@ void dngvd_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const ba
 {
     // copied from ../cuda/dngvd_op.cu, "dngvd_op"
     assert(nstart == ldh);
-    
+
     if (nstart > N_DCU){
         hipErrcheck(hipMemcpy(_vcc, _hcc, sizeof(std::complex<float>) * ldh * nstart, hipMemcpyDeviceToDevice));
         // now vcc contains hcc
@@ -170,7 +170,7 @@ void dngvd_op<std::complex<float>, base_device::DEVICE_GPU>::operator()(const ba
         hipErrcheck(hipMemcpy(_eigenvalue, eigenvalue.data(), sizeof(float) * eigenvalue.size(), hipMemcpyHostToDevice));
     }
 
-    
+
 }
 
 template <>
@@ -184,7 +184,7 @@ void dngvd_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const b
                                                                         )
 {
     // copied from ../cuda/dngvd_op.cu, "dngvd_op"
-    assert(nstart == ldh);
+    // assert(nstart == ldh);
 
     // save a copy of scc in case the diagonalization fails
     if (nstart > N_DCU){
@@ -253,7 +253,7 @@ void dngvd_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const b
 
 
 
-    
+
 }
 
 #ifdef __LCAO

From 675dd233c6911fd5dd297d2d072e322459313acb Mon Sep 17 00:00:00 2001
From: "tianxiang.wang@metax-tech.com" <tianxiang.wang@metax-tech.com>
Date: Mon, 8 Sep 2025 01:45:03 +0000
Subject: [PATCH 3/5] =?UTF-8?q?Perf:=20Replace=20loop-based=202D=20copy=20?=
 =?UTF-8?q?and=20memset=20with=20memcpy=5F2d=5Fop,=20memset=5F2d=5Fop=20Si?=
 =?UTF-8?q?gned-off-by=EF=BC=9ATianxiang=20Wang<tianxiang.wang@metax-tech.?=
 =?UTF-8?q?com>,=20Contributed=20under=20MetaX=20Integrated=20Circuits=20(?=
 =?UTF-8?q?Shanghai)=20Co.,=20Ltd.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../module_device/cuda/memory_op.cu           |  68 +++++++++++
 .../source_base/module_device/memory_op.cpp   | 113 ++++++++++++++++++
 source/source_base/module_device/memory_op.h  | 113 +++++++++++++++++-
 source/source_hsolver/diago_dav_subspace.cpp  |  21 +---
 source/source_hsolver/diago_dav_subspace.h    |   2 +
 5 files changed, 300 insertions(+), 17 deletions(-)

diff --git a/source/source_base/module_device/cuda/memory_op.cu b/source/source_base/module_device/cuda/memory_op.cu
index 43eae7975c..ecc972c3ba 100644
--- a/source/source_base/module_device/cuda/memory_op.cu
+++ b/source/source_base/module_device/cuda/memory_op.cu
@@ -85,6 +85,16 @@ void set_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(FPTYPE* arr,
     cudaErrcheck(cudaMemset(arr, var, sizeof(FPTYPE) * size));
 }
 
+template <typename FPTYPE>
+void set_memory_2d_op<FPTYPE, base_device::DEVICE_GPU>::operator()(FPTYPE* arr,
+                                                                   const size_t pitch,
+                                                                   const int var,
+                                                                   const size_t width,
+                                                                   const size_t height)
+{
+    cudaErrcheck(cudaMemset2D(arr, sizeof(FPTYPE) * pitch , var, sizeof(FPTYPE) * width, height));
+}
+
 template <typename FPTYPE>
 void synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_GPU>::operator()(
     FPTYPE* arr_out,
@@ -112,6 +122,42 @@ void synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_
     cudaErrcheck(cudaMemcpy(arr_out, arr_in, sizeof(FPTYPE) * size, cudaMemcpyDeviceToDevice));
 }
 
+template <typename FPTYPE>
+void synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_GPU>::operator()(
+    FPTYPE* arr_out,
+    const size_t dpitch,
+    const FPTYPE* arr_in,
+    const size_t spitch,
+    const size_t width,
+    const size_t height)
+{
+    cudaErrcheck(cudaMemcpy2D(arr_out, dpitch * sizeof(FPTYPE), arr_in, spitch * sizeof(FPTYPE), width * sizeof(FPTYPE), height, cudaMemcpyDeviceToHost));
+}
+
+template <typename FPTYPE>
+void synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_CPU>::operator()(
+    FPTYPE* arr_out,
+    const size_t dpitch,
+    const FPTYPE* arr_in,
+    const size_t spitch,
+    const size_t width,
+    const size_t height)
+{
+    cudaErrcheck(cudaMemcpy2D(arr_out, dpitch * sizeof(FPTYPE), arr_in, spitch * sizeof(FPTYPE), width * sizeof(FPTYPE), height, cudaMemcpyHostToDevice));
+}
+
+template <typename FPTYPE>
+void synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_GPU>::operator()(
+    FPTYPE* arr_out,
+    const size_t dpitch,
+    const FPTYPE* arr_in,
+    const size_t spitch,
+    const size_t width,
+    const size_t height)
+{
+    cudaErrcheck(cudaMemcpy2D(arr_out, dpitch * sizeof(FPTYPE), arr_in, spitch * sizeof(FPTYPE), width * sizeof(FPTYPE), height, cudaMemcpyDeviceToDevice));
+}
+
 template <typename FPTYPE_out, typename FPTYPE_in>
 struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_GPU>
 {
@@ -196,6 +242,12 @@ template struct set_memory_op<double, base_device::DEVICE_GPU>;
 template struct set_memory_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct set_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
 
+template struct set_memory_2d_op<int, base_device::DEVICE_GPU>;
+template struct set_memory_2d_op<float, base_device::DEVICE_GPU>;
+template struct set_memory_2d_op<double, base_device::DEVICE_GPU>;
+template struct set_memory_2d_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct set_memory_2d_op<std::complex<double>, base_device::DEVICE_GPU>;
+
 template struct synchronize_memory_op<int, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
 template struct synchronize_memory_op<int, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
 template struct synchronize_memory_op<int, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
@@ -212,6 +264,22 @@ template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_
 template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
 template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
 
+template struct synchronize_memory_2d_op<int, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<int, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
+template struct synchronize_memory_2d_op<int, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<float, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<float, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
+template struct synchronize_memory_2d_op<float, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<double, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<double, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
+template struct synchronize_memory_2d_op<double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
+template struct synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
+template struct synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
+
 template struct cast_memory_op<float, float, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
 template struct cast_memory_op<double, double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
 template struct cast_memory_op<float, double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
diff --git a/source/source_base/module_device/memory_op.cpp b/source/source_base/module_device/memory_op.cpp
index 189014c981..155bf46983 100644
--- a/source/source_base/module_device/memory_op.cpp
+++ b/source/source_base/module_device/memory_op.cpp
@@ -55,6 +55,18 @@ struct set_memory_op<FPTYPE, base_device::DEVICE_CPU>
     }
 };
 
+template <typename FPTYPE>
+struct set_memory_2d_op<FPTYPE, base_device::DEVICE_CPU>
+{
+    void operator()(FPTYPE* arr, const size_t pitch, const int var, const size_t width, const size_t height)
+    {
+        for (size_t i = 0; i < height; i++){
+            set_memory_op<FPTYPE, base_device::DEVICE_CPU>()(arr + i * pitch, var, width);
+        }
+    }
+};
+
+
 template <typename FPTYPE>
 struct synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_CPU>
 {
@@ -70,6 +82,23 @@ struct synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVIC
     }
 };
 
+template <typename FPTYPE>
+struct synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_CPU>
+{
+    void operator()(FPTYPE* arr_out,
+                    const size_t dpitch,
+                    const FPTYPE* arr_in,
+                    const size_t spitch,
+                    const size_t width,
+                    const size_t height)
+    {
+        for (int i = 0; i < height; i++){
+            synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_CPU>()(
+            arr_out + i * dpitch, arr_in + i * spitch, width);
+        }
+    }
+};
+
 template <typename FPTYPE_out, typename FPTYPE_in>
 struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_CPU, base_device::DEVICE_CPU>
 {
@@ -108,12 +137,24 @@ template struct set_memory_op<double, base_device::DEVICE_CPU>;
 template struct set_memory_op<std::complex<float>, base_device::DEVICE_CPU>;
 template struct set_memory_op<std::complex<double>, base_device::DEVICE_CPU>;
 
+template struct set_memory_2d_op<int, base_device::DEVICE_CPU>;
+template struct set_memory_2d_op<float, base_device::DEVICE_CPU>;
+template struct set_memory_2d_op<double, base_device::DEVICE_CPU>;
+template struct set_memory_2d_op<std::complex<float>, base_device::DEVICE_CPU>;
+template struct set_memory_2d_op<std::complex<double>, base_device::DEVICE_CPU>;
+
 template struct synchronize_memory_op<int, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
 template struct synchronize_memory_op<float, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
 template struct synchronize_memory_op<double, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
 template struct synchronize_memory_op<std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
 template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
 
+template struct synchronize_memory_2d_op<int, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
+template struct synchronize_memory_2d_op<float, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
+template struct synchronize_memory_2d_op<double, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
+template struct synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
+template struct synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
+
 template struct cast_memory_op<float, float, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
 template struct cast_memory_op<double, double, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
 template struct cast_memory_op<float, double, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
@@ -167,6 +208,14 @@ struct set_memory_op<FPTYPE, base_device::DEVICE_GPU>
     }
 };
 
+template <typename FPTYPE>
+struct set_memory_2d_op<FPTYPE, base_device::DEVICE_GPU>
+{
+    void operator()(FPTYPE* arr, const size_t pitch, const int var, const size_t width, const size_t height)
+    {
+    }
+};
+
 template <typename FPTYPE>
 struct synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_GPU>
 {
@@ -197,6 +246,48 @@ struct synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVIC
     }
 };
 
+template <typename FPTYPE>
+struct synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_GPU>
+{
+    void operator()(const Device_in* dev_in,
+                    FPTYPE* arr_out,
+                    const size_t dpitch,
+                    const FPTYPE* arr_in,
+                    const size_t spitch,
+                    const size_t width,
+                    const size_t height)
+    {
+    }
+};
+
+template <typename FPTYPE>
+struct synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_CPU>
+{
+    void operator()(const Device_in* dev_in,
+                    FPTYPE* arr_out,
+                    const size_t dpitch,
+                    const FPTYPE* arr_in,
+                    const size_t spitch,
+                    const size_t width,
+                    const size_t height)
+    {
+    }
+};
+
+template <typename FPTYPE>
+struct synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_GPU>
+{
+    void operator()(const Device_in* dev_in,
+                    FPTYPE* arr_out,
+                    const size_t dpitch,
+                    const FPTYPE* arr_in,
+                    const size_t spitch,
+                    const size_t width,
+                    const size_t height)
+    {
+    }
+};
+
 template <typename FPTYPE_out, typename FPTYPE_in>
 struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_GPU>
 {
@@ -247,6 +338,12 @@ template struct set_memory_op<double, base_device::DEVICE_GPU>;
 template struct set_memory_op<std::complex<float>, base_device::DEVICE_GPU>;
 template struct set_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
 
+template struct set_memory_2d_op<int, base_device::DEVICE_GPU>;
+template struct set_memory_2d_op<float, base_device::DEVICE_GPU>;
+template struct set_memory_2d_op<double, base_device::DEVICE_GPU>;
+template struct set_memory_2d_op<std::complex<float>, base_device::DEVICE_GPU>;
+template struct set_memory_2d_op<std::complex<double>, base_device::DEVICE_GPU>;
+
 template struct synchronize_memory_op<int, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
 template struct synchronize_memory_op<int, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
 template struct synchronize_memory_op<int, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
@@ -263,6 +360,22 @@ template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_
 template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
 template struct synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
 
+template struct synchronize_memory_2d_op<int, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<int, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
+template struct synchronize_memory_2d_op<int, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<float, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<float, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
+template struct synchronize_memory_2d_op<float, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<double, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<double, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
+template struct synchronize_memory_2d_op<double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
+template struct synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
+template struct synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
+template struct synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
+
 template struct cast_memory_op<float, float, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
 template struct cast_memory_op<double, double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
 template struct cast_memory_op<float, double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>;
diff --git a/source/source_base/module_device/memory_op.h b/source/source_base/module_device/memory_op.h
index 30182649d3..c24acbb024 100644
--- a/source/source_base/module_device/memory_op.h
+++ b/source/source_base/module_device/memory_op.h
@@ -40,6 +40,22 @@ struct set_memory_op
     void operator()(FPTYPE* arr, const int var, const size_t size);
 };
 
+template <typename FPTYPE, typename Device>
+struct set_memory_2d_op
+{
+    /// @brief memset2D for multi-device
+    ///
+    /// Input Parameters
+    /// \param var : the specified constant value
+    /// \param pitch : Pitch in elements  of 2D device memory
+    /// \param width : Width of matrix set (columns in elements)
+    /// \param height : Height of matrix set (rows)
+    ///
+    /// Output Parameters
+    /// \param arr : output array initialized by the input value
+    void operator()(FPTYPE* arr, const size_t pitch, const int var, const size_t width, const size_t height);
+};
+
 template <typename FPTYPE, typename Device_out, typename Device_in>
 struct synchronize_memory_op
 {
@@ -56,6 +72,28 @@ struct synchronize_memory_op
                     const size_t size);
 };
 
+template <typename FPTYPE, typename Device_out, typename Device_in>
+struct synchronize_memory_2d_op
+{
+    /// @brief memcpy2D for multi-device
+    ///
+    /// Input Parameters
+    /// \param arr_in : input array
+    /// \param dpitch : Pitch in elements of destination memory
+    /// \param spitch : Pitch in elements  of source memory
+    /// \param width : Width of matrix transfer (columns in elements)
+    /// \param height : Height of matrix transfer (rows)
+    ///
+    /// Output Parameters
+    /// \param arr_out : output array initialized by the input array
+    void operator()(FPTYPE* arr_out,
+                    const size_t dpitch,
+                    const FPTYPE* arr_in,
+                    const size_t spitch,
+                    const size_t width,
+                    const size_t height);
+};
+
 template <typename FPTYPE_out, typename FPTYPE_in, typename Device_out, typename Device_in>
 struct cast_memory_op
 {
@@ -113,6 +151,12 @@ struct set_memory_op<FPTYPE, base_device::DEVICE_GPU>
     void operator()(FPTYPE* arr, const int var, const size_t size);
 };
 
+template <typename FPTYPE>
+struct set_memory_2d_op<FPTYPE, base_device::DEVICE_GPU>
+{
+    void operator()(FPTYPE* arr, const size_t pitch, const int var, const size_t width, const size_t height);
+};
+
 template <typename FPTYPE>
 struct synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_GPU>
 {
@@ -133,7 +177,38 @@ struct synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVIC
     void operator()(FPTYPE* arr_out,
                     const FPTYPE* arr_in,
                     const size_t size);
-    
+
+};
+
+template <typename FPTYPE>
+struct synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_GPU>
+{
+    void operator()(FPTYPE* arr_out,
+                    const size_t dpitch,
+                    const FPTYPE* arr_in,
+                    const size_t spitch,
+                    const size_t width,
+                    const size_t height);
+};
+template <typename FPTYPE>
+struct synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_CPU>
+{
+    void operator()(FPTYPE* arr_out,
+                    const size_t dpitch,
+                    const FPTYPE* arr_in,
+                    const size_t spitch,
+                    const size_t width,
+                    const size_t height);
+};
+template <typename FPTYPE>
+struct synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_GPU>
+{
+    void operator()(FPTYPE* arr_out,
+                    const size_t dpitch,
+                    const FPTYPE* arr_in,
+                    const size_t spitch,
+                    const size_t width,
+                    const size_t height);
 };
 
 template <typename FPTYPE>
@@ -194,6 +269,16 @@ using setmem_dd_op = base_device::memory::set_memory_op<double, base_device::DEV
 using setmem_cd_op = base_device::memory::set_memory_op<std::complex<float>, base_device::DEVICE_GPU>;
 using setmem_zd_op = base_device::memory::set_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
 
+using setmem_sh_2d_op = base_device::memory::set_memory_2d_op<float, base_device::DEVICE_CPU>;
+using setmem_dh_2d_op = base_device::memory::set_memory_2d_op<double, base_device::DEVICE_CPU>;
+using setmem_ch_2d_op = base_device::memory::set_memory_2d_op<std::complex<float>, base_device::DEVICE_CPU>;
+using setmem_zh_2d_op = base_device::memory::set_memory_2d_op<std::complex<double>, base_device::DEVICE_CPU>;
+
+using setmem_sd_2d_op = base_device::memory::set_memory_2d_op<float, base_device::DEVICE_GPU>;
+using setmem_dd_2d_op = base_device::memory::set_memory_2d_op<double, base_device::DEVICE_GPU>;
+using setmem_cd_2d_op = base_device::memory::set_memory_2d_op<std::complex<float>, base_device::DEVICE_GPU>;
+using setmem_zd_2d_op = base_device::memory::set_memory_2d_op<std::complex<double>, base_device::DEVICE_GPU>;
+
 using delmem_sh_op = base_device::memory::delete_memory_op<float, base_device::DEVICE_CPU>;
 using delmem_dh_op = base_device::memory::delete_memory_op<double, base_device::DEVICE_CPU>;
 using delmem_ch_op = base_device::memory::delete_memory_op<std::complex<float>, base_device::DEVICE_CPU>;
@@ -230,6 +315,32 @@ using syncmem_z2z_h2d_op
 using syncmem_z2z_d2h_op
     = base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
 
+using syncmem_c2c_h2h_op
+    = base_device::memory::synchronize_memory_op<std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
+using syncmem_c2c_h2d_op
+    = base_device::memory::synchronize_memory_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
+using syncmem_c2c_d2h_op
+    = base_device::memory::synchronize_memory_op<std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
+using syncmem_z2z_h2h_op
+    = base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
+using syncmem_z2z_h2d_op
+    = base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
+using syncmem_z2z_d2h_op
+    = base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
+
+using syncmem_c2c_h2h_2d_op
+    = base_device::memory::synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
+using syncmem_c2c_h2d_2d_op
+    = base_device::memory::synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
+using syncmem_c2c_d2h_2d_op
+    = base_device::memory::synchronize_memory_2d_op<std::complex<float>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
+using syncmem_z2z_h2h_2d_op
+    = base_device::memory::synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
+using syncmem_z2z_h2d_2d_op
+    = base_device::memory::synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
+using syncmem_z2z_d2h_2d_op
+    = base_device::memory::synchronize_memory_2d_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>;
+
 using castmem_s2d_h2h_op
     = base_device::memory::cast_memory_op<double, float, base_device::DEVICE_CPU, base_device::DEVICE_CPU>;
 using castmem_s2d_h2d_op
diff --git a/source/source_hsolver/diago_dav_subspace.cpp b/source/source_hsolver/diago_dav_subspace.cpp
index 7a78cef163..d427f246aa 100644
--- a/source/source_hsolver/diago_dav_subspace.cpp
+++ b/source/source_hsolver/diago_dav_subspace.cpp
@@ -129,13 +129,10 @@ int Diago_DavSubspace<T, Device>::diag_once(const HPsiFunc& hpsi_func,
 
     ModuleBase::timer::tick("Diago_DavSubspace", "first");
 
+    syncmem_complex_2d_op()(this->psi_in_iter, this->dim, psi_in, psi_in_dmax, this->dim, this->n_band);
     for (int m = 0; m < this->n_band; m++)
     {
         unconv[m] = m;
-
-        syncmem_complex_op()(this->psi_in_iter + m * this->dim,
-                             psi_in + m * psi_in_dmax,
-                             this->dim);
     }
 
     // compute h*psi_in_iter
@@ -247,12 +244,7 @@ int Diago_DavSubspace<T, Device>::diag_once(const HPsiFunc& hpsi_func,
                 // estimate of the eigenvectors and set the basis dimension to N;
 
                 // update this->psi_in_iter according to psi_in
-                for (size_t i = 0; i < this->n_band; i++)
-                {
-                    syncmem_complex_op()(this->psi_in_iter + i * this->dim,
-                                         psi_in + i * psi_in_dmax,
-                                         this->dim);
-                }
+                syncmem_complex_2d_op()(this->psi_in_iter, this->dim, psi_in, psi_in_dmax, this->dim, this->n_band);
 
                 this->refresh(this->dim,
                               this->n_band,
@@ -722,12 +714,9 @@ void Diago_DavSubspace<T, Device>::refresh(const int& dim,
     nbase = nband;
 
     // set hcc/scc/vcc to 0
-    for (size_t i = 0; i < nbase; i++)
-    {
-        setmem_complex_op()(&hcc[this->nbase_x * i], 0, nbase);
-        setmem_complex_op()(&scc[this->nbase_x * i], 0, nbase);
-        setmem_complex_op()(&vcc[this->nbase_x * i], 0, nbase);
-    }
+    setmem_complex_2d_op()(hcc, this->nbase_x, 0, nbase, nbase);
+    setmem_complex_2d_op()(scc, this->nbase_x, 0, nbase, nbase);
+    setmem_complex_2d_op()(vcc, this->nbase_x, 0, nbase, nbase);
 
     if (this->device == base_device::GpuDevice)
     {
diff --git a/source/source_hsolver/diago_dav_subspace.h b/source/source_hsolver/diago_dav_subspace.h
index ba5cf34e50..a2227949d4 100644
--- a/source/source_hsolver/diago_dav_subspace.h
+++ b/source/source_hsolver/diago_dav_subspace.h
@@ -179,6 +179,7 @@ class Diago_DavSubspace
     using delmem_real_op = base_device::memory::delete_memory_op<Real, Device>;
 #endif
     using setmem_real_op = base_device::memory::set_memory_op<Real, Device>;
+    using setmem_complex_2d_op = base_device::memory::set_memory_2d_op<T, Device>;
 
     using resmem_real_h_op = base_device::memory::resize_memory_op<Real, base_device::DEVICE_CPU>;
     using delmem_real_h_op = base_device::memory::delete_memory_op<Real, base_device::DEVICE_CPU>;
@@ -187,6 +188,7 @@ class Diago_DavSubspace
     using syncmem_var_h2d_op = base_device::memory::synchronize_memory_op<Real, Device, base_device::DEVICE_CPU>;
     using syncmem_var_d2h_op = base_device::memory::synchronize_memory_op<Real, base_device::DEVICE_CPU, Device>;
     using syncmem_complex_op = base_device::memory::synchronize_memory_op<T, Device, Device>;
+    using syncmem_complex_2d_op = base_device::memory::synchronize_memory_2d_op<T, Device, Device>;
     using castmem_complex_op = base_device::memory::cast_memory_op<std::complex<double>, T, Device, Device>;
     using syncmem_h2d_op = base_device::memory::synchronize_memory_op<T, Device, base_device::DEVICE_CPU>;
     using syncmem_d2h_op = base_device::memory::synchronize_memory_op<T, base_device::DEVICE_CPU, Device>;

From 4d270a6575539db3303dd3a6232a22c45c7e0589 Mon Sep 17 00:00:00 2001
From: "tianxiang.wang@metax-tech.com" <tianxiang.wang@metax-tech.com>
Date: Mon, 8 Sep 2025 03:29:39 +0000
Subject: [PATCH 4/5] =?UTF-8?q?Perf:=20use=20warp=20reduce=20instead=20of?=
 =?UTF-8?q?=20shared=20memory=20for=20better=20efficiency=20Signed-off-by?=
 =?UTF-8?q?=EF=BC=9ATianxiang=20Wang<tianxiang.wang@metax-tech.com>,=20Con?=
 =?UTF-8?q?tributed=20under=20MetaX=20Integrated=20Circuits=20(Shanghai)?=
 =?UTF-8?q?=20Co.,=20Ltd.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../kernels/cuda/bpcg_kernel_op.cu            | 67 ++++++++++++-------
 1 file changed, 41 insertions(+), 26 deletions(-)

diff --git a/source/source_hsolver/kernels/cuda/bpcg_kernel_op.cu b/source/source_hsolver/kernels/cuda/bpcg_kernel_op.cu
index a8f45ad886..b0a08cc513 100644
--- a/source/source_hsolver/kernels/cuda/bpcg_kernel_op.cu
+++ b/source/source_hsolver/kernels/cuda/bpcg_kernel_op.cu
@@ -7,6 +7,8 @@ namespace hsolver
 {
 const int warp_size = 32;
 const int thread_per_block = 256;
+#define FULL_MASK 0xffffffff
+#define WARP_SIZE 32
 
 template <typename Real>
 __global__ void line_minimize_with_block(
@@ -282,6 +284,37 @@ __global__ void precondition_kernel(
     }
 }
 
+template <typename Real>
+__device__ Real warpReduceSum(Real val) {
+    for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1)
+        val += __shfl_down_sync(FULL_MASK, val, offset);
+    return val;
+}
+
+template <typename Real>
+__device__ Real blockReduceSum(Real val, volatile Real* shared) {
+    int lane = threadIdx.x % WARP_SIZE;
+    int wid  = threadIdx.x / WARP_SIZE;
+
+    val = warpReduceSum(val);
+
+    if (lane == 0)
+        shared[wid] = val;
+
+    __syncthreads();
+
+    Real sum = 0.0;
+    if (wid == 0) {
+        sum = (threadIdx.x < blockDim.x / 32) ? shared[lane] : 0.0;
+        sum = warpReduceSum(sum);
+        if (lane == 0) shared[0] = sum;
+    }
+
+    __syncthreads();
+    return shared[0];
+}
+
+
 template <typename Real>
 __global__ void normalize_kernel(
         thrust::complex<Real>* psi_iter,
@@ -292,38 +325,19 @@ __global__ void normalize_kernel(
 {
     int m = blockIdx.x;
     int tid = threadIdx.x;
-    __shared__ Real sum[thread_per_block];
+    extern __shared__ char s_char[];
+    Real* shared = reinterpret_cast<Real*>(s_char);
 
-    sum[tid] = 0.0;
+    Real local_sum = 0.0;
 
     // Calculate the sum for normalization
     for (int i = tid; i < dim; i += thread_per_block) {
         auto val = psi_iter[(nbase + m) * dim + i];
-        sum[tid] += (val * thrust::conj(val)).real();
+        local_sum += (val * thrust::conj(val)).real();
     }
 
-    __syncthreads();
-
-    // Parallel reduction in shared memory
-    for (int s = thread_per_block/2; s > warp_size; s >>= 1) {
-        if (tid < s) {
-            sum[tid] += sum[tid + s];
-        }
-        __syncthreads();
-    }
-
-    if (tid < warp_size) {
-        sum[tid] += sum[tid + 32]; __syncwarp();
-        sum[tid] += sum[tid + 16]; __syncwarp();
-        sum[tid] += sum[tid + 8]; __syncwarp();
-        sum[tid] += sum[tid + 4]; __syncwarp();
-        sum[tid] += sum[tid + 2]; __syncwarp();
-        sum[tid] += sum[tid + 1]; __syncwarp();
-    }
-
-    __syncthreads();
-
-    Real norm = sqrt(sum[0]);
+    Real l2_sq = blockReduceSum(local_sum, shared);
+    Real norm = sqrt(l2_sq);
 
     // Normalize the vector
     for (int i = tid; i < dim; i += thread_per_block) {
@@ -452,8 +466,9 @@ void normalize_op<T, base_device::DEVICE_GPU>::operator()(const int& dim,
                                                         Real* psi_norm)
 {
     auto psi_complex = reinterpret_cast<thrust::complex<Real>*>(psi_iter);
+    int sharedMemSize = (thread_per_block / WARP_SIZE) * sizeof(Real);
 
-    normalize_kernel<Real><<<notconv, thread_per_block>>>(
+    normalize_kernel<Real><<<notconv, thread_per_block, sharedMemSize, 0>>>(
         psi_complex, psi_norm, dim, nbase, notconv);
 
     cudaCheckOnDebug();

From 53323929cef6e150c45e3f8a459f57871564fe27 Mon Sep 17 00:00:00 2001
From: "tianxiang.wang@metax-tech.com" <tianxiang.wang@metax-tech.com>
Date: Mon, 8 Sep 2025 06:52:33 +0000
Subject: [PATCH 5/5] =?UTF-8?q?Fix=20compilation=20error=20Signed-off-by?=
 =?UTF-8?q?=EF=BC=9ATianxiang=20Wang<tianxiang.wang@metax-tech.com>,=20Con?=
 =?UTF-8?q?tributed=20under=20MetaX=20Integrated=20Circuits=20(Shanghai)?=
 =?UTF-8?q?=20Co.,=20Ltd.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 source/source_base/module_device/memory_op.cpp | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/source/source_base/module_device/memory_op.cpp b/source/source_base/module_device/memory_op.cpp
index 155bf46983..da22bc2ce6 100644
--- a/source/source_base/module_device/memory_op.cpp
+++ b/source/source_base/module_device/memory_op.cpp
@@ -249,8 +249,7 @@ struct synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVIC
 template <typename FPTYPE>
 struct synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_GPU>
 {
-    void operator()(const Device_in* dev_in,
-                    FPTYPE* arr_out,
+    void operator()(FPTYPE* arr_out,
                     const size_t dpitch,
                     const FPTYPE* arr_in,
                     const size_t spitch,
@@ -263,8 +262,7 @@ struct synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_GPU, base_device::DE
 template <typename FPTYPE>
 struct synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_CPU>
 {
-    void operator()(const Device_in* dev_in,
-                    FPTYPE* arr_out,
+    void operator()(FPTYPE* arr_out,
                     const size_t dpitch,
                     const FPTYPE* arr_in,
                     const size_t spitch,
@@ -277,8 +275,7 @@ struct synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_GPU, base_device::DE
 template <typename FPTYPE>
 struct synchronize_memory_2d_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_GPU>
 {
-    void operator()(const Device_in* dev_in,
-                    FPTYPE* arr_out,
+    void operator()(FPTYPE* arr_out,
                     const size_t dpitch,
                     const FPTYPE* arr_in,
                     const size_t spitch,