From 8e6cd2d6f9e5f96f83715cef066ea6f6d8973525 Mon Sep 17 00:00:00 2001
From: AsTonyshment <baotaoni@pku.edu.cn>
Date: Fri, 13 Mar 2026 20:27:12 +0800
Subject: [PATCH] Feature: Multi-GPU support for RT-TDDFT

---
 CMakeLists.txt                                |  20 +-
 cmake/SetupCuBlasMp.cmake                     |  78 +++
 cmake/SetupCuSolverMp.cmake                   |  25 +-
 .../source_esolver/esolver_ks_lcao_tddft.cpp  |  47 +-
 source/source_esolver/esolver_ks_lcao_tddft.h |   6 +-
 source/source_lcao/module_rt/CMakeLists.txt   |   2 +
 source/source_lcao/module_rt/band_energy.cpp  | 344 +++++++-----
 source/source_lcao/module_rt/band_energy.h    |   4 +-
 source/source_lcao/module_rt/evolve_elec.cpp  |  13 +-
 source/source_lcao/module_rt/evolve_elec.h    |  76 ++-
 source/source_lcao/module_rt/evolve_psi.cpp   |  29 +-
 source/source_lcao/module_rt/evolve_psi.h     |   5 +-
 .../module_rt/kernels/cublasmp_context.h      | 167 ++++++
 .../kernels/cuda/band_energy_kernel.cu        |  88 +++
 .../kernels/cuda/band_energy_kernel.cuh       |  31 ++
 .../module_rt/kernels/cuda/norm_psi_kernel.cu |  94 ++++
 .../kernels/cuda/norm_psi_kernel.cuh          |  30 +
 .../source_lcao/module_rt/middle_hamilt.cpp   | 188 ++++---
 source/source_lcao/module_rt/middle_hamilt.h  |   4 +-
 source/source_lcao/module_rt/norm_psi.cpp     | 448 ++++++++-------
 source/source_lcao/module_rt/norm_psi.h       |   4 +-
 source/source_lcao/module_rt/propagator.cpp   |  11 +-
 source/source_lcao/module_rt/propagator.h     |   7 +-
 .../source_lcao/module_rt/propagator_cn2.cpp  | 522 ++++++++++--------
 source/source_lcao/module_rt/upsi.cpp         | 184 +++---
 source/source_lcao/module_rt/upsi.h           |   4 +-
 26 files changed, 1668 insertions(+), 763 deletions(-)
 create mode 100644 cmake/SetupCuBlasMp.cmake
 create mode 100644 source/source_lcao/module_rt/kernels/cublasmp_context.h
 create mode 100644 source/source_lcao/module_rt/kernels/cuda/band_energy_kernel.cu
 create mode 100644 source/source_lcao/module_rt/kernels/cuda/band_energy_kernel.cuh
 create mode 100644 source/source_lcao/module_rt/kernels/cuda/norm_psi_kernel.cu
 create mode 100644 source/source_lcao/module_rt/kernels/cuda/norm_psi_kernel.cuh

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5745173cc1..0ccee1f673 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -449,10 +449,28 @@ if(USE_CUDA)
       set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=${OpenMP_CXX_FLAGS}" CACHE STRING "CUDA flags" FORCE)
     endif()
     if (ENABLE_CUSOLVERMP)
-      # Keep cuSolverMp discovery/linking logic in a dedicated module.
+      # Keep cuSOLVERMp discovery/linking logic in a dedicated module.
       include(cmake/SetupCuSolverMp.cmake)
       abacus_setup_cusolvermp(${ABACUS_BIN_NAME})
     endif()
+    if (ENABLE_CUBLASMP)
+      # Enforcement 1: cuBLASMp requires cuSOLVERMp to be enabled
+      if (NOT ENABLE_CUSOLVERMP)
+        message(FATAL_ERROR 
+          "ENABLE_CUBLASMP is set to ON, but ENABLE_CUSOLVERMP is OFF."
+          "In ABACUS, cuBLASMp support requires cuSOLVERMp to be enabled simultaneously."
+          "Please set -DENABLE_CUSOLVERMP=ON.")
+      endif()
+      # Enforcement 2: cuBLASMp 0.8.0+ is incompatible with CAL backend
+      # Note: _use_cal is defined inside abacus_setup_cusolvermp
+      if (_use_cal)
+        message(FATAL_ERROR 
+          "cuBLASMp 0.8.0+ requires NCCL Symmetric Memory, but cuSOLVERMp is using CAL backend."
+          "Please upgrade cuSOLVERMp to >= 0.7.0 to use NCCL for both.")
+      endif()
+      include(cmake/SetupCuBlasMp.cmake)
+      abacus_setup_cublasmp(${ABACUS_BIN_NAME})
+    endif()
   endif()
 endif()
 
diff --git a/cmake/SetupCuBlasMp.cmake b/cmake/SetupCuBlasMp.cmake
new file mode 100644
index 0000000000..7937a02936
--- /dev/null
+++ b/cmake/SetupCuBlasMp.cmake
@@ -0,0 +1,78 @@
+# =============================================================================
+# Configure cuBLASMp dependencies and linking for ABACUS
+# =============================================================================
+
+include_guard(GLOBAL)
+
+function(abacus_setup_cublasmp target_name)
+  add_compile_definitions(__CUBLASMP)
+
+  # 1. Search for cuBLASMp library and header files
+  # libcublasmp.so
+  find_library(CUBLASMP_LIBRARY NAMES cublasmp
+      HINTS ${CUBLASMP_PATH} ${NVHPC_ROOT_DIR}
+      PATH_SUFFIXES lib lib64 math_libs/lib math_libs/lib64)
+
+  # cublasmp.h
+  find_path(CUBLASMP_INCLUDE_DIR NAMES cublasmp.h
+      HINTS ${CUBLASMP_PATH} ${NVHPC_ROOT_DIR}
+      PATH_SUFFIXES include math_libs/include)
+
+  if(NOT CUBLASMP_LIBRARY OR NOT CUBLASMP_INCLUDE_DIR)
+    message(FATAL_ERROR
+      "cuBLASMp not found. Please ensure CUBLASMP_PATH is set correctly."
+    )
+  endif()
+
+  message(STATUS "Found cuBLASMp: ${CUBLASMP_LIBRARY}")
+
+  # 2. Version validation by parsing header macros
+  set(CUBLASMP_VERSION_STR "")
+  set(CUBLASMP_VERSION_HEADER "${CUBLASMP_INCLUDE_DIR}/cublasmp.h")
+  
+  if(EXISTS "${CUBLASMP_VERSION_HEADER}")
+    # Extract version lines using regular expressions from cublasmp.h
+    file(STRINGS "${CUBLASMP_VERSION_HEADER}" CUBLASMP_MAJOR_LINE
+         REGEX "^#define[ \t]+CUBLASMP_VER_MAJOR[ \t]+[0-9]+")
+    file(STRINGS "${CUBLASMP_VERSION_HEADER}" CUBLASMP_MINOR_LINE
+         REGEX "^#define[ \t]+CUBLASMP_VER_MINOR[ \t]+[0-9]+")
+    file(STRINGS "${CUBLASMP_VERSION_HEADER}" CUBLASMP_PATCH_LINE
+         REGEX "^#define[ \t]+CUBLASMP_VER_PATCH[ \t]+[0-9]+")
+    
+    # Extract numeric values from the matched strings
+    string(REGEX MATCH "([0-9]+)" CUBLASMP_VER_MAJOR "${CUBLASMP_MAJOR_LINE}")
+    string(REGEX MATCH "([0-9]+)" CUBLASMP_VER_MINOR "${CUBLASMP_MINOR_LINE}")
+    string(REGEX MATCH "([0-9]+)" CUBLASMP_VER_PATCH "${CUBLASMP_PATCH_LINE}")
+    
+    if(NOT CUBLASMP_VER_MAJOR STREQUAL ""
+       AND NOT CUBLASMP_VER_MINOR STREQUAL ""
+       AND NOT CUBLASMP_VER_PATCH STREQUAL "")
+      set(CUBLASMP_VERSION_STR
+          "${CUBLASMP_VER_MAJOR}.${CUBLASMP_VER_MINOR}.${CUBLASMP_VER_PATCH}")
+    endif()
+  endif()
+
+  message(STATUS "Detected cuBLASMp version: ${CUBLASMP_VERSION_STR}")
+
+  # 3. Version constraint: ABACUS requires cuBLASMp >= 0.8.0
+  if(CUBLASMP_VERSION_STR AND CUBLASMP_VERSION_STR VERSION_LESS "0.8.0")
+    message(FATAL_ERROR
+      "cuBLASMp version ${CUBLASMP_VERSION_STR} is too old. "
+      "ABACUS requires cuBLASMp >= 0.8.0 for NCCL Symmetric Memory support."
+    )
+  elseif(NOT CUBLASMP_VERSION_STR)
+    message(WARNING "Could not detect cuBLASMp version. Proceeding cautiously.")
+  endif()
+
+  # 4. Create cublasMp::cublasMp imported target
+  if(NOT TARGET cublasMp::cublasMp)
+    add_library(cublasMp::cublasMp IMPORTED INTERFACE)
+    set_target_properties(cublasMp::cublasMp PROPERTIES
+        INTERFACE_LINK_LIBRARIES "${CUBLASMP_LIBRARY};NCCL::NCCL"
+        INTERFACE_INCLUDE_DIRECTORIES "${CUBLASMP_INCLUDE_DIR}")
+  endif()
+
+  # 5. Link the library to the target
+  target_link_libraries(${target_name} cublasMp::cublasMp)
+
+endfunction()
diff --git a/cmake/SetupCuSolverMp.cmake b/cmake/SetupCuSolverMp.cmake
index a2bcd00bdf..004665686b 100644
--- a/cmake/SetupCuSolverMp.cmake
+++ b/cmake/SetupCuSolverMp.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Configure cuSolverMp dependencies and linking for ABACUS
+# Configure cuSOLVERMp dependencies and linking for ABACUS
 # =============================================================================
 
 include_guard(GLOBAL)
@@ -7,7 +7,7 @@ include_guard(GLOBAL)
 function(abacus_setup_cusolvermp target_name)
   add_compile_definitions(__CUSOLVERMP)
 
-  # Find cuSolverMp first, then decide communicator backend.
+  # Find cuSOLVERMp first, then decide communicator backend.
   find_library(CUSOLVERMP_LIBRARY NAMES cusolverMp
       HINTS ${CAL_CUSOLVERMP_PATH} ${NVHPC_ROOT_DIR}
       PATH_SUFFIXES lib lib64 math_libs/lib math_libs/lib64)
@@ -18,11 +18,11 @@ function(abacus_setup_cusolvermp target_name)
 
   if(NOT CUSOLVERMP_LIBRARY OR NOT CUSOLVERMP_INCLUDE_DIR)
     message(FATAL_ERROR
-      "cusolverMp not found. Set CUSOLVERMP_PATH or NVHPC_ROOT_DIR."
+      "cuSOLVERMp not found. Set CUSOLVERMP_PATH or NVHPC_ROOT_DIR."
     )
   endif()
 
-  message(STATUS "Found cusolverMp: ${CUSOLVERMP_LIBRARY}")
+  message(STATUS "Found cuSOLVERMp: ${CUSOLVERMP_LIBRARY}")
 
   set(CUSOLVERMP_VERSION_STR "")
   set(CUSOLVERMP_VERSION_HEADER "${CUSOLVERMP_INCLUDE_DIR}/cusolverMp.h")
@@ -47,27 +47,30 @@ function(abacus_setup_cusolvermp target_name)
   # Check minimum version requirement (>= 0.4.0)
   if(CUSOLVERMP_VERSION_STR AND CUSOLVERMP_VERSION_STR VERSION_LESS "0.4.0")
     message(FATAL_ERROR
-      "cuSolverMp version ${CUSOLVERMP_VERSION_STR} is too old. "
-      "ABACUS requires cuSolverMp >= 0.4.0 (NVIDIA HPC SDK >= 23.5). "
+      "cuSOLVERMp version ${CUSOLVERMP_VERSION_STR} is too old. "
+      "ABACUS requires cuSOLVERMp >= 0.4.0 (NVIDIA HPC SDK >= 23.5). "
       "Please upgrade your NVIDIA HPC SDK installation."
     )
   endif()
 
-  # Auto-select communicator backend by cuSolverMp version.
-  # cuSolverMp < 0.7.0 -> CAL, otherwise -> NCCL.
+  # Auto-select communicator backend by cuSOLVERMp version.
+  # cuSOLVERMp < 0.7.0 -> CAL, otherwise -> NCCL.
   set(_use_cal OFF)
   if(CUSOLVERMP_VERSION_STR AND CUSOLVERMP_VERSION_STR VERSION_LESS "0.7.0")
     set(_use_cal ON)
     message(STATUS
-      "Detected cuSolverMp ${CUSOLVERMP_VERSION_STR} (< 0.7.0). Using CAL backend.")
+      "Detected cuSOLVERMp ${CUSOLVERMP_VERSION_STR} (< 0.7.0). Using CAL backend.")
   elseif(CUSOLVERMP_VERSION_STR)
     message(STATUS
-      "Detected cuSolverMp ${CUSOLVERMP_VERSION_STR} (>= 0.7.0). Using NCCL backend.")
+      "Detected cuSOLVERMp ${CUSOLVERMP_VERSION_STR} (>= 0.7.0). Using NCCL backend.")
   elseif(NOT CUSOLVERMP_VERSION_STR)
     message(WARNING
-      "Unable to detect cuSolverMp version from header. Using NCCL backend by default.")
+      "Unable to detect cuSOLVERMp version from header. Using NCCL backend by default.")
   endif()
 
+  # Raise the variable to the caller's scope
+  set(_use_cal ${_use_cal} PARENT_SCOPE)
+
   # Backend selection:
   # - _use_cal=ON  -> cal communicator backend
   # - _use_cal=OFF -> NCCL communicator backend
diff --git a/source/source_esolver/esolver_ks_lcao_tddft.cpp b/source/source_esolver/esolver_ks_lcao_tddft.cpp
index 2e463acdcd..de109583a1 100644
--- a/source/source_esolver/esolver_ks_lcao_tddft.cpp
+++ b/source/source_esolver/esolver_ks_lcao_tddft.cpp
@@ -30,7 +30,10 @@ ESolver_KS_LCAO_TDDFT<TR, Device>::ESolver_KS_LCAO_TDDFT()
     if (ct_device_type == ct::DeviceType::GpuDevice)
     {
         use_tensor = true;
-        use_lapack = true;
+        if (PARAM.inp.ks_solver != "cusolvermp")
+        {
+            use_lapack = true;
+        }
     }
 }
 
@@ -235,21 +238,22 @@ void ESolver_KS_LCAO_TDDFT<TR, Device>::hamilt2rho_single(UnitCell& ucell,
     {
         if (istep >= TD_info::estep_shift + 1)
         {
-            module_rt::Evolve_elec<Device>::solve_psi(istep,
-                                                      PARAM.inp.nbands,
-                                                      PARAM.globalv.nlocal,
-                                                      this->kv.get_nks(),
-                                                      static_cast<hamilt::Hamilt<std::complex<double>>*>(this->p_hamilt),
-                                                      this->pv,
-                                                      this->psi,
-                                                      this->psi_laststep,
-                                                      this->Hk_laststep,
-                                                      this->Sk_laststep,
-                                                      this->pelec->ekb,
-                                                      GlobalV::ofs_running,
-                                                      PARAM.inp.propagator,
-                                                      use_tensor,
-                                                      use_lapack);
+            module_rt::Evolve_elec<Device>::solve_psi(
+                istep,
+                PARAM.inp.nbands,
+                PARAM.globalv.nlocal,
+                this->kv.get_nks(),
+                static_cast<hamilt::Hamilt<std::complex<double>>*>(this->p_hamilt),
+                this->pv,
+                this->psi,
+                this->psi_laststep,
+                this->Hk_laststep,
+                this->Sk_laststep,
+                this->pelec->ekb,
+                GlobalV::ofs_running,
+                PARAM.inp.propagator,
+                use_tensor,
+                use_lapack);
         }
         this->weight_dm_rho(ucell);
     }
@@ -346,11 +350,18 @@ void ESolver_KS_LCAO_TDDFT<TR, Device>::iter_finish(UnitCell& ucell,
     {
         if (use_tensor && use_lapack)
         {
-            elecstate::cal_edm_tddft_tensor_lapack<Device>(this->pv, this->dmat, this->kv, static_cast<hamilt::Hamilt<std::complex<double>>*>(this->p_hamilt));
+            elecstate::cal_edm_tddft_tensor_lapack<Device>(
+                this->pv,
+                this->dmat,
+                this->kv,
+                static_cast<hamilt::Hamilt<std::complex<double>>*>(this->p_hamilt));
         }
         else
         {
-            elecstate::cal_edm_tddft(this->pv, this->dmat, this->kv, static_cast<hamilt::Hamilt<std::complex<double>>*>(this->p_hamilt));
+            elecstate::cal_edm_tddft(this->pv,
+                                     this->dmat,
+                                     this->kv,
+                                     static_cast<hamilt::Hamilt<std::complex<double>>*>(this->p_hamilt));
         }
     }
 }
diff --git a/source/source_esolver/esolver_ks_lcao_tddft.h b/source/source_esolver/esolver_ks_lcao_tddft.h
index 53e6ac77f5..f534b303f4 100644
--- a/source/source_esolver/esolver_ks_lcao_tddft.h
+++ b/source/source_esolver/esolver_ks_lcao_tddft.h
@@ -3,10 +3,11 @@
 #include "esolver_ks.h"
 #include "esolver_ks_lcao.h"
 #include "source_base/module_container/ATen/core/tensor.h" // ct::Tensor
-#include "source_lcao/module_rt/gather_mat.h"              // MPI gathering and distributing functions
+#include "source_lcao/module_rt/boundary_fix.h"
+#include "source_lcao/module_rt/gather_mat.h" // MPI gathering and distributing functions
+#include "source_lcao/module_rt/kernels/cublasmp_context.h"
 #include "source_lcao/module_rt/td_info.h"
 #include "source_lcao/module_rt/velocity_op.h"
-#include "source_lcao/module_rt/boundary_fix.h"
 
 namespace ModuleESolver
 {
@@ -51,6 +52,7 @@ class ESolver_KS_LCAO_TDDFT : public ESolver_KS_LCAO<std::complex<double>, TR>
     //! Control heterogeneous computing of the TDDFT solver
     bool use_tensor = false;
     bool use_lapack = false;
+    CublasMpResources cublas_res;
 
     // Control the device type for Hk_laststep and Sk_laststep
     // Set to CPU temporarily, should wait for further GPU development
diff --git a/source/source_lcao/module_rt/CMakeLists.txt b/source/source_lcao/module_rt/CMakeLists.txt
index 35a65a5aa7..a1056c3347 100644
--- a/source/source_lcao/module_rt/CMakeLists.txt
+++ b/source/source_lcao/module_rt/CMakeLists.txt
@@ -22,6 +22,8 @@ if(ENABLE_LCAO)
         list(APPEND objects
             kernels/cuda/snap_psibeta_kernel.cu
             kernels/cuda/snap_psibeta_gpu.cu
+            kernels/cuda/norm_psi_kernel.cu
+            kernels/cuda/band_energy_kernel.cu
         )
     endif()
 
diff --git a/source/source_lcao/module_rt/band_energy.cpp b/source/source_lcao/module_rt/band_energy.cpp
index 63b2fad71b..601840b285 100644
--- a/source/source_lcao/module_rt/band_energy.cpp
+++ b/source/source_lcao/module_rt/band_energy.cpp
@@ -4,6 +4,10 @@
 #include "source_base/module_container/ATen/kernels/blas.h"
 #include "source_base/module_external/scalapack_connector.h"
 
+#ifdef __CUBLASMP
+#include "kernels/cuda/band_energy_kernel.cuh"
+#endif
+
 #include <complex>
 #include <iostream>
 
@@ -165,140 +169,226 @@ void compute_ekb_tensor(const Parallel_Orbitals* pv,
                         const ct::Tensor& Htmp,
                         const ct::Tensor& psi_k,
                         ct::Tensor& ekb,
-                        std::ofstream& ofs_running)
+                        std::ofstream& ofs_running,
+                        CublasMpResources& cublas_res)
 {
-    assert(pv->nloc_wfc > 0 && pv->nloc > 0);
-
-    // Create Tensor objects for temporary data
-    ct::Tensor tmp1(ct::DataType::DT_COMPLEX_DOUBLE, ct::DeviceType::CpuDevice, ct::TensorShape({pv->nloc_wfc}));
-    tmp1.zero();
-
-    ct::Tensor eij(ct::DataType::DT_COMPLEX_DOUBLE, ct::DeviceType::CpuDevice, ct::TensorShape({pv->nloc}));
-    eij.zero();
-
-    // Perform matrix multiplication: tmp1 = Htmp * psi_k
-    ScalapackConnector::gemm('N',
-                             'N',
-                             nlocal,
-                             nband,
-                             nlocal,
-                             1.0,
-                             Htmp.data<std::complex<double>>(),
-                             1,
-                             1,
-                             pv->desc,
-                             psi_k.data<std::complex<double>>(),
-                             1,
-                             1,
-                             pv->desc_wfc,
-                             0.0,
-                             tmp1.data<std::complex<double>>(),
-                             1,
-                             1,
-                             pv->desc_wfc);
-
-    // Perform matrix multiplication: eij = psi_k^dagger * tmp1
-    ScalapackConnector::gemm('C',
-                             'N',
-                             nband,
-                             nband,
-                             nlocal,
-                             1.0,
-                             psi_k.data<std::complex<double>>(),
-                             1,
-                             1,
-                             pv->desc_wfc,
-                             tmp1.data<std::complex<double>>(),
-                             1,
-                             1,
-                             pv->desc_wfc,
-                             0.0,
-                             eij.data<std::complex<double>>(),
-                             1,
-                             1,
-                             pv->desc_Eij);
-
-    if (PARAM.inp.td_print_eij >= 0.0)
+#ifdef __CUBLASMP
+    // 1. Resource validation
+    if (!cublas_res.is_initialized || cublas_res.cublasmp_grid == nullptr)
     {
-        ofs_running
-            << "------------------------------------------------------------------------------------------------"
-            << std::endl;
-        ofs_running << " Eij:" << std::endl;
-        for (int i = 0; i < pv->nrow_bands; i++)
-        {
-            const int in = i * pv->ncol;
-            for (int j = 0; j < pv->ncol_bands; j++)
-            {
-                double aa = eij.data<std::complex<double>>()[in + j].real();
-                double bb = eij.data<std::complex<double>>()[in + j].imag();
-                if (std::abs(aa) < PARAM.inp.td_print_eij)
-                {
-                    aa = 0.0;
-                }
-                if (std::abs(bb) < PARAM.inp.td_print_eij)
-                {
-                    bb = 0.0;
-                }
-                if (std::abs(aa) > 0.0 || std::abs(bb) > 0.0)
-                {
-                    std::streamsize original_precision = ofs_running.precision();
-                    ofs_running << std::fixed << std::setprecision(8);
-                    ofs_running << "i = " << std::setw(2) << i << ", j = " << std::setw(2) << j
-                                << ", Eij = " << std::setw(12) << aa << " + " << std::setw(12) << bb << " i"
-                                << std::endl;
-                    ofs_running.unsetf(std::ios_base::fixed);
-                    ofs_running.precision(original_precision);
-                }
-            }
-        }
-        ofs_running << std::endl;
-        ofs_running
-            << "------------------------------------------------------------------------------------------------"
-            << std::endl;
+        return;
     }
 
-    int info = 0;
-    int naroc[2] = {0, 0};
+    assert(pv->nloc_wfc > 0 && pv->nloc > 0);
+    assert(Htmp.device_type() == ct::DeviceType::GpuDevice);
+    assert(psi_k.device_type() == ct::DeviceType::GpuDevice);
+    assert(ekb.device_type() == ct::DeviceType::GpuDevice);
 
-    // Create a Tensor for eii
-    assert(nband > 0);
-    ct::Tensor eii(ct::DataType::DT_DOUBLE, ct::DeviceType::CpuDevice, ct::TensorShape({nband}));
-    eii.zero();
+    // 2. Data Pointers
+    void* d_H = static_cast<void*>(const_cast<std::complex<double>*>(Htmp.data<std::complex<double>>()));
+    void* d_Psi = static_cast<void*>(const_cast<std::complex<double>*>(psi_k.data<std::complex<double>>()));
 
-    for (int iprow = 0; iprow < pv->dim0; ++iprow)
-    {
-        for (int ipcol = 0; ipcol < pv->dim1; ++ipcol)
-        {
-            if (iprow == pv->coord[0] && ipcol == pv->coord[1])
-            {
-                naroc[0] = pv->nrow;
-                naroc[1] = pv->ncol;
-                for (int j = 0; j < naroc[1]; ++j)
-                {
-                    int igcol = globalIndex(j, pv->nb, pv->dim1, ipcol);
-                    if (igcol >= nband)
-                    {
-                        continue;
-                    }
-                    for (int i = 0; i < naroc[0]; ++i)
-                    {
-                        int igrow = globalIndex(i, pv->nb, pv->dim0, iprow);
-                        if (igrow >= nband)
-                        {
-                            continue;
-                        }
-                        if (igcol == igrow)
-                        {
-                            eii.data<double>()[igcol] = eij.data<std::complex<double>>()[j * naroc[0] + i].real();
-                        }
-                    }
-                }
-            }
-        } // loop ipcol
-    } // loop iprow
+    int64_t psi_elems = psi_k.NumElements();
+    ct::Tensor Tmp1_gpu(ct::DataType::DT_COMPLEX_DOUBLE, ct::DeviceType::GpuDevice, ct::TensorShape({psi_elems}));
+    void* d_Tmp1 = static_cast<void*>(Tmp1_gpu.data<std::complex<double>>());
+
+    int64_t eij_elems = pv->nloc;
+    ct::Tensor Eij_gpu(ct::DataType::DT_COMPLEX_DOUBLE, ct::DeviceType::GpuDevice, ct::TensorShape({eij_elems}));
+    void* d_Eij = static_cast<void*>(Eij_gpu.data<std::complex<double>>());
+
+    std::complex<double> alpha = {1.0, 0.0};
+    std::complex<double> beta = {0.0, 0.0};
 
-    // Perform MPI reduction to compute ekb
-    info = MPI_Allreduce(eii.data<double>(), ekb.data<double>(), nband, MPI_DOUBLE, MPI_SUM, pv->comm());
+    // 3. Matrix Descriptors Creation
+    cublasMpMatrixDescriptor_t desc_H, desc_Psi, desc_Eij;
+
+    // H descriptor: nlocal x nlocal
+    cublasMpMatrixDescriptorCreate(pv->desc[2],
+                                   pv->desc[3],
+                                   pv->desc[4],
+                                   pv->desc[5],
+                                   0,
+                                   0,
+                                   pv->desc[8],
+                                   CUDA_C_64F,
+                                   cublas_res.cublasmp_grid,
+                                   &desc_H);
+
+    // Psi descriptor: nlocal x nband
+    cublasMpMatrixDescriptorCreate(pv->desc_wfc[2],
+                                   pv->desc_wfc[3],
+                                   pv->desc_wfc[4],
+                                   pv->desc_wfc[5],
+                                   0,
+                                   0,
+                                   pv->desc_wfc[8],
+                                   CUDA_C_64F,
+                                   cublas_res.cublasmp_grid,
+                                   &desc_Psi);
+
+    // Eij descriptor: MUST use nband x nband physically, to match pv->desc_Eij expectations
+    cublasMpMatrixDescriptorCreate(nband,
+                                   nband,
+                                   pv->desc_Eij[4],
+                                   pv->desc_Eij[5],
+                                   0,
+                                   0,
+                                   pv->desc_Eij[8],
+                                   CUDA_C_64F,
+                                   cublas_res.cublasmp_grid,
+                                   &desc_Eij);
+
+    size_t ws_dev = 0, ws_host = 0;
+    void *d_work = nullptr, *h_work = nullptr;
+
+    // 4. GEMM 1: Tmp1 = H * Psi
+    cublasMpGemm_bufferSize(cublas_res.cublasmp_handle,
+                            CUBLAS_OP_N,
+                            CUBLAS_OP_N,
+                            pv->desc[2],
+                            pv->desc_wfc[3],
+                            pv->desc[3],
+                            &alpha,
+                            d_H,
+                            1,
+                            1,
+                            desc_H,
+                            d_Psi,
+                            1,
+                            1,
+                            desc_Psi,
+                            &beta,
+                            d_Tmp1,
+                            1,
+                            1,
+                            desc_Psi,
+                            CUBLAS_COMPUTE_64F,
+                            &ws_dev,
+                            &ws_host);
+
+    cudaMallocAsync(&d_work, ws_dev, cublas_res.stream);
+    h_work = malloc(ws_host);
+
+    cublasMpGemm(cublas_res.cublasmp_handle,
+                 CUBLAS_OP_N,
+                 CUBLAS_OP_N,
+                 pv->desc[2],
+                 pv->desc_wfc[3],
+                 pv->desc[3],
+                 &alpha,
+                 d_H,
+                 1,
+                 1,
+                 desc_H,
+                 d_Psi,
+                 1,
+                 1,
+                 desc_Psi,
+                 &beta,
+                 d_Tmp1,
+                 1,
+                 1,
+                 desc_Psi,
+                 CUBLAS_COMPUTE_64F,
+                 d_work,
+                 ws_dev,
+                 h_work,
+                 ws_host);
+
+    cudaFreeAsync(d_work, cublas_res.stream);
+    free(h_work);
+
+    // 5. GEMM 2: Eij = Psi^H * Tmp1
+    cublasMpGemm_bufferSize(cublas_res.cublasmp_handle,
+                            CUBLAS_OP_C,
+                            CUBLAS_OP_N,
+                            pv->desc_wfc[3],
+                            pv->desc_wfc[3],
+                            pv->desc_wfc[2],
+                            &alpha,
+                            d_Psi,
+                            1,
+                            1,
+                            desc_Psi,
+                            d_Tmp1,
+                            1,
+                            1,
+                            desc_Psi,
+                            &beta,
+                            d_Eij,
+                            1,
+                            1,
+                            desc_Eij,
+                            CUBLAS_COMPUTE_64F,
+                            &ws_dev,
+                            &ws_host);
+
+    cudaMallocAsync(&d_work, ws_dev, cublas_res.stream);
+    h_work = malloc(ws_host);
+
+    cublasMpGemm(cublas_res.cublasmp_handle,
+                 CUBLAS_OP_C,
+                 CUBLAS_OP_N,
+                 pv->desc_wfc[3],
+                 pv->desc_wfc[3],
+                 pv->desc_wfc[2],
+                 &alpha,
+                 d_Psi,
+                 1,
+                 1,
+                 desc_Psi,
+                 d_Tmp1,
+                 1,
+                 1,
+                 desc_Psi,
+                 &beta,
+                 d_Eij,
+                 1,
+                 1,
+                 desc_Eij,
+                 CUBLAS_COMPUTE_64F,
+                 d_work,
+                 ws_dev,
+                 h_work,
+                 ws_host);
+
+    cudaFreeAsync(d_work, cublas_res.stream);
+    free(h_work);
+
+    // 6. Extract Diagonal directly on GPU
+    // Prepare a zero-initialized buffer on GPU to store the local parts of the diagonal
+    ct::Tensor eii_gpu(ct::DataType::DT_DOUBLE, ct::DeviceType::GpuDevice, ct::TensorShape({nband}));
+    double* d_eii = static_cast<double*>(eii_gpu.data<double>());
+    cudaMemsetAsync(d_eii, 0, nband * sizeof(double), cublas_res.stream);
+
+    // Launch the extraction kernel
+    module_rt::gpu::launch_extract_ekb_kernel(reinterpret_cast<cuDoubleComplex*>(d_Eij),
+                                              d_eii,
+                                              pv->desc_Eij[8],
+                                              pv->nloc,
+                                              pv->desc_Eij[4],
+                                              pv->dim0,
+                                              pv->dim1,
+                                              pv->coord[0],
+                                              pv->coord[1],
+                                              nband,
+                                              cublas_res.stream);
+
+    // 7. CUDA-aware MPI Reduction
+    // VERY IMPORTANT: We must synchronize the stream before passing the GPU pointer
+    // to MPI, because MPI operations are generally synchronous to the CPU thread.
+    cudaStreamSynchronize(cublas_res.stream);
+
+    double* d_ekb = static_cast<double*>(ekb.data<double>());
+
+    // Direct GPU-to-GPU reduction using CUDA-aware MPI
+    MPI_Allreduce(d_eii, d_ekb, nband, MPI_DOUBLE, MPI_SUM, pv->comm());
+
+    // 8. Cleanup
+    cublasMpMatrixDescriptorDestroy(desc_H);
+    cublasMpMatrixDescriptorDestroy(desc_Psi);
+    cublasMpMatrixDescriptorDestroy(desc_Eij);
+#endif // __CUBLASMP
 }
 
 template <typename Device>
diff --git a/source/source_lcao/module_rt/band_energy.h b/source/source_lcao/module_rt/band_energy.h
index 93c83ccdb0..107cd749e5 100644
--- a/source/source_lcao/module_rt/band_energy.h
+++ b/source/source_lcao/module_rt/band_energy.h
@@ -8,6 +8,7 @@
 
 #include "source_base/module_container/ATen/core/tensor.h" // ct::Tensor
 #include "source_basis/module_ao/parallel_orbitals.h"
+#include "source_lcao/module_rt/kernels/cublasmp_context.h"
 
 #include <complex>
 
@@ -38,7 +39,8 @@ void compute_ekb_tensor(const Parallel_Orbitals* pv,
                         const ct::Tensor& Htmp,
                         const ct::Tensor& psi_k,
                         ct::Tensor& ekb,
-                        std::ofstream& ofs_running);
+                        std::ofstream& ofs_running,
+                        CublasMpResources& cublas_res);
 
 template <typename Device>
 void compute_ekb_tensor_lapack(const Parallel_Orbitals* pv,
diff --git a/source/source_lcao/module_rt/evolve_elec.cpp b/source/source_lcao/module_rt/evolve_elec.cpp
index 6d6eebc2ff..dd7507974a 100644
--- a/source/source_lcao/module_rt/evolve_elec.cpp
+++ b/source/source_lcao/module_rt/evolve_elec.cpp
@@ -41,6 +41,12 @@ void Evolve_elec<Device>::solve_psi(const int& istep,
     // Control the print of matrix to running_md.log
     const int print_matrix = 0;
 
+    // Multi-GPU support
+    CublasMpResources cublas_res;
+#ifdef __CUBLASMP
+    init_cublasmp_resources(cublas_res, MPI_COMM_WORLD, para_orb.desc);
+#endif
+
     for (int ik = 0; ik < nks; ik++)
     {
         phm->updateHk(ik);
@@ -171,7 +177,8 @@ void Evolve_elec<Device>::solve_psi(const int& istep,
                                       propagator,
                                       ofs_running,
                                       print_matrix,
-                                      use_lapack);
+                                      use_lapack,
+                                      cublas_res);
 
             ModuleBase::timer::tick("TD_Efficiency", "host_device_comm");
             // Need to distribute global psi back to all processes
@@ -237,6 +244,10 @@ void Evolve_elec<Device>::solve_psi(const int& istep,
         ModuleBase::timer::tick("TD_Efficiency", "evolve_k");
     } // end k
 
+#ifdef __CUBLASMP
+    finalize_cublasmp_resources(cublas_res);
+#endif
+
     ModuleBase::timer::tick("Evolve_elec", "solve_psi");
     return;
 }
diff --git a/source/source_lcao/module_rt/evolve_elec.h b/source/source_lcao/module_rt/evolve_elec.h
index 5d0a8e5455..3c2aa95cf6 100644
--- a/source/source_lcao/module_rt/evolve_elec.h
+++ b/source/source_lcao/module_rt/evolve_elec.h
@@ -12,6 +12,7 @@
 #include "source_esolver/esolver_ks_lcao_tddft.h"
 #include "source_lcao/hamilt_lcao.h"
 #include "source_lcao/module_rt/gather_mat.h" // MPI gathering and distributing functions
+#include "source_lcao/module_rt/kernels/cublasmp_context.h"
 #include "source_psi/psi.h"
 
 //-----------------------------------------------------------
@@ -26,64 +27,101 @@
 // Print the shape of a Tensor
 inline void print_tensor_shape(const ct::Tensor& tensor, const std::string& name)
 {
-    std::cout << "Shape of " << name << ": [";
+    GlobalV::ofs_running << "Shape of " << name << ": [";
     for (int i = 0; i < tensor.shape().ndim(); ++i)
     {
-        std::cout << tensor.shape().dim_size(i);
+        GlobalV::ofs_running << tensor.shape().dim_size(i);
         if (i < tensor.shape().ndim() - 1)
         {
-            std::cout << ", ";
+            GlobalV::ofs_running << ", ";
         }
     }
-    std::cout << "]" << std::endl;
+    GlobalV::ofs_running << "]" << std::endl;
 }
 
 // Recursive print function
+template <typename T>
+inline void print_single_element(const T& val, double threshold)
+{
+    double clean_val = (std::abs(val) < threshold) ? 0.0 : static_cast<double>(val);
+    GlobalV::ofs_running << std::fixed << std::setprecision(6) << clean_val;
+}
+inline void print_single_element(const std::complex<double>& val, double threshold)
+{
+    double re = (std::abs(val.real()) < threshold) ? 0.0 : val.real();
+    double im = (std::abs(val.imag()) < threshold) ? 0.0 : val.imag();
+    GlobalV::ofs_running << std::fixed << std::setprecision(6) << "(" << re << "," << im << ")";
+}
+
 template <typename T>
 inline void print_tensor_data_recursive(const T* data,
                                         const std::vector<int64_t>& shape,
                                         const std::vector<int64_t>& strides,
                                         int dim,
                                         std::vector<int64_t>& indices,
-                                        const std::string& name)
+                                        const std::string& name,
+                                        const double threshold = 1e-10)
 {
     if (dim == shape.size())
     {
-        // Recursion base case: print data when reaching the innermost dimension
-        std::cout << name;
+        GlobalV::ofs_running << name;
         for (size_t i = 0; i < indices.size(); ++i)
         {
-            std::cout << "[" << indices[i] << "]";
+            GlobalV::ofs_running << "[" << indices[i] << "]";
         }
-        std::cout << " = " << *data << std::endl;
+        GlobalV::ofs_running << " = ";
+
+        print_single_element(*data, threshold);
+
+        GlobalV::ofs_running << std::endl;
         return;
     }
-    // Recursively process the current dimension
+
     for (int64_t i = 0; i < shape[dim]; ++i)
     {
         indices[dim] = i;
-        print_tensor_data_recursive(data + i * strides[dim], shape, strides, dim + 1, indices, name);
+        print_tensor_data_recursive(data + i * strides[dim], shape, strides, dim + 1, indices, name, threshold);
     }
 }
 
-// Generic print function
 template <typename T>
 inline void print_tensor_data(const ct::Tensor& tensor, const std::string& name)
 {
-    const std::vector<int64_t>& shape = tensor.shape().dims();
-    const std::vector<int64_t>& strides = tensor.shape().strides();
-    const T* data = tensor.data<T>();
+    const ct::Tensor* p_tensor = &tensor;
+    ct::Tensor cpu_tensor_buffer;
+
+    if (tensor.device_type() != ct::DeviceType::CpuDevice)
+    {
+        cpu_tensor_buffer = tensor.to_device<ct::DEVICE_CPU>();
+        p_tensor = &cpu_tensor_buffer;
+    }
+
+    const std::vector<int64_t>& shape = p_tensor->shape().dims();
+    const std::vector<int64_t>& strides = p_tensor->shape().strides();
+
+    const T* data = p_tensor->data<T>();
+
     std::vector<int64_t> indices(shape.size(), 0);
     print_tensor_data_recursive(data, shape, strides, 0, indices, name);
 }
 
-// Specialization for std::complex<double>
 template <>
 inline void print_tensor_data<std::complex<double>>(const ct::Tensor& tensor, const std::string& name)
 {
-    const std::vector<int64_t>& shape = tensor.shape().dims();
-    const std::vector<int64_t>& strides = tensor.shape().strides();
-    const std::complex<double>* data = tensor.data<std::complex<double>>();
+    const ct::Tensor* p_tensor = &tensor;
+    ct::Tensor cpu_tensor_buffer;
+
+    if (tensor.device_type() != ct::DeviceType::CpuDevice)
+    {
+        cpu_tensor_buffer = tensor.to_device<ct::DEVICE_CPU>();
+        p_tensor = &cpu_tensor_buffer;
+    }
+
+    const std::vector<int64_t>& shape = p_tensor->shape().dims();
+    const std::vector<int64_t>& strides = p_tensor->shape().strides();
+
+    const std::complex<double>* data = p_tensor->data<std::complex<double>>();
+
     std::vector<int64_t> indices(shape.size(), 0);
     print_tensor_data_recursive(data, shape, strides, 0, indices, name);
 }
diff --git a/source/source_lcao/module_rt/evolve_psi.cpp b/source/source_lcao/module_rt/evolve_psi.cpp
index 5a2f116fcd..ca2458651f 100644
--- a/source/source_lcao/module_rt/evolve_psi.cpp
+++ b/source/source_lcao/module_rt/evolve_psi.cpp
@@ -129,7 +129,8 @@ void evolve_psi_tensor(const int nband,
                        int propagator,
                        std::ofstream& ofs_running,
                        const int print_matrix,
-                       const bool use_lapack)
+                       const bool use_lapack,
+                       CublasMpResources& cublas_res)
 {
     ModuleBase::TITLE("module_rt", "evolve_psi_tensor");
     time_t time_start = time(nullptr);
@@ -221,7 +222,16 @@ void evolve_psi_tensor(const int nband,
     {
         if (!use_lapack)
         {
-            half_Hmatrix_tensor(pv, nband, nlocal, Htmp, Stmp, H_laststep, S_laststep, ofs_running, print_matrix);
+            half_Hmatrix_tensor(pv,
+                                nband,
+                                nlocal,
+                                Htmp,
+                                Stmp,
+                                H_laststep,
+                                S_laststep,
+                                ofs_running,
+                                print_matrix,
+                                cublas_res);
         }
         else if (myid == root_proc)
         {
@@ -249,12 +259,13 @@ void evolve_psi_tensor(const int nband,
                                            U_operator,
                                            ofs_running,
                                            print_matrix,
-                                           use_lapack);
+                                           use_lapack,
+                                           cublas_res);
 
     // (3) Apply U_operator (psi_k = U * psi_last)
     if (!use_lapack)
     {
-        upsi_tensor(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, ofs_running, print_matrix);
+        upsi_tensor(pv, nband, nlocal, U_operator, psi_k_laststep, psi_k, ofs_running, print_matrix, cublas_res);
     }
     else if (myid == root_proc)
     {
@@ -264,7 +275,7 @@ void evolve_psi_tensor(const int nband,
     // (4) Normalize psi_k
     if (!use_lapack)
     {
-        norm_psi_tensor(pv, nband, nlocal, Stmp, psi_k, ofs_running, print_matrix);
+        norm_psi_tensor(pv, nband, nlocal, Stmp, psi_k, ofs_running, print_matrix, cublas_res);
     }
     else if (myid == root_proc)
     {
@@ -287,7 +298,7 @@ void evolve_psi_tensor(const int nband,
 
     if (!use_lapack)
     {
-        compute_ekb_tensor(pv, nband, nlocal, Hold, psi_k, ekb, ofs_running);
+        compute_ekb_tensor(pv, nband, nlocal, Hold, psi_k, ekb, ofs_running, cublas_res);
     }
     else if (myid == root_proc)
     {
@@ -323,7 +334,8 @@ template void evolve_psi_tensor<base_device::DEVICE_CPU>(const int nband,
                                                          int propagator,
                                                          std::ofstream& ofs_running,
                                                          const int print_matrix,
-                                                         const bool use_lapack);
+                                                         const bool use_lapack,
+                                                         CublasMpResources& cublas_res);
 
 #if ((defined __CUDA) /* || (defined __ROCM) */)
 template void evolve_psi_tensor<base_device::DEVICE_GPU>(const int nband,
@@ -338,7 +350,8 @@ template void evolve_psi_tensor<base_device::DEVICE_GPU>(const int nband,
                                                          int propagator,
                                                          std::ofstream& ofs_running,
                                                          const int print_matrix,
-                                                         const bool use_lapack);
+                                                         const bool use_lapack,
+                                                         CublasMpResources& cublas_res);
 #endif // __CUDA
 
 } // namespace module_rt
diff --git a/source/source_lcao/module_rt/evolve_psi.h b/source/source_lcao/module_rt/evolve_psi.h
index 413b115a0f..34a29d6881 100644
--- a/source/source_lcao/module_rt/evolve_psi.h
+++ b/source/source_lcao/module_rt/evolve_psi.h
@@ -10,6 +10,8 @@
 #include "source_base/module_container/ATen/core/tensor_map.h" // TensorMap
 #include "source_basis/module_ao/parallel_orbitals.h"
 #include "source_lcao/hamilt_lcao.h"
+#include "source_lcao/module_rt/evolve_elec.h"
+#include "source_lcao/module_rt/kernels/cublasmp_context.h"
 
 namespace module_rt
 {
@@ -39,7 +41,8 @@ void evolve_psi_tensor(const int nband,
                        int propagator,
                        std::ofstream& ofs_running,
                        const int print_matrix,
-                       const bool use_lapack);
+                       const bool use_lapack,
+                       CublasMpResources& cublas_res);
 } // namespace module_rt
 
 #endif
\ No newline at end of file
diff --git a/source/source_lcao/module_rt/kernels/cublasmp_context.h b/source/source_lcao/module_rt/kernels/cublasmp_context.h
new file mode 100644
index 0000000000..897f1964c2
--- /dev/null
+++ b/source/source_lcao/module_rt/kernels/cublasmp_context.h
@@ -0,0 +1,167 @@
+#ifndef CUBLASMP_CONTEXT_H
+#define CUBLASMP_CONTEXT_H
+
+#ifdef __MPI
+#include <mpi.h>
+#endif
+
+#ifdef __CUDA
+#include <cuda_runtime.h>
+#endif
+
+#ifdef __CUBLASMP
+#include "source_base/global_variable.h"
+#include "source_base/module_device/device.h"
+
+#include <cublasmp.h>
+#include <cusolverMp.h>
+#include <iostream>
+#include <nccl.h>
+
+extern "C"
+{
+#include "source_hsolver/module_genelpa/Cblacs.h"
+}
+
+#define LOG_DEBUG(msg)                                                                                                 \
+    do                                                                                                                 \
+    {                                                                                                                  \
+        if (g_EnableDebugLog)                                                                                          \
+        {                                                                                                              \
+            std::cerr << "[DEBUG] " << msg << " (at " << __func__ << ")" << std::endl;                                 \
+        }                                                                                                              \
+    } while (0)
+#endif // __CUBLASMP
+
+// The struct is ALWAYS available.
+struct CublasMpResources
+{
+    bool is_initialized = false;
+
+#ifdef __MPI
+    MPI_Comm mpi_comm = MPI_COMM_NULL;
+#endif
+
+#ifdef __CUDA
+    cudaStream_t stream = nullptr;
+#endif
+
+#ifdef __CUBLASMP
+    ncclComm_t nccl_comm = nullptr;
+
+    cublasMpHandle_t cublasmp_handle = nullptr;
+    cublasMpGrid_t cublasmp_grid = nullptr;
+
+    cusolverMpHandle_t cusolvermp_handle = nullptr;
+    cusolverMpGrid_t cusolvermp_grid = nullptr;
+#endif
+};
+
+// API functions are only visible when cuBLASMp is enabled.
+#ifdef __CUBLASMP
+
+inline void init_cublasmp_resources(CublasMpResources& res, MPI_Comm mpi_comm, const int* desc)
+{
+    if (res.is_initialized)
+    {
+        return;
+    }
+
+    res.mpi_comm = mpi_comm;
+    MPI_Barrier(res.mpi_comm);
+
+    // 1. Get BLACS topology info
+    int cblacs_ctxt = desc[1];
+    int nprows, npcols, myprow, mypcol;
+    Cblacs_gridinfo(cblacs_ctxt, &nprows, &npcols, &myprow, &mypcol);
+
+    GlobalV::ofs_running << "nprows = " << nprows << std::endl;
+    GlobalV::ofs_running << "npcols = " << npcols << std::endl;
+    GlobalV::ofs_running << "myprow = " << myprow << std::endl;
+    GlobalV::ofs_running << "mypcol = " << mypcol << std::endl;
+    GlobalV::ofs_running << "device = " << base_device::DeviceContext::instance().get_device_id() << std::endl;
+
+    int rank, size;
+    MPI_Comm_rank(res.mpi_comm, &rank);
+    MPI_Comm_size(res.mpi_comm, &size);
+
+    int device_id = base_device::DeviceContext::instance().get_device_id();
+    cudaSetDevice(device_id);
+    cudaStreamCreate(&res.stream);
+
+    // 2. Initialize NCCL communicator
+    ncclUniqueId id;
+    if (rank == 0)
+    {
+        ncclGetUniqueId(&id);
+    }
+    // Broadcast the unique NCCL ID to all ranks
+    MPI_Bcast((void*)&id, sizeof(id), MPI_BYTE, 0, res.mpi_comm);
+    // Initialize NCCL with the generated ID
+    ncclCommInitRank(&res.nccl_comm, size, id, rank);
+
+    // 3. Initialize cuBLASMp specific resources
+    cublasMpCreate(&res.cublasmp_handle, res.stream);
+    cublasMpGridCreate(nprows, npcols, CUBLASMP_GRID_LAYOUT_ROW_MAJOR, res.nccl_comm, &res.cublasmp_grid);
+
+    // 4. Initialize cuSOLVERMp specific resources
+    cusolverMpCreate(&res.cusolvermp_handle, device_id, res.stream);
+    cusolverMpCreateDeviceGrid(res.cusolvermp_handle,
+                               &res.cusolvermp_grid,
+                               res.nccl_comm,
+                               nprows,
+                               npcols,
+                               CUSOLVERMP_GRID_MAPPING_ROW_MAJOR);
+
+    res.is_initialized = true;
+}
+
+inline void finalize_cublasmp_resources(CublasMpResources& res)
+{
+    if (!res.is_initialized)
+    {
+        return;
+    }
+
+    if (res.stream)
+    {
+        cudaStreamSynchronize(res.stream);
+    }
+
+    // Destroy cuBLASMp resources
+    if (res.cublasmp_grid)
+    {
+        cublasMpGridDestroy(res.cublasmp_grid);
+    }
+    if (res.cublasmp_handle)
+    {
+        cublasMpDestroy(res.cublasmp_handle);
+    }
+
+    // Destroy cuSOLVERMp resources
+    if (res.cusolvermp_grid)
+    {
+        cusolverMpDestroyGrid(res.cusolvermp_grid);
+    }
+    if (res.cusolvermp_handle)
+    {
+        cusolverMpDestroy(res.cusolvermp_handle);
+    }
+
+    // Destroy NCCL communicator
+    if (res.nccl_comm)
+    {
+        ncclCommDestroy(res.nccl_comm);
+    }
+
+    if (res.stream)
+    {
+        cudaStreamDestroy(res.stream);
+    }
+
+    res.is_initialized = false;
+}
+
+#endif // __CUBLASMP
+
+#endif // CUBLASMP_CONTEXT_H
diff --git a/source/source_lcao/module_rt/kernels/cuda/band_energy_kernel.cu b/source/source_lcao/module_rt/kernels/cuda/band_energy_kernel.cu
new file mode 100644
index 0000000000..74e55610a0
--- /dev/null
+++ b/source/source_lcao/module_rt/kernels/cuda/band_energy_kernel.cu
@@ -0,0 +1,88 @@
+#include "band_energy_kernel.cuh"
+
+#ifdef __CUBLASMP
+namespace module_rt
+{
+namespace gpu
+{
+
+// Device function for global index mapping
+__device__ inline int get_global_index_dev(int local_idx, int block_size, int num_procs, int proc_coord)
+{
+    return (local_idx / block_size) * (num_procs * block_size) + proc_coord * block_size + (local_idx % block_size);
+}
+
+// Kernel to extract the real part of the diagonal elements of Eij
+__global__ void extract_ekb_kernel(const cuDoubleComplex* d_Eij,
+                                   double* d_eii,
+                                   int lld,
+                                   int local_elems,
+                                   int nb,
+                                   int dim0,
+                                   int dim1,
+                                   int my_prow,
+                                   int my_pcol,
+                                   int nband)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Iterate over the allocated local buffer
+    if (idx >= local_elems)
+    {
+        return;
+    }
+
+    // Column-major indexing using the formal Leading Dimension (LLD)
+    int i = idx % lld;
+    int j = idx / lld;
+
+    int grow = get_global_index_dev(i, nb, dim0, my_prow);
+    int gcol = get_global_index_dev(j, nb, dim1, my_pcol);
+
+    // Filter out invalid blocks
+    if (grow >= nband || gcol >= nband)
+    {
+        return;
+    }
+
+    // Extract the diagonal elements
+    if (grow == gcol)
+    {
+        d_eii[grow] = cuCreal(d_Eij[idx]);
+    }
+}
+
+// Wrapper implementation
+void launch_extract_ekb_kernel(const cuDoubleComplex* d_Eij,
+                               double* d_eii,
+                               int lld,
+                               int local_elems,
+                               int nb,
+                               int dim0,
+                               int dim1,
+                               int my_prow,
+                               int my_pcol,
+                               int nband,
+                               cudaStream_t stream)
+{
+    if (local_elems > 0)
+    {
+        int threads_per_block = 256;
+        int blocks_per_grid = (local_elems + threads_per_block - 1) / threads_per_block;
+
+        extract_ekb_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(d_Eij,
+                                                                              d_eii,
+                                                                              lld,
+                                                                              local_elems,
+                                                                              nb,
+                                                                              dim0,
+                                                                              dim1,
+                                                                              my_prow,
+                                                                              my_pcol,
+                                                                              nband);
+    }
+}
+
+} // namespace gpu
+} // namespace module_rt
+#endif // __CUBLASMP
diff --git a/source/source_lcao/module_rt/kernels/cuda/band_energy_kernel.cuh b/source/source_lcao/module_rt/kernels/cuda/band_energy_kernel.cuh
new file mode 100644
index 0000000000..2c0198b8f5
--- /dev/null
+++ b/source/source_lcao/module_rt/kernels/cuda/band_energy_kernel.cuh
@@ -0,0 +1,31 @@
+#ifndef BAND_ENERGY_KERNEL_CUH
+#define BAND_ENERGY_KERNEL_CUH
+
+#include <cuda_runtime.h>
+
+#ifdef __CUBLASMP
+#include <cuComplex.h>
+
+namespace module_rt
+{
+namespace gpu
+{
+
+// Standard C++ wrapper to launch the diagonal extraction kernel
+void launch_extract_ekb_kernel(const cuDoubleComplex* d_Eij,
+                               double* d_eii,
+                               int lld,
+                               int local_elems,
+                               int nb,
+                               int dim0,
+                               int dim1,
+                               int my_prow,
+                               int my_pcol,
+                               int nband,
+                               cudaStream_t stream);
+
+} // namespace gpu
+} // namespace module_rt
+#endif // __CUBLASMP
+
+#endif // BAND_ENERGY_KERNEL_CUH
diff --git a/source/source_lcao/module_rt/kernels/cuda/norm_psi_kernel.cu b/source/source_lcao/module_rt/kernels/cuda/norm_psi_kernel.cu
new file mode 100644
index 0000000000..4008df990a
--- /dev/null
+++ b/source/source_lcao/module_rt/kernels/cuda/norm_psi_kernel.cu
@@ -0,0 +1,94 @@
+#include "norm_psi_kernel.cuh"
+
+#include <math_constants.h>
+
+#ifdef __CUBLASMP
+namespace module_rt
+{
+namespace gpu
+{
+
+// Device function for global index mapping
+__device__ inline int get_global_index_dev(int local_idx, int block_size, int num_procs, int proc_coord)
+{
+    return (local_idx / block_size) * (num_procs * block_size) + proc_coord * block_size + (local_idx % block_size);
+}
+
+// CUDA kernel to normalize the Cij matrix directly on the GPU
+__global__ void normalize_cij_kernel(cuDoubleComplex* d_Cij,
+                                     int lld,
+                                     int local_elems,
+                                     int nb,
+                                     int dim0,
+                                     int dim1,
+                                     int my_prow,
+                                     int my_pcol,
+                                     int nband)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // We iterate over the entire allocated buffer (local_elems)
+    if (idx >= local_elems)
+    {
+        return;
+    }
+
+    // Column-major indexing using the formal Leading Dimension (LLD)
+    int i = idx % lld;
+    int j = idx / lld;
+
+    int grow = get_global_index_dev(i, nb, dim0, my_prow);
+    int gcol = get_global_index_dev(j, nb, dim1, my_pcol);
+
+    // Filter out the "empty" spaces that are outside the nband x nband logic
+    if (grow >= nband || gcol >= nband)
+    {
+        return;
+    }
+
+    if (grow == gcol)
+    {
+        double val = cuCreal(d_Cij[idx]);
+        if (val < 1e-12)
+            val = 1e-12;
+        d_Cij[idx] = make_cuDoubleComplex(1.0 / sqrt(val), 0.0);
+    }
+    else
+    {
+        d_Cij[idx] = make_cuDoubleComplex(0.0, 0.0);
+    }
+}
+
+// Wrapper function implementation
+void launch_normalize_cij_kernel(cuDoubleComplex* d_Cij,
+                                 int nrow,
+                                 int ncol,
+                                 int nb,
+                                 int dim0,
+                                 int dim1,
+                                 int my_prow,
+                                 int my_pcol,
+                                 int nband,
+                                 cudaStream_t stream)
+{
+    int total_elems = nrow * ncol;
+    if (total_elems > 0)
+    {
+        int threads_per_block = 256;
+        int blocks_per_grid = (total_elems + threads_per_block - 1) / threads_per_block;
+
+        normalize_cij_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(d_Cij,
+                                                                                nrow,
+                                                                                ncol,
+                                                                                nb,
+                                                                                dim0,
+                                                                                dim1,
+                                                                                my_prow,
+                                                                                my_pcol,
+                                                                                nband);
+    }
+}
+
+} // namespace gpu
+} // namespace module_rt
+#endif // __CUBLASMP
diff --git a/source/source_lcao/module_rt/kernels/cuda/norm_psi_kernel.cuh b/source/source_lcao/module_rt/kernels/cuda/norm_psi_kernel.cuh
new file mode 100644
index 0000000000..af2fb0dd6e
--- /dev/null
+++ b/source/source_lcao/module_rt/kernels/cuda/norm_psi_kernel.cuh
@@ -0,0 +1,30 @@
+#ifndef NORM_PSI_KERNEL_CUH
+#define NORM_PSI_KERNEL_CUH
+
+#include <cuda_runtime.h>
+
+#ifdef __CUBLASMP
+#include <cuComplex.h>
+
+namespace module_rt
+{
+namespace gpu
+{
+
+// Standard C++ wrapper to launch the normalization kernel
+void launch_normalize_cij_kernel(cuDoubleComplex* d_Cij,
+                                 int nrow,
+                                 int ncol,
+                                 int nb,
+                                 int dim0,
+                                 int dim1,
+                                 int my_prow,
+                                 int my_pcol,
+                                 int nband,
+                                 cudaStream_t stream);
+
+} // namespace gpu
+} // namespace module_rt
+#endif // __CUBLASMP
+
+#endif // NORM_PSI_KERNEL_CUH
diff --git a/source/source_lcao/module_rt/middle_hamilt.cpp b/source/source_lcao/module_rt/middle_hamilt.cpp
index 10f432fbe9..a7b875a270 100644
--- a/source/source_lcao/module_rt/middle_hamilt.cpp
+++ b/source/source_lcao/module_rt/middle_hamilt.cpp
@@ -1,9 +1,12 @@
 #include "middle_hamilt.h"
 
+#include "source_base/global_variable.h"
 #include "source_base/module_container/ATen/kernels/blas.h"
 #include "source_base/module_device/memory_op.h" // memory operations
 #include "source_base/module_external/scalapack_connector.h"
+#include "source_base/timer.h"
 
+#include <cassert>
 #include <complex>
 #include <iostream>
 
@@ -80,88 +83,125 @@ void half_Hmatrix_tensor(const Parallel_Orbitals* pv,
                          const ct::Tensor& H_laststep,
                          const ct::Tensor& S_laststep,
                          std::ofstream& ofs_running,
-                         const int print_matrix)
+                         const int print_matrix,
+                         CublasMpResources& cublas_res)
 {
-    if (print_matrix)
+#ifdef __CUBLASMP
+    // 1. Validate resources and ensure the grid is properly initialized
+    if (!cublas_res.is_initialized || cublas_res.cublasmp_grid == nullptr)
     {
-        ofs_running << std::setprecision(10);
-        ofs_running << std::endl;
-        ofs_running << " H(t+dt) :" << std::endl;
-        for (int i = 0; i < pv->nrow; i++)
-        {
-            const int in = i * pv->ncol;
-            for (int j = 0; j < pv->ncol; j++)
-            {
-                ofs_running << Htmp.data<std::complex<double>>()[in + j].real() << "+"
-                            << Htmp.data<std::complex<double>>()[in + j].imag() << "i ";
-            }
-            ofs_running << std::endl;
-        }
-        ofs_running << std::endl;
-        ofs_running << std::endl;
-        ofs_running << " H(t):" << std::endl;
-        for (int i = 0; i < pv->nrow; i++)
-        {
-            const int in = i * pv->ncol;
-            for (int j = 0; j < pv->ncol; j++)
-            {
-                ofs_running << H_laststep.data<std::complex<double>>()[in + j].real() << "+"
-                            << H_laststep.data<std::complex<double>>()[in + j].imag() << "i ";
-            }
-            ofs_running << std::endl;
-        }
-        ofs_running << std::endl;
+        return;
     }
 
+    assert(Htmp.device_type() == ct::DeviceType::GpuDevice);
+    assert(Stmp.device_type() == ct::DeviceType::GpuDevice);
+    assert(H_laststep.device_type() == ct::DeviceType::GpuDevice);
+    assert(S_laststep.device_type() == ct::DeviceType::GpuDevice);
+
+    // 2. Extract device pointers
+    void* d_Htmp = static_cast<void*>(Htmp.data<std::complex<double>>());
+    void* d_Stmp = static_cast<void*>(Stmp.data<std::complex<double>>());
+    void* d_H_last = static_cast<void*>(const_cast<std::complex<double>*>(H_laststep.data<std::complex<double>>()));
+    void* d_S_last = static_cast<void*>(const_cast<std::complex<double>*>(S_laststep.data<std::complex<double>>()));
+
+    int64_t m_global = pv->desc[2];
+    int64_t n_global = pv->desc[3];
+    int64_t mb = pv->desc[4];
+    int64_t nb = pv->desc[5];
+    int64_t rsrc = pv->desc[6];
+    int64_t csrc = pv->desc[7];
+    int64_t lld = pv->desc[8];
+
+    // 3. Create matrix descriptor
+    cublasMpMatrixDescriptor_t desc_mat;
+    cublasMpMatrixDescriptorCreate(m_global,
+                                   n_global,
+                                   mb,
+                                   nb,
+                                   rsrc,
+                                   csrc,
+                                   lld,
+                                   CUDA_C_64F,
+                                   cublas_res.cublasmp_grid,
+                                   &desc_mat);
+
     std::complex<double> alpha = {0.5, 0.0};
     std::complex<double> beta = {0.5, 0.0};
 
-    // Perform the operation Htmp = alpha * H_laststep + beta * Htmp
-    ScalapackConnector::geadd('N',
-                              nlocal,
-                              nlocal,
-                              alpha,
-                              H_laststep.data<std::complex<double>>(),
-                              1,
-                              1,
-                              pv->desc,
-                              beta,
-                              Htmp.data<std::complex<double>>(),
-                              1,
-                              1,
-                              pv->desc);
-
-    // Perform the operation Stmp = alpha * S_laststep + beta * Stmp
-    ScalapackConnector::geadd('N',
-                              nlocal,
-                              nlocal,
-                              alpha,
-                              S_laststep.data<std::complex<double>>(),
-                              1,
-                              1,
-                              pv->desc,
-                              beta,
-                              Stmp.data<std::complex<double>>(),
-                              1,
-                              1,
-                              pv->desc);
+    size_t ws_size_dev = 0;
+    size_t ws_size_host = 0;
 
-    if (print_matrix)
-    {
-        ofs_running << std::endl;
-        ofs_running << " H (t+dt/2) :" << std::endl;
-        for (int i = 0; i < pv->nrow; i++)
-        {
-            const int in = i * pv->ncol;
-            for (int j = 0; j < pv->ncol; j++)
-            {
-                ofs_running << Htmp.data<std::complex<double>>()[in + j].real() << "+"
-                            << Htmp.data<std::complex<double>>()[in + j].imag() << "i ";
-            }
-            ofs_running << std::endl;
-        }
-        ofs_running << std::endl;
-    }
+    // 4. Query workspace size
+    cublasMpGeadd_bufferSize(cublas_res.cublasmp_handle,
+                             CUBLAS_OP_N,
+                             m_global,
+                             n_global,
+                             &alpha,
+                             d_H_last,
+                             1,
+                             1,
+                             desc_mat,
+                             &beta,
+                             d_Htmp,
+                             1,
+                             1,
+                             desc_mat,
+                             &ws_size_dev,
+                             &ws_size_host);
+
+    void* d_work = nullptr;
+    void* h_work = nullptr;
+
+    cudaMallocAsync(&d_work, ws_size_dev, cublas_res.stream);
+    h_work = malloc(ws_size_host);
+
+    // 5. Compute Htmp = 0.5 * H_last + 0.5 * Htmp
+    cublasMpGeadd(cublas_res.cublasmp_handle,
+                  CUBLAS_OP_N,
+                  m_global,
+                  n_global,
+                  &alpha,
+                  d_H_last,
+                  1,
+                  1,
+                  desc_mat,
+                  &beta,
+                  d_Htmp,
+                  1,
+                  1,
+                  desc_mat,
+                  d_work,
+                  ws_size_dev,
+                  h_work,
+                  ws_size_host);
+
+    // 6. Compute Stmp = 0.5 * S_last + 0.5 * Stmp
+    cublasMpGeadd(cublas_res.cublasmp_handle,
+                  CUBLAS_OP_N,
+                  m_global,
+                  n_global,
+                  &alpha,
+                  d_S_last,
+                  1,
+                  1,
+                  desc_mat,
+                  &beta,
+                  d_Stmp,
+                  1,
+                  1,
+                  desc_mat,
+                  d_work,
+                  ws_size_dev,
+                  h_work,
+                  ws_size_host);
+
+    // 7. Synchronize stream and release resources
+    cudaStreamSynchronize(cublas_res.stream);
+
+    cublasMpMatrixDescriptorDestroy(desc_mat);
+    cudaFreeAsync(d_work, cublas_res.stream);
+    free(h_work);
+#endif // __CUBLASMP
 }
 
 template <typename Device>
diff --git a/source/source_lcao/module_rt/middle_hamilt.h b/source/source_lcao/module_rt/middle_hamilt.h
index e505185b00..5b59154e4b 100644
--- a/source/source_lcao/module_rt/middle_hamilt.h
+++ b/source/source_lcao/module_rt/middle_hamilt.h
@@ -8,6 +8,7 @@
 
 #include "source_base/module_container/ATen/core/tensor.h" // ct::Tensor
 #include "source_basis/module_ao/parallel_orbitals.h"
+#include "source_lcao/module_rt/kernels/cublasmp_context.h"
 
 #include <complex>
 
@@ -43,7 +44,8 @@ void half_Hmatrix_tensor(const Parallel_Orbitals* pv,
                          const ct::Tensor& H_laststep,
                          const ct::Tensor& S_laststep,
                          std::ofstream& ofs_running,
-                         const int print_matrix);
+                         const int print_matrix,
+                         CublasMpResources& cublas_res);
 
 template <typename Device>
 void half_Hmatrix_tensor_lapack(const Parallel_Orbitals* pv,
diff --git a/source/source_lcao/module_rt/norm_psi.cpp b/source/source_lcao/module_rt/norm_psi.cpp
index f5a9c6c8b4..40884b02df 100644
--- a/source/source_lcao/module_rt/norm_psi.cpp
+++ b/source/source_lcao/module_rt/norm_psi.cpp
@@ -4,6 +4,11 @@
 #include "source_base/module_container/ATen/kernels/blas.h"
 #include "source_base/module_external/blas_connector.h"
 #include "source_base/module_external/scalapack_connector.h"
+#include "source_base/timer.h"
+
+#ifdef __CUBLASMP
+#include "kernels/cuda/norm_psi_kernel.cuh"
+#endif
 
 #include <cassert>
 #include <complex>
@@ -231,207 +236,264 @@ void norm_psi_tensor(const Parallel_Orbitals* pv,
                      const ct::Tensor& Stmp,
                      ct::Tensor& psi_k,
                      std::ofstream& ofs_running,
-                     const int print_matrix)
+                     const int print_matrix,
+                     CublasMpResources& cublas_res)
 {
-    assert(pv->nloc_wfc > 0 && pv->nloc > 0);
-
-    // Create Tensor objects for temporary data
-    ct::Tensor tmp1(ct::DataType::DT_COMPLEX_DOUBLE, ct::DeviceType::CpuDevice, ct::TensorShape({pv->nloc_wfc}));
-    tmp1.zero();
-
-    ct::Tensor Cij(ct::DataType::DT_COMPLEX_DOUBLE, ct::DeviceType::CpuDevice, ct::TensorShape({pv->nloc}));
-    Cij.zero();
-
-    // Perform matrix multiplication: tmp1 = Stmp * psi_k
-    ScalapackConnector::gemm('N',
-                             'N',
-                             nlocal,
-                             nband,
-                             nlocal,
-                             1.0,
-                             Stmp.data<std::complex<double>>(),
-                             1,
-                             1,
-                             pv->desc,
-                             psi_k.data<std::complex<double>>(),
-                             1,
-                             1,
-                             pv->desc_wfc,
-                             0.0,
-                             tmp1.data<std::complex<double>>(),
-                             1,
-                             1,
-                             pv->desc_wfc);
-
-    // Perform matrix multiplication: Cij = psi_k^dagger * tmp1
-    ScalapackConnector::gemm('C',
-                             'N',
-                             nband,
-                             nband,
-                             nlocal,
-                             1.0,
-                             psi_k.data<std::complex<double>>(),
-                             1,
-                             1,
-                             pv->desc_wfc,
-                             tmp1.data<std::complex<double>>(),
-                             1,
-                             1,
-                             pv->desc_wfc,
-                             0.0,
-                             Cij.data<std::complex<double>>(),
-                             1,
-                             1,
-                             pv->desc_Eij);
-
-    if (print_matrix)
+#ifdef __CUBLASMP
+    if (!cublas_res.is_initialized || cublas_res.cublasmp_grid == nullptr)
     {
-        ofs_running << "original Cij :" << std::endl;
-        for (int i = 0; i < pv->ncol; i++)
-        {
-            const int in = i * pv->ncol;
-            for (int j = 0; j < pv->nrow; j++)
-            {
-                double aa = Cij.data<std::complex<double>>()[in + j].real();
-                double bb = Cij.data<std::complex<double>>()[in + j].imag();
-                if (std::abs(aa) < 1e-8)
-                {
-                    aa = 0.0;
-                }
-                if (std::abs(bb) < 1e-8)
-                {
-                    bb = 0.0;
-                }
-                ofs_running << aa << "+" << bb << "i ";
-            }
-            ofs_running << std::endl;
-        }
-        ofs_running << std::endl;
+        return;
     }
 
-    int naroc[2] = {0, 0}; // maximum number of row or column
+    void* d_S = static_cast<void*>(const_cast<std::complex<double>*>(Stmp.data<std::complex<double>>()));
+    void* d_Psi = static_cast<void*>(psi_k.data<std::complex<double>>());
+    int64_t psi_elems = psi_k.NumElements();
 
-    for (int iprow = 0; iprow < pv->dim0; ++iprow)
-    {
-        for (int ipcol = 0; ipcol < pv->dim1; ++ipcol)
-        {
-            if (iprow == pv->coord[0] && ipcol == pv->coord[1])
-            {
-                naroc[0] = pv->nrow;
-                naroc[1] = pv->ncol;
-                for (int j = 0; j < naroc[1]; ++j)
-                {
-                    int igcol = globalIndex(j, pv->nb, pv->dim1, ipcol);
-                    if (igcol >= nband)
-                    {
-                        continue;
-                    }
-                    for (int i = 0; i < naroc[0]; ++i)
-                    {
-                        int igrow = globalIndex(i, pv->nb, pv->dim0, iprow);
-                        if (igrow >= nband)
-                        {
-                            continue;
-                        }
-                        if (igcol == igrow)
-                        {
-                            Cij.data<std::complex<double>>()[j * naroc[0] + i]
-                                = {1.0 / sqrt(Cij.data<std::complex<double>>()[j * naroc[0] + i].real()), 0.0};
-                        }
-                        else
-                        {
-                            Cij.data<std::complex<double>>()[j * naroc[0] + i] = {0.0, 0.0};
-                        }
-                    }
-                }
-            }
-        } // loop ipcol
-    } // loop iprow
+    ct::Tensor Tmp1_gpu(ct::DataType::DT_COMPLEX_DOUBLE, ct::DeviceType::GpuDevice, ct::TensorShape({psi_elems}));
+    void* d_Tmp1 = static_cast<void*>(Tmp1_gpu.data<std::complex<double>>());
 
-    // Copy psi_k to tmp1 (using deep copy)
-    // tmp1.CopyFrom(psi_k); // Does not work because this will cause tmp1 and psi_k to share the same data
-    tmp1 = psi_k; // operator= overload for Tensor class
+    int64_t cij_elems = pv->nloc;
+    ct::Tensor Cij_gpu(ct::DataType::DT_COMPLEX_DOUBLE, ct::DeviceType::GpuDevice, ct::TensorShape({cij_elems}));
+    void* d_Cij = static_cast<void*>(Cij_gpu.data<std::complex<double>>());
 
-    // Perform matrix multiplication: psi_k = tmp1 * Cij
-    ScalapackConnector::gemm('N',
-                             'N',
-                             nlocal,
-                             nband,
-                             nband,
-                             1.0,
-                             tmp1.data<std::complex<double>>(),
-                             1,
-                             1,
-                             pv->desc_wfc,
-                             Cij.data<std::complex<double>>(),
-                             1,
-                             1,
-                             pv->desc_Eij,
-                             0.0,
-                             psi_k.data<std::complex<double>>(),
-                             1,
-                             1,
-                             pv->desc_wfc);
+    cudaMemsetAsync(d_Cij, 0, cij_elems * sizeof(std::complex<double>), cublas_res.stream);
 
-    if (print_matrix)
-    {
-        ofs_running << " Cij:" << std::endl;
-        for (int i = 0; i < pv->ncol; i++)
-        {
-            const int in = i * pv->ncol;
-            for (int j = 0; j < pv->nrow; j++)
-            {
-                ofs_running << Cij.data<std::complex<double>>()[in + j].real() << "+"
-                            << Cij.data<std::complex<double>>()[in + j].imag() << "i ";
-            }
-            ofs_running << std::endl;
-        }
-        ofs_running << std::endl;
-        ofs_running << std::endl;
-        ofs_running << " psi_k:" << std::endl;
-        for (int i = 0; i < pv->ncol_bands; i++)
-        {
-            const int in = i * pv->ncol;
-            for (int j = 0; j < pv->ncol; j++)
-            {
-                double aa = psi_k.data<std::complex<double>>()[in + j].real();
-                double bb = psi_k.data<std::complex<double>>()[in + j].imag();
-                if (std::abs(aa) < 1e-8)
-                {
-                    aa = 0.0;
-                }
-                if (std::abs(bb) < 1e-8)
-                {
-                    bb = 0.0;
-                }
-                ofs_running << aa << "+" << bb << "i ";
-            }
-            ofs_running << std::endl;
-        }
-        ofs_running << std::endl;
-        ofs_running << " psi_k before normalization:" << std::endl;
-        for (int i = 0; i < pv->ncol_bands; i++)
-        {
-            const int in = i * pv->ncol;
-            for (int j = 0; j < pv->ncol; j++)
-            {
-                double aa = tmp1.data<std::complex<double>>()[in + j].real();
-                double bb = tmp1.data<std::complex<double>>()[in + j].imag();
-                if (std::abs(aa) < 1e-8)
-                {
-                    aa = 0.0;
-                }
-                if (std::abs(bb) < 1e-8)
-                {
-                    bb = 0.0;
-                }
-                ofs_running << aa << "+" << bb << "i ";
-            }
-            ofs_running << std::endl;
-        }
-        ofs_running << std::endl;
-        ofs_running << std::endl;
-    }
+    std::complex<double> alpha = {1.0, 0.0};
+    std::complex<double> beta = {0.0, 0.0};
+
+    cublasMpMatrixDescriptor_t desc_S, desc_Psi, desc_Cij;
+
+    cublasMpMatrixDescriptorCreate(nlocal,
+                                   nlocal,
+                                   pv->desc[4],
+                                   pv->desc[5],
+                                   0,
+                                   0,
+                                   pv->desc[8],
+                                   CUDA_C_64F,
+                                   cublas_res.cublasmp_grid,
+                                   &desc_S);
+
+    cublasMpMatrixDescriptorCreate(nlocal,
+                                   nband,
+                                   pv->desc_wfc[4],
+                                   pv->desc_wfc[5],
+                                   0,
+                                   0,
+                                   pv->desc_wfc[8],
+                                   CUDA_C_64F,
+                                   cublas_res.cublasmp_grid,
+                                   &desc_Psi);
+
+    cublasMpMatrixDescriptorCreate(nband,
+                                   nband,
+                                   pv->desc_Eij[4],
+                                   pv->desc_Eij[5],
+                                   0,
+                                   0,
+                                   pv->desc_Eij[8],
+                                   CUDA_C_64F,
+                                   cublas_res.cublasmp_grid,
+                                   &desc_Cij);
+
+    size_t ws_dev = 0, ws_host = 0;
+    void *d_work = nullptr, *h_work = nullptr;
+
+    // GEMM 1: S * Psi -> Tmp1
+    cublasMpGemm_bufferSize(cublas_res.cublasmp_handle,
+                            CUBLAS_OP_N,
+                            CUBLAS_OP_N,
+                            nlocal,
+                            nband,
+                            nlocal,
+                            &alpha,
+                            d_S,
+                            1,
+                            1,
+                            desc_S,
+                            d_Psi,
+                            1,
+                            1,
+                            desc_Psi,
+                            &beta,
+                            d_Tmp1,
+                            1,
+                            1,
+                            desc_Psi,
+                            CUBLAS_COMPUTE_64F,
+                            &ws_dev,
+                            &ws_host);
+
+    cudaMallocAsync(&d_work, ws_dev, cublas_res.stream);
+    h_work = malloc(ws_host);
+
+    cublasMpGemm(cublas_res.cublasmp_handle,
+                 CUBLAS_OP_N,
+                 CUBLAS_OP_N,
+                 nlocal,
+                 nband,
+                 nlocal,
+                 &alpha,
+                 d_S,
+                 1,
+                 1,
+                 desc_S,
+                 d_Psi,
+                 1,
+                 1,
+                 desc_Psi,
+                 &beta,
+                 d_Tmp1,
+                 1,
+                 1,
+                 desc_Psi,
+                 CUBLAS_COMPUTE_64F,
+                 d_work,
+                 ws_dev,
+                 h_work,
+                 ws_host);
+
+    cudaFreeAsync(d_work, cublas_res.stream);
+    free(h_work);
+
+    // GEMM 2: Psi^H * Tmp1 -> Cij
+    cublasMpGemm_bufferSize(cublas_res.cublasmp_handle,
+                            CUBLAS_OP_C,
+                            CUBLAS_OP_N,
+                            nband,
+                            nband,
+                            nlocal,
+                            &alpha,
+                            d_Psi,
+                            1,
+                            1,
+                            desc_Psi,
+                            d_Tmp1,
+                            1,
+                            1,
+                            desc_Psi,
+                            &beta,
+                            d_Cij,
+                            1,
+                            1,
+                            desc_Cij,
+                            CUBLAS_COMPUTE_64F,
+                            &ws_dev,
+                            &ws_host);
+
+    cudaMallocAsync(&d_work, ws_dev, cublas_res.stream);
+    h_work = malloc(ws_host);
+
+    cublasMpGemm(cublas_res.cublasmp_handle,
+                 CUBLAS_OP_C,
+                 CUBLAS_OP_N,
+                 nband,
+                 nband,
+                 nlocal,
+                 &alpha,
+                 d_Psi,
+                 1,
+                 1,
+                 desc_Psi,
+                 d_Tmp1,
+                 1,
+                 1,
+                 desc_Psi,
+                 &beta,
+                 d_Cij,
+                 1,
+                 1,
+                 desc_Cij,
+                 CUBLAS_COMPUTE_64F,
+                 d_work,
+                 ws_dev,
+                 h_work,
+                 ws_host);
+
+    cudaFreeAsync(d_work, cublas_res.stream);
+    free(h_work);
+
+    // Launch GPU In-place Normalization using the C++ wrapper
+    module_rt::gpu::launch_normalize_cij_kernel(reinterpret_cast<cuDoubleComplex*>(d_Cij),
+                                                pv->desc_Eij[8],
+                                                pv->nloc,
+                                                pv->desc_Eij[4],
+                                                pv->dim0,
+                                                pv->dim1,
+                                                pv->coord[0],
+                                                pv->coord[1],
+                                                nband,
+                                                cublas_res.stream);
+
+    // GEMM 3: Tmp1 * Cij -> Psi
+    cudaMemcpyAsync(d_Tmp1,
+                    d_Psi,
+                    psi_elems * sizeof(std::complex<double>),
+                    cudaMemcpyDeviceToDevice,
+                    cublas_res.stream);
+
+    cublasMpGemm_bufferSize(cublas_res.cublasmp_handle,
+                            CUBLAS_OP_N,
+                            CUBLAS_OP_N,
+                            nlocal,
+                            nband,
+                            nband,
+                            &alpha,
+                            d_Tmp1,
+                            1,
+                            1,
+                            desc_Psi,
+                            d_Cij,
+                            1,
+                            1,
+                            desc_Cij,
+                            &beta,
+                            d_Psi,
+                            1,
+                            1,
+                            desc_Psi,
+                            CUBLAS_COMPUTE_64F,
+                            &ws_dev,
+                            &ws_host);
+
+    cudaMallocAsync(&d_work, ws_dev, cublas_res.stream);
+    h_work = malloc(ws_host);
+
+    cublasMpGemm(cublas_res.cublasmp_handle,
+                 CUBLAS_OP_N,
+                 CUBLAS_OP_N,
+                 nlocal,
+                 nband,
+                 nband,
+                 &alpha,
+                 d_Tmp1,
+                 1,
+                 1,
+                 desc_Psi,
+                 d_Cij,
+                 1,
+                 1,
+                 desc_Cij,
+                 &beta,
+                 d_Psi,
+                 1,
+                 1,
+                 desc_Psi,
+                 CUBLAS_COMPUTE_64F,
+                 d_work,
+                 ws_dev,
+                 h_work,
+                 ws_host);
+
+    cudaStreamSynchronize(cublas_res.stream);
+
+    cublasMpMatrixDescriptorDestroy(desc_S);
+    cublasMpMatrixDescriptorDestroy(desc_Psi);
+    cublasMpMatrixDescriptorDestroy(desc_Cij);
+
+    cudaFreeAsync(d_work, cublas_res.stream);
+    free(h_work);
+#endif // __CUBLASMP
 }
 
 template <typename Device>
diff --git a/source/source_lcao/module_rt/norm_psi.h b/source/source_lcao/module_rt/norm_psi.h
index 9c9435f318..f61aae86ec 100644
--- a/source/source_lcao/module_rt/norm_psi.h
+++ b/source/source_lcao/module_rt/norm_psi.h
@@ -8,6 +8,7 @@
 
 #include "source_base/module_container/ATen/core/tensor.h" // ct::Tensor
 #include "source_basis/module_ao/parallel_orbitals.h"
+#include "source_lcao/module_rt/kernels/cublasmp_context.h"
 
 #include <complex>
 
@@ -39,7 +40,8 @@ void norm_psi_tensor(const Parallel_Orbitals* pv,
                      const ct::Tensor& Stmp,
                      ct::Tensor& psi_k,
                      std::ofstream& ofs_running,
-                     const int print_matrix);
+                     const int print_matrix,
+                     CublasMpResources& cublas_res);
 
 template <typename Device>
 void norm_psi_tensor_lapack(const Parallel_Orbitals* pv,
diff --git a/source/source_lcao/module_rt/propagator.cpp b/source/source_lcao/module_rt/propagator.cpp
index 2854839066..b74b4fb8dd 100644
--- a/source/source_lcao/module_rt/propagator.cpp
+++ b/source/source_lcao/module_rt/propagator.cpp
@@ -54,7 +54,8 @@ void Propagator::compute_propagator_tensor(const int nlocal,
                                            ct::Tensor& U_operator,
                                            std::ofstream& ofs_running,
                                            const int print_matrix,
-                                           const bool use_lapack) const
+                                           const bool use_lapack,
+                                           CublasMpResources& cublas_res) const
 {
     int tag = 0;
     switch (ptype)
@@ -62,7 +63,7 @@ void Propagator::compute_propagator_tensor(const int nlocal,
     case 0:
         if (!use_lapack)
         {
-            compute_propagator_cn2_tensor(nlocal, Stmp, Htmp, U_operator, ofs_running, print_matrix);
+            compute_propagator_cn2_tensor(nlocal, Stmp, Htmp, U_operator, ofs_running, print_matrix, cublas_res);
         }
         else
         {
@@ -91,7 +92,8 @@ template void Propagator::compute_propagator_tensor<base_device::DEVICE_CPU>(con
                                                                              ct::Tensor& U_operator,
                                                                              std::ofstream& ofs_running,
                                                                              const int print_matrix,
-                                                                             const bool use_lapack) const;
+                                                                             const bool use_lapack,
+                                                                             CublasMpResources& cublas_res) const;
 #if ((defined __CUDA) /* || (defined __ROCM) */)
 template void Propagator::compute_propagator_tensor<base_device::DEVICE_GPU>(const int nlocal,
                                                                              const ct::Tensor& Stmp,
@@ -100,7 +102,8 @@ template void Propagator::compute_propagator_tensor<base_device::DEVICE_GPU>(con
                                                                              ct::Tensor& U_operator,
                                                                              std::ofstream& ofs_running,
                                                                              const int print_matrix,
-                                                                             const bool use_lapack) const;
+                                                                             const bool use_lapack,
+                                                                             CublasMpResources& cublas_res) const;
 #endif // __CUDA
 #endif // __MPI
 } // namespace module_rt
diff --git a/source/source_lcao/module_rt/propagator.h b/source/source_lcao/module_rt/propagator.h
index ca9c7c140b..7bc12e9715 100644
--- a/source/source_lcao/module_rt/propagator.h
+++ b/source/source_lcao/module_rt/propagator.h
@@ -9,6 +9,7 @@
 #include "source_base/constants.h"
 #include "source_base/module_container/ATen/core/tensor.h" // ct::Tensor
 #include "source_basis/module_ao/parallel_orbitals.h"
+#include "source_lcao/module_rt/kernels/cublasmp_context.h"
 
 #include <complex>
 
@@ -139,7 +140,8 @@ class Propagator
                                    ct::Tensor& U_operator,
                                    std::ofstream& ofs_running,
                                    const int print_matrix,
-                                   const bool use_lapack) const;
+                                   const bool use_lapack,
+                                   CublasMpResources& cublas_res) const;
 #endif // __MPI
 
   private:
@@ -170,7 +172,8 @@ class Propagator
                                        const ct::Tensor& Htmp,
                                        ct::Tensor& U_operator,
                                        std::ofstream& ofs_running,
-                                       const int print_matrix) const;
+                                       const int print_matrix,
+                                       CublasMpResources& cublas_res) const;
 
     template <typename Device>
     void compute_propagator_cn2_tensor_lapack(const int nlocal,
diff --git a/source/source_lcao/module_rt/propagator_cn2.cpp b/source/source_lcao/module_rt/propagator_cn2.cpp
index 8ef07b0ebb..3f85ed26a0 100644
--- a/source/source_lcao/module_rt/propagator_cn2.cpp
+++ b/source/source_lcao/module_rt/propagator_cn2.cpp
@@ -6,6 +6,7 @@
 #include "source_base/module_device/memory_op.h"              // memory operations
 #include "source_base/module_external/blas_connector.h"
 #include "source_base/module_external/scalapack_connector.h"
+#include "source_base/timer.h"
 #include "source_io/module_parameter/parameter.h"
 
 #include <cassert>
@@ -252,249 +253,304 @@ void Propagator::compute_propagator_cn2_tensor(const int nlocal,
                                                const ct::Tensor& Htmp,
                                                ct::Tensor& U_operator,
                                                std::ofstream& ofs_running,
-                                               const int print_matrix) const
+                                               const int print_matrix,
+                                               CublasMpResources& cublas_res) const
 {
-    // (1) copy Htmp to Numerator & Denominator
-    ct::Tensor Numerator(ct::DataType::DT_COMPLEX_DOUBLE,
-                         ct::DeviceType::CpuDevice,
-                         ct::TensorShape({this->ParaV->nloc}));
-    Numerator.zero();
-    BlasConnector::copy(this->ParaV->nloc,
-                        Htmp.data<std::complex<double>>(),
-                        1,
-                        Numerator.data<std::complex<double>>(),
-                        1);
-
-    ct::Tensor Denominator(ct::DataType::DT_COMPLEX_DOUBLE,
-                           ct::DeviceType::CpuDevice,
-                           ct::TensorShape({this->ParaV->nloc}));
-    Denominator.zero();
-    BlasConnector::copy(this->ParaV->nloc,
-                        Htmp.data<std::complex<double>>(),
-                        1,
-                        Denominator.data<std::complex<double>>(),
-                        1);
-
-    if (print_matrix)
+#ifdef __CUBLASMP
+    // 1. Resource Validation
+    if (!cublas_res.is_initialized || cublas_res.cublasmp_grid == nullptr || cublas_res.cusolvermp_grid == nullptr)
     {
-        ofs_running << std::endl;
-        ofs_running << " S matrix :" << std::endl;
-        for (int i = 0; i < this->ParaV->nrow; i++)
-        {
-            const int in = i * this->ParaV->ncol;
-            for (int j = 0; j < this->ParaV->ncol; j++)
-            {
-                ofs_running << Stmp.data<std::complex<double>>()[in + j].real() << "+"
-                            << Stmp.data<std::complex<double>>()[in + j].imag() << "i ";
-            }
-            ofs_running << std::endl;
-        }
-        ofs_running << std::endl;
-        ofs_running << std::endl;
-        ofs_running << " H matrix :" << std::endl;
-        for (int i = 0; i < this->ParaV->nrow; i++)
-        {
-            const int in = i * this->ParaV->ncol;
-            for (int j = 0; j < this->ParaV->ncol; j++)
-            {
-                ofs_running << Numerator.data<std::complex<double>>()[in + j].real() << "+"
-                            << Numerator.data<std::complex<double>>()[in + j].imag() << "i ";
-            }
-            ofs_running << std::endl;
-        }
-        ofs_running << std::endl;
+        return;
     }
 
-    // ->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-    // (2) compute Numerator & Denominator by GEADD
-    // Numerator = Stmp - i*para * Htmp;     beta1 = - para = -0.25 * this->dt
-    // Denominator = Stmp + i*para * Htmp;   beta2 = para = 0.25 * this->dt
-    std::complex<double> alpha = {1.0, 0.0};
-    std::complex<double> beta1 = {0.0, -0.25 * this->dt};
-    std::complex<double> beta2 = {0.0, 0.25 * this->dt};
-
-    ScalapackConnector::geadd('N',
-                              nlocal,
-                              nlocal,
-                              alpha,
-                              Stmp.data<std::complex<double>>(),
-                              1,
-                              1,
-                              this->ParaV->desc,
-                              beta1,
-                              Numerator.data<std::complex<double>>(),
-                              1,
-                              1,
-                              this->ParaV->desc);
-    ScalapackConnector::geadd('N',
-                              nlocal,
-                              nlocal,
-                              alpha,
-                              Stmp.data<std::complex<double>>(),
-                              1,
-                              1,
-                              this->ParaV->desc,
-                              beta2,
-                              Denominator.data<std::complex<double>>(),
-                              1,
-                              1,
-                              this->ParaV->desc);
-
-    if (print_matrix)
-    {
-        ofs_running << " beta=" << beta1 << std::endl;
-        ofs_running << " fenmu:" << std::endl;
-        for (int i = 0; i < this->ParaV->nrow; i++)
-        {
-            const int in = i * this->ParaV->ncol;
-            for (int j = 0; j < this->ParaV->ncol; j++)
-            {
-                ofs_running << Denominator.data<std::complex<double>>()[in + j].real() << "+"
-                            << Denominator.data<std::complex<double>>()[in + j].imag() << "i ";
-            }
-            ofs_running << std::endl;
-        }
-        ofs_running << std::endl;
-    }
-
-    //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-    // (3) Next, invert Denominator
-    ct::Tensor ipiv(ct::DataType::DT_INT,
-                    ct::DeviceType::CpuDevice,
-                    ct::TensorShape({this->ParaV->nrow + this->ParaV->nb}));
-    ipiv.zero();
-    int info = 0;
-    // (3.1) compute ipiv
-    ScalapackConnector::getrf(nlocal,
-                              nlocal,
-                              Denominator.data<std::complex<double>>(),
-                              1,
-                              1,
-                              this->ParaV->desc,
-                              ipiv.data<int>(),
-                              &info);
-
-    // Print ipiv
-    if (print_matrix)
-    {
-        ofs_running << " this->ParaV->nloc = " << this->ParaV->nloc << std::endl;
-        ofs_running << " this->ParaV->nrow = " << this->ParaV->nrow << std::endl;
-        ofs_running << " this->ParaV->ncol = " << this->ParaV->ncol << std::endl;
-        ofs_running << " this->ParaV->nb = " << this->ParaV->nb << std::endl;
-        ofs_running << " this->ParaV->get_block_size() = " << this->ParaV->get_block_size() << std::endl;
-        ofs_running << " nlocal = " << nlocal << std::endl;
-        ofs_running << " ipiv:" << std::endl;
-        for (int i = 0; i < this->ParaV->nloc; i++)
-        {
-            ofs_running << ipiv.data<int>()[i] << " ";
-        }
-        ofs_running << std::endl;
-    }
-
-    int lwork = -1;
-    int liwotk = -1;
-    ct::Tensor work(ct::DataType::DT_COMPLEX_DOUBLE, ct::DeviceType::CpuDevice, ct::TensorShape({1}));
-    ct::Tensor iwork(ct::DataType::DT_INT, ct::DeviceType::CpuDevice, ct::TensorShape({1}));
-    // (3.2) compute work
-    ScalapackConnector::getri(nlocal,
-                              Denominator.data<std::complex<double>>(),
-                              1,
-                              1,
-                              this->ParaV->desc,
-                              ipiv.data<int>(),
-                              work.data<std::complex<double>>(),
-                              &lwork,
-                              iwork.data<int>(),
-                              &liwotk,
-                              &info);
-    lwork = work.data<std::complex<double>>()[0].real();
-    work.resize(ct::TensorShape({lwork}));
-    liwotk = iwork.data<int>()[0];
-    iwork.resize(ct::TensorShape({liwotk}));
-    // (3.3) compute inverse matrix of Denominator
-    ScalapackConnector::getri(nlocal,
-                              Denominator.data<std::complex<double>>(),
-                              1,
-                              1,
-                              this->ParaV->desc,
-                              ipiv.data<int>(),
-                              work.data<std::complex<double>>(),
-                              &lwork,
-                              iwork.data<int>(),
-                              &liwotk,
-                              &info);
-    assert(0 == info);
-
-    //->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-
-    // (4) U_operator = Denominator * Numerator;
-    ScalapackConnector::gemm('N',
-                             'N',
-                             nlocal,
-                             nlocal,
-                             nlocal,
-                             1.0,
-                             Denominator.data<std::complex<double>>(),
-                             1,
+    assert(Stmp.device_type() == ct::DeviceType::GpuDevice);
+    assert(Htmp.device_type() == ct::DeviceType::GpuDevice);
+    assert(U_operator.device_type() == ct::DeviceType::GpuDevice);
+
+    // 2. Extract Pointers
+    void* d_S = static_cast<void*>(Stmp.data<std::complex<double>>());
+    void* d_H = static_cast<void*>(Htmp.data<std::complex<double>>());
+    void* d_Num = static_cast<void*>(U_operator.data<std::complex<double>>());
+
+    int64_t len_loc = this->ParaV->nloc;
+
+    // Allocate temporary tensor for denominator matrix
+    ct::Tensor Denominator_gpu(ct::DataType::DT_COMPLEX_DOUBLE, ct::DeviceType::GpuDevice, ct::TensorShape({len_loc}));
+    void* d_Den = static_cast<void*>(Denominator_gpu.data<std::complex<double>>());
+
+    // 3. Matrix Descriptors Creation
+    int64_t m_global = this->ParaV->desc[2];
+    int64_t n_global = this->ParaV->desc[3];
+    int64_t mb = this->ParaV->desc[4];
+    int64_t nb = this->ParaV->desc[5];
+    int64_t rsrc = this->ParaV->desc[6];
+    int64_t csrc = this->ParaV->desc[7];
+    int64_t lld = this->ParaV->desc[8];
+
+    // 3.1 cuBLASMp Descriptor
+    cublasMpMatrixDescriptor_t desc_blas;
+    cublasMpMatrixDescriptorCreate(m_global,
+                                   n_global,
+                                   mb,
+                                   nb,
+                                   rsrc,
+                                   csrc,
+                                   lld,
+                                   CUDA_C_64F,
+                                   cublas_res.cublasmp_grid,
+                                   &desc_blas);
+
+    // 3.2 cuSOLVERMp Descriptor
+    cusolverMpMatrixDescriptor_t desc_solver;
+    cusolverMpCreateMatrixDesc(&desc_solver,
+                               cublas_res.cusolvermp_grid,
+                               CUDA_C_64F,
+                               m_global,
+                               n_global,
+                               mb,
+                               nb,
+                               rsrc,
+                               csrc,
+                               lld);
+
+    // 4. Construct A (Denominator) and B (Numerator) using Geadd
+    std::complex<double> one = {1.0, 0.0};
+    std::complex<double> coef_neg_i = {0.0, -0.25 * this->dt};
+    std::complex<double> coef_pos_i = {0.0, 0.25 * this->dt};
+
+    cudaMemcpyAsync(d_Num, d_S, len_loc * sizeof(std::complex<double>), cudaMemcpyDeviceToDevice, cublas_res.stream);
+    cudaMemcpyAsync(d_Den, d_S, len_loc * sizeof(std::complex<double>), cudaMemcpyDeviceToDevice, cublas_res.stream);
+
+    size_t ws_geadd_dev = 0, ws_geadd_host = 0;
+    cublasMpGeadd_bufferSize(cublas_res.cublasmp_handle,
+                             CUBLAS_OP_N,
+                             m_global,
+                             n_global,
+                             &coef_neg_i,
+                             d_H,
                              1,
-                             this->ParaV->desc,
-                             Numerator.data<std::complex<double>>(),
                              1,
-                             1,
-                             this->ParaV->desc,
-                             0.0,
-                             U_operator.data<std::complex<double>>(),
+                             desc_blas,
+                             &one,
+                             d_Num,
                              1,
                              1,
-                             this->ParaV->desc);
-
-    if (print_matrix)
+                             desc_blas,
+                             &ws_geadd_dev,
+                             &ws_geadd_host);
+
+    void *d_work_geadd = nullptr, *h_work_geadd = nullptr;
+    cudaMallocAsync(&d_work_geadd, ws_geadd_dev, cublas_res.stream);
+    h_work_geadd = malloc(ws_geadd_host);
+
+    // B = S - i * (dt/4) * H
+    cublasMpGeadd(cublas_res.cublasmp_handle,
+                  CUBLAS_OP_N,
+                  m_global,
+                  n_global,
+                  &coef_neg_i,
+                  d_H,
+                  1,
+                  1,
+                  desc_blas,
+                  &one,
+                  d_Num,
+                  1,
+                  1,
+                  desc_blas,
+                  d_work_geadd,
+                  ws_geadd_dev,
+                  h_work_geadd,
+                  ws_geadd_host);
+
+    // A = S + i * (dt/4) * H
+    cublasMpGeadd(cublas_res.cublasmp_handle,
+                  CUBLAS_OP_N,
+                  m_global,
+                  n_global,
+                  &coef_pos_i,
+                  d_H,
+                  1,
+                  1,
+                  desc_blas,
+                  &one,
+                  d_Den,
+                  1,
+                  1,
+                  desc_blas,
+                  d_work_geadd,
+                  ws_geadd_dev,
+                  h_work_geadd,
+                  ws_geadd_host);
+
+    cudaFreeAsync(d_work_geadd, cublas_res.stream);
+    free(h_work_geadd);
+
+    // 5. QR Factorization of A (Denominator)
+    int64_t tau_size = m_global + nb;
+    void* d_tau;
+    cudaMallocAsync(&d_tau, tau_size * sizeof(std::complex<double>), cublas_res.stream);
+
+    int* d_info;
+    cudaMallocAsync(&d_info, sizeof(int), cublas_res.stream);
+    cudaMemsetAsync(d_info, 0, sizeof(int), cublas_res.stream);
+
+    size_t ws_geqrf_dev = 0, ws_geqrf_host = 0;
+    cusolverMpGeqrf_bufferSize(cublas_res.cusolvermp_handle,
+                               m_global,
+                               n_global,
+                               d_Den,
+                               1,
+                               1,
+                               desc_solver,
+                               CUDA_C_64F,
+                               &ws_geqrf_dev,
+                               &ws_geqrf_host);
+
+    void *d_work_geqrf = nullptr, *h_work_geqrf = nullptr;
+    cudaMallocAsync(&d_work_geqrf, ws_geqrf_dev, cublas_res.stream);
+    h_work_geqrf = malloc(ws_geqrf_host);
+
+    cusolverMpGeqrf(cublas_res.cusolvermp_handle,
+                    m_global,
+                    n_global,
+                    d_Den,
+                    1,
+                    1,
+                    desc_solver,
+                    d_tau,
+                    CUDA_C_64F,
+                    d_work_geqrf,
+                    ws_geqrf_dev,
+                    h_work_geqrf,
+                    ws_geqrf_host,
+                    d_info);
+
+    cudaFreeAsync(d_work_geqrf, cublas_res.stream);
+    free(h_work_geqrf);
+
+    // Check QR Info
+    int h_info = 0;
+    cudaMemcpyAsync(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost, cublas_res.stream);
+    cudaStreamSynchronize(cublas_res.stream);
+    if (h_info != 0)
     {
-        ofs_running << " fenmu^-1:" << std::endl;
-        for (int i = 0; i < this->ParaV->nrow; i++)
-        {
-            const int in = i * this->ParaV->ncol;
-            for (int j = 0; j < this->ParaV->ncol; j++)
-            {
-                ofs_running << Denominator.data<std::complex<double>>()[in + j].real() << "+"
-                            << Denominator.data<std::complex<double>>()[in + j].imag() << "i ";
-            }
-            ofs_running << std::endl;
-        }
-        ofs_running << std::endl;
-        ofs_running << " fenzi:" << std::endl;
-        for (int i = 0; i < this->ParaV->nrow; i++)
-        {
-            const int in = i * this->ParaV->ncol;
-            for (int j = 0; j < this->ParaV->ncol; j++)
-            {
-                ofs_running << Numerator.data<std::complex<double>>()[in + j].real() << "+"
-                            << Numerator.data<std::complex<double>>()[in + j].imag() << "i ";
-            }
-            ofs_running << std::endl;
-        }
-        ofs_running << std::endl;
-        ofs_running << " U operator:" << std::endl;
-        for (int i = 0; i < this->ParaV->nrow; i++)
-        {
-            const int in = i * this->ParaV->ncol;
-            for (int j = 0; j < this->ParaV->ncol; j++)
-            {
-                double aa = U_operator.data<std::complex<double>>()[in + j].real();
-                double bb = U_operator.data<std::complex<double>>()[in + j].imag();
-                if (std::abs(aa) < 1e-8)
-                {
-                    aa = 0.0;
-                }
-                if (std::abs(bb) < 1e-8)
-                {
-                    bb = 0.0;
-                }
-                ofs_running << aa << "+" << bb << "i ";
-            }
-            ofs_running << std::endl;
-        }
+        std::cerr << "CRITICAL: cusolverMpGeqrf failed with Info: " << h_info << std::endl;
+        MPI_Abort(MPI_COMM_WORLD, -1);
     }
+
+    // 6. Apply Q^H to B (Numerator)
+    size_t ws_ormqr_dev = 0, ws_ormqr_host = 0;
+    cusolverMpOrmqr_bufferSize(cublas_res.cusolvermp_handle,
+                               CUBLAS_SIDE_LEFT,
+                               CUBLAS_OP_C,
+                               m_global,
+                               n_global,
+                               n_global,
+                               d_Den,
+                               1,
+                               1,
+                               desc_solver,
+                               d_tau,
+                               d_Num,
+                               1,
+                               1,
+                               desc_solver,
+                               CUDA_C_64F,
+                               &ws_ormqr_dev,
+                               &ws_ormqr_host);
+
+    void *d_work_ormqr = nullptr, *h_work_ormqr = nullptr;
+    cudaMallocAsync(&d_work_ormqr, ws_ormqr_dev, cublas_res.stream);
+    h_work_ormqr = malloc(ws_ormqr_host);
+
+    cusolverMpOrmqr(cublas_res.cusolvermp_handle,
+                    CUBLAS_SIDE_LEFT,
+                    CUBLAS_OP_C,
+                    m_global,
+                    n_global,
+                    n_global,
+                    d_Den,
+                    1,
+                    1,
+                    desc_solver,
+                    d_tau,
+                    d_Num,
+                    1,
+                    1,
+                    desc_solver,
+                    CUDA_C_64F,
+                    d_work_ormqr,
+                    ws_ormqr_dev,
+                    h_work_ormqr,
+                    ws_ormqr_host,
+                    d_info);
+
+    cudaFreeAsync(d_work_ormqr, cublas_res.stream);
+    free(h_work_ormqr);
+
+    // 7. Solve Triangular System (TRSM)
+    size_t ws_trsm_dev = 0, ws_trsm_host = 0;
+    std::complex<double> alpha_trsm = {1.0, 0.0};
+
+    cublasMpTrsm_bufferSize(cublas_res.cublasmp_handle,
+                            CUBLAS_SIDE_LEFT,
+                            CUBLAS_FILL_MODE_UPPER,
+                            CUBLAS_OP_N,
+                            CUBLAS_DIAG_NON_UNIT,
+                            m_global,
+                            n_global,
+                            &alpha_trsm,
+                            d_Den,
+                            1,
+                            1,
+                            desc_blas,
+                            d_Num,
+                            1,
+                            1,
+                            desc_blas,
+                            CUBLAS_COMPUTE_64F,
+                            &ws_trsm_dev,
+                            &ws_trsm_host);
+
+    void *d_work_trsm = nullptr, *h_work_trsm = nullptr;
+    cudaMallocAsync(&d_work_trsm, ws_trsm_dev, cublas_res.stream);
+    h_work_trsm = malloc(ws_trsm_host);
+
+    cublasMpTrsm(cublas_res.cublasmp_handle,
+                 CUBLAS_SIDE_LEFT,
+                 CUBLAS_FILL_MODE_UPPER,
+                 CUBLAS_OP_N,
+                 CUBLAS_DIAG_NON_UNIT,
+                 m_global,
+                 n_global,
+                 &alpha_trsm,
+                 d_Den,
+                 1,
+                 1,
+                 desc_blas,
+                 d_Num,
+                 1,
+                 1,
+                 desc_blas,
+                 CUBLAS_COMPUTE_64F,
+                 d_work_trsm,
+                 ws_trsm_dev,
+                 h_work_trsm,
+                 ws_trsm_host);
+
+    cudaFreeAsync(d_work_trsm, cublas_res.stream);
+    free(h_work_trsm);
+
+    // 8. Cleanup and Final Synchronization
+    cudaStreamSynchronize(cublas_res.stream);
+
+    cublasMpMatrixDescriptorDestroy(desc_blas);
+    cusolverMpDestroyMatrixDesc(desc_solver);
+
+    cudaFreeAsync(d_tau, cublas_res.stream);
+    cudaFreeAsync(d_info, cublas_res.stream);
+#endif // __CUBLASMP
 }
 
 template <typename Device>
diff --git a/source/source_lcao/module_rt/upsi.cpp b/source/source_lcao/module_rt/upsi.cpp
index 0982a77426..e0f1b38d75 100644
--- a/source/source_lcao/module_rt/upsi.cpp
+++ b/source/source_lcao/module_rt/upsi.cpp
@@ -2,7 +2,9 @@
 
 #include "source_base/module_container/ATen/kernels/blas.h"
 #include "source_base/module_external/scalapack_connector.h"
+#include "source_base/timer.h"
 
+#include <cassert>
 #include <complex>
 #include <iostream>
 
@@ -93,74 +95,126 @@ void upsi_tensor(const Parallel_Orbitals* pv,
                  const ct::Tensor& psi_k_laststep,
                  ct::Tensor& psi_k,
                  std::ofstream& ofs_running,
-                 const int print_matrix)
+                 const int print_matrix,
+                 CublasMpResources& cublas_res)
 {
-    ScalapackConnector::gemm('N',
-                             'N',
-                             nlocal,
-                             nband,
-                             nlocal,
-                             1.0,
-                             U_operator.data<std::complex<double>>(),
-                             1,
-                             1,
-                             pv->desc,
-                             psi_k_laststep.data<std::complex<double>>(),
-                             1,
-                             1,
-                             pv->desc_wfc,
-                             0.0,
-                             psi_k.data<std::complex<double>>(),
-                             1,
-                             1,
-                             pv->desc_wfc);
-
-    if (print_matrix)
+#ifdef __CUBLASMP
+    // 1. Resource validation
+    if (!cublas_res.is_initialized || cublas_res.cublasmp_grid == nullptr)
     {
-        ofs_running << std::endl;
-        ofs_running << " psi_k:" << std::endl;
-        for (int i = 0; i < pv->ncol_bands; i++)
-        {
-            const int in = i * pv->ncol;
-            for (int j = 0; j < pv->ncol; j++)
-            {
-                double aa = psi_k.data<std::complex<double>>()[in + j].real();
-                double bb = psi_k.data<std::complex<double>>()[in + j].imag();
-                if (std::abs(aa) < 1e-8)
-                {
-                    aa = 0.0;
-                }
-                if (std::abs(bb) < 1e-8)
-                {
-                    bb = 0.0;
-                }
-                ofs_running << aa << "+" << bb << "i ";
-            }
-            ofs_running << std::endl;
-        }
-        ofs_running << std::endl;
-        ofs_running << " psi_k_laststep:" << std::endl;
-        for (int i = 0; i < pv->ncol_bands; i++)
-        {
-            const int in = i * pv->ncol;
-            for (int j = 0; j < pv->ncol; j++)
-            {
-                double aa = psi_k_laststep.data<std::complex<double>>()[in + j].real();
-                double bb = psi_k_laststep.data<std::complex<double>>()[in + j].imag();
-                if (std::abs(aa) < 1e-8)
-                {
-                    aa = 0.0;
-                }
-                if (std::abs(bb) < 1e-8)
-                {
-                    bb = 0.0;
-                }
-                ofs_running << aa << "+" << bb << "i ";
-            }
-            ofs_running << std::endl;
-        }
-        ofs_running << std::endl;
+        return;
     }
+
+    assert(U_operator.device_type() == ct::DeviceType::GpuDevice);
+    assert(psi_k_laststep.device_type() == ct::DeviceType::GpuDevice);
+    assert(psi_k.device_type() == ct::DeviceType::GpuDevice);
+
+    // 2. Extract device pointers
+    void* d_U = static_cast<void*>(const_cast<std::complex<double>*>(U_operator.data<std::complex<double>>()));
+    void* d_Psi_old
+        = static_cast<void*>(const_cast<std::complex<double>*>(psi_k_laststep.data<std::complex<double>>()));
+    void* d_Psi_k = static_cast<void*>(psi_k.data<std::complex<double>>());
+
+    // 3. Create matrix descriptor for U operator (N x N)
+    int64_t m_u = pv->desc[2];
+    int64_t n_u = pv->desc[3];
+    int64_t mb_u = pv->desc[4];
+    int64_t nb_u = pv->desc[5];
+    int64_t lld_u = pv->desc[8];
+
+    cublasMpMatrixDescriptor_t desc_U;
+    cublasMpMatrixDescriptorCreate(m_u, n_u, mb_u, nb_u, 0, 0, lld_u, CUDA_C_64F, cublas_res.cublasmp_grid, &desc_U);
+
+    // 4. Create matrix descriptor for Psi (N x nband)
+    int64_t m_psi = pv->desc_wfc[2];
+    int64_t n_psi = pv->desc_wfc[3];
+    int64_t mb_psi = pv->desc_wfc[4];
+    int64_t nb_psi = pv->desc_wfc[5];
+    int64_t lld_psi = pv->desc_wfc[8];
+
+    cublasMpMatrixDescriptor_t desc_Psi;
+    cublasMpMatrixDescriptorCreate(m_psi,
+                                   n_psi,
+                                   mb_psi,
+                                   nb_psi,
+                                   0,
+                                   0,
+                                   lld_psi,
+                                   CUDA_C_64F,
+                                   cublas_res.cublasmp_grid,
+                                   &desc_Psi);
+
+    // 5. Query workspace size for GEMM: Psi_k = 1.0 * U * Psi_old + 0.0 * Psi_k
+    std::complex<double> alpha = {1.0, 0.0};
+    std::complex<double> beta = {0.0, 0.0};
+    size_t ws_gemm_dev = 0;
+    size_t ws_gemm_host = 0;
+
+    cublasMpGemm_bufferSize(cublas_res.cublasmp_handle,
+                            CUBLAS_OP_N,
+                            CUBLAS_OP_N,
+                            m_u,
+                            n_psi,
+                            n_u,
+                            &alpha,
+                            d_U,
+                            1,
+                            1,
+                            desc_U,
+                            d_Psi_old,
+                            1,
+                            1,
+                            desc_Psi,
+                            &beta,
+                            d_Psi_k,
+                            1,
+                            1,
+                            desc_Psi,
+                            CUBLAS_COMPUTE_64F,
+                            &ws_gemm_dev,
+                            &ws_gemm_host);
+
+    void* d_work = nullptr;
+    void* h_work = nullptr;
+
+    cudaMallocAsync(&d_work, ws_gemm_dev, cublas_res.stream);
+    h_work = malloc(ws_gemm_host);
+
+    // 6. Execute GEMM
+    cublasMpGemm(cublas_res.cublasmp_handle,
+                 CUBLAS_OP_N,
+                 CUBLAS_OP_N,
+                 m_u,
+                 n_psi,
+                 n_u,
+                 &alpha,
+                 d_U,
+                 1,
+                 1,
+                 desc_U,
+                 d_Psi_old,
+                 1,
+                 1,
+                 desc_Psi,
+                 &beta,
+                 d_Psi_k,
+                 1,
+                 1,
+                 desc_Psi,
+                 CUBLAS_COMPUTE_64F,
+                 d_work,
+                 ws_gemm_dev,
+                 h_work,
+                 ws_gemm_host);
+
+    // 7. Synchronize and clean up resources
+    cudaStreamSynchronize(cublas_res.stream);
+
+    cublasMpMatrixDescriptorDestroy(desc_U);
+    cublasMpMatrixDescriptorDestroy(desc_Psi);
+    cudaFreeAsync(d_work, cublas_res.stream);
+    free(h_work);
+#endif // __CUBLASMP
 }
 
 template <typename Device>
diff --git a/source/source_lcao/module_rt/upsi.h b/source/source_lcao/module_rt/upsi.h
index 6cf0976840..e5404a0622 100644
--- a/source/source_lcao/module_rt/upsi.h
+++ b/source/source_lcao/module_rt/upsi.h
@@ -9,6 +9,7 @@
 
 #include "source_base/module_container/ATen/core/tensor.h" // ct::Tensor
 #include "source_basis/module_ao/parallel_orbitals.h"
+#include "source_lcao/module_rt/kernels/cublasmp_context.h"
 
 #include <complex>
 
@@ -42,7 +43,8 @@ void upsi_tensor(const Parallel_Orbitals* pv,
                  const ct::Tensor& psi_k_laststep,
                  ct::Tensor& psi_k,
                  std::ofstream& ofs_running,
-                 const int print_matrix);
+                 const int print_matrix,
+                 CublasMpResources& cublas_res);
 
 template <typename Device>
 void upsi_tensor_lapack(const Parallel_Orbitals* pv,