diff --git a/CMakeLists.txt b/CMakeLists.txt index 0824dd762e..2da2648743 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -252,8 +252,8 @@ if(ENABLE_LCAO) add_compile_definitions(__PEXSI) set(CMAKE_CXX_STANDARD 14) endif() - if(NEW_GINT) - add_compile_definitions(__NEW_GINT) + if(OLD_GINT) + add_compile_definitions(__OLD_GINT) endif() else() set(ENABLE_MLALGO OFF) diff --git a/source/Makefile.Objects b/source/Makefile.Objects index c94a7f4f9e..85a5c14a94 100644 --- a/source/Makefile.Objects +++ b/source/Makefile.Objects @@ -61,6 +61,7 @@ VPATH=./src_global:\ ./module_hamilt_lcao/module_deltaspin:\ ./module_hamilt_lcao/hamilt_lcaodft/operator_lcao:\ ./module_hamilt_lcao/module_gint:\ +./module_hamilt_lcao/module_gint/temp_gint:\ ./module_relax:\ ./source_hamilt/module_vdw:\ ./module_io:\ @@ -273,13 +274,13 @@ OBJS_ESOLVER_LCAO=esolver_ks_lcao.o\ lcao_others.o\ esolver_dm2rho.o\ -OBJS_GINT=gint.o\ +OBJS_GINT=gint_old.o\ gint_gamma_env.o\ gint_gamma_vl.o\ - gint_fvl.o\ - gint_rho.o\ - gint_tau.o\ - gint_vl.o\ + gint_fvl_old.o\ + gint_rho_old.o\ + gint_tau_old.o\ + gint_vl_old.o\ gint_k_env.o\ gint_k_sparse1.o\ gint_k_pvpr.o\ @@ -298,6 +299,30 @@ OBJS_GINT=gint.o\ cal_ddpsir_ylm.o\ mult_psi_dmr.o\ init_orb.o\ + batch_biggrid.o\ + big_grid.o\ + biggrid_info.o\ + divide_info.o\ + gint_atom.o\ + gint_common.o\ + gint_dvlocal.o\ + gint_env_gamma.o\ + gint_env_k.o\ + gint_fvl_meta.o\ + gint_fvl.o\ + gint_info.o\ + gint_interface.o\ + gint_rho.o\ + gint_tau.o\ + gint_vl_metagga_nspin4.o\ + gint_vl_metagga.o\ + gint_vl_nspin4.o\ + gint_vl.o\ + gint.o\ + localcell_info.o\ + phi_operator.o\ + set_ddphi.o\ + unitcell_info.o\ OBJS_HAMILT=hamilt_pw.o\ hamilt_sdft_pw.o\ diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.cpp index b64f046e3a..6253b1dcf4 100644 --- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.cpp +++ 
b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.cpp @@ -68,7 +68,7 @@ void Veff>::contributeHR() double* vr_eff1 = this->pot->get_effective_v(this->current_spin); double* vofk_eff1 = this->pot->get_effective_vofk(this->current_spin); -#ifndef __NEW_GINT +#ifdef __OLD_GINT if(XC_Functional::get_ked_flag()) { Gint_inout inout(vr_eff1, vofk_eff1, Gint_Tools::job_type::vlocal_meta); @@ -113,7 +113,7 @@ void Veff, double>>::contributeHR() double* vr_eff1 = this->pot->get_effective_v(this->current_spin); double* vofk_eff1 = this->pot->get_effective_vofk(this->current_spin); -#ifndef __NEW_GINT +#ifdef __OLD_GINT // if you change the place of the following code, // rememeber to delete the #include if(XC_Functional::get_ked_flag()) @@ -155,7 +155,7 @@ void Veff, std::complex>>::contributeH ModuleBase::TITLE("Veff", "contributeHR"); ModuleBase::timer::tick("Veff", "contributeHR"); -#ifndef __NEW_GINT +#ifdef __OLD_GINT double* vr_eff1 = nullptr; double* vofk_eff1 = nullptr; for (int is = 0; is < 4; is++) @@ -187,19 +187,15 @@ void Veff, std::complex>>::contributeH if(XC_Functional::get_ked_flag()) { vofk_eff[is] = this->pot->get_effective_vofk(is); - if(is == 3) - { - ModuleGint::cal_gint_vl_metagga(vr_eff, vofk_eff, this->hR); - } - } - else - { - if(is == 3) - { - ModuleGint::cal_gint_vl(vr_eff, this->hR); - } } } + if(XC_Functional::get_ked_flag()) + { + ModuleGint::cal_gint_vl_metagga(vr_eff, vofk_eff, this->hR); + } else + { + ModuleGint::cal_gint_vl(vr_eff, this->hR); + } #endif ModuleBase::timer::tick("Veff", "contributeHR"); diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.h b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.h index 696f094048..8f456695ce 100644 --- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.h +++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/veff_lcao.h @@ -50,7 +50,9 @@ class Veff> : public OperatorLCAO this->cal_type = calculation_type::lcao_gint; 
this->initialize_HR(ucell_in, GridD_in); +#ifdef __OLD_GINT GK_in->initialize_pvpR(*ucell_in, GridD_in, nspin); +#endif } /** * @brief Construct a new Veff object for Gamma-only calculation @@ -69,8 +71,9 @@ class Veff> : public OperatorLCAO { this->cal_type = calculation_type::lcao_gint; this->initialize_HR(ucell_in, GridD_in); - +#ifdef __OLD_GINT GG_in->initialize_pvpR(*ucell_in, GridD_in, nspin); +#endif } ~Veff>(){}; diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/pulay_force_stress_gint.hpp b/source/module_hamilt_lcao/hamilt_lcaodft/pulay_force_stress_gint.hpp index 6d2e3326aa..aa59ad87d4 100644 --- a/source/module_hamilt_lcao/hamilt_lcaodft/pulay_force_stress_gint.hpp +++ b/source/module_hamilt_lcao/hamilt_lcaodft/pulay_force_stress_gint.hpp @@ -20,7 +20,7 @@ namespace PulayForceStress { const int nspin = PARAM.inp.nspin; -#ifndef __NEW_GINT +#ifdef __OLD_GINT if (set_dmr_gint) { gint.transfer_DM2DtoGrid(dm.get_DMR_vector()); } // 2d block to grid for (int is = 0; is < nspin; ++is) { diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp index fddf2e584d..381c61ec87 100644 --- a/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp +++ b/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.cpp @@ -2,6 +2,7 @@ #include "module_parameter/parameter.h" #include "module_hamilt_lcao/hamilt_lcaodft/LCAO_domain.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_interface.h" #include void sparse_format::cal_dS(const UnitCell& ucell, @@ -49,7 +50,6 @@ delete[] fsr_dh.DHloc_fixedR_y; delete[] fsr_dh.DHloc_fixedR_z; return; } - void sparse_format::cal_dH(const UnitCell& ucell, const Parallel_Orbitals& pv, LCAO_HS_Arrays& HS_Arrays, @@ -58,6 +58,7 @@ void sparse_format::cal_dH(const UnitCell& ucell, const LCAO_Orbitals& orb, const int& current_spin, const double& sparse_thr, + const ModuleBase::matrix& v_eff, Gint_k& gint_k) { ModuleBase::TITLE("sparse_format", "cal_dH"); @@ -106,8 +107,38 @@ 
void sparse_format::cal_dH(const UnitCell& ucell, delete[] fsr_dh.DHloc_fixedR_y; delete[] fsr_dh.DHloc_fixedR_z; - gint_k.cal_dvlocal_R_sparseMatrix(current_spin, sparse_thr, HS_Arrays, &pv, ucell, grid); - + if(PARAM.inp.nspin==2) + { +#ifdef __OLD_GINT + gint_k.allocate_pvdpR(); + // note: some MPI process will not have grids when MPI cores are too + // many, v_eff in these processes are empty + const double* vr_eff1 + = v_eff.nc * v_eff.nr > 0 ? &(v_eff(current_spin, 0)) : nullptr; + + if (!PARAM.globalv.gamma_only_local) + { + if (PARAM.inp.vl_in_h) + { + Gint_inout inout(vr_eff1, + current_spin, + Gint_Tools::job_type::dvlocal); + gint_k.cal_gint(&inout); + } + } + gint_k.cal_dvlocal_R_sparseMatrix(current_spin, sparse_thr, HS_Arrays, &pv, ucell, grid); + gint_k.destroy_pvdpR(); +#else + const double* vr_eff1 + = v_eff.nc * v_eff.nr > 0 ? &(v_eff(current_spin, 0)) : nullptr; + if (!PARAM.globalv.gamma_only_local) + { + ModuleGint::cal_dvlocal_R_sparseMatrix( + PARAM.inp.nspin, PARAM.globalv.npol, current_spin, PARAM.globalv.nlocal, + sparse_thr, vr_eff1, pv, ucell, grid, HS_Arrays); + } +#endif + } return; } diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.h b/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.h index 1ec555f4fa..a477a29648 100644 --- a/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.h +++ b/source/module_hamilt_lcao/hamilt_lcaodft/spar_dh.h @@ -19,6 +19,7 @@ void cal_dH(const UnitCell& ucell, const LCAO_Orbitals& orb, const int& current_spin, const double& sparse_thr, + const ModuleBase::matrix& v_eff, Gint_k& gint_k); // calculated the derivative of the overlap matrix: diff --git a/source/module_hamilt_lcao/module_gint/CMakeLists.txt b/source/module_hamilt_lcao/module_gint/CMakeLists.txt index 7b43114adb..0505957b9c 100644 --- a/source/module_hamilt_lcao/module_gint/CMakeLists.txt +++ b/source/module_hamilt_lcao/module_gint/CMakeLists.txt @@ -2,13 +2,13 @@ if(ENABLE_LCAO) list(APPEND objects - gint.cpp + gint_old.cpp 
gint_gamma_env.cpp gint_gamma_vl.cpp - gint_fvl.cpp - gint_rho.cpp - gint_tau.cpp - gint_vl.cpp + gint_fvl_old.cpp + gint_rho_old.cpp + gint_tau_old.cpp + gint_vl_old.cpp gint_k_env.cpp gint_k_sparse1.cpp gint_k_pvpr.cpp @@ -29,7 +29,7 @@ list(APPEND objects init_orb.cpp ) -if(NEW_GINT) +if(NOT DEFINED OLD_GINT) list(APPEND objects temp_gint/biggrid_info.cpp temp_gint/big_grid.cpp @@ -45,6 +45,9 @@ if(NEW_GINT) temp_gint/gint_tau.cpp temp_gint/gint_fvl.cpp temp_gint/gint_fvl_meta.cpp + temp_gint/gint_env_gamma.cpp + temp_gint/gint_env_k.cpp + temp_gint/gint_dvlocal.cpp temp_gint/localcell_info.cpp temp_gint/phi_operator.cpp temp_gint/set_ddphi.cpp @@ -52,6 +55,24 @@ if(NEW_GINT) temp_gint/gint_common.cpp temp_gint/gint_interface.cpp ) + if(USE_CUDA) + list(APPEND objects + temp_gint/kernel/gint_gpu_vars.cpp + temp_gint/kernel/phi_operator_gpu.cu + temp_gint/kernel/phi_operator_kernel.cu + temp_gint/kernel/set_const_mem.cu + temp_gint/batch_biggrid.cpp + temp_gint/gint_vl_gpu.cpp + temp_gint/gint_rho_gpu.cpp + temp_gint/gint_fvl_gpu.cpp + temp_gint/gint_vl_metagga_gpu.cpp + temp_gint/gint_vl_nspin4_gpu.cpp + temp_gint/gint_vl_metagga_nspin4_gpu.cpp + temp_gint/gint_tau_gpu.cpp + temp_gint/gint_fvl_meta_gpu.cpp + temp_gint/kernel/dgemm_vbatch.cu + ) + endif() endif() if(USE_CUDA) diff --git a/source/module_hamilt_lcao/module_gint/gint_fvl.cpp b/source/module_hamilt_lcao/module_gint/gint_fvl_old.cpp similarity index 100% rename from source/module_hamilt_lcao/module_gint/gint_fvl.cpp rename to source/module_hamilt_lcao/module_gint/gint_fvl_old.cpp diff --git a/source/module_hamilt_lcao/module_gint/gint_k_sparse1.cpp b/source/module_hamilt_lcao/module_gint/gint_k_sparse1.cpp index 8cd610bde7..4cd9cd7dbb 100644 --- a/source/module_hamilt_lcao/module_gint/gint_k_sparse1.cpp +++ b/source/module_hamilt_lcao/module_gint/gint_k_sparse1.cpp @@ -337,7 +337,6 @@ void Gint_k::cal_dvlocal_R_sparseMatrix(const int& current_spin, std::map, std::map>>> pvdpRz_soc_sparseMatrix; - int 
lgd = 0; double temp_value_double; std::complex temp_value_complex; diff --git a/source/module_hamilt_lcao/module_gint/gint.cpp b/source/module_hamilt_lcao/module_gint/gint_old.cpp similarity index 100% rename from source/module_hamilt_lcao/module_gint/gint.cpp rename to source/module_hamilt_lcao/module_gint/gint_old.cpp diff --git a/source/module_hamilt_lcao/module_gint/gint_rho.cpp b/source/module_hamilt_lcao/module_gint/gint_rho_old.cpp similarity index 100% rename from source/module_hamilt_lcao/module_gint/gint_rho.cpp rename to source/module_hamilt_lcao/module_gint/gint_rho_old.cpp diff --git a/source/module_hamilt_lcao/module_gint/gint_tau.cpp b/source/module_hamilt_lcao/module_gint/gint_tau_old.cpp similarity index 100% rename from source/module_hamilt_lcao/module_gint/gint_tau.cpp rename to source/module_hamilt_lcao/module_gint/gint_tau_old.cpp diff --git a/source/module_hamilt_lcao/module_gint/gint_vl.cpp b/source/module_hamilt_lcao/module_gint/gint_vl_old.cpp similarity index 100% rename from source/module_hamilt_lcao/module_gint/gint_vl.cpp rename to source/module_hamilt_lcao/module_gint/gint_vl_old.cpp diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu index 32dfe42b24..c9bf122628 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu @@ -4,27 +4,7 @@ #include "cuda_tools.cuh" -cudaError_t check(cudaError_t result, const char *const func, const char *const file, const int line) -{ - if (result != cudaSuccess) - { - fprintf(stderr, "CUDA Runtime Error at %s:%d code=%s \"%s\" \n", file, line, cudaGetErrorString(result), func); - exit(EXIT_FAILURE); - } - return result; -} -cudaError_t __checkCudaLastError(const char *file, const int line) -{ - cudaError_t result = cudaGetLastError(); - if (result != cudaSuccess) - { - fprintf(stderr, "%s(%i) : getLastCudaError():%s\n", 
file, line, cudaGetErrorString(result)); - assert(result == cudaSuccess); - } - return result; -} - -void dump_cuda_array_to_file(double* cuda_array, +void dump_cuda_array_to_file(const double* cuda_array, int width, int hight, const std::string& filename) diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh index 803e76ff22..dab697df8c 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh @@ -9,21 +9,47 @@ #include #include -#define checkCuda(val) check(val, #val, __FILE__, __LINE__) -#define checkCudaLastError() __checkCudaLastError(__FILE__, __LINE__) +#define checkCuda(val) check((val), #val, __FILE__, __LINE__) +#define checkCudaLastError() __getLastCudaError(__FILE__, __LINE__) -cudaError_t check(cudaError_t result, const char *const func, const char *const file, const int line); -cudaError_t __checkCudaLastError(const char *file, const int line); +inline void check(cudaError_t result, char const *const func, const char *const file, + int const line) { + if (result) { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, + static_cast(result), cudaGetErrorString(result), func); + exit(EXIT_FAILURE); + } +} + +inline void __getLastCudaError(const char *file, + const int line) +{ + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) { + fprintf(stderr, + "%s(%i) : getLastCudaError() CUDA error :" + " (%d) %s.\n", + file, line, static_cast(err), + cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} -void dump_cuda_array_to_file(double* cuda_array, +static inline int ceildiv(int x, int y) +{ + return (x + y - 1) / y; +} + +void dump_cuda_array_to_file(const double* cuda_array, int width, int hight, const std::string& filename); -inline int ceil_div(int a, int b) -{ - return (a + b - 1) / b; -} +// inline int ceil_div(int a, int b) +// { 
+// return (a + b - 1) / b; +// } /* * @brief: A simple wrapper for cudaMalloc and cudaFree, sync and async CUDA diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh index b45805ec87..230e5a6f44 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh @@ -320,11 +320,6 @@ static __global__ void vbatched_gemm_kernel(int* M, alpha_tmp); } -static inline int ceildiv(int x, int y) -{ - return (x + y - 1) / y; -} - /** * Performs a batched matrix multiplication using the vbatched_gemm_impl * function. diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.cpp new file mode 100644 index 0000000000..8372506e46 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.cpp @@ -0,0 +1,34 @@ +#include "batch_biggrid.h" + +namespace ModuleGint +{ + +int BatchBigGrid::max_batch_size_ = 0; +int BatchBigGrid::max_atoms_num_ = 0; +int BatchBigGrid::max_phi_len_ = 0; +int BatchBigGrid::max_atom_pairs_num_ = 0; + +BatchBigGrid::BatchBigGrid(std::vector> biggrids) +{ + biggrids_ = biggrids; + max_batch_size_ = std::max(max_batch_size_, (int)biggrids_.size()); + int atom_pairs_num = 0; + for(const auto& biggrid : biggrids_) + { + for(const auto& atom: biggrid->get_atoms()) + { + max_nw_ = std::max(max_nw_, atom->get_nw()); + } + max_atoms_num_per_bgrid_ = std::max(max_atoms_num_per_bgrid_, biggrid->get_atoms_num()); + atoms_num_ += biggrid->get_atoms_num(); + atom_pairs_num += std::pow(biggrid->get_atoms_num(), 2); + phi_len_ += biggrid->get_phi_len() * biggrid->get_mgrids_num(); + } + max_atoms_num_ = std::max(max_atoms_num_, atoms_num_); + max_phi_len_ = std::max(max_phi_len_, phi_len_); + max_atom_pairs_num_ = std::max(max_atom_pairs_num_, atom_pairs_num); +} 
+ + + +} diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h b/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h new file mode 100644 index 0000000000..d4de77d1db --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h @@ -0,0 +1,50 @@ +#pragma once +#include +#include +#include "big_grid.h" + +namespace ModuleGint +{ + +class BatchBigGrid +{ + public: + BatchBigGrid(std::vector> biggrids); + + const std::vector>& get_bgrids() { return biggrids_; } + + int get_batch_size() const { return biggrids_.size(); } + int get_atoms_num() const { return atoms_num_; } + int get_phi_len() const { return phi_len_;} + int get_max_atoms_num_per_bgrid() const { return max_atoms_num_per_bgrid_; } + bool empty() {return atoms_num_ == 0; } + static int get_max_batch_size() { return max_batch_size_; } + static int get_max_atoms_num() { return max_atoms_num_; } + static int get_max_phi_len() { return max_phi_len_; } + static int get_max_atom_pairs_num() { return max_atom_pairs_num_; } + static std::shared_ptr get_bgrid_info() { return BigGrid::get_bgrid_info(); } + + private: + std::vector> biggrids_; + + // the max nw of an atom + int max_nw_ = 0; + + int phi_len_ = 0; + // number of atoms in the batch + int atoms_num_ = 0; + + // the max number of atoms of a single biggrid + int max_atoms_num_per_bgrid_ = 0; + + // the max number of biggrids of a biggrids batch + static int max_batch_size_; + // the max number of total atoms of a biggrids batch + static int max_atoms_num_; + // the max number of total wavefunctions of a biggrids batch + static int max_phi_len_; + // the max number of atom pairs of a biggrids batch + static int max_atom_pairs_num_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.cpp index d972cd90bb..e20a0fb50a 100644 --- 
a/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.cpp @@ -13,7 +13,7 @@ void BigGrid::add_atom(const GintAtom* atom) atoms_.push_back(atom); } -int BigGrid::get_mgrid_phi_len() const +int BigGrid::get_phi_len() const { int len = 0; for(const auto& atom : atoms_) @@ -73,6 +73,11 @@ void BigGrid::set_mgrids_local_idx(std::vector& mgrids_idx) const } } +void BigGrid::set_atom_relative_coords(const GintAtom* atom, std::vector& atom_coord) const +{ + set_atom_relative_coords(atom->get_bgrid_idx(), atom->get_tau_in_bgrid(), atom_coord); +} + void BigGrid::set_atom_relative_coords(const Vec3i bgrid_idx, const Vec3d tau_in_bgrid, std::vector& atom_coord) const { Vec3i this_bgrid_idx = localcell_info_->get_bgrid_global_idx_3D(idx_); @@ -84,17 +89,18 @@ void BigGrid::set_atom_relative_coords(const Vec3i bgrid_idx, const Vec3d tau_in atom_coord.resize(biggrid_info_->get_mgrids_num()); for(int im = 0; im < biggrid_info_->get_mgrids_num(); ++im) { - const Vec3d& mcell_coord = biggrid_info_->get_mgrid_coord(im); - atom_coord[im] = mcell_coord - bgrid_relative_coord; + const Vec3d& mgrid_coord = biggrid_info_->get_mgrid_coord(im); + atom_coord[im] = mgrid_coord - bgrid_relative_coord; } } - -void BigGrid::set_atom_relative_coords(const GintAtom* atom, std::vector& atom_coord) const +Vec3d BigGrid::get_bgrid_atom_rcoord(const GintAtom* atom) const { - set_atom_relative_coords(atom->get_bgrid_idx(), atom->get_tau_in_bgrid(), atom_coord); + Vec3i this_bgrid_idx = localcell_info_->get_bgrid_global_idx_3D(idx_); + return unitcell_info_->get_relative_coord(atom->get_bgrid_idx(), this_bgrid_idx) + atom->get_tau_in_bgrid(); } + bool BigGrid::is_atom_on_bgrid(const GintAtom* atom) const { std::vector coords; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.h b/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.h index c1d5596e13..55bed7a251 100644 --- 
a/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/big_grid.h @@ -17,30 +17,30 @@ class BigGrid // constructor BigGrid(int idx); - static void init_localcell_info(std::shared_ptr localcell_info) { localcell_info_ = localcell_info; }; - static void init_unitcell_info(std::shared_ptr unitcell_info) { unitcell_info_ = unitcell_info; }; - static void init_bgrid_info(std::shared_ptr biggrid_info) { biggrid_info_ = biggrid_info; }; + static void init_localcell_info(std::shared_ptr localcell_info) { localcell_info_ = localcell_info; } + static void init_unitcell_info(std::shared_ptr unitcell_info) { unitcell_info_ = unitcell_info; } + static void init_bgrid_info(std::shared_ptr biggrid_info) { biggrid_info_ = biggrid_info; } // getter functions - int get_idx() const { return idx_; }; - std::shared_ptr get_localcell_info() const { return localcell_info_; }; - std::shared_ptr get_unitcell_info() const {return unitcell_info_; }; - std::shared_ptr get_bgrid_info() const { return biggrid_info_; }; - const std::vector& get_atoms() const { return atoms_; }; - const GintAtom* get_atom(int i) const { return atoms_[i]; }; + int get_idx() const { return idx_; } + static std::shared_ptr get_localcell_info() { return localcell_info_; } + static std::shared_ptr get_unitcell_info() { return unitcell_info_; } + static std::shared_ptr get_bgrid_info() { return biggrid_info_; } + const std::vector& get_atoms() const { return atoms_; } + const GintAtom* get_atom(int i) const { return atoms_[i]; } // get the number of meshgrids in the big grid - int get_mgrids_num() const { return biggrid_info_->get_mgrids_num(); }; + int get_mgrids_num() const { return biggrid_info_->get_mgrids_num(); } // get the number of atoms that can affect the big grid - int get_atoms_num() const { return atoms_.size(); }; + int get_atoms_num() const { return atoms_.size(); } // add an atom to the big grid void add_atom(const GintAtom* atom); // get the 
total number of phi of a meshgrid // return: (\sum_{i=0}^{atoms_->size()} atoms_[i]->nw) - int get_mgrid_phi_len() const; + int get_phi_len() const; // set the start index of the phi of each atom // return: vector[i] = \sum_{j=0}^{i-1} atoms_[j]->nw @@ -55,6 +55,9 @@ class BigGrid // set the 1D index of the meshgrids in the local cell void set_mgrids_local_idx(std::vector& mgrids_idx) const; + // a wrapper function to get the relative coordinates of the atom and the meshgrids + void set_atom_relative_coords(const GintAtom* atom, std::vector& atom_coord) const; + /** * @brief Set the coordinates of the meshgrids of the big grid relative to an atom * @@ -64,8 +67,8 @@ class BigGrid */ void set_atom_relative_coords(const Vec3i bgrid_idx, const Vec3d tau_in_bgrid, std::vector& atom_coord) const; - // a wrapper function to get the relative coordinates of the atom and the meshgrids - void set_atom_relative_coords(const GintAtom* atom, std::vector& atom_coord) const; + // get the relative coords of the atom and the biggrid (used in gpu code) + Vec3d get_bgrid_atom_rcoord(const GintAtom* atom) const; // if the atom affects the big grid, return true, otherwise false // note when we say an atom affects a big grid, it does not mean that the atom affects all the meshgrid on the big grid, diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/biggrid_info.h b/source/module_hamilt_lcao/module_gint/temp_gint/biggrid_info.h index f8bcb79665..c017f87a3d 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/biggrid_info.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/biggrid_info.h @@ -22,36 +22,36 @@ class BigGridInfo Vec3d biggrid_vec3, int nmx, int nmy, int nmz); - Vec3d get_cartesian_coord(const Vec3d& index_3d) const { return index_3d * biggrid_latvec0_; }; - Vec3d get_cartesian_coord(const Vec3i& index_3d) const { return index_3d * biggrid_latvec0_; }; - const Vec3d get_direct_coord(const Vec3d& cart_coord) const { return cart_coord * biggrid_GT_; }; + 
Vec3d get_cartesian_coord(const Vec3d& index_3d) const { return index_3d * biggrid_latvec0_; } + Vec3d get_cartesian_coord(const Vec3i& index_3d) const { return index_3d * biggrid_latvec0_; } + Vec3d get_direct_coord(const Vec3d& cart_coord) const { return cart_coord * biggrid_GT_; } // Return the maximum number of big grids that can fit inside a sphere of radius r, // along the three lattice vector directions. Vec3i max_ext_bgrid_num(double r) const; // get number of meshgrids along three lattice directions - int get_nmx() const { return nmx_; }; - int get_nmy() const { return nmy_; }; - int get_nmz() const { return nmz_; }; - int get_mgrids_num() const { return nmxyz_; }; + int get_nmx() const { return nmx_; } + int get_nmy() const { return nmy_; } + int get_nmz() const { return nmz_; } + int get_mgrids_num() const { return nmxyz_; } - const std::vector& get_mgrids_coord() const { return meshgrid_coords_; }; - const Vec3d& get_mgrid_coord(int index_1d) const { return meshgrid_coords_[index_1d]; }; + const std::vector& get_mgrids_coord() const { return meshgrid_coords_; } + const Vec3d& get_mgrid_coord(int index_1d) const { return meshgrid_coords_[index_1d]; } - std::shared_ptr get_mgrid_info() const { return meshgrid_info_; }; + std::shared_ptr get_mgrid_info() const { return meshgrid_info_; } // get the 3D index of a meshgrid in the big grid from the 1D index Vec3i mgrid_idx_1Dto3D(int index_1d) const { return index1Dto3D(index_1d, nmx_, nmy_, nmz_); - }; + } // get the 1D index of a meshgrid in the big grid from the 3D index int mgrid_idx_3Dto1D(const Vec3i index_3d) const { return index3Dto1D(index_3d.x, index_3d.y, index_3d.z, nmx_, nmy_, nmz_); - }; + } private: // basis vectors of the big grid diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint.cpp index e766c46d9f..d7de110f24 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint.cpp +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint.cpp @@ -3,6 +3,6 @@ namespace ModuleGint { -std::shared_ptr Gint::gint_info_ = nullptr; +GintInfo* Gint::gint_info_ = nullptr; } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint.h index a14f014a6c..1255bae971 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint.h @@ -12,17 +12,15 @@ class Gint Gint() = default; virtual ~Gint() = default; - virtual void cal_gint() = 0; - // note that gint_info_ is a static member variable // it is shared by all instances of Gint - static void init_gint_info(std::shared_ptr gint_info) + static void set_gint_info(GintInfo* gint_info) { gint_info_ = gint_info; } protected: - static std::shared_ptr gint_info_; + static GintInfo* gint_info_; }; } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.cpp index 6ae3735ec6..7121694244 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.cpp @@ -1,10 +1,38 @@ #include "source_base/ylm.h" #include "source_base/array_pool.h" #include "gint_atom.h" +#include "source_cell/unitcell.h" #include "gint_helper.h" namespace ModuleGint { +GintAtom::GintAtom( + const Atom* atom, + int it, int ia, int iat, + Vec3i biggrid_idx, + Vec3i unitcell_idx, + Vec3d tau_in_biggrid, + const Numerical_Orbital* orb, + const UnitCell* ucell) +: atom_(atom), it_(it), ia_(ia), iat_(iat), biggrid_idx_(biggrid_idx), + unitcell_idx_(unitcell_idx), tau_in_biggrid_(tau_in_biggrid), + orb_(orb), ucell_(ucell) +{ + p_psi_uniform_.resize(atom_->nw); + p_dpsi_uniform_.resize(atom_->nw); + p_ddpsi_uniform_.resize(atom_->nw); + for (int iw=0; iw < atom_->nw; ++iw) + { + if ( atom_->iw2_new[iw] ) + { + int l = 
atom_->iw2l[iw]; + int n = atom_->iw2n[iw]; + p_psi_uniform_[iw] = orb_->PhiLN(l, n).psi_uniform.data(); + p_dpsi_uniform_[iw] = orb_->PhiLN(l, n).dpsi_uniform.data(); + p_ddpsi_uniform_[iw] = orb_->PhiLN(l, n).ddpsi_uniform.data(); + } + } +} template void GintAtom::set_phi(const std::vector& coords, const int stride, T* phi) const @@ -14,20 +42,6 @@ void GintAtom::set_phi(const std::vector& coords, const int stride, T* ph // orb_ does not have the member variable dr_uniform const double dr_uniform = orb_->PhiLN(0, 0).dr_uniform; - // store the pointer to reduce repeated address fetching - std::vector p_psi_uniform(atom_->nw); - std::vector p_dpsi_uniform(atom_->nw); - for(int iw = 0; iw < atom_->nw; iw++) - { - if(atom_->iw2_new[iw]) - { - int l = atom_->iw2l[iw]; - int n = atom_->iw2n[iw]; - p_psi_uniform[iw] = orb_->PhiLN(l, n).psi_uniform.data(); - p_dpsi_uniform[iw] = orb_->PhiLN(l, n).dpsi_uniform.data(); - } - } - // store the spherical harmonics // it's outside the loop to reduce the vector allocation overhead std::vector ylma; @@ -35,7 +49,6 @@ void GintAtom::set_phi(const std::vector& coords, const int stride, T* ph for(int im = 0; im < num_mgrids; im++) { const Vec3d& coord = coords[im]; - // 1e-9 is to avoid division by zero const double dist = coord.norm() < 1e-9 ? 
1e-9 : coord.norm(); if(dist > orb_->getRcut()) @@ -74,8 +87,8 @@ void GintAtom::set_phi(const std::vector& coords, const int stride, T* ph { if(atom_->iw2_new[iw]) { - auto psi_uniform = p_psi_uniform[iw]; - auto dpsi_uniform = p_dpsi_uniform[iw]; + auto psi_uniform = p_psi_uniform_[iw]; + auto dpsi_uniform = p_dpsi_uniform_[iw]; psi = c1 * psi_uniform[ip] + c2 * dpsi_uniform[ip] + c3 * psi_uniform[ip + 1] + c4 * dpsi_uniform[ip + 1]; } @@ -94,22 +107,6 @@ void GintAtom::set_phi_dphi( // orb_ does not have the member variable dr_uniform const double dr_uniform = orb_->PhiLN(0, 0).dr_uniform; - - // store the pointer to reduce repeated address fetching - std::vector p_psi_uniform(atom_->nw); - std::vector p_dpsi_uniform(atom_->nw); - std::vector phi_nr_uniform(atom_->nw); - for (int iw=0; iw< atom_->nw; ++iw) - { - if ( atom_->iw2_new[iw] ) - { - int l = atom_->iw2l[iw]; - int n = atom_->iw2n[iw]; - p_psi_uniform[iw] = orb_->PhiLN(l, n).psi_uniform.data(); - p_dpsi_uniform[iw] = orb_->PhiLN(l, n).dpsi_uniform.data(); - phi_nr_uniform[iw] = orb_->PhiLN(l, n).nr_uniform; - } - } std::vector rly(std::pow(atom_->nwl + 1, 2)); // TODO: replace array_pool with std::vector @@ -157,24 +154,16 @@ void GintAtom::set_phi_dphi( // function from interpolation method. 
if(atom_->iw2_new[iw]) { - auto psi_uniform = p_psi_uniform[iw]; - auto dpsi_uniform = p_dpsi_uniform[iw]; - - if(ip >= phi_nr_uniform[iw] - 4) - { - tmp = dtmp = 0.0; - } - else - { - // use Polynomia Interpolation method to get the - // wave functions - - tmp = x12 * (psi_uniform[ip] * x3 + psi_uniform[ip + 3] * x0) - + x03 * (psi_uniform[ip + 1] * x2 - psi_uniform[ip + 2] * x1); - - dtmp = x12 * (dpsi_uniform[ip] * x3 + dpsi_uniform[ip + 3] * x0) - + x03 * (dpsi_uniform[ip + 1] * x2 - dpsi_uniform[ip + 2] * x1); - } + auto psi_uniform = p_psi_uniform_[iw]; + auto dpsi_uniform = p_dpsi_uniform_[iw]; + // use Polynomia Interpolation method to get the + // wave functions + + tmp = x12 * (psi_uniform[ip] * x3 + psi_uniform[ip + 3] * x0) + + x03 * (psi_uniform[ip + 1] * x2 - psi_uniform[ip + 2] * x1); + + dtmp = x12 * (dpsi_uniform[ip] * x3 + dpsi_uniform[ip + 3] * x0) + + x03 * (dpsi_uniform[ip + 1] * x2 - dpsi_uniform[ip + 2] * x1); } // new l is used. // get the 'l' of this localized wave function diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.h index b1da5d586a..aff8aae5b9 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_atom.h @@ -13,28 +13,26 @@ class GintAtom // constructor GintAtom( const Atom* atom, - int ia, - int iat, + int it, int ia, int iat, Vec3i biggrid_idx, Vec3i unitcell_idx, Vec3d tau_in_biggrid, - const Numerical_Orbital* orb) - : atom_(atom), ia_(ia), iat_(iat), biggrid_idx_(biggrid_idx), - unitcell_idx_(unitcell_idx), tau_in_biggrid_(tau_in_biggrid), - orb_(orb) {}; + const Numerical_Orbital* orb, + const UnitCell* ucell); // getter functions - const Atom* get_atom() const { return atom_; }; - int get_ia() const { return ia_; }; - int get_iat() const { return iat_; }; - const Vec3i& get_bgrid_idx() const { return biggrid_idx_; }; - const Vec3i& get_unitcell_idx() const { return 
unitcell_idx_; }; - const Vec3i& get_R() const { return unitcell_idx_; }; - const Vec3d& get_tau_in_bgrid() const { return tau_in_biggrid_; }; - const Numerical_Orbital* get_orb() const { return orb_; }; - - int get_nw() const { return atom_->nw; }; - double get_rcut() const { return orb_->getRcut(); }; + const Atom* get_atom() const { return atom_; } + int get_ia() const { return ia_; } + int get_iat() const { return iat_; } + int get_start_iw() const { return ucell_->itiaiw2iwt(it_, ia_, 0); } // get the start index of global atomic orbitals + const Vec3i& get_bgrid_idx() const { return biggrid_idx_; } + const Vec3i& get_unitcell_idx() const { return unitcell_idx_; } + const Vec3i& get_R() const { return unitcell_idx_; } + const Vec3d& get_tau_in_bgrid() const { return tau_in_biggrid_; } + const Numerical_Orbital* get_orb() const { return orb_; } + + int get_nw() const { return atom_->nw; } + double get_rcut() const { return orb_->getRcut(); } /** * @brief Get the wave function values of the atom at a meshgrid. 
@@ -91,13 +89,16 @@ class GintAtom private: // the atom object const Atom* atom_; - - // the global index of the atom - int iat_; + + // the global index of the atom type + int it_; // the global index of the atom among the same type of atoms int ia_; + // the global index of the atom + int iat_; + // the index of big grid which contains this atom Vec3i biggrid_idx_; @@ -109,10 +110,13 @@ class GintAtom Vec3d tau_in_biggrid_; // the numerical orbitals of this atom - // In fact, I think the Numerical_Orbital class - // should be a member of the Atom class, not the GintAtom class const Numerical_Orbital* orb_; + const UnitCell* ucell_; + + std::vector p_psi_uniform_; + std::vector p_dpsi_uniform_; + std::vector p_ddpsi_uniform_; }; } // namespace ModuleGint diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp index 52edcf8e18..39b63191e7 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.cpp @@ -3,22 +3,29 @@ #include "module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h" #include "module_parameter/parameter.h" +#ifdef __MPI +#include "source_base/blacs_connector.h" +#include +#endif + namespace ModuleGint { -void compose_hr_gint(std::shared_ptr> hr_gint) +void compose_hr_gint(HContainer& hr_gint) { - for (int iap = 0; iap < hr_gint->size_atom_pairs(); iap++) + ModuleBase::TITLE("Gint", "compose_hr_gint"); + ModuleBase::timer::tick("Gint", "compose_hr_gint"); + for (int iap = 0; iap < hr_gint.size_atom_pairs(); iap++) { - auto& ap = hr_gint->get_atom_pair(iap); + auto& ap = hr_gint.get_atom_pair(iap); const int iat1 = ap.get_atom_i(); const int iat2 = ap.get_atom_j(); if (iat1 > iat2) { // fill lower triangle matrix with upper triangle matrix // the upper is - const hamilt::AtomPair* upper_ap = hr_gint->find_pair(iat2, iat1); - const hamilt::AtomPair* lower_ap = 
hr_gint->find_pair(iat1, iat2); + const hamilt::AtomPair* upper_ap = hr_gint.find_pair(iat2, iat1); + const hamilt::AtomPair* lower_ap = hr_gint.find_pair(iat1, iat2); #ifdef __DEBUG assert(upper_ap != nullptr); #endif @@ -37,22 +44,25 @@ void compose_hr_gint(std::shared_ptr> hr_gint) } } } + ModuleBase::timer::tick("Gint", "compose_hr_gint"); } -void compose_hr_gint(std::vector>> hr_gint_part, - std::shared_ptr>> hr_gint_full) +void compose_hr_gint(const std::vector>& hr_gint_part, + HContainer>& hr_gint_full) { - for (int iap = 0; iap < hr_gint_full->size_atom_pairs(); iap++) + ModuleBase::TITLE("Gint", "compose_hr_gint"); + ModuleBase::timer::tick("Gint", "compose_hr_gint"); + for (int iap = 0; iap < hr_gint_full.size_atom_pairs(); iap++) { - auto* ap = &hr_gint_full->get_atom_pair(iap); + auto* ap = &(hr_gint_full.get_atom_pair(iap)); const int iat1 = ap->get_atom_i(); const int iat2 = ap->get_atom_j(); if (iat1 <= iat2) { hamilt::AtomPair>* upper_ap = ap; - hamilt::AtomPair>* lower_ap = hr_gint_full->find_pair(iat2, iat1); - const hamilt::AtomPair* ap_nspin_0 = hr_gint_part[0]->find_pair(iat1, iat2); - const hamilt::AtomPair* ap_nspin_3 = hr_gint_part[3]->find_pair(iat1, iat2); + hamilt::AtomPair>* lower_ap = hr_gint_full.find_pair(iat2, iat1); + const hamilt::AtomPair* ap_nspin_0 = hr_gint_part[0].find_pair(iat1, iat2); + const hamilt::AtomPair* ap_nspin_3 = hr_gint_part[3].find_pair(iat1, iat2); for (int ir = 0; ir < upper_ap->get_R_size(); ir++) { const auto R_index = upper_ap->get_R_index(ir); @@ -72,8 +82,8 @@ void compose_hr_gint(std::vector>> hr_gint_pa if (PARAM.globalv.domag) { - const hamilt::AtomPair* ap_nspin_1 = hr_gint_part[1]->find_pair(iat1, iat2); - const hamilt::AtomPair* ap_nspin_2 = hr_gint_part[2]->find_pair(iat1, iat2); + const hamilt::AtomPair* ap_nspin_1 = hr_gint_part[1].find_pair(iat1, iat2); + const hamilt::AtomPair* ap_nspin_2 = hr_gint_part[2].find_pair(iat1, iat2); const auto mat_nspin_1 = ap_nspin_1->find_matrix(R_index); const 
auto mat_nspin_2 = ap_nspin_2->find_matrix(R_index); for (int irow = 0; irow < mat_nspin_1->get_row_size(); ++irow) @@ -101,65 +111,68 @@ void compose_hr_gint(std::vector>> hr_gint_pa } } } + ModuleBase::timer::tick("Gint", "compose_hr_gint"); } template -void transfer_hr_gint_to_hR(std::shared_ptr> hr_gint, HContainer* hR) +void transfer_hr_gint_to_hR(const HContainer& hr_gint, HContainer& hR) { + ModuleBase::TITLE("Gint", "transfer_hr_gint_to_hR"); + ModuleBase::timer::tick("Gint", "transfer_hr_gint_to_hR"); #ifdef __MPI int size = 0; MPI_Comm_size(MPI_COMM_WORLD, &size); if (size == 1) { - hR->add(*hr_gint); + hR.add(hr_gint); } else { - hamilt::transferSerials2Parallels(*hr_gint, hR); + hamilt::transferSerials2Parallels(hr_gint, &hR); } #else - hR->add(*hr_gint); + hR.add(hr_gint); #endif + ModuleBase::timer::tick("Gint", "transfer_hr_gint_to_hR"); } // gint_info should not have been a parameter, but it was added to initialize dm_gint_full // In the future, we might try to remove the gint_info parameter template void transfer_dm_2d_to_gint( - std::shared_ptr gint_info, + const GintInfo& gint_info, std::vector*> dm, - std::vector>> dm_gint) + std::vector>& dm_gint) { - // To check whether input parameter dm_2d has been initialized -#ifdef __DEBUG - assert(PARAM.inp.nspin == dm.size() - && "The size of dm should be equal to the number of spins!"); -#endif + ModuleBase::TITLE("Gint", "transfer_dm_2d_to_gint"); + ModuleBase::timer::tick("Gint", "transfer_dm_2d_to_gint"); if (PARAM.inp.nspin != 4) { - for (int is = 0; is < PARAM.inp.nspin; is++) + // dm_gint.size() usually equals to PARAM.inp.nspin, + // but there is exception within module_lr + for (int is = 0; is < dm_gint.size(); is++) { #ifdef __MPI - hamilt::transferParallels2Serials(*dm[is], dm_gint[is].get()); + hamilt::transferParallels2Serials(*dm[is], &dm_gint[is]); #else - dm_gint[is]->set_zero(); - dm_gint[is]->add(*dm[is]); + dm_gint[is].set_zero(); + dm_gint[is].add(*dm[is]); #endif } } else // NSPIN=4 
case { #ifdef __MPI const int npol = 2; - std::shared_ptr> dm_full = gint_info->get_hr(npol); - hamilt::transferParallels2Serials(*dm[0], dm_full.get()); + HContainer dm_full = gint_info.get_hr(npol); + hamilt::transferParallels2Serials(*dm[0], &dm_full); #else - HContainer* dm_full = dm[0]; + HContainer& dm_full = *(dm[0]); #endif std::vector tmp_pointer(4, nullptr); - for (int iap = 0; iap < dm_full->size_atom_pairs(); iap++) + for (int iap = 0; iap < dm_full.size_atom_pairs(); iap++) { - auto& ap = dm_full->get_atom_pair(iap); + auto& ap = dm_full.get_atom_pair(iap); const int iat1 = ap.get_atom_i(); const int iat2 = ap.get_atom_j(); for (int ir = 0; ir < ap.get_R_size(); ir++) @@ -168,7 +181,7 @@ void transfer_dm_2d_to_gint( for (int is = 0; is < 4; is++) { tmp_pointer[is] = - dm_gint[is]->find_matrix(iat1, iat2, r_index)->get_pointer(); + dm_gint[is].find_matrix(iat1, iat2, r_index)->get_pointer(); } T* data_full = ap.get_pointer(ir); for (int irow = 0; irow < ap.get_row_size(); irow += 2) @@ -189,21 +202,142 @@ void transfer_dm_2d_to_gint( } } } + ModuleBase::timer::tick("Gint", "transfer_dm_2d_to_gint"); +} + +int globalIndex(int localindex, int nblk, int nprocs, int myproc) +{ + const int iblock = localindex / nblk; + const int gIndex = (iblock * nprocs + myproc) * nblk + localindex % nblk; + return gIndex; +} + +int localIndex(int globalindex, int nblk, int nprocs, int& myproc) +{ + myproc = int((globalindex % (nblk * nprocs)) / nblk); + return int(globalindex / (nblk * nprocs)) * nblk + globalindex % nblk; } +template +void wfc_2d_to_gint(const T* wfc_2d, + int nbands, // needed if MPI is disabled + int nlocal, // needed if MPI is disabled + const Parallel_Orbitals& pv, + T* wfc_gint, + const GintInfo& gint_info) +{ + ModuleBase::TITLE("Gint", "wfc_2d_to_gint"); + ModuleBase::timer::tick("Gint", "wfc_2d_to_gint"); + +#ifdef __MPI + // dimension related + nlocal = pv.desc_wfc[2]; + nbands = pv.desc_wfc[3]; + + const std::vector& trace_lo = 
gint_info.get_trace_lo(); + + // MPI and memory related + const int mem_stride = 1; + int mpi_info = 0; + + // get the rank of the current process + int rank = 0; + MPI_Comm_rank(pv.comm(), &rank); + + // calculate the maximum number of nlocal over all processes in pv.comm() range + long buf_size; + mpi_info = MPI_Reduce(&pv.nloc_wfc, &buf_size, 1, MPI_LONG, MPI_MAX, 0, pv.comm()); + mpi_info = MPI_Bcast(&buf_size, 1, MPI_LONG, 0, pv.comm()); // get and then broadcast + std::vector wfc_block(buf_size); + + // this quantity seems to have the value returned by function numroc_ in ScaLAPACK? + int naroc[2]; + + // for BLACS broadcast + char scope = 'A'; + char top = ' '; + + // loop over all processors + for (int iprow = 0; iprow < pv.dim0; ++iprow) + { + for (int ipcol = 0; ipcol < pv.dim1; ++ipcol) + { + if (iprow == pv.coord[0] && ipcol == pv.coord[1]) + { + BlasConnector::copy(pv.nloc_wfc, wfc_2d, mem_stride, wfc_block.data(), mem_stride); + naroc[0] = pv.nrow; + naroc[1] = pv.ncol_bands; + Cxgebs2d(pv.blacs_ctxt, &scope, &top, 2, 1, naroc, 2); + Cxgebs2d(pv.blacs_ctxt, &scope, &top, buf_size, 1, wfc_block.data(), buf_size); + } + else + { + Cxgebr2d(pv.blacs_ctxt, &scope, &top, 2, 1, naroc, 2, iprow, ipcol); + Cxgebr2d(pv.blacs_ctxt, &scope, &top, buf_size, 1, wfc_block.data(), buf_size, iprow, ipcol); + } + + // then use it to set the wfc_grid. 
+ const int nb = pv.nb; + const int dim0 = pv.dim0; + const int dim1 = pv.dim1; + for (int j = 0; j < naroc[1]; ++j) + { + int igcol = globalIndex(j, nb, dim1, ipcol); + if (igcol >= PARAM.inp.nbands) + { + continue; + } + for (int i = 0; i < naroc[0]; ++i) + { + int igrow = globalIndex(i, nb, dim0, iprow); + int mu_local = trace_lo[igrow]; + if (wfc_gint && mu_local >= 0) + { + wfc_gint[igcol * nlocal + mu_local] = wfc_block[j * naroc[0] + i]; + } + } + } + // this operation will let all processors have the same wfc_grid + } + } +#else + for (int i = 0; i < nbands; ++i) + { + for (int j = 0; j < nlocal; ++j) + { + wfc_gint[i * nlocal + j] = wfc_2d[i * nlocal + j]; + } + } +#endif + ModuleBase::timer::tick("Gint", "wfc_2d_to_gint"); +} template void transfer_hr_gint_to_hR( - std::shared_ptr> hr_gint, - HContainer* hR); + const HContainer& hr_gint, + HContainer& hR); template void transfer_hr_gint_to_hR( - std::shared_ptr>> hr_gint, - HContainer>* hR); + const HContainer>& hr_gint, + HContainer>& hR); template void transfer_dm_2d_to_gint( - std::shared_ptr gint_info, + const GintInfo& gint_info, std::vector*> dm, - std::vector>> dm_gint); + std::vector>& dm_gint); template void transfer_dm_2d_to_gint( - std::shared_ptr gint_info, + const GintInfo& gint_info, std::vector>*> dm, - std::vector>>> dm_gint); + std::vector>>& dm_gint); +template void wfc_2d_to_gint( + const double* wfc_2d, + int nbands, + int nlocal, + const Parallel_Orbitals& pv, + double* wfc_grid, + const GintInfo& gint_info); +template void wfc_2d_to_gint( + const std::complex* wfc_2d, + int nbands, + int nlocal, + const Parallel_Orbitals& pv, + std::complex* wfc_grid, + const GintInfo& gint_info); } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.h index 47f0eda35b..485978ccf8 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.h +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_common.h @@ -5,18 +5,20 @@ namespace ModuleGint { // fill the lower triangle matrix with the upper triangle matrix - void compose_hr_gint(std::shared_ptr> hr_gint); + void compose_hr_gint(HContainer& hr_gint); // for nspin=4 case - void compose_hr_gint(std::vector>> hr_gint_part, - std::shared_ptr>> hr_gint_full); + void compose_hr_gint(const std::vector>& hr_gint_part, + HContainer>& hr_gint_full); template - void transfer_hr_gint_to_hR(std::shared_ptr> hr_gint, HContainer* hR); + void transfer_hr_gint_to_hR(const HContainer& hr_gint, HContainer& hR); template void transfer_dm_2d_to_gint( - std::shared_ptr gint_info, + const GintInfo& gint_info, std::vector*> dm, - std::vector>> dm_gint); + std::vector>& dm_gint); + template + void wfc_2d_to_gint(const T* wfc_2d, int nbands, int nlocal, const Parallel_Orbitals& pv, T* wfc_grid, const GintInfo& gint_info); } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.cpp new file mode 100644 index 0000000000..78a8b91069 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.cpp @@ -0,0 +1,271 @@ +#include +#include "gint_dvlocal.h" +#include "phi_operator.h" +#include "source_base/parallel_reduce.h" + +namespace ModuleGint +{ + +void Gint_dvlocal::cal_dvlocal() +{ + ModuleBase::TITLE("Gint", "cal_gint_dvlocal"); + ModuleBase::timer::tick("Gint", "cal_gint_dvlocal"); + init_hr_gint_(); + cal_hr_gint_(); + ModuleBase::timer::tick("Gint", "cal_gint_dvlocal"); +} + +void Gint_dvlocal::init_hr_gint_() +{ + pvdpRx = gint_info_->get_hr(); + pvdpRy = gint_info_->get_hr(); + pvdpRz = gint_info_->get_hr(); +} + +void Gint_dvlocal::cal_hr_gint_() +{ +#pragma omp parallel + { + PhiOperator phi_op; + std::vector phi; + std::vector phi_vldr3; + std::vector dphi_x; + std::vector dphi_y; + std::vector dphi_z; +#pragma omp for schedule(dynamic) + for(const auto& biggrid: 
gint_info_->get_biggrids()) + { + if(biggrid->get_atoms().empty()) + { + continue; + } + phi_op.set_bgrid(biggrid); + const int phi_len = phi_op.get_rows() * phi_op.get_cols(); + phi.resize(phi_len); + phi_vldr3.resize(phi_len); + dphi_x.resize(phi_len); + dphi_y.resize(phi_len); + dphi_z.resize(phi_len); + phi_op.set_phi_dphi(phi.data(), dphi_x.data(), dphi_y.data(), dphi_z.data()); + phi_op.phi_mul_vldr3(vr_eff_, dr3_, phi.data(), phi_vldr3.data()); + phi_op.phi_mul_phi(phi_vldr3.data(), dphi_x.data(), pvdpRx, PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(phi_vldr3.data(), dphi_y.data(), pvdpRy, PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(phi_vldr3.data(), dphi_z.data(), pvdpRz, PhiOperator::Triangular_Matrix::Upper); + } + } +} + +void Gint_dvlocal::cal_dvlocal_R_sparseMatrix( + const int nspin, + const int cspin, + const int nlocal, + const double sparse_thr, + const Parallel_Orbitals& pv, + const UnitCell& ucell, + const Grid_Driver& gdriver, + LCAO_HS_Arrays& hs_arrays) +{ + ModuleBase::TITLE("Gint", "cal_dvlocal_R_sparseMatrix"); + ModuleBase::timer::tick("Gint", "cal_dvlocal_R_sparseMatrix"); + std::map, std::map>> pvdpRx_sparseMatrix; + std::map, std::map>> pvdpRy_sparseMatrix; + std::map, std::map>> pvdpRz_sparseMatrix; + + double temp_value_double; + + Vec3d tau1, dtau; + for (int iap = 0; iap < pvdpRx.size_atom_pairs(); iap++) + { + const auto& ap = pvdpRx.get_atom_pair(iap); + const int iat1 = ap.get_atom_i(); + const int iat2 = ap.get_atom_j(); + const int it1 = ucell.iat2it[iat1]; + const int it2 = ucell.iat2it[iat2]; + const Atom* atom1 = &ucell.atoms[it1]; + const Atom* atom2 = &ucell.atoms[it2]; + const int start1 = ucell.itiaiw2iwt(it1, ucell.iat2ia[iat1], 0); + const int start2 = ucell.itiaiw2iwt(it2, ucell.iat2ia[iat2], 0); + + for (int ir = 0; ir < ap.get_R_size(); ir++) + { + const ModuleBase::Vector3 R = ap.get_R_index(ir); + Abfs::Vector3_Order dR(R.x, R.y, R.z); + double* p_pvdpRx = 
pvdpRx.get_atom_pair(iap).get_pointer(ir); + double* p_pvdpRy = pvdpRy.get_atom_pair(iap).get_pointer(ir); + double* p_pvdpRz = pvdpRz.get_atom_pair(iap).get_pointer(ir); + + for (int iw = 0; iw < atom1->nw * npol_; iw++) + { + for (int iw2 = 0; iw2 < atom2->nw * npol_; iw2++) + { + const int nw = atom2->nw; + const int mug0 = iw / npol_; + const int nug0 = iw2 / npol_; + const int iw_nowg = mug0 * nw + nug0; + + double temp_value = p_pvdpRx[iw_nowg]; + if (std::abs(temp_value) > sparse_thr) + { + pvdpRx_sparseMatrix[dR][start1 + iw][start2 + iw2] = temp_value; + } + temp_value = p_pvdpRy[iw_nowg]; + if (std::abs(temp_value) > sparse_thr) + { + pvdpRy_sparseMatrix[dR][start1 + iw][start2 + iw2] = temp_value; + } + temp_value = p_pvdpRz[iw_nowg]; + if (std::abs(temp_value) > sparse_thr) + { + pvdpRz_sparseMatrix[dR][start1 + iw][start2 + iw2] = temp_value; + } + } + } + } + } + distribute_pvdpR_sparseMatrix(cspin, 0, nlocal, sparse_thr, pvdpRx_sparseMatrix, pv, hs_arrays); + distribute_pvdpR_sparseMatrix(cspin, 1, nlocal, sparse_thr, pvdpRy_sparseMatrix, pv, hs_arrays); + distribute_pvdpR_sparseMatrix(cspin, 2, nlocal, sparse_thr, pvdpRz_sparseMatrix, pv, hs_arrays); + ModuleBase::timer::tick("Gint", "cal_dvlocal_R_sparseMatrix"); +} + + +void Gint_dvlocal::distribute_pvdpR_sparseMatrix( + const int cspin, + const int dim, + const int nlocal, + const double sparse_threshold, + const std::map, + std::map>>& + pvdpR_sparseMatrix, + const Parallel_Orbitals& pv, + LCAO_HS_Arrays& hs_arrays) +{ + int total_R_num = hs_arrays.all_R_coor.size(); + std::vector nonzero_num(total_R_num); + std::vector minus_nonzero_num(total_R_num); + int count = 0; + for (const auto& R_coor: hs_arrays.all_R_coor) + { + auto iter = pvdpR_sparseMatrix.find(R_coor); + if (iter != pvdpR_sparseMatrix.end()) + { + for (auto& row_loop: iter->second) + { + nonzero_num[count] += row_loop.second.size(); + } + } + + auto minus_R_coor = -1 * R_coor; + + iter = pvdpR_sparseMatrix.find(minus_R_coor); + if 
(iter != pvdpR_sparseMatrix.end()) + { + for (auto& row_loop: iter->second) + { + minus_nonzero_num[count] += row_loop.second.size(); + } + } + count++; + } + + Parallel_Reduce::reduce_all(nonzero_num.data(), total_R_num); + Parallel_Reduce::reduce_all(minus_nonzero_num.data(), total_R_num); + + std::vector tmp(nlocal); + count = 0; + + const std::vector& trace_lo = gint_info_->get_trace_lo(); + for (const auto& R_coor: hs_arrays.all_R_coor) + { + if (nonzero_num[count] != 0 || minus_nonzero_num[count] != 0) + { + auto minus_R_coor = -1 * R_coor; + + for (int row = 0; row < nlocal; ++row) + { + tmp.assign(tmp.size(), 0); + + auto iter = pvdpR_sparseMatrix.find(R_coor); + if (iter != pvdpR_sparseMatrix.end()) + { + + if (trace_lo[row] >= 0) + { + auto row_iter = iter->second.find(row); + if (row_iter != iter->second.end()) + { + for (auto& value: row_iter->second) + { + tmp[value.first] = value.second; + } + } + } + } + + auto minus_R_iter = pvdpR_sparseMatrix.find(minus_R_coor); + if (minus_R_iter != pvdpR_sparseMatrix.end()) + { + for (int col = 0; col < row; ++col) + { + if (trace_lo[col] >= 0) + { + auto row_iter = minus_R_iter->second.find(col); + if (row_iter != minus_R_iter->second.end()) + { + auto col_iter = row_iter->second.find(row); + if (col_iter != row_iter->second.end()) + { + tmp[col] = col_iter->second; + } + } + } + } + } + + Parallel_Reduce::reduce_pool(tmp.data(), nlocal); + + if (pv.global2local_row(row) >= 0) + { + for (int col = 0; col < nlocal; ++col) + { + if (pv.global2local_col(col) >= 0) + { + if (std::abs(tmp[col]) > sparse_threshold) + { + if (dim == 0) + { + double& value = hs_arrays.dHRx_sparse[cspin][R_coor][row][col]; + value += tmp[col]; + if (std::abs(value) <= sparse_threshold) + { + hs_arrays.dHRx_sparse[cspin][R_coor][row].erase(col); + } + } + if (dim == 1) + { + double& value = hs_arrays.dHRy_sparse[cspin][R_coor][row][col]; + value += tmp[col]; + if (std::abs(value) <= sparse_threshold) + { + 
hs_arrays.dHRy_sparse[cspin][R_coor][row].erase(col); + } + } + if (dim == 2) + { + double& value = hs_arrays.dHRz_sparse[cspin][R_coor][row][col]; + value += tmp[col]; + if (std::abs(value) <= sparse_threshold) + { + hs_arrays.dHRz_sparse[cspin][R_coor][row].erase(col); + } + } + } + } + } + } + } + } + count++; + } +} +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.h new file mode 100644 index 0000000000..77976aad78 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_dvlocal.h @@ -0,0 +1,65 @@ +#pragma once +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "module_hamilt_lcao/hamilt_lcaodft/LCAO_HS_arrays.hpp" +#include "source_base/abfs-vector3_order.h" +#include "gint.h" +#include "gint_info.h" + +namespace ModuleGint +{ + +class Gint_dvlocal : public Gint +{ + public: + Gint_dvlocal( + const double* vr_eff, + const int nspin, + const int npol) + : vr_eff_(vr_eff), nspin_(nspin), npol_(npol), dr3_(gint_info_->get_mgrid_volume()) + { + assert(nspin_ == 2); // currently only npin == 2 is supported + } + + void cal_dvlocal(); + + void cal_dvlocal_R_sparseMatrix( + const int nspin, + const int cspin, + const int nlocal, + const double sparse_thr, + const Parallel_Orbitals& pv, + const UnitCell& ucell, + const Grid_Driver& gdriver, + LCAO_HS_Arrays& hs_arrays); + + private: + void init_hr_gint_(); + + void cal_hr_gint_(); + + void distribute_pvdpR_sparseMatrix( + const int cspin, + const int dim, + const int nlocal, + const double sparse_threshold, + const std::map, + std::map>>& + pvdpR_sparseMatrix, + const Parallel_Orbitals& pv, + LCAO_HS_Arrays& HS_Arrays); + + // input + const double* vr_eff_; + int nspin_; + int npol_; + + // intermediate variables + double dr3_; + HContainer pvdpRx; + HContainer pvdpRy; + HContainer pvdpRz; +}; + +} \ No newline at end of file diff --git 
a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.cpp new file mode 100644 index 0000000000..71fabbd703 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.cpp @@ -0,0 +1,48 @@ +#include "gint_env_gamma.h" +#include "gint_common.h" +#include "phi_operator.h" + +namespace ModuleGint +{ + +Gint_env_gamma::Gint_env_gamma( + const double* psid, + const Parallel_Orbitals* pv, + const int nbands, + const int nlocal, + double* rho) + :rho_(rho) +{ + wfc_gint_.resize(nbands * gint_info_->get_lgd()); + wfc_2d_to_gint(psid, nbands, nlocal, *pv, wfc_gint_.data(), *gint_info_); +} + +void Gint_env_gamma::cal_env_band(const int iband) +{ + ModuleBase::TITLE("Gint", "cal_gint_env"); + ModuleBase::timer::tick("Gint", "cal_gint_env"); + ModuleBase::GlobalFunc::ZEROS(rho_, gint_info_->get_local_mgrid_num()); + const double* wfc_gint_band = &wfc_gint_[iband * gint_info_->get_lgd()]; +#pragma omp parallel + { + PhiOperator phi_op; + std::vector phi; +#pragma omp for schedule(dynamic) + for(const auto& biggrid: gint_info_->get_biggrids()) + { + if(biggrid->get_atoms().empty()) + { + continue; + } + phi_op.set_bgrid(biggrid); + const int phi_len = phi_op.get_rows() * phi_op.get_cols(); + phi.resize(phi_len); + phi_op.set_phi(phi.data()); + phi_op.cal_env_gamma(phi.data(), wfc_gint_band, gint_info_->get_trace_lo(), rho_); + } + } + ModuleBase::timer::tick("Gint", "cal_gint_env"); +} + + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.h new file mode 100644 index 0000000000..6ba3dca4fa --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.h @@ -0,0 +1,32 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "gint.h" +#include "gint_info.h" + +namespace ModuleGint +{ 
+ +class Gint_env_gamma : public Gint +{ + public: + Gint_env_gamma( + const double* psid, + const Parallel_Orbitals* pv, + const int nbands, + const int nlocal, + double* rho); + + void cal_env_band(const int iband); + + private: + // output + double* rho_; + + // intermediate variable + std::vector wfc_gint_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.cpp new file mode 100644 index 0000000000..b92ed8ddfc --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.cpp @@ -0,0 +1,54 @@ +#include "gint_env_k.h" +#include "gint_common.h" +#include "phi_operator.h" + +namespace ModuleGint +{ + +Gint_env_k::Gint_env_k( + const std::complex* psid, + const Parallel_Orbitals* pv, + const std::vector& kvec_c, + const std::vector& kvec_d, + const int nbands, + const int nlocal, + const int ik, + const int nspin, + const int npol, + double* rho) + :kvec_c_(kvec_c), kvec_d_(kvec_d), ik_(ik), nspin_(nspin), npol_(npol), rho_(rho) +{ + wfc_gint_.resize(nbands * gint_info_->get_lgd()); + wfc_2d_to_gint(psid, nbands, nlocal, *pv, wfc_gint_.data(), *gint_info_); +} + +void Gint_env_k::cal_env_band(const int iband) +{ + ModuleBase::TITLE("Gint", "cal_gint_env"); + ModuleBase::timer::tick("Gint", "cal_gint_env"); + ModuleBase::GlobalFunc::ZEROS(rho_, gint_info_->get_local_mgrid_num()); + const std::complex* wfc_gint_band = &wfc_gint_[iband * gint_info_->get_lgd()]; +#pragma omp parallel + { + PhiOperator phi_op; + std::vector phi; +#pragma omp for schedule(dynamic) + for(const auto& biggrid: gint_info_->get_biggrids()) + { + if(biggrid->get_atoms().empty()) + { + continue; + } + phi_op.set_bgrid(biggrid); + const int phi_len = phi_op.get_rows() * phi_op.get_cols(); + phi.resize(phi_len); + phi_op.set_phi(phi.data()); + phi_op.cal_env_k(phi.data(), wfc_gint_band, gint_info_->get_trace_lo(), ik_, nspin_, + npol_, gint_info_->get_lgd(), 
kvec_c_, kvec_d_, rho_); + } + } + ModuleBase::timer::tick("Gint", "cal_gint_env"); +} + + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.h new file mode 100644 index 0000000000..4d1232e591 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_env_k.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "gint.h" +#include "gint_info.h" + +namespace ModuleGint +{ + +class Gint_env_k : public Gint +{ + public: + Gint_env_k( + const std::complex* psid, + const Parallel_Orbitals* pv, + const std::vector& kvec_c, + const std::vector& kvec_d, + const int nbands, + const int nlocal, + const int ik, + const int nspin, + const int npol, + double* rho); + + void cal_env_band(const int iband); + + private: + // input + const std::vector& kvec_c_; + const std::vector& kvec_d_; + int ik_; + int nspin_; + int npol_; + + // output + double* rho_; + + // intermediate variable + std::vector> wfc_gint_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.cpp index 01fd6de0ab..3fc9bde005 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.cpp @@ -8,9 +8,12 @@ namespace ModuleGint void Gint_fvl::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_fvl"); + ModuleBase::timer::tick("Gint", "cal_gint_fvl"); init_dm_gint_(); - transfer_dm_2d_to_gint(gint_info_, dm_vec_, dm_gint_vec_); + transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); cal_fvl_svl_(); + ModuleBase::timer::tick("Gint", "cal_gint_fvl"); } void Gint_fvl::init_dm_gint_() @@ -64,7 +67,7 @@ void Gint_fvl::cal_fvl_svl_() for (int is = 0; is < nspin_; is++) { phi_op.phi_mul_vldr3(vr_eff_[is], dr3_, phi.data(), 
phi_vldr3.data()); - phi_op.phi_mul_dm(phi_vldr3.data(), *dm_gint_vec_[is], false, phi_vldr3_dm.data()); + phi_op.phi_mul_dm(phi_vldr3.data(), dm_gint_vec_[is], false, phi_vldr3_dm.data()); if(isforce_) { phi_op.phi_dot_dphi(phi_vldr3_dm.data(), dphi_x.data(), dphi_y.data(), dphi_z.data(), fvl_thread); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.h index 013c7b2e0a..9e225fed0f 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl.h @@ -23,9 +23,9 @@ class Gint_fvl : public Gint ModuleBase::matrix* svl) : nspin_(nspin), vr_eff_(vr_eff), dm_vec_(dm_vec), isforce_(isforce), isstress_(isstress), fvl_(fvl), svl_(svl), - dr3_(gint_info_->get_mgrid_volume()) {}; + dr3_(gint_info_->get_mgrid_volume()) {} - void cal_gint() override; + void cal_gint(); private: void init_dm_gint_(); @@ -44,7 +44,7 @@ class Gint_fvl : public Gint ModuleBase::matrix* svl_; // intermediate variables - std::vector>> dm_gint_vec_; + std::vector> dm_gint_vec_; double dr3_; }; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp new file mode 100644 index 0000000000..1d90304d2c --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.cpp @@ -0,0 +1,134 @@ +#include "gint_fvl_gpu.h" +#include "gint_common.h" +#include "gint_helper.h" +#include "batch_biggrid.h" +#include "kernel/phi_operator_gpu.h" + +namespace ModuleGint +{ + +void Gint_fvl_gpu::cal_gint() +{ + ModuleBase::TITLE("Gint", "cal_gint_fvl"); + ModuleBase::timer::tick("Gint", "cal_gint_fvl"); + init_dm_gint_(); + transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); + cal_fvl_svl_(); + ModuleBase::timer::tick("Gint", "cal_gint_fvl"); +} + +void Gint_fvl_gpu::init_dm_gint_() +{ + dm_gint_vec_.resize(nspin_); + for (int is = 0; is < nspin_; is++) + { + 
dm_gint_vec_[is] = gint_info_->get_hr(); + } +} + +void Gint_fvl_gpu::transfer_cpu_to_gpu_() +{ + dm_gint_d_vec_.resize(nspin_); + vr_eff_d_vec_.resize(nspin_); + for (int is = 0; is < nspin_; is++) + { + dm_gint_d_vec_[is] = CudaMemWrapper(dm_gint_vec_[is].get_nnr(), 0, false); + checkCuda(cudaMemcpy(dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is].get_wrapper(), + dm_gint_vec_[is].get_nnr() * sizeof(double), cudaMemcpyHostToDevice)); + vr_eff_d_vec_[is] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(vr_eff_d_vec_[is].get_device_ptr(), vr_eff_[is], + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); + } + if (isforce_) + { + fvl_d_ = CudaMemWrapper(gint_info_->get_nat() * 3, 0, true); + } + if (isstress_) + { + svl_d_ = CudaMemWrapper(6, 0, true); + } +} + +void Gint_fvl_gpu::transfer_gpu_to_cpu_() +{ + if (isforce_) + { + fvl_d_.copy_device_to_host_sync(); + for (int iat = 0; iat < gint_info_->get_nat(); iat++) + { + for (int j = 0; j < 3; j++) + { + fvl_[0](iat, j) += fvl_d_.get_host_ptr()[iat * 3 + j]; + } + } + } + if (isstress_) + { + svl_d_.copy_device_to_host_sync(); + svl_[0](0, 0) += svl_d_.get_host_ptr()[0]; + svl_[0](0, 1) += svl_d_.get_host_ptr()[1]; + svl_[0](0, 2) += svl_d_.get_host_ptr()[2]; + svl_[0](1, 1) += svl_d_.get_host_ptr()[3]; + svl_[0](1, 2) += svl_d_.get_host_ptr()[4]; + svl_[0](2, 2) += svl_d_.get_host_ptr()[5]; + } +} + +void Gint_fvl_gpu::cal_fvl_svl_() +{ + transfer_cpu_to_gpu_(); +#pragma omp parallel num_threads(gint_info_->get_streams_num()) + { + // 20240620 Note that it must be set again here because + // cuda's device is not safe in a multi-threaded environment. 
+ checkCuda(cudaSetDevice(gint_info_->get_dev_id())); + cudaStream_t stream; + checkCuda(cudaStreamCreate(&stream)); + PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream); + CudaMemWrapper phi(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_vldr3_dm(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z(BatchBigGrid::get_max_phi_len(), stream, false); + + #pragma omp for schedule(dynamic) + for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + { + if(bgrid_batch->empty()) + { + continue; + } + phi_op.set_bgrid_batch(bgrid_batch); + phi_op.set_phi_dphi(phi.get_device_ptr(), + dphi_x.get_device_ptr(), + dphi_y.get_device_ptr(), + dphi_z.get_device_ptr()); + for(int is = 0; is < nspin_; is++) + { + const bool is_symm = false; + phi_op.phi_mul_vldr3(vr_eff_d_vec_[is].get_device_ptr(), dr3_, + phi.get_device_ptr(), phi_vldr3.get_device_ptr()); + phi_op.phi_mul_dm(phi_vldr3.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), + dm_gint_vec_[is], is_symm, phi_vldr3_dm.get_device_ptr()); + if (isforce_) + { + phi_op.phi_dot_dphi(phi_vldr3_dm.get_device_ptr(), + dphi_x.get_device_ptr(), dphi_y.get_device_ptr(), + dphi_z.get_device_ptr(), fvl_d_.get_device_ptr()); + } + if (isstress_) + { + phi_op.phi_dot_dphi_r(phi_vldr3_dm.get_device_ptr(), + dphi_x.get_device_ptr(), dphi_y.get_device_ptr(), + dphi_z.get_device_ptr(), svl_d_.get_device_ptr()); + } + } + } + checkCuda(cudaStreamSynchronize(stream)); + checkCuda(cudaStreamDestroy(stream)); + } + transfer_gpu_to_cpu_(); +} + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h new file mode 100644 index 0000000000..6d3d341e64 --- 
/dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_gpu.h @@ -0,0 +1,61 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "source_base/matrix.h" +#include "gint.h" +#include "gint_info.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" + +namespace ModuleGint +{ + +class Gint_fvl_gpu : public Gint +{ + public: + Gint_fvl_gpu( + const int nspin, + const std::vector& vr_eff, + const std::vector*>& dm_vec, + const bool isforce, + const bool isstress, + ModuleBase::matrix* fvl, + ModuleBase::matrix* svl) + : nspin_(nspin), vr_eff_(vr_eff), dm_vec_(dm_vec), + isforce_(isforce), isstress_(isstress), fvl_(fvl), svl_(svl), + dr3_(gint_info_->get_mgrid_volume()) {} + + void cal_gint(); + + private: + void init_dm_gint_(); + + void cal_fvl_svl_(); + + void transfer_cpu_to_gpu_(); + void transfer_gpu_to_cpu_(); + // input + const int nspin_; + std::vector vr_eff_; + std::vector*> dm_vec_; + const bool isforce_; + const bool isstress_; + + // output + ModuleBase::matrix* fvl_; + ModuleBase::matrix* svl_; + + // intermediate variables + std::vector> dm_gint_vec_; + + double dr3_; + + // GPU memory + std::vector> vr_eff_d_vec_; + std::vector> dm_gint_d_vec_; + CudaMemWrapper fvl_d_; + CudaMemWrapper svl_d_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.cpp index 15ca44b041..3299600c99 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.cpp @@ -8,9 +8,12 @@ namespace ModuleGint void Gint_fvl_meta::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_fvl"); + ModuleBase::timer::tick("Gint", "cal_gint_fvl"); init_dm_gint_(); - transfer_dm_2d_to_gint(gint_info_, dm_vec_, dm_gint_vec_); + transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); 
cal_fvl_svl_(); + ModuleBase::timer::tick("Gint", "cal_gint_fvl"); } void Gint_fvl_meta::init_dm_gint_() @@ -93,10 +96,10 @@ void Gint_fvl_meta::cal_fvl_svl_() phi_op.phi_mul_vldr3(vofk_[is], dr3_, dphi_x.data(), dphi_x_vldr3.data()); phi_op.phi_mul_vldr3(vofk_[is], dr3_, dphi_y.data(), dphi_y_vldr3.data()); phi_op.phi_mul_vldr3(vofk_[is], dr3_, dphi_z.data(), dphi_z_vldr3.data()); - phi_op.phi_mul_dm(phi_vldr3.data(), *dm_gint_vec_[is], false, phi_vldr3_dm.data()); - phi_op.phi_mul_dm(dphi_x_vldr3.data(), *dm_gint_vec_[is], false, dphi_x_vldr3_dm.data()); - phi_op.phi_mul_dm(dphi_y_vldr3.data(), *dm_gint_vec_[is], false, dphi_y_vldr3_dm.data()); - phi_op.phi_mul_dm(dphi_z_vldr3.data(), *dm_gint_vec_[is], false, dphi_z_vldr3_dm.data()); + phi_op.phi_mul_dm(phi_vldr3.data(), dm_gint_vec_[is], false, phi_vldr3_dm.data()); + phi_op.phi_mul_dm(dphi_x_vldr3.data(), dm_gint_vec_[is], false, dphi_x_vldr3_dm.data()); + phi_op.phi_mul_dm(dphi_y_vldr3.data(), dm_gint_vec_[is], false, dphi_y_vldr3_dm.data()); + phi_op.phi_mul_dm(dphi_z_vldr3.data(), dm_gint_vec_[is], false, dphi_z_vldr3_dm.data()); if(isforce_) { phi_op.phi_dot_dphi(phi_vldr3_dm.data(), dphi_x.data(), dphi_y.data(), dphi_z.data(), fvl_thread); diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.h index 636bbc47b5..1abeac9d11 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta.h @@ -23,9 +23,9 @@ class Gint_fvl_meta : public Gint ModuleBase::matrix* svl) : nspin_(nspin), vr_eff_(vr_eff), vofk_(vofk), dm_vec_(dm_vec), isforce_(isforce), isstress_(isstress), fvl_(fvl), svl_(svl), - dr3_(gint_info_->get_mgrid_volume()) {}; + dr3_(gint_info_->get_mgrid_volume()) {} - void cal_gint() override; + void cal_gint(); private: void init_dm_gint_(); @@ -45,7 +45,7 @@ class Gint_fvl_meta : public Gint ModuleBase::matrix* svl_; // intermediate 
variables - std::vector>> dm_gint_vec_; + std::vector> dm_gint_vec_; double dr3_; }; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp new file mode 100644 index 0000000000..fa19925d04 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.cpp @@ -0,0 +1,182 @@ +#include "gint_fvl_meta_gpu.h" +#include "gint_common.h" +#include "gint_helper.h" +#include "batch_biggrid.h" +#include "kernel/phi_operator_gpu.h" + +namespace ModuleGint +{ + +void Gint_fvl_meta_gpu::cal_gint() +{ + ModuleBase::TITLE("Gint", "cal_gint_fvl"); + ModuleBase::timer::tick("Gint", "cal_gint_fvl"); + init_dm_gint_(); + transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); + cal_fvl_svl_(); + ModuleBase::timer::tick("Gint", "cal_gint_fvl"); +} + +void Gint_fvl_meta_gpu::init_dm_gint_() +{ + dm_gint_vec_.resize(nspin_); + for (int is = 0; is < nspin_; is++) + { + dm_gint_vec_[is] = gint_info_->get_hr(); + } +} + +void Gint_fvl_meta_gpu::transfer_cpu_to_gpu_() +{ + dm_gint_d_vec_.resize(nspin_); + vr_eff_d_vec_.resize(nspin_); + vofk_d_vec_.resize(nspin_); + for (int is = 0; is < nspin_; is++) + { + dm_gint_d_vec_[is] = CudaMemWrapper(dm_gint_vec_[is].get_nnr(), 0, false); + checkCuda(cudaMemcpy(dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is].get_wrapper(), + dm_gint_vec_[is].get_nnr() * sizeof(double), cudaMemcpyHostToDevice)); + vr_eff_d_vec_[is] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(vr_eff_d_vec_[is].get_device_ptr(), vr_eff_[is], + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); + vofk_d_vec_[is] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(vofk_d_vec_[is].get_device_ptr(), vofk_[is], + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); + } + if (isforce_) + { + fvl_d_ = 
CudaMemWrapper(gint_info_->get_nat() * 3, 0, true); + } + if (isstress_) + { + svl_d_ = CudaMemWrapper(6, 0, true); + } +} + +void Gint_fvl_meta_gpu::transfer_gpu_to_cpu_() +{ + if (isforce_) + { + fvl_d_.copy_device_to_host_sync(); + for (int iat = 0; iat < gint_info_->get_nat(); iat++) + { + for (int j = 0; j < 3; j++) + { + fvl_[0](iat, j) += fvl_d_.get_host_ptr()[iat * 3 + j]; + } + } + } + if (isstress_) + { + svl_d_.copy_device_to_host_sync(); + svl_[0](0, 0) += svl_d_.get_host_ptr()[0]; + svl_[0](0, 1) += svl_d_.get_host_ptr()[1]; + svl_[0](0, 2) += svl_d_.get_host_ptr()[2]; + svl_[0](1, 1) += svl_d_.get_host_ptr()[3]; + svl_[0](1, 2) += svl_d_.get_host_ptr()[4]; + svl_[0](2, 2) += svl_d_.get_host_ptr()[5]; + } +} + +void Gint_fvl_meta_gpu::cal_fvl_svl_() +{ + transfer_cpu_to_gpu_(); +#pragma omp parallel num_threads(gint_info_->get_streams_num()) + { + // 20240620 Note that it must be set again here because + // cuda's device is not safe in a multi-threaded environment. + checkCuda(cudaSetDevice(gint_info_->get_dev_id())); + cudaStream_t stream; + checkCuda(cudaStreamCreate(&stream)); + PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream); + CudaMemWrapper phi(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_vldr3_dm(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x_vldr3_dm(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y_vldr3_dm(BatchBigGrid::get_max_phi_len(), 
stream, false); + CudaMemWrapper dphi_z_vldr3_dm(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper ddphi_xx(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper ddphi_xy(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper ddphi_xz(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper ddphi_yy(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper ddphi_yz(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper ddphi_zz(BatchBigGrid::get_max_phi_len(), stream, false); + #pragma omp for schedule(dynamic) + for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + { + if(bgrid_batch->empty()) + { + continue; + } + phi_op.set_bgrid_batch(bgrid_batch); + phi_op.set_phi_dphi(phi.get_device_ptr(), + dphi_x.get_device_ptr(), + dphi_y.get_device_ptr(), + dphi_z.get_device_ptr()); + phi_op.set_ddphi(ddphi_xx.get_device_ptr(), ddphi_xy.get_device_ptr(), + ddphi_xz.get_device_ptr(), ddphi_yy.get_device_ptr(), + ddphi_yz.get_device_ptr(), ddphi_zz.get_device_ptr()); + for(int is = 0; is < nspin_; is++) + { + const bool is_symm = false; + phi_op.phi_mul_vldr3(vr_eff_d_vec_[is].get_device_ptr(), dr3_, + phi.get_device_ptr(), phi_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_vec_[is].get_device_ptr(), dr3_, + dphi_x.get_device_ptr(), dphi_x_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_vec_[is].get_device_ptr(), dr3_, + dphi_y.get_device_ptr(), dphi_y_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_vec_[is].get_device_ptr(), dr3_, + dphi_z.get_device_ptr(), dphi_z_vldr3.get_device_ptr()); + phi_op.phi_mul_dm(phi_vldr3.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), + dm_gint_vec_[is], is_symm, phi_vldr3_dm.get_device_ptr()); + phi_op.phi_mul_dm(dphi_x_vldr3.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), + dm_gint_vec_[is], is_symm, dphi_x_vldr3_dm.get_device_ptr()); + phi_op.phi_mul_dm(dphi_y_vldr3.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), + 
dm_gint_vec_[is], is_symm, dphi_y_vldr3_dm.get_device_ptr()); + phi_op.phi_mul_dm(dphi_z_vldr3.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), + dm_gint_vec_[is], is_symm, dphi_z_vldr3_dm.get_device_ptr()); + if (isforce_) + { + phi_op.phi_dot_dphi(phi_vldr3_dm.get_device_ptr(), + dphi_x.get_device_ptr(), dphi_y.get_device_ptr(), + dphi_z.get_device_ptr(), fvl_d_.get_device_ptr()); + phi_op.phi_dot_dphi(dphi_x_vldr3_dm.get_device_ptr(), + ddphi_xx.get_device_ptr(), ddphi_xy.get_device_ptr(), + ddphi_xz.get_device_ptr(), fvl_d_.get_device_ptr()); + phi_op.phi_dot_dphi(dphi_y_vldr3_dm.get_device_ptr(), + ddphi_xy.get_device_ptr(), ddphi_yy.get_device_ptr(), + ddphi_yz.get_device_ptr(), fvl_d_.get_device_ptr()); + phi_op.phi_dot_dphi(dphi_z_vldr3_dm.get_device_ptr(), + ddphi_xz.get_device_ptr(), ddphi_yz.get_device_ptr(), + ddphi_zz.get_device_ptr(), fvl_d_.get_device_ptr()); + } + if (isstress_) + { + phi_op.phi_dot_dphi_r(phi_vldr3_dm.get_device_ptr(), + dphi_x.get_device_ptr(), dphi_y.get_device_ptr(), + dphi_z.get_device_ptr(), svl_d_.get_device_ptr()); + phi_op.phi_dot_dphi_r(dphi_x_vldr3_dm.get_device_ptr(), + ddphi_xx.get_device_ptr(), ddphi_xy.get_device_ptr(), + ddphi_xz.get_device_ptr(), svl_d_.get_device_ptr()); + phi_op.phi_dot_dphi_r(dphi_y_vldr3_dm.get_device_ptr(), + ddphi_xy.get_device_ptr(), ddphi_yy.get_device_ptr(), + ddphi_yz.get_device_ptr(), svl_d_.get_device_ptr()); + phi_op.phi_dot_dphi_r(dphi_z_vldr3_dm.get_device_ptr(), + ddphi_xz.get_device_ptr(), ddphi_yz.get_device_ptr(), + ddphi_zz.get_device_ptr(), svl_d_.get_device_ptr()); + } + } + } + checkCuda(cudaStreamSynchronize(stream)); + checkCuda(cudaStreamDestroy(stream)); + } + transfer_gpu_to_cpu_(); +} + +} // namespace ModuleGint \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h new file mode 100644 index 0000000000..22baba9d6d --- /dev/null +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_fvl_meta_gpu.h @@ -0,0 +1,64 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "source_base/matrix.h" +#include "gint.h" +#include "gint_info.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" + +namespace ModuleGint +{ +class Gint_fvl_meta_gpu : public Gint +{ + public: + Gint_fvl_meta_gpu( + const int nspin, + const std::vector& vr_eff, + const std::vector& vofk, + const std::vector*>& dm_vec, + const bool isforce, + const bool isstress, + ModuleBase::matrix* fvl, + ModuleBase::matrix* svl) + : nspin_(nspin), vr_eff_(vr_eff), vofk_(vofk), dm_vec_(dm_vec), + isforce_(isforce), isstress_(isstress), fvl_(fvl), svl_(svl), + dr3_(gint_info_->get_mgrid_volume()) {} + + void cal_gint(); + + private: + void init_dm_gint_(); + + void transfer_cpu_to_gpu_(); + + void transfer_gpu_to_cpu_(); + + void cal_fvl_svl_(); + + // input + const int nspin_; + std::vector vr_eff_; + std::vector vofk_; + std::vector*> dm_vec_; + const bool isforce_; + const bool isstress_; + + // output + ModuleBase::matrix* fvl_; + ModuleBase::matrix* svl_; + + // intermediate variables + std::vector> dm_gint_vec_; + + double dr3_; + + std::vector> vr_eff_d_vec_; + std::vector> vofk_d_vec_; + std::vector> dm_gint_d_vec_; + CudaMemWrapper fvl_d_; + CudaMemWrapper svl_d_; +}; + +} // namespace ModuleGint \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_helper.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_helper.h index 687c37df50..a017f81ba0 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_helper.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_helper.h @@ -5,17 +5,13 @@ #include "gint_type.h" #include "source_base/timer.h" -template -std::shared_ptr toConstSharedPtr(std::shared_ptr ptr) { - return std::static_pointer_cast(ptr); -} - - +namespace ModuleGint +{ inline int 
index3Dto1D(const int id_x, const int id_y, const int id_z, const int dim_x, const int dim_y, const int dim_z) { return id_z + id_y * dim_z + id_x * dim_y * dim_z; -}; +} inline Vec3i index1Dto3D(const int index_1d, const int dim_x, const int dim_y, const int dim_z) @@ -24,7 +20,7 @@ inline Vec3i index1Dto3D(const int index_1d, int id_y = (index_1d - id_x * dim_y * dim_z) / dim_z; int id_z = index_1d % dim_z; return Vec3i(id_x, id_y, id_z); -}; +} // if exponent is an integer between 0 and 5 (the most common cases in gint) and // and exp is a variable that cannot be determined at compile time (which means the compiler cannot optimize the code), @@ -49,15 +45,17 @@ inline double pow_int(const double base, const int exp) double result = std::pow(base, exp); return result; } -}; +} inline int floor_div(const int a, const int b) { // a ^ b < 0 means a and b have different signs return a / b - (a % b != 0 && (a ^ b) < 0); -}; +} inline int ceil_div(const int a, const int b) { return a / b + (a % b != 0 && (a ^ b) > 0); -}; \ No newline at end of file +} + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp index 11f2a5d59e..b0738e28e4 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.cpp @@ -41,25 +41,38 @@ GintInfo::GintInfo( biggrids_.push_back(std::make_shared(i)); } - // initialize the atoms + // initialize the atoms and the numerical orbital init_atoms_(ucell_->ntype, ucell_->atoms, Phi); + // initialize trace_lo_ and lgd_ + init_trace_lo_(ucell, PARAM.inp.nspin); + // initialize the ijr_info // this step needs to be done after init_atoms_, because it requires the information of is_atom_on_bgrid init_ijr_info_(ucell, gd); + + #ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + streams_num_ = PARAM.inp.nstream; // the default value of num_stream is 4 + const int 
batch_size = nbz_local; + init_bgrid_batches_(batch_size); + gpu_vars_ = std::make_shared(biggrid_info_, ucell, Phi); + } + #endif } template -std::shared_ptr> GintInfo::get_hr(int npol) const +HContainer GintInfo::get_hr(int npol) const { - auto p_hr = std::make_shared>(ucell_->nat); + auto hr = HContainer(ucell_->nat); if(PARAM.inp.gamma_only) { - p_hr->fix_gamma(); + hr.fix_gamma(); } - p_hr->insert_ijrs(&ijr_info_, *ucell_, npol); - p_hr->allocate(nullptr, true); - return p_hr; + hr.insert_ijrs(&ijr_info_, *ucell_, npol); + hr.allocate(nullptr, true); + return hr; } void GintInfo::init_atoms_(int ntype, const Atom* atoms, const Numerical_Orbital* Phi) @@ -68,12 +81,14 @@ void GintInfo::init_atoms_(int ntype, const Atom* atoms, const Numerical_Orbital int iat = 0; is_atom_in_proc_.resize(ucell_->nat, false); atoms_.resize(ucell_->nat); + orbs_.resize(ntype); // TODO: USE OPENMP TO PARALLELIZE THIS LOOP for(int i = 0; i < ntype; i++) { const auto& atom = atoms[i]; - const auto *orb = &Phi[i]; + orbs_[i] = Phi[i]; + const auto *orb = &orbs_[i]; // rcut extends to the maximum big grids in x, y, z directions Vec3i ext_bgrid = biggrid_info_->max_ext_bgrid_num(atom.Rcut); @@ -124,7 +139,7 @@ void GintInfo::init_atoms_(int ntype, const Atom* atoms, const Numerical_Orbital atom_bgrid_idx.y - ucell_idx_bgrid.y * unitcell_info_->get_nby(), atom_bgrid_idx.z - ucell_idx_bgrid.z * unitcell_info_->get_nbz()); r_to_atom.insert(std::make_pair(ucell_idx_relative, - GintAtom(&atom, j, iat, ext_atom_bgrid_idx, ucell_idx_relative, tau_in_biggrid, orb))); + GintAtom(&atom, i, j, iat, ext_atom_bgrid_idx, ucell_idx_relative, tau_in_biggrid, orb, ucell_))); } if(biggrids_[bgrid_local_idx]->is_atom_on_bgrid(&r_to_atom.at(ucell_idx_relative))) { @@ -140,6 +155,47 @@ void GintInfo::init_atoms_(int ntype, const Atom* atoms, const Numerical_Orbital ModuleBase::timer::tick("GintInfo", "init_atoms"); } +void GintInfo::init_trace_lo_(const UnitCell& ucell, const int nspin) +{ + this->trace_lo_ 
= std::vector(PARAM.globalv.nlocal, -1); + this->lgd_ = 0; + int iat = 0; + int iw_all = 0; + int iw_local = 0; + for (int it = 0; it < ucell.ntype; it++) + { + for (int ia = 0; ia < ucell.atoms[it].na; ia++) + { + if (is_atom_in_proc_[iat]) + { + int nw0 = ucell.atoms[it].nw; + if (nspin== 4) + { // added by zhengdy-soc, need to be double in soc + nw0 *= 2; + this->lgd_ += nw0; + } else { + this->lgd_ += nw0; + } + + for (int iw = 0; iw < nw0; iw++) + { + this->trace_lo_[iw_all] = iw_local; + ++iw_local; + ++iw_all; + } + } else { + // global index of atomic orbitals + iw_all += ucell.atoms[it].nw; + if (nspin == 4) + { + iw_all += ucell.atoms[it].nw; + } + } + ++iat; + } + } +} + void GintInfo::init_ijr_info_(const UnitCell& ucell, Grid_Driver& gd) { HContainer hr_gint_local(ucell.nat); @@ -207,6 +263,22 @@ void GintInfo::init_ijr_info_(const UnitCell& ucell, Grid_Driver& gd) return; } -template std::shared_ptr> GintInfo::get_hr(int npol) const; -template std::shared_ptr>> GintInfo::get_hr>(int npol) const; +#ifdef __CUDA +void GintInfo::init_bgrid_batches_(int batch_size) +{ + for (int i = 0; i < biggrids_.size(); i += batch_size) + { + std::vector> bgrid_vec; + for(int j = i; j < i + batch_size && j < biggrids_.size(); j++) + { + bgrid_vec.push_back(biggrids_[j]); + } + auto bgrid_batch = std::make_shared(bgrid_vec); + bgrid_batches_.push_back(bgrid_batch); + } +} +#endif + +template HContainer GintInfo::get_hr(int npol) const; +template HContainer> GintInfo::get_hr>(int npol) const; } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h index c234ec165c..88f9b7c6bc 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_info.h @@ -13,6 +13,11 @@ #include "localcell_info.h" #include "divide_info.h" +#ifdef __CUDA +#include "batch_biggrid.h" +#include 
"module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h" +#endif + namespace ModuleGint { @@ -29,20 +34,26 @@ class GintInfo const UnitCell& ucell, Grid_Driver& gd); // getter functions - std::vector> get_biggrids() const { return biggrids_; }; - double get_local_mgrid_num() const { return localcell_info_->get_mgrids_num(); }; - double get_mgrid_volume() const { return meshgrid_info_->get_volume(); }; + const std::vector>& get_biggrids() { return biggrids_; } + const std::vector& get_trace_lo() const{ return trace_lo_; } + int get_lgd() const { return lgd_; } + int get_nat() const { return ucell_->nat; } // return the number of atoms in the unitcell + int get_local_mgrid_num() const { return localcell_info_->get_mgrids_num(); } + double get_mgrid_volume() const { return meshgrid_info_->get_volume(); } //========================================= // functions about hcontainer //========================================= template - std::shared_ptr> get_hr(int npol = 1) const; + HContainer get_hr(int npol = 1) const; private: // initialize the atoms void init_atoms_(int ntype, const Atom* atoms, const Numerical_Orbital* Phi); + // initialize trace_lo_ and lgd_ + void init_trace_lo_(const UnitCell& ucell, const int nspin); + // initialize the ijr_info void init_ijr_info_(const UnitCell& ucell, Grid_Driver& gd); @@ -77,6 +88,30 @@ class GintInfo // format for storing atomic pair information in hcontainer, used for initializing hcontainer std::vector ijr_info_; + + // map the global index of atomic orbitals to local index + std::vector trace_lo_; + + // store the information about Numerical orbitals + std::vector orbs_; + + // total num of atomic orbitals on this proc + int lgd_ = 0; + + #ifdef __CUDA + public: + std::vector>& get_bgrid_batches() { return bgrid_batches_; }; + std::shared_ptr get_gpu_vars() const { return gpu_vars_; }; + int get_dev_id() const { return gpu_vars_->dev_id_; }; + int get_streams_num() const { return streams_num_; }; + + private: + 
void init_bgrid_batches_(int batch_size); + std::vector> bgrid_batches_; + std::shared_ptr gpu_vars_; + // More streams can improve parallelism and may speed up grid integration, at the cost of higher GPU memory usage. + int streams_num_; + #endif }; } // namespace ModuleGint diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp index bd945b8b19..a66b061ab3 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.cpp @@ -1,5 +1,6 @@ #include "gint_interface.h" #include "source_base/timer.h" +#include "module_parameter/parameter.h" #include "gint_vl.h" #include "gint_vl_metagga.h" #include "gint_vl_nspin4.h" @@ -9,6 +10,17 @@ #include "gint_rho.h" #include "gint_tau.h" +#ifdef __CUDA +#include "gint_vl_gpu.h" +#include "gint_rho_gpu.h" +#include "gint_fvl_gpu.h" +#include "gint_vl_nspin4_gpu.h" +#include "gint_vl_metagga_gpu.h" +#include "gint_vl_metagga_nspin4_gpu.h" +#include "gint_tau_gpu.h" +#include "gint_fvl_meta_gpu.h" +#endif + namespace ModuleGint { @@ -16,20 +28,35 @@ void cal_gint_vl( const double* vr_eff, HContainer* hR) { - ModuleBase::timer::tick("Gint", "cal_gint_vl"); - Gint_vl gint_vl(vr_eff, hR); - gint_vl.cal_gint(); - ModuleBase::timer::tick("Gint", "cal_gint_vl"); +#ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Gint_vl_gpu gint_vl(vr_eff, hR); + gint_vl.cal_gint(); + } else +#endif + { + Gint_vl gint_vl(vr_eff, hR); + gint_vl.cal_gint(); + } } +// nspin == 4 case void cal_gint_vl( std::vector vr_eff, HContainer>* hR) { - ModuleBase::timer::tick("Gint", "cal_gint_vl"); - Gint_vl_nspin4 gint_vl_nspin4(vr_eff, hR); - gint_vl_nspin4.cal_gint(); - ModuleBase::timer::tick("Gint", "cal_gint_vl"); + #ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Gint_vl_nspin4_gpu gint_vl_nspin4(vr_eff, hR); + gint_vl_nspin4.cal_gint(); + } else + #endif + { + Gint_vl_nspin4 
gint_vl_nspin4(vr_eff, hR); + gint_vl_nspin4.cal_gint(); + } } void cal_gint_vl_metagga( @@ -37,32 +64,55 @@ void cal_gint_vl_metagga( const double* vfork, HContainer* hR) { - ModuleBase::timer::tick("Gint", "cal_gint_vl_metagga"); - Gint_vl_metagga gint_vl_metagga(vr_eff, vfork, hR); - gint_vl_metagga.cal_gint(); - ModuleBase::timer::tick("Gint", "cal_gint_vl_metagga"); +#ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Gint_vl_metagga_gpu gint_vl_metagga(vr_eff, vfork, hR); + gint_vl_metagga.cal_gint(); + } else +#endif + { + Gint_vl_metagga gint_vl_metagga(vr_eff, vfork, hR); + gint_vl_metagga.cal_gint(); + } } +// nspin == 4 case void cal_gint_vl_metagga( std::vector vr_eff, std::vector vofk, HContainer>* hR) { - ModuleBase::timer::tick("Gint", "cal_gint_vl_metagga"); - Gint_vl_metagga_nspin4 gint_vl_metagga_nspin4(vr_eff, vofk, hR); - gint_vl_metagga_nspin4.cal_gint(); - ModuleBase::timer::tick("Gint", "cal_gint_vl_metagga"); +#ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Gint_vl_metagga_nspin4_gpu gint_vl_metagga_nspin4(vr_eff, vofk, hR); + gint_vl_metagga_nspin4.cal_gint(); + } else +#endif + { + Gint_vl_metagga_nspin4 gint_vl_metagga_nspin4(vr_eff, vofk, hR); + gint_vl_metagga_nspin4.cal_gint(); + } } void cal_gint_rho( const std::vector*>& dm_vec, const int nspin, - double **rho) + double **rho, + bool is_dm_symm) { - ModuleBase::timer::tick("Gint", "cal_gint_rho"); - Gint_rho gint_rho(dm_vec, nspin, rho); - gint_rho.cal_gint(); - ModuleBase::timer::tick("Gint", "cal_gint_rho"); + #ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Gint_rho_gpu gint_rho(dm_vec, nspin, rho, is_dm_symm); + gint_rho.cal_gint(); + } else + #endif + { + Gint_rho gint_rho(dm_vec, nspin, rho, is_dm_symm); + gint_rho.cal_gint(); + } } void cal_gint_tau( @@ -70,10 +120,17 @@ void cal_gint_tau( const int nspin, double** tau) { - ModuleBase::timer::tick("Gint", "cal_gint_tau"); - Gint_tau gint_tau(dm_vec, nspin, tau); - gint_tau.cal_gint(); - ModuleBase::timer::tick("Gint", 
"cal_gint_tau"); + #ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Gint_tau_gpu gint_tau(dm_vec, nspin, tau); + gint_tau.cal_gint(); + } else + #endif + { + Gint_tau gint_tau(dm_vec, nspin, tau); + gint_tau.cal_gint(); + } } void cal_gint_fvl( @@ -85,10 +142,17 @@ void cal_gint_fvl( ModuleBase::matrix* fvl, ModuleBase::matrix* svl) { - ModuleBase::timer::tick("Gint", "cal_gint_fvl"); - Gint_fvl gint_fvl(nspin, vr_eff, dm_vec, isforce, isstress, fvl, svl); - gint_fvl.cal_gint(); - ModuleBase::timer::tick("Gint", "cal_gint_fvl"); +#ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Gint_fvl_gpu gint_fvl_gpu(nspin, vr_eff, dm_vec, isforce, isstress, fvl, svl); + gint_fvl_gpu.cal_gint(); + } else +#endif + { + Gint_fvl gint_fvl(nspin, vr_eff, dm_vec, isforce, isstress, fvl, svl); + gint_fvl.cal_gint(); + } } void cal_gint_fvl_meta( @@ -101,10 +165,36 @@ void cal_gint_fvl_meta( ModuleBase::matrix* fvl, ModuleBase::matrix* svl) { - ModuleBase::timer::tick("Gint", "cal_gint_fvl_meta"); - Gint_fvl_meta gint_fvl_meta(nspin, vr_eff, vofk, dm_vec, isforce, isstress, fvl, svl); - gint_fvl_meta.cal_gint(); - ModuleBase::timer::tick("Gint", "cal_gint_fvl_meta"); +#ifdef __CUDA + if(PARAM.inp.device == "gpu") + { + Gint_fvl_meta_gpu gint_fvl_meta(nspin, vr_eff, vofk, dm_vec, isforce, isstress, fvl, svl); + gint_fvl_meta.cal_gint(); + } else +#endif + { + Gint_fvl_meta gint_fvl_meta(nspin, vr_eff, vofk, dm_vec, isforce, isstress, fvl, svl); + gint_fvl_meta.cal_gint(); + } +} + +void cal_dvlocal_R_sparseMatrix( + const int nspin, + const int npol, + const int current_spin, + const int nlocal, + const double sparse_thr, + const double* vr_eff, + const Parallel_Orbitals& pv, + const UnitCell& ucell, + const Grid_Driver& gdriver, + LCAO_HS_Arrays& hs_arrays) +{ + Gint_dvlocal gint_dvlocal(vr_eff, nspin, npol); + gint_dvlocal.cal_dvlocal(); + gint_dvlocal.cal_dvlocal_R_sparseMatrix( + nspin, current_spin, nlocal, sparse_thr, + pv, ucell, gdriver, hs_arrays); } } // namespace 
ModuleGint \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.h index cec6b12e01..f674e24011 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_interface.h @@ -2,7 +2,7 @@ #include #include "module_hamilt_lcao/module_hcontainer/hcontainer.h" #include "gint_type.h" - +#include "gint_dvlocal.h" namespace ModuleGint { @@ -28,7 +28,8 @@ void cal_gint_vl_metagga( void cal_gint_rho( const std::vector*>& dm_vec, const int nspin, - double **rho); + double **rho, + bool is_dm_symm = true); void cal_gint_tau( const std::vector*>& dm_vec, @@ -54,6 +55,17 @@ void cal_gint_fvl_meta( ModuleBase::matrix* fvl, ModuleBase::matrix* svl); +void cal_dvlocal_R_sparseMatrix( + const int nspin, + const int npol, + const int current_spin, + const int nlocal, + const double sparse_thr, + const double* vr_eff, + const Parallel_Orbitals& pv, + const UnitCell& ucell, + const Grid_Driver& gdriver, + LCAO_HS_Arrays& hs_arrays); } // namespace ModuleGint \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.cpp index 2924487c7e..c96b10a731 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.cpp @@ -8,9 +8,12 @@ namespace ModuleGint void Gint_rho::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_rho"); + ModuleBase::timer::tick("Gint", "cal_gint_rho"); init_dm_gint_(); - transfer_dm_2d_to_gint(gint_info_, dm_vec_, dm_gint_vec_); + transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); cal_rho_(); + ModuleBase::timer::tick("Gint", "cal_gint_rho"); } void Gint_rho::init_dm_gint_() @@ -43,7 +46,7 @@ void Gint_rho::cal_rho_() phi_op.set_phi(phi.data()); for (int is = 0; is < nspin_; is++) { - 
phi_op.phi_mul_dm(phi.data(), *dm_gint_vec_[is], true, phi_dm.data()); + phi_op.phi_mul_dm(phi.data(), dm_gint_vec_[is], is_dm_symm_, phi_dm.data()); phi_op.phi_dot_phi(phi.data(), phi_dm.data(), rho_[is]); } } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h index 6bd2b51030..e0a15edbdc 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho.h @@ -15,10 +15,11 @@ class Gint_rho : public Gint Gint_rho( const std::vector*>& dm_vec, const int nspin, - double **rho) - : dm_vec_(dm_vec), nspin_(nspin), rho_(rho) {}; + double **rho, + bool is_dm_symm = true) + : dm_vec_(dm_vec), nspin_(nspin), rho_(rho), is_dm_symm_(is_dm_symm) {} - void cal_gint() override; + void cal_gint(); private: void init_dm_gint_(); @@ -28,14 +29,16 @@ class Gint_rho : public Gint // input const std::vector*> dm_vec_; const int nspin_; + + // if true, it means the DMR matrix is symmetric, + // which leads to faster computations compared to the asymmetric case. 
+ const bool is_dm_symm_; // output double **rho_; - //======================== // Intermediate variables - //======================== - std::vector>> dm_gint_vec_; + std::vector> dm_gint_vec_; }; } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp new file mode 100644 index 0000000000..ca24002579 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.cpp @@ -0,0 +1,86 @@ +#include "gint_rho_gpu.h" +#include "gint_common.h" +#include "gint_helper.h" +#include "batch_biggrid.h" +#include "kernel/phi_operator_gpu.h" + +namespace ModuleGint +{ + +void Gint_rho_gpu::cal_gint() +{ + ModuleBase::TITLE("Gint", "cal_gint_rho"); + ModuleBase::timer::tick("Gint", "cal_gint_rho"); + init_dm_gint_(); + transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); + cal_rho_(); + ModuleBase::timer::tick("Gint", "cal_gint_rho"); +} + +void Gint_rho_gpu::init_dm_gint_() +{ + dm_gint_vec_.resize(nspin_); + for (int is = 0; is < nspin_; is++) + { + dm_gint_vec_[is] = gint_info_->get_hr(); + } +} + +void Gint_rho_gpu::transfer_cpu_to_gpu_() +{ + dm_gint_d_vec_.resize(nspin_); + rho_d_vec_.resize(nspin_); + for (int is = 0; is < nspin_; is++) + { + dm_gint_d_vec_[is] = CudaMemWrapper(dm_gint_vec_[is].get_nnr(), 0, false); + rho_d_vec_[is] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is].get_wrapper(), + dm_gint_vec_[is].get_nnr() * sizeof(double), cudaMemcpyHostToDevice)); + } +} + +void Gint_rho_gpu::transfer_gpu_to_cpu_() +{ + for (int is = 0; is < nspin_; is++) + { + checkCuda(cudaMemcpy(rho_[is], rho_d_vec_[is].get_device_ptr(), + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyDeviceToHost)); + } +} + +void Gint_rho_gpu::cal_rho_() +{ + transfer_cpu_to_gpu_(); +#pragma omp parallel num_threads(gint_info_->get_streams_num()) + { + 
// 20240620 Note that it must be set again here because + // cuda's device is not safe in a multi-threaded environment. + checkCuda(cudaSetDevice(gint_info_->get_dev_id())); + cudaStream_t stream; + checkCuda(cudaStreamCreate(&stream)); + PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream); + CudaMemWrapper phi(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_dm(BatchBigGrid::get_max_phi_len(), stream, false); + #pragma omp for schedule(dynamic) + for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + { + if(bgrid_batch->empty()) + { + continue; + } + phi_op.set_bgrid_batch(bgrid_batch); + phi_op.set_phi(phi.get_device_ptr()); + for(int is = 0; is < nspin_; is++) + { + phi_op.phi_mul_dm(phi.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is], + is_dm_symm_, phi_dm.get_device_ptr()); + phi_op.phi_dot_phi(phi.get_device_ptr(), phi_dm.get_device_ptr(), rho_d_vec_[is].get_device_ptr()); + } + } + checkCuda(cudaStreamSynchronize(stream)); + checkCuda(cudaStreamDestroy(stream)); + } + transfer_gpu_to_cpu_(); +} + +} // namespace ModuleGint \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h new file mode 100644 index 0000000000..13db0f5a85 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_rho_gpu.h @@ -0,0 +1,52 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "gint.h" +#include "gint_info.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" + +namespace ModuleGint +{ + +class Gint_rho_gpu: public Gint +{ + public: + Gint_rho_gpu( + const std::vector*>& dm_vec, + const int nspin, + double **rho, + bool is_dm_symm = true) + : dm_vec_(dm_vec), nspin_(nspin), rho_(rho), is_dm_symm_(is_dm_symm) {} + + void cal_gint(); + + private: + void init_dm_gint_(); + + void cal_rho_(); + + void 
transfer_cpu_to_gpu_(); + + void transfer_gpu_to_cpu_(); + + // input + const std::vector*> dm_vec_; + const int nspin_; + + // if true, it means the DMR matrix is symmetric, + // which leads to faster computations compared to the asymmetric case. + const bool is_dm_symm_; + + // output + double **rho_; + + // Intermediate variables + std::vector> dm_gint_vec_; + + std::vector> dm_gint_d_vec_; + std::vector> rho_d_vec_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.cpp index f5d0b70a0c..1b5e282384 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.cpp @@ -8,9 +8,12 @@ namespace ModuleGint void Gint_tau::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_tau"); + ModuleBase::timer::tick("Gint", "cal_gint_tau"); init_dm_gint_(); - transfer_dm_2d_to_gint(gint_info_, dm_vec_, dm_gint_vec_); + transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); cal_tau_(); + ModuleBase::timer::tick("Gint", "cal_gint_tau"); } void Gint_tau::init_dm_gint_() @@ -51,9 +54,9 @@ void Gint_tau::cal_tau_() phi_op.set_phi_dphi(nullptr, dphi_x.data(), dphi_y.data(), dphi_z.data()); for (int is = 0; is < nspin_; is++) { - phi_op.phi_mul_dm(dphi_x.data(), *dm_gint_vec_[is], true, dphi_x_dm.data()); - phi_op.phi_mul_dm(dphi_y.data(), *dm_gint_vec_[is], true, dphi_y_dm.data()); - phi_op.phi_mul_dm(dphi_z.data(), *dm_gint_vec_[is], true, dphi_z_dm.data()); + phi_op.phi_mul_dm(dphi_x.data(), dm_gint_vec_[is], true, dphi_x_dm.data()); + phi_op.phi_mul_dm(dphi_y.data(), dm_gint_vec_[is], true, dphi_y_dm.data()); + phi_op.phi_mul_dm(dphi_z.data(), dm_gint_vec_[is], true, dphi_z_dm.data()); phi_op.phi_dot_phi(dphi_x.data(), dphi_x_dm.data(), kin_[is]); phi_op.phi_dot_phi(dphi_y.data(), dphi_y_dm.data(), kin_[is]); phi_op.phi_dot_phi(dphi_z.data(), dphi_z_dm.data(), kin_[is]); diff --git 
a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h index d36552a79e..b1d3b0664a 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau.h @@ -15,9 +15,9 @@ class Gint_tau : public Gint const std::vector*>& dm_vec, const int nspin, double** tau) - : dm_vec_(dm_vec), nspin_(nspin), kin_(tau) {}; + : dm_vec_(dm_vec), nspin_(nspin), kin_(tau) {} - void cal_gint() override; + void cal_gint(); private: void init_dm_gint_(); @@ -31,10 +31,8 @@ class Gint_tau : public Gint // output double **kin_; - //======================== // Intermediate variables - //======================== - std::vector>> dm_gint_vec_; + std::vector> dm_gint_vec_; }; } // namespace ModuleGint diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp new file mode 100644 index 0000000000..cbeeead322 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.cpp @@ -0,0 +1,98 @@ +#include "gint_tau_gpu.h" +#include "gint_common.h" +#include "gint_helper.h" +#include "batch_biggrid.h" +#include "kernel/phi_operator_gpu.h" + +namespace ModuleGint +{ + +void Gint_tau_gpu::cal_gint() +{ + ModuleBase::TITLE("Gint", "cal_gint_tau"); + ModuleBase::timer::tick("Gint", "cal_gint_tau"); + init_dm_gint_(); + transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_); + cal_tau_(); + ModuleBase::timer::tick("Gint", "cal_gint_tau"); +} + +void Gint_tau_gpu::init_dm_gint_() +{ + dm_gint_vec_.resize(nspin_); + for (int is = 0; is < nspin_; is++) + { + dm_gint_vec_[is] = gint_info_->get_hr(); + } +} + +void Gint_tau_gpu::transfer_cpu_to_gpu_() +{ + dm_gint_d_vec_.resize(nspin_); + kin_d_vec_.resize(nspin_); + for (int is = 0; is < nspin_; is++) + { + dm_gint_d_vec_[is] = CudaMemWrapper(dm_gint_vec_[is].get_nnr(), 0, false); + kin_d_vec_[is] = 
CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is].get_wrapper(), + dm_gint_vec_[is].get_nnr() * sizeof(double), cudaMemcpyHostToDevice)); + } +} + +void Gint_tau_gpu::transfer_gpu_to_cpu_() +{ + for (int is = 0; is < nspin_; is++) + { + checkCuda(cudaMemcpy(kin_[is], kin_d_vec_[is].get_device_ptr(), + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyDeviceToHost)); + } +} + +void Gint_tau_gpu::cal_tau_() +{ + transfer_cpu_to_gpu_(); +#pragma omp parallel num_threads(gint_info_->get_streams_num()) + { + // 20240620 Note that it must be set again here because + // cuda's device is not safe in a multi-threaded environment. + checkCuda(cudaSetDevice(gint_info_->get_dev_id())); + cudaStream_t stream; + checkCuda(cudaStreamCreate(&stream)); + PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream); + CudaMemWrapper dphi_x(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x_dm(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y_dm(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z_dm(BatchBigGrid::get_max_phi_len(), stream, false); + #pragma omp for schedule(dynamic) + for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + { + if(bgrid_batch->empty()) + { + continue; + } + phi_op.set_bgrid_batch(bgrid_batch); + phi_op.set_phi_dphi(nullptr, + dphi_x.get_device_ptr(), dphi_y.get_device_ptr(), dphi_z.get_device_ptr()); + for(int is = 0; is < nspin_; is++) + { + const bool is_symm = true; + phi_op.phi_mul_dm(dphi_x.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is], + is_symm, dphi_x_dm.get_device_ptr()); + phi_op.phi_mul_dm(dphi_y.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is], + is_symm, 
dphi_y_dm.get_device_ptr()); + phi_op.phi_mul_dm(dphi_z.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is], + is_symm, dphi_z_dm.get_device_ptr()); + phi_op.phi_dot_phi(dphi_x.get_device_ptr(), dphi_x_dm.get_device_ptr(), kin_d_vec_[is].get_device_ptr()); + phi_op.phi_dot_phi(dphi_y.get_device_ptr(), dphi_y_dm.get_device_ptr(), kin_d_vec_[is].get_device_ptr()); + phi_op.phi_dot_phi(dphi_z.get_device_ptr(), dphi_z_dm.get_device_ptr(), kin_d_vec_[is].get_device_ptr()); + } + } + checkCuda(cudaStreamSynchronize(stream)); + checkCuda(cudaStreamDestroy(stream)); + } + transfer_gpu_to_cpu_(); +} + +} diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h new file mode 100644 index 0000000000..bfac5a48a3 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_tau_gpu.h @@ -0,0 +1,47 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "gint.h" +#include "gint_info.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" + +namespace ModuleGint +{ + +class Gint_tau_gpu : public Gint +{ + public: + Gint_tau_gpu( + const std::vector*>& dm_vec, + const int nspin, + double** tau) + : dm_vec_(dm_vec), nspin_(nspin), kin_(tau) {} + + void cal_gint(); + + private: + void init_dm_gint_(); + + void transfer_cpu_to_gpu_(); + + void transfer_gpu_to_cpu_(); + + void cal_tau_(); + + // input + const std::vector*> dm_vec_; + const int nspin_; + + // output + double **kin_; + + // Intermediate variables + std::vector> dm_gint_vec_; + + std::vector> dm_gint_d_vec_; + std::vector> kin_d_vec_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_type.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_type.h index 9cf623765b..4d1b2e8537 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_type.h +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_type.h @@ -4,9 +4,12 @@ #include "source_base/vector3.h" #include "source_base/matrix3.h" -using Matrix3 = ModuleBase::Matrix3; -using Vec3d = ModuleBase::Vector3; -using Vec3i = ModuleBase::Vector3; +namespace ModuleGint +{ + using Matrix3 = ModuleBase::Matrix3; + using Vec3d = ModuleBase::Vector3; + using Vec3i = ModuleBase::Vector3; -template -using HContainer = hamilt::HContainer; \ No newline at end of file + template + using HContainer = hamilt::HContainer; +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.cpp index b0107bf064..ee40327d72 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.cpp @@ -9,10 +9,13 @@ namespace ModuleGint void Gint_vl::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_vl"); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); init_hr_gint_(); cal_hr_gint_(); compose_hr_gint(hr_gint_); - transfer_hr_gint_to_hR(toConstSharedPtr(hr_gint_), hR_); + transfer_hr_gint_to_hR(hr_gint_, *hR_); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); } //======================== @@ -26,8 +29,6 @@ void Gint_vl::init_hr_gint_() void Gint_vl::cal_hr_gint_() { -// be careful!! 
-// each thread will have a copy of hr_gint_, this may cause a lot of memory usage #pragma omp parallel { PhiOperator phi_op; @@ -46,7 +47,7 @@ void Gint_vl::cal_hr_gint_() phi_vldr3.resize(phi_len); phi_op.set_phi(phi.data()); phi_op.phi_mul_vldr3(vr_eff_, dr3_, phi.data(), phi_vldr3.data()); - phi_op.phi_mul_phi(phi.data(), phi_vldr3.data(), *hr_gint_, PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(phi.data(), phi_vldr3.data(), hr_gint_, PhiOperator::Triangular_Matrix::Upper); } } } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h index fa3f4b9888..fc717629c5 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl.h @@ -15,9 +15,9 @@ class Gint_vl : public Gint Gint_vl( const double* vr_eff, HContainer* hR) - : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; + : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} - void cal_gint() override; + void cal_gint(); private: @@ -33,12 +33,10 @@ class Gint_vl : public Gint // output HContainer* hR_; - //======================== // Intermediate variables - //======================== double dr3_; - std::shared_ptr> hr_gint_; + HContainer hr_gint_; }; } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp new file mode 100644 index 0000000000..fe9162bc4e --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.cpp @@ -0,0 +1,73 @@ +#include "gint_vl_gpu.h" +#include "gint_common.h" +#include "gint_helper.h" +#include "batch_biggrid.h" +#include "kernel/phi_operator_gpu.h" + +namespace ModuleGint +{ + +void Gint_vl_gpu::cal_gint() +{ + ModuleBase::TITLE("Gint", "cal_gint_vl"); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); + init_hr_gint_(); + cal_hr_gint_(); + 
compose_hr_gint(hr_gint_); + transfer_hr_gint_to_hR(hr_gint_, *hR_); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); +} + +void Gint_vl_gpu::init_hr_gint_() +{ + hr_gint_ = gint_info_->get_hr(); +} + +void Gint_vl_gpu::transfer_cpu_to_gpu_() +{ + hr_gint_d_ = CudaMemWrapper(hr_gint_.get_nnr(), 0, false); + vr_eff_d_ = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(vr_eff_d_.get_device_ptr(), vr_eff_, + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); +} + +void Gint_vl_gpu::transfer_gpu_to_cpu_() +{ + checkCuda(cudaMemcpy(hr_gint_.get_wrapper(), hr_gint_d_.get_device_ptr(), + hr_gint_.get_nnr() * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void Gint_vl_gpu::cal_hr_gint_() +{ + transfer_cpu_to_gpu_(); +#pragma omp parallel num_threads(gint_info_->get_streams_num()) + { + // 20240620 Note that it must be set again here because + // cuda's device is not safe in a multi-threaded environment. + checkCuda(cudaSetDevice(gint_info_->get_dev_id())); + cudaStream_t stream; + checkCuda(cudaStreamCreate(&stream)); + PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream); + CudaMemWrapper phi(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + #pragma omp for schedule(dynamic) + for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + { + if(bgrid_batch->empty()) + { + continue; + } + phi_op.set_bgrid_batch(bgrid_batch); + phi_op.set_phi(phi.get_device_ptr()); + phi_op.phi_mul_vldr3(vr_eff_d_.get_device_ptr(), dr3_, + phi.get_device_ptr(), phi_vldr3.get_device_ptr()); + phi_op.phi_mul_phi(phi.get_device_ptr(), phi_vldr3.get_device_ptr(), + hr_gint_, hr_gint_d_.get_device_ptr()); + } + checkCuda(cudaStreamSynchronize(stream)); + checkCuda(cudaStreamDestroy(stream)); + } + transfer_gpu_to_cpu_(); +} + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h new file mode 100644 index 0000000000..53290658c8 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_gpu.h @@ -0,0 +1,49 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "gint.h" +#include "gint_info.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" + +namespace ModuleGint +{ + +class Gint_vl_gpu : public Gint +{ + public: + Gint_vl_gpu( + const double* vr_eff, + HContainer* hR) + : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} + + void cal_gint(); + + private: + + void init_hr_gint_(); + + void transfer_cpu_to_gpu_(); + + void transfer_gpu_to_cpu_(); + + void cal_hr_gint_(); + + // input + const double* vr_eff_; + + + // output + HContainer* hR_; + + // Intermediate variables + double dr3_; + + HContainer hr_gint_; + + CudaMemWrapper hr_gint_d_; + CudaMemWrapper vr_eff_d_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.cpp index fa651a89e1..2c885adca2 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.cpp @@ -9,10 +9,13 @@ namespace ModuleGint void Gint_vl_metagga::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_vl"); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); init_hr_gint_(); cal_hr_gint_(); compose_hr_gint(hr_gint_); - transfer_hr_gint_to_hR(toConstSharedPtr(hr_gint_), hR_); + transfer_hr_gint_to_hR(hr_gint_, *hR_); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); } //======================== @@ -26,8 +29,6 @@ void Gint_vl_metagga::init_hr_gint_() void Gint_vl_metagga::cal_hr_gint_() { -// be careful!! 
-// each thread will have a copy of hr_gint_, this may cause a lot of memory usage #pragma omp parallel { PhiOperator phi_op; @@ -61,10 +62,10 @@ void Gint_vl_metagga::cal_hr_gint_() phi_op.phi_mul_vldr3(vofk_, dr3_, dphi_x.data(), dphi_x_vldr3.data()); phi_op.phi_mul_vldr3(vofk_, dr3_, dphi_y.data(), dphi_y_vldr3.data()); phi_op.phi_mul_vldr3(vofk_, dr3_, dphi_z.data(), dphi_z_vldr3.data()); - phi_op.phi_mul_phi(phi.data(), phi_vldr3.data(), *hr_gint_, PhiOperator::Triangular_Matrix::Upper); - phi_op.phi_mul_phi(dphi_x.data(), dphi_x_vldr3.data(), *hr_gint_, PhiOperator::Triangular_Matrix::Upper); - phi_op.phi_mul_phi(dphi_y.data(), dphi_y_vldr3.data(), *hr_gint_, PhiOperator::Triangular_Matrix::Upper); - phi_op.phi_mul_phi(dphi_z.data(), dphi_z_vldr3.data(), *hr_gint_, PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(phi.data(), phi_vldr3.data(), hr_gint_, PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(dphi_x.data(), dphi_x_vldr3.data(), hr_gint_, PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(dphi_y.data(), dphi_y_vldr3.data(), hr_gint_, PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(dphi_z.data(), dphi_z_vldr3.data(), hr_gint_, PhiOperator::Triangular_Matrix::Upper); } } } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h index 6ae267f7d4..01bef660a2 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga.h @@ -16,9 +16,9 @@ class Gint_vl_metagga : public Gint const double* vr_eff, const double* vofk, HContainer* hR) - : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; + : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} - void cal_gint() override; + void cal_gint(); private: @@ -35,12 +35,10 @@ class Gint_vl_metagga : public Gint // output HContainer* hR_; - 
//======================== // Intermediate variables - //======================== double dr3_; - std::shared_ptr> hr_gint_; + HContainer hr_gint_; }; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp new file mode 100644 index 0000000000..9c2dad8421 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.cpp @@ -0,0 +1,98 @@ +#include "gint_vl_metagga_gpu.h" +#include "gint_common.h" +#include "gint_helper.h" +#include "batch_biggrid.h" +#include "kernel/phi_operator_gpu.h" + +namespace ModuleGint +{ + +void Gint_vl_metagga_gpu::cal_gint() +{ + ModuleBase::TITLE("Gint", "cal_gint_vl"); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); + init_hr_gint_(); + cal_hr_gint_(); + compose_hr_gint(hr_gint_); + transfer_hr_gint_to_hR(hr_gint_, *hR_); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); +} + +//======================== +// Private functions +//======================== + +void Gint_vl_metagga_gpu::init_hr_gint_() +{ + hr_gint_ = gint_info_->get_hr(); +} + +void Gint_vl_metagga_gpu::transfer_cpu_to_gpu_() +{ + hr_gint_d_ = CudaMemWrapper(hr_gint_.get_nnr(), 0, false); + vr_eff_d_ = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + vofk_d_ = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(vr_eff_d_.get_device_ptr(), vr_eff_, + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); + checkCuda(cudaMemcpy(vofk_d_.get_device_ptr(), vofk_, + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); +} + +void Gint_vl_metagga_gpu::transfer_gpu_to_cpu_() +{ + checkCuda(cudaMemcpy(hr_gint_.get_wrapper(), hr_gint_d_.get_device_ptr(), + hr_gint_.get_nnr() * sizeof(double), cudaMemcpyDeviceToHost)); +} + +void Gint_vl_metagga_gpu::cal_hr_gint_() +{ + transfer_cpu_to_gpu_(); +#pragma omp parallel num_threads(gint_info_->get_streams_num()) + { + // 
20240620 Note that it must be set again here because + // cuda's device is not safe in a multi-threaded environment. + checkCuda(cudaSetDevice(gint_info_->get_dev_id())); + cudaStream_t stream; + checkCuda(cudaStreamCreate(&stream)); + PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream); + CudaMemWrapper phi(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + #pragma omp for schedule(dynamic) + for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + { + if(bgrid_batch->empty()) + { + continue; + } + phi_op.set_bgrid_batch(bgrid_batch); + phi_op.set_phi_dphi(phi.get_device_ptr(), + dphi_x.get_device_ptr(), dphi_y.get_device_ptr(), dphi_z.get_device_ptr()); + phi_op.phi_mul_vldr3(vr_eff_d_.get_device_ptr(), dr3_, + phi.get_device_ptr(), phi_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_.get_device_ptr(), dr3_, + dphi_x.get_device_ptr(), dphi_x_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_.get_device_ptr(), dr3_, + dphi_y.get_device_ptr(), dphi_y_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_.get_device_ptr(), dr3_, + dphi_z.get_device_ptr(), dphi_z_vldr3.get_device_ptr()); + phi_op.phi_mul_phi(phi.get_device_ptr(), phi_vldr3.get_device_ptr(), + hr_gint_, hr_gint_d_.get_device_ptr()); + phi_op.phi_mul_phi(dphi_x.get_device_ptr(), dphi_x_vldr3.get_device_ptr(), + hr_gint_, hr_gint_d_.get_device_ptr()); + phi_op.phi_mul_phi(dphi_y.get_device_ptr(), dphi_y_vldr3.get_device_ptr(), + hr_gint_, 
hr_gint_d_.get_device_ptr()); + phi_op.phi_mul_phi(dphi_z.get_device_ptr(), dphi_z_vldr3.get_device_ptr(), + hr_gint_, hr_gint_d_.get_device_ptr()); + } + checkCuda(cudaStreamSynchronize(stream)); + checkCuda(cudaStreamDestroy(stream)); + } + transfer_gpu_to_cpu_(); +} +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h new file mode 100644 index 0000000000..efdc01762a --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_gpu.h @@ -0,0 +1,53 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "gint.h" +#include "gint_info.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" + +namespace ModuleGint +{ + +class Gint_vl_metagga_gpu : public Gint +{ + public: + Gint_vl_metagga_gpu( + const double* vr_eff, + const double* vofk, + HContainer* hR) + : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} + + void cal_gint(); + + private: + + void init_hr_gint_(); + + void transfer_cpu_to_gpu_(); + + void transfer_gpu_to_cpu_(); + + // note that only the upper triangle matrix of hR is calculated + // that's why we need compose_hr_gint() to fill the lower triangle matrix. 
+ void cal_hr_gint_(); + + // input + const double* vr_eff_; + const double* vofk_; + + // output + HContainer* hR_; + + // Intermediate variables + double dr3_; + + HContainer hr_gint_; + + CudaMemWrapper hr_gint_d_; + CudaMemWrapper vr_eff_d_; + CudaMemWrapper vofk_d_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.cpp index 986b182c09..5c6086031c 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.cpp @@ -11,10 +11,13 @@ namespace ModuleGint void Gint_vl_metagga_nspin4::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_vl"); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); init_hr_gint_(); cal_hr_gint_(); compose_hr_gint(hr_gint_part_, hr_gint_full_); - transfer_hr_gint_to_hR(toConstSharedPtr(hr_gint_full_), hR_); + transfer_hr_gint_to_hR(hr_gint_full_, *hR_); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); } void Gint_vl_metagga_nspin4::init_hr_gint_() @@ -65,10 +68,10 @@ void Gint_vl_metagga_nspin4::cal_hr_gint_() phi_op.phi_mul_vldr3(vofk_[is], dr3_, dphi_x.data(), dphi_x_vldr3.data()); phi_op.phi_mul_vldr3(vofk_[is], dr3_, dphi_y.data(), dphi_y_vldr3.data()); phi_op.phi_mul_vldr3(vofk_[is], dr3_, dphi_z.data(), dphi_z_vldr3.data()); - phi_op.phi_mul_phi(phi.data(), phi_vldr3.data(), *hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); - phi_op.phi_mul_phi(dphi_x.data(), dphi_x_vldr3.data(), *hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); - phi_op.phi_mul_phi(dphi_y.data(), dphi_y_vldr3.data(), *hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); - phi_op.phi_mul_phi(dphi_z.data(), dphi_z_vldr3.data(), *hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(phi.data(), phi_vldr3.data(), hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); + 
phi_op.phi_mul_phi(dphi_x.data(), dphi_x_vldr3.data(), hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(dphi_y.data(), dphi_y_vldr3.data(), hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(dphi_z.data(), dphi_z_vldr3.data(), hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); } } } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h index 40bf386fa3..abdbde3f08 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4.h @@ -16,9 +16,9 @@ class Gint_vl_metagga_nspin4 : public Gint std::vector vr_eff, std::vector vofk, HContainer>* hR) - : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; + : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} - void cal_gint() override; + void cal_gint(); private: void init_hr_gint_(); @@ -31,15 +31,13 @@ class Gint_vl_metagga_nspin4 : public Gint // output HContainer>* hR_; - //======================== // Intermediate variables - //======================== const double dr3_; const int nspin_ = 4; - std::vector>> hr_gint_part_; - std::shared_ptr>> hr_gint_full_; + std::vector> hr_gint_part_; + HContainer> hr_gint_full_; }; } \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp new file mode 100644 index 0000000000..9adc4cb137 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.cpp @@ -0,0 +1,113 @@ +#include "gint_vl_metagga_nspin4_gpu.h" +#include "gint_common.h" +#include "gint_helper.h" +#include "batch_biggrid.h" +#include "kernel/phi_operator_gpu.h" + +namespace ModuleGint +{ + +void 
Gint_vl_metagga_nspin4_gpu::cal_gint() +{ + ModuleBase::TITLE("Gint", "cal_gint_vl"); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); + init_hr_gint_(); + cal_hr_gint_(); + compose_hr_gint(hr_gint_part_, hr_gint_full_); + transfer_hr_gint_to_hR(hr_gint_full_, *hR_); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); +} + +void Gint_vl_metagga_nspin4_gpu::init_hr_gint_() +{ + hr_gint_part_.resize(nspin_); + for(int i = 0; i < nspin_; i++) + { + hr_gint_part_[i] = gint_info_->get_hr(); + } + const int npol = 2; + hr_gint_full_ = gint_info_->get_hr>(npol); +} + +void Gint_vl_metagga_nspin4_gpu::transfer_cpu_to_gpu_() +{ + vr_eff_d_.resize(nspin_); + vofk_d_.resize(nspin_); + hr_gint_part_d_.resize(nspin_); + for(int i = 0; i < nspin_; i++) + { + hr_gint_part_d_[i] = CudaMemWrapper(hr_gint_part_[i].get_nnr(), 0, false); + vr_eff_d_[i] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + vofk_d_[i] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(vr_eff_d_[i].get_device_ptr(), vr_eff_[i], + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); + checkCuda(cudaMemcpy(vofk_d_[i].get_device_ptr(), vofk_[i], + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); + } +} + +void Gint_vl_metagga_nspin4_gpu::transfer_gpu_to_cpu_() +{ + for(int i = 0; i < nspin_; i++) + { + checkCuda(cudaMemcpy(hr_gint_part_[i].get_wrapper(), hr_gint_part_d_[i].get_device_ptr(), + hr_gint_part_[i].get_nnr() * sizeof(double), cudaMemcpyDeviceToHost)); + } +} + +void Gint_vl_metagga_nspin4_gpu::cal_hr_gint_() +{ + transfer_cpu_to_gpu_(); +#pragma omp parallel num_threads(gint_info_->get_streams_num()) + { + // 20240620 Note that it must be set again here because + // cuda's device is not safe in a multi-threaded environment. 
+ checkCuda(cudaSetDevice(gint_info_->get_dev_id())); + cudaStream_t stream; + checkCuda(cudaStreamCreate(&stream)); + PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream); + CudaMemWrapper phi(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_x_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_y_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper dphi_z_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + #pragma omp for schedule(dynamic) + for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + { + if(bgrid_batch->empty()) + { + continue; + } + phi_op.set_bgrid_batch(bgrid_batch); + phi_op.set_phi_dphi(phi.get_device_ptr(), + dphi_x.get_device_ptr(), dphi_y.get_device_ptr(), dphi_z.get_device_ptr()); + for(int is = 0; is < nspin_; is++) + { + phi_op.phi_mul_vldr3(vr_eff_d_[is].get_device_ptr(), dr3_, + phi.get_device_ptr(), phi_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_[is].get_device_ptr(), dr3_, + dphi_x.get_device_ptr(), dphi_x_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_[is].get_device_ptr(), dr3_, + dphi_y.get_device_ptr(), dphi_y_vldr3.get_device_ptr()); + phi_op.phi_mul_vldr3(vofk_d_[is].get_device_ptr(), dr3_, + dphi_z.get_device_ptr(), dphi_z_vldr3.get_device_ptr()); + phi_op.phi_mul_phi(phi.get_device_ptr(), phi_vldr3.get_device_ptr(), + hr_gint_part_[is], hr_gint_part_d_[is].get_device_ptr()); + phi_op.phi_mul_phi(dphi_x.get_device_ptr(), dphi_x_vldr3.get_device_ptr(), + hr_gint_part_[is], hr_gint_part_d_[is].get_device_ptr()); + phi_op.phi_mul_phi(dphi_y.get_device_ptr(), dphi_y_vldr3.get_device_ptr(), + hr_gint_part_[is], 
hr_gint_part_d_[is].get_device_ptr()); + phi_op.phi_mul_phi(dphi_z.get_device_ptr(), dphi_z_vldr3.get_device_ptr(), + hr_gint_part_[is], hr_gint_part_d_[is].get_device_ptr()); + } + } + checkCuda(cudaStreamSynchronize(stream)); + checkCuda(cudaStreamDestroy(stream)); + } + transfer_gpu_to_cpu_(); +} + +} // namespace ModuleGint \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h new file mode 100644 index 0000000000..d38665dffa --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_metagga_nspin4_gpu.h @@ -0,0 +1,52 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "gint.h" +#include "gint_info.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" + +namespace ModuleGint +{ + +class Gint_vl_metagga_nspin4_gpu : public Gint +{ + public: + Gint_vl_metagga_nspin4_gpu( + std::vector vr_eff, + std::vector vofk, + HContainer>* hR) + : vr_eff_(vr_eff), vofk_(vofk), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} + + void cal_gint(); + + private: + void init_hr_gint_(); + + void transfer_cpu_to_gpu_(); + + void transfer_gpu_to_cpu_(); + + void cal_hr_gint_(); + + // input + std::vector vr_eff_; + std::vector vofk_; + // output + HContainer>* hR_; + + // Intermediate variables + const double dr3_; + + const int nspin_ = 4; + + std::vector> hr_gint_part_; + HContainer> hr_gint_full_; + + std::vector> vr_eff_d_; + std::vector> vofk_d_; + std::vector> hr_gint_part_d_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.cpp index db211570ca..27db0a7db3 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.cpp +++ 
b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.cpp @@ -10,10 +10,13 @@ namespace ModuleGint { void Gint_vl_nspin4::cal_gint() { + ModuleBase::TITLE("Gint", "cal_gint_vl"); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); init_hr_gint_(); cal_hr_gint_(); compose_hr_gint(hr_gint_part_, hr_gint_full_); - transfer_hr_gint_to_hR(toConstSharedPtr(hr_gint_full_), hR_); + transfer_hr_gint_to_hR(hr_gint_full_, *hR_); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); } void Gint_vl_nspin4::init_hr_gint_() @@ -49,7 +52,7 @@ void Gint_vl_nspin4::cal_hr_gint_() for(int is = 0; is < nspin_; is++) { phi_op.phi_mul_vldr3(vr_eff_[is], dr3_, phi.data(), phi_vldr3.data()); - phi_op.phi_mul_phi(phi.data(), phi_vldr3.data(), *hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); + phi_op.phi_mul_phi(phi.data(), phi_vldr3.data(), hr_gint_part_[is], PhiOperator::Triangular_Matrix::Upper); } } } diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h index 6338055db6..9436b5c397 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4.h @@ -15,9 +15,9 @@ class Gint_vl_nspin4 : public Gint Gint_vl_nspin4( std::vector vr_eff, HContainer>* hR) - : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()){}; + : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} - void cal_gint() override; + void cal_gint(); private: @@ -33,16 +33,13 @@ class Gint_vl_nspin4 : public Gint // output HContainer>* hR_; - //======================== // Intermediate variables - //======================== const double dr3_; const int nspin_ = 4; - std::vector>> hr_gint_part_; - std::shared_ptr>> hr_gint_full_; - + std::vector> hr_gint_part_; + HContainer> hr_gint_full_; }; } // namespace ModuleGint \ No newline at end of file diff --git 
a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp new file mode 100644 index 0000000000..c070258db5 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.cpp @@ -0,0 +1,91 @@ +#include "gint_vl_nspin4_gpu.h" +#include "gint_common.h" +#include "gint_helper.h" +#include "batch_biggrid.h" +#include "kernel/phi_operator_gpu.h" + +namespace ModuleGint +{ + +void Gint_vl_nspin4_gpu::cal_gint() +{ + ModuleBase::TITLE("Gint", "cal_gint_vl"); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); + init_hr_gint_(); + cal_hr_gint_(); + compose_hr_gint(hr_gint_part_, hr_gint_full_); + transfer_hr_gint_to_hR(hr_gint_full_, *hR_); + ModuleBase::timer::tick("Gint", "cal_gint_vl"); +} + +void Gint_vl_nspin4_gpu::init_hr_gint_() +{ + hr_gint_part_.resize(nspin_); + for(int i = 0; i < nspin_; i++) + { + hr_gint_part_[i] = gint_info_->get_hr(); + } + const int npol = 2; + hr_gint_full_ = gint_info_->get_hr>(npol); +} + +void Gint_vl_nspin4_gpu::transfer_cpu_to_gpu_() +{ + vr_eff_d_.resize(nspin_); + hr_gint_part_d_.resize(nspin_); + for(int i = 0; i < nspin_; i++) + { + hr_gint_part_d_[i] = CudaMemWrapper(hr_gint_part_[i].get_nnr(), 0, false); + vr_eff_d_[i] = CudaMemWrapper(gint_info_->get_local_mgrid_num(), 0, false); + checkCuda(cudaMemcpy(vr_eff_d_[i].get_device_ptr(), vr_eff_[i], + gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice)); + } +} + +void Gint_vl_nspin4_gpu::transfer_gpu_to_cpu_() +{ + for(int i = 0; i < nspin_; i++) + { + checkCuda(cudaMemcpy(hr_gint_part_[i].get_wrapper(), hr_gint_part_d_[i].get_device_ptr(), + hr_gint_part_[i].get_nnr() * sizeof(double), cudaMemcpyDeviceToHost)); + } +} + + +void Gint_vl_nspin4_gpu::cal_hr_gint_() +{ + transfer_cpu_to_gpu_(); +#pragma omp parallel num_threads(gint_info_->get_streams_num()) + { + // 20240620 Note that it must be set again here because + // cuda's device is not safe in 
a multi-threaded environment. + checkCuda(cudaSetDevice(gint_info_->get_dev_id())); + cudaStream_t stream; + checkCuda(cudaStreamCreate(&stream)); + PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream); + CudaMemWrapper phi(BatchBigGrid::get_max_phi_len(), stream, false); + CudaMemWrapper phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false); + #pragma omp for schedule(dynamic) + for(const auto& bgrid_batch: gint_info_->get_bgrid_batches()) + { + if(bgrid_batch->empty()) + { + continue; + } + phi_op.set_bgrid_batch(bgrid_batch); + phi_op.set_phi(phi.get_device_ptr()); + for(int is = 0; is < nspin_; is++) + { + phi_op.phi_mul_vldr3(vr_eff_d_[is].get_device_ptr(), dr3_, + phi.get_device_ptr(), phi_vldr3.get_device_ptr()); + phi_op.phi_mul_phi(phi.get_device_ptr(), phi_vldr3.get_device_ptr(), + hr_gint_part_[is], hr_gint_part_d_[is].get_device_ptr()); + } + } + checkCuda(cudaStreamSynchronize(stream)); + checkCuda(cudaStreamDestroy(stream)); + } + transfer_gpu_to_cpu_(); +} + +} // namespace ModuleGint \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h new file mode 100644 index 0000000000..a7decea9e8 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/gint_vl_nspin4_gpu.h @@ -0,0 +1,53 @@ +#pragma once + +#include +#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" +#include "gint.h" +#include "gint_info.h" +#include "module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h" + +namespace ModuleGint +{ + +class Gint_vl_nspin4_gpu : public Gint +{ + public: + Gint_vl_nspin4_gpu( + std::vector vr_eff, + HContainer>* hR) + : vr_eff_(vr_eff), hR_(hR), dr3_(gint_info_->get_mgrid_volume()) {} + + void cal_gint(); + + private: + + void init_hr_gint_(); + + void transfer_cpu_to_gpu_(); + + void transfer_gpu_to_cpu_(); + + // note that only the upper triangle matrix of hR is calculated + // 
that's why we need compose_hr_gint() to fill the lower triangle matrix. + void cal_hr_gint_(); + + // input + std::vector vr_eff_; + + // output + HContainer>* hR_; + + // Intermediate variables + const double dr3_; + + const int nspin_ = 4; + + std::vector> hr_gint_part_; + HContainer> hr_gint_full_; + + std::vector> vr_eff_d_; + std::vector> hr_gint_part_d_; +}; + +} // namespace ModuleGint \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h new file mode 100644 index 0000000000..9b7ad27e83 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/cuda_mem_wrapper.h @@ -0,0 +1,171 @@ +#pragma once +#include +#include "source_base/tool_quit.h" +#include "gint_helper.cuh" + +template +class CudaMemWrapper +{ + public: + + CudaMemWrapper() = default; + CudaMemWrapper(const CudaMemWrapper& other) = delete; + CudaMemWrapper& operator=(const CudaMemWrapper& other) = delete; + CudaMemWrapper(CudaMemWrapper&& other) noexcept + { + this->device_ptr_ = other.device_ptr_; + this->host_ptr_ = other.host_ptr_; + this->size_ = other.size_; + this->malloc_host_ = other.malloc_host_; + this->stream_ = other.stream_; + + other.device_ptr_ = nullptr; + other.host_ptr_ = nullptr; + other.size_ = 0; + other.malloc_host_ = false; + other.stream_ = 0; + } + + CudaMemWrapper& operator=(CudaMemWrapper&& other) noexcept + { + if (this != &other) + { + this->device_ptr_ = other.device_ptr_; + this->host_ptr_ = other.host_ptr_; + this->size_ = other.size_; + this->malloc_host_ = other.malloc_host_; + this->stream_ = other.stream_; + + other.device_ptr_ = nullptr; + other.host_ptr_ = nullptr; + other.size_ = 0; + other.malloc_host_ = false; + other.stream_ = 0; + } + return *this; + } + + CudaMemWrapper(size_t size, + cudaStream_t stream = 0, + bool malloc_host = true) + { + size_ = size; + malloc_host_ = malloc_host; + stream_ = stream; 
+ + if (malloc_host) + { + checkCuda(cudaMallocHost((void**)&host_ptr_, size_* sizeof(T))); + memset(host_ptr_, 0, size_ * sizeof(T)); + } + else + { host_ptr_ = nullptr; } + + checkCuda(cudaMalloc((void**)&device_ptr_, size_ * sizeof(T))); + checkCuda(cudaMemset(device_ptr_, 0, size_ * sizeof(T))); + } + + ~CudaMemWrapper() + { + free(); + } + + void copy_host_to_device_sync(size_t size) + { + if (host_ptr_ == nullptr) + { ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "Host pointer is null, cannot copy to device."); } + checkCuda(cudaMemcpy(device_ptr_, host_ptr_, size * sizeof(T), cudaMemcpyHostToDevice)); + } + + void copy_host_to_device_sync() + { + copy_host_to_device_sync(size_); + } + + void copy_host_to_device_async(size_t size) + { + if (host_ptr_ == nullptr) + { ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "Host pointer is null, cannot copy to device."); } + checkCuda(cudaMemcpyAsync(device_ptr_, host_ptr_, size * sizeof(T), cudaMemcpyHostToDevice, stream_)); + } + + void copy_host_to_device_async() + { + copy_host_to_device_async(size_); + } + + void copy_device_to_host_sync(size_t size) + { + if (host_ptr_ == nullptr) + { ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "Host pointer is null, cannot copy to host."); } + checkCuda(cudaMemcpy(host_ptr_, device_ptr_, size * sizeof(T), cudaMemcpyDeviceToHost)); + } + + void copy_device_to_host_sync() + { + copy_device_to_host_sync(size_); + } + + void copy_device_to_host_async(size_t size) + { + if (host_ptr_ == nullptr) + { ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "Host pointer is null, cannot copy to host."); } + checkCuda(cudaMemcpyAsync(host_ptr_, device_ptr_, size * sizeof(T), cudaMemcpyDeviceToHost, stream_)); + } + + void copy_device_to_host_async() + { + copy_device_to_host_async(size_); + } + + void memset_device_sync(const size_t size, const int value = 0) + { + checkCuda(cudaMemset(device_ptr_, value, size * sizeof(T))); + } + + void memset_device_sync(const int value = 0) + { + 
memset_device_sync(size_, value); + } + + void memset_device_async(const size_t size, const int value = 0) + { + checkCuda(cudaMemsetAsync(device_ptr_, value, size * sizeof(T), stream_)); + } + + void memset_device_async(const int value = 0) + { + memset_device_async(size_, value); + } + + void memset_host(const size_t size, const int value = 0) + { + if (host_ptr_ == nullptr) + { ModuleBase::WARNING_QUIT("cuda_mem_wrapper", "Host pointer is null, cannot memset host."); } + checkCuda(cudaMemset(host_ptr_, value, size * sizeof(T))); + } + + void memset_host(const int value = 0) + { + memset_host(size_, value); + } + + void free() + { + checkCuda(cudaFree(device_ptr_)); + checkCuda(cudaFreeHost(host_ptr_)); + } + + T* get_device_ptr() { return device_ptr_; } + T* get_host_ptr() { return host_ptr_; } + const T* get_device_ptr() const { return device_ptr_; } + const T* get_host_ptr() const { return host_ptr_; } + size_t get_size() const { return size_; } + + private: + T* device_ptr_ = nullptr; + T* host_ptr_ = nullptr; + size_t size_ = 0; + bool malloc_host_ = false; + cudaStream_t stream_ = 0; +}; \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu new file mode 100644 index 0000000000..b35e0669b6 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.cu @@ -0,0 +1,38 @@ +#include "gemm_tn_vbatch.cuh" +#include "gemm_nn_vbatch.cuh" +#include "dgemm_vbatch.h" + +void dgemm_nn_vbatch( + int max_m, int max_n, int max_k, + const int* m_d, const int* n_d, const int* k_d, + const double* const* A_array_d, const int* lda_d, + const double* const* B_array_d, const int* ldb_d, + double** C_array_d, const int* ldc_d, + int batchCount, cudaStream_t stream, + const double* alpha) +{ + vbatched_gemm_nn_impl + (max_m, max_n, m_d, n_d, k_d, + A_array_d, lda_d, + B_array_d, ldb_d, + C_array_d, ldc_d, + batchCount, 
stream, alpha); + +} + +void dgemm_tn_vbatch( + int max_m, int max_n, int max_k, + const int* m_d, const int* n_d, const int* k_d, + const double* const* A_array_d, const int* lda_d, + const double* const* B_array_d, const int* ldb_d, + double** C_array_d, const int* ldc_d, + int batchCount, cudaStream_t stream, + const double* alpha) +{ + vbatched_gemm_tn_impl + (max_m, max_n, m_d, n_d, k_d, + A_array_d, lda_d, + B_array_d, ldb_d, + C_array_d, ldc_d, + batchCount, stream, alpha); +} diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.h new file mode 100644 index 0000000000..8589bcf62e --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/dgemm_vbatch.h @@ -0,0 +1,23 @@ +#pragma once + +#include + +// C(batch_id) = alpha * A(batch_id) * B(batch_id) + C(batch_id) +void dgemm_nn_vbatch( + int max_m, int max_n, int max_k, + const int* m_d, const int* n_d, const int* k_d, + const double* const* A_array_d, const int* lda_d, + const double* const* B_array_d, const int* ldb_d, + double** C_array_d, const int* ldc_d, + int batchCount, cudaStream_t stream, + const double* alpha = nullptr); + +// C(batch_id) = alpha * A(batch_id)^T * B(batch_id) + C(batch_id) +void dgemm_tn_vbatch( + int max_m, int max_n, int max_k, + const int* m_d, const int* n_d, const int* k_d, + const double* const* A_array_d, const int* lda_d, + const double* const* B_array_d, const int* ldb_d, + double** C_array_d, const int* ldc_d, + int batchCount, cudaStream_t stream, + const double* alpha = nullptr); \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_nn_vbatch.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_nn_vbatch.cuh new file mode 100644 index 0000000000..5ad934e305 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_nn_vbatch.cuh @@ -0,0 +1,427 @@ +#ifndef GEMM_NN_VBATCH_CUH 
+#define GEMM_NN_VBATCH_CUH +#include // for assert +#include +#include // for CUDA_VERSION +#include +#include // for fprintf and stderr + +#include "gint_helper.cuh" +#include + + +#define sA(i, j) sA[(j)*slda + (i)] +#define sB(i, j) sB[(j)*sldb + (i)] +#define fetch(A, m, n, bound) offs_d##A[min(n * LD##A + m, bound)] + +template +static __device__ void vbatched_gemm_nn_device(int M, + int N, + int K, + const T* __restrict__ A, + int LDA, + const T* __restrict__ B, + int LDB, + T* __restrict__ C, + int LDC, + T* sA, + int slda, + T* sB, + int sldb, + T alpha) +{ + int idx = threadIdx.x; // thread's m dimension + int idy = threadIdx.y; // thread's n dimension + + int idt = DIM_X * idy + idx; // thread's global number + + int idxA = idt % DIM_XA; // idx within A + int idyA = idt / DIM_XA; // idy within A + + int idxB = idt % DIM_XB; // idx within B + int idyB = idt / DIM_XB; // idy within B + + int blx = blockIdx.x; // block's m dimension + int bly = blockIdx.y; // block's n dimension + + // Registers for the innermost loop + T rC[THR_N][THR_M]; + T rA[THR_M]; + T rB[THR_N]; + + // Registers for the dev->shmem copy + T ra[BLK_K / DIM_YA][BLK_M / DIM_XA]; + T rb[BLK_N / DIM_YB][BLK_K / DIM_XB]; + + // bound is the correction to offs_d in order to not get out of memory bound + // so bound could be negative value since offs_d could be out of bound + const T* offs_dA = A + blx * BLK_M + idyA * LDA + idxA; + int boundA + = (LDA * (K - 1) + M) - (blx * BLK_M + idyA * LDA + idxA) - 1; + + const T* offs_dB = B + bly * BLK_N * LDB + idyB * LDB + idxB; + int boundB + = (LDB * (N - 1) + K) - (bly * BLK_N * LDB + idyB * LDB + idxB) - 1; + + int m, n, k, kk; + +// Zero C +#pragma unroll + for (n = 0; n < THR_N; n++) + { +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rC[n][m] = 0.0; + } + } + +// Load A dev->shmem +#pragma unroll + for (n = 0; n < BLK_K; n += DIM_YA) + { +#pragma unroll + for (m = 0; m < BLK_M; m += DIM_XA) + { + sA(m + idxA, n + idyA) = fetch(A, m, n, 
boundA); + } + } + +#pragma unroll + for (n = 0; n < BLK_N; n += DIM_YB) + { +#pragma unroll + for (m = 0; m < BLK_K; m += DIM_XB) + { + sB(m + idxB, n + idyB) = fetch(B, m, n, boundB); + } + } + + __syncthreads(); + + for (kk = 0; kk < K - BLK_K; kk += BLK_K) + { + offs_dA += BLK_K * LDA; + boundA -= BLK_K * LDA; + + offs_dB += BLK_K; + boundB -= BLK_K; + +// Load A dev->regs +#pragma unroll + for (n = 0; n < BLK_K / DIM_YA; n++) + { +#pragma unroll + for (m = 0; m < BLK_M / DIM_XA; m++) + { + ra[n][m] = fetch(A, m * DIM_XA, n * DIM_YA, boundA); + } + } + +// Load B dev->regs +#pragma unroll + for (n = 0; n < BLK_N / DIM_YB; n++) + { +#pragma unroll + for (m = 0; m < BLK_K / DIM_XB; m++) + { + rb[n][m] = fetch(B, m * DIM_XB, n * DIM_YB, boundB); + } + } + +// Multiply +#pragma unroll + for (k = 0; k < BLK_K; k++) + { +// Load A shmem->regs +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rA[m] = sA(m * DIM_X + idx, k); + } + +// Load B shmem->regs +#pragma unroll + for (n = 0; n < THR_N; n++) + { + rB[n] = sB(k, n * DIM_Y + idy); + } + +// Compute +#pragma unroll + for (n = 0; n < THR_N; n++) + { +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rC[n][m] += rA[m] * rB[n]; + } + } + } + + __syncthreads(); + +// Load A regs->shmem +#pragma unroll + for (n = 0; n < BLK_K / DIM_YA; n++) + { +#pragma unroll + for (m = 0; m < BLK_M / DIM_XA; m++) + { + sA(m * DIM_XA + idxA, n * DIM_YA + idyA) = ra[n][m]; + } + } + +// Load B regs->shmem +#pragma unroll + for (n = 0; n < BLK_N / DIM_YB; n++) + { +#pragma unroll + for (m = 0; m < BLK_K / DIM_XB; m++) + { + sB(m * DIM_XB + idxB, n * DIM_YB + idyB) = rb[n][m]; + } + } + __syncthreads(); + } + + // Multiply last full (BLK_K) or partial block of + // columns of op(A) and rows of op(B). + // It's okay that m,n exceed matrix bounds as all work is in registers + // or shared memory, and out-of-bounds rC[n][m] will not be saved later. 
+ kk = K - kk; +#pragma unroll + for (k = 0; k < kk; k++) + { +// Load A shmem->regs +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rA[m] = sA(m * DIM_X + idx, k); + } + +// Load B shmem->regs +#pragma unroll + for (n = 0; n < THR_N; n++) + { + rB[n] = sB(k, n * DIM_Y + idy); + } + +// Compute +#pragma unroll + for (n = 0; n < THR_N; n++) + { +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rC[n][m] += rA[m] * rB[n]; + } + } + } + +// Store C regs->dev +#pragma unroll + for (n = 0; n < THR_N; n++) + { + int coord_dCn = bly * BLK_N + n * DIM_Y + idy; +#pragma unroll + for (m = 0; m < THR_M; m++) + { + int coord_dCm = blx * BLK_M + m * DIM_X + idx; + if (coord_dCm < M && coord_dCn < N) + { + int offsC = coord_dCn * LDC + coord_dCm; + + atomicAdd(C + offsC, rC[n][m] * alpha); + } + } + } +} + +/******************************************************************************/ +template +static __global__ void vbatched_gemm_nn_kernel(const int* M, + const int* N, + const int* K, + const T* const* global_A_array, + const int* global_lda, + const T* const* global_B_array, + const int* global_ldb, + T** global_C_array, + const int* global_ldc, + const T* alpha) +{ + extern __shared__ __align__(sizeof(T)) unsigned char smem[]; + T* shared_mem = reinterpret_cast(smem); + + int batchid = blockIdx.z; + int local_M = (int)M[batchid]; + int local_N = (int)N[batchid]; + int local_K = (int)K[batchid]; + + if (blockIdx.x >= (local_M + BLK_M - 1) / BLK_M) + return; + if (blockIdx.y >= (local_N + BLK_N - 1) / BLK_N) + return; + + int shared_lda = BLK_M + 1; + int shared_ldb = BLK_K + 1; + T* shared_A = (T*)shared_mem; + T* shared_B = shared_A + shared_lda * BLK_K; + double alpha_tmp = 1.0; + if (alpha != nullptr) + { + alpha_tmp = alpha[batchid]; + } + vbatched_gemm_nn_device(local_M, + local_N, + local_K, + global_A_array[batchid], + (int)global_lda[batchid], + global_B_array[batchid], + (int)global_ldb[batchid], + global_C_array[batchid], + (int)global_ldc[batchid], + 
shared_A, + shared_lda, + shared_B, + shared_ldb, + alpha_tmp); +} + +/** + * Performs a batched matrix multiplication using the vbatched_gemm_impl + * function. + * + * C = alpha * A * B + C + * @tparam T The data type of the matrices. + * @tparam DIM_X The number of threads in the x-dimension of each block. + * @tparam DIM_Y The number of threads in the y-dimension of each block. + * @tparam BLK_M The number of rows processed by each thread block. + * @tparam BLK_N The number of columns processed by each thread block. + * @tparam BLK_K The number of elements processed by each thread block along the + * K dimension. + * @tparam DIM_XA The number of threads in the x-dimension used for loading + * matrix A. + * @tparam DIM_YA The number of threads in the y-dimension used for loading + * matrix A. + * @tparam DIM_XB The number of threads in the x-dimension used for loading + * matrix B. + * @tparam DIM_YB The number of threads in the y-dimension used for loading + * matrix B. + * @param max_m The maximum number of rows in the matrices. + * @param max_n The maximum number of columns in the matrices. + * @param m An array of batch sizes for the number of rows in each matrix. + * @param n An array of batch sizes for the number of columns in each matrix. + * @param k An array of batch sizes for the number of elements in each matrix + * along the K dimension. + * @param global_A_array An array of pointers to the input matrices A. + * @param global_lda An array of leading dimensions for the input matrices A. + * @param global_B_array An array of pointers to the input matrices B. + * @param global_ldb An array of leading dimensions for the input matrices B. + * @param global_C_array An array of pointers to the output matrices C. + * @param global_ldc An array of leading dimensions for the output matrices C. + * @param batchCount The number of matrices in the batch. + * @param stream The CUDA stream to use for the computation. 
+ * @param alpha The scalar value to multiply the matrices by (optional, default + * is nullptr). generate by copilot + */ +template +void vbatched_gemm_nn_impl(int max_m, + int max_n, + const int* m, + const int* n, + const int* k, + const T* const* global_A_array, + const int* global_lda, + const T* const* global_B_array, + const int* global_ldb, + T** global_C_array, + const int* global_ldc, + int batchCount, + cudaStream_t stream, + const T* alpha = nullptr) +{ + // The positions of A and B have been swapped here. + // This is because vbatch_gemm_nn_kernel is column major, + // but vatched_gemm_nn_impl is designed to be row major, + + size_t shared_mem_size = 0; + shared_mem_size += (BLK_M + 1) * BLK_K * sizeof(T); + shared_mem_size += (BLK_K + 1) * BLK_N * sizeof(T); + dim3 dimBlock(DIM_X, DIM_Y); + const int max_batch_count = 32768; + + for (int i = 0; i < batchCount; i += max_batch_count) + { + const int ibatch = min(max_batch_count, batchCount - i); + dim3 dimGrid(ceil_div(max_n, BLK_M), + ceil_div(max_m, BLK_N), + ibatch); + const T* alpha_tmp = nullptr; + if (alpha != nullptr) + { + alpha_tmp = alpha + i; + } + + vbatched_gemm_nn_kernel + <<>>( + n + i, m + i, k + i, + global_B_array + i, global_ldb + i, + global_A_array + i, global_lda + i, + global_C_array + i, global_ldc + i, + alpha_tmp); + checkCudaLastError(); + } +} + +#endif // GEMM_VBATCH_CUH \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_tn_vbatch.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_tn_vbatch.cuh new file mode 100644 index 0000000000..701e93e81f --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gemm_tn_vbatch.cuh @@ -0,0 +1,452 @@ +#ifndef GEMM_TN_VBATCH_CUH +#define GEMM_TN_VBATCH_CUH +#include // for assert +#include +#include // for CUDA_VERSION +#include +#include // for fprintf and stderr + +#include "gint_helper.cuh" +#include + + +#define sA(i, j) sA[(j)*slda + (i)] +#define sB(i, 
j) sB[(j)*sldb + (i)] +#define fetch(A, m, n, bound) offs_d##A[min(n * LD##A + m, bound)] + +template +static __device__ void vbatched_gemm_nt_device(int M, + int N, + int K, + const T* __restrict__ A, + int LDA, + const T* __restrict__ B, + int LDB, + T* __restrict__ C, + int LDC, + T* sA, + int slda, + T* sB, + int sldb, + T alpha) +{ + int idx = threadIdx.x; // thread's m dimension + int idy = threadIdx.y; // thread's n dimension + + int idt = DIM_X * idy + idx; // thread's global number + + int idxA = idt % DIM_XA; // idx within A + int idyA = idt / DIM_XA; // idy within A + + int idxB = idt % DIM_XB; // idx within B + int idyB = idt / DIM_XB; // idy within B + + int blx = blockIdx.x; // block's m dimension + int bly = blockIdx.y; // block's n dimension + + // Registers for the innermost loop + T rC[THR_N][THR_M]; + T rA[THR_M]; + T rB[THR_N]; + + // Registers for the dev->shmem copy + T ra[BLK_K / DIM_YA][BLK_M / DIM_XA]; + T rb[BLK_K / DIM_YB][BLK_N / DIM_XB]; + + // bound is the correction to offs_d in order to not get out of memory bound + // so bound could be negative value since offs_d could be out of bound + const T* offs_dA = A + blx * BLK_M + idyA * LDA + idxA; + int boundA + = (LDA * (K - 1) + M) - (blx * BLK_M + idyA * LDA + idxA) - 1; + + const T* offs_dB = B + bly * BLK_N + idyB * LDB + idxB; + int boundB + = (LDB * (K - 1) + N) - (bly * BLK_N + idyB * LDB + idxB) - 1; + + int m, n, k, kk; + +// Zero C +#pragma unroll + for (n = 0; n < THR_N; n++) + { +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rC[n][m] = 0.0; + } + } + +// Load A dev->shmem +#pragma unroll + for (n = 0; n < BLK_K; n += DIM_YA) + { +#pragma unroll + for (m = 0; m < BLK_M; m += DIM_XA) + { + sA(m + idxA, n + idyA) = fetch(A, m, n, boundA); + } + } + +#pragma unroll + for (n = 0; n < BLK_K; n += DIM_YB) + { +#pragma unroll + for (m = 0; m < BLK_N; m += DIM_XB) + { + sB(n + idyB, m + idxB) = fetch(B, m, n, boundB); + } + } + + __syncthreads(); + + for (kk = 0; kk < K - BLK_K; 
kk += BLK_K) + { + offs_dA += BLK_K * LDA; + boundA -= BLK_K * LDA; + + offs_dB += BLK_K * LDB; + boundB -= BLK_K * LDB; + +// Load A dev->regs +#pragma unroll + for (n = 0; n < BLK_K / DIM_YA; n++) + { +#pragma unroll + for (m = 0; m < BLK_M / DIM_XA; m++) + { + ra[n][m] = fetch(A, m * DIM_XA, n * DIM_YA, boundA); + } + } + +// Load B dev->regs +#pragma unroll + for (n = 0; n < BLK_K / DIM_YB; n++) + { +#pragma unroll + for (m = 0; m < BLK_N / DIM_XB; m++) + { + rb[n][m] = fetch(B, m * DIM_XB, n * DIM_YB, boundB); + } + } + +// Multiply +#pragma unroll + for (k = 0; k < BLK_K; k++) + { +// Load A shmem->regs +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rA[m] = sA(m * DIM_X + idx, k); + } + +// Load B shmem->regs +#pragma unroll + for (n = 0; n < THR_N; n++) + { + rB[n] = sB(k, n * DIM_Y + idy); + } + +// Compute +#pragma unroll + for (n = 0; n < THR_N; n++) + { +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rC[n][m] += rA[m] * rB[n]; + } + } + } + + __syncthreads(); + +// Load A regs->shmem +#pragma unroll + for (n = 0; n < BLK_K / DIM_YA; n++) + { +#pragma unroll + for (m = 0; m < BLK_M / DIM_XA; m++) + { + sA(m * DIM_XA + idxA, n * DIM_YA + idyA) = ra[n][m]; + } + } + +// Load B regs->shmem +#pragma unroll + for (n = 0; n < BLK_K / DIM_YB; n++) + { +#pragma unroll + for (m = 0; m < BLK_N / DIM_XB; m++) + { + sB(n * DIM_YB + idyB, m * DIM_XB + idxB) = rb[n][m]; + } + } + __syncthreads(); + } + + // Multiply last full (BLK_K) or partial block of + // columns of op(A) and rows of op(B). + // It's okay that m,n exceed matrix bounds as all work is in registers + // or shared memory, and out-of-bounds rC[n][m] will not be saved later. 
+ kk = K - kk; +#pragma unroll + for (k = 0; k < kk; k++) + { +// Load A shmem->regs +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rA[m] = sA(m * DIM_X + idx, k); + } + +// Load B shmem->regs +#pragma unroll + for (n = 0; n < THR_N; n++) + { + rB[n] = sB(k, n * DIM_Y + idy); + } + +// Compute +#pragma unroll + for (n = 0; n < THR_N; n++) + { +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rC[n][m] += rA[m] * rB[n]; + } + } + } + +// Store C regs->dev +#pragma unroll + for (n = 0; n < THR_N; n++) + { + int coord_dCn = bly * BLK_N + n * DIM_Y + idy; +#pragma unroll + for (m = 0; m < THR_M; m++) + { + int coord_dCm = blx * BLK_M + m * DIM_X + idx; + if (coord_dCm < M && coord_dCn < N) + { + int offsC = coord_dCn * LDC + coord_dCm; + + atomicAdd(C + offsC, rC[n][m] * alpha); + } + } + } +} + +/******************************************************************************/ +template +static __global__ void vbatched_gemm_nt_kernel(const int* M, + const int* N, + const int* K, + const T* const* global_A_array, + const int* global_lda, + const T* const* global_B_array, + const int* global_ldb, + T** global_C_array, + const int* global_ldc, + const T* alpha) +{ + extern __shared__ __align__(sizeof(T)) unsigned char smem[]; + T* shared_mem = reinterpret_cast(smem); + + int batchid = blockIdx.z; + int local_M = (int)M[batchid]; + int local_N = (int)N[batchid]; + int local_K = (int)K[batchid]; + + if (blockIdx.x >= (local_M + BLK_M - 1) / BLK_M) + return; + if (blockIdx.y >= (local_N + BLK_N - 1) / BLK_N) + return; + + int shared_lda = BLK_M + 1; + int shared_ldb = BLK_K + 1; + T* shared_A = (T*)shared_mem; + T* shared_B = shared_A + shared_lda * BLK_K; + double alpha_tmp = 1.0; + if (alpha != nullptr) + { + alpha_tmp = alpha[batchid]; + } + vbatched_gemm_nt_device(local_M, + local_N, + local_K, + global_A_array[batchid], + (int)global_lda[batchid], + global_B_array[batchid], + (int)global_ldb[batchid], + global_C_array[batchid], + (int)global_ldc[batchid], + 
shared_A, + shared_lda, + shared_B, + shared_ldb, + alpha_tmp); +} + +/** + * Performs a batched matrix multiplication using the vbatched_gemm_impl + * function. + * + * C = alpha * trans(A) * B + C + * @tparam T The data type of the matrices. + * @tparam DIM_X The number of threads in the x-dimension of each block. + * @tparam DIM_Y The number of threads in the y-dimension of each block. + * @tparam BLK_M The number of rows processed by each thread block. + * @tparam BLK_N The number of columns processed by each thread block. + * @tparam BLK_K The number of elements processed by each thread block along the + * K dimension. + * @tparam DIM_XA The number of threads in the x-dimension used for loading + * matrix A. + * @tparam DIM_YA The number of threads in the y-dimension used for loading + * matrix A. + * @tparam DIM_XB The number of threads in the x-dimension used for loading + * matrix B. + * @tparam DIM_YB The number of threads in the y-dimension used for loading + * matrix B. + * @param max_m The maximum number of rows in the matrices. + * @param max_n The maximum number of columns in the matrices. + * @param m An array of batch sizes for the number of rows in each matrix. + * @param n An array of batch sizes for the number of columns in each matrix. + * @param k An array of batch sizes for the number of elements in each matrix + * along the K dimension. + * @param global_A_array An array of pointers to the input matrices A. + * @param global_lda An array of leading dimensions for the input matrices A. + * @param global_B_array An array of pointers to the input matrices B. + * @param global_ldb An array of leading dimensions for the input matrices B. + * @param global_C_array An array of pointers to the output matrices C. + * @param global_ldc An array of leading dimensions for the output matrices C. + * @param batchCount The number of matrices in the batch. + * @param stream The CUDA stream to use for the computation. 
+ * @param alpha The scalar value to multiply the matrices by (optional, default + * is nullptr). generate by copilot + */ + +/* + * Why do we need to implement our own matrix multiplication based on the magma + * code? There are two main reasons. First is when we are doing batch matrix + * multiplication, since we need to accumulate the results of the + * multiplications, it is necessary to pass the same memory address of matrix C + * to different multiplications. This way, the accumulation can be done directly + * through atomic operations during the matrix multiplication, avoiding the + * reduction operations after the multiplication. Secondly, when calculating the + * charge density, where C = alpha * A * B + C, the value of alpha might be + * different for the same batch of matrices. Using the standard matrix + * multiplication interface would require breaking down the batch matrix + * multiplication into smaller batches. In practice, it is difficult to + * accumulate a batch. + * + * Moreover, taking into account the specific requirements of our application, + * especially the fact that we can relatively easily control the arrangement of + * the matrix elements, we have only implemented one type of requirement for + * matrix transposition. That is, we have implemented the operation C = alpha * + * A * trans(B) + C under the constraint of column-major order. + * + * Finally, we would like to thank Magma for its contributions to the field of + * scientific computing. + */ + +template +void vbatched_gemm_tn_impl(int max_m, + int max_n, + const int* m, + const int* n, + const int* k, + const T* const* global_A_array, + const int* global_lda, + const T* const* global_B_array, + const int* global_ldb, + T** global_C_array, + const int* global_ldc, + int batchCount, + cudaStream_t stream, + const T* alpha = nullptr) +{ + // The positions of A and B have been swapped here. 
+ // This is because vbatched_gemm_nt_kernel is column major, + but vbatched_gemm_tn_impl is designed to be row major, + + size_t shared_mem_size = 0; + shared_mem_size += (BLK_M + 1) * BLK_K * sizeof(T); + shared_mem_size += (BLK_K + 1) * BLK_N * sizeof(T); + dim3 dimBlock(DIM_X, DIM_Y); + const int max_batch_count = 32768; + + for (int i = 0; i < batchCount; i += max_batch_count) + { + const int ibatch = min(max_batch_count, batchCount - i); + dim3 dimGrid(ceil_div(max_n, BLK_M), + ceil_div(max_m, BLK_N), + ibatch); + const T* alpha_tmp = nullptr; + if (alpha != nullptr) + { + alpha_tmp = alpha + i; + } + + vbatched_gemm_nt_kernel + <<>>( + n + i, m + i, k + i, + global_B_array + i, global_ldb + i, + global_A_array + i, global_lda + i, + global_C_array + i, global_ldc + i, + alpha_tmp); + checkCudaLastError(); + } +} + +#endif // GEMM_TN_VBATCH_CUH \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.cpp new file mode 100644 index 0000000000..f4443762f0 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.cpp @@ -0,0 +1,126 @@ +#include "gint_gpu_vars.h" +#include "source_base/module_device/device.h" + +namespace ModuleGint +{ + +GintGpuVars::GintGpuVars(std::shared_ptr biggrid_info, + const UnitCell& ucell, + const Numerical_Orbital* Phi) +{ +// set device +#ifdef __MPI + dev_id_ = base_device::information::set_device_by_rank(); +#endif + std::vector ylmcoef_h(100); + for (int i = 0; i < 100; i++) + { + ylmcoef_h[i] = ModuleBase::Ylm::ylmcoef[i]; + } + set_ylmcoe_d(ylmcoef_h.data(), &ylmcoef_d); + + const int ntype = ucell.ntype; + std::vector atom_nw_h(ntype); + std::vector ucell_atom_nwl_h(ntype); + for (int i = 0; i < ntype; i++) + { + atom_nw_h[i] = ucell.atoms[i].nw; + ucell_atom_nwl_h[i] = ucell.atoms[i].nwl; + } + checkCuda(cudaMalloc((void**)&atom_nw_d, sizeof(int) * ntype)); + 
checkCuda(cudaMemcpy(atom_nw_d, atom_nw_h.data(), sizeof(int) * ntype, cudaMemcpyHostToDevice)); + checkCuda(cudaMalloc((void**)&ucell_atom_nwl_d, sizeof(int) * ntype)); + checkCuda(cudaMemcpy(ucell_atom_nwl_d, ucell_atom_nwl_h.data(), sizeof(int) * ntype, cudaMemcpyHostToDevice)); + + dr_uniform = Phi[0].PhiLN(0, 0).dr_uniform; + double max_rcut = 0; + std::vector rcut_h(ntype); + for (int i = 0; i < ntype; i++) + { + rcut_h[i] = Phi[i].getRcut(); + if (rcut_h[i] > max_rcut) + { + max_rcut = rcut_h[i]; + } + } + checkCuda(cudaMalloc((void**)&rcut_d, sizeof(double) * ntype)); + checkCuda(cudaMemcpy(rcut_d, rcut_h.data(), sizeof(double) * ntype, cudaMemcpyHostToDevice)); + nr_max = static_cast(1 / dr_uniform * max_rcut) + 10; + + nwmax = ucell.nwmax; + std::vector psi_u_h(ntype * nwmax * nr_max); + std::vector dpsi_u_h(ntype * nwmax * nr_max); + std::vector d2psi_u_h(ntype * nwmax * nr_max); + // std::vector cannot use data(), so std::vector is used instead + std::vector atom_iw2_new_h(ntype * nwmax); + std::vector atom_iw2_ylm_h(ntype * nwmax); + std::vector atom_iw2_l_h(ntype * nwmax); + for (int i = 0; i < ntype; i++) + { + Atom* atomx = &ucell.atoms[i]; + for (int j = 0; j < atomx->nw; j++) + { + atom_iw2_new_h[i * nwmax + j] = atomx->iw2_new[j]; + atom_iw2_ylm_h[i * nwmax + j] = atomx->iw2_ylm[j]; + atom_iw2_l_h[i * nwmax + j] = atomx->iw2l[j]; + const auto psi_ptr = &Phi[i].PhiLN(atomx->iw2l[j], atomx->iw2n[j]); + const int psi_size = psi_ptr->psi_uniform.size(); + int idx = i * nwmax * nr_max + j * nr_max; + for (int k = 0; k < psi_size; k++) + { + psi_u_h[idx + k] = psi_ptr->psi_uniform[k]; + dpsi_u_h[idx + k] = psi_ptr->dpsi_uniform[k]; + d2psi_u_h[idx + k] = psi_ptr->ddpsi_uniform[k]; + } + } + } + + checkCuda(cudaMalloc((void**)&atom_iw2_new_d, sizeof(bool) * ntype * nwmax)); + checkCuda(cudaMemcpy(atom_iw2_new_d, atom_iw2_new_h.data(), sizeof(bool) * ntype * nwmax, cudaMemcpyHostToDevice)); + checkCuda(cudaMalloc((void**)&atom_iw2_ylm_d, sizeof(int) * 
ntype * nwmax)); + checkCuda(cudaMemcpy(atom_iw2_ylm_d, atom_iw2_ylm_h.data(), sizeof(int) * ntype * nwmax, cudaMemcpyHostToDevice)); + checkCuda(cudaMalloc((void**)&atom_iw2_l_d, sizeof(int) * ntype * nwmax)); + checkCuda(cudaMemcpy(atom_iw2_l_d, atom_iw2_l_h.data(), sizeof(int) * ntype * nwmax, cudaMemcpyHostToDevice)); + checkCuda(cudaMalloc((void**)&psi_u_d, sizeof(double) * ntype * nwmax * nr_max)); + checkCuda(cudaMemcpy(psi_u_d, psi_u_h.data(), sizeof(double) * ntype * nwmax * nr_max, cudaMemcpyHostToDevice)); + checkCuda(cudaMalloc((void**)&dpsi_u_d, sizeof(double) * ntype * nwmax * nr_max)); + checkCuda(cudaMemcpy(dpsi_u_d, dpsi_u_h.data(), sizeof(double) * ntype * nwmax * nr_max, cudaMemcpyHostToDevice)); + checkCuda(cudaMalloc((void**)&d2psi_u_d, sizeof(double) * ntype * nwmax * nr_max)); + checkCuda(cudaMemcpy(d2psi_u_d, d2psi_u_h.data(), sizeof(double) * ntype * nwmax * nr_max, cudaMemcpyHostToDevice)); + + const int mgrid_num = biggrid_info->get_mgrids_num(); + std::vector mgrids_pos_h(mgrid_num); + for(int i = 0; i < mgrid_num; i++) + { + mgrids_pos_h[i].x = biggrid_info->get_mgrid_coord(i).x; + mgrids_pos_h[i].y = biggrid_info->get_mgrid_coord(i).y; + mgrids_pos_h[i].z = biggrid_info->get_mgrid_coord(i).z; + } + checkCuda(cudaMalloc((void**)&mgrids_pos_d, sizeof(double3) * mgrid_num)); + checkCuda(cudaMemcpy(mgrids_pos_d, mgrids_pos_h.data(), sizeof(double3) * mgrid_num, cudaMemcpyHostToDevice)); + + checkCuda(cudaMalloc((void**)&iat2it_d, sizeof(int) * ucell.nat)); + checkCuda(cudaMemcpy(iat2it_d, ucell.iat2it, sizeof(int) * ucell.nat, cudaMemcpyHostToDevice)); + + gemm_algo_selector(mgrid_num, fastest_matrix_mul, ucell); +} + +GintGpuVars::~GintGpuVars() +{ +#ifdef __MPI + checkCuda(cudaSetDevice(dev_id_)); +#endif + checkCuda(cudaFree(rcut_d)); + checkCuda(cudaFree(atom_nw_d)); + checkCuda(cudaFree(ucell_atom_nwl_d)); + checkCuda(cudaFree(atom_iw2_new_d)); + checkCuda(cudaFree(atom_iw2_ylm_d)); + checkCuda(cudaFree(atom_iw2_l_d)); + 
checkCuda(cudaFree(psi_u_d)); + checkCuda(cudaFree(dpsi_u_d)); + checkCuda(cudaFree(d2psi_u_d)); + checkCuda(cudaFree(mgrids_pos_d)); + checkCuda(cudaFree(iat2it_d)); +} + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h new file mode 100644 index 0000000000..7d2515b3b0 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_gpu_vars.h @@ -0,0 +1,46 @@ +#pragma once + +#include +#include "set_const_mem.cuh" +#include "source_base/ylm.h" +#include "source_cell/unitcell.h" +#include "source_cell/atom_spec.h" +#include "module_hamilt_lcao/module_gint/temp_gint/biggrid_info.h" +#include "gint_helper.cuh" +#include "module_hamilt_lcao/module_gint/kernels/cuda/gemm_selector.cuh" + +namespace ModuleGint +{ + +class GintGpuVars +{ + public: + GintGpuVars(std::shared_ptr bgrid_info, + const UnitCell& ucell, + const Numerical_Orbital* Phi); + ~GintGpuVars(); + + int nwmax; + double dr_uniform; + double nr_max; + // ylmcoef_d is __constant__ memory, no need to cudaFree + double* ylmcoef_d = nullptr; + double* rcut_d = nullptr; + int* atom_nw_d = nullptr; + int* ucell_atom_nwl_d = nullptr; + bool* atom_iw2_new_d = nullptr; + int* atom_iw2_ylm_d = nullptr; + int* atom_iw2_l_d = nullptr; + double* psi_u_d = nullptr; + double* dpsi_u_d = nullptr; + double* d2psi_u_d = nullptr; + double3* mgrids_pos_d = nullptr; + int* iat2it_d = nullptr; + + // the index of gpu device + int dev_id_ = 0; + matrix_multiple_func_type fastest_matrix_mul; + +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_helper.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_helper.cuh new file mode 100644 index 0000000000..7a6e925531 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/gint_helper.cuh @@ -0,0 +1,75 @@ +#pragma once +#include + +// if 
exponent is an integer between 0 and 5 (the most common cases in gint) and +// exp is a variable that cannot be determined at compile time (which means the compiler cannot optimize the code), +// pow_int is much faster than std::pow +template +__forceinline__ __device__ T pow_int(const T base, const int exp) +{ + switch (exp) + { + case 0: + return 1.0; + case 1: + return base; + case 2: + return base * base; + case 3: + return base * base * base; + case 4: + return base * base * base * base; + case 5: + return base * base * base * base * base; + default: + double result = std::pow(base, exp); + return result; + } +} + +template +__forceinline__ __device__ T warpReduceSum(T val) +{ + val += __shfl_xor_sync(0xffffffff, val, 16, 32); + val += __shfl_xor_sync(0xffffffff, val, 8, 32); + val += __shfl_xor_sync(0xffffffff, val, 4, 32); + val += __shfl_xor_sync(0xffffffff, val, 2, 32); + val += __shfl_xor_sync(0xffffffff, val, 1, 32); + return val; +} + +inline int ceil_div(const int a, const int b) +{ + return a / b + (a % b != 0 && (a ^ b) > 0); +} + +inline void check(cudaError_t result, char const *const func, const char *const file, + int const line) { + if (result) { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, + static_cast(result), cudaGetErrorString(result), func); + exit(EXIT_FAILURE); + } +} + +inline void __getLastCudaError(const char *file, + const int line) +{ + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) { + fprintf(stderr, + "%s(%i) : getLastCudaError() CUDA error :" + " (%d) %s.\n", + file, line, static_cast(err), + cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +// This will output the proper CUDA error strings in the event +// that a CUDA host call returns an error +#define checkCuda(val) check((val), #val, __FILE__, __LINE__) + +// This will output the proper error string when calling cudaGetLastError +#define checkCudaLastError() __getLastCudaError(__FILE__, __LINE__) \ No newline at 
end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu new file mode 100644 index 0000000000..edc07959d4 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.cu @@ -0,0 +1,466 @@ +#include "phi_operator_gpu.h" +#include "phi_operator_kernel.cuh" +#include "dgemm_vbatch.h" +#include + +namespace ModuleGint +{ +PhiOperatorGpu::PhiOperatorGpu(std::shared_ptr gint_gpu_vars, cudaStream_t stream) +:gint_gpu_vars_(gint_gpu_vars), stream_(stream), +mgrids_num_(BatchBigGrid::get_bgrid_info()->get_mgrids_num()), +atoms_num_info_(BatchBigGrid::get_max_batch_size(), stream_, true), +bgrids_phi_len_(BatchBigGrid::get_max_batch_size(), stream_, true), +bgrids_phi_start_(BatchBigGrid::get_max_batch_size(), stream_, true), +atoms_iat_(BatchBigGrid::get_max_atoms_num(), stream_, true), +atoms_bgrids_rcoords_(BatchBigGrid::get_max_atoms_num(), stream_, true), +atoms_phi_start_(BatchBigGrid::get_max_atoms_num(), stream_, true), +mgrids_local_idx_batch_(BatchBigGrid::get_max_batch_size() + * BatchBigGrid::get_bgrid_info()->get_mgrids_num(), stream_, true), +gemm_m_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_n_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_k_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_lda_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_ldb_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_ldc_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_A_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_B_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_C_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true), +gemm_alpha_(BatchBigGrid::get_max_atom_pairs_num(), stream_, true) +{ + checkCuda(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); +} + +PhiOperatorGpu::~PhiOperatorGpu() +{ + 
checkCuda(cudaEventDestroy(event_)); +} + +void PhiOperatorGpu::set_bgrid_batch(std::shared_ptr bgrid_batch) +{ + bgrid_batch_ = bgrid_batch; + auto atoms_num_info_h = atoms_num_info_.get_host_ptr(); + auto bgrids_phi_len_h = bgrids_phi_len_.get_host_ptr(); + auto bgrids_phi_start_h = bgrids_phi_start_.get_host_ptr(); + auto atoms_iat_h = atoms_iat_.get_host_ptr(); + auto atoms_bgrids_rcoords_h = atoms_bgrids_rcoords_.get_host_ptr(); + auto atoms_phi_start_h = atoms_phi_start_.get_host_ptr(); + auto mgrids_local_idx_batch_h = mgrids_local_idx_batch_.get_host_ptr(); + int i = 0; + int j = 0; + int atoms_accum = 0; + phi_len_ = 0; + int phi_start = 0; + std::vector mgrids_local_idx; + checkCuda(cudaEventSynchronize(event_)); + for (const auto& bgrid : bgrid_batch->get_bgrids()) + { + atoms_num_info_h[i] = make_int2(bgrid->get_atoms_num(), atoms_accum); + atoms_accum += bgrid->get_atoms_num(); + bgrids_phi_start_h[i] = phi_start; + bgrid->set_mgrids_local_idx(mgrids_local_idx); + std::copy(mgrids_local_idx.begin(), mgrids_local_idx.end(), + mgrids_local_idx_batch_h + i * mgrids_num_); + int phi_len_bgrid = 0; + for (const auto& atom : bgrid->get_atoms()) + { + atoms_iat_h[j] = atom->get_iat(); + Vec3d rcoord = bgrid->get_bgrid_atom_rcoord(atom); + atoms_bgrids_rcoords_h[j] = make_double3(rcoord.x, rcoord.y, rcoord.z); + atoms_phi_start_h[j] = phi_len_ + phi_len_bgrid; + phi_len_bgrid += atom->get_nw(); + j++; + } + bgrids_phi_len_h[i] = phi_len_bgrid; + phi_len_ += phi_len_bgrid * bgrid->get_mgrids_num(); + phi_start += phi_len_bgrid * bgrid->get_mgrids_num(); + i++; + } + + atoms_num_info_.copy_host_to_device_async(bgrid_batch->get_batch_size()); + bgrids_phi_len_.copy_host_to_device_async(bgrid_batch->get_batch_size()); + bgrids_phi_start_.copy_host_to_device_async(bgrid_batch->get_batch_size()); + atoms_iat_.copy_host_to_device_async(bgrid_batch->get_atoms_num()); + atoms_bgrids_rcoords_.copy_host_to_device_async(bgrid_batch->get_atoms_num()); + 
atoms_phi_start_.copy_host_to_device_async(bgrid_batch->get_atoms_num()); + mgrids_local_idx_batch_.copy_host_to_device_async(bgrid_batch->get_batch_size() * mgrids_num_); + checkCuda(cudaEventRecord(event_, stream_)); +} + +void PhiOperatorGpu::set_phi(double* phi_d) const +{ + // checkCuda(cudaMemsetAsync(phi_d, 0, phi_len_ * sizeof(double), stream_)); + dim3 grid_dim(mgrids_num_, bgrid_batch_->get_batch_size()); + dim3 threads_per_block(64); + set_phi_kernel<<>>( + gint_gpu_vars_->nwmax, + mgrids_num_, + gint_gpu_vars_->nr_max, + gint_gpu_vars_->dr_uniform, + gint_gpu_vars_->ylmcoef_d, + gint_gpu_vars_->ucell_atom_nwl_d, + gint_gpu_vars_->atom_iw2_new_d, + gint_gpu_vars_->atom_iw2_ylm_d, + gint_gpu_vars_->atom_nw_d, + gint_gpu_vars_->iat2it_d, + gint_gpu_vars_->rcut_d, + gint_gpu_vars_->psi_u_d, + gint_gpu_vars_->dpsi_u_d, + gint_gpu_vars_->mgrids_pos_d, + atoms_iat_.get_device_ptr(), + atoms_bgrids_rcoords_.get_device_ptr(), + atoms_num_info_.get_device_ptr(), + atoms_phi_start_.get_device_ptr(), + bgrids_phi_len_.get_device_ptr(), + phi_d); + checkCudaLastError(); +} + +void PhiOperatorGpu::set_phi_dphi(double* phi_d, double* dphi_x_d, double* dphi_y_d, double* dphi_z_d) const +{ + dim3 grid_dim(mgrids_num_, bgrid_batch_->get_batch_size()); + dim3 threads_per_block(64); + set_phi_dphi_kernel<<>>( + gint_gpu_vars_->nwmax, + mgrids_num_, + gint_gpu_vars_->nr_max, + gint_gpu_vars_->dr_uniform, + gint_gpu_vars_->ylmcoef_d, + gint_gpu_vars_->ucell_atom_nwl_d, + gint_gpu_vars_->atom_iw2_new_d, + gint_gpu_vars_->atom_iw2_ylm_d, + gint_gpu_vars_->atom_iw2_l_d, + gint_gpu_vars_->atom_nw_d, + gint_gpu_vars_->iat2it_d, + gint_gpu_vars_->rcut_d, + gint_gpu_vars_->psi_u_d, + gint_gpu_vars_->dpsi_u_d, + gint_gpu_vars_->mgrids_pos_d, + atoms_iat_.get_device_ptr(), + atoms_bgrids_rcoords_.get_device_ptr(), + atoms_num_info_.get_device_ptr(), + atoms_phi_start_.get_device_ptr(), + bgrids_phi_len_.get_device_ptr(), + phi_d, + dphi_x_d, + dphi_y_d, + dphi_z_d); + 
checkCudaLastError(); +} + +void PhiOperatorGpu::set_ddphi(double* ddphi_xx_d, double* ddphi_xy_d, double* ddphi_xz_d, + double* ddphi_yy_d, double* ddphi_yz_d, double* ddphi_zz_d) const +{ + // Since the underlying implementation of `set_ddphi` uses `ddphi +=` instead of `ddphi =`, + // the ddphi array needs to be zeroed out at the beginning of the function. + checkCuda(cudaMemsetAsync(ddphi_xx_d, 0, phi_len_ * sizeof(double), stream_)); + checkCuda(cudaMemsetAsync(ddphi_xy_d, 0, phi_len_ * sizeof(double), stream_)); + checkCuda(cudaMemsetAsync(ddphi_xz_d, 0, phi_len_ * sizeof(double), stream_)); + checkCuda(cudaMemsetAsync(ddphi_yy_d, 0, phi_len_ * sizeof(double), stream_)); + checkCuda(cudaMemsetAsync(ddphi_yz_d, 0, phi_len_ * sizeof(double), stream_)); + checkCuda(cudaMemsetAsync(ddphi_zz_d, 0, phi_len_ * sizeof(double), stream_)); + dim3 grid_dim(mgrids_num_, bgrid_batch_->get_batch_size()); + dim3 threads_per_block(64); + set_ddphi_kernel<<>>( + gint_gpu_vars_->nwmax, + mgrids_num_, + gint_gpu_vars_->nr_max, + gint_gpu_vars_->dr_uniform, + gint_gpu_vars_->ylmcoef_d, + gint_gpu_vars_->ucell_atom_nwl_d, + gint_gpu_vars_->atom_iw2_new_d, + gint_gpu_vars_->atom_iw2_ylm_d, + gint_gpu_vars_->atom_iw2_l_d, + gint_gpu_vars_->atom_nw_d, + gint_gpu_vars_->iat2it_d, + gint_gpu_vars_->rcut_d, + gint_gpu_vars_->psi_u_d, + gint_gpu_vars_->dpsi_u_d, + gint_gpu_vars_->mgrids_pos_d, + atoms_iat_.get_device_ptr(), + atoms_bgrids_rcoords_.get_device_ptr(), + atoms_num_info_.get_device_ptr(), + atoms_phi_start_.get_device_ptr(), + bgrids_phi_len_.get_device_ptr(), + ddphi_xx_d, + ddphi_xy_d, + ddphi_xz_d, + ddphi_yy_d, + ddphi_yz_d, + ddphi_zz_d); + checkCudaLastError(); +} + +void PhiOperatorGpu::phi_mul_vldr3( + const double* vl_d, + const double dr3, + const double* phi_d, + double* result_d) const +{ + dim3 grid_dim(mgrids_num_, bgrid_batch_->get_batch_size()); + dim3 threads_per_block(64); + phi_mul_vldr3_kernel<<>>( + vl_d, + dr3, + phi_d, + mgrids_num_, + 
mgrids_local_idx_batch_.get_device_ptr(), + bgrids_phi_len_.get_device_ptr(), + bgrids_phi_start_.get_device_ptr(), + result_d); + checkCudaLastError(); +} + +void PhiOperatorGpu::phi_mul_phi( + const double* phi_d, + const double* phi_vldr3_d, + HContainer& hRGint, + double* hr_d) const +{ + // ap_num means number of atom pairs + int ap_num = 0; + int max_m = 0; + int max_n = 0; + int max_k = mgrids_num_; + checkCuda(cudaEventSynchronize(event_)); + for (int i = 0; i < bgrid_batch_->get_batch_size(); i++) + { + auto bgrid = bgrid_batch_->get_bgrids()[i]; + // the length of phi on a mesh grid + const int phi_len_mgrid = bgrid->get_phi_len(); + const int pre_atoms = atoms_num_info_.get_host_ptr()[i].y; + for (int ia_1 = 0; ia_1 < bgrid->get_atoms_num(); ia_1++) + { + auto atom_1 = bgrid->get_atoms()[ia_1]; + const int iat_1 = atom_1->get_iat(); + const auto& r_1 = atom_1->get_R(); + const int nw1 = atom_1->get_nw(); + const int phi_1_offset = atoms_phi_start_.get_host_ptr()[pre_atoms + ia_1]; + + for (int ia_2 = 0; ia_2 < bgrid->get_atoms_num(); ia_2++) + { + auto atom_2 = bgrid->get_atoms()[ia_2]; + const int iat_2 = atom_2->get_iat(); + const auto& r_2 = atom_2->get_R(); + const int nw2 = atom_2->get_nw(); + + if(iat_1 > iat_2) + { continue; } + + int hr_offset = hRGint.find_matrix_offset(iat_1, iat_2, r_1 - r_2); + if (hr_offset == -1) + { continue; } + + const int phi_2_offset = atoms_phi_start_.get_host_ptr()[pre_atoms + ia_2]; + + gemm_A_.get_host_ptr()[ap_num] = phi_d + phi_1_offset; + gemm_B_.get_host_ptr()[ap_num] = phi_vldr3_d + phi_2_offset; + gemm_C_.get_host_ptr()[ap_num] = hr_d + hr_offset; + gemm_lda_.get_host_ptr()[ap_num] = phi_len_mgrid; + gemm_ldb_.get_host_ptr()[ap_num] = phi_len_mgrid; + gemm_ldc_.get_host_ptr()[ap_num] = nw2; + gemm_m_.get_host_ptr()[ap_num] = nw1; + gemm_n_.get_host_ptr()[ap_num] = nw2; + gemm_k_.get_host_ptr()[ap_num] = bgrid->get_mgrids_num(); + ap_num++; + + max_m = std::max(max_m, nw1); + max_n = std::max(max_n, nw2); + } 
+ } + } + + gemm_A_.copy_host_to_device_async(ap_num); + gemm_B_.copy_host_to_device_async(ap_num); + gemm_C_.copy_host_to_device_async(ap_num); + gemm_lda_.copy_host_to_device_async(ap_num); + gemm_ldb_.copy_host_to_device_async(ap_num); + gemm_ldc_.copy_host_to_device_async(ap_num); + gemm_m_.copy_host_to_device_async(ap_num); + gemm_n_.copy_host_to_device_async(ap_num); + gemm_k_.copy_host_to_device_async(ap_num); + checkCuda(cudaEventRecord(event_, stream_)); + + dgemm_tn_vbatch(max_m, + max_n, + max_k, + gemm_m_.get_device_ptr(), + gemm_n_.get_device_ptr(), + gemm_k_.get_device_ptr(), + gemm_A_.get_device_ptr(), + gemm_lda_.get_device_ptr(), + gemm_B_.get_device_ptr(), + gemm_ldb_.get_device_ptr(), + gemm_C_.get_device_ptr(), + gemm_ldc_.get_device_ptr(), + ap_num, + stream_, + nullptr); +} + +void PhiOperatorGpu::phi_mul_dm( + const double* phi_d, + const double* dm_d, + const HContainer& dm, + const bool is_symm, + double* phi_dm_d) +{ + checkCuda(cudaMemsetAsync(phi_dm_d, 0, phi_len_ * sizeof(double), stream_)); + // ap_num means number of atom pairs + int ap_num = 0; + int max_m = mgrids_num_; + int max_n = 0; + int max_k = 0; + checkCuda(cudaEventSynchronize(event_)); + for (int i = 0; i < bgrid_batch_->get_batch_size(); i++) + { + auto bgrid = bgrid_batch_->get_bgrids()[i]; + // the length of phi on a mesh grid + const int phi_len_mgrid = bgrid->get_phi_len(); + const int pre_atoms = atoms_num_info_.get_host_ptr()[i].y; + for (int ia_1 = 0; ia_1 < bgrid->get_atoms_num(); ia_1++) + { + auto atom_1 = bgrid->get_atoms()[ia_1]; + const int iat_1 = atom_1->get_iat(); + const auto& r_1 = atom_1->get_R(); + const int nw1 = atom_1->get_nw(); + const int phi_1_offset = atoms_phi_start_.get_host_ptr()[pre_atoms + ia_1]; + int ia_2 = is_symm ? 
ia_1 : 0; + for (; ia_2 < bgrid->get_atoms_num(); ia_2++) + { + auto atom_2 = bgrid->get_atoms()[ia_2]; + const int iat_2 = atom_2->get_iat(); + const auto& r_2 = atom_2->get_R(); + const int nw2 = atom_2->get_nw(); + + int dm_offset = dm.find_matrix_offset(iat_1, iat_2, r_1-r_2); + if (dm_offset == -1) + { continue; } + + const int phi_dm_offset = atoms_phi_start_.get_host_ptr()[pre_atoms + ia_2]; + + gemm_A_.get_host_ptr()[ap_num] = phi_d + phi_1_offset; + gemm_B_.get_host_ptr()[ap_num] = dm_d + dm_offset; + gemm_C_.get_host_ptr()[ap_num] = phi_dm_d + phi_dm_offset; + gemm_lda_.get_host_ptr()[ap_num] = phi_len_mgrid; + gemm_ldb_.get_host_ptr()[ap_num] = nw2; + gemm_ldc_.get_host_ptr()[ap_num] = phi_len_mgrid; + gemm_m_.get_host_ptr()[ap_num] = mgrids_num_; + gemm_n_.get_host_ptr()[ap_num] = nw2; + gemm_k_.get_host_ptr()[ap_num] = nw1; + gemm_alpha_.get_host_ptr()[ap_num] = ia_1 == ia_2 ? 1.0 : 2.0; + ap_num++; + + max_n = std::max(max_n, nw2); + max_k = std::max(max_k, nw1); + } + } + } + + gemm_A_.copy_host_to_device_async(ap_num); + gemm_B_.copy_host_to_device_async(ap_num); + gemm_C_.copy_host_to_device_async(ap_num); + gemm_lda_.copy_host_to_device_async(ap_num); + gemm_ldb_.copy_host_to_device_async(ap_num); + gemm_ldc_.copy_host_to_device_async(ap_num); + gemm_m_.copy_host_to_device_async(ap_num); + gemm_n_.copy_host_to_device_async(ap_num); + gemm_k_.copy_host_to_device_async(ap_num); + if(is_symm) + { + // if is_symm == false, gemm_alpha_ always equals 1.0, + // so we don't need to copy it to device + gemm_alpha_.copy_host_to_device_async(ap_num); + } + checkCuda(cudaEventRecord(event_, stream_)); + + auto alpha_ptr = is_symm ? 
gemm_alpha_.get_device_ptr() : nullptr; + dgemm_nn_vbatch(max_m, + max_n, + max_k, + gemm_m_.get_device_ptr(), + gemm_n_.get_device_ptr(), + gemm_k_.get_device_ptr(), + gemm_A_.get_device_ptr(), + gemm_lda_.get_device_ptr(), + gemm_B_.get_device_ptr(), + gemm_ldb_.get_device_ptr(), + gemm_C_.get_device_ptr(), + gemm_ldc_.get_device_ptr(), + ap_num, + stream_, + alpha_ptr); +} + +void PhiOperatorGpu::phi_dot_phi( + const double* phi_i_d, + const double* phi_j_d, + double* rho_d) const +{ + dim3 grid_dim(mgrids_num_, bgrid_batch_->get_batch_size()); + dim3 threads_per_block(64); + phi_dot_phi_kernel<<>>( + phi_i_d, + phi_j_d, + mgrids_num_, + mgrids_local_idx_batch_.get_device_ptr(), + bgrids_phi_len_.get_device_ptr(), + bgrids_phi_start_.get_device_ptr(), + rho_d); + checkCudaLastError(); +} + +void PhiOperatorGpu::phi_dot_dphi( + const double* phi_d, + const double* dphi_x_d, + const double* dphi_y_d, + const double* dphi_z_d, + double* fvl_d) const +{ + dim3 grid_dim(bgrid_batch_->get_max_atoms_num_per_bgrid(), + bgrid_batch_->get_batch_size()); + dim3 threads_per_block(32); + phi_dot_dphi_kernel<<>>( + phi_d, + dphi_x_d, + dphi_y_d, + dphi_z_d, + mgrids_num_, + bgrids_phi_len_.get_device_ptr(), + atoms_num_info_.get_device_ptr(), + atoms_phi_start_.get_device_ptr(), + atoms_iat_.get_device_ptr(), + gint_gpu_vars_->iat2it_d, + gint_gpu_vars_->atom_nw_d, + fvl_d); + checkCudaLastError(); +} + +void PhiOperatorGpu::phi_dot_dphi_r( + const double* phi_d, + const double* dphi_x_d, + const double* dphi_y_d, + const double* dphi_z_d, + double* svl_d) const +{ + dim3 grid_dim(mgrids_num_, + bgrid_batch_->get_batch_size()); + dim3 threads_per_block(32); + phi_dot_dphi_r_kernel<<>>( + phi_d, + dphi_x_d, + dphi_y_d, + dphi_z_d, + mgrids_num_, + bgrids_phi_len_.get_device_ptr(), + atoms_num_info_.get_device_ptr(), + atoms_phi_start_.get_device_ptr(), + atoms_iat_.get_device_ptr(), + atoms_bgrids_rcoords_.get_device_ptr(), + gint_gpu_vars_->mgrids_pos_d, + 
gint_gpu_vars_->iat2it_d, + gint_gpu_vars_->atom_nw_d, + svl_d); + checkCudaLastError(); +} + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h new file mode 100644 index 0000000000..4988e265ce --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_gpu.h @@ -0,0 +1,110 @@ +#pragma once +#include +#include + +#include "module_hamilt_lcao/module_gint/temp_gint/batch_biggrid.h" +#include "gint_helper.cuh" +#include "gint_gpu_vars.h" +#include "cuda_mem_wrapper.h" + +namespace ModuleGint +{ + +class PhiOperatorGpu +{ + +public: + PhiOperatorGpu(std::shared_ptr gint_gpu_vars, cudaStream_t stream = 0); + ~PhiOperatorGpu(); + + void set_bgrid_batch(std::shared_ptr bgrid_batch); + + void set_phi(double* phi_d) const; + + void set_phi_dphi(double* phi_d, double* dphi_x_d, double* dphi_y_d, double* dphi_z_d) const; + + void set_ddphi(double* ddphi_xx_d, double* ddphi_xy_d, double* ddphi_xz_d, + double* ddphi_yy_d, double* ddphi_yz_d, double* ddphi_zz_d) const; + + void phi_mul_vldr3( + const double* vl_d, + const double dr3, + const double* phi_d, + double* result_d) const; + + void phi_mul_phi( + const double* phi_d, + const double* phi_vldr3_d, + HContainer& hRGint, + double* hr_d) const; + + void phi_mul_dm( + const double* phi_d, + const double* dm_d, + const HContainer& dm, + const bool is_symm, + double* phi_dm_d); + + void phi_dot_phi( + const double* phi_i_d, + const double* phi_j_d, + double* rho_d) const; + + void phi_dot_dphi( + const double* phi_d, + const double* dphi_x_d, + const double* dphi_y_d, + const double* dphi_z_d, + double* fvl_d) const; + + void phi_dot_dphi_r( + const double* phi_d, + const double* dphi_x_d, + const double* dphi_y_d, + const double* dphi_z_d, + double* svl_d) const; + +private: + std::shared_ptr bgrid_batch_; + std::shared_ptr gint_gpu_vars_; + + // the number 
of meshgrids on a biggrid + int mgrids_num_; + + int phi_len_; + + cudaStream_t stream_ = 0; + cudaEvent_t event_; + + // The first number in every group of two represents the number of atoms on that bigcell. + // The second number represents the cumulative number of atoms up to that bigcell. + CudaMemWrapper atoms_num_info_; + + // the iat of each atom + CudaMemWrapper atoms_iat_; + + // atoms_bgrids_rcoords_ here represents the relative coordinates from the big grid to the atoms + CudaMemWrapper atoms_bgrids_rcoords_; + + // the start index of the phi array for each atom + CudaMemWrapper atoms_phi_start_; + // The length of phi for a single meshgrid on each big grid. + CudaMemWrapper bgrids_phi_len_; + // The start index of the phi array for each big grid. + CudaMemWrapper bgrids_phi_start_; + // Mapping of the index of meshgrid in the batch of biggrids to the index of meshgrid in the local cell + CudaMemWrapper mgrids_local_idx_batch_; + + mutable CudaMemWrapper gemm_m_; + mutable CudaMemWrapper gemm_n_; + mutable CudaMemWrapper gemm_k_; + mutable CudaMemWrapper gemm_lda_; + mutable CudaMemWrapper gemm_ldb_; + mutable CudaMemWrapper gemm_ldc_; + mutable CudaMemWrapper gemm_A_; + mutable CudaMemWrapper gemm_B_; + mutable CudaMemWrapper gemm_C_; + mutable CudaMemWrapper gemm_alpha_; +}; + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu new file mode 100644 index 0000000000..5db767f501 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cu @@ -0,0 +1,580 @@ +#include "phi_operator_kernel.cuh" +#include "gint_helper.cuh" +#include "sph.cuh" + +namespace ModuleGint +{ + +__global__ void set_phi_kernel( + const int nwmax, + const int mgrids_num, + const int nrmax, + const double dr_uniform, + const double* __restrict__ ylmcoef, + const int* __restrict__ ucell_atom_nwl, + 
const bool* __restrict__ atom_iw2_new, + const int* __restrict__ atom_iw2_ylm, + const int* __restrict__ atom_nw, + const int* __restrict__ iat2it, + const double* __restrict__ rcut, + const double* __restrict__ psi_u, + const double* __restrict__ dpsi_u, + const double3* __restrict__ mgrids_pos, + const int* __restrict__ atoms_iat, + const double3* __restrict__ atoms_bgrids_rcoords, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ bgrids_phi_len, + double* __restrict__ phi) +{ + const int bgrid_id = blockIdx.y; + const int mgrid_id = blockIdx.x; + const int atoms_num = atoms_num_info[bgrid_id].x; + const int pre_atoms_num = atoms_num_info[bgrid_id].y; + const double3 mgrid_pos = mgrids_pos[mgrid_id]; + + for (int atom_id = threadIdx.x; atom_id < atoms_num; atom_id += blockDim.x) + { + const int atom_type = iat2it[atoms_iat[atom_id + pre_atoms_num]]; + const double3 rcoord = atoms_bgrids_rcoords[atom_id + pre_atoms_num]; // rcoord is the ralative coordinate of an atom and a biggrid + const double3 coord = make_double3(mgrid_pos.x-rcoord.x, // coord is the relative coordinate of an atom and a meshgrid + mgrid_pos.y-rcoord.y, + mgrid_pos.z-rcoord.z); + double dist = norm3d(coord.x, coord.y, coord.z); + if (dist < rcut[atom_type]) + { + if (dist < 1.0E-9) + { dist += 1.0E-9; } + // since nwl is less or equal than 5, the size of ylma is (5+1)^2 + double ylma[36]; + const int nwl = ucell_atom_nwl[atom_type]; + sph_harm(nwl, ylmcoef, coord.x/dist, coord.y/dist, coord.z/dist, ylma); + + const double pos = dist / dr_uniform; + const int ip = static_cast(pos); + const double dx = pos - ip; + const double dx2 = dx * dx; + const double dx3 = dx2 * dx; + + const double c3 = 3.0 * dx2 - 2.0 * dx3; + const double c1 = 1.0 - c3; + const double c2 = (dx - 2.0 * dx2 + dx3) * dr_uniform; + const double c4 = (dx3 - dx2) * dr_uniform; + + double psi = 0; + const int it_nw = atom_type * nwmax; + int iw_nr = it_nw * nrmax + 
ip; + int phi_idx = atoms_phi_start[atom_id + pre_atoms_num] + + bgrids_phi_len[bgrid_id] * mgrid_id; + + for (int iw = 0; iw < atom_nw[atom_type]; iw++, iw_nr += nrmax) + { + if (atom_iw2_new[it_nw + iw]) + { + psi = c1 * psi_u[iw_nr] + c2 * dpsi_u[iw_nr] + + c3 * psi_u[iw_nr + 1] + c4 * dpsi_u[iw_nr + 1]; + } + phi[phi_idx + iw] = psi * ylma[atom_iw2_ylm[it_nw + iw]]; + } + } + else + { + int phi_idx = atoms_phi_start[atom_id + pre_atoms_num] + + bgrids_phi_len[bgrid_id] * mgrid_id; + for (int iw = 0; iw < atom_nw[atom_type]; iw++) + { + phi[phi_idx + iw] = 0.0; + } + } + } +} + +__global__ void set_phi_dphi_kernel( + const int nwmax, + const int mgrids_num, + const int nrmax, + const double dr_uniform, + const double* __restrict__ ylmcoef, + const int* __restrict__ ucell_atom_nwl, + const bool* __restrict__ atom_iw2_new, + const int* __restrict__ atom_iw2_ylm, + const int* __restrict__ atom_iw2_l, + const int* __restrict__ atom_nw, + const int* __restrict__ iat2it, + const double* __restrict__ rcut, + const double* __restrict__ psi_u, + const double* __restrict__ dpsi_u, + const double3* __restrict__ mgrids_pos, + const int* __restrict__ atoms_iat, + const double3* __restrict__ atoms_bgrids_rcoords, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ bgrids_phi_len, + double* __restrict__ phi, + double* __restrict__ dphi_x, + double* __restrict__ dphi_y, + double* __restrict__ dphi_z) +{ + const int bgrid_id = blockIdx.y; + const int mgrid_id = blockIdx.x; + const int atoms_num = atoms_num_info[bgrid_id].x; + const int pre_atoms_num = atoms_num_info[bgrid_id].y; + const double3 mgrid_pos = mgrids_pos[mgrid_id]; + + for (int atom_id = threadIdx.x; atom_id < atoms_num; atom_id += blockDim.x) + { + const int atom_type = iat2it[atoms_iat[atom_id + pre_atoms_num]]; + const double3 rcoord = atoms_bgrids_rcoords[atom_id + pre_atoms_num]; + const double3 coord = make_double3(mgrid_pos.x-rcoord.x, + 
mgrid_pos.y-rcoord.y, + mgrid_pos.z-rcoord.z); + double dist = norm3d(coord.x, coord.y, coord.z); + if (dist < rcut[atom_type]) + { + if (dist < 1.0E-9) + { dist += 1.0E-9; } + // since nwl is less or equal than 5, the size of rly is (5+1)^2 + // size of grly = 36 * 3 + double rly[36]; + double grly[36 * 3]; + const int nwl = ucell_atom_nwl[atom_type]; + grad_rl_sph_harm(nwl, ylmcoef, coord.x, coord.y, coord.z, rly, grly); + + // interpolation + const double pos = dist / dr_uniform; + const int ip = static_cast(pos); + const double x0 = pos - ip; + const double x1 = 1.0 - x0; + const double x2 = 2.0 - x0; + const double x3 = 3.0 - x0; + const double x12 = x1 * x2 / 6; + const double x03 = x0 * x3 / 2; + double tmp = 0; + double dtmp = 0; + const int it_nw = atom_type * nwmax; + int iw_nr = it_nw * nrmax + ip; + int phi_idx = atoms_phi_start[atom_id + pre_atoms_num] + + bgrids_phi_len[bgrid_id] * mgrid_id; + for (int iw = 0; iw < atom_nw[atom_type]; iw++, iw_nr += nrmax) + { + if (atom_iw2_new[it_nw + iw]) + { + tmp = x12 * (psi_u[iw_nr] * x3 + psi_u[iw_nr + 3] * x0) + + x03 * (psi_u[iw_nr + 1] * x2 - psi_u[iw_nr + 2] * x1); + dtmp = x12 * (dpsi_u[iw_nr] * x3 + dpsi_u[iw_nr + 3] * x0) + + x03 * (dpsi_u[iw_nr + 1] * x2 - dpsi_u[iw_nr + 2] * x1); + } + const int iw_l = atom_iw2_l[it_nw + iw]; + const int idx_ylm = atom_iw2_ylm [it_nw + iw]; + const double rl = pow_int(dist, iw_l); + const double tmprl = tmp / rl; + + // if phi == nullptr, it means that we only need dphi. + if(phi != nullptr) + { + phi[phi_idx + iw] = tmprl * rly[idx_ylm]; + } + // derivative of wave functions with respect to atom positions. 
+ const double tmpdphi_rly = (dtmp - tmp * iw_l / dist) / rl * rly[idx_ylm] / dist; + + dphi_x[phi_idx + iw] = tmpdphi_rly * coord.x + tmprl * grly[idx_ylm * 3 + 0]; + dphi_y[phi_idx + iw] = tmpdphi_rly * coord.y + tmprl * grly[idx_ylm * 3 + 1]; + dphi_z[phi_idx + iw] = tmpdphi_rly * coord.z + tmprl * grly[idx_ylm * 3 + 2]; + } + } + else + { + int phi_idx = atoms_phi_start[atom_id + pre_atoms_num] + + bgrids_phi_len[bgrid_id] * mgrid_id; + for (int iw = 0; iw < atom_nw[atom_type]; iw++) + { + if(phi != nullptr) + { + phi[phi_idx + iw] = 0.0; + } + dphi_x[phi_idx + iw] = 0.0; + dphi_y[phi_idx + iw] = 0.0; + dphi_z[phi_idx + iw] = 0.0; + } + } + } +} + +// The code for `set_ddphi_kernel` is quite difficult to understand. +// To grasp it, you better refer to the CPU function `set_ddphi` +__global__ void set_ddphi_kernel( + const int nwmax, + const int mgrids_num, + const int nrmax, + const double dr_uniform, + const double* __restrict__ ylmcoef, + const int* __restrict__ ucell_atom_nwl, + const bool* __restrict__ atom_iw2_new, + const int* __restrict__ atom_iw2_ylm, + const int* __restrict__ atom_iw2_l, + const int* __restrict__ atom_nw, + const int* __restrict__ iat2it, + const double* __restrict__ rcut, + const double* __restrict__ psi_u, + const double* __restrict__ dpsi_u, + const double3* __restrict__ mgrids_pos, + const int* __restrict__ atoms_iat, + const double3* __restrict__ atoms_bgrids_rcoords, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ bgrids_phi_len, + double* __restrict__ ddphi_xx, + double* __restrict__ ddphi_xy, + double* __restrict__ ddphi_xz, + double* __restrict__ ddphi_yy, + double* __restrict__ ddphi_yz, + double* __restrict__ ddphi_zz) +{ + const int bgrid_id = blockIdx.y; + const int mgrid_id = blockIdx.x; + const int atoms_num = atoms_num_info[bgrid_id].x; + const int pre_atoms_num = atoms_num_info[bgrid_id].y; + const double3 mgrid_pos = mgrids_pos[mgrid_id]; + + for (int 
atom_id = threadIdx.x; atom_id < atoms_num; atom_id += blockDim.x) + { + const int atom_type = iat2it[atoms_iat[atom_id + pre_atoms_num]]; + const double3 rcoord = atoms_bgrids_rcoords[atom_id + pre_atoms_num]; + double coord[3]{mgrid_pos.x-rcoord.x, + mgrid_pos.y-rcoord.y, + mgrid_pos.z-rcoord.z}; + double dist = norm3d(coord[0], coord[1], coord[2]); + if (dist < rcut[atom_type]) + { + int phi_idx = atoms_phi_start[atom_id + pre_atoms_num] + + bgrids_phi_len[bgrid_id] * mgrid_id; + for(int i = 0; i < 6; i++) + { + coord[i/2] += std::pow(-1, i%2) * 0.0001; + double dist = norm3d(coord[0], coord[1], coord[2]); + if (dist < 1.0E-9) + { dist += 1.0E-9; } + // since nwl is less or equal than 5, the size of rly is (5+1)^2 + // size of grly = 36 * 3 + double rly[36]; + double grly[36 * 3]; + const int nwl = ucell_atom_nwl[atom_type]; + grad_rl_sph_harm(nwl, ylmcoef, coord[0], coord[1], coord[2], rly, grly); + + // interpolation + const double pos = dist / dr_uniform; + const int ip = static_cast(pos); + const double x0 = pos - ip; + const double x1 = 1.0 - x0; + const double x2 = 2.0 - x0; + const double x3 = 3.0 - x0; + const double x12 = x1 * x2 / 6; + const double x03 = x0 * x3 / 2; + double tmp = 0; + double dtmp = 0; + const int it_nw = atom_type * nwmax; + int iw_nr = it_nw * nrmax + ip; + for (int iw = 0; iw < atom_nw[atom_type]; iw++, iw_nr += nrmax) + { + if (atom_iw2_new[it_nw + iw]) + { + tmp = x12 * (psi_u[iw_nr] * x3 + psi_u[iw_nr + 3] * x0) + + x03 * (psi_u[iw_nr + 1] * x2 - psi_u[iw_nr + 2] * x1); + dtmp = x12 * (dpsi_u[iw_nr] * x3 + dpsi_u[iw_nr + 3] * x0) + + x03 * (dpsi_u[iw_nr + 1] * x2 - dpsi_u[iw_nr + 2] * x1); + } + const int iw_l = atom_iw2_l[it_nw + iw]; + const int idx_ylm = atom_iw2_ylm [it_nw + iw]; + const double rl = pow_int(dist, iw_l); + const double tmprl = tmp / rl; + const double tmpdphi_rly = (dtmp - tmp * iw_l / dist) / rl * rly[idx_ylm] / dist; + + double dphi[3]; + dphi[0] = tmpdphi_rly * coord[0] + tmprl * grly[idx_ylm * 3 + 0]; + 
dphi[1] = tmpdphi_rly * coord[1] + tmprl * grly[idx_ylm * 3 + 1]; + dphi[2] = tmpdphi_rly * coord[2] + tmprl * grly[idx_ylm * 3 + 2]; + + if (i == 0) + { + ddphi_xx[phi_idx + iw] += dphi[0]; + ddphi_xy[phi_idx + iw] += dphi[1]; + ddphi_xz[phi_idx + iw] += dphi[2]; + } else if (i == 1) + { + ddphi_xx[phi_idx + iw] -= dphi[0]; + ddphi_xy[phi_idx + iw] -= dphi[1]; + ddphi_xz[phi_idx + iw] -= dphi[2]; + } else if (i == 2) + { + ddphi_xy[phi_idx + iw] += dphi[0]; + ddphi_yy[phi_idx + iw] += dphi[1]; + ddphi_yz[phi_idx + iw] += dphi[2]; + } else if (i == 3) + { + ddphi_xy[phi_idx + iw] -= dphi[0]; + ddphi_yy[phi_idx + iw] -= dphi[1]; + ddphi_yz[phi_idx + iw] -= dphi[2]; + } else if (i == 4) + { + ddphi_xz[phi_idx + iw] += dphi[0]; + ddphi_yz[phi_idx + iw] += dphi[1]; + ddphi_zz[phi_idx + iw] += dphi[2]; + } else // i == 5 + { + ddphi_xz[phi_idx + iw] -= dphi[0]; + ddphi_yz[phi_idx + iw] -= dphi[1]; + ddphi_zz[phi_idx + iw] -= dphi[2]; + } + } + coord[i/2] -= std::pow(-1, i%2) * 0.0001; // recover coord + } + + for (int iw = 0; iw < atom_nw[atom_type]; iw++) + { + ddphi_xx[phi_idx + iw] /= 0.0002; + ddphi_xy[phi_idx + iw] /= 0.0004; + ddphi_xz[phi_idx + iw] /= 0.0004; + ddphi_yy[phi_idx + iw] /= 0.0002; + ddphi_yz[phi_idx + iw] /= 0.0004; + ddphi_zz[phi_idx + iw] /= 0.0002; + } + } + } +} + +__global__ void phi_mul_vldr3_kernel( + const double* __restrict__ vl, + const double dr3, + const double* __restrict__ phi, + const int mgrids_per_bgrid, + const int* __restrict__ mgrids_local_idx, + const int* __restrict__ bgrids_phi_len, + const int* __restrict__ bgrids_phi_start, + double* __restrict__ result) +{ + const int bgrid_id = blockIdx.y; + const int mgrid_id = blockIdx.x; + const int phi_len = bgrids_phi_len[bgrid_id]; + const int phi_start = bgrids_phi_start[bgrid_id] + mgrid_id * phi_len; + const int mgrid_id_in_batch = bgrid_id * mgrids_per_bgrid + mgrid_id; + const double vldr3 = vl[mgrids_local_idx[mgrid_id_in_batch]] * dr3; + for(int i = threadIdx.x; i < phi_len; i 
+= blockDim.x) + { + result[phi_start + i] = phi[phi_start + i] * vldr3; + } +} + +// rho(ir) = \sum_{iwt} \phi_i(ir,iwt) * \phi_j^*(ir,iwt) +// each block calculate the dot product of phi_i and phi_j of a meshgrid +__global__ void phi_dot_phi_kernel( + const double* __restrict__ phi_i, + const double* __restrict__ phi_j, + const int mgrids_per_bgrid, + const int* __restrict__ mgrids_local_idx, + const int* __restrict__ bgrids_phi_len, + const int* __restrict__ bgrids_phi_start, + double* __restrict__ rho) +{ + __shared__ double s_data[32]; // the length of s_data equals the max warp num of a block + const int bgrid_id = blockIdx.y; + const int mgrid_id = blockIdx.x; + const int phi_len = bgrids_phi_len[bgrid_id]; + const int phi_start = bgrids_phi_start[bgrid_id] + mgrid_id * phi_len; + const double* phi_i_mgrid = phi_i + phi_start; + const double* phi_j_mgrid = phi_j + phi_start; + const int mgrid_id_in_batch = bgrid_id * mgrids_per_bgrid + mgrid_id; + const int mgrid_local_idx = mgrids_local_idx[mgrid_id_in_batch]; + const int tid = threadIdx.x; + const int warp_id = tid / 32; + const int lane_id = tid % 32; + double tmp_sum = 0; + + for (int i = tid; i < phi_len; i += blockDim.x) + { + tmp_sum += phi_i_mgrid[i] * phi_j_mgrid[i]; + } + + tmp_sum = warpReduceSum(tmp_sum); + + if (lane_id == 0) + { + s_data[warp_id] = tmp_sum; + } + __syncthreads(); + + tmp_sum = (tid < blockDim.x / 32) ? 
s_data[tid] : 0; + if(warp_id == 0) + { + tmp_sum = warpReduceSum(tmp_sum); + } + + if(tid == 0) + { + rho[mgrid_local_idx] += tmp_sum; + } +} + +__global__ void phi_dot_dphi_kernel( + const double* __restrict__ phi, + const double* __restrict__ dphi_x, + const double* __restrict__ dphi_y, + const double* __restrict__ dphi_z, + const int mgrids_per_bgrid, + const int* __restrict__ bgrids_phi_len, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ atoms_iat, + const int* __restrict__ iat2it, + const int* __restrict__ atom_nw, + double* force) +{ + __shared__ double s_data[32 * 3]; // the length of s_data equals the max warp num of a block times 3 + const int bgrid_id = blockIdx.y; + const int atoms_num = atoms_num_info[bgrid_id].x; + const int pre_atoms_num = atoms_num_info[bgrid_id].y; + const int bgrid_phi_len = bgrids_phi_len[bgrid_id]; + const int tid = threadIdx.x; + const int warp_id = tid / 32; + const int lane_id = tid % 32; + + for (int atom_id = blockIdx.x; atom_id < atoms_num; atom_id += gridDim.x) + { + const int atom_phi_start = atoms_phi_start[atom_id + pre_atoms_num]; + const int iat = atoms_iat[atom_id + pre_atoms_num]; + const int nw = atom_nw[iat2it[iat]]; + double f[3] = {0.0, 0.0, 0.0}; + for (int mgrid_id = 0; mgrid_id < mgrids_per_bgrid; mgrid_id++) + { + const int phi_start = atom_phi_start + mgrid_id * bgrid_phi_len; + for (int iw = tid; iw < nw; iw += blockDim.x) + { + int phi_idx = phi_start + iw; + f[0] += phi[phi_idx] * dphi_x[phi_idx]; + f[1] += phi[phi_idx] * dphi_y[phi_idx]; + f[2] += phi[phi_idx] * dphi_z[phi_idx]; + } + } + + // reduce the force in each block + for (int i = 0; i < 3; i++) + { + f[i] = warpReduceSum(f[i]); + } + + if (lane_id == 0) + { + for (int i = 0; i < 3; i++) + { + s_data[warp_id * 3 + i] = f[i]; + } + } + __syncthreads(); + + for (int i = 0; i < 3; i++) + { + f[i] = (tid < blockDim.x / 32) ? 
s_data[tid * 3 + i] : 0; + } + if (warp_id == 0) + { + for (int i = 0; i < 3; i++) + { + f[i] = warpReduceSum(f[i]); + } + } + if (tid == 0) + { + for (int i = 0; i < 3; i++) + { + atomicAdd(&force[iat * 3 + i], f[i] * 2); + } + } + } +} + +__global__ void phi_dot_dphi_r_kernel( + const double* __restrict__ phi, + const double* __restrict__ dphi_x, + const double* __restrict__ dphi_y, + const double* __restrict__ dphi_z, + const int mgrids_per_bgrid, + const int* __restrict__ bgrids_phi_len, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ atoms_iat, + const double3* __restrict__ atoms_bgrids_rcoords, + const double3* __restrict__ mgrids_pos, + const int* __restrict__ iat2it, + const int* __restrict__ atom_nw, + double* __restrict__ svl) +{ + __shared__ double s_data[32 * 6]; // the length of s_data equals the max warp num of a block times 6 + const int tid = threadIdx.x; + const int bgrid_id = blockIdx.y; + const int atoms_num = atoms_num_info[bgrid_id].x; + const int pre_atoms_num = atoms_num_info[bgrid_id].y; + const int bgrid_phi_len = bgrids_phi_len[bgrid_id]; + const int warp_id = tid / 32; + const int lane_id = tid % 32; + + double stress[6]{0.0}; + for (int mgrid_id = blockIdx.x; mgrid_id < mgrids_per_bgrid; mgrid_id += gridDim.x) + { + const double3 mgrid_pos = mgrids_pos[mgrid_id]; + for (int atom_id = 0; atom_id < atoms_num; atom_id++) + { + const int atom_phi_start = atoms_phi_start[atom_id + pre_atoms_num] + mgrid_id * bgrid_phi_len; + const int iat = atoms_iat[atom_id + pre_atoms_num]; + const int nw = atom_nw[iat2it[iat]]; + const double3 rcoord = atoms_bgrids_rcoords[atom_id + pre_atoms_num]; // rcoord is the ralative coordinate of an atom and a biggrid + const double3 coord = make_double3(mgrid_pos.x-rcoord.x, // coord is the relative coordinate of an atom and a meshgrid + mgrid_pos.y-rcoord.y, + mgrid_pos.z-rcoord.z); + for (int iw = tid; iw < nw; iw += blockDim.x) + { + int phi_idx = 
atom_phi_start + iw; + stress[0] += phi[phi_idx] * dphi_x[phi_idx] * coord.x; + stress[1] += phi[phi_idx] * dphi_x[phi_idx] * coord.y; + stress[2] += phi[phi_idx] * dphi_x[phi_idx] * coord.z; + stress[3] += phi[phi_idx] * dphi_y[phi_idx] * coord.y; + stress[4] += phi[phi_idx] * dphi_y[phi_idx] * coord.z; + stress[5] += phi[phi_idx] * dphi_z[phi_idx] * coord.z; + } + } + } + + // reduce the stress in each block + for (int i = 0; i < 6; i++) + { + stress[i] = warpReduceSum(stress[i]); + } + + if (lane_id == 0) + { + for (int i = 0; i < 6; i++) + { + s_data[warp_id * 6 + i] = stress[i]; + } + } + __syncthreads(); + + for (int i = 0; i < 6; i++) + { + stress[i] = (tid < blockDim.x / 32) ? s_data[tid * 6 + i] : 0; + } + if (warp_id == 0) + { + for (int i = 0; i < 6; i++) + { + stress[i] = warpReduceSum(stress[i]); + } + } + if (tid == 0) + { + for (int i = 0; i < 6; i++) + { + atomicAdd(&svl[i], stress[i] * 2); + } + } +} + +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh new file mode 100644 index 0000000000..4d32475542 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/phi_operator_kernel.cuh @@ -0,0 +1,135 @@ +#pragma once + +#include + +namespace ModuleGint +{ + +__global__ void set_phi_kernel( + const int nwmax, + const int mgrids_num, + const int nrmax, + const double dr_uniform, + const double* __restrict__ ylmcoef, + const int* __restrict__ ucell_atom_nwl, + const bool* __restrict__ atom_iw2_new, + const int* __restrict__ atom_iw2_ylm, + const int* __restrict__ atom_nw, + const int* __restrict__ iat2it, + const double* __restrict__ rcut, + const double* __restrict__ psi_u, + const double* __restrict__ dpsi_u, + const double3* __restrict__ mgrids_pos, + const int* __restrict__ atoms_iat, + const double3* __restrict__ atoms_bgrids_rcoords, + const int2* __restrict__ atoms_num_info, + 
const int* __restrict__ atoms_phi_start, + const int* __restrict__ bgrids_phi_len, + double* __restrict__ phi); + +__global__ void set_phi_dphi_kernel( + const int nwmax, + const int mgrids_num, + const int nrmax, + const double dr_uniform, + const double* __restrict__ ylmcoef, + const int* __restrict__ ucell_atom_nwl, + const bool* __restrict__ atom_iw2_new, + const int* __restrict__ atom_iw2_ylm, + const int* __restrict__ atom_iw2_l, + const int* __restrict__ atom_nw, + const int* __restrict__ iat2it, + const double* __restrict__ rcut, + const double* __restrict__ psi_u, + const double* __restrict__ dpsi_u, + const double3* __restrict__ mgrids_pos, + const int* __restrict__ atoms_iat, + const double3* __restrict__ atoms_bgrids_rcoords, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ bgrids_phi_len, + double* __restrict__ phi, + double* __restrict__ dphi_x, + double* __restrict__ dphi_y, + double* __restrict__ dphi_z); + +__global__ void set_ddphi_kernel( + const int nwmax, + const int mgrids_num, + const int nrmax, + const double dr_uniform, + const double* __restrict__ ylmcoef, + const int* __restrict__ ucell_atom_nwl, + const bool* __restrict__ atom_iw2_new, + const int* __restrict__ atom_iw2_ylm, + const int* __restrict__ atom_iw2_l, + const int* __restrict__ atom_nw, + const int* __restrict__ iat2it, + const double* __restrict__ rcut, + const double* __restrict__ psi_u, + const double* __restrict__ dpsi_u, + const double3* __restrict__ mgrids_pos, + const int* __restrict__ atoms_iat, + const double3* __restrict__ atoms_bgrids_rcoords, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ bgrids_phi_len, + double* __restrict__ ddphi_xx, + double* __restrict__ ddphi_xy, + double* __restrict__ ddphi_xz, + double* __restrict__ ddphi_yy, + double* __restrict__ ddphi_yz, + double* __restrict__ ddphi_zz); + +__global__ void phi_mul_vldr3_kernel( 
+ const double* __restrict__ vl, + const double dr3, + const double* __restrict__ phi, + const int mgrids_per_bgrid, + const int* __restrict__ mgrids_local_idx, + const int* __restrict__ bgrids_phi_len, + const int* __restrict__ bgrids_phi_start, + double* __restrict__ result); + +// rho(ir) = \sum_{iwt} \phi_i(ir,iwt) * \phi_j^*(ir,iwt) +// each block calculate the dot product of phi_i and phi_j of a meshgrid +__global__ void phi_dot_phi_kernel( + const double* __restrict__ phi_i, // phi_i(ir,iwt) + const double* __restrict__ phi_j, // phi_j(ir,iwt) + const int mgrids_per_bgrid, // the number of mgrids of each biggrid + const int* __restrict__ mgrids_local_idx, // the idx of mgrid in local cell + const int* __restrict__ bgrids_phi_len, // the length of phi on a mgrid of a biggrid + const int* __restrict__ bgrids_phi_start, // the start idx in phi of each biggrid + double* __restrict__ rho); // rho(ir) + +__global__ void phi_dot_dphi_kernel( + const double* __restrict__ phi, + const double* __restrict__ dphi_x, + const double* __restrict__ dphi_y, + const double* __restrict__ dphi_z, + const int mgrids_per_bgrid, + const int* __restrict__ bgrids_phi_len, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ atoms_iat, + const int* __restrict__ iat2it, + const int* __restrict__ atom_nw, + double* force); + +__global__ void phi_dot_dphi_r_kernel( + const double* __restrict__ phi, + const double* __restrict__ dphi_x, + const double* __restrict__ dphi_y, + const double* __restrict__ dphi_z, + const int mgrids_per_bgrid, + const int* __restrict__ bgrids_phi_len, + const int2* __restrict__ atoms_num_info, + const int* __restrict__ atoms_phi_start, + const int* __restrict__ atoms_iat, + const double3* __restrict__ atoms_bgrids_rcoords, + const double3* __restrict__ mgrids_pos, + const int* __restrict__ iat2it, + const int* __restrict__ atom_nw, + double* __restrict__ svl); + +} diff --git 
a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cu b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cu new file mode 100644 index 0000000000..38fba5de00 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cu @@ -0,0 +1,13 @@ +#include "set_const_mem.cuh" +#include "gint_helper.cuh" + +__constant__ double ylmcoe_d[100]; + +namespace ModuleGint +{ + __host__ void set_ylmcoe_d(const double* ylmcoe_h, double** ylmcoe_d_addr) + { + checkCuda(cudaMemcpyToSymbol(ylmcoe_d, ylmcoe_h, sizeof(double) * 100)); + checkCuda(cudaGetSymbolAddress((void**)ylmcoe_d_addr, ylmcoe_d)); + } +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cuh new file mode 100644 index 0000000000..715fa98cde --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/set_const_mem.cuh @@ -0,0 +1,7 @@ +#pragma once +#include + +namespace ModuleGint +{ +__host__ void set_ylmcoe_d(const double* ylmcoe_h, double** ylmcoe_d_addr); +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/kernel/sph.cuh b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/sph.cuh new file mode 100644 index 0000000000..b36828222b --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/temp_gint/kernel/sph.cuh @@ -0,0 +1,396 @@ +#pragma once + +namespace ModuleGint +{ + +static __device__ void sph_harm( + const int nwl, + const double* __restrict__ ylmcoef, + const double x, + const double y, + const double z, + double* __restrict__ ylma +) +{ + /*************************** + L = 0 + ***************************/ + ylma[0] = ylmcoef[0]; // l=0, m=0 + double tmp0; + if (nwl == 0) + return; + + /*************************** + L = 1 + ***************************/ + ylma[1] = ylmcoef[1] * z; // l=1, m=0 + ylma[2] = -ylmcoef[1] * x; // l=1, m=1 + ylma[3] = 
-ylmcoef[1] * y; // l=1, m=-1 + if (nwl == 1) + return; + + /*************************** + L = 2 + ***************************/ + tmp0=ylmcoef[3] * ylma[0]; + ylma[4] = ylmcoef[2] * z * ylma[1] - tmp0 ; // l=2, m=0 + tmp0 = ylmcoef[4] * z; + ylma[5] = tmp0 * ylma[2]; // l=2,m=1 + ylma[6] = tmp0 * ylma[3]; // l=2,m=-1 + + tmp0 = ylmcoef[4] * x; + ylma[7] = ylmcoef[5] * ylma[4] - ylmcoef[6] * ylma[0] + - tmp0 * ylma[2]; // l=2,m=2 + ylma[8] = -tmp0 * ylma[3]; + if (nwl == 2) + return; + + /*************************** + L = 3 + ***************************/ + tmp0=ylmcoef[8] * ylma[1]; + ylma[9] = ylmcoef[7] * z * ylma[4] - tmp0; // l=3, m=0 + + tmp0 = ylmcoef[9] * z; + ylma[10] = tmp0 * ylma[5] - ylmcoef[10] * ylma[2]; // l=3,m=1 + ylma[11] = tmp0 * ylma[6] - ylmcoef[10] * ylma[3]; // l=3,m=-1 + + tmp0 = ylmcoef[11] * z; + ylma[12] = tmp0 * ylma[7]; // l=3,m=2 + ylma[13] = tmp0 * ylma[8]; // l=3,m=-2 + + tmp0 = ylmcoef[14] * x; + ylma[14] = ylmcoef[12] * ylma[10] - ylmcoef[13] * ylma[2] + - tmp0 * ylma[7]; // l=3,m=3 + ylma[15] = ylmcoef[12] * ylma[11] - ylmcoef[13] * ylma[3] + - tmp0 * ylma[8]; // l=3,m=-3 + if (nwl == 3) + return; + + /*************************** + L = 4 + ***************************/ + tmp0=ylmcoef[16] * ylma[4]; + ylma[16] = ylmcoef[15] * z * ylma[9] - tmp0; // l=4,m=0 + + tmp0 = ylmcoef[17] * z; + ylma[17] = tmp0 * ylma[10] - ylmcoef[18] * ylma[5]; // l=4,m=1 + ylma[18] = tmp0 * ylma[11] - ylmcoef[18] * ylma[6]; // l=4,m=-1 + + tmp0 = ylmcoef[19] * z; + ylma[19] = tmp0 * ylma[12] - ylmcoef[20] * ylma[7]; // l=4,m=2 + ylma[20] = tmp0 * ylma[13] - ylmcoef[20] * ylma[8]; // l=4,m=-2 + + tmp0 = 3.0 * z; + ylma[21] = tmp0 * ylma[14]; // l=4,m=3 + ylma[22] = tmp0 * ylma[15]; // l=4,m=-3 + + tmp0 = ylmcoef[23] * x; + ylma[23] = ylmcoef[21] * ylma[19] - ylmcoef[22] * ylma[7] + - tmp0 * ylma[14]; // l=4,m=4 + ylma[24] = ylmcoef[21] * ylma[20] - ylmcoef[22] * ylma[8] + - tmp0 * ylma[15]; // l=4,m=-4 + if (nwl == 4) + return; + + 
/*************************** + L = 5 + ***************************/ + tmp0=ylmcoef[25] * ylma[9]; + ylma[25] + = ylmcoef[24] * z * ylma[16] - tmp0; // l=5,m=0 + + tmp0 = ylmcoef[26] * z; + ylma[26] = tmp0 * ylma[17] - ylmcoef[27] * ylma[10]; // l=5,m=1 + ylma[27] = tmp0 * ylma[18] - ylmcoef[27] * ylma[11]; // l=5,m=-1 + + tmp0 = ylmcoef[28] * z; + ylma[28] = tmp0 * ylma[19] - ylmcoef[29] * ylma[12]; // l=5,m=2 + ylma[29] = tmp0 * ylma[20] - ylmcoef[29] * ylma[13]; // l=5,m=-2 + + tmp0 = ylmcoef[30] * z; + ylma[30] = tmp0 * ylma[21] - ylmcoef[31] * ylma[14]; // l=5,m=3 + ylma[31] = tmp0 * ylma[22] - ylmcoef[31] * ylma[15]; // l=5,m=-3 + + tmp0 = ylmcoef[32] * z; + ylma[32] = tmp0 * ylma[23]; // l=5,m=4 + ylma[33] = tmp0 * ylma[24]; // l=5,m=-4 + + tmp0 = ylmcoef[35] * x; + ylma[34] = ylmcoef[33] * ylma[30] - ylmcoef[34] * ylma[14] + - tmp0 * ylma[23]; // l=5,m=5 + ylma[35] = ylmcoef[33] * ylma[31] - ylmcoef[34] * ylma[15] + - tmp0 * ylma[24]; // l=5,m=-5 + if (nwl == 5) + return; + /* + // if nwl > 5 + for (int il = 6; il <= nwl; il++) + { + int istart = il * il; + int istart1 = (il - 1) * (il - 1); + int istart2 = (il - 2) * (il - 2); + + double fac2 = sqrt(4.0 * istart - 1.0); + double fac4 = sqrt(4.0 * istart1 - 1.0); + + for (int im = 0; im < 2 * il - 1; im++) + { + int imm = (im + 1) / 2; + ylma[istart + im] = fac2 / sqrt((double)istart - imm * imm) * (z + * ylma[istart1 + im] - sqrt((double)istart1 - imm * imm) / fac4 * + ylma[istart2 + im]); + } + + double bl1 = sqrt(2.0 * il / (2.0 * il + 1.0)); + double bl2 = sqrt((2.0 * il - 2.0) / (2.0 * il - 1.0)); + double bl3 = sqrt(2.0) / fac2; + + ylma[istart + 2 * il - 1] = (bl3 * ylma[istart + 2 * il - 5] - bl2 * + ylma[istart2 + 2 * il - 5] - 2.0 * x * ylma[istart1 + 2 * il - 3]) / + bl1; ylma[istart + 2 * il] = (bl3 * ylma[istart + 2 * il - 4] - bl2 * + ylma[istart2 + 2 * il - 4] - 2.0 * x * ylma[istart1 + 2 * il - 2]) / + bl1; + }*/ +} + +static __device__ void grad_rl_sph_harm( + const int nwl, + const double* 
__restrict__ ylmcoef, + const double x, + const double y, + const double z, + double* __restrict__ rly, + double* __restrict__ grly +) +{ + double r2 = x * x + y * y + z * z; + double tx = x * 2; + double ty = y * 2; + double tz = z * 2; + + //begin calculation + /*************************** + L = 0 + ***************************/ + rly[0] = ylmcoef[0]; //l=0, m=0 + grly[0] = grly[1] = grly[2] = 0.0; + if (nwl == 0) return; + + /*************************** + L = 1 + ***************************/ + rly[1] = ylmcoef[1]*z; //l=1, m=0 + grly[3] = grly[4] = 0.0; + grly[5] = ylmcoef[1]; + + rly[2] = -ylmcoef[1]*x; //l=1, m=1 + grly[7] = grly[8] = 0.0; + grly[6] = -ylmcoef[1]; + + rly[3] = -ylmcoef[1]*y; //l=1, m=-1 + grly[9] = grly[11] = 0.0; + grly[10] = -ylmcoef[1]; + + if (nwl == 1) return; + + /*************************** + L = 2 + ***************************/ + rly[4] = ylmcoef[2]*z*rly[1]-ylmcoef[3]*rly[0]*r2;//l=2, m=0 + grly[12] = ylmcoef[2]*z*grly[3]-ylmcoef[3]*(grly[0]*r2+rly[0]*tx);//l=2, m=0 + grly[13] = ylmcoef[2]*z*grly[4]-ylmcoef[3]*(grly[1]*r2+rly[0]*ty);//l=2, m=0 + grly[14] = ylmcoef[2]*(z*grly[5]+rly[1])-ylmcoef[3]*(grly[2]*r2+rly[0]*tz);//l=2, m=0 + + + double tmp0 = ylmcoef[4]*z; + rly[5] = tmp0*rly[2];//l=2,m=1 + grly[15] = tmp0*grly[6]; + grly[16] = tmp0*grly[7]; + grly[17] = ylmcoef[4]*(rly[2]+z*grly[8]); + + rly[6] = tmp0*rly[3];//l=2,m=-1 + grly[18] = tmp0*grly[9]; + grly[19] = tmp0*grly[10]; + grly[20] = ylmcoef[4]*(rly[3]+z*grly[11]); + + double tmp2 = ylmcoef[4]*x; + rly[7]= ylmcoef[5]*rly[4]-ylmcoef[6]*rly[0]*r2 - tmp2*rly[2];//l=2,m=2 + grly[21] = ylmcoef[5]*grly[12]-ylmcoef[6]*(rly[0]*tx+grly[0]*r2)-ylmcoef[4]*(x*grly[6]+rly[2]); + +// std::cout << "\np1 = "<< ylmcoef[5]*grly[12] << " p2 = " << -ylmcoef[6]*rly[0]*tx +// << " p3 = " << -ylmcoef[4]*x*grly[6] << " p4 = " << -ylmcoef[4]*rly[2] << std::endl; + + grly[22] = ylmcoef[5]*grly[13]-ylmcoef[6]*(rly[0]*ty+grly[1]*r2)-tmp2*grly[7]; + grly[23] = 
ylmcoef[5]*grly[14]-ylmcoef[6]*(rly[0]*tz+grly[2]*r2)-tmp2*grly[8]; + + rly[8] = -tmp2*rly[3]; + grly[24] = -ylmcoef[4]*(rly[3]+x*grly[9]); + grly[25] = -tmp2*grly[10]; + grly[26] = -tmp2*grly[11]; +// rly[8] = tmp1+tmp2*rly[3];//l=2,m=-2 + if (nwl == 2) return; + + /*************************** + L = 3 + ***************************/ + rly[9] = ylmcoef[7]*z*rly[4]-ylmcoef[8]*rly[1]*r2; //l=3, m=0 + grly[27] = ylmcoef[7]*z*grly[12]-ylmcoef[8]*(rly[1]*tx+grly[3]*r2); + grly[28] = ylmcoef[7]*z*grly[13]-ylmcoef[8]*(rly[1]*ty+grly[4]*r2); + grly[29] = ylmcoef[7]*(rly[4]+z*grly[14])-ylmcoef[8]*(rly[1]*tz+grly[5]*r2); + + double tmp3 = ylmcoef[9]*z; + rly[10] = tmp3*rly[5]-ylmcoef[10]*rly[2]*r2;//l=3,m=1 + grly[30] = tmp3*grly[15]-ylmcoef[10]*(grly[6]*r2+rly[2]*tx); + grly[31] = tmp3*grly[16]-ylmcoef[10]*(grly[7]*r2+rly[2]*ty); + grly[32] = ylmcoef[9]*(z*grly[17]+rly[5])-ylmcoef[10]*(grly[8]*r2+rly[2]*tz); + + rly[11] = tmp3*rly[6]-ylmcoef[10]*rly[3]*r2;//l=3,m=-1 + grly[33] = tmp3*grly[18]-ylmcoef[10]*(grly[9]*r2+rly[3]*tx); + grly[34] = tmp3*grly[19]-ylmcoef[10]*(grly[10]*r2+rly[3]*ty); + grly[35] = ylmcoef[9]*(z*grly[20]+rly[6])-ylmcoef[10]*(grly[11]*r2+rly[3]*tz); + + double tmp4 = ylmcoef[11]*z; + rly[12] = tmp4*rly[7];//l=3,m=2 + grly[36] = tmp4*grly[21]; + grly[37] = tmp4*grly[22]; + grly[38] = ylmcoef[11]*(z*grly[23]+rly[7]); + + rly[13] = tmp4*rly[8];//l=3,m=-2 + grly[39] = tmp4*grly[24]; + grly[40] = tmp4*grly[25]; + grly[41] = ylmcoef[11]*(z*grly[26]+rly[8]); + + double tmp5 = ylmcoef[14]*x; + rly[14] = ylmcoef[12]*rly[10]-ylmcoef[13]*rly[2]*r2-tmp5*rly[7];//l=3,m=3 + grly[42] = ylmcoef[12]*grly[30]-ylmcoef[13]*(rly[2]*tx+grly[6]*r2)-ylmcoef[14]*(rly[7]+x*grly[21]); + grly[43] = ylmcoef[12]*grly[31]-ylmcoef[13]*(rly[2]*ty+grly[7]*r2)-tmp5*grly[22]; + grly[44] = ylmcoef[12]*grly[32]-ylmcoef[13]*(rly[2]*tz+grly[8]*r2)-tmp5*grly[23]; + + rly[15] = ylmcoef[12]*rly[11]-ylmcoef[13]*rly[3]*r2-tmp5*rly[8];//l=3,m=-3 + grly[45] = 
ylmcoef[12]*grly[33]-ylmcoef[13]*(rly[3]*tx+grly[9]*r2)-ylmcoef[14]*(rly[8]+x*grly[24]); + grly[46] = ylmcoef[12]*grly[34]-ylmcoef[13]*(rly[3]*ty+grly[10]*r2)-tmp5*grly[25]; + grly[47] = ylmcoef[12]*grly[35]-ylmcoef[13]*(rly[3]*tz+grly[11]*r2)-tmp5*grly[26]; + if (nwl == 3) return; + + /*************************** + L = 4 + ***************************/ + rly[16] = ylmcoef[15]*z*rly[9]-ylmcoef[16]*rly[4]*r2;//l=4,m=0 + grly[48] = ylmcoef[15]*z*grly[27]-ylmcoef[16]*(rly[4]*tx+grly[12]*r2); + grly[49] = ylmcoef[15]*z*grly[28]-ylmcoef[16]*(rly[4]*ty+grly[13]*r2); + grly[50] = ylmcoef[15]*(z*grly[29]+rly[9])-ylmcoef[16]*(rly[4]*tz+grly[14]*r2); + + double tmp6 = ylmcoef[17]*z; + rly[17] = tmp6*rly[10]-ylmcoef[18]*rly[5]*r2;//l=4,m=1 + grly[51] = tmp6*grly[30]-ylmcoef[18]*(rly[5]*tx+grly[15]*r2); + grly[52] = tmp6*grly[31]-ylmcoef[18]*(rly[5]*ty+grly[16]*r2); + grly[53] = ylmcoef[17]*(z*grly[32]+rly[10])-ylmcoef[18]*(rly[5]*tz+grly[17]*r2); + + rly[18] = tmp6*rly[11]-ylmcoef[18]*rly[6]*r2;//l=4,m=-1 + grly[54] = tmp6*grly[33]-ylmcoef[18]*(rly[6]*tx+grly[18]*r2); + grly[55] = tmp6*grly[34]-ylmcoef[18]*(rly[6]*ty+grly[19]*r2); + grly[56] = ylmcoef[17]*(z*grly[35]+rly[11])-ylmcoef[18]*(rly[6]*tz+grly[20]*r2); + + double tmp7 = ylmcoef[19]*z; + rly[19] = tmp7*rly[12]-ylmcoef[20]*rly[7]*r2;//l=4,m=2 + grly[57] = tmp7*grly[36]-ylmcoef[20]*(rly[7]*tx+grly[21]*r2); + grly[58] = tmp7*grly[37]-ylmcoef[20]*(rly[7]*ty+grly[22]*r2); + grly[59] = ylmcoef[19]*(z*grly[38]+rly[12])-ylmcoef[20]*(rly[7]*tz+grly[23]*r2); + + rly[20] = tmp7*rly[13]-ylmcoef[20]*rly[8]*r2;//l=4,m=-2 + grly[60] = tmp7*grly[39]-ylmcoef[20]*(rly[8]*tx+grly[24]*r2); + grly[61] = tmp7*grly[40]-ylmcoef[20]*(rly[8]*ty+grly[25]*r2); + grly[62] = ylmcoef[19]*(z*grly[41]+rly[13])-ylmcoef[20]*(rly[8]*tz+grly[26]*r2); + + double tmp8 = 3.0*z; + rly[21] = tmp8*rly[14];//l=4,m=3 + grly[63] = tmp8*grly[42]; + grly[64] = tmp8*grly[43]; + grly[65] = 3.0*(z*grly[44]+rly[14]); + + + rly[22] = tmp8*rly[15];//l=4,m=-3 + grly[66] = 
tmp8*grly[45]; + grly[67] = tmp8*grly[46]; + grly[68] = 3.0*(z*grly[47]+rly[15]); + + double tmp9 = ylmcoef[23]*x; + rly[23] = ylmcoef[21]*rly[19]-ylmcoef[22]*rly[7]*r2-tmp9*rly[14];//l=4,m=4 + grly[69] = ylmcoef[21]*grly[57]-ylmcoef[22]*(rly[7]*tx+grly[21]*r2)-ylmcoef[23]*(x*grly[42]+rly[14]); + grly[70] = ylmcoef[21]*grly[58]-ylmcoef[22]*(rly[7]*ty+grly[22]*r2)-tmp9*grly[43]; + grly[71] = ylmcoef[21]*grly[59]-ylmcoef[22]*(rly[7]*tz+grly[23]*r2)-tmp9*grly[44]; + + rly[24] = ylmcoef[21]*rly[20]-ylmcoef[22]*rly[8]*r2-tmp9*rly[15];//l=4,m=-4 + grly[72] = ylmcoef[21]*grly[60]-ylmcoef[22]*(rly[8]*tx+grly[24]*r2)-ylmcoef[23]*(x*grly[45]+rly[15]); + grly[73] = ylmcoef[21]*grly[61]-ylmcoef[22]*(rly[8]*ty+grly[25]*r2)-tmp9*grly[46]; + grly[74] = ylmcoef[21]*grly[62]-ylmcoef[22]*(rly[8]*tz+grly[26]*r2)-tmp9*grly[47]; + + if (nwl == 4) return; + + /*************************** + L = 5 + ***************************/ + rly[25] = ylmcoef[24]*z*rly[16]-ylmcoef[25]*rly[9]*r2;//l=5,m=0 + grly[75] = ylmcoef[24]*z*grly[48]-ylmcoef[25]*(rly[9]*tx+grly[27]*r2); + grly[76] = ylmcoef[24]*z*grly[49]-ylmcoef[25]*(rly[9]*ty+grly[28]*r2); + grly[77] = ylmcoef[24]*(z*grly[50]+rly[16])-ylmcoef[25]*(rly[9]*tz+grly[29]*r2); + + double tmp10 = ylmcoef[26]*z; + rly[26] = tmp10*rly[17]-ylmcoef[27]*rly[10]*r2;//l=5,m=1 + grly[78] = tmp10*grly[51]-ylmcoef[27]*(rly[10]*tx+grly[30]*r2); + grly[79] = tmp10*grly[52]-ylmcoef[27]*(rly[10]*ty+grly[31]*r2); + grly[80] = ylmcoef[26]*(z*grly[53]+rly[17])-ylmcoef[27]*(rly[10]*tz+grly[32]*r2); + + rly[27] = tmp10*rly[18]-ylmcoef[27]*rly[11]*r2;//l=5,m=-1 + grly[81] = tmp10*grly[54]-ylmcoef[27]*(rly[11]*tx+grly[33]*r2); + grly[82] = tmp10*grly[55]-ylmcoef[27]*(rly[11]*ty+grly[34]*r2); + grly[83] = ylmcoef[26]*(z*grly[56]+rly[18])-ylmcoef[27]*(rly[11]*tz+grly[35]*r2); + + double tmp11 = ylmcoef[28]*z; + rly[28] = tmp11*rly[19]-ylmcoef[29]*rly[12]*r2;//l=5,m=2 + grly[84] = tmp11*grly[57]-ylmcoef[29]*(rly[12]*tx+grly[36]*r2); + grly[85] = 
tmp11*grly[58]-ylmcoef[29]*(rly[12]*ty+grly[37]*r2); + grly[86] = ylmcoef[28]*(z*grly[59]+rly[19])-ylmcoef[29]*(rly[12]*tz+grly[38]*r2); + + rly[29] = tmp11*rly[20]-ylmcoef[29]*rly[13]*r2;//l=5,m=-2 + grly[87] = tmp11*grly[60]-ylmcoef[29]*(rly[13]*tx+grly[39]*r2); + grly[88] = tmp11*grly[61]-ylmcoef[29]*(rly[13]*ty+grly[40]*r2); + grly[89] = ylmcoef[28]*(z*grly[62]+rly[20])-ylmcoef[29]*(rly[13]*tz+grly[41]*r2); + + double tmp12 = ylmcoef[30]*z; + rly[30] = tmp12*rly[21]-ylmcoef[31]*rly[14]*r2;//l=5,m=3 + grly[90] = tmp12*grly[63]-ylmcoef[31]*(grly[42]*r2+rly[14]*tx); + grly[91] = tmp12*grly[64]-ylmcoef[31]*(grly[43]*r2+rly[14]*ty); + grly[92] = ylmcoef[30]*(z*grly[65]+rly[21])-ylmcoef[31]*(grly[44]*r2+rly[14]*tz); + + rly[31] = tmp12*rly[22]-ylmcoef[31]*rly[15]*r2;//l=5,m=-3 + grly[93] = tmp12*grly[66]-ylmcoef[31]*(grly[45]*r2+rly[15]*tx); + grly[94] = tmp12*grly[67]-ylmcoef[31]*(grly[46]*r2+rly[15]*ty); + grly[95] = ylmcoef[30]*(z*grly[68]+rly[22])-ylmcoef[31]*(grly[47]*r2+rly[15]*tz); + + double tmp13 = ylmcoef[32]*z; + rly[32] = tmp13*rly[23];//l=5,m=4 + grly[96] = tmp13*grly[69]; + grly[97] = tmp13*grly[70]; + grly[98] = ylmcoef[32]*(rly[23]+z*grly[71]); + + rly[33] = tmp13*rly[24];//l=5,m=-4 + grly[99] = tmp13*grly[72]; + grly[100] = tmp13*grly[73]; + grly[101] = ylmcoef[32]*(rly[24]+z*grly[74]); + + double tmp14 = ylmcoef[35]*x; + rly[34] = ylmcoef[33]*rly[30]-ylmcoef[34]*rly[14]*r2-tmp14*rly[23];//l=5,m=5 + grly[102] = ylmcoef[33]*grly[90]-ylmcoef[34]*(rly[14]*tx+grly[42]*r2)-ylmcoef[35]*(x*grly[69]+rly[23]); + grly[103] = ylmcoef[33]*grly[91]-ylmcoef[34]*(rly[14]*ty+grly[43]*r2)-tmp14*grly[70]; + grly[104] = ylmcoef[33]*grly[92]-ylmcoef[34]*(rly[14]*tz+grly[44]*r2)-tmp14*grly[71]; + + rly[35] = ylmcoef[33]*rly[31]-ylmcoef[34]*rly[15]*r2-tmp14*rly[24];//l=5,m=-5 + grly[105] = ylmcoef[33]*grly[93]-ylmcoef[34]*(rly[15]*tx+grly[45]*r2)-ylmcoef[35]*(x*grly[72]+rly[24]); + grly[106] = ylmcoef[33]*grly[94]-ylmcoef[34]*(rly[15]*ty+grly[46]*r2)-tmp14*grly[73]; + 
grly[107] = ylmcoef[33]*grly[95]-ylmcoef[34]*(rly[15]*tz+grly[47]*r2)-tmp14*grly[74]; + + if (nwl == 5) return; +} +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/localcell_info.h b/source/module_hamilt_lcao/module_gint/temp_gint/localcell_info.h index f24d1194b4..0c146b86ab 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/localcell_info.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/localcell_info.h @@ -17,16 +17,16 @@ class LocalCellInfo std::shared_ptr unitcell_info); // getter functions - int get_startidx_bx() const { return startidx_bx_; }; - int get_startidx_by() const { return startidx_by_; }; - int get_startidx_bz() const { return startidx_bz_; }; - int get_nbx() const { return nbx_; }; - int get_nby() const { return nby_; }; - int get_nbz() const { return nbz_; }; - int get_bgrids_num() const { return nbxyz_; }; - int get_mgrids_num() const { return nmxyz_; }; - std::shared_ptr get_unitcell_info() const { return unitcell_info_; }; - std::shared_ptr get_bgrid_info() const { return unitcell_info_->get_bgrid_info(); }; + int get_startidx_bx() const { return startidx_bx_; } + int get_startidx_by() const { return startidx_by_; } + int get_startidx_bz() const { return startidx_bz_; } + int get_nbx() const { return nbx_; } + int get_nby() const { return nby_; } + int get_nbz() const { return nbz_; } + int get_bgrids_num() const { return nbxyz_; } + int get_mgrids_num() const { return nmxyz_; } + std::shared_ptr get_unitcell_info() const { return unitcell_info_; } + std::shared_ptr get_bgrid_info() const { return unitcell_info_->get_bgrid_info(); } //==================================================================== // functions related to the big grid diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/meshgrid_info.h b/source/module_hamilt_lcao/module_gint/temp_gint/meshgrid_info.h index 99376c9a20..a8307b1048 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/meshgrid_info.h 
+++ b/source/module_hamilt_lcao/module_gint/temp_gint/meshgrid_info.h @@ -35,10 +35,10 @@ class MeshGridInfo meshgrid_GT_ = meshgrid_latvec0_.Inverse(); meshgrid_volume_ = std::abs(meshgrid_latvec0_.Det()); - }; - - double get_volume() const { return meshgrid_volume_; }; - Vec3d get_cartesian_coord(const Vec3i& index_3d) const { return index_3d * meshgrid_latvec0_; }; + } + + double get_volume() const { return meshgrid_volume_; } + Vec3d get_cartesian_coord(const Vec3i& index_3d) const { return index_3d * meshgrid_latvec0_; } Vec3d get_direct_coord(const Vec3d& cart_coord) const { return cart_coord * meshgrid_GT_; } private: diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.cpp index d714546864..5df52f9453 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.cpp @@ -9,7 +9,7 @@ void PhiOperator::set_bgrid(std::shared_ptr biggrid) { biggrid_ = biggrid; rows_ = biggrid_->get_mgrids_num(); - cols_ = biggrid_->get_mgrid_phi_len(); + cols_ = biggrid_->get_phi_len(); biggrid_->set_atoms_startidx(atoms_startidx_); biggrid_->set_atoms_phi_len(atoms_phi_len_); @@ -18,14 +18,13 @@ void PhiOperator::set_bgrid(std::shared_ptr biggrid) // init is_atom_on_mgrid_ and atoms_relative_coords_ const int atoms_num = biggrid_->get_atoms_num(); atoms_relative_coords_.resize(atoms_num); - is_atom_on_mgrid_.resize(atoms_num); + is_atom_on_mgrid_.resize(biggrid_->get_mgrids_num() * atoms_num); for(int i = 0; i < atoms_num; ++i) { biggrid_->set_atom_relative_coords(biggrid_->get_atom(i), atoms_relative_coords_[i]); - is_atom_on_mgrid_[i].resize(rows_); for(int j = 0; j < rows_; ++j) { - is_atom_on_mgrid_[i][j] = atoms_relative_coords_[i][j].norm() <= biggrid_->get_atom(i)->get_rcut(); + is_atom_on_mgrid_[i * rows_ + j] = atoms_relative_coords_[i][j].norm() <= biggrid_->get_atom(i)->get_rcut(); } } @@ -109,10 
+108,10 @@ void PhiOperator::phi_dot_dphi_r( for(int j = 0; j < biggrid_->get_atoms_num(); ++j) { const int start_idx = atoms_startidx_[j]; + const Vec3d& r3 = atoms_relative_coords_[j][i]; for(int k = 0; k < atoms_phi_len_[j]; ++k) { const int idx = i * cols_ + start_idx + k; - const Vec3d& r3 = atoms_relative_coords_[j][i]; const double phi_val = phi[idx]; sxx += phi_val * dphi_x[idx] * r3[0]; sxy += phi_val * dphi_x[idx] * r3[1]; @@ -131,6 +130,86 @@ void PhiOperator::phi_dot_dphi_r( svl[0](2, 2) += szz * 2; } +void PhiOperator::cal_env_gamma( + const double* phi, + const double* wfc, + const vector& trace_lo, + double* rho) const +{ + for(int i = 0; i < biggrid_->get_atoms_num(); ++i) + { + const auto atom = biggrid_->get_atom(i); + const int iw_start = atom->get_start_iw(); + const int start_idx = atoms_startidx_[i]; + for(int j = 0; j < biggrid_->get_mgrids_num(); ++j) + { + if(is_atom_on_mgrid(i, j)) + { + double tmp = 0.0; + int iw_lo = trace_lo[iw_start]; + for(int iw = 0; iw < atom->get_nw(); ++iw, ++iw_lo) + { + tmp += phi[j * cols_ + start_idx + iw] * wfc[iw_lo]; + } + rho[meshgrids_local_idx_[j]] += tmp; + } + } + } +} + +void PhiOperator::cal_env_k( + const double* phi, + const std::complex* wfc, + const vector& trace_lo, + const int ik, + const int nspin, + const int npol, + const int lgd, + const std::vector& kvec_c, + const std::vector& kvec_d, + double* rho) const +{ + for(int i = 0; i < biggrid_->get_atoms_num(); ++i) + { + const auto atom = biggrid_->get_atom(i); + const int iw_start = atom->get_start_iw(); + const Vec3d R(atom->get_unitcell_idx()); + const double arg = (kvec_d[ik] * R) * ModuleBase::TWO_PI; + const std::complex kphase = std::complex(cos(arg), sin(arg)); + const int start_idx = atoms_startidx_[i]; + for(int j = 0; j < biggrid_->get_mgrids_num(); ++j) + { + if(is_atom_on_mgrid(i, j)) + { + std::complex tmp{0.0, 0.0}; + int phi_start_idx = j * cols_ + start_idx; + + int iw_lo = 0; + if (nspin == 4) // is it a simple add of 2 
spins? + { + for (int is = 0; is < 2; ++is) + { + iw_lo = trace_lo[iw_start] / npol + lgd / npol * is; + for (int iw = 0; iw < atom->get_nw(); ++iw, ++iw_lo) + { + tmp += std::complex(phi[phi_start_idx + iw], 0.0) * wfc[iw_lo] * kphase; + } + } + } + else + { + iw_lo = trace_lo[iw_start]; + for (int iw = 0; iw < atom->get_nw(); ++iw, ++iw_lo) + { + tmp += std::complex(phi[phi_start_idx + iw], 0.0) * wfc[iw_lo] * kphase; + } + } + rho[meshgrids_local_idx_[j]] += tmp.real(); + } + } + } +} + //=============================== // private methods @@ -150,7 +229,7 @@ void PhiOperator::init_atom_pair_start_end_idx_() int end_idx = -1; for(int mgrid_idx = 0; mgrid_idx < mgrids_num; ++mgrid_idx) { - if(is_atom_on_mgrid_[i][mgrid_idx] && is_atom_on_mgrid_[j][mgrid_idx]) + if(is_atom_on_mgrid(i, mgrid_idx) && is_atom_on_mgrid(j, mgrid_idx)) { start_idx = mgrid_idx; break; @@ -158,7 +237,7 @@ void PhiOperator::init_atom_pair_start_end_idx_() } for(int mgrid_idx = mgrids_num - 1; mgrid_idx >= 0; --mgrid_idx) { - if(is_atom_on_mgrid_[i][mgrid_idx] && is_atom_on_mgrid_[j][mgrid_idx]) + if(is_atom_on_mgrid(i, mgrid_idx) && is_atom_on_mgrid(j, mgrid_idx)) { end_idx = mgrid_idx; break; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.h b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.h index 5b5366e701..48044e0014 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include "module_hamilt_lcao/module_hcontainer/hcontainer.h" #include "big_grid.h" namespace ModuleGint @@ -28,8 +28,8 @@ class PhiOperator void set_bgrid(std::shared_ptr biggrid); // getter - int get_rows() const {return rows_;}; - int get_cols() const {return cols_;}; + int get_rows() const {return rows_;} + int get_cols() const {return cols_;} // get phi of the big grid // the dimension of phi is num_mgrids * 
(\sum_{i=0}^{atoms_->size()} atoms_[i]->nw) @@ -93,6 +93,24 @@ class PhiOperator const double* dphi_z, ModuleBase::matrix *svl) const; + void cal_env_gamma( + const double* phi, + const double* wfc, + const vector& trace_lo, + double* rho) const; + + void cal_env_k( + const double* phi, + const std::complex* wfc, + const vector& trace_lo, + const int ik, + const int nspin, + const int npol, + const int lgd, + const std::vector& kvec_c, + const std::vector& kvec_d, + double* rho) const; + private: void init_atom_pair_start_end_idx_(); @@ -103,14 +121,19 @@ class PhiOperator int x = std::min(a, b); int y = std::abs(a - b); return atom_pair_start_end_idx_[(2 * biggrid_->get_atoms_num() - x + 1) * x / 2 + y]; - }; + } + + bool is_atom_on_mgrid(int atom_idx, int mgrid_idx) const + { + return is_atom_on_mgrid_[atom_idx * rows_ + mgrid_idx]; + } // the row number of the phi matrix // rows_ = biggrid_->get_mgrids_num() int rows_; // the column number of the phi matrix - // cols_ = biggrid_->get_mgrid_phi_len() + // cols_ = biggrid_->get_phi_len() int cols_; // the local index of the meshgrids @@ -124,9 +147,8 @@ class PhiOperator std::vector> atoms_relative_coords_; // record whether the atom affects the meshgrid - // is_atom_on_mgrid_[i][j] = true if the ith atom affects the jth meshgrid, otherwise false - // FIXME,std::vector> is not a efficient data structure, we can use a 1D array to replace it. 
- std::vector> is_atom_on_mgrid_; + // is_atom_on_mgrid_[i * rows_ + j] = true if the ith atom affects jhe ith meshgrid, otherwise false + std::vector is_atom_on_mgrid_; // the start index of the phi of each atom std::vector atoms_startidx_; diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.hpp b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.hpp index 44603560d2..79c4b29c23 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.hpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/phi_operator.hpp @@ -103,7 +103,7 @@ void PhiOperator::phi_mul_vldr3( } } -// hr(iwt_i,iwt_j) = \sum_{ir} phi_i(ir,iwt_i) * phi_i(ir,iwt_j) +// hr(iwt_i,iwt_j) += \sum_{ir} phi_i(ir,iwt_i) * phi_i(ir,iwt_j) // this is a thread-safe function template void PhiOperator::phi_mul_phi( diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/set_ddphi.cpp b/source/module_hamilt_lcao/module_gint/temp_gint/set_ddphi.cpp index 4d01acc262..c84f087487 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/set_ddphi.cpp +++ b/source/module_hamilt_lcao/module_gint/temp_gint/set_ddphi.cpp @@ -20,24 +20,6 @@ void GintAtom::set_ddphi( // orb_ does not have the member variable dr_uniform const double dr_uniform = orb_->PhiLN(0, 0).dr_uniform; - // store the pointer to reduce repeated address fetching - std::vector p_psi_uniform(atom_->nw); - std::vector p_dpsi_uniform(atom_->nw); - std::vector p_ddpsi_uniform(atom_->nw); - std::vector phi_nr_uniform(atom_->nw); - for (int iw=0; iw< atom_->nw; ++iw) - { - if ( atom_->iw2_new[iw] ) - { - int l = atom_->iw2l[iw]; - int n = atom_->iw2n[iw]; - p_psi_uniform[iw] = orb_->PhiLN(l, n).psi_uniform.data(); - p_dpsi_uniform[iw] = orb_->PhiLN(l, n).dpsi_uniform.data(); - p_ddpsi_uniform[iw] = orb_->PhiLN(l, n).ddpsi_uniform.data(); - phi_nr_uniform[iw] = orb_->PhiLN(l, n).nr_uniform; - } - } - std::vector rly(std::pow(atom_->nwl + 1, 2)); ModuleBase::Array_Pool grly(std::pow(atom_->nwl + 1, 2), 
3); // TODO: A better data structure such as a 3D tensor can be used to store dphi @@ -96,24 +78,15 @@ void GintAtom::set_ddphi( { if(atom_->iw2_new[iw]) { - auto psi_uniform = p_psi_uniform[iw]; - auto dpsi_uniform = p_dpsi_uniform[iw]; - - if(ip >= phi_nr_uniform[iw] - 4) - { - tmp = dtmp = 0.0; - } - else - { - // use Polynomia Interpolation method to get the - // wave functions - - tmp = x12 * (psi_uniform[ip] * x3 + psi_uniform[ip + 3] * x0) - + x03 * (psi_uniform[ip + 1] * x2 - psi_uniform[ip + 2] * x1); - - dtmp = x12 * (dpsi_uniform[ip] * x3 + dpsi_uniform[ip + 3] * x0) - + x03 * (dpsi_uniform[ip + 1] * x2 - dpsi_uniform[ip + 2] * x1); - } + auto psi_uniform = p_psi_uniform_[iw]; + auto dpsi_uniform = p_dpsi_uniform_[iw]; + // use Polynomia Interpolation method to get the + // wave functions + tmp = x12 * (psi_uniform[ip] * x3 + psi_uniform[ip + 3] * x0) + + x03 * (psi_uniform[ip + 1] * x2 - psi_uniform[ip + 2] * x1); + + dtmp = x12 * (dpsi_uniform[ip] * x3 + dpsi_uniform[ip + 3] * x0) + + x03 * (dpsi_uniform[ip + 1] * x2 - dpsi_uniform[ip + 2] * x1); } // get the 'l' of this localized wave function diff --git a/source/module_hamilt_lcao/module_gint/temp_gint/unitcell_info.h b/source/module_hamilt_lcao/module_gint/temp_gint/unitcell_info.h index b75806fa2a..df1e88b38c 100644 --- a/source/module_hamilt_lcao/module_gint/temp_gint/unitcell_info.h +++ b/source/module_hamilt_lcao/module_gint/temp_gint/unitcell_info.h @@ -21,16 +21,16 @@ class UnitCellInfo int nmx, int nmy, int nmz); // getter functions - int get_nbx() const { return nbx_; }; - int get_nby() const { return nby_; }; - int get_nbz() const { return nbz_; }; - int get_bgrids_num() const { return nbxyz_; }; - int get_nmx() const { return nmx_; }; - int get_nmy() const { return nmy_; }; - int get_nmz() const { return nmz_; }; - int get_mgrids_num() const { return nmxyz_; }; - std::shared_ptr get_bgrid_info() const { return biggrid_info_; }; - std::shared_ptr get_mgrid_info() const { return 
meshgrid_info_; }; + int get_nbx() const { return nbx_; } + int get_nby() const { return nby_; } + int get_nbz() const { return nbz_; } + int get_bgrids_num() const { return nbxyz_; } + int get_nmx() const { return nmx_; } + int get_nmy() const { return nmy_; } + int get_nmz() const { return nmz_; } + int get_mgrids_num() const { return nmxyz_; } + std::shared_ptr get_bgrid_info() const { return biggrid_info_; } + std::shared_ptr get_mgrid_info() const { return meshgrid_info_; } //==================================================================== // functions related to the big grid @@ -40,25 +40,25 @@ class UnitCellInfo Vec3i bgrid_idx_1Dto3D(const int index_1d) const { return index1Dto3D(index_1d, nbx_, nby_, nbz_); - }; + } // transform the 3D index of a biggrid in the unit cell to the 1D index int bgrid_idx_3Dto1D(const Vec3i index_3d) const { return index3Dto1D(index_3d.x, index_3d.y, index_3d.z, nbx_, nby_, nbz_); - }; + } // get the cartesian coordinate of a big grid in the unit cell from the 3D index Vec3d get_bgrid_coord(Vec3i index_3d) const { return biggrid_info_->get_cartesian_coord(index_3d); - }; + } // get the cartesian coordinate of a big grid in the unit cell from the 1D index Vec3d get_bgrid_coord(int index_1d) const { return get_bgrid_coord(bgrid_idx_1Dto3D(index_1d)); - }; + } // get the 3D index of a big grid in the unit cell from the cartesian coordinate Vec3i get_bgrid_idx_3d(const Vec3d coord) const @@ -68,7 +68,7 @@ class UnitCellInfo static_cast(floor(direct_coord.x)), static_cast(floor(direct_coord.y)), static_cast(floor(direct_coord.z))); - }; + } // Get the relative Cartesian coordinates of big grid A relative to big grid B // returned vector = coordinates of point A - coordinates of point B @@ -77,7 +77,7 @@ class UnitCellInfo Vec3d get_relative_coord(Vec3i index_3d_a, Vec3i index_3d_b) const { return get_bgrid_coord(index_3d_a - index_3d_b); - }; + } // get the extended unitcell index of a big grid Vec3i get_unitcell_idx(const Vec3i 
index_3d) const @@ -85,7 +85,7 @@ class UnitCellInfo return Vec3i(floor_div(index_3d.x, nbx_), floor_div(index_3d.y, nby_), floor_div(index_3d.z, nbz_)); - }; + } // map the extended big grid index to the big grid index in unitcell Vec3i map_ext_idx_to_ucell(const Vec3i index_3d) const @@ -93,7 +93,7 @@ class UnitCellInfo return Vec3i(index_3d.x - floor_div(index_3d.x, nbx_) * nbx_, index_3d.y - floor_div(index_3d.y, nby_) * nby_, index_3d.z - floor_div(index_3d.z, nbz_) * nbz_); - }; + } //==================================================================== @@ -116,7 +116,7 @@ class UnitCellInfo Vec3d get_mgrid_coord(Vec3i index_3d) const { return meshgrid_info_->get_cartesian_coord(index_3d); - }; + } // get the cartesian coordinate of a meshgrid in the unit cell from the 1D index Vec3d get_mgrid_coord(int index_1d) const diff --git a/source/module_hamilt_lcao/module_hcontainer/hcontainer.cpp b/source/module_hamilt_lcao/module_hcontainer/hcontainer.cpp index 54250a2ce2..89b71306a1 100644 --- a/source/module_hamilt_lcao/module_hcontainer/hcontainer.cpp +++ b/source/module_hamilt_lcao/module_hcontainer/hcontainer.cpp @@ -16,6 +16,9 @@ HContainer::~HContainer() } } +template +HContainer::HContainer() {} + // copy constructor template HContainer::HContainer(const HContainer& HR_in, T* data_array) @@ -35,17 +38,38 @@ HContainer::HContainer(const HContainer& HR_in, T* data_array) // move constructor template -HContainer::HContainer(HContainer&& HR_in) +HContainer::HContainer(HContainer&& HR_in) noexcept { this->atom_pairs = std::move(HR_in.atom_pairs); this->sparse_ap = std::move(HR_in.sparse_ap); this->sparse_ap_index = std::move(HR_in.sparse_ap_index); + this->wrapper_pointer = HR_in.wrapper_pointer; this->gamma_only = HR_in.gamma_only; this->paraV = HR_in.paraV; this->current_R = -1; + HR_in.wrapper_pointer = nullptr; // tmp terms not moved } +// move assignment +template +HContainer& HContainer::operator=(HContainer&& HR_in) noexcept +{ + if (this != &HR_in) + { + 
this->atom_pairs = std::move(HR_in.atom_pairs); + this->sparse_ap = std::move(HR_in.sparse_ap); + this->sparse_ap_index = std::move(HR_in.sparse_ap_index); + this->wrapper_pointer = HR_in.wrapper_pointer; + this->gamma_only = HR_in.gamma_only; + this->paraV = HR_in.paraV; + this->current_R = -1; + + HR_in.wrapper_pointer = nullptr; + } + return *this; +} + // simple constructor template HContainer::HContainer(int natom) diff --git a/source/module_hamilt_lcao/module_hcontainer/hcontainer.h b/source/module_hamilt_lcao/module_hcontainer/hcontainer.h index cf50e7c263..edaca9577e 100644 --- a/source/module_hamilt_lcao/module_hcontainer/hcontainer.h +++ b/source/module_hamilt_lcao/module_hcontainer/hcontainer.h @@ -146,6 +146,8 @@ class HContainer // Destructor of class HContainer ~HContainer(); + HContainer(); + /** * @brief copy constructor * when data_array is not nullptr, new HContainer will be wrapper for data_array @@ -154,7 +156,9 @@ class HContainer HContainer(const HContainer& HR_in, T* data_array = nullptr); // move constructor - HContainer(HContainer&& HR_in); + HContainer(HContainer&& HR_in) noexcept; + // move assignment + HContainer& operator=(HContainer&& HR_in) noexcept; // simple constructor HContainer(int natom); diff --git a/source/module_io/cal_ldos.cpp b/source/module_io/cal_ldos.cpp index 4826e65cc0..31133cfa81 100644 --- a/source/module_io/cal_ldos.cpp +++ b/source/module_io/cal_ldos.cpp @@ -60,7 +60,7 @@ void Cal_ldos::cal_ldos_lcao(const elecstate::ElecStateLCAO* pelec, } // calculate ldos -#ifndef __NEW_GINT +#ifdef __OLD_GINT ModuleBase::WARNING_QUIT("Cal_ldos::dm2ldos", "do not support old grid integral, please recompile with __NEW_GINT"); #else diff --git a/source/module_io/get_pchg_lcao.cpp b/source/module_io/get_pchg_lcao.cpp index b8b84f3125..316ffdb3f6 100644 --- a/source/module_io/get_pchg_lcao.cpp +++ b/source/module_io/get_pchg_lcao.cpp @@ -3,6 +3,7 @@ #include "module_io/cube_io.h" #include "source_estate/module_charge/symmetry_rho.h" 
#include "source_estate/module_dm/cal_dm_psi.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_interface.h" Get_pchg_lcao::Get_pchg_lcao(psi::Psi* psi_gamma_in, const Parallel_Orbitals* ParaV_in) : psi_gamma(psi_gamma_in), ParaV(ParaV_in) @@ -69,10 +70,14 @@ void Get_pchg_lcao::begin(Gint_Gamma& gg, DM.init_DMR(GridD_in, ucell_in); DM.cal_DMR(); +#ifdef __OLD_GINT gg.initialize_pvpR(*ucell_in, GridD_in, nspin); gg.transfer_DM2DtoGrid(DM.get_DMR_vector()); Gint_inout inout(rho, Gint_Tools::job_type::rho, nspin); gg.cal_gint(&inout); +#else + ModuleGint::cal_gint_rho(DM.get_DMR_vector(), nspin, rho); +#endif // A solution to replace the original implementation of the following code: // pelec->charge->save_rho_before_sum_band(); @@ -164,10 +169,15 @@ void Get_pchg_lcao::begin(Gint_k& gk, DM.init_DMR(GridD_in, ucell_in); DM.cal_DMR(ik); +#ifdef __OLD_GINT gk.initialize_pvpR(*ucell_in, GridD_in, nspin); gk.transfer_DM2DtoGrid(DM.get_DMR_vector()); Gint_inout inout(rho, Gint_Tools::job_type::rho, nspin); gk.cal_gint(&inout); +#else + ModuleGint::cal_gint_rho(DM.get_DMR_vector(), nspin, rho); +#endif + // Using std::vector to replace the original double** rho_save std::vector> rho_save(nspin, std::vector(rhopw_nrxx)); @@ -206,11 +216,14 @@ void Get_pchg_lcao::begin(Gint_k& gk, DM.init_DMR(GridD_in, ucell_in); DM.cal_DMR(); +#ifdef __OLD_GINT gk.initialize_pvpR(*ucell_in, GridD_in, nspin); gk.transfer_DM2DtoGrid(DM.get_DMR_vector()); Gint_inout inout(rho, Gint_Tools::job_type::rho, nspin); gk.cal_gint(&inout); - +#else + ModuleGint::cal_gint_rho(DM.get_DMR_vector(), nspin, rho); +#endif // Using std::vector to replace the original double** rho_save std::vector> rho_save(nspin, std::vector(rhopw_nrxx)); diff --git a/source/module_io/get_wf_lcao.cpp b/source/module_io/get_wf_lcao.cpp index 039b2231de..197068ed7b 100644 --- a/source/module_io/get_wf_lcao.cpp +++ b/source/module_io/get_wf_lcao.cpp @@ -4,6 +4,11 @@ #include "module_io/write_wfc_pw.h" #include 
"source_base/memory.h" +#ifndef __OLD_GINT +#include "module_hamilt_lcao/module_gint/temp_gint/gint_env_gamma.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_env_k.h" +#endif + Get_wf_lcao::Get_wf_lcao(const elecstate::ElecState* pes) { pes_ = pes; @@ -40,6 +45,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, prepare_get_wf(ofs_running); +#ifdef __OLD_GINT // allocate grid wave functions for gamma_only std::vector wfc_gamma_grid(nspin); for (int is = 0; is < nspin; ++is) @@ -50,6 +56,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, wfc_gamma_grid[is][ib] = new double[gg.gridt->lgd]; } } +#endif // for pw_wfc in G space psi::Psi> psi_g; @@ -57,42 +64,48 @@ void Get_wf_lcao::begin(const UnitCell& ucell, // if (out_wfc_pw || out_wfc_r) psi_g.resize(nspin, nbands, kv.ngk[0]); +#ifdef __OLD_GINT const double mem_size = sizeof(double) * double(gg.gridt->lgd) * double(nbands) * double(nspin) / 1024.0 / 1024.0; ModuleBase::Memory::record("Get_wf_lcao::begin", mem_size); - ModuleBase::GlobalFunc::OUT(ofs_running, "On-the-fly memory consumption (MB)", mem_size); + ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, "On-the-fly memory consumption (MB)", mem_size); +#endif // Set this->bands_picked_ this->select_bands(out_wfc_norm, nbands, fermi_band); // Calculate out_wfc_norm - for (int ib = 0; ib < nbands; ++ib) + for (int is = 0; is < nspin; ++is) { - if (bands_picked_[ib]) + psid->fix_k(is); +#ifdef __OLD_GINT + #ifdef __MPI + wfc_2d_to_grid(psid->get_pointer(), para_orb, wfc_gamma_grid[is], gg.gridt->trace_lo); + #else + // if not MPI enabled, it is the case psid holds a global matrix. 
+ // use fix_k to switch between different spin channels (actually kpoints, + // because now the same kpoint in different spin channels are treated + // as distinct kpoints) + for (int i = 0; i < nbands; ++i) { - for (int is = 0; is < nspin; ++is) + for (int j = 0; j < nlocal; ++j) { - ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[is], pw_wfc->nrxx); - - psid->fix_k(is); -#ifdef __MPI - wfc_2d_to_grid(psid->get_pointer(), para_orb, wfc_gamma_grid[is], gg.gridt->trace_lo); + wfc_gamma_grid[is][i][j] = psid[0](i, j); + } + } + #endif #else - // if not MPI enabled, it is the case psid holds a global matrix. - // use fix_k to switch between different spin channels (actually kpoints, - // because now the same kpoint in different spin channels are treated - // as distinct kpoints) - - for (int i = 0; i < nbands; ++i) - { - for (int j = 0; j < nlocal; ++j) - { - wfc_gamma_grid[is][i][j] = psid[0](i, j); - } - } + ModuleGint::Gint_env_gamma gint_env(psid->get_pointer(), ¶_orb, nbands, nlocal, pes_->charge->rho[is]); #endif - + for (int ib = 0; ib < nbands; ++ib) + { + if (bands_picked_[ib]) + { + #ifdef __OLD_GINT + ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[is], pw_wfc->nrxx); gg.cal_env(wfc_gamma_grid[is][ib], pes_->charge->rho[is], ucell); - + #else + gint_env.cal_env_band(ib); + #endif pes_->charge->save_rho_before_sum_band(); // pint out information @@ -124,33 +137,37 @@ void Get_wf_lcao::begin(const UnitCell& ucell, this->select_bands(out_wfc_re_im, nbands, fermi_band); // Calculate out_wfc_re_im - for (int ib = 0; ib < nbands; ++ib) + for (int is = 0; is < nspin; ++is) { - if (bands_picked_[ib]) + psid->fix_k(is); +#ifdef __OLD_GINT + #ifdef __MPI + wfc_2d_to_grid(psid->get_pointer(), para_orb, wfc_gamma_grid[is], gg.gridt->trace_lo); + #else + // if not MPI enabled, it is the case psid holds a global matrix. 
use fix_k to switch between + // different spin channels (actually kpoints, because now the same kpoint in different spin channels + // are treated as distinct kpoints) + for (int i = 0; i < nbands; ++i) { - for (int is = 0; is < nspin; ++is) + for (int j = 0; j < nlocal; ++j) { - ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[is], pw_wfc->nrxx); - - psid->fix_k(is); -#ifdef __MPI - wfc_2d_to_grid(psid->get_pointer(), para_orb, wfc_gamma_grid[is], gg.gridt->trace_lo); + wfc_gamma_grid[is][i][j] = psid[0](i, j); + } + } + #endif #else - // if not MPI enabled, it is the case psid holds a global matrix. use fix_k to switch between - // different spin channels (actually kpoints, because now the same kpoint in different spin channels - // are treated as distinct kpoints) - - for (int i = 0; i < nbands; ++i) - { - for (int j = 0; j < nlocal; ++j) - { - wfc_gamma_grid[is][i][j] = psid[0](i, j); - } - } + ModuleGint::Gint_env_gamma gint_env(psid->get_pointer(), ¶_orb, nbands, nlocal, pes_->charge->rho[is]); #endif - + for (int ib = 0; ib < nbands; ++ib) + { + if (bands_picked_[ib]) + { +#ifdef __OLD_GINT + ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[is], pw_wfc->nrxx); gg.cal_env(wfc_gamma_grid[is][ib], pes_->charge->rho[is], ucell); - +#else + gint_env.cal_env_band(ib); +#endif pes_->charge->save_rho_before_sum_band(); const double ef_tmp = this->pes_->eferm.get_efval(is); @@ -202,6 +219,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, pw_wfc, ofs_running); +#ifdef __OLD_GINT for (int is = 0; is < nspin; ++is) { for (int ib = 0; ib < nbands; ++ib) @@ -210,6 +228,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, } delete[] wfc_gamma_grid[is]; } +#endif return; } @@ -240,6 +259,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, // allocate grid wave functions for multi-k const int nks = kv.get_nks(); std::vector**> wfc_k_grid(nks); +#ifdef __OLD_GINT for (int ik = 0; ik < nks; ++ik) { wfc_k_grid[ik] = new std::complex*[nbands]; @@ -252,7 +272,8 @@ void 
Get_wf_lcao::begin(const UnitCell& ucell, const double mem_size = sizeof(std::complex) * double(gk.gridt->lgd) * double(nbands) * double(nks) / 1024.0 / 1024.0; ModuleBase::Memory::record("Get_wf_lcao::begin", mem_size); - ModuleBase::GlobalFunc::OUT(ofs_running, "On-the-fly memory consumption (MB)", mem_size); + ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, "On-the-fly memory consumption (MB)", mem_size); +#endif // for pw_wfc in G space psi::Psi> psi_g; @@ -263,34 +284,44 @@ void Get_wf_lcao::begin(const UnitCell& ucell, // Set this->bands_picked_ this->select_bands(out_wfc_norm, nbands, fermi_band); - // Calculate out_wfc_norm - for (int ib = 0; ib < nbands; ++ib) + // Calculate out_wfc_norm + const int nspin0 = (nspin == 2) ? 2 : 1; + for (int ik = 0; ik < nks; ++ik) // the loop of nspin0 is included { - if (bands_picked_[ib]) + const int ispin = kv.isk[ik]; + // 2d-to-grid conversion is unified into `wfc_2d_to_grid`. + psi->fix_k(ik); + +#ifdef __OLD_GINT + #ifdef __MPI // need to deal with NSPIN=4 !!!! + wfc_2d_to_grid(psi->get_pointer(), para_orb, wfc_k_grid[ik], gk.gridt->trace_lo); + #else + for (int i = 0; i < nbands; ++i) { - const int nspin0 = (nspin == 2) ? 2 : 1; - for (int ik = 0; ik < nks; ++ik) // the loop of nspin0 is included + for (int j = 0; j < nlocal; ++j) { - const int ispin = kv.isk[ik]; + wfc_k_grid[ik][i][j] = psi[0](i, j); + } + } + #endif +#else + ModuleGint::Gint_env_k gint_env(psi->get_pointer(), ¶_orb, kv.kvec_c, kv.kvec_d, + nbands, nlocal, ik, PARAM.inp.nspin, PARAM.globalv.npol, pes_->charge->rho[ispin]); +#endif + + for (int ib = 0; ib < nbands; ++ib) + { + if (bands_picked_[ib]) + { +#ifdef __OLD_GINT ModuleBase::GlobalFunc::ZEROS(pes_->charge->rho[ispin], pw_wfc->nrxx); // terrible, you make changes on another instance's data??? - // 2d-to-grid conversion is unified into `wfc_2d_to_grid`. - psi->fix_k(ik); - -#ifdef __MPI // need to deal with NSPIN=4 !!!! 
- wfc_2d_to_grid(psi->get_pointer(), para_orb, wfc_k_grid[ik], gk.gridt->trace_lo); -#else - for (int i = 0; i < nbands; ++i) - { - for (int j = 0; j < nlocal; ++j) - { - wfc_k_grid[ik][i][j] = psi[0](i, j); - } - } -#endif // deal with NSPIN=4 gk.cal_env_k(ik, wfc_k_grid[ik][ib], pes_->charge->rho[ispin], kv.kvec_c, kv.kvec_d, ucell); +#else + gint_env.cal_env_band(ib); +#endif // ik0 is the real k-point index, starting from 0 int ik0 = kv.ik2iktot[ik]; @@ -404,7 +435,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, } } } - +#ifdef __OLD_GINT for (int ik = 0; ik < nks; ++ik) { for (int ib = 0; ib < nbands; ++ib) @@ -413,7 +444,7 @@ void Get_wf_lcao::begin(const UnitCell& ucell, } delete[] wfc_k_grid[ik]; } - +#endif return; } diff --git a/source/module_io/read_input_item_other.cpp b/source/module_io/read_input_item_other.cpp index 74b1e25221..5694c7b33b 100644 --- a/source/module_io/read_input_item_other.cpp +++ b/source/module_io/read_input_item_other.cpp @@ -501,6 +501,12 @@ void ReadInput::item_others() item.annotation = "whether to perform rdmft calculation, default is false"; read_sync_bool(input.rdmft); this->add_item(item); + item.check_value = [](const Input_Item& item, const Parameter& para) { + if (para.input.rdmft && para.input.nspin == 4) + { + ModuleBase::WARNING_QUIT("ReadInput", "rdmft is not available for nspin = 4"); + } + }; } { Input_Item item("rdmft_power_alpha"); diff --git a/source/module_io/write_HS_R.cpp b/source/module_io/write_HS_R.cpp index ebd91a3d35..e2540dca40 100644 --- a/source/module_io/write_HS_R.cpp +++ b/source/module_io/write_HS_R.cpp @@ -146,8 +146,6 @@ void ModuleIO::output_dHR(const int& istep, GlobalV::ofs_running << " | |" << std::endl; GlobalV::ofs_running << " >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>" << std::endl; - gint_k.allocate_pvdpR(); - const int nspin = PARAM.inp.nspin; if (nspin == 1 || nspin == 4) @@ -163,28 +161,13 @@ void ModuleIO::output_dHR(const int& istep, orb, cspin, 
sparse_thr, + v_eff, gint_k); } else if (nspin == 2) { for (int cspin = 0; cspin < 2; cspin++) { - // note: some MPI process will not have grids when MPI cores are too - // many, v_eff in these processes are empty - const double* vr_eff1 - = v_eff.nc * v_eff.nr > 0 ? &(v_eff(cspin, 0)) : nullptr; - - if (!PARAM.globalv.gamma_only_local) - { - if (PARAM.inp.vl_in_h) - { - Gint_inout inout(vr_eff1, - cspin, - Gint_Tools::job_type::dvlocal); - gint_k.cal_gint(&inout); - } - } - sparse_format::cal_dH(ucell, pv, HS_Arrays, @@ -193,6 +176,7 @@ void ModuleIO::output_dHR(const int& istep, orb, cspin, sparse_thr, + v_eff, gint_k); } } @@ -201,8 +185,6 @@ void ModuleIO::output_dHR(const int& istep, sparse_format::destroy_dH_R_sparse(HS_Arrays); - gint_k.destroy_pvdpR(); - ModuleBase::timer::tick("ModuleIO", "output_dHR"); return; } diff --git a/source/module_lr/esolver_lrtd_lcao.cpp b/source/module_lr/esolver_lrtd_lcao.cpp index 1e09216303..d51fbea6a7 100644 --- a/source/module_lr/esolver_lrtd_lcao.cpp +++ b/source/module_lr/esolver_lrtd_lcao.cpp @@ -241,7 +241,7 @@ LR::ESolver_LR::ESolver_LR(ModuleESolver::ESolver_KS_LCAO&& ks_sol this->nupdown = cal_nupdown_form_occ(ks_sol.pelec->wg); reset_dim_spin2(); } - +#ifdef __OLD_GINT //grid integration this->gt_ = std::move(ks_sol.GridT); @@ -255,7 +255,9 @@ LR::ESolver_LR::ESolver_LR(ModuleESolver::ESolver_KS_LCAO&& ks_sol } this->set_gint(); this->gint_->reset_DMRGint(1); - +#else + this->gint_info_ = std::move(ks_sol.gint_info_); +#endif // move pw basis if (this->pw_rho_flag) { @@ -393,6 +395,7 @@ LR::ESolver_LR::ESolver_LR(const Input_para& inp, UnitCell& ucell) : inpu this->ucell, search_radius, PARAM.inp.test_atom_input); +#ifdef __OLD_GINT this->set_gint(); this->gint_->gridt = &this->gt_; @@ -451,7 +454,26 @@ LR::ESolver_LR::ESolver_LR(const Input_para& inp, UnitCell& ucell) : inpu &ucell, &orb); this->gint_->initialize_pvpR(ucell, &this->gd, 1); // always use nspin=1 for transition density - +#else + gint_info_.reset( + 
new ModuleGint::GintInfo( + this->pw_big->nbx, + this->pw_big->nby, + this->pw_big->nbz, + this->pw_rho->nx, + this->pw_rho->ny, + this->pw_rho->nz, + 0, + 0, + this->pw_big->nbzp_start, + this->pw_big->nbx, + this->pw_big->nby, + this->pw_big->nbzp, + orb.Phi, + ucell, + this->gd)); + ModuleGint::Gint::set_gint_info(gint_info_.get()); +#endif // if EXX from scratch, init 2-center integral and calculate Cs, Vs #ifdef __EXX if ((xc_kernel == "hf" || xc_kernel == "hse") && this->input.lr_solver != "spectrum") diff --git a/source/module_lr/esolver_lrtd_lcao.h b/source/module_lr/esolver_lrtd_lcao.h index e6335ab69c..17e2a64a55 100644 --- a/source/module_lr/esolver_lrtd_lcao.h +++ b/source/module_lr/esolver_lrtd_lcao.h @@ -17,6 +17,7 @@ #include "source_estate/module_dm/density_matrix.h" #include "module_lr/potentials/pot_hxc_lrtd.h" #include "module_lr/hamilt_casida.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_info.h" #ifdef __EXX // #include #include "module_ri/Exx_LRI.h" @@ -93,6 +94,9 @@ namespace LR Gint_Gamma gint_g_; Gint_k gint_k_; typename TGint::type* gint_ = nullptr; + #ifndef __OLD_GINT + std::unique_ptr gint_info_ = nullptr; + #endif void set_gint(); /// @brief variables for parallel distribution of KS orbitals diff --git a/source/module_lr/lr_spectrum.cpp b/source/module_lr/lr_spectrum.cpp index d2fb10ab02..28c23270e7 100644 --- a/source/module_lr/lr_spectrum.cpp +++ b/source/module_lr/lr_spectrum.cpp @@ -6,6 +6,7 @@ #include "module_lr/utils/lr_util.h" #include "module_lr/utils/lr_util_hcontainer.h" #include "module_lr/utils/lr_util_print.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_interface.h" template elecstate::DensityMatrix LR::LR_Spectrum::cal_transition_density_matrix(const int istate, const T* X_in, const bool need_R) @@ -34,6 +35,7 @@ elecstate::DensityMatrix LR::LR_Spectrum::cal_transition_density_matrix return DM_trans; } +#ifdef __OLD_GINT template void LR::LR_Spectrum::cal_gint_rho(double** rho, const int& nrxx) 
{ @@ -41,6 +43,7 @@ void LR::LR_Spectrum::cal_gint_rho(double** rho, const int& nrxx) Gint_inout inout_rho(rho, Gint_Tools::job_type::rho, 1, false); this->gint->cal_gint(&inout_rho); } +#endif inline void check_sum_rule(const double& osc_tot) { @@ -59,12 +62,16 @@ ModuleBase::Vector3 LR::LR_Spectrum::cal_transition_dipole_istat const elecstate::DensityMatrix& DM_trans = this->cal_transition_density_matrix(istate); for (int is = 0;is < this->nspin_x;++is) { - this->gint->transfer_DM2DtoGrid({ DM_trans.get_DMR_vector().at(is) }); - // 2. transition density double** rho_trans; LR_Util::_allocate_2order_nested_ptr(rho_trans, 1, this->rho_basis.nrxx); +#ifdef __OLD_GINT + this->gint->transfer_DM2DtoGrid({ DM_trans.get_DMR_vector().at(is) }); this->cal_gint_rho(rho_trans, this->rho_basis.nrxx); +#else + ModuleBase::GlobalFunc::ZEROS(rho_trans[0], this->rho_basis.nrxx); + ModuleGint::cal_gint_rho({ DM_trans.get_DMR_vector().at(is) }, 1, rho_trans, false); +#endif // 3. transition dipole moment for (int ir = 0; ir < rho_basis.nrxx; ++ir) @@ -79,7 +86,7 @@ ModuleBase::Vector3 LR::LR_Spectrum::cal_transition_dipole_istat } LR_Util::_deallocate_2order_nested_ptr(rho_trans, 1); } - trans_dipole *= (ucell.omega / static_cast(gint->get_ncxyz())); // dv + trans_dipole *= (ucell.omega / static_cast(rho_basis.nxyz)); // dv trans_dipole *= static_cast(this->nk); // nk is divided inside DM_trans, now recover it if (this->nspin_x == 1) { trans_dipole *= sqrt(2.0); } // *2 for 2 spins, /sqrt(2) for the halfed dimension of X in the normalizaiton Parallel_Reduce::reduce_all(trans_dipole.x); @@ -108,14 +115,24 @@ ModuleBase::Vector3> LR::LR_Spectrum>: // real part LR_Util::get_DMR_real_imag_part(DM_trans, DM_trans_real_imag, ucell.nat, 'R'); +#ifdef __OLD_GINT this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector()); this->cal_gint_rho(rho_trans_real, this->rho_basis.nrxx); +#else + ModuleBase::GlobalFunc::ZEROS(rho_trans_real[0], this->rho_basis.nrxx); + 
ModuleGint::cal_gint_rho(DM_trans_real_imag.get_DMR_vector(), 1, rho_trans_real, false); +#endif // LR_Util::print_grid_nonzero(rho_trans_real[0], this->rho_basis.nrxx, 10, "rho_trans"); // imag part LR_Util::get_DMR_real_imag_part(DM_trans, DM_trans_real_imag, ucell.nat, 'I'); +#ifdef __OLD_GINT this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector()); this->cal_gint_rho(rho_trans_imag, this->rho_basis.nrxx); +#else + ModuleBase::GlobalFunc::ZEROS(rho_trans_imag[0], this->rho_basis.nrxx); + ModuleGint::cal_gint_rho(DM_trans_real_imag.get_DMR_vector(), 1, rho_trans_imag, false); +#endif // LR_Util::print_grid_nonzero(rho_trans_imag[0], this->rho_basis.nrxx, 10, "rho_trans"); // 3. transition dipole moment @@ -133,7 +150,7 @@ ModuleBase::Vector3> LR::LR_Spectrum>: LR_Util::_deallocate_2order_nested_ptr(rho_trans_real, 1); LR_Util::_deallocate_2order_nested_ptr(rho_trans_imag, 1); } - trans_dipole *= (ucell.omega / static_cast(gint->get_ncxyz())); // dv + trans_dipole *= (ucell.omega / static_cast(rho_basis.nxyz)); // dv trans_dipole *= static_cast(this->nk); // nk is divided inside DM_trans, now recover it if (this->nspin_x == 1) { trans_dipole *= sqrt(2.0); } // *2 for 2 spins, /sqrt(2) for the halfed dimension of X in the normalizaiton Parallel_Reduce::reduce_all(trans_dipole.x); diff --git a/source/module_lr/operator_casida/operator_lr_hxc.cpp b/source/module_lr/operator_casida/operator_lr_hxc.cpp index ebff00e5f1..8ec9fece42 100644 --- a/source/module_lr/operator_casida/operator_lr_hxc.cpp +++ b/source/module_lr/operator_casida/operator_lr_hxc.cpp @@ -10,6 +10,7 @@ #include "module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h" #include "module_lr/ao_to_mo_transformer/ao_to_mo.h" #include "source_pw/hamilt_pwdft/global.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_interface.h" inline double conj(double a) { return a; } inline std::complex conj(std::complex a) { return std::conj(a); } @@ -22,7 +23,6 @@ namespace LR 
ModuleBase::TITLE("OperatorLRHxc", "act"); const int& sl = ispin_ks[0]; const auto psil_ks = LR_Util::get_psi_spin(psi_ks, sl, nk); - const int& lgd = gint->gridt->lgd; this->DM_trans->cal_DMR(); //DM_trans->get_DMR_vector() is 2d-block parallized // LR_Util::print_DMR(*DM_trans, ucell.nat, "DMR"); @@ -55,7 +55,6 @@ namespace LR { ModuleBase::TITLE("OperatorLRHxc", "grid_calculation(real)"); ModuleBase::timer::tick("OperatorLRHxc", "grid_calculation"); - this->gint->transfer_DM2DtoGrid(this->DM_trans->get_DMR_vector()); // 2d block to grid // 2. transition electron density // \f[ \tilde{\rho}(r)=\sum_{\mu_j, \mu_b}\tilde{\rho}_{\mu_j,\mu_b}\phi_{\mu_b}(r)\phi_{\mu_j}(r) \f] @@ -63,20 +62,28 @@ namespace LR const int& nrxx = this->pot.lock()->nrxx; LR_Util::_allocate_2order_nested_ptr(rho_trans, 1, nrxx); // currently gint_kernel_rho uses PARAM.inp.nspin, it needs refactor ModuleBase::GlobalFunc::ZEROS(rho_trans[0], nrxx); +#ifdef __OLD_GINT + this->gint->transfer_DM2DtoGrid(this->DM_trans->get_DMR_vector()); // 2d block to grid Gint_inout inout_rho(rho_trans, Gint_Tools::job_type::rho, 1, false); this->gint->cal_gint(&inout_rho); - +#else + ModuleGint::cal_gint_rho(this->DM_trans->get_DMR_vector(), 1, rho_trans, false); +#endif // 3. v_hxc = f_hxc * rho_trans ModuleBase::matrix vr_hxc(1, nrxx); //grid this->pot.lock()->cal_v_eff(rho_trans, ucell, vr_hxc, ispin_ks); LR_Util::_deallocate_2order_nested_ptr(rho_trans, 1); // 4. 
V^{Hxc}_{\mu,\nu}=\int{dr} \phi_\mu(r) v_{Hxc}(r) \phi_\mu(r) + this->hR->set_zero(); // clear hR for each bands +#ifdef __OLD_GINT Gint_inout inout_vlocal(vr_hxc.c, 0, Gint_Tools::job_type::vlocal); this->gint->get_hRGint()->set_zero(); this->gint->cal_gint(&inout_vlocal); - this->hR->set_zero(); // clear hR for each bands this->gint->transfer_pvpR(&*this->hR, &ucell); //grid to 2d block +#else + ModuleGint::cal_gint_vl(vr_hxc.c, &*this->hR); +#endif ModuleBase::timer::tick("OperatorLRHxc", "grid_calculation"); } @@ -96,8 +103,6 @@ namespace LR LR_Util::get_DMR_real_imag_part(*this->DM_trans, DM_trans_real_imag, ucell.nat, type); // if (this->first_print)LR_Util::print_DMR(DM_trans_real_imag, ucell.nat, "DMR(2d, real)"); - this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector()); - // LR_Util::print_HR(*this->gint->get_DMRGint()[0], this->ucell.nat, "DMR(grid, real)"); // 2. transition electron density double** rho_trans; @@ -105,8 +110,14 @@ namespace LR LR_Util::_allocate_2order_nested_ptr(rho_trans, 1, nrxx); // nspin=1 for transition density ModuleBase::GlobalFunc::ZEROS(rho_trans[0], nrxx); +#ifdef __OLD_GINT + this->gint->transfer_DM2DtoGrid(DM_trans_real_imag.get_DMR_vector()); + // LR_Util::print_HR(*this->gint->get_DMRGint()[0], this->ucell.nat, "DMR(grid, real)"); Gint_inout inout_rho(rho_trans, Gint_Tools::job_type::rho, 1, false); this->gint->cal_gint(&inout_rho); +#else + ModuleGint::cal_gint_rho(DM_trans_real_imag.get_DMR_vector(), 1, rho_trans, false); +#endif // print_grid_nonzero(rho_trans[0], nrxx, 10, "rho_trans"); // 3. v_hxc = f_hxc * rho_trans @@ -117,13 +128,16 @@ namespace LR LR_Util::_deallocate_2order_nested_ptr(rho_trans, 1); // 4. 
V^{Hxc}_{\mu,\nu}=\int{dr} \phi_\mu(r) v_{Hxc}(r) \phi_\mu(r) + HR_real_imag.set_zero(); +#ifdef __OLD_GINT Gint_inout inout_vlocal(vr_hxc.c, 0, Gint_Tools::job_type::vlocal); this->gint->get_hRGint()->set_zero(); this->gint->cal_gint(&inout_vlocal); - // LR_Util::print_HR(*this->gint->get_hRGint(), this->ucell.nat, "VR(grid)"); - HR_real_imag.set_zero(); this->gint->transfer_pvpR(&HR_real_imag, &ucell, &this->gd); +#else + ModuleGint::cal_gint_vl(vr_hxc.c, &HR_real_imag); +#endif // LR_Util::print_HR(HR_real_imag, this->ucell.nat, "VR(real, 2d)"); LR_Util::set_HR_real_imag_part(HR_real_imag, *this->hR, ucell.nat, type); }; diff --git a/source/module_rdmft/rdmft_tools.cpp b/source/module_rdmft/rdmft_tools.cpp index bd45a49631..bff3229fa0 100644 --- a/source/module_rdmft/rdmft_tools.cpp +++ b/source/module_rdmft/rdmft_tools.cpp @@ -12,6 +12,7 @@ #include "source_estate/module_pot/pot_local.h" #include "source_estate/module_pot/pot_xc.h" #include "source_pw/hamilt_pwdft/structure_factor.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_interface.h" #include #include @@ -240,8 +241,9 @@ void Veff_rdmft::initialize_HR(const UnitCell* ucell_in, const Grid_Driv // this part of the code is copying from class Veff and do some modifications. 
-template -void Veff_rdmft::contributeHR() +// nspin == 1 or 2 case +template<> +void Veff_rdmft, double>::contributeHR() { ModuleBase::TITLE("Veff", "contributeHR"); ModuleBase::timer::tick("Veff", "contributeHR"); @@ -261,8 +263,12 @@ void Veff_rdmft::contributeHR() vr_eff_rdmft = &v_matrix_hartree(is, 0); // do grid integral calculation to get HR +#ifdef __OLD_GINT Gint_inout inout(vr_eff_rdmft, is, Gint_Tools::job_type::vlocal); this->GK->cal_gint(&inout); +#else + ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR); +#endif } } else if( potential_ == "local" ) @@ -276,8 +282,12 @@ void Veff_rdmft::contributeHR() vr_eff_rdmft = &v_matrix_local(0, 0); // do grid integral calculation to get HR +#ifdef __OLD_GINT Gint_inout inout(vr_eff_rdmft, 0, Gint_Tools::job_type::vlocal); this->GK->cal_gint(&inout); +#else + ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR); +#endif } else if( potential_ == "xc" ) { @@ -296,8 +306,12 @@ void Veff_rdmft::contributeHR() vr_eff_rdmft = &v_matrix_XC(is, 0); // do grid integral calculation to get HR +#ifdef __OLD_GINT Gint_inout inout(vr_eff_rdmft, is, Gint_Tools::job_type::vlocal); this->GK->cal_gint(&inout); +#else + ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR); +#endif } } else @@ -307,7 +321,9 @@ void Veff_rdmft::contributeHR() // get HR for 2D-block parallel format // this->GK->transfer_pvpR(this->hR); +#ifdef __OLD_GINT this->GK->transfer_pvpR(this->hR,this->ucell,this->gd); +#endif if(this->nspin == 2) { @@ -318,6 +334,12 @@ void Veff_rdmft::contributeHR() return; } +template<> +void Veff_rdmft, std::complex>::contributeHR() +{ + // nspin = 4 case not implemented currently. +} + // this part of the code is copying from class Veff and do some modifications. 
// special case of gamma-only template<> @@ -343,8 +365,12 @@ void Veff_rdmft::contributeHR() vr_eff_rdmft = &v_matrix_hartree(is, 0); // do grid integral calculation to get HR +#ifdef __OLD_GINT Gint_inout inout(vr_eff_rdmft, is, Gint_Tools::job_type::vlocal); this->GG->cal_gint(&inout); +#else + ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR); +#endif } } else if( potential_ == "local" ) @@ -358,12 +384,16 @@ void Veff_rdmft::contributeHR() vr_eff_rdmft = &v_matrix_local(0, 0); // do grid integral calculation to get HR +#ifdef __OLD_GINT Gint_inout inout(vr_eff_rdmft, 0, Gint_Tools::job_type::vlocal); // because in gamma_only, cal_gint would not set hRGint zero first // so must use cal_vlocal(), and in rdmft_test.h, calculate V_hartree->contributeHR() first this->GG->cal_vlocal(&inout, false); // cal_gint ??? +#else + ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR); +#endif } else if( potential_ == "xc" ) { @@ -381,8 +411,12 @@ void Veff_rdmft::contributeHR() vr_eff_rdmft = &v_matrix_XC(is, 0); // do grid integral calculation to get HR +#ifdef __OLD_GINT Gint_inout inout(vr_eff_rdmft, is, Gint_Tools::job_type::vlocal); this->GG->cal_gint(&inout); +#else + ModuleGint::cal_gint_vl(vr_eff_rdmft, this->hR); +#endif } } else @@ -390,9 +424,10 @@ void Veff_rdmft::contributeHR() std::cout << "\n\n!!!!!!\n there may be something wrong when use class Veff_rdmft\n\n!!!!!!\n"; } +#ifdef __OLD_GINT // get HR for 2D-block parallel format this->GG->transfer_pvpR(this->hR,this->ucell); - +#endif this->new_e_iteration = false; if(this->nspin == 2) diff --git a/source/module_rdmft/rdmft_tools.h b/source/module_rdmft/rdmft_tools.h index ac3db2744d..e8a5c52e5f 100644 --- a/source/module_rdmft/rdmft_tools.h +++ b/source/module_rdmft/rdmft_tools.h @@ -284,8 +284,9 @@ class Veff_rdmft : public hamilt::OperatorLCAO this->cal_type = hamilt::calculation_type::lcao_gint; this->initialize_HR(ucell_in, GridD_in); - +#ifdef __OLD_GINT GK_in->initialize_pvpR(*ucell_in, GridD_in, nspin); 
+#endif } Veff_rdmft(Gint_Gamma* GG_in, hamilt::HS_Matrix_K* hsk_in, @@ -310,8 +311,9 @@ class Veff_rdmft : public hamilt::OperatorLCAO this->cal_type = hamilt::calculation_type::lcao_gint; this->initialize_HR(ucell_in, GridD_in); - +#ifdef __OLD_GINT GG_in->initialize_pvpR(*ucell_in, GridD_in, nspin); +#endif } ~Veff_rdmft(){}; diff --git a/source/module_rdmft/update_state_rdmft.cpp b/source/module_rdmft/update_state_rdmft.cpp index 0cdffed794..183bfba6a1 100644 --- a/source/module_rdmft/update_state_rdmft.cpp +++ b/source/module_rdmft/update_state_rdmft.cpp @@ -8,6 +8,7 @@ #include "source_estate/module_dm/cal_dm_psi.h" #include "source_estate/module_dm/density_matrix.h" #include "source_estate/module_charge/symmetry_rho.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_interface.h" namespace rdmft @@ -105,10 +106,13 @@ void RDMFT::update_charge(UnitCell& ucell) { ModuleBase::GlobalFunc::ZEROS(charge->rho[is], charge->nrxx); } - +#ifdef __OLD_GINT GG->transfer_DM2DtoGrid(DM_gamma_only.get_DMR_vector()); Gint_inout inout(charge->rho, Gint_Tools::job_type::rho, nspin); GG->cal_gint(&inout); +#else + ModuleGint::cal_gint_rho(DM_gamma_only.get_DMR_vector(), nspin, charge->rho); +#endif if (XC_Functional::get_ked_flag()) { @@ -136,9 +140,13 @@ void RDMFT::update_charge(UnitCell& ucell) ModuleBase::GlobalFunc::ZEROS(charge->rho[is], charge->nrxx); } +#ifdef __OLD_GINT GK->transfer_DM2DtoGrid(DM.get_DMR_vector()); Gint_inout inout(charge->rho, Gint_Tools::job_type::rho, nspin); GK->cal_gint(&inout); +#else + ModuleGint::cal_gint_rho(DM.get_DMR_vector(), nspin, charge->rho); +#endif if (XC_Functional::get_ked_flag()) { diff --git a/source/source_base/intarray.cpp b/source/source_base/intarray.cpp index a2e3dcce4d..10c3b7f39d 100644 --- a/source/source_base/intarray.cpp +++ b/source/source_base/intarray.cpp @@ -6,8 +6,6 @@ namespace ModuleBase { -int IntArray::arrayCount = 0; - void IntArrayAlloc() { std::cout << "\n Allocation error for IntArray " << std::endl; 
@@ -23,7 +21,6 @@ IntArray::IntArray(const int d1,const int d2) size = bound1 * bound2; ptr = new int[size];zero_out(); assert( ptr != nullptr); - ++arrayCount; } IntArray::IntArray(const int d1,const int d2,const int d3) @@ -37,7 +34,6 @@ IntArray::IntArray(const int d1,const int d2,const int d3) size = bound1 * bound2 * bound3 ; //* sizeof(float); ptr = new int[size];zero_out(); assert(ptr != nullptr); - ++arrayCount; } IntArray::IntArray(const int d1,const int d2,const int d3,const int d4) @@ -52,7 +48,6 @@ IntArray::IntArray(const int d1,const int d2,const int d3,const int d4) size = bound1 * bound2 * bound3 * bound4 ; //* sizeof(float); ptr = new int[size];zero_out(); assert(ptr != nullptr); - ++arrayCount; } IntArray::IntArray(const int d1,const int d2,const int d3, @@ -68,7 +63,6 @@ IntArray::IntArray(const int d1,const int d2,const int d3, size = bound1 * bound2 * bound3 * bound4 * bound5; ptr = new int[size];zero_out(); assert(ptr != nullptr); - ++arrayCount; } IntArray::IntArray(const int d1,const int d2,const int d3, @@ -85,7 +79,6 @@ IntArray::IntArray(const int d1,const int d2,const int d3, size = bound1 * bound2 * bound3 * bound4 * bound5 * bound6; ptr = new int[size];zero_out(); assert(ptr != nullptr); - ++arrayCount; } //******************************** diff --git a/source/source_base/intarray.h b/source/source_base/intarray.h index 96996b5b22..9147dc184e 100644 --- a/source/source_base/intarray.h +++ b/source/source_base/intarray.h @@ -48,17 +48,30 @@ class IntArray void create(const int d1, const int d2, const int d3, const int d4, const int d5, const int d6); /** - * @brief Equal an IntArray to another one + * @brief copy assignment * * @param right * @return const IntArray& */ - const IntArray &operator=(const IntArray &right) - { - assert( this->size == right.size ); - for (int i = 0;i < size;i++) ptr[i] = right.ptr[i]; - return *this;// enables x = y = z; - }; + IntArray &operator=(const IntArray &other) + { + if(this != &other) + { + delete[] 
ptr; + size = other.size; + dim = other.dim; + bound1 = other.bound1; + bound2 = other.bound2; + bound3 = other.bound3; + bound4 = other.bound4; + bound5 = other.bound5; + bound6 = other.bound6; + ptr = new int[size]; + for (int i = 0;i < size;i++) + { ptr[i] = other.ptr[i]; } + } + return *this; + } /** * @brief Equal all elements of an IntArray to an @@ -71,7 +84,7 @@ class IntArray { for (int i = 0;i < size;i++) ptr[i] = right; return *this;// enables x = y = z; - }; + } /** * @brief Access elements by using operator "()" @@ -85,14 +98,14 @@ class IntArray assert( d1 < bound1 ); assert( d2 < bound2 ); return ptr[ d1 * bound2 + d2 ]; - }; + } int &operator()(const int d1, const int d2, const int d3) { assert( d1 < bound1 ); assert( d2 < bound2 ); assert( d3 < bound3 ); return ptr[ (d1 * bound2 + d2) * bound3 + d3 ]; - }; + } int &operator()(const int d1, const int d2, const int d3, const int d4) { assert( d1 < bound1 ); @@ -100,7 +113,7 @@ class IntArray assert( d3 < bound3 ); assert( d4 < bound4 ); return ptr[ ((d1 * bound2 + d2) * bound3 + d3) * bound4 + d4 ]; - }; + } int &operator()(const int d1, const int d2, const int d3, const int d4, const int d5) { assert( d1 < bound1 ); @@ -109,7 +122,7 @@ class IntArray assert( d4 < bound4 ); assert( d5 < bound5 ); return ptr[ (((d1 * bound2 + d2) * bound3 + d3) * bound4 + d4) * bound5 + d5 ]; - }; + } int &operator()(const int d1, const int d2, const int d3, const int d4, const int d5, const int d6) { assert( d1 < bound1 ); @@ -119,7 +132,7 @@ class IntArray assert( d5 < bound5 ); assert( d6 < bound6 ); return ptr[ ((((d1 * bound2 + d2) * bound3 + d3) * bound4 + d4) * bound5 + d5) * bound6 + d6 ]; - }; + } /** * @brief Access elements by using "()" through pointer @@ -134,14 +147,14 @@ class IntArray assert( d1 < bound1 ); assert( d2 < bound2 ); return ptr[ d1 * bound2 + d2 ]; - }; + } const int &operator()(const int d1, const int d2, const int d3) const { assert( d1 < bound1 ); assert( d2 < bound2 ); assert( d3 < 
bound3 ); return ptr[ (d1 * bound2 + d2) * bound3 + d3 ]; - }; + } const int &operator()(const int d1, const int d2, const int d3, const int d4) const { assert( d1 < bound1 ); @@ -149,7 +162,7 @@ class IntArray assert( d3 < bound3 ); assert( d4 < bound4 ); return ptr[ ((d1 * bound2 + d2) * bound3 + d3) * bound4 + d4 ]; - }; + } const int &operator()(const int d1, const int d2, const int d3, const int d4, const int d5) const { assert( d1 < bound1 ); @@ -158,7 +171,7 @@ class IntArray assert( d4 < bound4 ); assert( d5 < bound5 ); return ptr[ (((d1 * bound2 + d2) * bound3 + d3) * bound4 + d4) * bound5 + d5 ]; - }; + } const int &operator()(const int d1, const int d2, const int d3, const int d4, const int d5, const int d6) const { assert( d1 < bound1 ); @@ -168,7 +181,7 @@ class IntArray assert( d5 < bound5 ); assert( d6 < bound6 ); return ptr[ ((((d1 * bound2 + d2) * bound3 + d3) * bound4 + d4) * bound5 + d5) * bound6 + d6 ]; - }; + } /** * @brief Set all elements of an IntArray to zero @@ -209,16 +222,6 @@ class IntArray return bound6; } - /** - * @brief Get the Array Count object - * - * @return int - */ - static int getArrayCount(void) - { - return arrayCount; - } - private: int size=0; int dim=0; @@ -228,7 +231,6 @@ class IntArray int bound4=0; int bound5=0; int bound6=0; - static int arrayCount; void freemem(); }; } // namespace ModuleBase diff --git a/source/source_base/test/intarray_test.cpp b/source/source_base/test/intarray_test.cpp index 6ccfb24452..7372b4e115 100644 --- a/source/source_base/test/intarray_test.cpp +++ b/source/source_base/test/intarray_test.cpp @@ -12,8 +12,6 @@ * - construct an int array (2 to 6 dimensions) * - Creat * - create an int array (2 to 6 dimensions) - * - GetArrayCount - * - get the total number of int array created * - GetSize * - get the total size of an int array * - GetDim @@ -51,14 +49,6 @@ class IntArrayTest : public testing::Test const int zero = 0; }; -TEST_F(IntArrayTest,GetArrayCount) -{ - count0 = 
ModuleBase::IntArray::getArrayCount(); - ModuleBase::IntArray c3, c4; - count1 = ModuleBase::IntArray::getArrayCount(); - EXPECT_EQ((count1-count0),2); -} - TEST_F(IntArrayTest,Construct) { ModuleBase::IntArray x2(1,5); diff --git a/source/source_basis/module_ao/ORB_atomic.cpp b/source/source_basis/module_ao/ORB_atomic.cpp index 9b40d923ef..99f5953bda 100644 --- a/source/source_basis/module_ao/ORB_atomic.cpp +++ b/source/source_basis/module_ao/ORB_atomic.cpp @@ -7,8 +7,6 @@ Numerical_Orbital::Numerical_Orbital() { // make std::pair of new and delete // question remains - this->nchi = nullptr; - this->phiLN = new Numerical_Orbital_Lm[1]; this->rcut = 0.0; this->max_nchi = 0; this->type = 0; @@ -16,8 +14,6 @@ Numerical_Orbital::Numerical_Orbital() Numerical_Orbital::~Numerical_Orbital() { - delete[] nchi; - delete[] phiLN; } void Numerical_Orbital::set_orbital_info(const int& type_in, @@ -34,8 +30,7 @@ void Numerical_Orbital::set_orbital_info(const int& type_in, this->lmax = lmax_in; // (2) set nchi and total nchi. 
- delete[] this->nchi; - this->nchi = new int[this->lmax + 1]; + this->nchi.resize(this->lmax + 1); for (int i = 0; i < this->lmax + 1; i++) { this->nchi[i] = nchi_in[i]; diff --git a/source/source_basis/module_ao/ORB_atomic.h b/source/source_basis/module_ao/ORB_atomic.h index 71212f8b28..e71c0958d3 100644 --- a/source/source_basis/module_ao/ORB_atomic.h +++ b/source/source_basis/module_ao/ORB_atomic.h @@ -66,7 +66,6 @@ class Numerical_Orbital const inline Numerical_Orbital_Lm& PhiLN( const int &L, const int &N)const { - assert(this->phiLN != nullptr); return this->phiLN[ this->find_chi(L, N) ]; } @@ -98,7 +97,7 @@ class Numerical_Orbital NOAR.set_position(R1_in, R2_in); } - Numerical_Orbital_Lm*& chi() { return this->phiLN; } + std::vector& chi() { return this->phiLN; } private: @@ -115,13 +114,13 @@ class Numerical_Orbital int type; int lmax; - int* nchi; + std::vector nchi; int total_nchi; int max_nchi; ModuleBase::IntArray find_chi; double rcut; - Numerical_Orbital_Lm* phiLN;// length: total_nchi (only store radial function ) + std::vector phiLN;// length: total_nchi (only store radial function ) //========================================================== // Keep the old interface diff --git a/source/source_basis/module_ao/ORB_read.cpp b/source/source_basis/module_ao/ORB_read.cpp index 36d5d55f35..8cb0e4075f 100644 --- a/source/source_basis/module_ao/ORB_read.cpp +++ b/source/source_basis/module_ao/ORB_read.cpp @@ -419,8 +419,7 @@ void LCAO_Orbitals::read_orb_file(std::ofstream& ofs_in, // GlobalV::ofs_running } // OUT(GlobalV::ofs_running,"Total number of chi(l,n)",total_nchi); - delete[] ao[it].phiLN; - ao[it].phiLN = new Numerical_Orbital_Lm[total_nchi]; + ao[it].phiLN.resize(total_nchi); int meshr = 0; // number of mesh points int meshr_read = 0; diff --git a/source/source_basis/module_nao/radial_set.cpp b/source/source_basis/module_nao/radial_set.cpp index 9c83590926..2570e99806 100644 --- a/source/source_basis/module_nao/radial_set.cpp +++ 
b/source/source_basis/module_nao/radial_set.cpp @@ -87,9 +87,8 @@ RadialSet& RadialSet::operator=(const RadialSet& rhs) void RadialSet::to_numerical_orbital(Numerical_Orbital& no, const int nk_legacy, const double lcao_dk) const { - delete[] no.chi(); - - no.chi() = new Numerical_Orbital_Lm[nchi_]; + no.chi().clear(); + no.chi().resize(nchi_); for (int i = 0; i < nchi_; i++) { chi_[i].to_numerical_orbital_lm(no.chi()[i], nk_legacy, lcao_dk); diff --git a/source/source_esolver/esolver_ks_lcao.cpp b/source/source_esolver/esolver_ks_lcao.cpp index 483ae23d5a..a45f12a127 100644 --- a/source/source_esolver/esolver_ks_lcao.cpp +++ b/source/source_esolver/esolver_ks_lcao.cpp @@ -61,6 +61,8 @@ // test RDMFT #include "module_rdmft/rdmft.h" +#include "module_hamilt_lcao/module_gint/temp_gint/gint_info.h" + #include namespace ModuleESolver @@ -830,7 +832,6 @@ void ESolver_KS_LCAO::iter_finish(UnitCell& ucell, const int istep, int& this->p_chgmix->mix_dmr(dm); } } - // 6) save charge density // Peize Lin add 2020.04.04 if (GlobalC::restart.info_save.save_charge) diff --git a/source/source_esolver/esolver_ks_lcao.h b/source/source_esolver/esolver_ks_lcao.h index 43a180104f..59e427f013 100644 --- a/source/source_esolver/esolver_ks_lcao.h +++ b/source/source_esolver/esolver_ks_lcao.h @@ -95,6 +95,11 @@ class ESolver_KS_LCAO : public ESolver_KS //! Grid integration: used to store some basic information Grid_Technique GridT; +#ifndef __OLD_GINT + //! GintInfo: used to store some basic infomation about module_gint + std::unique_ptr gint_info_; +#endif + //! NAO orbitals: two-center integrations TwoCenterBundle two_center_bundle_; diff --git a/source/source_esolver/lcao_before_scf.cpp b/source/source_esolver/lcao_before_scf.cpp index 0e7142d4a9..7bcd63a5f1 100644 --- a/source/source_esolver/lcao_before_scf.cpp +++ b/source/source_esolver/lcao_before_scf.cpp @@ -60,6 +60,7 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) PARAM.inp.test_atom_input); //! 
4) initialize NAO basis set +#ifdef __OLD_GINT double dr_uniform = 0.001; std::vector rcuts; std::vector> psi_u; @@ -92,10 +93,19 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) dpsi_u, d2psi_u, PARAM.inp.nstream); + + psi_u.clear(); + psi_u.shrink_to_fit(); + dpsi_u.clear(); + dpsi_u.shrink_to_fit(); + d2psi_u.clear(); + d2psi_u.shrink_to_fit(); + LCAO_domain::grid_prepare(this->GridT, this->GG, this->GK, ucell, orb_, *this->pw_rho, *this->pw_big); //! 6) prepare grid integral -#ifdef __NEW_GINT - auto gint_info = std::make_shared( +#else + gint_info_.reset( + new ModuleGint::GintInfo( this->pw_big->nbx, this->pw_big->nby, this->pw_big->nbz, @@ -110,26 +120,16 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) this->pw_big->nbzp, orb_.Phi, ucell, - this->gd); - ModuleGint::Gint::init_gint_info(gint_info); + this->gd)); + ModuleGint::Gint::set_gint_info(gint_info_.get()); #endif - psi_u.clear(); - psi_u.shrink_to_fit(); - dpsi_u.clear(); - dpsi_u.shrink_to_fit(); - d2psi_u.clear(); - d2psi_u.shrink_to_fit(); - // 7) For each atom, calculate the adjacent atoms in different cells // and allocate the space for H(R) and S(R). // If k point is used here, allocate HlocR after atom_arrange. 
this->RA.for_2d(ucell, this->gd, this->pv, PARAM.globalv.gamma_only_local, orb_.cutoffs()); - // 8) after ions move, prepare grid in Gint - LCAO_domain::grid_prepare(this->GridT, this->GG, this->GK, ucell, orb_, *this->pw_rho, *this->pw_big); - - // 9) initialize the Hamiltonian operators + // 8) initialize the Hamiltonian operators // if atom moves, then delete old pointer and add a new one if (this->p_hamilt != nullptr) { @@ -169,7 +169,7 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) #ifdef __MLALGO - // 10) for each ionic step, the overlap must be rebuilt + // 9) for each ionic step, the overlap must be rebuilt // since it depends on ionic positions if (PARAM.globalv.deepks_setorb) { @@ -198,7 +198,7 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) } #endif - // 11) prepare sc calculation + // 10) prepare sc calculation if (PARAM.inp.sc_mag_switch) { spinconstrain::SpinConstrain& sc = spinconstrain::SpinConstrain::getScInstance(); @@ -217,7 +217,7 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) this->pelec); } - // 12) set xc type before the first cal of xc in pelec->init_scf + // 11) set xc type before the first cal of xc in pelec->init_scf // Peize Lin add 2016-12-03 #ifdef __EXX if (PARAM.inp.calculation != "nscf") @@ -233,10 +233,10 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) } #endif - // 13) init_scf, should be before_scf? mohan add 2025-03-10 + // 12) init_scf, should be before_scf? mohan add 2025-03-10 this->pelec->init_scf(istep, ucell, this->Pgrid, this->sf.strucFac, this->locpp.numeric, ucell.symm); - // 14) initalize DMR + // 13) initalize DMR // DMR should be same size with Hamiltonian(R) dynamic_cast*>(this->pelec) ->get_DM() @@ -247,7 +247,7 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) this->ld.init_DMR(ucell, orb_, this->pv, this->gd); #endif - // 15) two cases are considered: + // 14) two cases are considered: // 1. 
DMK in DensityMatrix is not empty (istep > 0), then DMR is initialized by DMK // 2. DMK in DensityMatrix is empty (istep == 0), then DMR is initialized by zeros if (istep > 0) @@ -255,7 +255,7 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) dynamic_cast*>(this->pelec)->get_DM()->cal_DMR(); } - // 16) the electron charge density should be symmetrized, + // 15) the electron charge density should be symmetrized, // here is the initialization Symmetry_rho srho; for (int is = 0; is < PARAM.inp.nspin; is++) @@ -263,10 +263,10 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) srho.begin(is, this->chr, this->pw_rho, ucell.symm); } - // 17) why we need to set this sentence? mohan add 2025-03-10 + // 16) why we need to set this sentence? mohan add 2025-03-10 this->p_hamilt->non_first_scf = istep; - // 18) update of RDMFT, added by jghan + // 17) update of RDMFT, added by jghan if (PARAM.inp.rdmft == true) { // necessary operation of these parameters have be done with p_esolver->Init() in source/source_main/driver_run.cpp diff --git a/source/source_esolver/lcao_others.cpp b/source/source_esolver/lcao_others.cpp index 702c0ab49b..6d9c3a98a0 100644 --- a/source/source_esolver/lcao_others.cpp +++ b/source/source_esolver/lcao_others.cpp @@ -91,6 +91,7 @@ void ESolver_KS_LCAO::others(UnitCell& ucell, const int istep) PARAM.inp.test_atom_input); // (3) Periodic condition search for each grid. 
+#ifdef __OLD_GINT double dr_uniform = 0.001; std::vector rcuts; std::vector> psi_u; @@ -98,7 +99,6 @@ void ESolver_KS_LCAO::others(UnitCell& ucell, const int istep) std::vector> d2psi_u; Gint_Tools::init_orb(dr_uniform, rcuts, ucell, orb_, psi_u, dpsi_u, d2psi_u); - this->GridT.set_pbc_grid(this->pw_rho->nx, this->pw_rho->ny, this->pw_rho->nz, @@ -122,12 +122,35 @@ void ESolver_KS_LCAO::others(UnitCell& ucell, const int istep) dpsi_u, d2psi_u, PARAM.inp.nstream); + psi_u.clear(); psi_u.shrink_to_fit(); dpsi_u.clear(); dpsi_u.shrink_to_fit(); d2psi_u.clear(); d2psi_u.shrink_to_fit(); + // prepare grid in Gint + LCAO_domain::grid_prepare(this->GridT, this->GG, this->GK, ucell, orb_, *this->pw_rho, *this->pw_big); +#else + gint_info_.reset( + new ModuleGint::GintInfo( + this->pw_big->nbx, + this->pw_big->nby, + this->pw_big->nbz, + this->pw_rho->nx, + this->pw_rho->ny, + this->pw_rho->nz, + 0, + 0, + this->pw_big->nbzp_start, + this->pw_big->nbx, + this->pw_big->nby, + this->pw_big->nbzp, + orb_.Phi, + ucell, + this->gd)); + ModuleGint::Gint::set_gint_info(gint_info_.get()); +#endif // (2)For each atom, calculate the adjacent atoms in different cells // and allocate the space for H(R) and S(R). 
@@ -184,9 +207,6 @@ void ESolver_KS_LCAO::others(UnitCell& ucell, const int istep) } } - // prepare grid in Gint - LCAO_domain::grid_prepare(this->GridT, this->GG, this->GK, ucell, orb_, *this->pw_rho, *this->pw_big); - // init Hamiltonian if (this->p_hamilt != nullptr) { diff --git a/source/source_estate/elecstate_lcao.cpp b/source/source_estate/elecstate_lcao.cpp index f18e87ac51..38e854a4eb 100644 --- a/source/source_estate/elecstate_lcao.cpp +++ b/source/source_estate/elecstate_lcao.cpp @@ -34,7 +34,7 @@ void ElecStateLCAO>::psiToRho(const psi::Psigint_k->transfer_DM2DtoGrid(this->DM->get_DMR_vector()); // transfer DM2D to DM_grid in gint Gint_inout inout(this->charge->rho, Gint_Tools::job_type::rho, PARAM.inp.nspin); this->gint_k->cal_gint(&inout); @@ -71,7 +71,7 @@ void ElecStateLCAO::psiToRho(const psi::Psi& psi) //------------------------------------------------------------ ModuleBase::GlobalFunc::NOTE("Calculate the charge on real space grid!"); -#ifndef __NEW_GINT +#ifdef __OLD_GINT this->gint_gamma->transfer_DM2DtoGrid(this->DM->get_DMR_vector()); // transfer DM2D to DM_grid in gint Gint_inout inout(this->charge->rho, Gint_Tools::job_type::rho, PARAM.inp.nspin); this->gint_gamma->cal_gint(&inout); @@ -139,7 +139,7 @@ void ElecStateLCAO::dmToRho(std::vector pexsi_DM, std::vectorgint_gamma->transfer_DM2DtoGrid(this->DM->get_DMR_vector()); // transfer DM2D to DM_grid in gint Gint_inout inout(this->charge->rho, Gint_Tools::job_type::rho, PARAM.inp.nspin); this->gint_gamma->cal_gint(&inout); @@ -152,7 +152,7 @@ void ElecStateLCAO::dmToRho(std::vector pexsi_DM, std::vectorcharge->kin_r[0], this->charge->nrxx); } -#ifndef __NEW_GINT +#ifdef __OLD_GINT Gint_inout inout1(this->charge->kin_r, Gint_Tools::job_type::tau); this->gint_gamma->cal_gint(&inout1); #else diff --git a/source/source_estate/elecstate_lcao_cal_tau.cpp b/source/source_estate/elecstate_lcao_cal_tau.cpp index d07aeba678..2b611f4c17 100644 --- a/source/source_estate/elecstate_lcao_cal_tau.cpp +++ 
b/source/source_estate/elecstate_lcao_cal_tau.cpp @@ -16,7 +16,7 @@ void ElecStateLCAO>::cal_tau(const psi::Psicharge->kin_r[is], this->charge->nrxx); } -#ifndef __NEW_GINT +#ifdef __OLD_GINT Gint_inout inout1(this->charge->kin_r, Gint_Tools::job_type::tau, PARAM.inp.nspin); this->gint_k->cal_gint(&inout1); #else @@ -36,7 +36,7 @@ void ElecStateLCAO::cal_tau(const psi::Psi& psi) { ModuleBase::GlobalFunc::ZEROS(this->charge->kin_r[is], this->charge->nrxx); } -#ifndef __NEW_GINT +#ifdef __OLD_GINT Gint_inout inout1(this->charge->kin_r, Gint_Tools::job_type::tau, PARAM.inp.nspin); this->gint_gamma->cal_gint(&inout1); #else diff --git a/tests/09_DeePKS/102_NO_GO_deepks_nscf/result.ref b/tests/09_DeePKS/102_NO_GO_deepks_nscf/result.ref index 4fb5aa8716..540c8789a6 100644 --- a/tests/09_DeePKS/102_NO_GO_deepks_nscf/result.ref +++ b/tests/09_DeePKS/102_NO_GO_deepks_nscf/result.ref @@ -1,8 +1,8 @@ etotref -74.3929556166736603 etotperatomref -14.8785911233 -totalforceref 778.174241 -totalstressref 1272.711589 +totalforceref 1495.625575 +totalstressref 574.321174 totaldosref 12 deepks_desc 8.045214 -deepks_dm_eig 29.53046025202608 +deepks_dm_eig 29.530460252025964 totaltimeref 1.12 diff --git a/tests/09_DeePKS/102_NO_KP_deepks_nscf/result.ref b/tests/09_DeePKS/102_NO_KP_deepks_nscf/result.ref index 8d613e5354..123d104b33 100644 --- a/tests/09_DeePKS/102_NO_KP_deepks_nscf/result.ref +++ b/tests/09_DeePKS/102_NO_KP_deepks_nscf/result.ref @@ -1,7 +1,7 @@ etotref -469.5735907784966230 etotperatomref -156.5245302595 -totalforceref 330.972666 -totalstressref 24771.556634 +totalforceref 10.194156 +totalstressref 510.485544 totaldosref 28 deepks_desc 2.126589 deepks_dm_eig 10.532812121143177