From e1dd489db5a999776db985a75d3f31cbf3727972 Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Thu, 25 Mar 2021 23:05:17 +0800
Subject: [PATCH 01/60] add comments in LOOPcell and LOOP_ions

---
 ABACUS.develop/source/src_lcao/LOOP_cell.cpp | 5 +++++
 ABACUS.develop/source/src_lcao/LOOP_ions.cpp | 9 ++++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/ABACUS.develop/source/src_lcao/LOOP_cell.cpp b/ABACUS.develop/source/src_lcao/LOOP_cell.cpp
index 003b9eb7a3..785bd935c4 100644
--- a/ABACUS.develop/source/src_lcao/LOOP_cell.cpp
+++ b/ABACUS.develop/source/src_lcao/LOOP_cell.cpp
@@ -24,6 +24,7 @@ void LOOP_cell::opt_cell(void)
     UFFT.allocate();
 
     // output is ppcell.vloc 3D local pseudopotentials
+	// without structure factors
     // this function belongs to cell LOOP
     ppcell.init_vloc(pw.nggm, ppcell.vloc);
 
@@ -34,6 +35,8 @@ void LOOP_cell::opt_cell(void)
     pot.init_pot(ion_step, pw.strucFac);
 
 
+	// PLEASE simplify the Exx_Global interface
+	// mohan add 2021-03-25
 	// Peize Lin 2016-12-03
 	if (CALCULATION=="scf" || CALCULATION=="relax" || CALCULATION=="cell-relax")
 	{
@@ -52,6 +55,8 @@ void LOOP_cell::opt_cell(void)
 		}
 	}	
 
+	// PLEASE do not use INPUT global variable
+	// mohan add 2021-03-25
 	// Quxin added for DFT+U
 	if(INPUT.dft_plus_u) 
 	{
diff --git a/ABACUS.develop/source/src_lcao/LOOP_ions.cpp b/ABACUS.develop/source/src_lcao/LOOP_ions.cpp
index 294b5d4519..42e865710d 100644
--- a/ABACUS.develop/source/src_lcao/LOOP_ions.cpp
+++ b/ABACUS.develop/source/src_lcao/LOOP_ions.cpp
@@ -108,6 +108,8 @@ void LOOP_ions::opt_ions(void)
 		
 		time_t eend = time(NULL);
 
+		// PLEASE move the details of CE to other places
+		// mohan add 2021-03-25
         //xiaohui add 2014-07-07, for second-order extrapolation
         int iat=0;
         if(CALCULATION=="relax" || CALCULATION=="cell-relax")
@@ -134,6 +136,10 @@ void LOOP_ions::opt_ions(void)
             }
         }
 
+		// PLEASE design a proper interface to output potentials,
+		// not only electrostatic potential but also others
+		// mohan add 2021-03-25
+		// we need to have a proper interface for this output
         if(pot.out_potential == 2)
         {
             stringstream ssp;
@@ -155,7 +161,8 @@ void LOOP_ions::opt_ions(void)
         }            
         time_t fend = time(NULL);
 
-
+		// PLEASE move the details of CE to other places
+		// mohan add 2021-03-25
         //xiaohui add 2014-07-07, for second-order extrapolation
         iat=0;
         if(FORCE)

From c655c9c084be2a22e9e45cbf5066851e07796ab2 Mon Sep 17 00:00:00 2001
From: zdy <dyzheng@mail.ustc.edu.cn>
Date: Fri, 26 Mar 2021 22:55:49 +0800
Subject: [PATCH 02/60] update ions.cpp output in cell-relax

---
 ABACUS.develop/source/src_pw/ions.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ABACUS.develop/source/src_pw/ions.cpp b/ABACUS.develop/source/src_pw/ions.cpp
index d3faddfcd1..d5b629cd39 100644
--- a/ABACUS.develop/source/src_pw/ions.cpp
+++ b/ABACUS.develop/source/src_pw/ions.cpp
@@ -224,7 +224,7 @@ void Ions::opt_ions_pw(void)
 
     }
 
-    if(CALCULATION=="scf" || CALCULATION=="relax")
+    if(CALCULATION=="scf" || CALCULATION=="relax" || CALCULATION=="cell-relax")
     {
         ofs_running << "\n\n --------------------------------------------" << endl;
         ofs_running << setprecision(16);

From 81fbd6b27117188f41a35e29ed479b2396c4ec7b Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Sat, 27 Mar 2021 22:21:30 +0800
Subject: [PATCH 03/60] add comments in ORB_nonlocal

---
 ABACUS.develop/source/src_lcao/ORB_nonlocal.h    | 8 ++++----
 ABACUS.develop/source/src_lcao/ORB_nonlocal_lm.h | 4 ----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/ABACUS.develop/source/src_lcao/ORB_nonlocal.h b/ABACUS.develop/source/src_lcao/ORB_nonlocal.h
index 308fa52ee8..5eb6f9e97e 100644
--- a/ABACUS.develop/source/src_lcao/ORB_nonlocal.h
+++ b/ABACUS.develop/source/src_lcao/ORB_nonlocal.h
@@ -1,7 +1,3 @@
-//=========================================================
-//AUTHOR : liaochen, mohan
-//DATE : 2009-03-04
-//=========================================================
 #ifndef NUMERICAL_NONLOCAL_H
 #define NUMERICAL_NONLOCAL_H
 
@@ -87,9 +83,13 @@ class Numerical_Nonlocal
 	// each Beta may have different L.
 	int nproj;
 	int *LfromBeta;
+
 	int nproj_soc;//demention of D_ij^so
+
 	ComplexArray Coefficient_D_so;   //(:,:,:),  spin-orbit case,  added by zhengdy-soc
+
 	int non_zero_count_soc[4];
+
 	int *index1_soc[4], *index2_soc[4];
 };
 
diff --git a/ABACUS.develop/source/src_lcao/ORB_nonlocal_lm.h b/ABACUS.develop/source/src_lcao/ORB_nonlocal_lm.h
index 4f3eaf42f5..9b6eb4e531 100644
--- a/ABACUS.develop/source/src_lcao/ORB_nonlocal_lm.h
+++ b/ABACUS.develop/source/src_lcao/ORB_nonlocal_lm.h
@@ -1,7 +1,3 @@
-//=========================================================
-//AUTHOR : liaochen, mohan
-//DATE : 2008-03-04
-//=========================================================
 #ifndef NUMERICAL_NONLOCAL_LM
 #define NUMERICAL_NONLOCAL_LM
 

From 34be14f83770d381fbe2de74581f0df4f5e41cdf Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Sun, 28 Mar 2021 21:14:12 +0800
Subject: [PATCH 04/60] add comment in gint_k

---
 ABACUS.develop/source/src_lcao/ORB_nonlocal.h |   2 +
 ABACUS.develop/source/src_lcao/ORB_read.h     |   5 -
 ABACUS.develop/source/src_lcao/gint_k.cpp     | 102 ++--
 ABACUS.develop/source/src_lcao/gint_k_fvl.cpp | 456 ++++++++++--------
 4 files changed, 273 insertions(+), 292 deletions(-)

diff --git a/ABACUS.develop/source/src_lcao/ORB_nonlocal.h b/ABACUS.develop/source/src_lcao/ORB_nonlocal.h
index 5eb6f9e97e..b96690e7fc 100644
--- a/ABACUS.develop/source/src_lcao/ORB_nonlocal.h
+++ b/ABACUS.develop/source/src_lcao/ORB_nonlocal.h
@@ -84,6 +84,8 @@ class Numerical_Nonlocal
 	int nproj;
 	int *LfromBeta;
 
+	// PLEASE consider whether the following parameters can be moved to the 'pseudopotential' module
+	// mohan note 2021-03-28
 	int nproj_soc;//demention of D_ij^so
 
 	ComplexArray Coefficient_D_so;   //(:,:,:),  spin-orbit case,  added by zhengdy-soc
diff --git a/ABACUS.develop/source/src_lcao/ORB_read.h b/ABACUS.develop/source/src_lcao/ORB_read.h
index 3010dc1efc..06ed11f642 100644
--- a/ABACUS.develop/source/src_lcao/ORB_read.h
+++ b/ABACUS.develop/source/src_lcao/ORB_read.h
@@ -1,8 +1,3 @@
-//=========================================================
-//AUTHOR : mohan
-//DATE : 2009-04-23
-//Last Update : 2021-02-11
-//=========================================================
 #ifndef LCAO_ORBITALS_H
 #define LCAO_ORBITALS_H
 
diff --git a/ABACUS.develop/source/src_lcao/gint_k.cpp b/ABACUS.develop/source/src_lcao/gint_k.cpp
index 4249aff9a7..55b6c3426d 100644
--- a/ABACUS.develop/source/src_lcao/gint_k.cpp
+++ b/ABACUS.develop/source/src_lcao/gint_k.cpp
@@ -39,7 +39,7 @@ void Gint_k::allocate_pvpR(void)
 		WARNING_QUIT("Gint_k::allocate_pvpR","pvpR has been allocated!");
 	}
 
-//	reduced = NURSE; 
+	//	reduced = NURSE; 
 	//xiaohui modify 2015-05-30
 	//cout << " reduced algorithm for grid integration = " << reduced << endl;
 
@@ -139,8 +139,11 @@ void Gint_k::destroy_pvpR(void)
 
 // fold the <phi | vl |dphi(R)> * DM(R) to 
 // calculate the force.
-void Gint_k::folding_force(matrix& fvl_dphi,
-	double* pvdpx, double* pvdpy, double* pvdpz)
+void Gint_k::folding_force(
+	matrix& fvl_dphi,
+	double* pvdpx, 
+	double* pvdpy, 
+	double* pvdpz)
 {
 	TITLE("Gint_k","folding_force");
 	timer::tick("Gint_k","folding_force");
@@ -311,9 +314,18 @@ void Gint_k::folding_force(matrix& fvl_dphi,
 
 // fold the <phi | vl * R_beta|dphi(R_alpha)> * DM(R) to 
 // calculate the stress.
-void Gint_k::folding_stress(matrix& fvl_dphi, matrix& svl_dphi,
-	double* pvdpx, double* pvdpy, double* pvdpz,
-	double* pvdp11, double* pvdp22, double* pvdp33,double* pvdp12, double* pvdp13, double* pvdp23)
+void Gint_k::folding_stress(
+	matrix& fvl_dphi, 
+	matrix& svl_dphi,
+	double* pvdpx, 
+	double* pvdpy, 
+	double* pvdpz,
+	double* pvdp11, 
+	double* pvdp22, 
+	double* pvdp33,
+	double* pvdp12, 
+	double* pvdp13, 
+	double* pvdp23)
 {
 	TITLE("Gint_k","folding_stress");
 	timer::tick("Gint_k","folding_stress");
@@ -1132,9 +1144,15 @@ void Gint_k::folding_vl_k_nc(const int &ik)
 	return;
 }
 
-void Gint_k::set_ijk_atom(const int &grid_index, const int &size,
-	double*** psir_ylm, double*** dr, bool** cal_flag, 
-	double** distance, double* ylma, const double &delta_r)
+void Gint_k::set_ijk_atom(
+	const int &grid_index, 
+	const int &size,
+	double*** psir_ylm, 
+	double*** dr, 
+	bool** cal_flag, 
+	double** distance, 
+	double* ylma, 
+	const double &delta_r)
 {
 	const Numerical_Orbital_Lm* pointer;
 	double mt[3];
@@ -1350,72 +1368,7 @@ void Gint_k::destroy_pvpR_tr(void)
 void Gint_k::distribute_pvpR_tr(void)
 {
     TITLE("Gint_k","distribute_pvpR_tr");
-/*
-    int lgd = 0;
-    double R_minX = GridD.getD_minX();
-    double R_minY = GridD.getD_minY();
-    double R_minZ = GridD.getD_minZ();
-
-    int R_x;
-    int R_y;
-    int R_z;
-
-    Vector3<double> tau1, dtau, dR;
-    for(int T1=0; T1<ucell.ntype; ++T1)
-    {
-        for(int I1=0; I1<ucell.atoms[T1].na; ++I1)
-        {
-            const int iat = ucell.itia2iat(T1,I1);
-            // atom in this grid piece.
-            if(GridT.in_this_processor[iat])
-            {
-                Atom* atom1 = &ucell.atoms[T1];
-                const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
 
-                // get the start positions of elements.
-                const int DM_start = LNNR.nlocstartg[iat];
-
-                // get the coordinates of adjacent atoms.
-                tau1 = ucell.atoms[T1].tau[I1];
-                //GridD.Find_atom(tau1);	
-                GridD.Find_atom(tau1, T1, I1);
-                // search for the adjacent atoms.
-                int nad = 0;
-
-int adj_number = 0;
-                for(int ad = 0; ad < GridD.getAdjacentNum()+1; ad++)
-                {
-                    // get iat2
-                    const int T2 = GridD.getType(ad);
-                    const int I2 = GridD.getNatom(ad);
-                    const int iat2 = ucell.itia2iat(T2, I2);
-
-                    // adjacent atom is also on the grid.
-                    if(GridT.in_this_processor[iat2])
-                    {
-int index = 0;
-                        Atom* atom2 = &ucell.atoms[T2];
-                        dtau = GridD.getAdjacentTau(ad) - tau1;
-                        double distance = dtau.norm() * ucell.lat0;
-                        double rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut();
-
-                        // for the local part, only need to calculate <phi_i | phi_j> within range
-                        // mohan note 2012-07-06
-                        if(distance < rcut)
-                        {
-adj_number++;
-                            const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
-
-                            // calculate the distance between iat1 and iat2.
-                            // Vector3<double> dR = GridD.getAdjacentTau(ad) - tau1;
-                            dR.x = GridD.getBox(ad).x;
-                            dR.y = GridD.getBox(ad).y;
-                            dR.z = GridD.getBox(ad).z;
-
-                            R_x = (int) (dR.x -R_minX);
-                            R_y = (int) (dR.y -R_minY);
-                            R_z = (int) (dR.z -R_minZ);
-*/
     int R_x = GridD.getCellX();
     int R_y = GridD.getCellY();
     int R_z = GridD.getCellZ();
@@ -1497,6 +1450,7 @@ adj_number++;
     return;
 }
 
+
 void Gint_k::cal_vlocal_R(const int current_spin)
 {
     TITLE("Gint_k","cal_vlocal_R");
diff --git a/ABACUS.develop/source/src_lcao/gint_k_fvl.cpp b/ABACUS.develop/source/src_lcao/gint_k_fvl.cpp
index 110f8da139..ac0d6e0a6d 100644
--- a/ABACUS.develop/source/src_lcao/gint_k_fvl.cpp
+++ b/ABACUS.develop/source/src_lcao/gint_k_fvl.cpp
@@ -1,7 +1,6 @@
 #include "gint_k.h"
 #include "../src_pw/global.h"
 #include "LCAO_nnr.h"
-
 #include "global_fp.h" // mohan add 2021-01-30
 
 void Gint_k::fvl_k_RealSpace(matrix& fvl_dphi, const double *vl)
@@ -24,8 +23,6 @@ void Gint_k::fvl_k_RealSpace(matrix& fvl_dphi, const double *vl)
 	{
 		nnrg = 1;
 	}
-	
-		
 
 	// to store < phi | vlocal | dphi>
 	double* pvdpx = new double[nnrg];
@@ -199,226 +196,247 @@ void Gint_k::fvl_k_RealSpace(matrix& fvl_dphi, const double *vl)
 	return;
 }
 
-void Gint_k::svl_k_RealSpace(matrix& fvl_dphi, matrix& svl_dphi, const double *vl)
+void Gint_k::svl_k_RealSpace(
+	matrix& fvl_dphi, 
+	matrix& svl_dphi, 
+	const double *vl)
 {
-        TITLE("Gint_k","cal_stress");
-        timer::tick("Gint_k","cal_stress");
+	TITLE("Gint_k","cal_stress");
+	timer::tick("Gint_k","cal_stress");
 
-        if(!this->reduced)
-        {
-                WARNING_QUIT("Gint_k::cal_stress_k","The stress with k can only with reduced H.");
-        }
+	if(!this->reduced)
+	{
+		WARNING_QUIT("Gint_k::cal_stress_k","The stress with k can only with reduced H.");
+	}
 
-        int nnrg = LNNR.nnrg;
+	int nnrg = LNNR.nnrg;
 
-        if(OUT_LEVEL != "m") ofs_running << " LNNR.nnrg in cal_force_k = " << LNNR.nnrg << endl;
-        assert(nnrg>=0);
+	if(OUT_LEVEL != "m") ofs_running << " LNNR.nnrg in cal_force_k = " << LNNR.nnrg << endl;
+	assert(nnrg>=0);
 
-        // just because to make thea arrys meaningful.
-        if(LNNR.nnrg == 0)
-        {
-                nnrg = 1;
-        }
+	// just to make the arrays meaningful (avoid zero-length allocations).
+	if(LNNR.nnrg == 0)
+	{
+		nnrg = 1;
+	}
 
-        // to store < phi | vlocal | dphi>
-        double* pvdpx = new double[nnrg];
-        double* pvdpy = new double[nnrg];
-        double* pvdpz = new double[nnrg];
-        double* pvdp11 = new double[nnrg];
-        double* pvdp22 = new double[nnrg];
-        double* pvdp33 = new double[nnrg];
-        double* pvdp12 = new double[nnrg];
-        double* pvdp13 = new double[nnrg];
-        double* pvdp23 = new double[nnrg];
-        ZEROS(pvdpx, nnrg);
-        ZEROS(pvdpy, nnrg);
-        ZEROS(pvdpz, nnrg);
-        ZEROS(pvdp11, nnrg);
-        ZEROS(pvdp22, nnrg);
-        ZEROS(pvdp33, nnrg);
-        ZEROS(pvdp12, nnrg);
-        ZEROS(pvdp13, nnrg);
-        ZEROS(pvdp23, nnrg);
+	// to store < phi | vlocal | dphi>
+	double* pvdpx = new double[nnrg];
+	double* pvdpy = new double[nnrg];
+	double* pvdpz = new double[nnrg];
+	double* pvdp11 = new double[nnrg];
+	double* pvdp22 = new double[nnrg];
+	double* pvdp33 = new double[nnrg];
+	double* pvdp12 = new double[nnrg];
+	double* pvdp13 = new double[nnrg];
+	double* pvdp23 = new double[nnrg];
+	ZEROS(pvdpx, nnrg);
+	ZEROS(pvdpy, nnrg);
+	ZEROS(pvdpz, nnrg);
+	ZEROS(pvdp11, nnrg);
+	ZEROS(pvdp22, nnrg);
+	ZEROS(pvdp33, nnrg);
+	ZEROS(pvdp12, nnrg);
+	ZEROS(pvdp13, nnrg);
+	ZEROS(pvdp23, nnrg);
 
 
-    const double delta_r = ORB.dr_uniform;
-    // it's a uniform grid to save orbital values, so the delta_r is a constant.
-    const int max_size = GridT.max_atom;
-    // how many meshcells in bigcell.
-    const int bxyz = GridT.bxyz;
-
-        double*** dr;// vectors between atom and grid: [bxyz, maxsize, 3]
-        double** distance; // distance between atom and grid: [bxyz, maxsize]
-        double*** psir_ylm;
-        bool** cal_flag;
-        double* ylma;
-        double*** dphi_x;
-        double*** dphi_y;
-        double*** dphi_z;
-    if(max_size!=0)
-    {
-        dr = new double**[bxyz];
-        distance = new double*[bxyz];
-        psir_ylm = new double**[bxyz];
-        cal_flag = new bool*[bxyz];
-                dphi_x = new double**[bxyz];
-                dphi_y = new double**[bxyz];
-                dphi_z = new double**[bxyz];
+	const double delta_r = ORB.dr_uniform;
+	// it's a uniform grid to save orbital values, so the delta_r is a constant.
+	const int max_size = GridT.max_atom;
+	// how many meshcells in bigcell.
+	const int bxyz = GridT.bxyz;
 
-        // mohan fix bug 2011-05-02
-        int nn = 0;
-        for(int it=0; it<ucell.ntype; it++)
-        {
-            nn = max(nn, (ucell.atoms[it].nwl+1)*(ucell.atoms[it].nwl+1));
-        }
-        ylma = new double[nn];
-        ZEROS(ylma, nn);
+	double*** dr;// vectors between atom and grid: [bxyz, maxsize, 3]
+	double** distance; // distance between atom and grid: [bxyz, maxsize]
+	double*** psir_ylm;
+	bool** cal_flag;
+	double* ylma;
+	double*** dphi_x;
+	double*** dphi_y;
+	double*** dphi_z;
 
-        for(int i=0; i<bxyz; i++)
-        {
-            dr[i] = new double*[max_size];
-            psir_ylm[i] = new double*[max_size];
-            distance[i] = new double[max_size];
-            cal_flag[i] = new bool[max_size];
-                        dphi_x[i] = new double*[max_size];
-                        dphi_y[i] = new double*[max_size];
-                        dphi_z[i] = new double*[max_size];
+	if(max_size!=0)
+	{
+		dr = new double**[bxyz];
+		distance = new double*[bxyz];
+		psir_ylm = new double**[bxyz];
+		cal_flag = new bool*[bxyz];
+		dphi_x = new double**[bxyz];
+		dphi_y = new double**[bxyz];
+		dphi_z = new double**[bxyz];
 
-            ZEROS(distance[i], max_size);
-            ZEROS(cal_flag[i], max_size);
+		// mohan fix bug 2011-05-02
+		int nn = 0;
+		for(int it=0; it<ucell.ntype; it++)
+		{
+			nn = max(nn, (ucell.atoms[it].nwl+1)*(ucell.atoms[it].nwl+1));
+		}
+		ylma = new double[nn];
+		ZEROS(ylma, nn);
 
-            for(int j=0; j<max_size; j++)
-            {
-                dr[i][j] = new double[3];
-                psir_ylm[i][j] = new double[ucell.nwmax];
-                                dphi_x[i][j] = new double[ucell.nwmax];
-                                dphi_y[i][j] = new double[ucell.nwmax];
-                                dphi_z[i][j] = new double[ucell.nwmax];
-                ZEROS(dr[i][j],3);
-                ZEROS(psir_ylm[i][j],ucell.nwmax);
-                ZEROS(dphi_x[i][j],ucell.nwmax);
-                ZEROS(dphi_y[i][j],ucell.nwmax);
-                ZEROS(dphi_z[i][j],ucell.nwmax);
-            }
-        }
-    }
+		for(int i=0; i<bxyz; i++)
+		{
+			dr[i] = new double*[max_size];
+			psir_ylm[i] = new double*[max_size];
+			distance[i] = new double[max_size];
+			cal_flag[i] = new bool[max_size];
+			dphi_x[i] = new double*[max_size];
+			dphi_y[i] = new double*[max_size];
+			dphi_z[i] = new double*[max_size];
 
-    assert(this->ncxyz!=0);
-    const double dv = ucell.omega/this->ncxyz;
-    int vl_index=0;
-    double* vldr3 = new double[bxyz];
-    ZEROS(vldr3, bxyz);
+			ZEROS(distance[i], max_size);
+			ZEROS(cal_flag[i], max_size);
 
-        for(int i=0; i<nbx; i++)
-        {
-                for(int j=0; j<nby; j++)
-                {
-                        for(int k=nbz_start; k<nbz_start+nbz; k++)
-                        {
-                                const int grid_index = (k-nbz_start) + j * nbz + i * nby * nbz;
-                                const int size = GridT.how_many_atoms[ grid_index ];
-                                if(size==0) continue;
-
-                                //---------------------------------
-                                // get the wave functions in this
-                                // grid.
-                                //---------------------------------
-                                this->set_ijk_atom_force(grid_index, size,
-                                psir_ylm, dr, cal_flag,
-                                distance, ylma, delta_r,
-                                dphi_x, dphi_y, dphi_z);
-
-                                int bindex = 0;
-                                // z is the fastest,
-                                for(int ii=0; ii<pw.bx; ii++)
-                                {
-                                        for(int jj=0; jj<pw.by; jj++)
-                                        {
-                                                for(int kk=0; kk<pw.bz; kk++)
-                                                {
-                                                        const int iii = i*pw.bx + ii;
-                                                        const int jjj = j*pw.by + jj;
-                                                        const int kkk = k*pw.bz + kk;
-                                                        vl_index = (kkk-pw.nczp_start) + jjj*pw.nczp + iii*pw.ncy*pw.nczp;
-                                                        vldr3[bindex] = vl[ vl_index ] * dv;
-                                                //        vldr3[bindex] = dv; // for overlap test
-
-                                                        ++bindex;
-                                                }
-                                        }
-                                }
-//cout<<"loop  "<<i<<" "<<j<<" "<<k<<endl;//test
-
-                                this->evaluate_vl_stress(grid_index, size,i,j,k,
-                                        psir_ylm, cal_flag, vldr3, distance,
-                                        dphi_x, dphi_y, dphi_z,
-                                        pvdpx, pvdpy, pvdpz,
-                                        pvdp11, pvdp22, pvdp33, pvdp12, pvdp13, pvdp23, dr, GridT);
-                        }// int k
-                }// int j
-        } // int i
-
-
-        //---------------------------------------
-        // Folding R here
-        //---------------------------------------
-
-
-        //LM.DHloc_fixedR_x
-        this->folding_stress(fvl_dphi, svl_dphi, pvdpx, pvdpy, pvdpz,
-                             pvdp11, pvdp22, pvdp33, pvdp12, pvdp13, pvdp23);
-    
-        delete[] pvdpx;
-        delete[] pvdpy;
-        delete[] pvdpz;
-        delete[] pvdp11;
-        delete[] pvdp22;
-        delete[] pvdp33;
-        delete[] pvdp12;
-        delete[] pvdp13;
-        delete[] pvdp23;
+			for(int j=0; j<max_size; j++)
+			{
+				dr[i][j] = new double[3];
+				psir_ylm[i][j] = new double[ucell.nwmax];
+				dphi_x[i][j] = new double[ucell.nwmax];
+				dphi_y[i][j] = new double[ucell.nwmax];
+				dphi_z[i][j] = new double[ucell.nwmax];
+				ZEROS(dr[i][j],3);
+				ZEROS(psir_ylm[i][j],ucell.nwmax);
+				ZEROS(dphi_x[i][j],ucell.nwmax);
+				ZEROS(dphi_y[i][j],ucell.nwmax);
+				ZEROS(dphi_z[i][j],ucell.nwmax);
+			}
+		}
+	}
 
-    delete[] vldr3;
-    if(max_size!=0)
-    {
-        for(int i=0; i<pw.bxyz; i++)
-        {
-            for(int j=0; j<max_size; j++)
-            {
-                delete[] dr[i][j];
-                delete[] psir_ylm[i][j];
-                                delete[] dphi_x[i][j];
-                                delete[] dphi_y[i][j];
-                                delete[] dphi_z[i][j];
-            }
-            delete[] dr[i];
-            delete[] distance[i];
-            delete[] psir_ylm[i];
-            delete[] cal_flag[i];
-                        delete[] dphi_x[i];
-                        delete[] dphi_y[i];
-                        delete[] dphi_z[i];
-        }
-        delete[] dr;
-        delete[] distance;
-        delete[] psir_ylm;
-                delete[] dphi_x;
-                delete[] dphi_y;
-                delete[] dphi_z;
-        delete[] cal_flag;
+	assert(this->ncxyz!=0);
+	const double dv = ucell.omega/this->ncxyz;
+	int vl_index=0;
+	double* vldr3 = new double[bxyz];
+	ZEROS(vldr3, bxyz);
 
-        delete[] ylma;
-    }
-        timer::tick("Gint_k","cal_stress");
-        return;
+	for(int i=0; i<nbx; i++)
+	{
+		for(int j=0; j<nby; j++)
+		{
+			for(int k=nbz_start; k<nbz_start+nbz; k++)
+			{
+				const int grid_index = (k-nbz_start) + j * nbz + i * nby * nbz;
+				const int size = GridT.how_many_atoms[ grid_index ];
+				if(size==0) continue;
+
+				//---------------------------------
+				// get the wave functions in this
+				// grid.
+				//---------------------------------
+				this->set_ijk_atom_force(grid_index, size,
+						psir_ylm, dr, cal_flag,
+						distance, ylma, delta_r,
+						dphi_x, dphi_y, dphi_z);
+
+				int bindex = 0;
+				// z is the fastest,
+				for(int ii=0; ii<pw.bx; ii++)
+				{
+					for(int jj=0; jj<pw.by; jj++)
+					{
+						for(int kk=0; kk<pw.bz; kk++)
+						{
+							const int iii = i*pw.bx + ii;
+							const int jjj = j*pw.by + jj;
+							const int kkk = k*pw.bz + kk;
+							vl_index = (kkk-pw.nczp_start) + jjj*pw.nczp + iii*pw.ncy*pw.nczp;
+							vldr3[bindex] = vl[ vl_index ] * dv;
+							//        vldr3[bindex] = dv; // for overlap test
+
+							++bindex;
+						}
+					}
+				}
+				//cout<<"loop  "<<i<<" "<<j<<" "<<k<<endl;//test
+
+				this->evaluate_vl_stress(grid_index, size,i,j,k,
+						psir_ylm, cal_flag, vldr3, distance,
+						dphi_x, dphi_y, dphi_z,
+						pvdpx, pvdpy, pvdpz,
+						pvdp11, pvdp22, pvdp33, pvdp12, pvdp13, pvdp23, dr, GridT);
+			}// int k
+		}// int j
+	} // int i
+
+
+	//---------------------------------------
+	// Folding R here
+	//---------------------------------------
+
+	//LM.DHloc_fixedR_x
+	this->folding_stress(fvl_dphi, svl_dphi, pvdpx, pvdpy, pvdpz,
+			pvdp11, pvdp22, pvdp33, pvdp12, pvdp13, pvdp23);
+
+	delete[] pvdpx;
+	delete[] pvdpy;
+	delete[] pvdpz;
+	delete[] pvdp11;
+	delete[] pvdp22;
+	delete[] pvdp33;
+	delete[] pvdp12;
+	delete[] pvdp13;
+	delete[] pvdp23;
+
+	delete[] vldr3;
+	if(max_size!=0)
+	{
+		for(int i=0; i<pw.bxyz; i++)
+		{
+			for(int j=0; j<max_size; j++)
+			{
+				delete[] dr[i][j];
+				delete[] psir_ylm[i][j];
+				delete[] dphi_x[i][j];
+				delete[] dphi_y[i][j];
+				delete[] dphi_z[i][j];
+			}
+			delete[] dr[i];
+			delete[] distance[i];
+			delete[] psir_ylm[i];
+			delete[] cal_flag[i];
+			delete[] dphi_x[i];
+			delete[] dphi_y[i];
+			delete[] dphi_z[i];
+		}
+		delete[] dr;
+		delete[] distance;
+		delete[] psir_ylm;
+		delete[] dphi_x;
+		delete[] dphi_y;
+		delete[] dphi_z;
+		delete[] cal_flag;
+
+		delete[] ylma;
+	}
+	timer::tick("Gint_k","cal_stress");
+	return;
 }
 
 
-void Gint_k::evaluate_vl_stress(const int &grid_index, const int &size, const int &i, const int &j, const int &k,
-	double*** psir_ylm, bool** cal_flag, double* vldr3, double** distance,
-	double*** dphi_x, double*** dphi_y, double*** dphi_z,
-	double* pvdpx, double* pvdpy, double* pvdpz, 
-        double* pvdp11, double* pvdp22, double* pvdp33, double* pvdp12, double* pvdp13, double* pvdp23, double*** dr,
+void Gint_k::evaluate_vl_stress(
+	const int &grid_index, 
+	const int &size, 
+	const int &i, 
+	const int &j, 
+	const int &k,
+	double*** psir_ylm, 
+	bool** cal_flag, 
+	double* vldr3, 
+	double** distance,
+	double*** dphi_x, 
+	double*** dphi_y, 
+	double*** dphi_z,
+	double* pvdpx, 
+	double* pvdpy, 
+	double* pvdpz, 
+	double* pvdp11, 
+	double* pvdp22, 
+	double* pvdp33, 
+	double* pvdp12, 
+	double* pvdp13, 
+	double* pvdp23, 
+	double*** dr,
 	const Grid_Technique &gt)
 {
 
@@ -950,10 +968,22 @@ void Gint_k::evaluate_vl_force(const int &grid_index, const int &size, const int
         return;
 }
 
-void Gint_k::set_ijk_atom_force(const int &grid_index, const int &size,
-	double*** psir_ylm, double*** dr, bool** cal_flag, 
-	double** distance, double* ylma, const double &delta_r,
-	double*** dphi_x, double*** dphi_y, double*** dphi_z)
+
+// PLEASE be aware that the 'set_ijk' subroutines should be refactored,
+// since they are called every time a grid integral is needed
+// mohan add 2021-03-28
+void Gint_k::set_ijk_atom_force(
+	const int &grid_index, 
+	const int &size,
+	double*** psir_ylm, 
+	double*** dr, 
+	bool** cal_flag, 
+	double** distance, 
+	double* ylma, 
+	const double &delta_r,
+	double*** dphi_x, 
+	double*** dphi_y, 
+	double*** dphi_z)
 {
 	const Numerical_Orbital_Lm* pointer;
 	double mt[3];
@@ -1008,12 +1038,12 @@ void Gint_k::set_ijk_atom_force(const int &grid_index, const int &size,
             //-------------------------------------------------
             // Here we can not deal with the situation on
             // r = 0, so if r = 0,  r-->1e-9
-            //-------------------------------------------------
+			//-------------------------------------------------
 
-                        if (distance[ib][id] < 1e-9)    // pengfei Li add 2016-3-3
-                        {
-                            distance[ib][id] = 1e-9;
-                        }
+			if (distance[ib][id] < 1e-9)    // pengfei Li add 2016-3-3
+			{
+				distance[ib][id] = 1e-9;
+			}
 
 			// these parameters are about interpolation
 			// because once we know the distance from atom to grid point,

From 80af4bc1def078d3c5046cf2028d1f1819555dd1 Mon Sep 17 00:00:00 2001
From: zdy <dyzheng@mail.ustc.edu.cn>
Date: Tue, 30 Mar 2021 10:03:50 +0800
Subject: [PATCH 05/60] fixed bug in SOC and banned hpseps+gamma_only line

---
 ABACUS.develop/source/input.cpp            | 3 ++-
 ABACUS.develop/source/src_pw/hamilt_pw.cpp | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/ABACUS.develop/source/input.cpp b/ABACUS.develop/source/input.cpp
index af16488f3f..91afa98ace 100644
--- a/ABACUS.develop/source/input.cpp
+++ b/ABACUS.develop/source/input.cpp
@@ -2646,7 +2646,8 @@ void Input::Check(void)
 			else if (ks_solver == "hpseps")
 			{
 #ifdef __MPI
-				ofs_warning << "It's a good choice to use hpseps!" << endl;
+				ofs_warning << "It's not a good choice to use hpseps!" << endl;
+				if(gamma_only) WARNING_QUIT("Input","hpseps can not be used for gamma_only.");
 #else
 				WARNING_QUIT("Input","hpseps can not be used for series version.");
 #endif
diff --git a/ABACUS.develop/source/src_pw/hamilt_pw.cpp b/ABACUS.develop/source/src_pw/hamilt_pw.cpp
index 702e12382c..ef47f13b81 100644
--- a/ABACUS.develop/source/src_pw/hamilt_pw.cpp
+++ b/ABACUS.develop/source/src_pw/hamilt_pw.cpp
@@ -122,7 +122,7 @@ void Hamilt_PW::cinitcgg(
     ComplexMatrix hvec(nstart,n_band);
 	int dmin,dmax;
 	const int npw = kv.ngk[ik];
-	if(!NONCOLIN)
+	if(NSPIN != 4)
 	{
 		dmin= npw;
 		dmax = wf.npwx;

From 610cfe01a8b64e04a4ce290d47c56e31d3da3722 Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Tue, 30 Mar 2021 21:34:19 +0800
Subject: [PATCH 06/60] add comments in LCAO_gen_fixedH

---
 ABACUS.develop/source/src_lcao/LCAO_gen_fixedH.cpp | 2 ++
 ABACUS.develop/source/src_lcao/ORB_gen_tables.h    | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/ABACUS.develop/source/src_lcao/LCAO_gen_fixedH.cpp b/ABACUS.develop/source/src_lcao/LCAO_gen_fixedH.cpp
index ae2b14d3fe..3de837ddda 100644
--- a/ABACUS.develop/source/src_lcao/LCAO_gen_fixedH.cpp
+++ b/ABACUS.develop/source/src_lcao/LCAO_gen_fixedH.cpp
@@ -123,6 +123,8 @@ void LCAO_gen_fixedH::build_ST_new(const char& dtype, const bool& calc_deri)
 							complex<double> *olm2 = &olm1[0];
 							if(!calc_deri)
 							{
+								// PLEASE use UOT as an input parameter of this subroutine
+								// mohan add 2021-03-30
 								UOT.snap_psipsi( olm, 0, dtype, tau1, 
 										T1, L1, m1, N1, GridD.getAdjacentTau(ad), 
 										T2, L2, m2, N2,
diff --git a/ABACUS.develop/source/src_lcao/ORB_gen_tables.h b/ABACUS.develop/source/src_lcao/ORB_gen_tables.h
index 487cf48ead..05b5c07906 100644
--- a/ABACUS.develop/source/src_lcao/ORB_gen_tables.h
+++ b/ABACUS.develop/source/src_lcao/ORB_gen_tables.h
@@ -103,6 +103,8 @@ class ORB_gen_tables
 
 };
 
+// PLEASE try to get rid of UOT, which is a global variable
+// mohan add 2021-03-30
 extern ORB_gen_tables UOT;
 
 #endif

From bc6b231b05811f34b33a9ab00e648f83a0fdf7c0 Mon Sep 17 00:00:00 2001
From: qianrui <Terry_Liu@pku.edu.cn>
Date: Wed, 31 Mar 2021 13:42:25 +0800
Subject: [PATCH 07/60] modify h_psi to make it able to calculate all bands at
 once

---
 ABACUS.develop/source/src_pw/hamilt_pw.cpp | 261 ++++++++++++++-------
 ABACUS.develop/source/src_pw/hamilt_pw.h   |   6 +-
 2 files changed, 179 insertions(+), 88 deletions(-)

diff --git a/ABACUS.develop/source/src_pw/hamilt_pw.cpp b/ABACUS.develop/source/src_pw/hamilt_pw.cpp
index a9f9edfc96..732a755083 100644
--- a/ABACUS.develop/source/src_pw/hamilt_pw.cpp
+++ b/ABACUS.develop/source/src_pw/hamilt_pw.cpp
@@ -13,7 +13,6 @@ Hamilt_PW::Hamilt_PW()
     spsi = new complex<double>[1];
     GR_index = new int[1];
     Bec = new complex<double>[1];
-    Ps = new complex<double>[1];
 }
 
 Hamilt_PW::~Hamilt_PW()
@@ -26,7 +25,6 @@ Hamilt_PW::~Hamilt_PW()
     delete[] spsi;
     delete[] GR_index;
     delete[] Bec;
-    delete[] Ps;
 }
 
 
@@ -38,13 +36,11 @@ void Hamilt_PW::init(const int &npwx, const int &npol, const int &nkb, const int
     delete[] spsi;
     delete[] GR_index;
     delete[] Bec;
-    delete[] Ps;
 
     this->hpsi = new complex<double> [npwx * npol];
     this->spsi = new complex<double> [npwx * npol];
     this->GR_index = new int[nrxx];
     this->Bec = new complex<double> [nkb];
-    this->Ps  = new complex<double> [nkb * npol];
 
     ZEROS(this->hpsi, npwx * npol);
     ZEROS(this->spsi, npwx * npol);
@@ -128,16 +124,19 @@ void Hamilt_PW::cinitcgg(
 	complex<double> *aux=new complex<double> [dmax*nstart];
 	complex<double> *paux = aux;
 	complex<double> *ppsi = psi.c;
-	for(int m=0;m<nstart;++m)
-	{
-		this->h_psi(ppsi, paux);
-		paux += dmax;
-		ppsi += dmax;
-	}
+	//qianrui replace it
+	this->h_psi(psi.c, aux, nstart);
+	//for(int m=0;m<nstart;++m)
+	//{
+	//	this->h_psi(ppsi, paux);
+	//	paux += dmax;
+	//	ppsi += dmax;
+	//}
 	char trans1 = 'C';
 	char trans2 = 'N';
 	zgemm_(&trans1,&trans2,&nstart,&nstart,&dmin,&ONE,psi.c,&dmax,aux,&dmax,&ZERO,hc.c,&nstart);
 	hc=transpose(hc,false);
+
 	zgemm_(&trans1,&trans2,&nstart,&nstart,&dmin,&ONE,psi.c,&dmax,psi.c,&dmax,&ZERO,sc.c,&nstart);
 	sc=transpose(sc,false);
 	//After psis are strictly normalized, we should use this part. 
@@ -455,103 +454,155 @@ void Hamilt_PW::s_1psi
 }
 
 
-void Hamilt_PW::h_psi(const complex<double> *psi_in, complex<double> *hpsi)
+void Hamilt_PW::h_psi(const complex<double> *psi_in, complex<double> *hpsi, const int m)
 {
     timer::tick("Hamilt_PW","h_psi",'H');
     int i = 0;
     int j = 0;
     int ig= 0;
 
-	if(NSPIN!=4) ZEROS(hpsi, wf.npw);
-	else ZEROS(hpsi, wf.npwx * NPOL);//added by zhengdy-soc
+	//if(NSPIN!=4) ZEROS(hpsi, wf.npw);
+	//else ZEROS(hpsi, wf.npwx * NPOL);//added by zhengdy-soc
+	int dmax = wf.npwx * NPOL;
 
 	//------------------------------------
 	//(1) the kinetical energy.
 	//------------------------------------
+	complex<double> *tmhpsi;
+	const complex<double> *tmpsi_in;
  	if(T_IN_H)
 	{	
-		for (ig = 0;ig < wf.npw;ig++)
-		{
-			hpsi[ig] = wf.g2kin[ig] * psi_in[ig];
-		}
-		//added by zhengdy-soc
-		if(NSPIN==4)
+		tmhpsi = hpsi;
+		tmpsi_in = psi_in;
+		for(int ib = 0 ; ib < m; ++ib)
 		{
-			for (ig = wf.npwx;ig < wf.npw + wf.npwx;ig++)
+			for(ig = 0;ig < wf.npw; ++ig)
 			{
-				hpsi[ig] = wf.g2kin[ig - wf.npwx] * psi_in[ig];
+				tmhpsi[ig] = wf.g2kin[ig] * tmpsi_in[ig];
+			}
+			if(NSPIN==4){
+				for(ig=wf.npw; ig < wf.npwx; ++ig)
+				{
+					tmhpsi[ig] = 0;
+				}
+				tmhpsi +=wf.npwx;
+				tmpsi_in += wf.npwx;
+				for (ig = 0;ig < wf.npw ;++ig)
+				{
+					tmhpsi[ig] = wf.g2kin[ig] * tmpsi_in[ig];
+				}
+				for(ig=wf.npw; ig < wf.npwx; ++ig)
+				{
+					tmhpsi[ig] =0;
+				}
 			}
+			tmhpsi += wf.npwx;
+			tmpsi_in += wf.npwx;
 		}
 	}
 
 	//------------------------------------
 	//(2) the local potential.
-	//------------------------------------
+	//-----------------------------------
+	timer::tick("Hamilt_PW","vloc",'H');
 	if(VL_IN_H)
 	{
-		if(NSPIN!=4)
+		tmhpsi = hpsi;
+		tmpsi_in = psi_in;
+		for(int ib = 0 ; ib < m; ++ib)
 		{
-			ZEROS( UFFT.porter, pw.nrxx);
-			UFFT.RoundTrip( psi_in, pot.vrs1, GR_index, UFFT.porter );
+			if(NSPIN!=4){
+				ZEROS( UFFT.porter, pw.nrxx);
+				UFFT.RoundTrip( tmpsi_in, pot.vrs1, GR_index, UFFT.porter );
 
-			for (j = 0;j < wf.npw;j++)
-			{
-				hpsi[j] += UFFT.porter[ GR_index[j] ];
-			}
-		}
-		else
-		{
-			complex<double>* porter1 = new complex<double>[pw.nrxx];
-			ZEROS( UFFT.porter, pw.nrxx);
-			ZEROS( porter1, pw.nrxx);
-			for (int ig=0; ig< wf.npw; ig++)
-			{
-				UFFT.porter[ GR_index[ig]  ] = psi_in[ig];
-				porter1[ GR_index[ig]  ] = psi_in[ig + wf.npwx];
+				for (j = 0;j < wf.npw;j++)
+				{
+					tmhpsi[j] += UFFT.porter[ GR_index[j] ];
+				}
 			}
-			// (2) fft to real space and doing things.
-			pw.FFT_wfc.FFT3D( UFFT.porter, 1);
-			pw.FFT_wfc.FFT3D( porter1, 1);
-			complex<double> sup,sdown;
-			for (int ir=0; ir< pw.nrxx; ir++)
+			else
 			{
-				sup = UFFT.porter[ir] * (pot.vrs(0,ir) + pot.vrs(3,ir)) +
-					porter1[ir] * (pot.vrs(1,ir) - complex<double>(0.0,1.0) * pot.vrs(2,ir));
-				sdown = porter1[ir] * (pot.vrs(0,ir) - pot.vrs(3,ir)) +
-				UFFT.porter[ir] * (pot.vrs(1,ir) + complex<double>(0.0,1.0) * pot.vrs(2,ir));
-				UFFT.porter[ir] = sup;
-				porter1[ir] = sdown;
-			}
-			// (3) fft back to G space.
-			pw.FFT_wfc.FFT3D( UFFT.porter, -1);
-			pw.FFT_wfc.FFT3D( porter1, -1);
+				complex<double>* porter1 = new complex<double>[pw.nrxx];
+				ZEROS( UFFT.porter, pw.nrxx);
+				ZEROS( porter1, pw.nrxx);
+				for (int ig=0; ig< wf.npw; ig++)
+				{
+					UFFT.porter[ GR_index[ig]  ] = tmpsi_in[ig];
+					porter1[ GR_index[ig]  ] = tmpsi_in[ig + wf.npwx];
+				}
+				// (2) fft to real space and doing things.
+				pw.FFT_wfc.FFT3D( UFFT.porter, 1);
+				pw.FFT_wfc.FFT3D( porter1, 1);
+				complex<double> sup,sdown;
+				for (int ir=0; ir< pw.nrxx; ir++)
+				{
+					sup = UFFT.porter[ir] * (pot.vrs(0,ir) + pot.vrs(3,ir)) +
+						porter1[ir] * (pot.vrs(1,ir) - complex<double>(0.0,1.0) * pot.vrs(2,ir));
+					sdown = porter1[ir] * (pot.vrs(0,ir) - pot.vrs(3,ir)) +
+					UFFT.porter[ir] * (pot.vrs(1,ir) + complex<double>(0.0,1.0) * pot.vrs(2,ir));
+					UFFT.porter[ir] = sup;
+					porter1[ir] = sdown;
+				}
+				// (3) fft back to G space.
+				pw.FFT_wfc.FFT3D( UFFT.porter, -1);
+				pw.FFT_wfc.FFT3D( porter1, -1);
 
-			for (j = 0;j < wf.npw;j++)
-			{
-				hpsi[j] += UFFT.porter[ GR_index[j] ];
-				hpsi[j+wf.npwx] += porter1[ GR_index[j] ];
+				for (j = 0;j < wf.npw;j++)
+				{
+					tmhpsi[j] += UFFT.porter[ GR_index[j] ];
+					tmhpsi[j+wf.npwx] += porter1[ GR_index[j] ];
+				}
+				delete[] porter1;
 			}
-			delete[] porter1;
+			tmhpsi += dmax;
+			tmpsi_in += dmax;
 		}
 	}
+	timer::tick("Hamilt_PW","vloc",'H');
 
 	//------------------------------------
 	// (3) the nonlocal pseudopotential.
 	//------------------------------------
+	timer::tick("Hamilt_PW","vnl",'H');
 	if(VNL_IN_H)
 	{
 		if ( ppcell.nkb > 0)
 		{
 			//<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-			//qianrui improve 2021-3-16
+			//qianrui optimize 2021-3-31
 			int nkb=ppcell.nkb;
-			ComplexMatrix becp(NPOL,nkb,false);
+			ComplexMatrix becp(NPOL * m, nkb, false);
 			char transa = 'C';
 			char transb = 'N';
-			zgemm_(&transa,&transb,&nkb,&NPOL,&wf.npw,&ONE,ppcell.vkb.c,&wf.npwx,psi_in,&wf.npwx,&ZERO,becp.c,&nkb);
-			becp=transpose(becp,false);
-			Parallel_Reduce::reduce_complex_double_pool( becp.c, ppcell.nkb * NPOL);
-			this->add_vuspsi(hpsi, becp.c);
+			if(m==1 && NPOL==1)
+			{
+				int inc = 1;
+				zgemv_(&transa, &wf.npw, &nkb, &ONE, ppcell.vkb.c, &wf.npwx, psi_in, &inc, &ZERO, becp.c, &inc);
+			}
+			else
+			{
+				int npm = NPOL * m;
+				zgemm_(&transa,&transb,&nkb,&npm,&wf.npw,&ONE,ppcell.vkb.c,&wf.npwx,psi_in,&wf.npwx,&ZERO,becp.c,&nkb);
+				//add_vuspsi is modified, thus the transpose is not needed here.
+				//if(NONCOLIN)
+				//{
+				//	ComplexMatrix partbecp(NPOL, nkb ,false);
+				//	for(int ib = 0; ib < m; ++ib)
+				//	{
+//
+				//		for ( i = 0;i < NPOL;i++)
+				//			for (j = 0;j < nkb;j++)
+				//				partbecp(i, j) = tmbecp[i*nkb+j];
+				//		for (j = 0; j < nkb; j++)
+				//			for (i = 0;i < NPOL;i++)
+				//				tmbecp[j*NPOL+i] = partbecp(i, j);
+				//		tmbecp += NPOL * nkb;
+				//	}
+				//}
+			}
+
+			Parallel_Reduce::reduce_complex_double_pool( becp.c, nkb * NPOL * m);
+			this->add_vuspsi(hpsi, becp.c, m);
 			//======================================================================
 			/*complex<double> *becp = new complex<double>[ ppcell.nkb * NPOL ];
 			ZEROS(becp,ppcell.nkb * NPOL);
@@ -575,21 +626,25 @@ void Hamilt_PW::h_psi(const complex<double> *psi_in, complex<double> *hpsi)
 			//>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
 		}
 	}
-
+	timer::tick("Hamilt_PW","vnl",'H');
     timer::tick("Hamilt_PW","h_psi",'H');
     return;
 }
 
-void Hamilt_PW::add_vuspsi(complex<double> *hpsi_in,const complex<double> *becp)
+void Hamilt_PW::add_vuspsi(complex<double> *hpsi_in,const complex<double> *becp, const int m)
 {
     timer::tick("Hamilt_PW","add_vuspsi",'I');
-    ZEROS( Ps, ppcell.nkb * NPOL );
+	int nkb = ppcell.nkb;
+	complex<double> *Ps  = new complex<double> [nkb * NPOL * m];
+    ZEROS( Ps, NPOL * m * nkb);
 
     int sum = 0;
     int iat = 0;
     // this function sum up each non-local pseudopotential located in each atom,
     // all we need to do is put the right Dij coefficient to each becp, which
     // is calculated before.
+    if(NSPIN!=4)
+	{
     for (int it=0; it<ucell.ntype; it++)
     {
         const int Nprojs = ucell.atoms[it].nh;
@@ -602,29 +657,58 @@ void Hamilt_PW::add_vuspsi(complex<double> *hpsi_in,const complex<double> *becp)
             {
                 for (int ip2=0; ip2<Nprojs; ip2++)
                 {
-					if(NSPIN!=4)
-						this->Ps[sum+ip2] += ppcell.deeq(CURRENT_SPIN, iat, ip, ip2) * becp[sum+ip];
-					else
+					for(int ib = 0; ib < m ; ++ib)
 					{
-						this->Ps[sum+ ip2*2] += ppcell.deeq_nc(0, iat, ip2, ip) * becp[sum+ip*2]
-							+ppcell.deeq_nc(1, iat, ip2, ip) * becp[sum+ip*2+1];
-						this->Ps[sum+ ip2*2+1] += ppcell.deeq_nc(2, iat, ip2, ip) * becp[sum+ip*2]
-							+ppcell.deeq_nc(3, iat, ip2, ip) * becp[sum+ip*2+1];
-					}
-				}// end ih
+						Ps[(sum + ip2) * m + ib] += ppcell.deeq(CURRENT_SPIN, iat, ip, ip2) * becp[ib * nkb + sum + ip];
+					}//end ib
+                }// end ih
+            }//end jh 
+			sum += Nprojs;
+			++iat;
+        } //end na
+    } //end nt
+	}
+	else
+	{
+	for (int it=0; it<ucell.ntype; it++)
+    {
+		int psind,becpind;
+		complex<double> becp1,becp2;
+        const int Nprojs = ucell.atoms[it].nh;
+        for (int ia=0; ia<ucell.atoms[it].na; ia++)
+        {
+            // each atom has Nprojs, means this is with structure factor;
+            // each projector (each atom) must multiply coefficient
+            // with all the other projectors.
+            for (int ip=0; ip<Nprojs; ip++)
+            {
+                for (int ip2=0; ip2<Nprojs; ip2++)
+                {
+					for(int ib = 0; ib < m ; ++ib)
+					{
+						psind = (sum+ip2) * 2 * m + ib * 2;
+						becpind = ib*nkb*2 + sum + ip;
+						becp1 =  becp[becpind];
+						becp2 =  becp[becpind + nkb];
+						Ps[psind] += ppcell.deeq_nc(0, iat, ip2, ip) * becp1
+							+ppcell.deeq_nc(1, iat, ip2, ip) * becp2;
+						Ps[psind +1] += ppcell.deeq_nc(2, iat, ip2, ip) * becp1
+							+ppcell.deeq_nc(3, iat, ip2, ip) * becp2;
+					}//end ib
+                }// end ih
             }//end jh
-		if(NSPIN!=4) sum += Nprojs;
-		else sum += 2 * Nprojs;
-		++iat;
+		 	sum += 2 * Nprojs;
+			++iat;
         } //end na
     } //end nt
+	}
 
 	/*
     for (int ig=0;ig<wf.npw;ig++)
     {
         for (int i=0;i< ppcell.nkb;i++)
         {
-            hpsi_in[ig]+=this->Ps[i]*ppcell.vkb(i,ig);
+            hpsi_in[ig]+=Ps[i]*ppcell.vkb(i,ig);
         }
     }
 	*/
@@ -632,10 +716,19 @@ void Hamilt_PW::add_vuspsi(complex<double> *hpsi_in,const complex<double> *becp)
 
 	// use simple method.
 	//<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-	//qianrui improve 2021-3-16
+	//qianrui optimize 2021-3-31
 	char transa = 'N';
 	char transb = 'T';
-	zgemm_(&transa,&transb,&wf.npw,&NPOL,&ppcell.nkb,&ONE,ppcell.vkb.c,&wf.npwx,Ps,&NPOL,&ONE,hpsi_in,&wf.npwx);
+	if(NPOL==1 && m==1)
+	{
+		int inc = 1;
+		zgemv_(&transa, &wf.npw, &ppcell.nkb, &ONE, ppcell.vkb.c, &wf.npwx, Ps, &inc, &ONE, hpsi_in, &inc);
+	}
+	else
+	{
+		int npm = NPOL*m;
+		zgemm_(&transa,&transb,&wf.npw,&npm,&ppcell.nkb,&ONE,ppcell.vkb.c,&wf.npwx,Ps,&npm,&ONE,hpsi_in,&wf.npwx);
+	}
 	//======================================================================
 	/*if(!NONCOLIN)
 	for(int i=0; i<ppcell.nkb; i++)
@@ -664,7 +757,7 @@ void Hamilt_PW::add_vuspsi(complex<double> *hpsi_in,const complex<double> *becp)
 		}
 	}*/
 	//>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-
+	delete []Ps;
     timer::tick("Hamilt_PW","add_vuspsi",'I');
     return;
 }
diff --git a/ABACUS.develop/source/src_pw/hamilt_pw.h b/ABACUS.develop/source/src_pw/hamilt_pw.h
index 3e2e28af9b..368eaf712b 100644
--- a/ABACUS.develop/source/src_pw/hamilt_pw.h
+++ b/ABACUS.develop/source/src_pw/hamilt_pw.h
@@ -47,7 +47,7 @@ class Hamilt_PW
         complex<double> *hpsi,
         complex<double> *spsi);
 
-    void h_psi( const complex<double> *psi, complex<double> *hpsi);
+    void h_psi( const complex<double> *psi, complex<double> *hpsi, const int m = 1); // qianrui add a default parameter 2021-3-31
 
     void s_1psi(
         const int npw,
@@ -63,11 +63,9 @@ class Hamilt_PW
     // hpsi , spsi
     complex<double> *hpsi;
     complex<double> *spsi;
-
     complex<double> *Bec;
-    complex<double> *Ps;
 
-    void add_vuspsi(complex<double> *hpsi, const complex<double> *becp);
+    void add_vuspsi(complex<double> *hpsi, const complex<double> *becp, const int m);
 
 	private:
 

From 04c6fe2ee04d7508181bbd38b1eddfaf559984ec Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Wed, 31 Mar 2021 20:46:12 +0800
Subject: [PATCH 08/60] add ORB_api in src_external

---
 .../source/src_external/ORB_api/Makefile      | 60 +++++++++++++++++++
 .../src_external/ORB_api/Makefile.Objects     | 28 +++++++++
 .../src_external/ORB_api/Makefile.system      | 14 +++++
 .../source/src_external/ORB_api/Makefile.vars | 31 ++++++++++
 .../source/src_external/ORB_api/main.cpp      | 44 ++++++++++++++
 ABACUS.develop/source/src_pw/charge_extra.cpp |  6 ++
 6 files changed, 183 insertions(+)
 create mode 100644 ABACUS.develop/source/src_external/ORB_api/Makefile
 create mode 100644 ABACUS.develop/source/src_external/ORB_api/Makefile.Objects
 create mode 100644 ABACUS.develop/source/src_external/ORB_api/Makefile.system
 create mode 100644 ABACUS.develop/source/src_external/ORB_api/Makefile.vars
 create mode 100644 ABACUS.develop/source/src_external/ORB_api/main.cpp

diff --git a/ABACUS.develop/source/src_external/ORB_api/Makefile b/ABACUS.develop/source/src_external/ORB_api/Makefile
new file mode 100644
index 0000000000..7f01dfeb7c
--- /dev/null
+++ b/ABACUS.develop/source/src_external/ORB_api/Makefile
@@ -0,0 +1,60 @@
+# This is the Makefile of ABACUS
+
+include Makefile.system
+include Makefile.Objects
+
+VPATH=../../src_global\
+:../../src_lcao\
+:./\
+
+#==========================
+# Define HONG
+#==========================
+HONG=-D__MPI -DMETIS -DMKL_ILP64
+
+#==========================
+# OPTIMIZE OPTIONS
+#==========================
+OPTS_GDB = -g -W 
+
+#==========================
+# OBJECTS NEEDED
+#==========================
+FP_OBJS_0=$(OBJS_ORBITAL)\
+$(OBJS_GLOBAL)\
+main.o\
+
+FP_OBJS=$(patsubst %.o, ${OBJ_DIR}/%.o, ${FP_OBJS_0})
+PDIAG_OBJS=$(patsubst %.o, ${OBJ_DIR}/%.o, ${OBJS_PDIAG})
+PDIAG_MR=$(patsubst %.o, ${OBJ_DIR}/%.o, ${PDIAG_MR_0})
+
+#==========================
+# MAKING OPTIONS
+#==========================
+fp_mpi : 
+	@ make init
+	@ make -j $(NP) parallel
+
+init :
+	@ if [ ! -d $(OBJ_DIR) ]; then mkdir $(OBJ_DIR); fi
+	@ if [ ! -d $(OBJ_DIR)/README ]; then echo "This directory contains all of the .o files" > $(OBJ_DIR)/README; fi
+	@ if [ ! -d ../bin ]; then mkdir ../bin; fi
+
+parallel : ${FP_OBJS} ${PDIAG_OBJS} ${PDIAG_MR} ${HEADERS}
+	${CPLUSPLUS_MPI} ${OPTS} ${OPTS_MPI} $(FP_OBJS) ${PDIAG_OBJS} ${PDIAG_MR} ${LIBS} -o  ../bin/${VERSION}.mpi
+
+serial : ${FP_OBJS} ${HEADERS} 
+	${CPLUSPLUS} ${OPTS} $(FP_OBJS) ${LIBS} -o ${VERSION}.fp.x 
+
+#==========================
+# rules
+#==========================
+${OBJ_DIR}/%.o:%.cpp
+	${CPLUSPLUS_MPI} ${OPTS} ${OPTS_MPI} -c ${HONG} $< -o $@
+${OBJ_DIR}/%.o:%.f
+	${FORTRAN} -c ${HONG} $< -o $@	 
+
+.PHONY:clean
+clean:
+	@ if [ -d $(OBJ_DIR) ]; then rm -rf $(OBJ_DIR); fi
+	@ if [ -d ../bin ]; then rm -rf ../bin; fi
diff --git a/ABACUS.develop/source/src_external/ORB_api/Makefile.Objects b/ABACUS.develop/source/src_external/ORB_api/Makefile.Objects
new file mode 100644
index 0000000000..3fd35dbc6d
--- /dev/null
+++ b/ABACUS.develop/source/src_external/ORB_api/Makefile.Objects
@@ -0,0 +1,28 @@
+#
+# This is a test makefile for Electronic-structure
+#
+# This particular makefile defines all the executables and objects
+# files needed, what they depend on, and the compilation defaults.
+# The file makefile.local is included below.
+# That file defines the actual commands to use to run the C++
+# compiler, library options and directories, etc., all of which are
+# machine specific and depend on the local installation.  Hence the name.
+#
+
+VERSION = ABACUS
+HEADERS = *.h
+
+OBJS_ORBITAL=ORB_control.o\
+ORB_read.o\
+ORB_atomic.o\
+ORB_atomic_lm.o\
+ORB_nonlocal.o\
+ORB_nonlocal_lm.o\
+ORB_gaunt_table.o\
+ORB_table_beta.o\
+ORB_table_phi.o\
+ORB_table_alpha.o\
+ORB_gen_tables.o\
+
+OBJS_GLOBAL=timer.o\
+vector3.o\
diff --git a/ABACUS.develop/source/src_external/ORB_api/Makefile.system b/ABACUS.develop/source/src_external/ORB_api/Makefile.system
new file mode 100644
index 0000000000..e6fdeda197
--- /dev/null
+++ b/ABACUS.develop/source/src_external/ORB_api/Makefile.system
@@ -0,0 +1,14 @@
+include Makefile.vars
+
+#==========================
+# LIBS and INCLUDES
+#==========================
+LIBS = -lifcore -lm -lpthread 
+
+INCLUDES = -I. -Icommands 
+
+#==========================
+# OPTIMIZE OPTIONS
+#==========================
+OPTS     = ${INCLUDES} -Ofast -traceback -std=c++11 -simd -march=native -xHost -m64 -qopenmp -Werror -Wall -pedantic -g
+OPTS_MPI = -cxx=${CPLUSPLUS}
diff --git a/ABACUS.develop/source/src_external/ORB_api/Makefile.vars b/ABACUS.develop/source/src_external/ORB_api/Makefile.vars
new file mode 100644
index 0000000000..13f5dac39a
--- /dev/null
+++ b/ABACUS.develop/source/src_external/ORB_api/Makefile.vars
@@ -0,0 +1,31 @@
+FORTRAN       = ifort
+
+CPLUSPLUS      = icpc
+#CPLUSPLUS     = /public/intel2017/bin/icpc
+
+CPLUSPLUS_MPI = mpiicpc
+#CPLUSPLUS_MPI = /public/intel2017/impi/2017.1.132/intel64/bin/mpiicpc
+
+LAPACK_DIR    = $(MKLROOT)
+#LAPACK_DIR = /public/intel2017/compilers_and_libraries_2017.1.132/linux/mkl
+#LAPACK_DIR = $(MKLROOT)
+#LAPACK_DIR    = /public/intel2017/mkl
+
+FFTW_DIR = /home/mohan/1_Software/impi_fftw-3.3.8
+#FFTW_DIR = /home/qianrui/intelcompile/impi_fftw
+#FFTW_DIR       = /public/udata/xiaohui/software/fftw2
+#FFTW_DIR       =/opt/fftw/3.3.6-p12/intel/2017.update4
+#FFTW_DIR      = /public/fftw-3.3.8
+
+BOOST_DIR = /home/mohan/1_Software/impi_boost-1.70.0
+#BOOST_DIR = /home/qianrui/intelcompile/impi_boost
+#BOOST_DIR      = /public/udata/xiaohui/software/boost_1_39_0
+#BOOST_DIR      = /opt/boost/1.64.0
+
+ELPA_DIR = /home/mohan/1_Software/impi_elpa-16.05.005
+#ELPA_DIR = /home/qianrui/intelcompile/impi_elpa
+#ELPA_DIR   = /public/udata/xiaohui/ELPA-2016.05.004
+#ELPA_DIR = /opt/elpa/intel_2017_update4
+
+OBJ_DIR = obj
+NP      = 14
diff --git a/ABACUS.develop/source/src_external/ORB_api/main.cpp b/ABACUS.develop/source/src_external/ORB_api/main.cpp
new file mode 100644
index 0000000000..f13db39ea1
--- /dev/null
+++ b/ABACUS.develop/source/src_external/ORB_api/main.cpp
@@ -0,0 +1,44 @@
+#include "timer.h"
+#include <ctime>
+
+void calculate();
+
+int main(int argc, char **argv)
+{
+
+    calculate();
+
+    return 0;
+}
+
+
+void calculate()
+{
+
+	time_t time_start = std::time(NULL);
+
+//	timer::start();
+
+	//----------------------------------------------------------
+	// main program for doing electronic structure calculations
+	//----------------------------------------------------------
+//	Driver DD;
+//	DD.init();
+
+	time_t	time_finish= std::time(NULL);
+
+	// print out information before ABACUS ends
+	cout << "\n START  Time  : " << ctime(&time_start);
+	cout << " FINISH Time  : " << ctime(&time_finish);
+	cout << " TOTAL  Time  : " << difftime(time_finish, time_start) << endl;
+
+	double total_time = difftime(time_finish, time_start);
+	int hour = total_time / 3600;
+	int mins = ( total_time - 3600 * hour ) / 60;
+	int secs = total_time - 3600 * hour - 60 * mins ;
+	cout << " Total  Time  : " << hour << " h "
+	            << mins << " mins "
+	            << secs << " secs "<< endl;
+
+    return;
+}
diff --git a/ABACUS.develop/source/src_pw/charge_extra.cpp b/ABACUS.develop/source/src_pw/charge_extra.cpp
index 8ef14f105c..de5a355fde 100644
--- a/ABACUS.develop/source/src_pw/charge_extra.cpp
+++ b/ABACUS.develop/source/src_pw/charge_extra.cpp
@@ -14,6 +14,12 @@ Charge_Extra::Charge_Extra()
 	// for second-order extrapolation
 	this->delta_rho3 = new double*[NSPIN];
 
+	// PLEASE update the following lines, because
+	// pw.nrxx may not be initialized yet here:
+	// Charge_Extra is a member of LOOP_ions.
+	// Consider moving the initialization of the
+	// following arrays to somewhere else.
+	// mohan add 2021-03-30
 	for(int is=0; is<NSPIN; is++)
 	{
 		delta_rho1[is] = new double[pw.nrxx];

From 2afd4c789ea5f8493efc9175127ec9cac4e5a997 Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Thu, 1 Apr 2021 09:51:13 +0800
Subject: [PATCH 09/60] update ORB_api and modifies related files in ABACUS

---
 .../source/src_external/ORB_api/Makefile      | 26 ++++++++++---------
 .../src_external/ORB_api/Makefile.Objects     | 11 +++++---
 .../src_external/ORB_api/Makefile.system      |  2 +-
 .../source/src_external/ORB_api/Makefile.vars |  6 ++---
 .../source/src_external/ORB_api/main.cpp      |  5 ++--
 .../source/src_global/complexmatrix.h         |  2 +-
 .../source/src_global/lapack_connector.h      | 17 +-----------
 ABACUS.develop/source/src_global/matrix.h     |  1 +
 ABACUS.develop/source/src_global/matrix3.h    |  4 +--
 .../src_global/sph_bessel_recursive-d1.cpp    |  3 +--
 .../src_global/sph_bessel_recursive-d2.cpp    |  3 +--
 ABACUS.develop/source/src_pw/global.h         |  4 +--
 12 files changed, 36 insertions(+), 48 deletions(-)

diff --git a/ABACUS.develop/source/src_external/ORB_api/Makefile b/ABACUS.develop/source/src_external/ORB_api/Makefile
index 7f01dfeb7c..fc57d4f986 100644
--- a/ABACUS.develop/source/src_external/ORB_api/Makefile
+++ b/ABACUS.develop/source/src_external/ORB_api/Makefile
@@ -1,4 +1,4 @@
-# This is the Makefile of ABACUS
+# This is the Makefile of ABACUS-ORB API
 
 include Makefile.system
 include Makefile.Objects
@@ -10,19 +10,22 @@ VPATH=../../src_global\
 #==========================
 # Define HONG
 #==========================
-HONG=-D__MPI -DMETIS -DMKL_ILP64
+HONG= -DMETIS -DMKL_ILP64
 
 #==========================
 # OPTIMIZE OPTIONS
 #==========================
-OPTS_GDB = -g -W 
+OPTS_GDB = -g -W
 
 #==========================
 # OBJECTS NEEDED
 #==========================
-FP_OBJS_0=$(OBJS_ORBITAL)\
-$(OBJS_GLOBAL)\
-main.o\
+#FP_OBJS_0=$(OBJS_ORBITAL)\
+#$(OBJS_GLOBAL)\
+#main.o\
+
+FP_OBJS_0=main.o\
+$(OBJS_ORB)\
 
 FP_OBJS=$(patsubst %.o, ${OBJ_DIR}/%.o, ${FP_OBJS_0})
 PDIAG_OBJS=$(patsubst %.o, ${OBJ_DIR}/%.o, ${OBJS_PDIAG})
@@ -33,18 +36,18 @@ PDIAG_MR=$(patsubst %.o, ${OBJ_DIR}/%.o, ${PDIAG_MR_0})
 #==========================
 fp_mpi : 
 	@ make init
-	@ make -j $(NP) parallel
+	@ make -j $(NP) serial2 
 
 init :
 	@ if [ ! -d $(OBJ_DIR) ]; then mkdir $(OBJ_DIR); fi
 	@ if [ ! -d $(OBJ_DIR)/README ]; then echo "This directory contains all of the .o files" > $(OBJ_DIR)/README; fi
 	@ if [ ! -d ../bin ]; then mkdir ../bin; fi
 
-parallel : ${FP_OBJS} ${PDIAG_OBJS} ${PDIAG_MR} ${HEADERS}
-	${CPLUSPLUS_MPI} ${OPTS} ${OPTS_MPI} $(FP_OBJS) ${PDIAG_OBJS} ${PDIAG_MR} ${LIBS} -o  ../bin/${VERSION}.mpi
-
 serial : ${FP_OBJS} ${HEADERS} 
-	${CPLUSPLUS} ${OPTS} $(FP_OBJS) ${LIBS} -o ${VERSION}.fp.x 
+	${CPLUSPLUS} ${OPTS} $(FP_OBJS) ${LIBS} -o ${VERSION}.x 
+
+serial2 : ${FP_OBJS} 
+	${CPLUSPLUS} ${OPTS} $(FP_OBJS) ${LIBS} -o ${VERSION}.x 
 
 #==========================
 # rules
@@ -57,4 +60,3 @@ ${OBJ_DIR}/%.o:%.f
 .PHONY:clean
 clean:
 	@ if [ -d $(OBJ_DIR) ]; then rm -rf $(OBJ_DIR); fi
-	@ if [ -d ../bin ]; then rm -rf ../bin; fi
diff --git a/ABACUS.develop/source/src_external/ORB_api/Makefile.Objects b/ABACUS.develop/source/src_external/ORB_api/Makefile.Objects
index 3fd35dbc6d..5cc70d29c3 100644
--- a/ABACUS.develop/source/src_external/ORB_api/Makefile.Objects
+++ b/ABACUS.develop/source/src_external/ORB_api/Makefile.Objects
@@ -9,8 +9,10 @@
 # machine specific and depend on the local installation.  Hence the name.
 #
 
-VERSION = ABACUS
-HEADERS = *.h
+VERSION= ABACUS-ORB
+HEADERS= *.h
+
+OBJS_ORB=ORB_read.o\
 
 OBJS_ORBITAL=ORB_control.o\
 ORB_read.o\
@@ -24,5 +26,6 @@ ORB_table_phi.o\
 ORB_table_alpha.o\
 ORB_gen_tables.o\
 
-OBJS_GLOBAL=timer.o\
-vector3.o\
+OBJS_GLOBAL=#sph_bessel.o\
+#sph_bessel_recursive-d1.o\
+#sph_bessel_recursive-d2.o\
diff --git a/ABACUS.develop/source/src_external/ORB_api/Makefile.system b/ABACUS.develop/source/src_external/ORB_api/Makefile.system
index e6fdeda197..ac93188d21 100644
--- a/ABACUS.develop/source/src_external/ORB_api/Makefile.system
+++ b/ABACUS.develop/source/src_external/ORB_api/Makefile.system
@@ -10,5 +10,5 @@ INCLUDES = -I. -Icommands
 #==========================
 # OPTIMIZE OPTIONS
 #==========================
-OPTS     = ${INCLUDES} -Ofast -traceback -std=c++11 -simd -march=native -xHost -m64 -qopenmp -Werror -Wall -pedantic -g
+OPTS     = ${INCLUDES} -Ofast -std=c++11 -simd -march=native -m64 -Werror -Wall -pedantic -g
 OPTS_MPI = -cxx=${CPLUSPLUS}
diff --git a/ABACUS.develop/source/src_external/ORB_api/Makefile.vars b/ABACUS.develop/source/src_external/ORB_api/Makefile.vars
index 13f5dac39a..f0e5a56adc 100644
--- a/ABACUS.develop/source/src_external/ORB_api/Makefile.vars
+++ b/ABACUS.develop/source/src_external/ORB_api/Makefile.vars
@@ -1,10 +1,8 @@
-FORTRAN       = ifort
-
 CPLUSPLUS      = icpc
 #CPLUSPLUS     = /public/intel2017/bin/icpc
 
-CPLUSPLUS_MPI = mpiicpc
-#CPLUSPLUS_MPI = /public/intel2017/impi/2017.1.132/intel64/bin/mpiicpc
+#CPLUSPLUS_MPI = mpiicpc
+CPLUSPLUS_MPI = icpc 
 
 LAPACK_DIR    = $(MKLROOT)
 #LAPACK_DIR = /public/intel2017/compilers_and_libraries_2017.1.132/linux/mkl
diff --git a/ABACUS.develop/source/src_external/ORB_api/main.cpp b/ABACUS.develop/source/src_external/ORB_api/main.cpp
index f13db39ea1..4258a489ff 100644
--- a/ABACUS.develop/source/src_external/ORB_api/main.cpp
+++ b/ABACUS.develop/source/src_external/ORB_api/main.cpp
@@ -1,4 +1,4 @@
-#include "timer.h"
+//#include "timer.h"
 #include <ctime>
 
 void calculate();
@@ -14,7 +14,7 @@ int main(int argc, char **argv)
 
 void calculate()
 {
-
+/*
 	time_t time_start = std::time(NULL);
 
 //	timer::start();
@@ -39,6 +39,7 @@ void calculate()
 	cout << " Total  Time  : " << hour << " h "
 	            << mins << " mins "
 	            << secs << " secs "<< endl;
+*/
 
     return;
 }
diff --git a/ABACUS.develop/source/src_global/complexmatrix.h b/ABACUS.develop/source/src_global/complexmatrix.h
index 34fdfe2b47..3dd7a64994 100644
--- a/ABACUS.develop/source/src_global/complexmatrix.h
+++ b/ABACUS.develop/source/src_global/complexmatrix.h
@@ -9,7 +9,7 @@
 #include <complex>
 using namespace std;
 
-#include "src_global/matrix.h"
+#include "matrix.h"
 
 #ifdef _MCD_CHECK
 #include "src_parallel/mcd.h"
diff --git a/ABACUS.develop/source/src_global/lapack_connector.h b/ABACUS.develop/source/src_global/lapack_connector.h
index 3494a65c18..19f6476027 100644
--- a/ABACUS.develop/source/src_global/lapack_connector.h
+++ b/ABACUS.develop/source/src_global/lapack_connector.h
@@ -1,18 +1,3 @@
-// =============================================================================
-//                          C++ Header File
-// Project:         LapackConnector
-// File:            LapackConnector.hpp
-// Author:          sltk
-// Comment:         LapackConnector provide the connector to the fortran Lapack routine.
-// Warning:
-// Start time:      2007-03-08
-// Last modified:   2008-08-12 ywcui : add zhegvx
-// 					2008-08-13 mohan : find bug,test.
-// 					2008-09-03 mohan : Add zgesv
-// 					2009-03-08 mohan : add ilaenv
-//					2010-01-22 spshu : add dgesvd
-// =============================================================================
-
 #ifndef LAPACKCONNECTOR_HPP
 #define LAPACKCONNECTOR_HPP
 
@@ -23,7 +8,7 @@
 #include "matrix.h"
 #include "complexmatrix.h"
 #include "blas_connector.h"
-#include "src_global/global_function.h"
+#include "global_function.h"
 
 
 extern "C"
diff --git a/ABACUS.develop/source/src_global/matrix.h b/ABACUS.develop/source/src_global/matrix.h
index 2f4200ff74..e6609bc1c7 100644
--- a/ABACUS.develop/source/src_global/matrix.h
+++ b/ABACUS.develop/source/src_global/matrix.h
@@ -16,6 +16,7 @@ class matrix
 {
 	/* data */
 public:
+
 	int nr=0;
 	int nc=0;   /* Number of rows and columns */
 	double *c=nullptr;    /* Holds the data */
diff --git a/ABACUS.develop/source/src_global/matrix3.h b/ABACUS.develop/source/src_global/matrix3.h
index a22afc9144..a126bbdeb5 100644
--- a/ABACUS.develop/source/src_global/matrix3.h
+++ b/ABACUS.develop/source/src_global/matrix3.h
@@ -9,8 +9,8 @@
 #include "../src_parallel/mcd.h"
 #endif
 
-#include "src_global/vector3.h"
-#include "src_global/matrix.h"
+#include "vector3.h"
+#include "matrix.h"
 
 class Matrix3
 {
diff --git a/ABACUS.develop/source/src_global/sph_bessel_recursive-d1.cpp b/ABACUS.develop/source/src_global/sph_bessel_recursive-d1.cpp
index 231a18e4b8..d48872d9b0 100644
--- a/ABACUS.develop/source/src_global/sph_bessel_recursive-d1.cpp
+++ b/ABACUS.develop/source/src_global/sph_bessel_recursive-d1.cpp
@@ -4,8 +4,7 @@
 //==========================================================
 
 #include "sph_bessel_recursive.h"
-
-#include "src_global/constants.h"
+#include "constants.h"
 
 #include<cmath>
 #include<stdexcept>
diff --git a/ABACUS.develop/source/src_global/sph_bessel_recursive-d2.cpp b/ABACUS.develop/source/src_global/sph_bessel_recursive-d2.cpp
index 945f691f69..80193179dc 100644
--- a/ABACUS.develop/source/src_global/sph_bessel_recursive-d2.cpp
+++ b/ABACUS.develop/source/src_global/sph_bessel_recursive-d2.cpp
@@ -4,8 +4,7 @@
 //==========================================================
 
 #include "sph_bessel_recursive.h"
-
-#include "src_global/constants.h"
+#include "constants.h"
 
 #include<cmath>
 #include<stdexcept>
diff --git a/ABACUS.develop/source/src_pw/global.h b/ABACUS.develop/source/src_pw/global.h
index c9039d6d78..b417e75361 100644
--- a/ABACUS.develop/source/src_pw/global.h
+++ b/ABACUS.develop/source/src_pw/global.h
@@ -7,8 +7,8 @@
 #define GLOBAL_H
 
 #include "../run_pw.h"
-#include "src_global/global_variable.h"
-#include "src_global/global_function.h"
+#include "../src_global/global_variable.h"
+#include "../src_global/global_function.h"
 #include "pw_basis.h"
 #include "energy.h"
 #include "pseudopot_cell_vnl.h"

From ea3584c0d354d4426c87e86ea9d798b897fbb448 Mon Sep 17 00:00:00 2001
From: linpz <linpz@mail.ustc.edu.cn>
Date: Thu, 1 Apr 2021 10:31:21 +0800
Subject: [PATCH 10/60] 1. add OpenMP in PW_Basis::setup_structure_factor

---
 ABACUS.develop/source/src_global/vector3.h | 22 ++++-----
 ABACUS.develop/source/src_pw/pw_basis.cpp  | 55 ++++++----------------
 2 files changed, 25 insertions(+), 52 deletions(-)

diff --git a/ABACUS.develop/source/src_global/vector3.h b/ABACUS.develop/source/src_global/vector3.h
index a80c3d03b7..fbdbe11ee2 100644
--- a/ABACUS.develop/source/src_global/vector3.h
+++ b/ABACUS.develop/source/src_global/vector3.h
@@ -42,20 +42,20 @@ class Vector3
 	void print(void)const ;		// mohan add 2009-11-29
 };
 
-template <class T> Vector3<T> operator+( const Vector3<T> &u, const Vector3<T> &v ) { return Vector3<T>( u.x+v.x, u.y+v.y, u.z+v.z ); }
-template <class T> Vector3<T> operator-( const Vector3<T> &u, const Vector3<T> &v ) { return Vector3<T>( u.x-v.x, u.y-v.y, u.z-v.z ); }
+template <class T> inline Vector3<T> operator+( const Vector3<T> &u, const Vector3<T> &v ) { return Vector3<T>( u.x+v.x, u.y+v.y, u.z+v.z ); }
+template <class T> inline Vector3<T> operator-( const Vector3<T> &u, const Vector3<T> &v ) { return Vector3<T>( u.x-v.x, u.y-v.y, u.z-v.z ); }
 //u.v=(ux*vx)+(uy*vy)+(uz*vz)                                                     
-template <class T> T          operator*( const Vector3<T> &u, const Vector3<T> &v ) { return ( u.x*v.x + u.y*v.y + u.z*v.z ); }
-template <class T> Vector3<T> operator*( const T &s,          const Vector3<T> &u ) { return Vector3<T>( u.x*s, u.y*s, u.z*s ); }
-template <class T> Vector3<T> operator*( const Vector3<T> &u, const T &s          ) { return Vector3<T>( u.x*s, u.y*s, u.z*s ); } // mohan add 2009-5-10
-template <class T> Vector3<T> operator/( const Vector3<T> &u, const T &s          ) { return Vector3<T>( u.x/s, u.y/s, u.z/s ); }
+template <class T> inline T          operator*( const Vector3<T> &u, const Vector3<T> &v ) { return ( u.x*v.x + u.y*v.y + u.z*v.z ); }
+template <class T> inline Vector3<T> operator*( const T &s,          const Vector3<T> &u ) { return Vector3<T>( u.x*s, u.y*s, u.z*s ); }
+template <class T> inline Vector3<T> operator*( const Vector3<T> &u, const T &s          ) { return Vector3<T>( u.x*s, u.y*s, u.z*s ); } // mohan add 2009-5-10
+template <class T> inline Vector3<T> operator/( const Vector3<T> &u, const T &s          ) { return Vector3<T>( u.x/s, u.y/s, u.z/s ); }
 //u.v=(ux*vx)+(uy*vy)+(uz*vz)
-template <class T> T          dot      ( const Vector3<T> &u, const Vector3<T> &v ) { return ( u.x*v.x + u.y*v.y + u.z*v.z ); }
+template <class T> inline T          dot      ( const Vector3<T> &u, const Vector3<T> &v ) { return ( u.x*v.x + u.y*v.y + u.z*v.z ); }
 // | i  j  k  |
 // | ux uy uz |
 // | vx vy vz |
 // u.v=(uy*vz-uz*vy)i+(-ux*vz+uz*vx)j+(ux*vy-uy*vx)k
-template <class T> Vector3<T> operator^(const Vector3<T> &u,const Vector3<T> &v)
+template <class T> inline Vector3<T> operator^(const Vector3<T> &u,const Vector3<T> &v)
 {	
 	return Vector3<T> ( u.y * v.z - u.z * v.y,
 	                   -u.x * v.z + u.z * v.x,
@@ -65,7 +65,7 @@ template <class T> Vector3<T> operator^(const Vector3<T> &u,const Vector3<T> &v)
 // | ux uy uz |
 // | vx vy vz |
 // u.v=(uy*vz-uz*vy)i+(-ux*vz+uz*vx)j+(ux*vy-uy*vzx)k
-template <class T> Vector3<T> cross(const Vector3<T> &u,const Vector3<T> &v)
+template <class T> inline Vector3<T> cross(const Vector3<T> &u,const Vector3<T> &v)
 {
 	return Vector3<T> ( u.y * v.z - u.z * v.y,
 	                   -u.x * v.z + u.z * v.x,
@@ -80,9 +80,9 @@ template <class T> Vector3<T> cross(const Vector3<T> &u,const Vector3<T> &v)
 //}
 
 //whether m1 != m2
-template <class T> bool operator !=(const Vector3<T> &u, const Vector3<T> &v){ return !(u == v); }
+template <class T> inline bool operator !=(const Vector3<T> &u, const Vector3<T> &v){ return !(u == v); }
 //whether u == v
-template <class T> bool operator ==(const Vector3<T> &u, const Vector3<T> &v)
+template <class T> inline bool operator ==(const Vector3<T> &u, const Vector3<T> &v)
 {
 	if(u.x == v.x && u.y == v.y && u.z == v.z)
 		return true;
diff --git a/ABACUS.develop/source/src_pw/pw_basis.cpp b/ABACUS.develop/source/src_pw/pw_basis.cpp
index 900ee567fc..2af04bfc65 100644
--- a/ABACUS.develop/source/src_pw/pw_basis.cpp
+++ b/ABACUS.develop/source/src_pw/pw_basis.cpp
@@ -6,6 +6,7 @@
 #include "tools.h"
 #include "pw_basis.h"
 #include "../src_pw/pw_complement.h"
+#include <omp.h>
 
 PW_Basis::PW_Basis()
 {
@@ -729,63 +730,35 @@ void PW_Basis::get_nggm(const int ngmc_local)
 
 
 //  Calculate structure factor
-void PW_Basis::setup_structure_factor(void)
+void PW_Basis::setup_structure_factor(void)			// Peize Lin optimize and add OpenMP 2021.04.01
 {
     TITLE("PW_Basis","setup_structure_factor");
     timer::tick("PW_Basis","setup_struc_factor");
-    complex<double> ci_tpi = NEG_IMAG_UNIT * TWO_PI;
-    complex<double> x;
+    const complex<double> ci_tpi = NEG_IMAG_UNIT * TWO_PI;
 
-    this->strucFac.create( Ucell->ntype, this->ngmc);
-    this->strucFac.zero_out();
+    this->strucFac.create(Ucell->ntype, this->ngmc);
     Memory::record("PW_Basis","struc_fac", Ucell->ntype*this->ngmc,"complexmatrix");
 
-#define complex2cal_strufac
 //	string outstr;
 //	outstr = global_out_dir + "strucFac.dat"; 
 //	ofstream ofs( outstr.c_str() ) ;
 
-    for (int it=0; it< Ucell->ntype; it++)
+    for (int it=0; it<Ucell->ntype; it++)
     {
-        const Atom* atom = &Ucell->atoms[it];	
-        for (int ig=0; ig< this->ngmc; ig++)
+		const int na = Ucell->atoms[it].na;
+		const Vector3<double> * const tau = Ucell->atoms[it].tau;
+
+		#pragma omp parallel for schedule(static)
+        for (int ig=0; ig<this->ngmc; ig++)
         {
-#ifdef complex2cal_strufac
+			const Vector3<double> gcar_ig = gcar[ig];
             complex<double> sum_phase = ZERO;
-#else
-            double sum_cos = 0.0;
-            double sum_sin = 0.0;
-#endif
-            for (int ia=0; ia< atom->na; ia++)
+            for (int ia=0; ia<na; ia++)
             {
-                //----------------------------------------------------------
-                // EXPLAIN : Don't use Dot function until we can optimize
-                // it, use the following x*x + y*y + z*z instead!
-                //----------------------------------------------------------
                 // e^{-i G*tau}
-
-#ifdef complex2cal_strufac
-                sum_phase += exp( ci_tpi * (
-                                      gcar[ig].x * atom->tau[ia].x +
-                                      gcar[ig].y * atom->tau[ia].y +
-                                      gcar[ig].z * atom->tau[ia].z ) );
-#else
-                const double theta = TWO_PI * (
-                                         gcar[ig].x * atom->tau[ia].x +
-                                         gcar[ig].y * atom->tau[ia].y +
-                                         gcar[ig].z * atom->tau[ia].z );
-                sum_cos += cos( theta );
-                sum_sin += sin( theta );
-#endif
+                sum_phase += exp( ci_tpi * (gcar_ig * tau[ia]) );
             }
-#ifdef complex2cal_strufac
-            this->strucFac(it, ig) = sum_phase;
-#else
-            this->strucFac(it, ig) = complex<double>( sum_cos, -sum_sin );
-#endif
-
-//			double tmpx = strucFac(it, ig).real() ;
-//			double tmpy = strucFac(it, ig).imag() ;
+            this->strucFac(it,ig) = sum_phase;
         }
     }
 

From bc6f3ab2d4fd8b7807b8dc956300e2986b927b03 Mon Sep 17 00:00:00 2001
From: qianrui <Terry_Liu@pku.edu.cn>
Date: Thu, 1 Apr 2021 10:38:26 +0800
Subject: [PATCH 11/60] fix a bug in schmit_orth which results from commit
 e647aa9

---
 ABACUS.develop/source/src_pw/diago_cg.cpp | 40 +++++++++++------------
 ABACUS.develop/source/src_pw/diago_cg.h   |  4 ++-
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/ABACUS.develop/source/src_pw/diago_cg.cpp b/ABACUS.develop/source/src_pw/diago_cg.cpp
index ed6fb16dcc..6d69f022ec 100644
--- a/ABACUS.develop/source/src_pw/diago_cg.cpp
+++ b/ABACUS.develop/source/src_pw/diago_cg.cpp
@@ -444,7 +444,7 @@ void Diago_CG::schmit_orth
     //qianrui replace 2021-3-15
     char trans2='N';
     zgemv_(&trans2,&dim,&m,&NEG_ONE,psi.c,&dmx,lagrange,&inc,&ONE,psi_m,&inc);
-    psi_norm -= ddot_real(m,lagrange,lagrange);
+    psi_norm -= ddot_real(m,lagrange,lagrange,false);
     //======================================================================
     /*for (int j = 0; j < m; j++)
     {
@@ -486,23 +486,8 @@ double Diago_CG::ddot_real
 (
     const int &dim,
     const complex<double>* psi_L,
-    const complex<double>* psi_R
-)
-{
-    complex<double> result(0,0);
-    for (int i=0;i<dim;i++)
-    {
-        result += conj( psi_L[i] ) * psi_R[i];
-    }
-    Parallel_Reduce::reduce_complex_double_pool( result );
-    return result.real();
-}
-
-complex<double> Diago_CG::ddot
-(
-    const int & dim,
-    const complex<double> * psi_L,
-    const complex<double> * psi_R
+    const complex<double>* psi_R,
+    const bool reduce
 )
 {
     //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
@@ -513,7 +498,7 @@ complex<double> Diago_CG::ddot
     pL=(double *)psi_L;
     pR=(double *)psi_R;
     double result=LapackConnector::dot(dim2,pL,1,pR,1);
-    Parallel_Reduce::reduce_double_pool( result );
+    if(reduce)  Parallel_Reduce::reduce_double_pool( result );
     return result;
     //======================================================================
     /*complex<double> result(0,0);
@@ -524,8 +509,23 @@ complex<double> Diago_CG::ddot
     Parallel_Reduce::reduce_complex_double_pool( result );
     return result.real();*/
     //>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-}  // end of ddot
+}
 
+complex<double> Diago_CG::ddot
+(
+    const int & dim,
+    const complex<double> * psi_L,
+    const complex<double> * psi_R
+)
+{
+    complex<double> result(0, 0);
+    for (int i = 0; i < dim ; i++)
+    {
+        result += conj(psi_L[i]) *  psi_R[i] ;
+    }
+    Parallel_Reduce::reduce_complex_double_pool( result );
+    return result;
+}  // end of ddot
 
 // this return <psi(m)|psik>
 complex<double> Diago_CG::ddot
diff --git a/ABACUS.develop/source/src_pw/diago_cg.h b/ABACUS.develop/source/src_pw/diago_cg.h
index d389d90422..7065ade9de 100644
--- a/ABACUS.develop/source/src_pw/diago_cg.h
+++ b/ABACUS.develop/source/src_pw/diago_cg.h
@@ -15,13 +15,15 @@ class Diago_CG
     static double ddot_real(
         const int & dim,
         const complex<double>* psi_L,
-        const complex<double>* psi_R) ;
+        const complex<double>* psi_R,
+        const bool reduce = true) ;
 
     static complex<double> ddot(
         const int & dim,
         const complex<double>* psi_L,
         const complex<double>* psi_R ) ;
 
+
     static complex<double> ddot(
         const int & dim,
         const ComplexMatrix &psi,

From 36df797ef8f39711751e715c8a623b79b4654b09 Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Thu, 1 Apr 2021 16:49:00 +0800
Subject: [PATCH 12/60] update ORB files

---
 ABACUS.develop/source/run_lcao.cpp            |  2 +-
 ABACUS.develop/source/src_io/energy_dos.cpp   |  4 +--
 .../source/src_io/mulliken_charge.cpp         |  4 +--
 ABACUS.develop/source/src_lcao/LOOP_ions.cpp  |  2 +-
 .../source/src_lcao/ORB_control.cpp           | 30 +++++++++----------
 ABACUS.develop/source/src_lcao/ORB_control.h  | 12 ++------
 ABACUS.develop/source/src_lcao/ORB_read.cpp   |  8 ++---
 .../source/src_lcao/ORB_table_alpha.cpp       | 26 +++++++---------
 ABACUS.develop/source/src_lcao/run_md.cpp     |  2 +-
 9 files changed, 37 insertions(+), 53 deletions(-)

diff --git a/ABACUS.develop/source/run_lcao.cpp b/ABACUS.develop/source/run_lcao.cpp
index 8ac68eea46..f2fad9aba6 100644
--- a/ABACUS.develop/source/run_lcao.cpp
+++ b/ABACUS.develop/source/run_lcao.cpp
@@ -45,7 +45,7 @@ void Run_lcao::lcao_line(void)
 
     // * reading the localized orbitals/projectors 
 	// * construct the interpolation tables.
-	hm.orb_con.set_orb_tables();
+	hm.orb_con.set_orb_tables(UOT);
 
 	// * allocate H and S matrices according to computational resources
 	// * set the 'trace' between local H/S and global H/S
diff --git a/ABACUS.develop/source/src_io/energy_dos.cpp b/ABACUS.develop/source/src_io/energy_dos.cpp
index 4f54e904cf..d39b431f68 100644
--- a/ABACUS.develop/source/src_io/energy_dos.cpp
+++ b/ABACUS.develop/source/src_io/energy_dos.cpp
@@ -327,7 +327,7 @@ void energy::perform_dos(void)
 				atom_arrange::search( SEARCH_RADIUS );//qifeng-2019-01-21
 
 				// mohan update 2021-02-10
-				hm.orb_con.set_orb_tables();
+				hm.orb_con.set_orb_tables(UOT);
 				LM.allocate_HS_R(LNNR.nnr);
 				LM.zeros_HSR('S', LNNR.nnr);
 				UHM.genH.calculate_S_no();
@@ -409,7 +409,7 @@ void energy::perform_dos(void)
 				atom_arrange::delete_vector( SEARCH_RADIUS );
 #endif
 				// mohan update 2021-02-10
-				hm.orb_con.clear_after_ions();
+				hm.orb_con.clear_after_ions(UOT);
 			}//else
 
 		 MPI_Reduce(pdosk[is].c, pdos[is].c , NUM , MPI_DOUBLE , MPI_SUM, 0, MPI_COMM_WORLD);
diff --git a/ABACUS.develop/source/src_io/mulliken_charge.cpp b/ABACUS.develop/source/src_io/mulliken_charge.cpp
index a2b3fd15bf..68046f9c61 100644
--- a/ABACUS.develop/source/src_io/mulliken_charge.cpp
+++ b/ABACUS.develop/source/src_io/mulliken_charge.cpp
@@ -166,7 +166,7 @@ void Mulliken_Charge::cal_mulliken(void)
 			mud[0].create(ParaO.ncol,ParaO.nrow);
 			atom_arrange::set_sr_NL();
 			atom_arrange::search( SEARCH_RADIUS );//qifeng-2019-01-21
-			hm.orb_con.set_orb_tables();
+			hm.orb_con.set_orb_tables(UOT);
 			LM.allocate_HS_R(LNNR.nnr);
 			LM.zeros_HSR('S', LNNR.nnr);
 			UHM.genH.calculate_S_no();
@@ -224,7 +224,7 @@ void Mulliken_Charge::cal_mulliken(void)
 #ifdef __MPI
 			atom_arrange::delete_vector( SEARCH_RADIUS );
 #endif
-			hm.orb_con.clear_after_ions();
+			hm.orb_con.clear_after_ions(UOT);
 
 		}//else                     
 		MPI_Reduce(MecMulP[is], DecMulP[is] , NLOCAL , MPI_DOUBLE , MPI_SUM, 0, MPI_COMM_WORLD);
diff --git a/ABACUS.develop/source/src_lcao/LOOP_ions.cpp b/ABACUS.develop/source/src_lcao/LOOP_ions.cpp
index 42e865710d..1373db65a0 100644
--- a/ABACUS.develop/source/src_lcao/LOOP_ions.cpp
+++ b/ABACUS.develop/source/src_lcao/LOOP_ions.cpp
@@ -225,7 +225,7 @@ void LOOP_ions::opt_ions(void)
     }
 
 	// mohan update 2021-02-10
-    hm.orb_con.clear_after_ions();
+    hm.orb_con.clear_after_ions(UOT);
 
     timer::tick("LOOP_ions","opt_ions",'B'); 
     return;
diff --git a/ABACUS.develop/source/src_lcao/ORB_control.cpp b/ABACUS.develop/source/src_lcao/ORB_control.cpp
index 302e93083f..b47bc21cec 100644
--- a/ABACUS.develop/source/src_lcao/ORB_control.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_control.cpp
@@ -4,19 +4,15 @@
 #include "src_global/sltk_atom_arrange.h"
 #include "ORB_gen_tables.h"
 #include "build_st_pw.h"
+#include "../src_pdiag/pdiag_double.h"
 
 ORB_control::ORB_control()
 {}
 
 ORB_control::~ORB_control()
-{
-	if(test_deconstructor)
-	{
-		cout << " ~ORB_control()" << endl;
-	}
-}
+{}
 
-void ORB_control::set_orb_tables(void)
+void ORB_control::set_orb_tables(ORB_gen_tables &OGT)
 {
     TITLE("ORB_control","set_orb_tables");
 	timer::tick("ORB_control","set_orb_tables",'B');
@@ -34,7 +30,7 @@ void ORB_control::set_orb_tables(void)
 	}
 
     //=============================================================================
-    // (2) FUNCTION : Generate Gaunt_Coefficients and S-table using UOT.init
+    // (2) FUNCTION : Generate Gaunt_Coefficients and S-table using OGT.init
     // 	   Must have 'Numerical Orbital' infomation
     // (2) RESULT : we have tabulated S table for use.
     //=============================================================================
@@ -43,23 +39,25 @@ void ORB_control::set_orb_tables(void)
     // 1: generate overlap table
     // 2: generate kinetic table
     // 3: generate overlap & kinetic table
-    UOT.gen_tables(job0);
+    OGT.gen_tables(job0);
     // init lat0, in order to interpolated value from this table.
-    UOT.set_unit(ucell.lat0);
+    OGT.set_unit(ucell.lat0);
 
 
 	timer::tick("ORB_control","set_orb_tables",'B');
     return;
 }
 
-void ORB_control::clear_after_ions(void)
+void ORB_control::clear_after_ions(ORB_gen_tables &OGT)
 {
     TITLE("ORB_control","clear_after_ions");
-    UOT.MOT.Destroy_Table();
-    UOT.tbeta.Destroy_Table_Beta();
-    //caoyu add 2021-03-18
-    if (INPUT.out_descriptor && BASIS_TYPE == "lcao") {
-        UOT.talpha.Destroy_Table_Alpha();
+    OGT.MOT.Destroy_Table();
+    OGT.tbeta.Destroy_Table_Beta();
+    
+	//caoyu add 2021-03-18
+    if (INPUT.out_descriptor && BASIS_TYPE == "lcao") 
+	{
+        OGT.talpha.Destroy_Table_Alpha();
     }
     return;
 }
diff --git a/ABACUS.develop/source/src_lcao/ORB_control.h b/ABACUS.develop/source/src_lcao/ORB_control.h
index c69397bc1d..bd34d797ed 100644
--- a/ABACUS.develop/source/src_lcao/ORB_control.h
+++ b/ABACUS.develop/source/src_lcao/ORB_control.h
@@ -1,14 +1,7 @@
-//==========================================================
-// AUTHOR : mohan, ywcui
-// Last Update: 2021-02-10
-//==========================================================
 #ifndef ORB_CONTROL_H 
 #define ORB_CONTROL_H 
 
-#include "../src_pw/tools.h"
-
 #include "ORB_gen_tables.h"
-#include "../src_pdiag/pdiag_double.h"
 
 class ORB_control 
 {
@@ -19,8 +12,9 @@ class ORB_control
     ~ORB_control();
 
     // Generate the S(overlap),T,NL matrix.
-    void set_orb_tables();
-    void clear_after_ions();
+    void set_orb_tables(ORB_gen_tables &OGT);
+
+    void clear_after_ions(ORB_gen_tables &OGT);
 
 };
 #endif
diff --git a/ABACUS.develop/source/src_lcao/ORB_read.cpp b/ABACUS.develop/source/src_lcao/ORB_read.cpp
index bd85939faa..3e87061e70 100644
--- a/ABACUS.develop/source/src_lcao/ORB_read.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_read.cpp
@@ -272,8 +272,6 @@ void LCAO_Orbitals::Set_NonLocal(const int &it, int &n_projectors)
 // PLEASE avoid using capital letters for local variables
 // mohan note 2021-03-23 
 	const int N_PROJECTORS = atom->nh;//zhengdy-soc
-//cout << " number of projectros " << N_PROJECTORS << endl;
-//	cout << " number of projectros " << n_projectors << endl;
 
 	// set the nonlocal projector objects
 	Numerical_Nonlocal_Lm* tmpBeta_lm = new Numerical_Nonlocal_Lm[n_projectors];
@@ -289,11 +287,11 @@ void LCAO_Orbitals::Set_NonLocal(const int &it, int &n_projectors)
 		{
 			const int lnow = atom->lll[p1];
 
-		// this will be wrong if dion is non-diagoal
-		//Coefficient_D_in(lnow,lnow)=atom->dion(p1,p1);//LiuXh 2016-01-14
+			// this will be wrong if dion is non-diagonal
+			//Coefficient_D_in(lnow,lnow)=atom->dion(p1,p1);//LiuXh 2016-01-14
 			Coefficient_D_in(p1,p1)=atom->dion(p1,p1);//LiuXh 2016-01-14
 
-		// only keep the nonzero part.
+			// only keep the nonzero part.
 			int cut_mesh = atom->mesh; 
 			for(int ir=atom->mesh-1; ir>=0; --ir)
 			{
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_alpha.cpp b/ABACUS.develop/source/src_lcao/ORB_table_alpha.cpp
index d0f639d28d..3ab48ec50a 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_alpha.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_table_alpha.cpp
@@ -1,11 +1,7 @@
 //caoyu add 2021-03-17
-
 #include "ORB_table_alpha.h"
 #include "ORB_read.h"
-#include "../src_pw/global.h"
 #include <stdexcept>
-#include "../src_ri/exx_abfs.h"
-#include "../src_io/winput.h"
 
 double ORB_table_alpha::dr = -1.0;
 
@@ -415,7 +411,7 @@ void ORB_table_alpha::print_Table_DSR(void)
 	ofstream ofs;
 	stringstream ss;
 	// the parameter 'winput::spillage_outdir' is read from INPUTw.
-	ss << winput::spillage_outdir << "/" << "S_I_mu_alpha.dat";
+	ss << "./S_I_mu_alpha.dat";
 	if (MY_RANK == 0)
 	{
 		ofs.open(ss.str().c_str());
@@ -434,7 +430,7 @@ void ORB_table_alpha::print_Table_DSR(void)
 					for (int N2 = 0; N2 < ORB.Alpha[0].getNchi(L2); N2++)
 					{
 						const int Opair = this->DS_Opair(T1, L1, L2, N1, N2);	//Opair
-						ofs <<setw(20)<< "atom_type: " << ucell.atoms[T1].label << endl;
+						//ofs <<setw(20)<< "atom_type: " << ucell.atoms[T1].label << endl;
 						ofs <<setw(20)<< "lcao basis: " << "L1=" << L1 << ", N1=" << N1 << endl;
 						ofs <<setw(20)<< "descriptor basis: " << "L2=" << L2 << ", N2=" << N2 << endl;
 						for (int il = 0; il < this-> DS_2Lplus1[T1]; il++)
@@ -456,13 +452,11 @@ void ORB_table_alpha::print_Table_DSR(void)
 								if ( (ir+1) % 8 == 0) ofs << endl;
 							}
 							ofs << endl <<endl;
-						}
-					}
-				}
-			}
-		}
-
-
-	}
-
-}
\ No newline at end of file
+						}// il
+					}// N2
+				}// L2
+			}// N1
+		}// L1
+	}// T1
+	return;
+}
diff --git a/ABACUS.develop/source/src_lcao/run_md.cpp b/ABACUS.develop/source/src_lcao/run_md.cpp
index e85f99ac2a..b8767fdd9c 100644
--- a/ABACUS.develop/source/src_lcao/run_md.cpp
+++ b/ABACUS.develop/source/src_lcao/run_md.cpp
@@ -236,7 +236,7 @@ void Run_MD::opt_ions(void)
     }
 
 	// mohan update 2021-02-10
-    hm.orb_con.clear_after_ions();
+    hm.orb_con.clear_after_ions(UOT);
 
     timer::tick("Run_MD","opt_ions",'B'); 
     return;

From 6964564d676cf5dd3ff5aa31664493839b4c3b76 Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Thu, 1 Apr 2021 21:31:34 +0800
Subject: [PATCH 13/60] delete usage of tools.h in ORB_gaunt_table

---
 .../source/src_global/global_function.h       |   5 -
 ABACUS.develop/source/src_global/mathzone.h   |   3 +
 .../source/src_lcao/ORB_gaunt_table.cpp       | 133 +++++++-----------
 .../source/src_lcao/ORB_gaunt_table.h         |  57 ++++----
 4 files changed, 89 insertions(+), 109 deletions(-)

diff --git a/ABACUS.develop/source/src_global/global_function.h b/ABACUS.develop/source/src_global/global_function.h
index fc9835478b..bcebfff51b 100644
--- a/ABACUS.develop/source/src_global/global_function.h
+++ b/ABACUS.develop/source/src_global/global_function.h
@@ -1,8 +1,3 @@
-//==========================================================
-// AUTHOR : mohan
-// LAST UPDATE : 2009-02-26
-// Add : READ_VALUE; SCAN_BEGIN; SCAN_END; 2009-02-26
-//==========================================================
 #ifndef GLOBAL_FUNCTION_H
 #define GLOBAL_FUNCTION_H
 
diff --git a/ABACUS.develop/source/src_global/mathzone.h b/ABACUS.develop/source/src_global/mathzone.h
index e6b58265b9..0e146364df 100644
--- a/ABACUS.develop/source/src_global/mathzone.h
+++ b/ABACUS.develop/source/src_global/mathzone.h
@@ -2,9 +2,12 @@
 #define MATHZONE_H
 
 #include "realarray.h"
+#include "vector3.h"
+#include "matrix3.h"
 #include <vector>
 #include <map>
 #include <cassert>
+#include <complex>
 using namespace std;
 
 class Mathzone
diff --git a/ABACUS.develop/source/src_lcao/ORB_gaunt_table.cpp b/ABACUS.develop/source/src_lcao/ORB_gaunt_table.cpp
index b36b373ff0..18c0449dec 100644
--- a/ABACUS.develop/source/src_lcao/ORB_gaunt_table.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_gaunt_table.cpp
@@ -1,4 +1,13 @@
 #include "ORB_gaunt_table.h"
+#include <math.h>
+#include <cassert>
+#include "../src_global/timer.h"
+#include "../src_global/memory.h"
+#include "../src_global/mathzone.h"
+#include "../src_global/global_function.h"
+//#include "../src_global/matrix3.h"
+#include "../src_global/vector3.h"
+#include "../src_global/constants.h"
 
 ORB_gaunt_table::ORB_gaunt_table(){}
 ORB_gaunt_table::~ORB_gaunt_table(){}
@@ -37,64 +46,30 @@ void ORB_gaunt_table::init_Gaunt(const int &lmax)
 					
 							Gaunt_Coefficients(dim1, dim2, dim) = 
 								this->Get_Gaunt_SH (L1, m1, L2, m2, L, m);	
-
-							/*
-							if (dim1 == 1 && dim2 == 1 && dim == 4)
-							{
-								cout << "\nGaunt_Coef = " << Gaunt_Coefficients (dim1, dim2, dim) << endl;
-								cout << "\nGet_Gaunt_SH = " << Get_Gaunt_SH (L1, m1, L2, m2, L, m) << endl;
-							}
-							*/
-
-						//	Gaunt_Coefficients(dim1, dim2, dim) = 
-						//		this->Cal_Gaunt_single(L1, m1, L2, m2, L, m, 0.0, PI, 0.0, TWO_PI);
-						
-							//test
-//							double G_revers = this->Cal_Gaunt_single(L2, m2, L1, m1, L, m, 0.0, pi, 0.0, tpi);
-
-//							cout <<  Gaunt_Coefficients(dim1, dim2, dim3) << setw(20) << G_revers << endl;
-							
-							/*
-							//test
-							int M1, M2, M;
-							if(m1 % 2 == 0) M1 = - m1 / 2;
-							else	M1 = (m1+1) / 2;
-							if(m2 % 2 == 0) M2 = - m2 / 2;
-							else	M2 = (m2+1) / 2;
-							if(m % 2 == 0) M = - m / 2;
-							else	M = (m+1) / 2;
-							*/
-							
-							/*
-							double G1 = Get_Gaunt_SH(L1, m1, L2, m2, L, m);
-
-							if( fabs(Gaunt_Coefficients(dim1, dim2, dim) - G1) > 1e-5)
-							{
-								cout << "\nl1 = " << L1 << " m1 = " << m1 << " l2 = " << L2 << " m2 = " << m2 
-												<< " L = " << L << " M = " << m << endl;
-							
-								cout << Gaunt_Coefficients(dim1, dim2, dim) << setw(20) << G1 << setw(20) 
-											<< Gaunt_Coefficients(dim1, dim2, dim) - G1 << endl;
-							}
-							*/
-						}
-                    }
-                }
-            }
-        }
-    }
+						}// m2
+                    }// L2
+                }// m1
+            }// L1
+        }// m
+    }// L
 
     timer::tick("ORB_gaunt_table", "init_Gaunt",'D');
     return;
 }
 
+
 double ORB_gaunt_table::Cal_Gaunt_single
 (
-    const int &L1, const int &m1,
-    const int &L2, const int &m2,
-    const int &L, const int &m,
-    const double &s1, const double &e1,
-    const double &s2, const double &e2
+    const int &L1, 
+	const int &m1,
+    const int &L2, 
+	const int &m2,
+    const int &L, 
+	const int &m,
+    const double &s1, 
+	const double &e1,
+    const double &s2, 
+	const double &e2
 )
 {
 	timer::tick("ORB_gaunt_table", "Cal_Gaunt_single");
@@ -121,7 +96,6 @@ double ORB_gaunt_table::Cal_Gaunt_single
 		for (int j = 0;j < 16;j++)
 		{
 			double theta = ((s1 + e1) + (e1 - s1) * absc[i]) / 2;
-			//double phi = ((s2 + e2) + (e2 - s2) * absc[j]) / 2;
 
 			result += weight[i] * weight[j] * sin(theta) *
 			          this->Ylm_Gaunt( this->get_lm_index(L1, m1), 16 * i + j) *
@@ -135,11 +109,14 @@ double ORB_gaunt_table::Cal_Gaunt_single
 	return result;
 }
 
+
 void ORB_gaunt_table::init_Ylm_Gaunt
 (
  	const int &lmax,
-    const double &s1, const double &e1,
-    const double &s2, const double &e2
+    const double &s1, 
+	const double &e1,
+    const double &s2, 
+	const double &e2
 )
 {
 	TITLE("ORB_gaunt_table", "init_Ylm_Gaunt");
@@ -153,12 +130,6 @@ void ORB_gaunt_table::init_Ylm_Gaunt
 		0.09501250983763744, 0.2816035507792589, 0.4580167776572274, 0.6178762444026438, 
 		0.755404408355003, 0.8656312023878318, 0.9445750230732326, 0.9894009349916499 };
 
-	//static double weight[16] = {
-	//	0.02715245941175406, 0.06225352393864778, 0.0951585116824929, 0.1246289712555339, 
-	//	0.1495959888165768, 0.1691565193950026, 0.1826034150449236, 0.1894506104550685, 
-	//	0.1894506104550685, 0.1826034150449236, 0.1691565193950026, 0.1495959888165768, 
-	//	0.1246289712555339, 0.0951585116824929, 0.06225352393864778, 0.02715245941175406 };
-
 	//initialization of ylm_map
 
 	Vector3<double> g_gaunt[256];
@@ -182,21 +153,25 @@ void ORB_gaunt_table::init_Ylm_Gaunt
 	return;
 }
 
-int ORB_gaunt_table::get_lm_index(const int l, const int m)
+
+int ORB_gaunt_table::get_lm_index(
+	const int l, 
+	const int m)
 {
 	return l*l+m;
 }
 
+
 /**********************
 //Rasch and Yu's Method
 ***********************/
-
 //total pointers
 int ORB_gaunt_table::P_EL(const int& L)
 {
 	return (L+1) * (L+2) * (L+3) * (L+4) / 24;
 }
 
+
 //effective pointers
 int ORB_gaunt_table::EP_EL(const int& L)
 {
@@ -204,6 +179,7 @@ int ORB_gaunt_table::EP_EL(const int& L)
 	else return (L+1) * (L+3) * (L+5) * (3*L+5) / 192;
 }
 
+
 int ORB_gaunt_table::index_func
 (
  	const int& l1,
@@ -219,6 +195,7 @@ int ORB_gaunt_table::index_func
 	return aux1 + aux2 + aux3 + m3;
 }
 
+
 void ORB_gaunt_table::init_Gaunt_CH(const int& Lmax)
 {
 	TITLE("ORB_gaunt_table","init_Gaunt_CH");
@@ -272,15 +249,16 @@ void ORB_gaunt_table::init_Gaunt_CH(const int& Lmax)
 					}
 
 					ic1++;
-				}
-			}
-		}
-	}
+				}// m3
+			}// l3
+		}// l2
+	} // l1
 
 	timer::tick("ORB_gaunt_table","init_Gaunt_CH",'D');
 	return;
 }
 
+
 //using wigner 3j expression
 double ORB_gaunt_table::Calc_Gaunt_CH
 (
@@ -324,6 +302,7 @@ double ORB_gaunt_table::Calc_Gaunt_CH
 	timer::tick("ORB_gaunt_table","Calc_Gaunt_CH");
 }
 	
+
 double ORB_gaunt_table::Get_Gaunt_CH
 (
  	const int& l1,
@@ -382,6 +361,7 @@ double ORB_gaunt_table::Get_Gaunt_CH
 	catch( out_of_range ){ return 0; }
 }
 	
+
 //Input value
 //m1, m2, m3 are restricted within 0 to 2l+1
 //and should be transformed first
@@ -466,19 +446,6 @@ double ORB_gaunt_table::Get_Gaunt_SH
 	timer::tick("ORB_gaunt_table","Get_Gaunt_SH");
 }
 
-/*	// Peize Lin delete 2016-08-26
-void ORB_gaunt_table::ZEROS()
-{
-	for(int ir = 0; ir < 5000; ir++)
-	{
-		for(int ic = 0; ic < 30; ic++)
-		{
-			Gaunt_CH[ir][ic] = 0.0;
-		}
-	}
-	return;
-}
-*/
 
 double ORB_gaunt_table::Fact(const int& n)
 {
@@ -490,7 +457,12 @@ double ORB_gaunt_table::Fact(const int& n)
 	return val;
 }
 
-void ORB_gaunt_table::Swap(int& l1, int& m1, int& l2, int & m2)
+
+void ORB_gaunt_table::Swap(
+	int& l1, 
+	int& m1, 
+	int& l2, 
+	int & m2)
 {
 	int tmp1, tmp2;
 	if(l1 >= l2) return;
@@ -508,6 +480,7 @@ void ORB_gaunt_table::Swap(int& l1, int& m1, int& l2, int & m2)
 	return;
 }
 
+
 int ORB_gaunt_table::Index_M(const int& m)
 {
 	if(m % 2 == 0) return (- m / 2);
diff --git a/ABACUS.develop/source/src_lcao/ORB_gaunt_table.h b/ABACUS.develop/source/src_lcao/ORB_gaunt_table.h
index aa09f9c51e..0f0b265eb3 100644
--- a/ABACUS.develop/source/src_lcao/ORB_gaunt_table.h
+++ b/ABACUS.develop/source/src_lcao/ORB_gaunt_table.h
@@ -1,12 +1,9 @@
-//=========================================================
-//AUTHOR : Mohan 
-//DATE : 2009-04-23
-//=========================================================
 #ifndef ORB_GAUNT_TABLE_H
 #define ORB_GAUNT_TABLE_H
 
-#include "../src_pw/tools.h"
 #include <map>
+#include "../src_global/realarray.h"
+#include "../src_global/matrix.h"
 
 class ORB_gaunt_table
 {
@@ -59,52 +56,64 @@ class ORB_gaunt_table
 	//============================================================
 	// (1) Make Ylm_Gaunt Table.
 	//============================================================
-	void init_Ylm_Gaunt(const int &lmax, const double &s1,const double &e1,
-		const double &s2,const double &e2);
+	void init_Ylm_Gaunt(
+		const int &lmax, 
+		const double &s1,
+		const double &e1,
+		const double &s2,
+		const double &e2);
 
 	//============================================================
 	// (2) Use Ylm_Gaunt to calculate Gaunt Coefficinets element
 	//============================================================
 	double Cal_Gaunt_single(
-	   	const int &l1, const int &m1, 
-	   	const int &l2, const int &m2, 
-	   	const int &l, const int &m, 
-	   	const double &s1, const double &e1,    
-	   	const double &s2, const double &e2);
+	   	const int &l1, 
+		const int &m1, 
+	   	const int &l2, 
+		const int &m2, 
+	   	const int &l, 
+		const int &m, 
+	   	const double &s1, 
+		const double &e1,    
+	   	const double &s2, 
+		const double &e2);
 
 	//============================================================
 	// (3) Make the whole Gaunt Coefficients table
 	//============================================================
 	void init_Gaunt(const int &lmax);
 
-
 	//========================================================
 	// Small function
 	//========================================================
 	static int get_lm_index(const int l, const int m);
+
 	static int Index_M(const int& m);
 
 	private:
 	
-	//Index Function
-	//Yu's mehtod
+	// Index Function
+	// Yu's method
 	// Peize Lin delete void ZEROS(); 2016-08-26
 	
 	int P_EL(const int& L);
+
 	int EP_EL(const int& L);
+
 	int index_func(
-					const int& l1,
-					const int& l2,
-					const int& l3,
-					const int& m3	);
+			const int& l1,
+			const int& l2,
+			const int& l3,
+			const int& m3	);
 	
 	double Fact(const int& n);
+
 	void Swap(
-				int& l1,
-				int& m1,
-				int& l2,
-				int& m2	);
-	
+			int& l1,
+			int& m1,
+			int& l2,
+			int& m2	);
+
 	//2*Lmax+1
 	std::map<int,std::map<int,double>> Gaunt_CH;		// Peize Lin update 2016-08-26
 	

From 19bcaef80e73a19b2cd7d91d2626701c3b0b5c5d Mon Sep 17 00:00:00 2001
From: linpz <linpz@mail.ustc.edu.cn>
Date: Fri, 2 Apr 2021 16:47:11 +0800
Subject: [PATCH 14/60] 1. delete unnecessary header files included in class
 Sph_Bessel_Recursive

---
 .../src_global/sph_bessel_recursive-d1.cpp      |  8 ++------
 .../src_global/sph_bessel_recursive-d2.cpp      | 13 +++----------
 .../source/src_global/sph_bessel_recursive.h    | 17 ++++++++---------
 3 files changed, 13 insertions(+), 25 deletions(-)

diff --git a/ABACUS.develop/source/src_global/sph_bessel_recursive-d1.cpp b/ABACUS.develop/source/src_global/sph_bessel_recursive-d1.cpp
index d48872d9b0..e65ce87ccc 100644
--- a/ABACUS.develop/source/src_global/sph_bessel_recursive-d1.cpp
+++ b/ABACUS.develop/source/src_global/sph_bessel_recursive-d1.cpp
@@ -9,12 +9,8 @@
 #include<cmath>
 #include<stdexcept>
 
-// Peize Lin test
-#include<iostream>
-#include<sys/time.h>
-using namespace std;
 
-vector<Sph_Bessel_Recursive::D1> Sph_Bessel_Recursive_Pool::D1::sb_pool;
+std::vector<Sph_Bessel_Recursive::D1> Sph_Bessel_Recursive_Pool::D1::sb_pool;
 
 void Sph_Bessel_Recursive::D1::set_dx( const double dx_in )
 {
@@ -27,7 +23,7 @@ void Sph_Bessel_Recursive::D1::set_dx( const double dx_in )
 	}
 }
 
-const vector<vector<double>> & Sph_Bessel_Recursive::D1::cal_jlx( const int lmax, const size_t ix_size )
+const std::vector<std::vector<double>> & Sph_Bessel_Recursive::D1::cal_jlx( const int lmax, const size_t ix_size )
 {
 	if(lmax<0)
 		throw std::invalid_argument("Sph_Bessel_Recursive::jlx l<0");
diff --git a/ABACUS.develop/source/src_global/sph_bessel_recursive-d2.cpp b/ABACUS.develop/source/src_global/sph_bessel_recursive-d2.cpp
index 80193179dc..f012cd25e4 100644
--- a/ABACUS.develop/source/src_global/sph_bessel_recursive-d2.cpp
+++ b/ABACUS.develop/source/src_global/sph_bessel_recursive-d2.cpp
@@ -9,14 +9,7 @@
 #include<cmath>
 #include<stdexcept>
 
-// Peize Lin test
-#include<iostream>
-#include<sys/time.h>
-#include"src_external/src_test/src_ri/exx_lcao-test.h"
-#include"src_lcao/global_fp.h"
-using namespace std;
-
-vector<Sph_Bessel_Recursive::D2> Sph_Bessel_Recursive_Pool::D2::sb_pool;
+std::vector<Sph_Bessel_Recursive::D2> Sph_Bessel_Recursive_Pool::D2::sb_pool;
 
 void Sph_Bessel_Recursive::D2::set_dx( const double dx_in )
 {
@@ -29,7 +22,7 @@ void Sph_Bessel_Recursive::D2::set_dx( const double dx_in )
 	}
 }
 
-const vector<vector<vector<double>>> & Sph_Bessel_Recursive::D2::cal_jlx( const int lmax, const size_t ix1_size, const size_t ix2_size )
+const std::vector<std::vector<std::vector<double>>> & Sph_Bessel_Recursive::D2::cal_jlx( const int lmax, const size_t ix1_size, const size_t ix2_size )
 {
 	if(lmax<0)
 		throw std::invalid_argument("Sph_Bessel_Recursive::jlx l<0");
@@ -49,7 +42,7 @@ void Sph_Bessel_Recursive::D2::cal_jlx_0( const int l_size, const size_t ix1_siz
 		const double jlx0 = (0==l) ? 1.0 : 0.0;
 
 		if( jlx[l].size()<ix1_size )
-			jlx[l].resize(ix1_size,vector<double>(1,jlx0));
+			jlx[l].resize(ix1_size,std::vector<double>(1,jlx0));
 		
 		if( jlx[l][0].size()<ix2_size )
 			jlx[l][0].resize(ix2_size,jlx0);
diff --git a/ABACUS.develop/source/src_global/sph_bessel_recursive.h b/ABACUS.develop/source/src_global/sph_bessel_recursive.h
index d8ee591e6b..3eb51b10fc 100644
--- a/ABACUS.develop/source/src_global/sph_bessel_recursive.h
+++ b/ABACUS.develop/source/src_global/sph_bessel_recursive.h
@@ -7,7 +7,6 @@
 #define SPH_BESSEL_RECURSIVE_H
 
 #include<vector>
-using namespace std;
 
 class Sph_Bessel_Recursive
 {
@@ -21,14 +20,14 @@ class Sph_Bessel_Recursive
 class Sph_Bessel_Recursive::D1
 {
 public:	
-	const vector<vector<double>> & cal_jlx( const int lmax, const size_t ix_size );
-	const vector<vector<double>> & get_jlx()const{ return jlx; }
+	const std::vector<std::vector<double>> & cal_jlx( const int lmax, const size_t ix_size );
+	const std::vector<std::vector<double>> & get_jlx()const{ return jlx; }
 	
 	void set_dx(const double dx_in);
 	double get_dx()const{ return dx; }
 
 private:
-	vector<vector<double>> jlx;		// jlx[l][x]
+	std::vector<std::vector<double>> jlx;		// jlx[l][x]
 	double dx;
 	bool finish_set_dx = false;
 	
@@ -45,14 +44,14 @@ class Sph_Bessel_Recursive::D1
 class Sph_Bessel_Recursive::D2
 {
 public:	
-	const vector<vector<vector<double>>> & cal_jlx( const int lmax, const size_t ix1_size, const size_t ix2_size );
-	const vector<vector<vector<double>>> & get_jlx()const{ return jlx; }
+	const std::vector<std::vector<std::vector<double>>> & cal_jlx( const int lmax, const size_t ix1_size, const size_t ix2_size );
+	const std::vector<std::vector<std::vector<double>>> & get_jlx()const{ return jlx; }
 	
 	void set_dx(const double dx_in);
 	double get_dx()const{ return dx; }
 
 private:
-	vector<vector<vector<double>>> jlx;		// jlx[l][x1][x2]
+	std::vector<std::vector<std::vector<double>>> jlx;		// jlx[l][x1][x2]
 	double dx;
 	bool finish_set_dx = false;
 	
@@ -72,12 +71,12 @@ class Sph_Bessel_Recursive_Pool
 	class D1
 	{
 		public:
-		static vector<Sph_Bessel_Recursive::D1> sb_pool;
+		static std::vector<Sph_Bessel_Recursive::D1> sb_pool;
 	};
 	class D2
 	{
 		public:
-		static vector<Sph_Bessel_Recursive::D2> sb_pool;
+		static std::vector<Sph_Bessel_Recursive::D2> sb_pool;
 	};
 };
 

From de5e6734cc319f94d95f0e8bd7d69ac048912670 Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Fri, 2 Apr 2021 20:28:25 +0800
Subject: [PATCH 15/60] add lat0 parameter in ORB_control::set_orb_tables
 subroutine to minimize global variables

---
 ABACUS.develop/source/run_lcao.cpp               | 2 +-
 ABACUS.develop/source/src_io/energy_dos.cpp      | 2 +-
 ABACUS.develop/source/src_io/mulliken_charge.cpp | 2 +-
 ABACUS.develop/source/src_lcao/ORB_control.cpp   | 6 ++++--
 ABACUS.develop/source/src_lcao/ORB_control.h     | 2 +-
 5 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/ABACUS.develop/source/run_lcao.cpp b/ABACUS.develop/source/run_lcao.cpp
index f2fad9aba6..6f8c19d31b 100644
--- a/ABACUS.develop/source/run_lcao.cpp
+++ b/ABACUS.develop/source/run_lcao.cpp
@@ -45,7 +45,7 @@ void Run_lcao::lcao_line(void)
 
     // * reading the localized orbitals/projectors 
 	// * construct the interpolation tables.
-	hm.orb_con.set_orb_tables(UOT);
+	hm.orb_con.set_orb_tables(UOT, ucell.lat0);
 
 	// * allocate H and S matrices according to computational resources
 	// * set the 'trace' between local H/S and global H/S
diff --git a/ABACUS.develop/source/src_io/energy_dos.cpp b/ABACUS.develop/source/src_io/energy_dos.cpp
index d39b431f68..84b8222d5f 100644
--- a/ABACUS.develop/source/src_io/energy_dos.cpp
+++ b/ABACUS.develop/source/src_io/energy_dos.cpp
@@ -327,7 +327,7 @@ void energy::perform_dos(void)
 				atom_arrange::search( SEARCH_RADIUS );//qifeng-2019-01-21
 
 				// mohan update 2021-02-10
-				hm.orb_con.set_orb_tables(UOT);
+				hm.orb_con.set_orb_tables(UOT, ucell.lat0);
 				LM.allocate_HS_R(LNNR.nnr);
 				LM.zeros_HSR('S', LNNR.nnr);
 				UHM.genH.calculate_S_no();
diff --git a/ABACUS.develop/source/src_io/mulliken_charge.cpp b/ABACUS.develop/source/src_io/mulliken_charge.cpp
index 68046f9c61..218e6968a8 100644
--- a/ABACUS.develop/source/src_io/mulliken_charge.cpp
+++ b/ABACUS.develop/source/src_io/mulliken_charge.cpp
@@ -166,7 +166,7 @@ void Mulliken_Charge::cal_mulliken(void)
 			mud[0].create(ParaO.ncol,ParaO.nrow);
 			atom_arrange::set_sr_NL();
 			atom_arrange::search( SEARCH_RADIUS );//qifeng-2019-01-21
-			hm.orb_con.set_orb_tables(UOT);
+			hm.orb_con.set_orb_tables(UOT, ucell.lat0);
 			LM.allocate_HS_R(LNNR.nnr);
 			LM.zeros_HSR('S', LNNR.nnr);
 			UHM.genH.calculate_S_no();
diff --git a/ABACUS.develop/source/src_lcao/ORB_control.cpp b/ABACUS.develop/source/src_lcao/ORB_control.cpp
index b47bc21cec..e33fe078b6 100644
--- a/ABACUS.develop/source/src_lcao/ORB_control.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_control.cpp
@@ -12,7 +12,7 @@ ORB_control::ORB_control()
 ORB_control::~ORB_control()
 {}
 
-void ORB_control::set_orb_tables(ORB_gen_tables &OGT)
+void ORB_control::set_orb_tables(ORB_gen_tables &OGT, const double &lat0)
 {
     TITLE("ORB_control","set_orb_tables");
 	timer::tick("ORB_control","set_orb_tables",'B');
@@ -41,7 +41,9 @@ void ORB_control::set_orb_tables(ORB_gen_tables &OGT)
     // 3: generate overlap & kinetic table
     OGT.gen_tables(job0);
     // init lat0, in order to interpolated value from this table.
-    OGT.set_unit(ucell.lat0);
+
+	assert(lat0>0.0);
+    OGT.set_unit(lat0);
 
 
 	timer::tick("ORB_control","set_orb_tables",'B');
diff --git a/ABACUS.develop/source/src_lcao/ORB_control.h b/ABACUS.develop/source/src_lcao/ORB_control.h
index bd34d797ed..daaf87e926 100644
--- a/ABACUS.develop/source/src_lcao/ORB_control.h
+++ b/ABACUS.develop/source/src_lcao/ORB_control.h
@@ -12,7 +12,7 @@ class ORB_control
     ~ORB_control();
 
     // Generate the S(overlap),T,NL matrix.
-    void set_orb_tables(ORB_gen_tables &OGT);
+    void set_orb_tables(ORB_gen_tables &OGT, const double &lat0);
 
     void clear_after_ions(ORB_gen_tables &OGT);
 

From 22f6cebabe84e8b1b5440427ba613488c1eae15b Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Fri, 2 Apr 2021 21:20:39 +0800
Subject: [PATCH 16/60] modify ORB_table_beta.cpp and ORB_table_beta.h

---
 .../source/src_lcao/ORB_gen_tables.cpp        |  2 +-
 .../source/src_lcao/ORB_nonlocal.cpp          | 31 ++++++-------------
 .../source/src_lcao/ORB_table_beta.cpp        | 14 ++++-----
 .../source/src_lcao/ORB_table_beta.h          |  5 ++-
 4 files changed, 22 insertions(+), 30 deletions(-)

diff --git a/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp b/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp
index c9b2ff6a32..9e20c9c93c 100644
--- a/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp
@@ -53,7 +53,7 @@ void ORB_gen_tables::gen_tables( const int &job0 )
 
 	// NL: nonlocal
 	tbeta.init_NL_Tpair();
-	tbeta.init_NL_Opair(); // add 2009-5-8
+	tbeta.init_NL_Opair(ORB); // add 2009-5-8
 
 	//caoyu add 2021-03-18
 	// DS: Descriptor
diff --git a/ABACUS.develop/source/src_lcao/ORB_nonlocal.cpp b/ABACUS.develop/source/src_lcao/ORB_nonlocal.cpp
index bf60401f0e..aa90a0150a 100644
--- a/ABACUS.develop/source/src_lcao/ORB_nonlocal.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_nonlocal.cpp
@@ -41,6 +41,7 @@ void Numerical_Nonlocal::set_type_info
 	const bool has_so
 )
 {
+	// PLEASE take care of this warning
 	if (type_in < 0 || type_in > 2)
 	{
 		WARNING("Numerical_Nonlocal", "bad input of type_in: not ready yet for type >2");
@@ -56,47 +57,35 @@ void Numerical_Nonlocal::set_type_info
 	}
 
 	this->lmax = lmax_in;
-//----------------------------------------------------------
-//EXPLAIN : Coefficient D used in calculate elements of NLps
-//----------------------------------------------------------
-/*2016-07-19, LiuXh
-	this->Coefficient_D.create( lmax_in+1, lmax_in+1);
-	for (int L1 = 0; L1 < lmax + 1; L1++)
-	{
-		for (int L2 = 0; L2 < lmax + 1; L2++)
-		{
-			this->Coefficient_D(L1, L2) = Coefficient_D_in(L1, L2);
-		}
-	}
-2016-07-19, LiuXh*/
 
 //----------------------------------------------------------
 //EXPLAIN : LfromBeta
 //----------------------------------------------------------
 	this->nproj = nproj_in;
-	if(has_so){ 
+
+	if(has_so)
+	{ 
 		this->nproj_soc = nproj_in_so;
 	}
-	//assert(nproj <= lmax_in+1); //LiuXh 2016-01-13, 2016-05-16
+
 	assert(nproj <= nproj_in+1); //LiuXh 2016-01-13, 2016-05-16
 	assert(nproj >= 0);
 
-//2016-07-19 begin, LiuXh
-	if(!has_so){
+	//2016-07-19 begin, LiuXh
+	if(!has_so)
+	{
 		this->Coefficient_D.create( nproj_in+1, nproj_in+1);
 		ZEROS(this->non_zero_count_soc, 4);
 		if(lmax_in > -1) //LiuXh add 20180328, fix bug of Hydrogen element with single projector pseudopot
-		{ //LiuXh add 20180328
-//			for (int L1 = 0; L1 < nproj + 1; L1++)
+		{
 			for (int L1 = 0; L1 < min(this->Coefficient_D.nr, Coefficient_D_in.nr); L1++)
 			{
-//				for (int L2 = 0; L2 < nproj + 1; L2++)
 				for (int L2 = 0; L2 < min(this->Coefficient_D.nc, Coefficient_D_in.nc); L2++)
 				{
 					this->Coefficient_D(L1, L2) = Coefficient_D_in(L1, L2);
 				}
 			}
-		} //LiuXh add 20180328
+		}
 	}
 	else//zhengdy-soc
 	{
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_beta.cpp b/ABACUS.develop/source/src_lcao/ORB_table_beta.cpp
index b20574c327..7c4223e449 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_beta.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_table_beta.cpp
@@ -402,11 +402,11 @@ void ORB_table_beta::init_NL_Tpair(void)
 
 
 
-void ORB_table_beta::init_NL_Opair(void)
+void ORB_table_beta::init_NL_Opair(LCAO_Orbitals &orb)
 {
-	const int lmax = ORB.get_lmax();
-	const int nchimax = ORB.get_nchimax();
-	const int nprojmax = ORB.nprojmax;
+	const int lmax = orb.get_lmax();
+	const int nchimax = orb.get_nchimax();
+	const int nprojmax = orb.nprojmax;
 	
 	// may have bug if we use all H!
 	if( nprojmax == 0)
@@ -427,13 +427,13 @@ void ORB_table_beta::init_NL_Opair(void)
 		{
 			const int nlpair = this->NL_Tpair(T1, T0);
 			int index = 0;
-			for(int L1=0; L1<ORB.Phi[T1].getLmax()+1; L1++)
+			for(int L1=0; L1<orb.Phi[T1].getLmax()+1; L1++)
 			{
-				for(int N1=0; N1<ORB.Phi[T1].getNchi(L1); N1++)
+				for(int N1=0; N1<orb.Phi[T1].getNchi(L1); N1++)
 				{
 					// notice !! T0 must be Beta( Nonlocal projector)
 					// mohan update 2011-03-07
-					for(int ip=0; ip<ORB.nproj[T0]; ip++)
+					for(int ip=0; ip<orb.nproj[T0]; ip++)
 					{
 						assert( nlpair < NL_nTpairs );
 						assert( L1 < lmax+1 );
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_beta.h b/ABACUS.develop/source/src_lcao/ORB_table_beta.h
index 17488fe18e..b39ae22b3d 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_beta.h
+++ b/ABACUS.develop/source/src_lcao/ORB_table_beta.h
@@ -2,6 +2,7 @@
 #define ORB_TABLE_BETA_H 
 
 #include "src_pw/tools.h"
+#include "ORB_read.h"
 #include "ORB_atomic.h"
 #include "ORB_atomic_lm.h"
 #include "ORB_nonlocal.h"
@@ -33,7 +34,9 @@ class ORB_table_beta
 	// O stands for orbitals.
 	//-------------------------
 	void init_NL_Tpair(void);
-    void init_NL_Opair(void);
+
+    void init_NL_Opair(LCAO_Orbitals &orb);
+
 	int NL_nTpairs;
 	IntArray NL_Tpair;
 	IntArray NL_Opair;

From 164762a9ef58107c4c0b156d949b2b01f43bda5c Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Sat, 3 Apr 2021 15:02:42 +0800
Subject: [PATCH 17/60] set ntype as an input parameter

---
 .../source/src_lcao/ORB_control.cpp           |  2 +-
 ABACUS.develop/source/src_lcao/ORB_read.cpp   | 34 +++++++++----------
 ABACUS.develop/source/src_lcao/ORB_read.h     |  8 +++--
 3 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/ABACUS.develop/source/src_lcao/ORB_control.cpp b/ABACUS.develop/source/src_lcao/ORB_control.cpp
index e33fe078b6..fc0b88b64e 100644
--- a/ABACUS.develop/source/src_lcao/ORB_control.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_control.cpp
@@ -21,7 +21,7 @@ void ORB_control::set_orb_tables(ORB_gen_tables &OGT, const double &lat0)
     // (1) FUNCTION : use 'info' to generate 'Numerical Orbital'
     // (1) RESULT : We have 'Numerical Orbital' for calculate S-table and T-table.
 	//=============================================================================
-    ORB.Read_Orbitals();
+    ORB.Read_Orbitals(ucell.ntype);
 
 	if(CALCULATION=="test")
 	{
diff --git a/ABACUS.develop/source/src_lcao/ORB_read.cpp b/ABACUS.develop/source/src_lcao/ORB_read.cpp
index 3e87061e70..ed69e3c801 100644
--- a/ABACUS.develop/source/src_lcao/ORB_read.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_read.cpp
@@ -97,7 +97,7 @@ void LCAO_Orbitals::bcast_files(void)
 #endif
 
 
-void LCAO_Orbitals::Read_Orbitals(void)
+void LCAO_Orbitals::Read_Orbitals(const int &ntype_in)
 {
 	TITLE("LCAO_Orbitals", "Read_Orbitals");
 	timer::tick("LCAO_Orbitals","Read_Orbitals",'C');
@@ -140,9 +140,11 @@ void LCAO_Orbitals::Read_Orbitals(void)
     assert(dR > 0.0);
     assert(Rmax > 0.0);
 
-	this->ntype = ucell.ntype;
+	this->ntype = ntype_in; 
+	assert(ntype>0);
+
 	this->lmax = ucell.lmax;
-	for(int i=0; i<ucell.ntype; i++)
+	for(int i=0; i<ntype; i++)
 	{
 		OUT(ofs_running,"atom label",ucell.atoms[i].label);
 	}
@@ -181,12 +183,9 @@ void LCAO_Orbitals::Read_Orbitals(void)
 	// Read in numerical atomic orbitals for each atom type.
 	//>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
 	delete[] this->Phi;
-// PLEASE avoid using 'ucell' as global variable 
-// if 'ntype' is really needed, the variable should be initialized
-// as a parameter of this class
-// mohan note 2021-03-23
-	this->Phi = new Numerical_Orbital[ucell.ntype];
-	for(int it=0; it<ucell.ntype; it++)
+
+	this->Phi = new Numerical_Orbital[ntype];
+	for(int it=0; it<ntype; it++)
 	{
 		this->Read_PAO(it);	
 	}
@@ -201,11 +200,11 @@ void LCAO_Orbitals::Read_Orbitals(void)
 	// mohan note 2011-03-04
 	//>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
 	delete[] this->Beta;
-	this->Beta = new Numerical_Nonlocal[ucell.ntype];
+	this->Beta = new Numerical_Nonlocal[ntype];
 
 	delete[] nproj;
-	this->nproj = new int[ucell.ntype];
-	ZEROS(nproj, ucell.ntype);
+	this->nproj = new int[ntype];
+	ZEROS(nproj, ntype);
 	
 	this->nprojmax = 0;
 	
@@ -214,7 +213,7 @@ void LCAO_Orbitals::Read_Orbitals(void)
 	// if false: get nonlocal information from .upf or .vwr directly
 	bool readin_nonlocal = false;
 
-	for(int it=0; it<ucell.ntype; it++)
+	for(int it=0; it<ntype; it++)
 	{
 		if(readin_nonlocal)
 		{
@@ -942,7 +941,8 @@ void LCAO_Orbitals::Read_PAO(const int& it)
 void LCAO_Orbitals::set_nl_index(void)
 {
 	TITLE("LCAO_Orbitals","set_nl_index");
-	int ntype = ucell.ntype;
+
+	assert(this->ntype>0);
 
 	this->nkb=0;
 	for(int it=0; it<ntype; it++)
@@ -962,7 +962,7 @@ void LCAO_Orbitals::set_nl_index(void)
 	this->itiaib2ib_all.create(ntype, ucell.namax, this->nkb);
 
 	int ib_all = 0;
-	for(int it=0; it<ucell.ntype; it++)
+	for(int it=0; it<ntype; it++)
 	{
 		for(int ia=0; ia<ucell.atoms[it].na; ia++)
 		{
@@ -987,13 +987,13 @@ void LCAO_Orbitals::set_nl_index(void)
 
 
 	int nh_max = 0;
-	for(int it=0; it<ucell.ntype; it++)
+	for(int it=0; it<ntype; it++)
 	{
 		nh_max = max(nh_max, ucell.atoms[it].nh);
 	}
 
 	this->ib2_ylm.create(ntype, nh_max);
-	for(int it=0; it<ucell.ntype; it++)
+	for(int it=0; it<ntype; it++)
 	{
 		int index = 0;
 		for(int ib=0; ib< this->nproj[it]; ib++)
diff --git a/ABACUS.develop/source/src_lcao/ORB_read.h b/ABACUS.develop/source/src_lcao/ORB_read.h
index 06ed11f642..84dc3b5ba0 100644
--- a/ABACUS.develop/source/src_lcao/ORB_read.h
+++ b/ABACUS.develop/source/src_lcao/ORB_read.h
@@ -20,7 +20,7 @@ class LCAO_Orbitals
 	LCAO_Orbitals();
 	~LCAO_Orbitals();
 
-	void Read_Orbitals(void);
+	void Read_Orbitals(const int &ntype_in);
 
 	void Read_PAO(const int& it);
 
@@ -30,7 +30,6 @@ class LCAO_Orbitals
 	// read in the NONLOCAL projector from file.
 	void Read_NonLocal(const int& it, int &n_projectors);
 
-	void set_nl_index(void);
 
 	void Read_Descriptor(void);		//caoyu add 2020-3-16
 
@@ -87,7 +86,10 @@ class LCAO_Orbitals
 	int nchimax;
 	int lmax_d;	//caoyu add 2021-03-17
 	int nchimax_d;	//caoyu add 2021-03-17
-	int ntype;
+	int ntype; // number of elements
+
+
+	void set_nl_index(void);
 
 };
 

From 31352223cbd033b2c98a343763bd15adf323bd8b Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Sat, 3 Apr 2021 16:12:13 +0800
Subject: [PATCH 18/60] keep eliminating global variables in ORB files

---
 .../src_global/global_function-func_each_2.h  | 19 +------------
 ABACUS.develop/source/src_global/sph_bessel.h |  1 -
 ABACUS.develop/source/src_io/energy_dos.cpp   |  2 +-
 .../source/src_io/mulliken_charge.cpp         |  2 +-
 ABACUS.develop/source/src_lcao/LOOP_ions.cpp  |  2 +-
 .../source/src_lcao/ORB_control.cpp           |  4 +--
 ABACUS.develop/source/src_lcao/ORB_control.h  |  3 +-
 .../source/src_lcao/ORB_gen_tables.h          | 12 +++-----
 .../source/src_lcao/ORB_table_alpha.cpp       |  6 ++--
 .../source/src_lcao/ORB_table_alpha.h         |  9 ++----
 .../source/src_lcao/ORB_table_beta.cpp        | 28 ++++++++-----------
 .../source/src_lcao/ORB_table_beta.h          | 15 ++++------
 .../source/src_lcao/ORB_table_phi.h           |  2 +-
 ABACUS.develop/source/src_lcao/run_md.cpp     |  2 +-
 14 files changed, 39 insertions(+), 68 deletions(-)

diff --git a/ABACUS.develop/source/src_global/global_function-func_each_2.h b/ABACUS.develop/source/src_global/global_function-func_each_2.h
index 76561f3c3a..1d7b01d31f 100644
--- a/ABACUS.develop/source/src_global/global_function-func_each_2.h
+++ b/ABACUS.develop/source/src_global/global_function-func_each_2.h
@@ -1,23 +1,6 @@
 // AUTHOR:	Peize Lin
 // Date: 	2016-09-07
 
-
-/*˵����
-����
-	T tA;
-	const T tB;
-	const ��������(������);
-��
-	func( tA, tB, �������� );
-���
-	L1<L2<...<Ln<T>>...>> t_listA;
-	const L1<L2<...<Ln<T>>...>> t_listB;
-	������L1��L2��...LnΪvector��map��
-��
-	FUNC_EACH_2( t_listA, t_listB, func, �������� );
-*/
-
-
 #ifndef FUNC_EACH_2_H
 #define FUNC_EACH_2_H
 
@@ -63,4 +46,4 @@ void FUNC_EACH_2(
 	}
 }
 
-#endif // FUNC_EACH_2_H
\ No newline at end of file
+#endif // FUNC_EACH_2_H
diff --git a/ABACUS.develop/source/src_global/sph_bessel.h b/ABACUS.develop/source/src_global/sph_bessel.h
index 4b5a7391e1..c83dc6e9c4 100644
--- a/ABACUS.develop/source/src_global/sph_bessel.h
+++ b/ABACUS.develop/source/src_global/sph_bessel.h
@@ -1,7 +1,6 @@
 #ifndef SPH_BESSEL_H
 #define SPH_BESSEL_H
 
-#include "realarray.h"
 using namespace std;
 
 class Sph_Bessel
diff --git a/ABACUS.develop/source/src_io/energy_dos.cpp b/ABACUS.develop/source/src_io/energy_dos.cpp
index 84b8222d5f..13c1273750 100644
--- a/ABACUS.develop/source/src_io/energy_dos.cpp
+++ b/ABACUS.develop/source/src_io/energy_dos.cpp
@@ -409,7 +409,7 @@ void energy::perform_dos(void)
 				atom_arrange::delete_vector( SEARCH_RADIUS );
 #endif
 				// mohan update 2021-02-10
-				hm.orb_con.clear_after_ions(UOT);
+				hm.orb_con.clear_after_ions(UOT, ORB);
 			}//else
 
 		 MPI_Reduce(pdosk[is].c, pdos[is].c , NUM , MPI_DOUBLE , MPI_SUM, 0, MPI_COMM_WORLD);
diff --git a/ABACUS.develop/source/src_io/mulliken_charge.cpp b/ABACUS.develop/source/src_io/mulliken_charge.cpp
index 218e6968a8..a30ef92e8e 100644
--- a/ABACUS.develop/source/src_io/mulliken_charge.cpp
+++ b/ABACUS.develop/source/src_io/mulliken_charge.cpp
@@ -224,7 +224,7 @@ void Mulliken_Charge::cal_mulliken(void)
 #ifdef __MPI
 			atom_arrange::delete_vector( SEARCH_RADIUS );
 #endif
-			hm.orb_con.clear_after_ions(UOT);
+			hm.orb_con.clear_after_ions(UOT, ORB);
 
 		}//else                     
 		MPI_Reduce(MecMulP[is], DecMulP[is] , NLOCAL , MPI_DOUBLE , MPI_SUM, 0, MPI_COMM_WORLD);
diff --git a/ABACUS.develop/source/src_lcao/LOOP_ions.cpp b/ABACUS.develop/source/src_lcao/LOOP_ions.cpp
index 1373db65a0..ed2338819c 100644
--- a/ABACUS.develop/source/src_lcao/LOOP_ions.cpp
+++ b/ABACUS.develop/source/src_lcao/LOOP_ions.cpp
@@ -225,7 +225,7 @@ void LOOP_ions::opt_ions(void)
     }
 
 	// mohan update 2021-02-10
-    hm.orb_con.clear_after_ions(UOT);
+    hm.orb_con.clear_after_ions(UOT, ORB);
 
     timer::tick("LOOP_ions","opt_ions",'B'); 
     return;
diff --git a/ABACUS.develop/source/src_lcao/ORB_control.cpp b/ABACUS.develop/source/src_lcao/ORB_control.cpp
index fc0b88b64e..7f5d1fddbd 100644
--- a/ABACUS.develop/source/src_lcao/ORB_control.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_control.cpp
@@ -50,11 +50,11 @@ void ORB_control::set_orb_tables(ORB_gen_tables &OGT, const double &lat0)
     return;
 }
 
-void ORB_control::clear_after_ions(ORB_gen_tables &OGT)
+void ORB_control::clear_after_ions(ORB_gen_tables &OGT, LCAO_Orbitals &orb)
 {
     TITLE("ORB_control","clear_after_ions");
     OGT.MOT.Destroy_Table();
-    OGT.tbeta.Destroy_Table_Beta();
+    OGT.tbeta.Destroy_Table_Beta(orb);
     
 	//caoyu add 2021-03-18
     if (INPUT.out_descriptor && BASIS_TYPE == "lcao") 
diff --git a/ABACUS.develop/source/src_lcao/ORB_control.h b/ABACUS.develop/source/src_lcao/ORB_control.h
index daaf87e926..1e879f1863 100644
--- a/ABACUS.develop/source/src_lcao/ORB_control.h
+++ b/ABACUS.develop/source/src_lcao/ORB_control.h
@@ -2,6 +2,7 @@
 #define ORB_CONTROL_H 
 
 #include "ORB_gen_tables.h"
+#include "ORB_read.h"
 
 class ORB_control 
 {
@@ -14,7 +15,7 @@ class ORB_control
     // Generate the S(overlap),T,NL matrix.
     void set_orb_tables(ORB_gen_tables &OGT, const double &lat0);
 
-    void clear_after_ions(ORB_gen_tables &OGT);
+    void clear_after_ions(ORB_gen_tables &OGT, LCAO_Orbitals &orb);
 
 };
 #endif
diff --git a/ABACUS.develop/source/src_lcao/ORB_gen_tables.h b/ABACUS.develop/source/src_lcao/ORB_gen_tables.h
index 05b5c07906..18f9f0f3f2 100644
--- a/ABACUS.develop/source/src_lcao/ORB_gen_tables.h
+++ b/ABACUS.develop/source/src_lcao/ORB_gen_tables.h
@@ -1,13 +1,9 @@
-//=========================================================
-//AUTHOR : Mohan 
-//DATE : 2009-04-22
-//=========================================================
-#ifndef USE_OVERLAP_TABLE_H
-#define USE_OVERLAP_TABLE_H
+#ifndef ORB_GEN_TABLES_H
+#define ORB_GEN_TABLES_H
 
-#include "src_pw/tools.h"
+#include "../src_pw/tools.h"
+#include "../src_global/ylm.h"
 #include "ORB_gaunt_table.h"
-#include "src_global/ylm.h"
 #include "ORB_table_beta.h"
 #include "ORB_table_phi.h"
 #include "ORB_table_alpha.h"		//caoyu add 2020-3-18
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_alpha.cpp b/ABACUS.develop/source/src_lcao/ORB_table_alpha.cpp
index 3ab48ec50a..462956c64b 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_alpha.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_table_alpha.cpp
@@ -148,6 +148,7 @@ void ORB_table_alpha::cal_S_PhiAlpha_R(
 	const vector<vector<double>>& jlm1 = pSB->get_jlx()[l - 1];
 	const vector<vector<double>>& jl = pSB->get_jlx()[l];
 	const vector<vector<double>>& jlp1 = pSB->get_jlx()[l + 1];
+
 	for (int ir = 0; ir < rmesh; ir++)
 	{
 		ZEROS(integrated_func, kmesh);
@@ -211,9 +212,8 @@ void ORB_table_alpha::cal_S_PhiAlpha_R(
 	}
 
 	delete[] integrated_func;
-
-
 	delete[] k1_dot_k2;
+
 	timer::tick("ORB_table_alpha", "S_PhiAlpha_R");
 	return;
 }
@@ -224,6 +224,8 @@ void ORB_table_alpha::init_Table_Alpha(Sph_Bessel_Recursive::D2* pSB)
 	TITLE("ORB_table_alpha", "init_Table_Alpha");
 	timer::tick("ORB_table_alpha", "init_Table_Alpha", 'D');
 
+	assert(ntype>0);
+
 	// (1) allocate 1st dimension ( overlap, derivative)
 	this->Table_DSR = new double**** [2];
 	// (2) allocate 2nd dimension ( overlap, derivative)
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_alpha.h b/ABACUS.develop/source/src_lcao/ORB_table_alpha.h
index 3c56c28d77..b6c6eacf0c 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_alpha.h
+++ b/ABACUS.develop/source/src_lcao/ORB_table_alpha.h
@@ -1,14 +1,10 @@
-//caoyu add 2021-03-17
-
 #ifndef ORB_TABLE_ALPHA_H 
 #define ORB_TABLE_ALPHA_H 
 
-#include "src_pw/tools.h"
-#include "ORB_atomic.h"
 #include "ORB_atomic_lm.h"
-#include "ORB_gaunt_table.h"
-#include "src_global/sph_bessel_recursive.h"
+#include "../src_global/sph_bessel_recursive.h"
 
+//caoyu add 2021-03-17
 
 class ORB_table_alpha
 {
@@ -31,6 +27,7 @@ class ORB_table_alpha
 	//-------------------------
 	// O stands for orbitals.
 	//-------------------------
+
 	void init_DS_Opair(void);
 	void init_DS_2Lplus1(void);
 	IntArray DS_Opair;
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_beta.cpp b/ABACUS.develop/source/src_lcao/ORB_table_beta.cpp
index 7c4223e449..c28464a1ae 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_beta.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_table_beta.cpp
@@ -1,7 +1,6 @@
+#include <stdexcept>
 #include "ORB_table_beta.h"
 #include "ORB_read.h"
-#include <stdexcept>
-#include "../src_ri/exx_abfs.h"
 
 double ORB_table_beta::dr = -1.0;
 
@@ -121,11 +120,11 @@ int ORB_table_beta::get_rmesh(const double &R1, const double &R2)
 
 void ORB_table_beta::cal_VNL_PhiBeta_R(
 		Sph_Bessel_Recursive::D2 *pSB, // mohan add 2021-03-06
-        const int &l,
-        const Numerical_Orbital_Lm &n1,
-        const Numerical_Nonlocal_Lm &n2,
-        const int &rmesh,
-        double *rs,
+		const int &l,
+		const Numerical_Orbital_Lm &n1,
+		const Numerical_Nonlocal_Lm &n2,
+		const int &rmesh,
+		double *rs,
 		double *drs)
 {
 	timer::tick ("ORB_table_beta", "VNL_PhiBeta_R");
@@ -209,9 +208,8 @@ void ORB_table_beta::cal_VNL_PhiBeta_R(
 	}
 	
 	delete [] integrated_func;
-	
-
 	delete[] k1_dot_k2;
+
 	timer::tick ("ORB_table_beta", "VNL_PhiBeta_R");
 	return;
 }
@@ -227,7 +225,6 @@ void ORB_table_beta::init_Table_Beta(Sph_Bessel_Recursive::D2 *pSB)
 	// (2) allocate 2nd dimension ( overlap, derivative)
 	this->Table_NR[0] = new double*** [this->NL_nTpairs];
 	this->Table_NR[1] = new double*** [this->NL_nTpairs];
-
 	
 	// <1Phi|2Beta> 
 	for (int T1 = 0;  T1 < ntype ; T1++) // type 1 is orbital
@@ -236,8 +233,7 @@ void ORB_table_beta::init_Table_Beta(Sph_Bessel_Recursive::D2 *pSB)
 		{
 			// Tpair: type pair.
 			const int Tpair=this->NL_Tpair(T1,T2);
-			const int Lmax1 = ORB.Phi[T1].getLmax();
-			
+			const int Lmax1 = ORB.Phi[T1].getLmax();			
 			const int NBeta = ORB.nproj[T2];
 			
 			//-------------------------------------------------------------
@@ -262,7 +258,6 @@ void ORB_table_beta::init_Table_Beta(Sph_Bessel_Recursive::D2 *pSB)
             {
                 for (int N1 = 0; N1 < ORB.Phi[T1].getNchi(L1); N1++)
 				{
-					
 					// number of projectors.
 					for (int nb = 0; nb < NBeta; nb ++)
 					{
@@ -304,6 +299,7 @@ void ORB_table_beta::init_Table_Beta(Sph_Bessel_Recursive::D2 *pSB)
 							}
 
 							assert(nb < ORB.nproj[T2]);	
+
 							this->cal_VNL_PhiBeta_R(
 								pSB, // mohan add 2021-03-06
 								L,
@@ -327,11 +323,11 @@ void ORB_table_beta::init_Table_Beta(Sph_Bessel_Recursive::D2 *pSB)
 }
 
 
-void ORB_table_beta::Destroy_Table_Beta(void)
+void ORB_table_beta::Destroy_Table_Beta(LCAO_Orbitals &orb)
 {
 	if(!destroy_nr) return;
 
-	const int ntype = ORB.get_ntype();
+	const int ntype = orb.get_ntype();
 	for(int ir = 0; ir < 2; ir ++)
 	{
 		for(int T1=0; T1<ntype; T1++)
@@ -340,7 +336,7 @@ void ORB_table_beta::Destroy_Table_Beta(void)
 			{
 				const int Tpair = this->NL_Tpair(T1,T2); 
 				const int L2plus1 = this->NL_L2plus1(T1,T2);
-				const int pairs = ORB.Phi[T1].getTotal_nchi() * ORB.nproj[T2]; 
+				const int pairs = orb.Phi[T1].getTotal_nchi() * orb.nproj[T2]; 
 
 				// mohan fix bug 2011-03-30
 				if(pairs ==0) continue;
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_beta.h b/ABACUS.develop/source/src_lcao/ORB_table_beta.h
index b39ae22b3d..fe14ce1360 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_beta.h
+++ b/ABACUS.develop/source/src_lcao/ORB_table_beta.h
@@ -1,14 +1,9 @@
 #ifndef ORB_TABLE_BETA_H 
 #define ORB_TABLE_BETA_H 
 
-#include "src_pw/tools.h"
-#include "ORB_read.h"
-#include "ORB_atomic.h"
-#include "ORB_atomic_lm.h"
-#include "ORB_nonlocal.h"
-#include "ORB_nonlocal_lm.h"
-#include "ORB_gaunt_table.h"
-#include "src_global/sph_bessel_recursive.h"
+#include "ORB_read.h" // use LCAO_Orbitals
+#include "ORB_atomic_lm.h" // use Numerical_Orbital_Lm
+#include "../src_global/sph_bessel_recursive.h" // use Sph_Bessel_Recursive
 
 class ORB_table_beta
 {
@@ -44,11 +39,12 @@ class ORB_table_beta
 
 	void init_Table_Beta(Sph_Bessel_Recursive::D2 *pSB);
 
-	void Destroy_Table_Beta(void);
+	void Destroy_Table_Beta(LCAO_Orbitals &orb);
 
 	static int get_rmesh( const double &R1, const double &R2);
 
 	static double dr;
+
 	int Rmesh;
 
 	private:
@@ -69,6 +65,7 @@ class ORB_table_beta
 	double dk;
 	int nlm;
 	int kmesh;
+
 	double *kpoint;
 	double *r;
 	double *rab;
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_phi.h b/ABACUS.develop/source/src_lcao/ORB_table_phi.h
index 416a51075a..9942123bfd 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_phi.h
+++ b/ABACUS.develop/source/src_lcao/ORB_table_phi.h
@@ -6,7 +6,7 @@
 #include "ORB_atomic_lm.h"
 #include "ORB_gaunt_table.h"
 #include "center2_orb.h"
-#include "src_global/sph_bessel_recursive.h"
+#include "../src_global/sph_bessel_recursive.h"
 #include <set>
 
 class ORB_table_phi
diff --git a/ABACUS.develop/source/src_lcao/run_md.cpp b/ABACUS.develop/source/src_lcao/run_md.cpp
index b8767fdd9c..94058284bb 100644
--- a/ABACUS.develop/source/src_lcao/run_md.cpp
+++ b/ABACUS.develop/source/src_lcao/run_md.cpp
@@ -236,7 +236,7 @@ void Run_MD::opt_ions(void)
     }
 
 	// mohan update 2021-02-10
-    hm.orb_con.clear_after_ions(UOT);
+    hm.orb_con.clear_after_ions(UOT, ORB);
 
     timer::tick("Run_MD","opt_ions",'B'); 
     return;

From d262d735b239a549f359cd67384201de076bb774 Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Sat, 3 Apr 2021 16:43:31 +0800
Subject: [PATCH 19/60] keep eliminating global variables in ORB

---
 ABACUS.develop/source/run_lcao.cpp            |  2 +-
 ABACUS.develop/source/src_io/energy_dos.cpp   |  2 +-
 .../source/src_io/mulliken_charge.cpp         |  2 +-
 .../source/src_lcao/ORB_control.cpp           | 11 ++-
 ABACUS.develop/source/src_lcao/ORB_control.h  |  9 +-
 .../source/src_lcao/ORB_gen_tables.cpp        | 50 +++++------
 .../source/src_lcao/ORB_gen_tables.h          |  3 +-
 .../source/src_lcao/ORB_table_phi.cpp         | 84 ++++++++++---------
 .../source/src_lcao/ORB_table_phi.h           | 39 ++++++---
 9 files changed, 116 insertions(+), 86 deletions(-)

diff --git a/ABACUS.develop/source/run_lcao.cpp b/ABACUS.develop/source/run_lcao.cpp
index 6f8c19d31b..d4661b2006 100644
--- a/ABACUS.develop/source/run_lcao.cpp
+++ b/ABACUS.develop/source/run_lcao.cpp
@@ -45,7 +45,7 @@ void Run_lcao::lcao_line(void)
 
     // * reading the localized orbitals/projectors 
 	// * construct the interpolation tables.
-	hm.orb_con.set_orb_tables(UOT, ucell.lat0);
+	hm.orb_con.set_orb_tables(UOT, ORB, ucell.lat0);
 
 	// * allocate H and S matrices according to computational resources
 	// * set the 'trace' between local H/S and global H/S
diff --git a/ABACUS.develop/source/src_io/energy_dos.cpp b/ABACUS.develop/source/src_io/energy_dos.cpp
index 13c1273750..1c975ac8d9 100644
--- a/ABACUS.develop/source/src_io/energy_dos.cpp
+++ b/ABACUS.develop/source/src_io/energy_dos.cpp
@@ -327,7 +327,7 @@ void energy::perform_dos(void)
 				atom_arrange::search( SEARCH_RADIUS );//qifeng-2019-01-21
 
 				// mohan update 2021-02-10
-				hm.orb_con.set_orb_tables(UOT, ucell.lat0);
+				hm.orb_con.set_orb_tables(UOT, ORB, ucell.lat0);
 				LM.allocate_HS_R(LNNR.nnr);
 				LM.zeros_HSR('S', LNNR.nnr);
 				UHM.genH.calculate_S_no();
diff --git a/ABACUS.develop/source/src_io/mulliken_charge.cpp b/ABACUS.develop/source/src_io/mulliken_charge.cpp
index a30ef92e8e..7ce98a1d9d 100644
--- a/ABACUS.develop/source/src_io/mulliken_charge.cpp
+++ b/ABACUS.develop/source/src_io/mulliken_charge.cpp
@@ -166,7 +166,7 @@ void Mulliken_Charge::cal_mulliken(void)
 			mud[0].create(ParaO.ncol,ParaO.nrow);
 			atom_arrange::set_sr_NL();
 			atom_arrange::search( SEARCH_RADIUS );//qifeng-2019-01-21
-			hm.orb_con.set_orb_tables(UOT, ucell.lat0);
+			hm.orb_con.set_orb_tables(UOT, ORB, ucell.lat0);
 			LM.allocate_HS_R(LNNR.nnr);
 			LM.zeros_HSR('S', LNNR.nnr);
 			UHM.genH.calculate_S_no();
diff --git a/ABACUS.develop/source/src_lcao/ORB_control.cpp b/ABACUS.develop/source/src_lcao/ORB_control.cpp
index 7f5d1fddbd..c2b87ec409 100644
--- a/ABACUS.develop/source/src_lcao/ORB_control.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_control.cpp
@@ -12,7 +12,10 @@ ORB_control::ORB_control()
 ORB_control::~ORB_control()
 {}
 
-void ORB_control::set_orb_tables(ORB_gen_tables &OGT, const double &lat0)
+void ORB_control::set_orb_tables(
+	ORB_gen_tables &OGT, 
+	LCAO_Orbitals &orb,
+	const double &lat0)
 {
     TITLE("ORB_control","set_orb_tables");
 	timer::tick("ORB_control","set_orb_tables",'B');
@@ -21,7 +24,7 @@ void ORB_control::set_orb_tables(ORB_gen_tables &OGT, const double &lat0)
     // (1) FUNCTION : use 'info' to generate 'Numerical Orbital'
     // (1) RESULT : We have 'Numerical Orbital' for calculate S-table and T-table.
 	//=============================================================================
-    ORB.Read_Orbitals(ucell.ntype);
+    orb.Read_Orbitals(ucell.ntype);
 
 	if(CALCULATION=="test")
 	{
@@ -39,7 +42,7 @@ void ORB_control::set_orb_tables(ORB_gen_tables &OGT, const double &lat0)
     // 1: generate overlap table
     // 2: generate kinetic table
     // 3: generate overlap & kinetic table
-    OGT.gen_tables(job0);
+    OGT.gen_tables(job0, orb);
     // init lat0, in order to interpolated value from this table.
 
 	assert(lat0>0.0);
@@ -53,7 +56,7 @@ void ORB_control::set_orb_tables(ORB_gen_tables &OGT, const double &lat0)
 void ORB_control::clear_after_ions(ORB_gen_tables &OGT, LCAO_Orbitals &orb)
 {
     TITLE("ORB_control","clear_after_ions");
-    OGT.MOT.Destroy_Table();
+    OGT.MOT.Destroy_Table(orb);
     OGT.tbeta.Destroy_Table_Beta(orb);
     
 	//caoyu add 2021-03-18
diff --git a/ABACUS.develop/source/src_lcao/ORB_control.h b/ABACUS.develop/source/src_lcao/ORB_control.h
index 1e879f1863..ea3f9ddb58 100644
--- a/ABACUS.develop/source/src_lcao/ORB_control.h
+++ b/ABACUS.develop/source/src_lcao/ORB_control.h
@@ -13,9 +13,14 @@ class ORB_control
     ~ORB_control();
 
     // Generate the S(overlap),T,NL matrix.
-    void set_orb_tables(ORB_gen_tables &OGT, const double &lat0);
+    void set_orb_tables(
+		ORB_gen_tables &OGT, 
+		LCAO_Orbitals &orb,
+		const double &lat0);
 
-    void clear_after_ions(ORB_gen_tables &OGT, LCAO_Orbitals &orb);
+    void clear_after_ions(
+		ORB_gen_tables &OGT, 
+		LCAO_Orbitals &orb);
 
 };
 #endif
diff --git a/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp b/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp
index 9e20c9c93c..a84acdd24c 100644
--- a/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp
@@ -10,7 +10,7 @@ ORB_gen_tables::ORB_gen_tables(){}
 ORB_gen_tables::~ORB_gen_tables(){}
 
 // call in hamilt_linear::init_before_ions.
-void ORB_gen_tables::gen_tables( const int &job0 )
+void ORB_gen_tables::gen_tables( const int &job0, LCAO_Orbitals &orb )
 {
 	TITLE("ORB_gen_tables","gen_tables");
 	timer::tick("ORB_gen_tables","gen_tables",'C');
@@ -21,39 +21,39 @@ void ORB_gen_tables::gen_tables( const int &job0 )
 	// (1) MOT: make overlap table.
 	//=========================================
 	MOT.allocate(
-		ORB.get_ntype(),// number of atom types
-        ORB.get_lmax(),// max L used to calculate overlap
-        ORB.get_kmesh(), // kpoints, for integration in k space
-        ORB.get_Rmax(),// max value of radial table
-        ORB.get_dR(),// delta R, for making radial table
-        ORB.get_dk() ); // delta k, for integration in k space
+		orb.get_ntype(),// number of atom types
+        orb.get_lmax(),// max L used to calculate overlap
+        orb.get_kmesh(), // kpoints, for integration in k space
+        orb.get_Rmax(),// max value of radial table
+        orb.get_dR(),// delta R, for making radial table
+        orb.get_dk() ); // delta k, for integration in k space
 
 	tbeta.allocate(
-		ORB.get_ntype(),// number of atom types
-        ORB.get_lmax(),// max L used to calculate overlap
-        ORB.get_kmesh(), // kpoints, for integration in k space
-        ORB.get_Rmax(),// max value of radial table
-        ORB.get_dR(),// delta R, for making radial table
-        ORB.get_dk() ); // delta k, for integration in k space
+		orb.get_ntype(),// number of atom types
+        orb.get_lmax(),// max L used to calculate overlap
+        orb.get_kmesh(), // kpoints, for integration in k space
+        orb.get_Rmax(),// max value of radial table
+        orb.get_dR(),// delta R, for making radial table
+        orb.get_dk() ); // delta k, for integration in k space
 
 	//caoyu add 2021-03-18
 	if (INPUT.out_descriptor && BASIS_TYPE == "lcao") {
 		talpha.allocate(
-			ORB.get_ntype(),// number of atom types
-			ORB.get_lmax(),// max L used to calculate overlap
-			ORB.get_kmesh(), // kpoints, for integration in k space
-			ORB.get_Rmax(),// max value of radial table
-			ORB.get_dR(),// delta R, for making radial table
-			ORB.get_dk()); // delta k, for integration in k space
+			orb.get_ntype(),// number of atom types
+			orb.get_lmax(),// max L used to calculate overlap
+			orb.get_kmesh(), // kpoints, for integration in k space
+			orb.get_Rmax(),// max value of radial table
+			orb.get_dR(),// delta R, for making radial table
+			orb.get_dk()); // delta k, for integration in k space
 	}
 
 	// OV: overlap
-	MOT.init_OV_Tpair();
-	MOT.init_OV_Opair();
+	MOT.init_OV_Tpair(orb);
+	MOT.init_OV_Opair(orb);
 
 	// NL: nonlocal
 	tbeta.init_NL_Tpair();
-	tbeta.init_NL_Opair(ORB); // add 2009-5-8
+	tbeta.init_NL_Opair(orb); // add 2009-5-8
 
 	//caoyu add 2021-03-18
 	// DS: Descriptor
@@ -73,7 +73,7 @@ void ORB_gen_tables::gen_tables( const int &job0 )
 	MOT.init_Table_Spherical_Bessel (2,1, Lmax_used, Lmax);
 	
 	//calculate S(R) for interpolation
-	MOT.init_Table(job0);
+	MOT.init_Table(job0, orb);
 	tbeta.init_Table_Beta( MOT.pSB );// add 2009-5-8
 
 	//caoyu add 2021-03-18
@@ -87,9 +87,9 @@ void ORB_gen_tables::gen_tables( const int &job0 )
 	//=========================================
 
 	const int lmax = (Lmax_used-1) / 2 ;
-	//MGT.init_Ylm_Gaunt(ORB.get_lmax()+1, 0.0,PI,0.0,TWO_PI);
+	//MGT.init_Ylm_Gaunt(orb.get_lmax()+1, 0.0,PI,0.0,TWO_PI);
 	MGT.init_Gaunt_CH( lmax );
-	//MGT.init_Gaunt(ORB.get_lmax()+1);
+	//MGT.init_Gaunt(orb.get_lmax()+1);
 	MGT.init_Gaunt( lmax );
 
 
diff --git a/ABACUS.develop/source/src_lcao/ORB_gen_tables.h b/ABACUS.develop/source/src_lcao/ORB_gen_tables.h
index 18f9f0f3f2..dd24b76cd0 100644
--- a/ABACUS.develop/source/src_lcao/ORB_gen_tables.h
+++ b/ABACUS.develop/source/src_lcao/ORB_gen_tables.h
@@ -7,6 +7,7 @@
 #include "ORB_table_beta.h"
 #include "ORB_table_phi.h"
 #include "ORB_table_alpha.h"		//caoyu add 2020-3-18
+#include "ORB_read.h"
 
 //------------------------------------
 // used to be 'Use_Overlap_Table',
@@ -21,7 +22,7 @@ class ORB_gen_tables
 	ORB_gen_tables();
 	~ORB_gen_tables();
 
-	void gen_tables( const int &job0 );
+	void gen_tables( const int &job0, LCAO_Orbitals &orb);
 	void set_unit( const double &v ){lat0=v;}
 	
 	void snap_psipsi(
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_phi.cpp b/ABACUS.develop/source/src_lcao/ORB_table_phi.cpp
index a55f78d433..a71d3c8d03 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_phi.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_table_phi.cpp
@@ -383,11 +383,13 @@ void ORB_table_phi::cal_ST_Phi12_R
 
 
 
-void ORB_table_phi::init_Table( const int &job0 )
+void ORB_table_phi::init_Table(
+	const int &job0, 
+	LCAO_Orbitals &orb)
 {
 	TITLE("ORB_table_phi", "init_Table");
 	timer::tick("ORB_table_phi", "init_Table",'D');
-	const int ntype = ORB.get_ntype();
+	const int ntype = orb.get_ntype();
 	assert( ORB_table_phi::dr > 0.0);
 	assert( OV_nTpairs>0);
 
@@ -430,8 +432,8 @@ void ORB_table_phi::init_Table( const int &job0 )
 		{
 			// get the bigger lmax between two types
 			const int Tpair=this->OV_Tpair(T1,T2);
-			const int Lmax1 = ORB.Phi[T1].getLmax();
-			const int Lmax2 = ORB.Phi[T2].getLmax();
+			const int Lmax1 = orb.Phi[T1].getLmax();
+			const int Lmax2 = orb.Phi[T2].getLmax();
 
 			//L2plus1 could be reduced by considering Gaunt Coefficient
 			//remain to be modified
@@ -448,8 +450,8 @@ void ORB_table_phi::init_Table( const int &job0 )
 			
 			const int L2plus1 =  2*lmax_now + 1;
 
-			const int nchi1 = ORB.Phi[T1].getTotal_nchi();
-			const int nchi2 = ORB.Phi[T2].getTotal_nchi();
+			const int nchi1 = orb.Phi[T1].getTotal_nchi();
+			const int nchi2 = orb.Phi[T2].getTotal_nchi();
 			const int pairs_chi = nchi1 * nchi2;
 
 			// init 2nd dimension
@@ -474,8 +476,8 @@ void ORB_table_phi::init_Table( const int &job0 )
 				break;
 			}
 
-			const double Rcut1 = ORB.Phi[T1].getRcut();
-			const double Rcut2 = ORB.Phi[T2].getRcut();
+			const double Rcut1 = orb.Phi[T1].getRcut();
+			const double Rcut2 = orb.Phi[T2].getRcut();
 			assert(Rcut1>0.0 && Rcut1<100);
 			assert(Rcut2>0.0 && Rcut2<100);
 
@@ -484,11 +486,11 @@ void ORB_table_phi::init_Table( const int &job0 )
 			
 			for (int L1 = 0; L1 < Lmax1 + 1; L1++)
 			{
-				for (int N1 = 0; N1 < ORB.Phi[T1].getNchi(L1); N1++)
+				for (int N1 = 0; N1 < orb.Phi[T1].getNchi(L1); N1++)
 				{
 					for (int L2 = 0; L2 < Lmax2 + 1; L2 ++)
 					{
-						for (int N2 = 0; N2 < ORB.Phi[T2].getNchi(L2); N2++)
+						for (int N2 = 0; N2 < orb.Phi[T2].getNchi(L2); N2++)
 						{		
 							// get the second index.
 							const int Opair = this->OV_Opair(Tpair,L1,L2,N1,N2);
@@ -582,8 +584,8 @@ void ORB_table_phi::init_Table( const int &job0 )
 									case 1:
 									{
 										this->cal_ST_Phi12_R(1,L, 
-												ORB.Phi[T1].PhiLN(L1,N1),
-												ORB.Phi[T2].PhiLN(L2,N2),
+												orb.Phi[T1].PhiLN(L1,N1),
+												orb.Phi[T2].PhiLN(L2,N2),
 												rmesh,
 												Table_SR[0][Tpair][Opair][L],
 												Table_SR[1][Tpair][Opair][L]);
@@ -593,8 +595,8 @@ void ORB_table_phi::init_Table( const int &job0 )
 									{
 
 										this->cal_ST_Phi12_R(2,L, 
-												ORB.Phi[T1].PhiLN(L1,N1),
-												ORB.Phi[T2].PhiLN(L2,N2),
+												orb.Phi[T1].PhiLN(L1,N1),
+												orb.Phi[T2].PhiLN(L2,N2),
 												rmesh,
 												Table_TR[0][Tpair][Opair][L],
 												Table_TR[1][Tpair][Opair][L]);
@@ -603,15 +605,15 @@ void ORB_table_phi::init_Table( const int &job0 )
 									case 3:
 									{	
 										this->cal_ST_Phi12_R(1,L, 
-												ORB.Phi[T1].PhiLN(L1,N1),
-												ORB.Phi[T2].PhiLN(L2,N2),
+												orb.Phi[T1].PhiLN(L1,N1),
+												orb.Phi[T2].PhiLN(L2,N2),
 												rmesh,
 												Table_SR[0][Tpair][Opair][L],
 												Table_SR[1][Tpair][Opair][L]);
 
 										this->cal_ST_Phi12_R(2,L, 
-												ORB.Phi[T1].PhiLN(L1,N1),
-												ORB.Phi[T2].PhiLN(L2,N2),
+												orb.Phi[T1].PhiLN(L1,N1),
+												orb.Phi[T2].PhiLN(L2,N2),
 												rmesh,
 												Table_TR[0][Tpair][Opair][L],
 												Table_TR[1][Tpair][Opair][L]);
@@ -647,11 +649,11 @@ void ORB_table_phi::init_Table( const int &job0 )
 }
 
 
-void ORB_table_phi::Destroy_Table(void)
+void ORB_table_phi::Destroy_Table(LCAO_Orbitals &orb)
 {
 	if(!destroy_sr && !destroy_tr) return;
 	
-	const int ntype = ORB.get_ntype();
+	const int ntype = orb.get_ntype();
 	int dim1 = 0;
 	for (int ir = 0; ir < 2; ir++)
 	{
@@ -661,10 +663,10 @@ void ORB_table_phi::Destroy_Table(void)
 			// means that T2 >= T1
     	    for (int T2 = T1; T2 < ntype; T2++)
         	{
-				const int Lmax1 = ORB.Phi[T1].getLmax();
-				const int Lmax2 = ORB.Phi[T2].getLmax();
+				const int Lmax1 = orb.Phi[T1].getLmax();
+				const int Lmax2 = orb.Phi[T2].getLmax();
 				const int lmax_now = std::max(Lmax1, Lmax2);
-				const int pairs = ORB.Phi[T1].getTotal_nchi() * ORB.Phi[T2].getTotal_nchi();
+				const int pairs = orb.Phi[T1].getTotal_nchi() * orb.Phi[T2].getTotal_nchi();
 				
 				for (int dim2 = 0; dim2 < pairs; dim2++)
 				{
@@ -696,7 +698,7 @@ void ORB_table_phi::Destroy_Table(void)
 
 
 
-void ORB_table_phi::init_OV_Tpair(void)
+void ORB_table_phi::init_OV_Tpair(LCAO_Orbitals &orb)
 {
 	TITLE("ORB_table_phi","init_OV_Tpair");
     assert(ntype>0);
@@ -720,7 +722,7 @@ void ORB_table_phi::init_OV_Tpair(void)
             
 			++index;
 			// (2) pairs about lmax
-			this->OV_L2plus1(T1,T2) = max(ORB.Phi[T1].getLmax(), ORB.Phi[T2].getLmax() )*2+1;
+			this->OV_L2plus1(T1,T2) = max(orb.Phi[T1].getLmax(), orb.Phi[T2].getLmax() )*2+1;
 			this->OV_L2plus1(T2,T1) = this->OV_L2plus1(T1,T2);
         }
     }
@@ -729,10 +731,10 @@ void ORB_table_phi::init_OV_Tpair(void)
 
 
 
-void ORB_table_phi::init_OV_Opair(void)
+void ORB_table_phi::init_OV_Opair(LCAO_Orbitals &orb)
 {
-    const int lmax = ORB.get_lmax(); 
-    const int nchimax = ORB.get_nchimax();
+    const int lmax = orb.get_lmax(); 
+    const int nchimax = orb.get_nchimax();
 	assert(lmax+1 > 0);
 	assert(nchimax > 0);
 	assert(OV_nTpairs > 0);
@@ -747,27 +749,31 @@ void ORB_table_phi::init_OV_Opair(void)
         {
 			const int dim1 = this->OV_Tpair(T1,T2);
 			int index=0;
-            for(int L1=0; L1<ORB.Phi[T1].getLmax()+1; L1++)
+            for(int L1=0; L1<orb.Phi[T1].getLmax()+1; L1++)
             {
-                for(int N1=0; N1<ORB.Phi[T1].getNchi(L1); N1++)
+                for(int N1=0; N1<orb.Phi[T1].getNchi(L1); N1++)
                 {
-                    for(int L2=0; L2<ORB.Phi[T2].getLmax()+1; L2++)
+                    for(int L2=0; L2<orb.Phi[T2].getLmax()+1; L2++)
                     {
-                        for(int N2=0; N2<ORB.Phi[T2].getNchi(L2); N2++)
+                        for(int N2=0; N2<orb.Phi[T2].getNchi(L2); N2++)
                         {
                             this->OV_Opair(dim1, L1, L2, N1, N2) = index;
                             ++index;
-                        }
-                    }
-                }
-            }
-        }
-    }
+                        }// N2
+                    }// L2
+                }// N1
+            }// L1
+        }// T2
+    }// T1
     return;
 }
 
 // Peize Lin update 2016-01-26
-void ORB_table_phi::init_Lmax (const int orb_num, const int mode, int &Lmax_used, int &Lmax) const
+void ORB_table_phi::init_Lmax (
+	const int orb_num, 
+	const int mode, 
+	int &Lmax_used, 
+	int &Lmax) const
 {
 	auto cal_Lmax_Phi = [](int &Lmax)
 	{
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_phi.h b/ABACUS.develop/source/src_lcao/ORB_table_phi.h
index 9942123bfd..e254484d9d 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_phi.h
+++ b/ABACUS.develop/source/src_lcao/ORB_table_phi.h
@@ -1,10 +1,11 @@
 #ifndef ORB_TABLE_PHI_H 
 #define ORB_TABLE_PHI_H 
 
-#include "src_pw/tools.h"
-#include "ORB_atomic.h"
+//#include "src_pw/tools.h"
+//#include "ORB_gaunt_table.h"
+//#include "ORB_atomic.h"
+#include "ORB_read.h"
 #include "ORB_atomic_lm.h"
-#include "ORB_gaunt_table.h"
 #include "center2_orb.h"
 #include "../src_global/sph_bessel_recursive.h"
 #include <set>
@@ -24,8 +25,11 @@ class ORB_table_phi
 		const double &dR_in,
 		const double &dk_in);
 
-	void init_Table(const int &job);
-	void Destroy_Table(void);
+	void init_Table(
+		const int &job,
+		LCAO_Orbitals &orb);
+
+	void Destroy_Table(LCAO_Orbitals &orb);
 
 	// Five dimension:
 	// (1) 0: normal (S(R)) ; 1: derivative( dS/dR )
@@ -40,13 +44,22 @@ class ORB_table_phi
 	bool destroy_tr;
 	
 	//=================================================
-	//make table of Spherical bessel
-	//Sph_Bes : jlx[kmesh][Rmesh][L]
-	//L should be 2*Lmax, which is max L of all type
+	// make table of Spherical bessel
+	// Sph_Bes : jlx[kmesh][Rmesh][L]
+	// L should be 2*Lmax, which is max L of all type
 	//=================================================
 	// Peize Lin update 2016-01-26
-	void init_Lmax (const int orb_num, const int mode, int &Lmax_used, int &Lmax) const;
-	void init_Table_Spherical_Bessel (const int orb_num, const int mode, int &Lmax_used, int &Lmax);
+	void init_Lmax(
+		const int orb_num, 
+		const int mode, 
+		int &Lmax_used, 
+		int &Lmax) const;
+
+	void init_Table_Spherical_Bessel(
+		const int orb_num, 
+		const int mode, 
+		int &Lmax_used, 
+		int &Lmax);
 
 	// Peize Lin add 2017-04-24, and change all jlx in this class
 	Sph_Bessel_Recursive::D2* pSB = nullptr;
@@ -61,8 +74,10 @@ class ORB_table_phi
 	// T stands for atom type.
 	// O stands for orbitals.
 	//-------------------------
-    void init_OV_Tpair(void);
-    void init_OV_Opair(void);
+
+    void init_OV_Tpair(LCAO_Orbitals &orb);
+    void init_OV_Opair(LCAO_Orbitals &orb);
+
 	int OV_nTpairs;
     IntArray OV_Tpair;
     IntArray OV_Opair;

From 8b6d0f1c4600ed5cd5f52b7721fb2cfbbc373d9d Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Sat, 3 Apr 2021 17:20:14 +0800
Subject: [PATCH 20/60] add math_integral class

---
 ABACUS.develop/source/Makefile.Objects        |   1 +
 ABACUS.develop/source/src_global/integral.cpp |  14 +-
 ABACUS.develop/source/src_global/integral.h   |  11 +-
 .../source/src_global/math_integral.cpp       | 220 ++++++++++++++++++
 .../source/src_global/math_integral.h         |  51 ++++
 ABACUS.develop/source/src_global/poission.cpp |   8 +-
 .../source/src_lcao/ORB_atomic_lm.cpp         |  16 +-
 .../source/src_lcao/ORB_table_phi.cpp         |   2 +-
 .../source/src_lcao/ORB_table_phi.h           |   3 -
 9 files changed, 294 insertions(+), 32 deletions(-)
 create mode 100644 ABACUS.develop/source/src_global/math_integral.cpp
 create mode 100644 ABACUS.develop/source/src_global/math_integral.h

diff --git a/ABACUS.develop/source/Makefile.Objects b/ABACUS.develop/source/Makefile.Objects
index bed1c5618c..e490523d6a 100644
--- a/ABACUS.develop/source/Makefile.Objects
+++ b/ABACUS.develop/source/Makefile.Objects
@@ -81,6 +81,7 @@ memory.o\
 print_info.o\
 mathzone.o\
 mathzone_add1.o\
+math_integral.o\
 integral.o \
 poission.o \
 polint.o \
diff --git a/ABACUS.develop/source/src_global/integral.cpp b/ABACUS.develop/source/src_global/integral.cpp
index ea7430dccb..6638d256e0 100644
--- a/ABACUS.develop/source/src_global/integral.cpp
+++ b/ABACUS.develop/source/src_global/integral.cpp
@@ -5,12 +5,12 @@
 #include <cmath>
 using namespace std;
 
-int Integral::n_root = 512;
-bool Integral::calc_wx = false;
-double* Integral::gauleg_w;
-double* Integral::gauleg_x;
+int Integral_G::n_root = 512;
+bool Integral_G::calc_wx = false;
+double* Integral_G::gauleg_w;
+double* Integral_G::gauleg_x;
 
-double Integral::Gauss_Legendre
+double Integral_G::Gauss_Legendre
 (
 	const double &a,
 	const double &b,
@@ -27,7 +27,7 @@ double Integral::Gauss_Legendre
 	
 	if(!calc_wx) 
 	{
-		Integral::gauleg();
+		Integral_G::gauleg();
 		calc_wx = true;
 	}
 
@@ -55,7 +55,7 @@ double Integral::Gauss_Legendre
 	return sum * dab / 2.0;
 }
 
-void Integral::gauleg()
+void Integral_G::gauleg(void)
 {
 	int m, j, i;
 	double z1,z,xm,xl,pp,p3,p2,p1;
diff --git a/ABACUS.develop/source/src_global/integral.h b/ABACUS.develop/source/src_global/integral.h
index 7f1f3fde7e..f26ae67b59 100644
--- a/ABACUS.develop/source/src_global/integral.h
+++ b/ABACUS.develop/source/src_global/integral.h
@@ -1,11 +1,12 @@
-#ifndef INTEGRAL_H
-#define INTEGRAL_H
+#ifndef INTEGRAL_G_H
+#define INTEGRAL_G_H
 
-class Integral
+class Integral_G
 {
 	public:
-	Integral();
-	~Integral();
+
+	Integral_G();
+	~Integral_G();
 
 	static double Gauss_Legendre
 	(
diff --git a/ABACUS.develop/source/src_global/math_integral.cpp b/ABACUS.develop/source/src_global/math_integral.cpp
new file mode 100644
index 0000000000..ed6a4bdd9c
--- /dev/null
+++ b/ABACUS.develop/source/src_global/math_integral.cpp
@@ -0,0 +1,220 @@
+#include "math_integral.h"
+#include <stddef.h> // use size_t
+#include <cassert>
+
+Integral::Integral(){}
+
+Integral::~Integral(){}
+
+
+// Peize Lin accelerate 2017-10-02
+/*
+void Integral::Simpson_Integral
+(
+    const int mesh,
+    const double *func,
+    const double *rab,
+    double &asum
+)
+{
+    //     simpson's rule integration. On input:
+    //     mesh = the number of grid points (should be odd)
+    //     func(i)= function to be integrated
+    //     rab(i) = r(i) * dr(i)/di * di
+    //     For the logarithmic grid not including r=0 :
+    //     r(i) = r_0*exp((i-1)*dx) ==> rab(i)=r(i)*dx
+    //     For the logarithmic grid including r=0 :
+    //     r(i) = a(exp((i-1)*dx)-1) ==> rab(i)=(r(i)+a)*dx
+    //     Output in asum = \sum_i c_i f(i)*rab(i) = \int_0^\infty f(r) dr
+    //     where c_i are alternately 4/3, 2/3 except c_1 = c_mesh = 1/3
+    
+    //  simpson's rule integrator for function stored on the
+    //  radial logarithmic mesh
+    //	routine assumes that mesh is an odd number so run check
+    if (mesh % 2 == 0)
+    {
+        cout << "\n error in subroutine simpson ";
+        cout << "\n routine assumes mesh is odd but mesh = "
+             << mesh << endl;
+        return;
+    }
+
+    asum = 0.00;
+    const double r12 = 1.00 / 12.00;
+    double f3 = func [0] * rab [0] * r12;
+    for (int i = 1;i < mesh;i += 2)
+    {
+        const double f1 = f3;
+        const double f2 = func [i] * rab [i] * r12;
+        f3 = func [i + 1] * rab [i + 1] * r12;
+        asum += 4.00 * f1 + 16.00 * f2 + 4.00 * f3;
+    }
+    return;
+}// end subroutine simpson
+*/
+
+
+// Peize Lin accelerate 2017-10-02
+void Integral::Simpson_Integral
+(
+    const int mesh,
+    const double *func,
+    const double *rab,
+    double &asum
+)
+{
+    /*     simpson's rule integration. On input:
+    !      mesh = the number of grid points (should be odd)
+    !      func(i)= function to be integrated
+    !      rab(i) = r(i) * dr(i)/di * di
+    !      For the logarithmic grid not including r=0 :
+    !      r(i) = r_0*exp((i-1)*dx) ==> rab(i)=r(i)*dx
+    !      For the logarithmic grid including r=0 :
+    !      r(i) = a(exp((i-1)*dx)-1) ==> rab(i)=(r(i)+a)*dx
+    !      Output in asum = \sum_i c_i f(i)*rab(i) = \int_0^\infty f(r) dr
+    !      where c_i are alternately 4/3, 2/3 except c_1 = c_mesh = 1/3
+    */
+    //  simpson's rule integrator for function stored on the
+    //  radial logarithmic mesh
+    //	routine assumes that mesh is an odd number so run check
+    assert(mesh&1);
+    asum = 0.00;
+    // fewer than 3 points: no Simpson interval, and mesh-2 would wrap as size_t
+    if(mesh<3) return;
+	const size_t end = mesh-2;
+    for( size_t i=1; i!=end; i+=2 )
+    {
+		const double f1 = func[i]*rab[i];
+		asum += f1 + f1 + func[i+1]*rab[i+1];
+    }
+	const double f1 = func[mesh-2]*rab[mesh-2];
+	asum += f1+f1;
+	asum += asum; // interior weights are now 4 (odd i) and 2 (even i)
+	asum += func[0]*rab[0] + func[mesh-1]*rab[mesh-1];
+	asum /= 3.0;
+}// end subroutine simpson
+
+
+// Peize Lin accelerate 2017-10-02
+void Integral::Simpson_Integral
+(
+    const int mesh,
+    const double *func,
+    const double dr,
+    double &asum
+)
+{
+    /*     simpson's rule integration on a uniform grid. On input:
+    !      mesh = the number of grid points (should be odd)
+    !      func(i)= function to be integrated
+    !      dr = constant spacing between neighbouring grid points
+    !      (unlike the rab overload, no per-point weight is applied)
+    !      Output in asum = dr * \sum_i c_i f(i)
+    !      where c_i are alternately 4/3, 2/3 except c_1 = c_mesh = 1/3
+    */
+    //  simpson's rule integrator for function sampled with the
+    //  constant step dr
+    //	routine assumes that mesh is an odd number so run check
+    assert(mesh&1);
+
+    asum = 0.00;
+    // fewer than 3 points: no Simpson interval, and mesh-2 would wrap as size_t
+    if(mesh<3) return;
+
+    // interior points: accumulate 2*f(odd i) + f(even i), doubled further down
+	const size_t end = mesh-2;
+    for(size_t i=1; i!=end; i+=2 )
+    {
+		const double f1 = func[i];
+		asum += f1 + f1 + func[i+1];
+    }
+	const double f1 = func[mesh-2];
+	asum += f1+f1;
+	asum += asum; // interior weights are now 4 (odd i) and 2 (even i)
+	asum += func[0] + func[mesh-1];
+	asum *= dr/3.0;
+}// end subroutine simpson
+
+
+// Peize Lin add 2016-02-14
+void Integral::Simpson_Integral_0toall
+(
+    const int mesh,
+    const double *func,
+    const double *rab,
+    double *asum
+)
+{
+    // asum[i] = \int_{r'=0}^{r_i} dr' f(r'), with rab as the integration weight:
+    // odd i extends asum[i-1] by a trapezoid step, even i by a Simpson step.
+    if(mesh<=0) return; // nothing to integrate; avoids touching func[0]/asum[0]
+    const double r2=1.00/2.00, r3=1.00/3.00;
+    asum[0] = 0.00;
+    double f3 = func [0] * rab [0];
+    for( int i=1; i<mesh; i+=2)
+    {
+        const double f1 = f3;
+        const double f2 = func[i] * rab[i] ;
+        asum[i] = asum[i-1] + r2*( f1 + f2);
+        if(i+1<mesh) // for even mesh the last odd i has no i+1 entry to read
+        {
+            f3 = func[i+1] * rab[i+1] ;
+            asum[i+1] = asum[i-1] + r3*( f1 + 4.00*f2 + f3 );
+        }
+    }
+}
+
+
+// Peize Lin add 2016-02-14
+// faster but still has a bug
+/*void Integral::Simpson_Integral_alltoinf
+(
+    const int mesh,
+    const double *func,
+    const double *rab,
+    double *asum
+)
+{
+    // asum(r) = \int_{r'=r}^{+\infty} dr' f(r') 
+    //         = \int_{r'=r}^{mesh} dr' f(r')
+
+    const double r2=1.00/2.00, r3=1.00/3.00;
+    asum[mesh-1] = 0.00;
+    const int odd_mesh = (mesh-1)^~1;
+    double f1 = func[odd_mesh] * rab[odd_mesh];
+    for( size_t i=(mesh-3)|1; i>0; i-=2)
+    {
+        const double f3 = f1;   
+        if( i+3==mesh )
+        {
+            const double f4 = func[mesh-1] * rab[mesh-1];
+            asum[mesh-2] = r2*(f3 + f4);
+        }
+        const double f2 = func[i] * rab[i] ;
+        f1 = func[i-1] * rab[i-1] ;
+        asum[i-1] = asum[i+1] + r3*( f1 + 4.00*f2 + f3 );
+        asum[i] = asum[i-1] - r2*( f1 + f2);
+    }
+    return;
+}*/
+
+
+// Peize Lin add 2016-06-11
+// a little slower (presumably: than the disabled variant above)
+void Integral::Simpson_Integral_alltoinf
+(
+    const int mesh,
+    const double *func,
+    const double *rab,
+    double *asum
+)
+{
+    if(mesh<=0) return; // no grid points: asum[mesh-1] below would be out of range
+    Integral::Simpson_Integral_0toall( mesh, func, rab, asum );
+    // asum[i] now holds the integral from 0 to point i; subtract it from the total
+    const double asum_all = asum[mesh-1];
+    for (int i = 0;i < mesh; ++i)
+	{
+        asum[i] = asum_all - asum[i];
+	}
+}
diff --git a/ABACUS.develop/source/src_global/math_integral.h b/ABACUS.develop/source/src_global/math_integral.h
new file mode 100644
index 0000000000..6dfd206f57
--- /dev/null
+++ b/ABACUS.develop/source/src_global/math_integral.h
@@ -0,0 +1,51 @@
+#ifndef MATH_INTEGRAL_H
+#define MATH_INTEGRAL_H
+
+// mohan add 2021-04-03
+
+class Integral
+{
+	// Simpson-rule integration helpers for tabulated functions; all methods are static.
+	public:
+
+    Integral();
+    ~Integral();
+
+	// Peize Lin accelerate 2017-10-02; asum = Simpson sum of func[i]*rab[i]
+    static void Simpson_Integral
+    (
+        const int mesh, // number of grid points, asserted to be odd
+        const double *func, // function values on the grid
+        const double *rab, // per-point integration weight
+        double &asum // output: the integral
+    );
+
+	// Peize Lin accelerate 2017-10-02; asum = dr * Simpson sum of func[i]
+	static void Simpson_Integral
+	(
+		const int mesh, // number of grid points, asserted to be odd
+		const double *func, // function values on the grid
+		const double dr, // scale applied to the Simpson sum (multiplied in as dr/3)
+		double &asum // output: the integral
+	);
+
+    // Peize Lin add 2016-02-14; asum[i] = integral from the first point up to point i
+    static void Simpson_Integral_0toall
+    (
+        const int mesh, // number of grid points
+        const double *func, // function values on the grid
+        const double *rab, // per-point integration weight
+        double *asum // output array, written for indices 0..mesh-1
+    );
+
+    // Peize Lin add 2016-02-14; asum[i] = integral from point i up to the last point
+    static void Simpson_Integral_alltoinf
+    (
+        const int mesh, // number of grid points
+        const double *func, // function values on the grid
+        const double *rab, // per-point integration weight
+        double *asum // output array, written for indices 0..mesh-1
+    );     
+
+};
+#endif
diff --git a/ABACUS.develop/source/src_global/poission.cpp b/ABACUS.develop/source/src_global/poission.cpp
index 378877899e..ac78f9b579 100644
--- a/ABACUS.develop/source/src_global/poission.cpp
+++ b/ABACUS.develop/source/src_global/poission.cpp
@@ -32,11 +32,11 @@ void Poission::SolPoissonEq
     //value at the beginning
     a = r[0];
     b = r[mesh-1];
-    pot[0] = Integral::Gauss_Legendre(a, b, rad_f2, r, mesh) * 4.0 * PI * e2;
+    pot[0] = Integral_G::Gauss_Legendre(a, b, rad_f2, r, mesh) * 4.0 * PI * e2;
 
     //value at the end
     assert(r[mesh-1] > tiny);
-    pot[mesh-1] = Integral::Gauss_Legendre(a, b, rad_f1, r, mesh) * 4.0 * PI / r[mesh-1] * e2;
+    pot[mesh-1] = Integral_G::Gauss_Legendre(a, b, rad_f1, r, mesh) * 4.0 * PI / r[mesh-1] * e2;
 	
 	//points in the interval
     for(int ir = 1; ir < mesh-1; ir++)
@@ -46,10 +46,10 @@ void Poission::SolPoissonEq
         c = r[mesh-1];
 
         //integrate inside
-        const double inside = Integral::Gauss_Legendre(a, b, rad_f1, r, mesh) / r[ir];
+        const double inside = Integral_G::Gauss_Legendre(a, b, rad_f1, r, mesh) / r[ir];
 
         //integrate outside
-        const double outside = Integral::Gauss_Legendre(b, c, rad_f2, r, mesh);
+        const double outside = Integral_G::Gauss_Legendre(b, c, rad_f2, r, mesh);
 
         //inside + outside
         pot[ir] = (inside + outside) * 4.0 * PI * e2;
diff --git a/ABACUS.develop/source/src_lcao/ORB_atomic_lm.cpp b/ABACUS.develop/source/src_lcao/ORB_atomic_lm.cpp
index 7a6d724c19..c2cfde12a9 100644
--- a/ABACUS.develop/source/src_lcao/ORB_atomic_lm.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_atomic_lm.cpp
@@ -1,10 +1,7 @@
-//=========================================================
-//AUTHOR : liaochen, mohan
-//DATE : 2008-11-12
-//=========================================================
 #include "ORB_atomic_lm.h"
-#include "src_global/sph_bessel_recursive.h"
-#include "src_global/lapack_connector.h"
+#include "../src_global/sph_bessel_recursive.h"
+#include "../src_global/lapack_connector.h"
+#include "../src_global/timer.h"
 #include <omp.h>
 
 Numerical_Orbital_Lm::Numerical_Orbital_Lm()
@@ -27,12 +24,7 @@ Numerical_Orbital_Lm::Numerical_Orbital_Lm()
 }
 
 Numerical_Orbital_Lm::~Numerical_Orbital_Lm()
-{
-	if(test_deconstructor)
-	{
-		cout << " ~Numerical_Orbital_Lm()" << endl;
-	}
-}
+{}
 
 void Numerical_Orbital_Lm::set_orbital_info
 (
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_phi.cpp b/ABACUS.develop/source/src_lcao/ORB_table_phi.cpp
index a71d3c8d03..aff7401840 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_phi.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_table_phi.cpp
@@ -1,6 +1,6 @@
+#include <stdexcept>
 #include "ORB_table_phi.h"
 #include "ORB_read.h"
-#include <stdexcept>
 #include "../src_ri/exx_abfs.h"
 
 double ORB_table_phi::dr = -1.0;
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_phi.h b/ABACUS.develop/source/src_lcao/ORB_table_phi.h
index e254484d9d..acc68853d0 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_phi.h
+++ b/ABACUS.develop/source/src_lcao/ORB_table_phi.h
@@ -1,9 +1,6 @@
 #ifndef ORB_TABLE_PHI_H 
 #define ORB_TABLE_PHI_H 
 
-//#include "src_pw/tools.h"
-//#include "ORB_gaunt_table.h"
-//#include "ORB_atomic.h"
 #include "ORB_read.h"
 #include "ORB_atomic_lm.h"
 #include "center2_orb.h"

From df89607c9ba9fd0473e1079f4d090606395c63c8 Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Sat, 3 Apr 2021 17:45:58 +0800
Subject: [PATCH 21/60] delete Simpson_Integral functions in Mathzone, use the
 ones in src_global/math_integral.h instead

---
 .../source/src_external/ORB_api/Makefile      |   2 +-
 .../src_external/ORB_api/Makefile.Objects     |   9 +-
 .../src_external/ORB_api/Makefile.system      |   2 +-
 .../source/src_global/gram_schmidt_orth-inl.h |   7 +-
 ABACUS.develop/source/src_global/mathzone.cpp | 202 ------------------
 ABACUS.develop/source/src_global/mathzone.h   |  35 ---
 ABACUS.develop/source/src_io/bessel_basis.cpp |   3 +-
 ABACUS.develop/source/src_io/to_wannier90.cpp |   6 +-
 .../source/src_lcao/ORB_atomic_lm.cpp         |   9 +-
 .../source/src_lcao/ORB_nonlocal_lm.cpp       |   3 +-
 ABACUS.develop/source/src_lcao/ORB_read.cpp   |  11 +-
 .../source/src_lcao/ORB_table_alpha.cpp       |   9 +-
 .../source/src_lcao/ORB_table_beta.cpp        |   9 +-
 .../source/src_lcao/ORB_table_phi.cpp         |  32 ++-
 ABACUS.develop/source/src_pw/charge.cpp       |  11 +-
 ABACUS.develop/source/src_pw/forces.cpp       |   3 +-
 .../source/src_pw/pseudopot_cell_vl.cpp       |   7 +-
 .../source/src_pw/pseudopot_cell_vnl.cpp      |   5 +-
 .../source/src_pw/stress_func_cc.cpp          |   3 +-
 .../source/src_pw/stress_func_loc.cpp         |   5 +-
 .../source/src_pw/wavefunc_in_pw.cpp          |   5 +-
 ABACUS.develop/source/src_pw/wf_atomic.cpp    |   7 +-
 .../source/src_ri/conv_coulomb_pot.cpp        |   5 +-
 ABACUS.develop/source/src_ri/exx_abfs-io.cpp  |   5 +-
 24 files changed, 84 insertions(+), 311 deletions(-)

diff --git a/ABACUS.develop/source/src_external/ORB_api/Makefile b/ABACUS.develop/source/src_external/ORB_api/Makefile
index fc57d4f986..010843ff16 100644
--- a/ABACUS.develop/source/src_external/ORB_api/Makefile
+++ b/ABACUS.develop/source/src_external/ORB_api/Makefile
@@ -25,7 +25,7 @@ OPTS_GDB = -g -W
 #main.o\
 
 FP_OBJS_0=main.o\
-$(OBJS_ORB)\
+$(OBJS_TRY)\
 
 FP_OBJS=$(patsubst %.o, ${OBJ_DIR}/%.o, ${FP_OBJS_0})
 PDIAG_OBJS=$(patsubst %.o, ${OBJ_DIR}/%.o, ${OBJS_PDIAG})
diff --git a/ABACUS.develop/source/src_external/ORB_api/Makefile.Objects b/ABACUS.develop/source/src_external/ORB_api/Makefile.Objects
index 5cc70d29c3..7aed1ec43c 100644
--- a/ABACUS.develop/source/src_external/ORB_api/Makefile.Objects
+++ b/ABACUS.develop/source/src_external/ORB_api/Makefile.Objects
@@ -12,7 +12,7 @@
 VERSION= ABACUS-ORB
 HEADERS= *.h
 
-OBJS_ORB=ORB_read.o\
+OBJS_TRY=math_integral.o\
 
 OBJS_ORBITAL=ORB_control.o\
 ORB_read.o\
@@ -26,6 +26,7 @@ ORB_table_phi.o\
 ORB_table_alpha.o\
 ORB_gen_tables.o\
 
-OBJS_GLOBAL=#sph_bessel.o\
-#sph_bessel_recursive-d1.o\
-#sph_bessel_recursive-d2.o\
+OBJS_GLOBAL=sph_bessel.o\
+sph_bessel_recursive-d1.o\
+sph_bessel_recursive-d2.o\
+timer.o\
diff --git a/ABACUS.develop/source/src_external/ORB_api/Makefile.system b/ABACUS.develop/source/src_external/ORB_api/Makefile.system
index ac93188d21..c9fa3891cf 100644
--- a/ABACUS.develop/source/src_external/ORB_api/Makefile.system
+++ b/ABACUS.develop/source/src_external/ORB_api/Makefile.system
@@ -11,4 +11,4 @@ INCLUDES = -I. -Icommands
 # OPTIMIZE OPTIONS
 #==========================
 OPTS     = ${INCLUDES} -Ofast -std=c++11 -simd -march=native -m64 -Werror -Wall -pedantic -g
-OPTS_MPI = -cxx=${CPLUSPLUS}
+#OPTS_MPI = -cxx=${CPLUSPLUS}
diff --git a/ABACUS.develop/source/src_global/gram_schmidt_orth-inl.h b/ABACUS.develop/source/src_global/gram_schmidt_orth-inl.h
index 1358260fcd..dd985feffd 100644
--- a/ABACUS.develop/source/src_global/gram_schmidt_orth-inl.h
+++ b/ABACUS.develop/source/src_global/gram_schmidt_orth-inl.h
@@ -10,6 +10,7 @@
 
 #include "mathzone.h"
 #include "lapack_connector.h"
+#include "math_integral.h" // mohan add 2021-04-03
 
 template<typename Func_Type, typename R_Type>
 Gram_Schmidt_Orth<Func_Type,R_Type>::Gram_Schmidt_Orth( const vector<R_Type> &rab_in, const Coordinate &coordinate_in )
@@ -75,13 +76,13 @@ Func_Type Gram_Schmidt_Orth<Func_Type,R_Type>::cal_norm( const vector<Func_Type>
 	{
 		case Coordinate::Cartesian:
 		{
-			Mathzone::Simpson_Integral( f.size(), VECTOR_TO_PTR(f), VECTOR_TO_PTR(rab), norm);		
+			Integral::Simpson_Integral( f.size(), VECTOR_TO_PTR(f), VECTOR_TO_PTR(rab), norm);		
 			break;
 		}
 		case Coordinate::Sphere:	
 		{
 			const vector<Func_Type> &&tmp_func = Mathzone::Pointwise_Product( f, radial_2 );
-			Mathzone::Simpson_Integral( f.size(), VECTOR_TO_PTR(tmp_func), VECTOR_TO_PTR(rab), norm);	
+			Integral::Simpson_Integral( f.size(), VECTOR_TO_PTR(tmp_func), VECTOR_TO_PTR(rab), norm);	
 			break;
 		}
 		default:
@@ -93,4 +94,4 @@ Func_Type Gram_Schmidt_Orth<Func_Type,R_Type>::cal_norm( const vector<Func_Type>
 	return norm;
 }
 
-#endif	// GRAM_SCHMIDT_ORTH_INL_H
\ No newline at end of file
+#endif	// GRAM_SCHMIDT_ORTH_INL_H
diff --git a/ABACUS.develop/source/src_global/mathzone.cpp b/ABACUS.develop/source/src_global/mathzone.cpp
index 7a8f3176b1..5acca148be 100644
--- a/ABACUS.develop/source/src_global/mathzone.cpp
+++ b/ABACUS.develop/source/src_global/mathzone.cpp
@@ -1391,208 +1391,6 @@ int Mathzone::Semi_Fact(const int n)
     return semif;
 }
 
-// Peize Lin accelerate 2017-10-02
-/*
-void Mathzone::Simpson_Integral
-(
-    const int mesh,
-    const double *func,
-    const double *rab,
-    double &asum
-)
-{
-    //     simpson's rule integration. On input:
-    //     mesh = mhe number of grid points (should be odd)
-    //     func(i)= function to be integrated
-    //     rab(i) = r(i) * dr(i)/di * di
-    //     For the logarithmic grid not including r=0 :
-    //     r(i) = r_0*exp((i-1)*dx) ==> rab(i)=r(i)*dx
-    //     For the logarithmic grid including r=0 :
-    //     r(i) = a(exp((i-1)*dx)-1) ==> rab(i)=(r(i)+a)*dx
-    //     Output in asum = \sum_i c_i f(i)*rab(i) = \int_0^\infty f(r) dr
-    //     where c_i are alternativaly 2/3, 4/3 except c_1 = c_mesh = 1/3
-    
-    //  simpson's rule integrator for function stored on the
-    //  radial logarithmic mesh
-    //	routine assumes that mesh is an odd number so run check
-    if (mesh % 2 == 0)
-    {
-        cout << "\n error in subroutine simpson ";
-        cout << "\n routine assumes mesh is odd but mesh = "
-             << mesh << endl;
-        return;
-    }
-
-    asum = 0.00;
-    const double r12 = 1.00 / 12.00;
-    double f3 = func [0] * rab [0] * r12;
-    for (int i = 1;i < mesh;i += 2)
-    {
-        const double f1 = f3;
-        const double f2 = func [i] * rab [i] * r12;
-        f3 = func [i + 1] * rab [i + 1] * r12;
-        asum += 4.00 * f1 + 16.00 * f2 + 4.00 * f3;
-    }
-    return;
-}// end subroutine simpson
-*/
-
-// Peize Lin accelerate 2017-10-02
-void Mathzone::Simpson_Integral
-(
-    const int mesh,
-    const double *func,
-    const double *rab,
-    double &asum
-)
-{
-    /*     simpson's rule integration. On input:
-    !      mesh = mhe number of grid points (should be odd)
-    !      func(i)= function to be integrated
-    !      rab(i) = r(i) * dr(i)/di * di
-    !      For the logarithmic grid not including r=0 :
-    !      r(i) = r_0*exp((i-1)*dx) ==> rab(i)=r(i)*dx
-    !      For the logarithmic grid including r=0 :
-    !      r(i) = a(exp((i-1)*dx)-1) ==> rab(i)=(r(i)+a)*dx
-    !      Output in asum = \sum_i c_i f(i)*rab(i) = \int_0^\infty f(r) dr
-    !      where c_i are alternativaly 2/3, 4/3 except c_1 = c_mesh = 1/3
-    */
-    //  simpson's rule integrator for function stored on the
-    //  radial logarithmic mesh
-    //	routine assumes that mesh is an odd number so run check
-    assert(mesh&1);
-
-    asum = 0.00;
-	const size_t end = mesh-2;
-    for( size_t i=1; i!=end; i+=2 )
-    {
-		const double f1 = func[i]*rab[i];
-		asum += f1 + f1 + func[i+1]*rab[i+1];
-    }
-	const double f1 = func[mesh-2]*rab[mesh-2];
-	asum += f1+f1;
-	asum += asum;
-	asum += func[0]*rab[0] + func[mesh-1]*rab[mesh-1];
-	asum /= 3.0;
-    return;
-}// end subroutine simpson
-
-// Peize Lin accelerate 2017-10-02
-void Mathzone::Simpson_Integral
-(
-    const int mesh,
-    const double *func,
-    const double dr,
-    double &asum
-)
-{
-    /*     simpson's rule integration. On input:
-    !      mesh = mhe number of grid points (should be odd)
-    !      func(i)= function to be integrated
-    !      rab(i) = r(i) * dr(i)/di * di
-    !      For the logarithmic grid not including r=0 :
-    !      r(i) = r_0*exp((i-1)*dx) ==> rab(i)=r(i)*dx
-    !      For the logarithmic grid including r=0 :
-    !      r(i) = a(exp((i-1)*dx)-1) ==> rab(i)=(r(i)+a)*dx
-    !      Output in asum = \sum_i c_i f(i)*rab(i) = \int_0^\infty f(r) dr
-    !      where c_i are alternativaly 2/3, 4/3 except c_1 = c_mesh = 1/3
-    */
-    //  simpson's rule integrator for function stored on the
-    //  radial logarithmic mesh
-    //	routine assumes that mesh is an odd number so run check
-    assert(mesh&1);
-
-    asum = 0.00;
-	const size_t end = mesh-2;
-    for( size_t i=1; i!=end; i+=2 )
-    {
-		const double f1 = func[i];
-		asum += f1 + f1 + func[i+1];
-    }
-	const double f1 = func[mesh-2];
-	asum += f1+f1;
-	asum += asum;
-	asum += func[0] + func[mesh-1];
-	asum *= dr/3.0;
-    return;
-}// end subroutine simpson
-
-// Peize Lin add 2016-02-14
-void Mathzone::Simpson_Integral_0toall
-(
-    const int mesh,
-    const double *func,
-    const double *rab,
-    double *asum
-)
-{
-    // asum(r) = \int_{r'=0}^{r} dr' f(r') 
-
-    const double r2=1.00/2.00, r3=1.00/3.00;
-    asum[0] = 0.00;
-    double f3 = func [0] * rab [0];
-    for( int i=1; i<mesh; i+=2)
-    {
-        const double f1 = f3;
-        const double f2 = func[i] * rab[i] ;
-        f3 = func[i+1] * rab[i+1] ;
-        asum[i] = asum[i-1] + r2*( f1 + f2);
-        if(i+1<mesh)
-        {
-            asum[i+1] = asum[i-1] + r3*( f1 + 4.00*f2 + f3 );
-        }
-    }
-    return;
-}
-
-// Peize Lin add 2016-02-14
-// faster but still have bug
-/*void Mathzone::Simpson_Integral_alltoinf
-(
-    const int mesh,
-    const double *func,
-    const double *rab,
-    double *asum
-)
-{
-    // asum(r) = \int_{r'=r}^{+\infty} dr' f(r') 
-    //         = \inf_{r'=r}^{mesh} dr' f(r')
-
-    const double r2=1.00/2.00, r3=1.00/3.00;
-    asum[mesh-1] = 0.00;
-    const int odd_mesh = (mesh-1)^~1;
-    double f1 = func[odd_mesh] * rab[odd_mesh];
-    for( size_t i=(mesh-3)|1; i>0; i-=2)
-    {
-        const double f3 = f1;   
-        if( i+3==mesh )
-        {
-            const double f4 = func[mesh-1] * rab[mesh-1];
-            asum[mesh-2] = r2*(f3 + f4);
-        }
-        const double f2 = func[i] * rab[i] ;
-        f1 = func[i-1] * rab[i-1] ;
-        asum[i-1] = asum[i+1] + r3*( f1 + 4.00*f2 + f3 );
-        asum[i] = asum[i-1] - r2*( f1 + f2);
-    }
-    return;
-}*/
-
-// Peize Lin add 2016-06-11
-// a little lower
-void Mathzone::Simpson_Integral_alltoinf
-(
-    const int mesh,
-    const double *func,
-    const double *rab,
-    double *asum
-)
-{
-    Mathzone::Simpson_Integral_0toall( mesh, func, rab, asum );
-    const double asum_all = asum[mesh-1];
-    for (int i = 0;i < mesh; ++i)
-        asum[i] = asum_all - asum[i];
-}
 
 void Mathzone::To_Polar_Coordinate
 (
diff --git a/ABACUS.develop/source/src_global/mathzone.h b/ABACUS.develop/source/src_global/mathzone.h
index 0e146364df..c9e369c8ca 100644
--- a/ABACUS.develop/source/src_global/mathzone.h
+++ b/ABACUS.develop/source/src_global/mathzone.h
@@ -147,41 +147,6 @@ class Mathzone
     static long double Fact(const int n);
     static int Semi_Fact(const int n);
 
-	// Peize Lin accelerate 2017-10-02
-    static void Simpson_Integral
-    (
-        const int mesh,
-        const double *func,
-        const double *rab,
-        double &asum
-    );
-	// Peize Lin accelerate 2017-10-02
-	static void Simpson_Integral
-	(
-		const int mesh,
-		const double *func,
-		const double dr,
-		double &asum
-	);
-
-    // Peize Lin add 2016-02-14
-    static void Simpson_Integral_0toall
-    (
-        const int mesh,
-        const double *func,
-        const double *rab,
-        double *asum
-    );
-
-    // Peize Lin add 2016-02-14
-    static void Simpson_Integral_alltoinf
-    (
-        const int mesh,
-        const double *func,
-        const double *rab,
-        double *asum
-    );     
-	
 	// Peize Lin add 2016-08-03
 	template< typename Type >
 	static vector<Type> Pointwise_Product( const vector<Type> &f1, const vector<Type> &f2 )
diff --git a/ABACUS.develop/source/src_io/bessel_basis.cpp b/ABACUS.develop/source/src_io/bessel_basis.cpp
index 47f7a40d8e..5c0c44df3b 100644
--- a/ABACUS.develop/source/src_io/bessel_basis.cpp
+++ b/ABACUS.develop/source/src_io/bessel_basis.cpp
@@ -1,6 +1,7 @@
 #include "bessel_basis.h"
 #include "../src_pw/global.h"
 #include "../src_parallel/parallel_common.h"
+#include "../src_global/math_integral.h"
 
 Bessel_Basis::Bessel_Basis()
 {
@@ -306,7 +307,7 @@ void Bessel_Basis::init_TableOne(
 				}
 				
 				// make table value
-				Mathzone::Simpson_Integral(rmesh, function, rab, this->TableOne(l, ie, ik) );
+				Integral::Simpson_Integral(rmesh, function, rab, this->TableOne(l, ie, ik) );
 			}
 			
 		}// end ie
diff --git a/ABACUS.develop/source/src_io/to_wannier90.cpp b/ABACUS.develop/source/src_io/to_wannier90.cpp
index 6601106b13..9d9685cfdf 100644
--- a/ABACUS.develop/source/src_io/to_wannier90.cpp
+++ b/ABACUS.develop/source/src_io/to_wannier90.cpp
@@ -1,6 +1,6 @@
 #include "to_wannier90.h"
 #include "../src_lcao/global_fp.h" // mohan add 2021-01-30, this module should be modified
- 
+#include "../src_global/math_integral.h" 
 
 
 toWannier90::toWannier90(int num_kpts, Matrix3 recip_lattice)
@@ -1430,7 +1430,7 @@ void toWannier90::integral(const int meshr, const double *psir, const double *r,
 	}
 	
 	double unit = 0.0;
-	Mathzone::Simpson_Integral(meshr, inner_part, rab, unit);
+	Integral::Simpson_Integral(meshr, inner_part, rab, unit);
 	delete[] inner_part;
 
 	double *aux = new double[meshr];
@@ -1445,7 +1445,7 @@ void toWannier90::integral(const int meshr, const double *psir, const double *r,
 		}
 		
 		double vqint = 0.0;
-		Mathzone::Simpson_Integral(meshr, vchi, rab, vqint);
+		Integral::Simpson_Integral(meshr, vchi, rab, vqint);
 
 		table[iq] =  vqint * pref;
 	}
diff --git a/ABACUS.develop/source/src_lcao/ORB_atomic_lm.cpp b/ABACUS.develop/source/src_lcao/ORB_atomic_lm.cpp
index c2cfde12a9..82fca8474a 100644
--- a/ABACUS.develop/source/src_lcao/ORB_atomic_lm.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_atomic_lm.cpp
@@ -2,6 +2,7 @@
 #include "../src_global/sph_bessel_recursive.h"
 #include "../src_global/lapack_connector.h"
 #include "../src_global/timer.h"
+#include "../src_global/math_integral.h"
 #include <omp.h>
 
 Numerical_Orbital_Lm::Numerical_Orbital_Lm()
@@ -388,7 +389,7 @@ void Numerical_Orbital_Lm::cal_kradial(void)
 			integrated_func[ir] = this->psir[ir] * this->r_radial[ir] * jl[ir];
 		}
 
-		Mathzone::Simpson_Integral(
+		Integral::Simpson_Integral(
 				this->nr,
 				integrated_func,
 				VECTOR_TO_PTR(this->rab),
@@ -442,7 +443,7 @@ void Numerical_Orbital_Lm::cal_kradial_sbpool(void)
 		const vector<double> &jlk = jl[ik];
 		for (int ir = 0; ir < nr; ir++)
 			integrated_func[ir] = psir2[ir] * jlk[ir];
-		Mathzone::Simpson_Integral(
+		Integral::Simpson_Integral(
 				this->nr,
 				VECTOR_TO_PTR(integrated_func),
 				dr,
@@ -560,7 +561,7 @@ void Numerical_Orbital_Lm::norm_test(void)const
 	double sumr = 0.0;
 	//double sumk = 0.0;
 
-	Mathzone::Simpson_Integral(this->nr, f, VECTOR_TO_PTR(this->rab), sumr);
+	Integral::Simpson_Integral(this->nr, f, VECTOR_TO_PTR(this->rab), sumr);
 
 	delete[] f;
 	f = new double[nk];
@@ -569,7 +570,7 @@ void Numerical_Orbital_Lm::norm_test(void)const
 		f[ik] = this->psik[ik] * this->psik[ik];
 	}
 
-//	Mathzone::Simpson_Integral(this->nk, f, this->k_radial, sumk);
+//	Integral::Simpson_Integral(this->nk, f, this->k_radial, sumk);
 	
 	//means nothing.
 	//ofs_running << setw(12) << sumk << endl;
diff --git a/ABACUS.develop/source/src_lcao/ORB_nonlocal_lm.cpp b/ABACUS.develop/source/src_lcao/ORB_nonlocal_lm.cpp
index d693f7a6da..0c134e9873 100644
--- a/ABACUS.develop/source/src_lcao/ORB_nonlocal_lm.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_nonlocal_lm.cpp
@@ -4,6 +4,7 @@
 //=========================================================
 #include "ORB_nonlocal_lm.h"
 #include "../src_pw/global.h"
+#include "../src_global/math_integral.h"
 
 Numerical_Nonlocal_Lm::Numerical_Nonlocal_Lm()
 {
@@ -245,7 +246,7 @@ void Numerical_Nonlocal_Lm::get_kradial(void)
             integrated_func[ir] = this->beta_r[ir] * this->r_radial[ir] * jl[ir];
         }
 
-        Mathzone::Simpson_Integral(
+        Integral::Simpson_Integral(
                 this->nr,
                 integrated_func,
                 this->rab,
diff --git a/ABACUS.develop/source/src_lcao/ORB_read.cpp b/ABACUS.develop/source/src_lcao/ORB_read.cpp
index ed69e3c801..eef6f4d19e 100644
--- a/ABACUS.develop/source/src_lcao/ORB_read.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_read.cpp
@@ -1,6 +1,7 @@
 #include "ORB_read.h"
 #include "../src_pw/global.h" // only use ucell.atoms[it]
 #include <cstring>		// Peize Lin fix bug about strcmp 2016-08-02
+#include "../src_global/math_integral.h"
 
 //==============================
 // Define an object here! 
@@ -882,7 +883,7 @@ void LCAO_Orbitals::Read_PAO(const int& it)
 				inner[ir] = psir[ir] * psir[ir];
 			}
 			double unit = 0.0;
-			Mathzone::Simpson_Integral(meshr, inner, rab, unit);
+			Integral::Simpson_Integral(meshr, inner, rab, unit);
 
 			// check unit: \sum ( psi[r] * r )^2 = 1
 			ofs_running << setprecision(3) << setw(12) << unit;
@@ -897,7 +898,7 @@ void LCAO_Orbitals::Read_PAO(const int& it)
 			{
 				inner[ir] = psir[ir] * psir[ir];
 			}
-			Mathzone::Simpson_Integral(meshr, inner, rab, unit);
+			Integral::Simpson_Integral(meshr, inner, rab, unit);
 			delete[] inner;
 			ofs_running << setw(12) << unit << endl;
 			
@@ -1212,9 +1213,7 @@ void LCAO_Orbitals::Read_Descriptor(void)	//read descriptor basis
 			}
 			double unit = 0.0;
 
-// PLEASE make Simpson_Integral as input parameters?
-// mohan note 2021-03-23
-			Mathzone::Simpson_Integral(meshr, inner, rab, unit);
+			Integral::Simpson_Integral(meshr, inner, rab, unit);
 
 			// check unit: \sum ( psi[r] * r )^2 = 1
 			ofs_running << setprecision(3) << setw(12) << unit;
@@ -1229,7 +1228,7 @@ void LCAO_Orbitals::Read_Descriptor(void)	//read descriptor basis
 			{
 				inner[ir] = psir[ir] * psir[ir];
 			}
-			Mathzone::Simpson_Integral(meshr, inner, rab, unit);
+			Integral::Simpson_Integral(meshr, inner, rab, unit);
 			delete[] inner;
 			ofs_running << setw(12) << unit << endl;
 
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_alpha.cpp b/ABACUS.develop/source/src_lcao/ORB_table_alpha.cpp
index 462956c64b..ed6681a115 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_alpha.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_table_alpha.cpp
@@ -1,6 +1,7 @@
 //caoyu add 2021-03-17
 #include "ORB_table_alpha.h"
 #include "ORB_read.h"
+#include "../src_global/math_integral.h"
 #include <stdexcept>
 
 double ORB_table_alpha::dr = -1.0;
@@ -159,7 +160,7 @@ void ORB_table_alpha::cal_S_PhiAlpha_R(
 			integrated_func[ik] = jl[ir][ik] * k1_dot_k2[ik];
 		}
 		// Call simpson integration
-		Mathzone::Simpson_Integral(kmesh, integrated_func, kab, temp);
+		Integral::Simpson_Integral(kmesh, integrated_func, kab, temp);
 		rs[ir] = temp * FOUR_PI;
 
 		//drs
@@ -172,7 +173,7 @@ void ORB_table_alpha::cal_S_PhiAlpha_R(
 				integrated_func[ik] = jlm1[ir][ik] * k1_dot_k2[ik] * kpoint[ik];
 			}
 
-			Mathzone::Simpson_Integral(kmesh, integrated_func, kab, temp1);
+			Integral::Simpson_Integral(kmesh, integrated_func, kab, temp1);
 		}
 
 
@@ -181,7 +182,7 @@ void ORB_table_alpha::cal_S_PhiAlpha_R(
 			integrated_func[ik] = jlp1[ir][ik] * k1_dot_k2[ik] * kpoint[ik];
 		}
 
-		Mathzone::Simpson_Integral(kmesh, integrated_func, kab, temp2);
+		Integral::Simpson_Integral(kmesh, integrated_func, kab, temp2);
 
 		if (l == 0)
 		{
@@ -207,7 +208,7 @@ void ORB_table_alpha::cal_S_PhiAlpha_R(
 		}
 
 		// Call simpson integration
-		Mathzone::Simpson_Integral(kmesh, integrated_func, kab, temp);
+		Integral::Simpson_Integral(kmesh, integrated_func, kab, temp);
 		rs[0] = FOUR_PI / Mathzone_Add1::dualfac(2 * l + 1) * temp;
 	}
 
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_beta.cpp b/ABACUS.develop/source/src_lcao/ORB_table_beta.cpp
index c28464a1ae..5eff28e964 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_beta.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_table_beta.cpp
@@ -1,6 +1,7 @@
 #include <stdexcept>
 #include "ORB_table_beta.h"
 #include "ORB_read.h"
+#include "../src_global/math_integral.h"
 
 double ORB_table_beta::dr = -1.0;
 
@@ -155,7 +156,7 @@ void ORB_table_beta::cal_VNL_PhiBeta_R(
 			integrated_func[ik] = jl[ir][ik] * k1_dot_k2[ik];
 		}
 		// Call simpson integration
-		Mathzone::Simpson_Integral(kmesh,integrated_func,kab,temp);
+		Integral::Simpson_Integral(kmesh,integrated_func,kab,temp);
 		rs[ir] = temp * FOUR_PI;
 		
 		//drs
@@ -168,7 +169,7 @@ void ORB_table_beta::cal_VNL_PhiBeta_R(
 				integrated_func[ik] = jlm1[ir][ik] * k1_dot_k2[ik] * kpoint[ik];
 			}
 
-			Mathzone::Simpson_Integral(kmesh,integrated_func,kab,temp1);
+			Integral::Simpson_Integral(kmesh,integrated_func,kab,temp1);
 		}
 		
 				
@@ -177,7 +178,7 @@ void ORB_table_beta::cal_VNL_PhiBeta_R(
 			integrated_func[ik] = jlp1[ir][ik] * k1_dot_k2[ik] * kpoint[ik];
 		}
 		
-		Mathzone::Simpson_Integral(kmesh,integrated_func,kab,temp2);
+		Integral::Simpson_Integral(kmesh,integrated_func,kab,temp2);
 		
 		if (l == 0)
 		{
@@ -203,7 +204,7 @@ void ORB_table_beta::cal_VNL_PhiBeta_R(
 		}
 		
 		// Call simpson integration
-		Mathzone::Simpson_Integral(kmesh,integrated_func,kab,temp);
+		Integral::Simpson_Integral(kmesh,integrated_func,kab,temp);
 		rs[0] = FOUR_PI / Mathzone_Add1::dualfac (2*l+1) * temp;
 	}
 	
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_phi.cpp b/ABACUS.develop/source/src_lcao/ORB_table_phi.cpp
index aff7401840..41d5a22d46 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_phi.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_table_phi.cpp
@@ -2,6 +2,7 @@
 #include "ORB_table_phi.h"
 #include "ORB_read.h"
 #include "../src_ri/exx_abfs.h"
+#include "../src_global/math_integral.h"
 
 double ORB_table_phi::dr = -1.0;
 
@@ -194,7 +195,8 @@ void ORB_table_phi::cal_ST_Phi12_R
 		}
 		// Call simpson integration
 		double temp = 0.0;
-		Mathzone::Simpson_Integral(kmesh,integrated_func,dk,temp);
+
+		Integral::Simpson_Integral(kmesh,integrated_func,dk,temp);
 		rs[ir] = temp * FOUR_PI ;
 		
 		// Peize Lin accelerate 2017-10-02
@@ -215,9 +217,8 @@ void ORB_table_phi::cal_ST_Phi12_R
 				integrated_func[ik] = (jlp1_r[ik]-fac*jlm1_r[ik]) * k1_dot_k2_dot_kpoint[ik];
 			}
 		}
-		// PLEASE try to make Simpson_Integral as input parameters
-		// mohan note 2021-03-23
-		Mathzone::Simpson_Integral(kmesh,integrated_func,dk,temp);
+
+		Integral::Simpson_Integral(kmesh,integrated_func,dk,temp);
 		drs[ir] = -FOUR_PI*(l+1)/(2.0*l+1) * temp;
 	}
 
@@ -235,9 +236,7 @@ void ORB_table_phi::cal_ST_Phi12_R
 			integrated_func[ik] = k1_dot_k2[ik] * pow (kpoint[ik], l);
 		}
 		
-		// PLEASE try to make Simpson_Integral as input parameters
-		// mohan note 2021-03-23
-		Mathzone::Simpson_Integral(kmesh,integrated_func,kab,temp);
+		Integral::Simpson_Integral(kmesh,integrated_func,kab,temp);
 		rs[0] = FOUR_PI / Mathzone_Add1::dualfac (2*l+1) * temp;
 	}
 
@@ -326,10 +325,8 @@ void ORB_table_phi::cal_ST_Phi12_R
 			integrated_func[ik] = jl_r[ik] * k1_dot_k2[ik];
 		}
 		double temp = 0.0;
-//		Mathzone::Simpson_Integral(kmesh,integrated_func,kab,temp);
-		// PLEASE try to make Simpson_Integral as input parameters
-		// mohan note 2021-03-23
-		Mathzone::Simpson_Integral(kmesh,VECTOR_TO_PTR(integrated_func),dk,temp);
+
+		Integral::Simpson_Integral(kmesh,VECTOR_TO_PTR(integrated_func),dk,temp);
 		rs[ir] = temp * FOUR_PI ;
 		
 		const vector<double> &jlm1_r = jlm1[ir];
@@ -349,10 +346,8 @@ void ORB_table_phi::cal_ST_Phi12_R
 				integrated_func[ik] = (jlp1_r[ik]-fac*jlm1_r[ik]) * k1_dot_k2_dot_kpoint[ik];
 			}
 		}
-//		Mathzone::Simpson_Integral(kmesh,integrated_func,kab,temp);
-		// PLEASE try to make Simpson_Integral as input parameters
-		// mohan note 2021-03-23
-		Mathzone::Simpson_Integral(kmesh,VECTOR_TO_PTR(integrated_func),dk,temp);
+
+		Integral::Simpson_Integral(kmesh,VECTOR_TO_PTR(integrated_func),dk,temp);
 		drs[ir] = -FOUR_PI*(l+1)/(2.0*l+1) * temp;
 	}
 
@@ -366,10 +361,9 @@ void ORB_table_phi::cal_ST_Phi12_R
 				integrated_func[ik] = k1_dot_k2[ik] * pow (kpoint[ik], l);
 			}
 			double temp = 0.0;
-	//		Mathzone::Simpson_Integral(kmesh,integrated_func,kab,temp);
-			// PLEASE try to make Simpson_Integral as input parameters
-			// mohan note 2021-03-23
-			Mathzone::Simpson_Integral(kmesh,VECTOR_TO_PTR(integrated_func),dk,temp);
+
+			Integral::Simpson_Integral(kmesh,VECTOR_TO_PTR(integrated_func),dk,temp);
+
 			// PLEASE try to make dualfac function as input parameters
 			// mohan note 2021-03-23
 			rs[0] = FOUR_PI / Mathzone_Add1::dualfac (2*l+1) * temp;
diff --git a/ABACUS.develop/source/src_pw/charge.cpp b/ABACUS.develop/source/src_pw/charge.cpp
index 36981de51b..9a93f688f3 100644
--- a/ABACUS.develop/source/src_pw/charge.cpp
+++ b/ABACUS.develop/source/src_pw/charge.cpp
@@ -21,6 +21,7 @@
 #include "charge.h"
 #include "magnetism.h"
 #include "../src_parallel/parallel_grid.h"
+#include "../src_global/math_integral.h"
 
 Charge::Charge()
 {
@@ -227,7 +228,7 @@ void Charge::atomic_rho(const int spin_number_need, double** rho_in)const
 		rhoatm[0] = rhoatm[1] / rhoatm[0];  
 
 		double charge = 0.0;
-		Mathzone::Simpson_Integral(atom->msh,atom->rho_at,atom->rab,charge);
+		Integral::Simpson_Integral(atom->msh,atom->rho_at,atom->rab,charge);
 
 		OUT(ofs_warning,"charge from rho_at",charge);
 		assert(charge!=0.0);
@@ -255,7 +256,7 @@ void Charge::atomic_rho(const int spin_number_need, double** rho_in)const
 //              rho1d [ir] = atom->rho_at[ir];
 				rho1d[ir] = rhoatm[ir];
             }
-            Mathzone::Simpson_Integral(mesh, rho1d, atom->rab , rho_lgl[0]);
+            Integral::Simpson_Integral(mesh, rho1d, atom->rab , rho_lgl[0]);
         }
 
 
@@ -283,7 +284,7 @@ void Charge::atomic_rho(const int spin_number_need, double** rho_in)const
                     rho1d[ir] = rhoatm[ir] * sin(gxx) / gxx;
                 }
             }
-            Mathzone::Simpson_Integral(mesh , rho1d, atom->rab , rho_lgl [ig]);
+            Integral::Simpson_Integral(mesh , rho1d, atom->rab , rho_lgl [ig]);
         }
 		delete[] rhoatm;
         
@@ -651,7 +652,7 @@ void Charge::non_linear_core_correction
             {
                 aux [ir] = r [ir] * r [ir] * rhoc [ir];
             }
-            Mathzone::Simpson_Integral(mesh, aux, rab, rhocg1);
+            Integral::Simpson_Integral(mesh, aux, rab, rhocg1);
             //rhocg [1] = fpi * rhocg1 / omega;
             rhocg [0] = FOUR_PI * rhocg1 / ucell.omega;//mohan modify 2008-01-19
             igl0 = 1;
@@ -666,7 +667,7 @@ void Charge::non_linear_core_correction
             {
                 aux [ir] = r[ir] * r[ir] * rhoc [ir] * aux [ir];
             } //  enddo
-            Mathzone::Simpson_Integral(mesh, aux, rab, rhocg1);
+            Integral::Simpson_Integral(mesh, aux, rab, rhocg1);
             rhocg [igl] = FOUR_PI * rhocg1 / ucell.omega;
         } //  enddo
         delete [] aux;
diff --git a/ABACUS.develop/source/src_pw/forces.cpp b/ABACUS.develop/source/src_pw/forces.cpp
index 2b944bf137..580e51581b 100644
--- a/ABACUS.develop/source/src_pw/forces.cpp
+++ b/ABACUS.develop/source/src_pw/forces.cpp
@@ -5,6 +5,7 @@
 #include "symmetry.h"
 // new
 #include "H_XC_pw.h"
+#include "../src_global/math_integral.h"
 
 double Forces::output_acc = 1.0e-8; // (Ryd/angstrom).	
 
@@ -837,7 +838,7 @@ void Forces::cal_force_scc(matrix& forcescc)
                     aux[ir] = ucell.atoms[nt].rho_at[ir] * sin(gxx) / gxx;
                 }
             }
-            Mathzone::Simpson_Integral(mesh , aux, ucell.atoms[nt].rab , rhocgnt [ig]);
+            Integral::Simpson_Integral(mesh , aux, ucell.atoms[nt].rab , rhocgnt [ig]);
         }
 
         int iat = 0;
diff --git a/ABACUS.develop/source/src_pw/pseudopot_cell_vl.cpp b/ABACUS.develop/source/src_pw/pseudopot_cell_vl.cpp
index 7c08a04648..8c879bdde9 100644
--- a/ABACUS.develop/source/src_pw/pseudopot_cell_vl.cpp
+++ b/ABACUS.develop/source/src_pw/pseudopot_cell_vl.cpp
@@ -2,6 +2,7 @@
 #include "tools.h"
 #include "../src_pw/myfunc.h"
 #include "pseudopot_cell_vl.h"
+#include "../src_global/math_integral.h"
 
 pseudopot_cell_vl::pseudopot_cell_vl()
 {
@@ -138,7 +139,7 @@ void pseudopot_cell_vl::vloc_of_g(
 	{
 		aux[ir] = r[ir] * zp_in * e2 / ucell.omega;
 	}
-	Mathzone::Simpson_Integral(msh, aux, rab, vloc_1d[0] );
+	Integral::Simpson_Integral(msh, aux, rab, vloc_1d[0] );
 	vloc_1d[0] *= 4*3.1415926;
 	cout << "  vloc_1d[0]=" <<  vloc_1d[0]/pw.ngmc << endl;
 	cout << "  vloc_1d[0]=" <<  vloc_1d[0]/pw.ncxyz << endl;
@@ -156,7 +157,7 @@ void pseudopot_cell_vl::vloc_of_g(
 			aux[ir] = r [ir] * (r [ir] * vloc_at [ir] + zp_in * e2);
 			//aux[ir] = r [ir] * (r [ir] * vloc_at [ir] );
 		}
-		Mathzone::Simpson_Integral(msh, aux, rab, vloc_1d[0] );
+		Integral::Simpson_Integral(msh, aux, rab, vloc_1d[0] );
 		igl0 = 1;	
 	}
 	else
@@ -182,7 +183,7 @@ void pseudopot_cell_vl::vloc_of_g(
 		{
 			aux [ir] = aux1 [ir] * sin(gx * r [ir]) / gx;
 		}
-		Mathzone::Simpson_Integral(msh, aux, rab, vloc_1d[ig] );
+		Integral::Simpson_Integral(msh, aux, rab, vloc_1d[ig] );
 		//  here we add the analytic fourier transform of the erf function
 		vloc_1d[ig] -= fac * exp(- gx2 * 0.25)/ gx2;
 	} // enddo
diff --git a/ABACUS.develop/source/src_pw/pseudopot_cell_vnl.cpp b/ABACUS.develop/source/src_pw/pseudopot_cell_vnl.cpp
index bc06304a20..fd83eb7201 100644
--- a/ABACUS.develop/source/src_pw/pseudopot_cell_vnl.cpp
+++ b/ABACUS.develop/source/src_pw/pseudopot_cell_vnl.cpp
@@ -7,6 +7,7 @@
 #include "tools.h"
 #include "wavefunc.h"
 #include "../src_lcao/ORB_gen_tables.h"
+#include "../src_global/math_integral.h"
 
 pseudopot_cell_vnl::pseudopot_cell_vnl()
 {
@@ -398,7 +399,7 @@ void pseudopot_cell_vnl::init_vnl(UnitCell_pseudo &cell)
 					          jl[ir] * cell.atoms[it].r[ir];
 				} 
 				double vqint;
-				Mathzone::Simpson_Integral(kkbeta, aux, cell.atoms[it].rab, vqint);
+				Integral::Simpson_Integral(kkbeta, aux, cell.atoms[it].rab, vqint);
 				this->tab(it, ib, iq) = vqint * pref;
 			} 
 		} 
@@ -624,7 +625,7 @@ void pseudopot_cell_vnl::init_vnl_alpha(void)          // pengfei Li 2018-3-23
 								  ucell.atoms[it].r[ir] * ucell.atoms[it].r[ir];
 					}
 					double vqint;
-					Mathzone::Simpson_Integral(kkbeta, aux, ucell.atoms[it].rab, vqint);
+					Integral::Simpson_Integral(kkbeta, aux, ucell.atoms[it].rab, vqint);
 					this->tab_alpha(it, ib, L, iq) = vqint * pref;
 				}
 			}
diff --git a/ABACUS.develop/source/src_pw/stress_func_cc.cpp b/ABACUS.develop/source/src_pw/stress_func_cc.cpp
index 410c0d4bc1..0a09c7f13d 100644
--- a/ABACUS.develop/source/src_pw/stress_func_cc.cpp
+++ b/ABACUS.develop/source/src_pw/stress_func_cc.cpp
@@ -1,5 +1,6 @@
 #include "./stress_func.h"
 #include "./H_XC_pw.h"
+#include "../src_global/math_integral.h"
 
 //NLCC term, need to be tested
 void Stress_Func::stress_cc(matrix& sigma, const bool is_pw)
@@ -182,7 +183,7 @@ void Stress_Func::deriv_drhoc
 		{
 			aux [ir] = r [ir] * rhoc [ir] * (r [ir] * cos (gx * r [ir] ) / gx - sin (gx * r [ir] ) / pow(gx,2));
 		}//ir
-		Mathzone::Simpson_Integral(mesh, aux, rab, rhocg1);
+		Integral::Simpson_Integral(mesh, aux, rab, rhocg1);
 		drhocg [igl] = FOUR_PI / ucell.omega * rhocg1;
 	}//igl
 	
diff --git a/ABACUS.develop/source/src_pw/stress_func_loc.cpp b/ABACUS.develop/source/src_pw/stress_func_loc.cpp
index afd8c77e37..78edceb05b 100644
--- a/ABACUS.develop/source/src_pw/stress_func_loc.cpp
+++ b/ABACUS.develop/source/src_pw/stress_func_loc.cpp
@@ -1,4 +1,5 @@
-#include"stress_func.h"
+#include "stress_func.h"
+#include "../src_global/math_integral.h"
 
 //calculate local pseudopotential stress in PW or VL_dVL stress in LCAO
 void Stress_Func::stress_loc(matrix& sigma, const bool is_pw)
@@ -176,7 +177,7 @@ double*  dvloc
 			aux [i] = aux1 [i] * (r [i] * cos (gx * r [i] ) / gx - sin (gx * r [i] ) / pow(gx,2));
 		}
 		// simpson (msh, aux, rab, vlcp);
-		Mathzone::Simpson_Integral(msh, aux, rab, vlcp );
+		Integral::Simpson_Integral(msh, aux, rab, vlcp );
 		// DV(g^2)/Dg^2 = (DV(g)/Dg)/2g
 		vlcp *= FOUR_PI / ucell.omega / 2.0 / gx;
 		// subtract the long-range term
diff --git a/ABACUS.develop/source/src_pw/wavefunc_in_pw.cpp b/ABACUS.develop/source/src_pw/wavefunc_in_pw.cpp
index eda6a710f7..5d9c872561 100644
--- a/ABACUS.develop/source/src_pw/wavefunc_in_pw.cpp
+++ b/ABACUS.develop/source/src_pw/wavefunc_in_pw.cpp
@@ -1,5 +1,6 @@
 #include "wavefunc_in_pw.h"
 #include <cstring>		// Peize Lin fix bug about strcmp 2016-08-02
+#include "../src_global/math_integral.h"
 
 void Wavefunc_in_pw::make_table_q(std::vector<string> &fn, realArray &table_local)
 {
@@ -212,7 +213,7 @@ const double *rab, const int &l, double* table)
 	}
 	
 	double unit = 0.0;
-	Mathzone::Simpson_Integral(meshr, inner_part, rab, unit);
+	Integral::Simpson_Integral(meshr, inner_part, rab, unit);
 	delete[] inner_part;
 	OUT(ofs_running,"normalize unit",unit);
 
@@ -228,7 +229,7 @@ const double *rab, const int &l, double* table)
 		}
 		
 		double vqint = 0.0;
-		Mathzone::Simpson_Integral(meshr, vchi, rab, vqint);
+		Integral::Simpson_Integral(meshr, vchi, rab, vqint);
 
 		table[iq] =  vqint * pref;
 	}
diff --git a/ABACUS.develop/source/src_pw/wf_atomic.cpp b/ABACUS.develop/source/src_pw/wf_atomic.cpp
index d2a64b57bd..c7320262d5 100644
--- a/ABACUS.develop/source/src_pw/wf_atomic.cpp
+++ b/ABACUS.develop/source/src_pw/wf_atomic.cpp
@@ -1,5 +1,6 @@
 #include "wf_atomic.h"
 #include "global.h"
+#include "../src_global/math_integral.h"
 
 WF_atomic::WF_atomic()
 {
@@ -79,7 +80,7 @@ void WF_atomic::init_at_1(void)
                 inner_part[ir] = atom->chi(ic,ir) * atom->chi(ic,ir);
             }
             double unit = 0.0;
-            Mathzone::Simpson_Integral(atom->msh, inner_part, atom->rab, unit);
+            Integral::Simpson_Integral(atom->msh, inner_part, atom->rab, unit);
             delete[] inner_part;
 
 			ofs_running << " the unit of pseudo atomic orbital is " << unit; 
@@ -101,7 +102,7 @@ void WF_atomic::init_at_1(void)
                 inner_part[ir] = atom->chi(ic,ir) * atom->chi(ic,ir);
             }
             unit = 0.0;
-            Mathzone::Simpson_Integral(atom->msh, inner_part, atom->rab, unit);
+            Integral::Simpson_Integral(atom->msh, inner_part, atom->rab, unit);
             delete[] inner_part;
 
 			ofs_running << ", renormalize to " << unit << endl;
@@ -119,7 +120,7 @@ void WF_atomic::init_at_1(void)
                     }
 
                     double vqint = 0.0;
-                    Mathzone::Simpson_Integral(atom->msh, vchi, atom->rab, vqint);
+                    Integral::Simpson_Integral(atom->msh, vchi, atom->rab, vqint);
 
                     ppcell.tab_at(it, ic, iq) =  vqint * pref;
                     //				if( it == 0 && ic == 0 )
diff --git a/ABACUS.develop/source/src_ri/conv_coulomb_pot.cpp b/ABACUS.develop/source/src_ri/conv_coulomb_pot.cpp
index ef87663767..54bc284341 100644
--- a/ABACUS.develop/source/src_ri/conv_coulomb_pot.cpp
+++ b/ABACUS.develop/source/src_ri/conv_coulomb_pot.cpp
@@ -5,6 +5,7 @@
 #include "src_global/global_function.h"
 
 #include "src_external/src_test/test_function.h"
+#include "src_global/math_integral.h" // mohan add 2021-04-03
 
 Conv_Coulomb_Pot::Conv_Coulomb_Pot(const Numerical_Orbital_Lm &orb_in)
 	:orb(orb_in)
@@ -37,7 +38,7 @@ void Conv_Coulomb_Pot::cal_conv_coulomb_pot()
 	{
 		tmp_func[ir] = orb.getPsi(ir) * pow( orb.getRadial(ir), orb.getL()+2 );
 	}	
-	Mathzone::Simpson_Integral_0toall( orb.getNr(), VECTOR_TO_PTR(tmp_func), orb.getRab(), VECTOR_TO_PTR(tmp_integral) );
+	Integral::Simpson_Integral_0toall( orb.getNr(), VECTOR_TO_PTR(tmp_func), orb.getRab(), VECTOR_TO_PTR(tmp_integral) );
 	conv_coulomb_pot[0]=0;
 	for( size_t ir=1; ir!=orb.getNr(); ++ir )
 	{
@@ -53,7 +54,7 @@ void Conv_Coulomb_Pot::cal_conv_coulomb_pot()
 	{
 		tmp_func[ir] = orb.getPsi(ir) / pow( orb.getRadial(ir), orb.getL()-1 );
 	}
-	Mathzone::Simpson_Integral_alltoinf( orb.getNr(), VECTOR_TO_PTR(tmp_func), orb.getRab(), VECTOR_TO_PTR(tmp_integral) );
+	Integral::Simpson_Integral_alltoinf( orb.getNr(), VECTOR_TO_PTR(tmp_func), orb.getRab(), VECTOR_TO_PTR(tmp_integral) );
 	for( size_t ir=0; ir!=orb.getNr(); ++ir )
 	{
 		conv_coulomb_pot[ir] += tmp_integral[ir] * pow( orb.getRadial(ir), orb.getL() );
diff --git a/ABACUS.develop/source/src_ri/exx_abfs-io.cpp b/ABACUS.develop/source/src_ri/exx_abfs-io.cpp
index d390bcb293..5c2f5b1803 100644
--- a/ABACUS.develop/source/src_ri/exx_abfs-io.cpp
+++ b/ABACUS.develop/source/src_ri/exx_abfs-io.cpp
@@ -7,6 +7,7 @@
 #include "src_pw/global.h"
 #include "src_lcao/ORB_read.h"
 #include "src_global/global_function.h"
+#include "src_global/math_integral.h" // mohan add 2021-04-03
 
 
 vector<vector<vector<Numerical_Orbital_Lm>>> Exx_Abfs::IO::construct_abfs(
@@ -212,7 +213,7 @@ vector<vector<Numerical_Orbital_Lm>> Exx_Abfs::IO::construct_abfs_T(
 				inner[ir] = psir[ir] * psir[ir];
 			}
 			double unit = 0.0;	
-			Mathzone::Simpson_Integral(meshr, VECTOR_TO_PTR(inner), VECTOR_TO_PTR(rab), unit);
+			Integral::Simpson_Integral(meshr, VECTOR_TO_PTR(inner), VECTOR_TO_PTR(rab), unit);
 			for( int ir=0; ir!=meshr; ++ir )
 			{
 				psis[L][N][ir] /= sqrt(unit);
@@ -510,4 +511,4 @@ void Exx_Abfs::IO::print_matrix(
 			}
 		}
 	}
-}
\ No newline at end of file
+}

From 8c01a630114138c0e7520a869fd8ad7f9e3ccbb5 Mon Sep 17 00:00:00 2001
From: maki49 <1579492865@qq.com>
Date: Sat, 3 Apr 2021 17:47:06 +0800
Subject: [PATCH 22/60] some files haven't been successfully submitted in reading
 descriptor

---
 ABACUS.develop/examples/H2O-deepks-lcao/STRU     |  4 ++++
 ABACUS.develop/source/src_io/read_atoms.cpp      |  4 ++++
 .../source/src_lcao/ORB_gen_tables.cpp           | 16 ++--------------
 ABACUS.develop/source/src_lcao/ORB_gen_tables.h  |  4 +---
 4 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/ABACUS.develop/examples/H2O-deepks-lcao/STRU b/ABACUS.develop/examples/H2O-deepks-lcao/STRU
index 8dc2ac2824..6148031dd4 100644
--- a/ABACUS.develop/examples/H2O-deepks-lcao/STRU
+++ b/ABACUS.develop/examples/H2O-deepks-lcao/STRU
@@ -6,6 +6,10 @@ NUMERICAL_ORBITAL
 H_gga_8au_60Ry_2s1p.orb
 O_gga_7au_60Ry_2s2p1d.orb
 
+NUMERICAL_DESCRIPTOR
+jle.orb
+
+
 LATTICE_CONSTANT
 10
 
diff --git a/ABACUS.develop/source/src_io/read_atoms.cpp b/ABACUS.develop/source/src_io/read_atoms.cpp
index 3e8f51bf46..288a95426b 100644
--- a/ABACUS.develop/source/src_io/read_atoms.cpp
+++ b/ABACUS.develop/source/src_io/read_atoms.cpp
@@ -71,6 +71,10 @@ void UnitCell_pseudo::read_atom_species(ifstream &ifa)
 				
 			}
 		}	
+		// caoyu add 2021-03-16
+		if (SCAN_BEGIN(ifa, "NUMERICAL_DESCRIPTOR")) {
+			ifa >> ORB.descriptor_file;
+		}
 	}
 
 	// Peize Lin add 2016-09-23
diff --git a/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp b/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp
index 043948466f..7404617773 100644
--- a/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp
@@ -828,9 +828,7 @@ void ORB_gen_tables::snap_psialpha(
 	const int& T2,
 	const int& L2,
 	const int& m2,
-	const int& N2,
-	complex<double>* olm1,
-	const int is) const
+	const int& N2) const
 {
 
 	if (job != 0 && job != 1)
@@ -964,20 +962,10 @@ void ORB_gen_tables::snap_psialpha(
 			case 0: // calculate overlap.
 			{
 				if (NSPIN != 4) olm[0] += tmpOlm0 * rly[MGT.get_lm_index(L, m)];
-				else if (olm1 != NULL)
-				{
-					olm1[0] += tmpOlm0 * rly[MGT.get_lm_index(L, m)];
-					olm1[1] += 0;//tmpOlm0 * (tmp(0,0)+tmp(0,1));
-					olm1[2] += 0;//tmpOlm0 * (tmp(1,0)+tmp(1,1));
-					olm1[3] += tmpOlm0 * rly[MGT.get_lm_index(L, m)];
-
-				}
 				else
 				{
-					WARNING_QUIT("ORB_gen_tables::snap_psialpha", "something wrong!");
-
+					WARNING_QUIT("ORB_gen_tables::snap_psialpha", "deepks with NSPIN>1 has not implemented yet!");
 				}
-
 				/*
 				if( abs ( tmpOlm0 * rly[ MGT.get_lm_index(L, m) ] ) > 1.0e-3 )
 				{
diff --git a/ABACUS.develop/source/src_lcao/ORB_gen_tables.h b/ABACUS.develop/source/src_lcao/ORB_gen_tables.h
index 2fe9619ee2..de890d2028 100644
--- a/ABACUS.develop/source/src_lcao/ORB_gen_tables.h
+++ b/ABACUS.develop/source/src_lcao/ORB_gen_tables.h
@@ -79,9 +79,7 @@ class ORB_gen_tables
 		const int& I2,
 		const int& l2,
 		const int& m2,
-		const int& n2,
-		complex<double>* olm1 = NULL,
-		const int is = 0)const;
+		const int& n2)const;
 
 	// set as public because in hamilt_linear, 
 	// we need to destroy the tables: SR,TR,NR

From 74162cb51b63510db9198f992de24721dee1b88a Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Sat, 3 Apr 2021 20:44:14 +0800
Subject: [PATCH 23/60] make Exx_Abfs::Lmax as an input for ORB module

---
 ABACUS.develop/source/README                  | 26 +------------------
 ABACUS.develop/source/run_lcao.cpp            |  2 +-
 .../source/src_io/cal_r_overlap_R.cpp         |  2 +-
 ABACUS.develop/source/src_io/energy_dos.cpp   |  2 +-
 .../source/src_io/mulliken_charge.cpp         |  2 +-
 .../source/src_io/unk_overlap_lcao.cpp        |  2 +-
 .../source/src_lcao/ORB_control.cpp           |  5 ++--
 ABACUS.develop/source/src_lcao/ORB_control.h  |  3 ++-
 .../source/src_lcao/ORB_gen_tables.cpp        | 18 ++++++++++---
 .../source/src_lcao/ORB_gen_tables.h          |  6 ++++-
 .../source/src_lcao/ORB_table_phi.cpp         | 20 ++++++++------
 .../source/src_lcao/ORB_table_phi.h           |  6 +++--
 .../exx_abfs-matrix_lcaoslcaos_lcaoslcaos.cpp |  2 +-
 .../source/src_ri/exx_abfs-matrix_orbs11.cpp  |  2 +-
 .../source/src_ri/exx_abfs-matrix_orbs21.cpp  |  2 +-
 .../source/src_ri/exx_abfs-matrix_orbs22.cpp  |  2 +-
 16 files changed, 50 insertions(+), 52 deletions(-)

diff --git a/ABACUS.develop/source/README b/ABACUS.develop/source/README
index 191291c344..d75f606b16 100644
--- a/ABACUS.develop/source/README
+++ b/ABACUS.develop/source/README
@@ -1,29 +1,5 @@
+
 Currently we are working on optimizing the code structure of ABACUS,
 implementing new functions, and adding more autotests.
 
 -- mohan 2021-02-11
-
-URGENT:
-
-Ask Xiaohui Liu: all functions named with 'after_vc' should be reconstructed.
-The 'FINAL_SCF' global varialble should be removed. 
-(condition: need to reconstruct these codes within a given time)
-
-Ask Xiaohui Liu and Daye Zheng: We need test examples.
-
-Ask Fuxiang He: we need to remove all TDDFT-related global variables
-in global_variable.h, we need TDDFT examples.
-(condition: need to reconstruct these codes within a given time)
-
-Ask Daye Zheng: MD, force, stress modules need reconstruction
-
-NEED TO DO:
-
-Ask Xiaohui: we need to remove DQ and NQX in global_variable.h, 
-but the NQX is computed in ./src_pw/pseudopot_cell_vnl.cpp
-
-Ask Peize: exx_lip.h and related Exx codes
-
-QUESTION:
-
-* in pw_basis.cpp, why ggwfc2=ggwfc if gamma_only is used?
diff --git a/ABACUS.develop/source/run_lcao.cpp b/ABACUS.develop/source/run_lcao.cpp
index d4661b2006..d5f04d0967 100644
--- a/ABACUS.develop/source/run_lcao.cpp
+++ b/ABACUS.develop/source/run_lcao.cpp
@@ -45,7 +45,7 @@ void Run_lcao::lcao_line(void)
 
     // * reading the localized orbitals/projectors 
 	// * construct the interpolation tables.
-	hm.orb_con.set_orb_tables(UOT, ORB, ucell.lat0);
+	hm.orb_con.set_orb_tables(UOT, ORB, ucell.lat0, Exx_Abfs::Lmax);
 
 	// * allocate H and S matrices according to computational resources
 	// * set the 'trace' between local H/S and global H/S
diff --git a/ABACUS.develop/source/src_io/cal_r_overlap_R.cpp b/ABACUS.develop/source/src_io/cal_r_overlap_R.cpp
index ff803ecd56..31d94aa6d1 100644
--- a/ABACUS.develop/source/src_io/cal_r_overlap_R.cpp
+++ b/ABACUS.develop/source/src_io/cal_r_overlap_R.cpp
@@ -61,7 +61,7 @@ void cal_r_overlap_R::init()
 		ORB.get_dR(),// delta R, for making radial table
 		ORB.get_dk()); // delta k, for integration in k space
 		
-	MOT.init_Table_Spherical_Bessel (2, 3, Lmax_used, Lmax);
+	MOT.init_Table_Spherical_Bessel (2, 3, Lmax_used, Lmax, Exx_Abfs::Lmax);
 
 	Ylm::set_coefficients();
 
diff --git a/ABACUS.develop/source/src_io/energy_dos.cpp b/ABACUS.develop/source/src_io/energy_dos.cpp
index 1c975ac8d9..42d760b93a 100644
--- a/ABACUS.develop/source/src_io/energy_dos.cpp
+++ b/ABACUS.develop/source/src_io/energy_dos.cpp
@@ -327,7 +327,7 @@ void energy::perform_dos(void)
 				atom_arrange::search( SEARCH_RADIUS );//qifeng-2019-01-21
 
 				// mohan update 2021-02-10
-				hm.orb_con.set_orb_tables(UOT, ORB, ucell.lat0);
+				hm.orb_con.set_orb_tables(UOT, ORB, ucell.lat0, Exx_Abfs::Lmax);
 				LM.allocate_HS_R(LNNR.nnr);
 				LM.zeros_HSR('S', LNNR.nnr);
 				UHM.genH.calculate_S_no();
diff --git a/ABACUS.develop/source/src_io/mulliken_charge.cpp b/ABACUS.develop/source/src_io/mulliken_charge.cpp
index 7ce98a1d9d..b5959b9d0e 100644
--- a/ABACUS.develop/source/src_io/mulliken_charge.cpp
+++ b/ABACUS.develop/source/src_io/mulliken_charge.cpp
@@ -166,7 +166,7 @@ void Mulliken_Charge::cal_mulliken(void)
 			mud[0].create(ParaO.ncol,ParaO.nrow);
 			atom_arrange::set_sr_NL();
 			atom_arrange::search( SEARCH_RADIUS );//qifeng-2019-01-21
-			hm.orb_con.set_orb_tables(UOT, ORB, ucell.lat0);
+			hm.orb_con.set_orb_tables(UOT, ORB, ucell.lat0, Exx_Abfs::Lmax);
 			LM.allocate_HS_R(LNNR.nnr);
 			LM.zeros_HSR('S', LNNR.nnr);
 			UHM.genH.calculate_S_no();
diff --git a/ABACUS.develop/source/src_io/unk_overlap_lcao.cpp b/ABACUS.develop/source/src_io/unk_overlap_lcao.cpp
index 82e6c6b57e..f9ccb7432d 100644
--- a/ABACUS.develop/source/src_io/unk_overlap_lcao.cpp
+++ b/ABACUS.develop/source/src_io/unk_overlap_lcao.cpp
@@ -69,7 +69,7 @@ void unkOverlap_lcao::init()
 		ORB.get_dR(),// delta R, for making radial table
 		ORB.get_dk()); // delta k, for integration in k space
 		
-	MOT.init_Table_Spherical_Bessel (2, 3, Lmax_used, Lmax);
+	MOT.init_Table_Spherical_Bessel (2, 3, Lmax_used, Lmax, Exx_Abfs::Lmax);
 
 	Ylm::set_coefficients ();
 
diff --git a/ABACUS.develop/source/src_lcao/ORB_control.cpp b/ABACUS.develop/source/src_lcao/ORB_control.cpp
index c2b87ec409..fc02da0b04 100644
--- a/ABACUS.develop/source/src_lcao/ORB_control.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_control.cpp
@@ -15,7 +15,8 @@ ORB_control::~ORB_control()
 void ORB_control::set_orb_tables(
 	ORB_gen_tables &OGT, 
 	LCAO_Orbitals &orb,
-	const double &lat0)
+	const double &lat0,
+	const int &Lmax_exx)
 {
     TITLE("ORB_control","set_orb_tables");
 	timer::tick("ORB_control","set_orb_tables",'B');
@@ -42,7 +43,7 @@ void ORB_control::set_orb_tables(
     // 1: generate overlap table
     // 2: generate kinetic table
     // 3: generate overlap & kinetic table
-    OGT.gen_tables(job0, orb);
+    OGT.gen_tables(job0, orb, Lmax_exx);
     // init lat0, in order to interpolated value from this table.
 
 	assert(lat0>0.0);
diff --git a/ABACUS.develop/source/src_lcao/ORB_control.h b/ABACUS.develop/source/src_lcao/ORB_control.h
index ea3f9ddb58..7fdc9d6779 100644
--- a/ABACUS.develop/source/src_lcao/ORB_control.h
+++ b/ABACUS.develop/source/src_lcao/ORB_control.h
@@ -16,7 +16,8 @@ class ORB_control
     void set_orb_tables(
 		ORB_gen_tables &OGT, 
 		LCAO_Orbitals &orb,
-		const double &lat0);
+		const double &lat0,
+		const int &Lmax_exx);
 
     void clear_after_ions(
 		ORB_gen_tables &OGT, 
diff --git a/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp b/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp
index 35bb240129..13afa1b0a3 100644
--- a/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp
@@ -10,7 +10,10 @@ ORB_gen_tables::ORB_gen_tables(){}
 ORB_gen_tables::~ORB_gen_tables(){}
 
 // call in hamilt_linear::init_before_ions.
-void ORB_gen_tables::gen_tables( const int &job0, LCAO_Orbitals &orb )
+void ORB_gen_tables::gen_tables( 
+	const int &job0, 
+	LCAO_Orbitals &orb, 
+	const int &Lmax_exx)
 {
 	TITLE("ORB_gen_tables","gen_tables");
 	timer::tick("ORB_gen_tables","gen_tables",'C');
@@ -68,16 +71,23 @@ void ORB_gen_tables::gen_tables( const int &job0, LCAO_Orbitals &orb )
 	//liaochen add 2010/4/29
 	Ylm::set_coefficients ();
 
+	// PLEASE add explanations for all options of 'orb_num' and 'mode'
+	// mohan add 2021-04-03
 	// Peize Lin update 2016-01-26
-	int Lmax_used, Lmax;
-	MOT.init_Table_Spherical_Bessel (2,1, Lmax_used, Lmax);
+	int orb_num=2; //
+	int mode=1; // 1: <phi|phi> and <phi|beta>
+	int Lmax_used=0;
+	int Lmax=0;
+
+	MOT.init_Table_Spherical_Bessel (orb_num, mode, Lmax_used, Lmax, Lmax_exx);
 	
 	//calculate S(R) for interpolation
 	MOT.init_Table(job0, orb);
 	tbeta.init_Table_Beta( MOT.pSB );// add 2009-5-8
 
 	//caoyu add 2021-03-18
-	if (INPUT.out_descriptor && BASIS_TYPE == "lcao") {
+	if (INPUT.out_descriptor && BASIS_TYPE == "lcao") 
+	{
 		talpha.init_Table_Alpha(MOT.pSB);
 		talpha.print_Table_DSR();	
 	}
diff --git a/ABACUS.develop/source/src_lcao/ORB_gen_tables.h b/ABACUS.develop/source/src_lcao/ORB_gen_tables.h
index 9b6f383740..176c2e5cf2 100644
--- a/ABACUS.develop/source/src_lcao/ORB_gen_tables.h
+++ b/ABACUS.develop/source/src_lcao/ORB_gen_tables.h
@@ -22,7 +22,11 @@ class ORB_gen_tables
 	ORB_gen_tables();
 	~ORB_gen_tables();
 
-	void gen_tables( const int &job0, LCAO_Orbitals &orb);
+	void gen_tables( 
+		const int &job0, 
+		LCAO_Orbitals &orb,
+		const int &Lmax_exx);
+
 	void set_unit( const double &v ){lat0=v;}
 	
 	void snap_psipsi(
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_phi.cpp b/ABACUS.develop/source/src_lcao/ORB_table_phi.cpp
index 41d5a22d46..c79d52f091 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_phi.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_table_phi.cpp
@@ -1,7 +1,5 @@
 #include <stdexcept>
 #include "ORB_table_phi.h"
-#include "ORB_read.h"
-#include "../src_ri/exx_abfs.h"
 #include "../src_global/math_integral.h"
 
 double ORB_table_phi::dr = -1.0;
@@ -767,7 +765,8 @@ void ORB_table_phi::init_Lmax (
 	const int orb_num, 
 	const int mode, 
 	int &Lmax_used, 
-	int &Lmax) const
+	int &Lmax,
+	const int &Lmax_exx) const
 {
 	auto cal_Lmax_Phi = [](int &Lmax)
 	{
@@ -805,7 +804,7 @@ void ORB_table_phi::init_Lmax (
 					Lmax_used = 2*Lmax + 1;
 					break;
 				case 2:			// used in <jY|jY> or <Abfs|Abfs>
-					Lmax = max(Lmax, Exx_Abfs::Lmax);
+					Lmax = max(Lmax, Lmax_exx);
 					Lmax_used = 2*Lmax + 1;
 					break;
 				case 3:                // used in berryphase by jingan
@@ -824,8 +823,8 @@ void ORB_table_phi::init_Lmax (
 				case 1:			// used in <jY|PhiPhi> or <Abfs|PhiPhi>
 					cal_Lmax_Phi(Lmax);
 					Lmax_used = 2*Lmax + 1;
-					Lmax = max(Lmax, Exx_Abfs::Lmax);
-					Lmax_used += Exx_Abfs::Lmax;
+					Lmax = max(Lmax, Lmax_exx);
+					Lmax_used += Lmax_exx;
 					break;
 				default:
 					throw invalid_argument("ORB_table_phi::init_Lmax orb_num=3, mode error");
@@ -853,11 +852,16 @@ void ORB_table_phi::init_Lmax (
 }
 
 // Peize Lin update 2016-01-26
-void ORB_table_phi::init_Table_Spherical_Bessel (const int orb_num, const int mode, int &Lmax_used, int &Lmax)
+void ORB_table_phi::init_Table_Spherical_Bessel (
+	const int orb_num, 
+	const int mode, 
+	int &Lmax_used, 
+	int &Lmax,
+	const int &Lmax_exx)
 {
 	TITLE("ORB_table_phi", "init_Table_Spherical_Bessel");
 
-	this->init_Lmax (orb_num,mode,Lmax_used,Lmax);		// Peize Lin add 2016-01-26
+	this->init_Lmax (orb_num,mode,Lmax_used,Lmax,Lmax_exx);		// Peize Lin add 2016-01-26
 
 	for( auto & sb : Sph_Bessel_Recursive_Pool::D2::sb_pool )
 	{
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_phi.h b/ABACUS.develop/source/src_lcao/ORB_table_phi.h
index acc68853d0..57553a2d31 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_phi.h
+++ b/ABACUS.develop/source/src_lcao/ORB_table_phi.h
@@ -50,13 +50,15 @@ class ORB_table_phi
 		const int orb_num, 
 		const int mode, 
 		int &Lmax_used, 
-		int &Lmax) const;
+		int &Lmax,
+		const int &Lmax_exx) const;
 
 	void init_Table_Spherical_Bessel(
 		const int orb_num, 
 		const int mode, 
 		int &Lmax_used, 
-		int &Lmax);
+		int &Lmax,
+		const int &Lmax_exx);
 
 	// Peize Lin add 2017-04-24, and change all jlx in this class
 	Sph_Bessel_Recursive::D2* pSB = nullptr;
diff --git a/ABACUS.develop/source/src_ri/exx_abfs-matrix_lcaoslcaos_lcaoslcaos.cpp b/ABACUS.develop/source/src_ri/exx_abfs-matrix_lcaoslcaos_lcaoslcaos.cpp
index 763692cab3..7f1626c038 100644
--- a/ABACUS.develop/source/src_ri/exx_abfs-matrix_lcaoslcaos_lcaoslcaos.cpp
+++ b/ABACUS.develop/source/src_ri/exx_abfs-matrix_lcaoslcaos_lcaoslcaos.cpp
@@ -21,7 +21,7 @@ void Exx_Abfs::Matrix_Lcaoslcaos_Lcaoslcaos::init(
 //		ORB.get_dk() / kmesh_times);				// delta k, for integration in k space
 		ORB.get_dk());											// Peize Lin change 2017-04-16
 	int Lmax_used, Lmax;
-	MOT.init_Table_Spherical_Bessel (4,mode, Lmax_used, Lmax);
+	MOT.init_Table_Spherical_Bessel (4,mode, Lmax_used, Lmax, Exx_Abfs::Lmax);
 //	MOT.init_OV_Tpair();							// for MOT.OV_L2plus1
 //	MOT.Destroy_Table_Spherical_Bessel (Lmax_used);				// why?
 
diff --git a/ABACUS.develop/source/src_ri/exx_abfs-matrix_orbs11.cpp b/ABACUS.develop/source/src_ri/exx_abfs-matrix_orbs11.cpp
index b4b11c5e0c..dbb966a44d 100644
--- a/ABACUS.develop/source/src_ri/exx_abfs-matrix_orbs11.cpp
+++ b/ABACUS.develop/source/src_ri/exx_abfs-matrix_orbs11.cpp
@@ -36,7 +36,7 @@ void Exx_Abfs::Matrix_Orbs11::init(
 //ofs<<"TIME@Exx_Abfs::Matrix_Orbs11::init::MOT.allocate\t"<<time_during(t_start)<<endl;
 	int Lmax_used, Lmax;
 //gettimeofday( &t_start, NULL);
-	MOT.init_Table_Spherical_Bessel (2, mode, Lmax_used, Lmax);
+	MOT.init_Table_Spherical_Bessel (2, mode, Lmax_used, Lmax, Exx_Abfs::Lmax);
 //	MOT.init_OV_Tpair();							// for MOT.OV_L2plus1
 //	MOT.Destroy_Table_Spherical_Bessel (Lmax_used);				// why?
 //ofs<<"TIME@Exx_Abfs::Matrix_Orbs11::init::MOT.init_Table_Spherical_Bessel\t"<<time_during(t_start)<<endl;
diff --git a/ABACUS.develop/source/src_ri/exx_abfs-matrix_orbs21.cpp b/ABACUS.develop/source/src_ri/exx_abfs-matrix_orbs21.cpp
index c270afc72b..37ceea7394 100644
--- a/ABACUS.develop/source/src_ri/exx_abfs-matrix_orbs21.cpp
+++ b/ABACUS.develop/source/src_ri/exx_abfs-matrix_orbs21.cpp
@@ -37,7 +37,7 @@ void Exx_Abfs::Matrix_Orbs21::init(
 //ofs<<"TIME@Exx_Abfs::Matrix_Orbs21::init::MOT.allocate\t"<<time_during(t_start)<<endl;
 	int Lmax_used, Lmax;
 //gettimeofday( &t_start, NULL);
-	MOT.init_Table_Spherical_Bessel (3,mode, Lmax_used, Lmax);
+	MOT.init_Table_Spherical_Bessel (3,mode, Lmax_used, Lmax, Exx_Abfs::Lmax);
 //	MOT.init_OV_Tpair();							// for MOT.OV_L2plus1
 //	MOT.Destroy_Table_Spherical_Bessel (Lmax_used);				// why?
 //ofs<<"TIME@Exx_Abfs::Matrix_Orbs21::init::MOT.init_Table_Spherical_Bessel\t"<<time_during(t_start)<<endl;
diff --git a/ABACUS.develop/source/src_ri/exx_abfs-matrix_orbs22.cpp b/ABACUS.develop/source/src_ri/exx_abfs-matrix_orbs22.cpp
index ce9da0d84e..af048d02d0 100644
--- a/ABACUS.develop/source/src_ri/exx_abfs-matrix_orbs22.cpp
+++ b/ABACUS.develop/source/src_ri/exx_abfs-matrix_orbs22.cpp
@@ -32,7 +32,7 @@ gettimeofday( &t_start, NULL);
 //		ORB.get_dk() / kmesh_times);				// delta k, for integration in k space
 		ORB.get_dk());											// Peize Lin change 2017-04-16
 	int Lmax_used, Lmax;
-	MOT.init_Table_Spherical_Bessel (4,mode, Lmax_used, Lmax);
+	MOT.init_Table_Spherical_Bessel (4,mode, Lmax_used, Lmax, Exx_Abfs::Lmax);
 //	MOT.init_OV_Tpair();							// for MOT.OV_L2plus1
 //	MOT.Destroy_Table_Spherical_Bessel (Lmax_used);				// why?
 

From aba7fa1515b5f9a9277dc458fabe57d5a13f88f4 Mon Sep 17 00:00:00 2001
From: qianrui <Terry_Liu@pku.edu.cn>
Date: Sat, 3 Apr 2021 21:15:41 +0800
Subject: [PATCH 24/60] fix a bug in h_psi for NSPIN!=4

---
 ABACUS.develop/source/src_pw/hamilt_pw.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ABACUS.develop/source/src_pw/hamilt_pw.cpp b/ABACUS.develop/source/src_pw/hamilt_pw.cpp
index b0de0211df..2a1181ace7 100644
--- a/ABACUS.develop/source/src_pw/hamilt_pw.cpp
+++ b/ABACUS.develop/source/src_pw/hamilt_pw.cpp
@@ -706,7 +706,7 @@ void Hamilt_PW::add_vuspsi(complex<double> *hpsi_in,const complex<double> *becp,
 					}//end ib
                 }// end ih
             }//end jh
-		 	sum += 2 * Nprojs;
+		 	sum += Nprojs;
 			++iat;
         } //end na
     } //end nt

From 4c6ff81b8762eb85ef5ece09c74d651da662a7aa Mon Sep 17 00:00:00 2001
From: maki49 <1579492865@qq.com>
Date: Sat, 3 Apr 2021 21:54:08 +0800
Subject: [PATCH 25/60] out lcao-line descriptor

---
 ABACUS.develop/source/Makefile.Objects        |   2 +
 .../source/src_lcao/LCAO_descriptor.cpp       | 426 ++++++++++++++++++
 .../source/src_lcao/LCAO_descriptor.h         |  31 ++
 ABACUS.develop/source/src_lcao/LOOP_ions.cpp  |   9 +
 4 files changed, 468 insertions(+)
 create mode 100644 ABACUS.develop/source/src_lcao/LCAO_descriptor.cpp
 create mode 100644 ABACUS.develop/source/src_lcao/LCAO_descriptor.h

diff --git a/ABACUS.develop/source/Makefile.Objects b/ABACUS.develop/source/Makefile.Objects
index fe41bb5ab8..86fadfdbd4 100644
--- a/ABACUS.develop/source/Makefile.Objects
+++ b/ABACUS.develop/source/Makefile.Objects
@@ -129,6 +129,7 @@ ORB_nonlocal_lm.o\
 ORB_gaunt_table.o\
 ORB_table_beta.o\
 ORB_table_phi.o\
+ORB_table_alpha.o\
 ORB_gen_tables.o\
 local_orbital_wfc.o\
 local_orbital_charge.o\
@@ -152,6 +153,7 @@ LCAO_matrix.o\
 LCAO_nnr.o \
 LCAO_diago.o\
 LCAO_evolve.o\
+LCAO_descriptor.o\
 ylm.o\
 FORCE_STRESS.o\
 FORCE_gamma.o\
diff --git a/ABACUS.develop/source/src_lcao/LCAO_descriptor.cpp b/ABACUS.develop/source/src_lcao/LCAO_descriptor.cpp
new file mode 100644
index 0000000000..3dd7320c61
--- /dev/null
+++ b/ABACUS.develop/source/src_lcao/LCAO_descriptor.cpp
@@ -0,0 +1,426 @@
+//caoyu add 2021-03-29
+#include "LCAO_descriptor.h"
+#include "LCAO_matrix.h"
+#include "../src_global/lapack_connector.h"
+#include "../src_global/intarray.h"
+#include "../src_global/complexmatrix.h"
+#include "global_fp.h"
+#include "../src_pw/global.h"
+#include "../src_io/winput.h"
+
+LCAO_Descriptor::LCAO_Descriptor()
+    // Give every owned array a one-element placeholder, so the delete[] that
+    // precedes each later re-allocation (and the destructor) is always valid.
+    : S_mu_alpha(new double[1]), PDM(new double[1]), d(new double[1]),
+      mu_index(new IntArray[1])
+{
+}
+LCAO_Descriptor::~LCAO_Descriptor() // frees the four arrays this object owns
+{
+    delete[] d;
+    delete[] mu_index;
+    delete[] PDM;
+    delete[] S_mu_alpha;
+}
+
+// Build S_mu_alpha: the overlap between the LCAO basis phi (rows, NLOCAL of
+// them) and the descriptor basis alpha (columns, n_descriptor of them).
+//
+// Side effects: recomputes des_per_atom and n_descriptor, re-allocates and
+// zeroes S_mu_alpha, and rebuilds mu_index through init_mu_index().
+//
+// calc_deri: false -> store the overlap returned by UOT.snap_psialpha;
+//            true  -> derivative, not implemented yet (nothing is stored).
+// Only the gamma-only case is supported: otherwise the function ends in
+// WARNING_QUIT.
+void LCAO_Descriptor::build_S_descriptor(const bool &calc_deri)
+{
+    TITLE("LCAO_Descriptor", "build_S_descriptor");
+
+    // =======init==============
+    // cal n(descriptor) per atom , related to Lmax, nchi(L) and m. (not total_nchi!)
+    // reset first, so that a second call does not double the count
+    this->des_per_atom = 0;
+    for (int l = 0; l <= ORB.get_lmax_d(); l++)
+    {
+        this->des_per_atom += ORB.Alpha[0].getNchi(l) * (2 * l + 1);
+    }
+    this->n_descriptor = ucell.nat * this->des_per_atom;
+
+    // set_S_mu_alpha() addresses an NLOCAL x n_descriptor matrix and accumulates
+    // with +=, so the buffer needs that size and has to start from zero
+    const long Ssize = static_cast<long>(NLOCAL) * this->n_descriptor;
+    delete[] S_mu_alpha;
+    S_mu_alpha = new double[Ssize]();
+
+    this->init_mu_index();
+    // =======init==============
+
+    //array to store data
+    double olm[3] = {0.0, 0.0, 0.0};
+
+    // visit every atom (T1, I1) and each entry ad of its GridD adjacency list
+    Vector3<double> tau1, tau2, dtau;
+    for (int T1 = 0; T1 < ucell.ntype; ++T1)
+    {
+        Atom *atom1 = &ucell.atoms[T1];
+        for (int I1 = 0; I1 < atom1->na; ++I1)
+        {
+            tau1 = atom1->tau[I1];
+            GridD.Find_atom(tau1, T1, I1);
+            // "+ 1" is kept as it was; presumably the extra entry is the atom itself
+            for (int ad = 0; ad < GridD.getAdjacentNum() + 1; ++ad)
+            {
+                const int T2 = GridD.getType(ad);
+                const int I2 = GridD.getNatom(ad);
+                tau2 = GridD.getAdjacentTau(ad);
+                dtau = tau2 - tau1;
+                const double distance = dtau.norm() * ucell.lat0;
+                //Rcut is subject to ORB.Phi to keep dimension of S_mu_alpha same as Sloc
+                const double rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut();
+                if (distance >= rcut)
+                {
+                    continue;
+                }
+
+                // position of atom (T2, I2) among all atoms of the cell; its first
+                // column is iatom * des_per_atom for every row, hence computed once
+                int iatom = I2;
+                for (int it = 0; it < T2; it++)
+                {
+                    iatom += ucell.atoms[it].na;
+                }
+                const int iw2_start = iatom * this->des_per_atom;
+
+                // rows: the orbitals of atom (T1, I1), NPOL consecutive rows per orbital
+                int iw1_all = ucell.itiaiw2iwt(T1, I1, 0); //iw1_all = combined index (it, ia, iw)
+                for (int jj = 0; jj < atom1->nw * NPOL; ++jj)
+                {
+                    const int jj0 = jj / NPOL;
+                    const int L1 = atom1->iw2l[jj0];
+                    const int N1 = atom1->iw2n[jj0];
+                    const int m1 = atom1->iw2m[jj0];
+
+                    int iw2_all = iw2_start;
+                    // same L range as the des_per_atom sum above, so that exactly
+                    // des_per_atom columns are visited for this atom
+                    for (int L2 = 0; L2 <= ORB.get_lmax_d(); ++L2)
+                    {
+                        for (int N2 = 0; N2 < ORB.Alpha[0].getNchi(L2); ++N2)
+                        {
+                            for (int m2 = 0; m2 < 2 * L2 + 1; ++m2)
+                            {
+                                olm[0] = olm[1] = olm[2] = 0.0;
+
+                                if (!calc_deri)
+                                {
+                                    UOT.snap_psialpha(olm, 0, tau1,
+                                                      T1, L1, m1, N1, tau2,
+                                                      T2, L2, m2, N2);
+
+                                    if (GAMMA_ONLY_LOCAL)
+                                    {
+                                        this->set_S_mu_alpha(iw1_all, iw2_all, olm[0]);
+                                    }
+                                }
+                                // derivation (calc_deri == true) will be needed in next step
+                                ++iw2_all;
+                            } //m2
+                        }     //N2
+                    }         //L2
+                    ++iw1_all;
+                } // nw1
+            }     // ad
+        }         // I1
+    }             // T1
+
+    if (!GAMMA_ONLY_LOCAL)
+    {
+        WARNING_QUIT("LCAO_Descriptor::build_S_descriptor", "muti-kpoint method for descriptor is not implemented yet! ");
+    }
+
+    return;
+}
+
+// Accumulate v into element (iw1_all, iw2_all) of S_mu_alpha.
+//   iw1_all: row index, an LCAO orbital
+//   iw2_all: column index, a descriptor basis function
+void LCAO_Descriptor::set_S_mu_alpha(const int &iw1_all, const int &iw2_all, const double &v)
+{
+    // no parallel distribution yet: the global indices are used as they are,
+    // instead of ParaO.trace_loc_row[iw1_all] / ParaO.trace_loc_col[iw2_all]
+    const int row = iw1_all;
+    const int col = iw2_all;
+
+    // these two solvers need the matrix saved in column-major format
+    const bool column_major = (KS_SOLVER == "genelpa" || KS_SOLVER == "scalapack_gvx");
+
+    // otherwise row-major (row: lcao orbitals; col: descriptor basis);
+    // the products are formed in int and then widened, as before
+    const long index = column_major ? col * NLOCAL + row
+                                    : row * this->n_descriptor + col;
+
+    this->S_mu_alpha[index] += v;
+
+    return;
+}
+
+// Compute the projected density matrix (n_descriptor x n_descriptor)
+//   PDM = S_mu_alpha^T * inv(Sloc)^T * DM * inv(Sloc) * S_mu_alpha
+// from the S_mu_alpha left by build_S_descriptor(), LM.Sloc and LOC.DM[0].
+// Ends in WARNING_QUIT when Sloc cannot be inverted.
+void LCAO_Descriptor::cal_projective_DM()
+{
+    //step 1: cal inv of Sloc (Cholesky factorization dpotrf, then dpotri)
+    const long nloc2 = static_cast<long>(NLOCAL) * NLOCAL;
+    double *sinv = new double[nloc2]; //size :NLOCAL*NLOCAL
+    for (int i = 0; i < NLOCAL; i++)
+    {
+        for (int j = 0; j < NLOCAL; j++)
+        {
+            sinv[i * NLOCAL + j] = LM.Sloc[i * ParaO.ncol + j]; // NOTE(review): assumes ParaO.ncol == NLOCAL (no distribution) - confirm
+        }
+    }
+    int info = 0;
+    const char uplo = 'L';
+    dpotrf_(&uplo, &NLOCAL, sinv, &NLOCAL, &info);
+    if (info == 0) // otherwise dpotri would overwrite the error code of dpotrf
+    {
+        dpotri_(&uplo, &NLOCAL, sinv, &NLOCAL, &info);
+    }
+    if (info != 0)
+    {
+        cout << "info = " << info << endl;
+        WARNING_QUIT("LCAO_Descriptor", "Something wrong in calculating inverse of Sloc!");
+    }
+    // dpotri fills only the 'L' triangle, i.e. sinv[c * NLOCAL + r] with r >= c;
+    // mirror it into the other one, because dgemm reads the whole matrix
+    for (int c = 0; c < NLOCAL; c++)
+    {
+        for (int r = c + 1; r < NLOCAL; r++)
+        {
+            sinv[r * NLOCAL + c] = sinv[c * NLOCAL + r];
+        }
+    }
+
+    //step 2: get lcao density matrix as array
+    double *dm = new double[nloc2]; //size :NLOCAL*NLOCAL
+    for (int i = 0; i < NLOCAL; i++)
+    {
+        for (int j = 0; j < NLOCAL; j++)
+        {
+            dm[i * NLOCAL + j] = LOC.DM[0][i][j]; //only consider default NSPIN = 1
+        }
+    }
+
+    //step 3: S_mu_alpha, in the layout chosen by set_S_mu_alpha():
+    // column-major -> Fortran sees S   (NLOCAL x n_descriptor), leading dimension NLOCAL
+    // row-major    -> Fortran sees S^T (n_descriptor x NLOCAL), leading dimension n_descriptor
+    double *ss = this->S_mu_alpha;
+    const bool col_major = (KS_SOLVER == "genelpa" || KS_SOLVER == "scalapack_gvx");
+    const char t = 'T';  //transpose
+    const char nt = 'N'; //non transpose
+    const char op_s = col_major ? nt : t;  // op that yields S
+    const char op_st = col_major ? t : nt; // op that yields S^T
+    int ld_ss = col_major ? NLOCAL : this->n_descriptor;
+
+    //step 4 : multiply
+    // two NLOCAL x n_descriptor buffers alternate: the products do not fit in PDM
+    const long tmp_size = static_cast<long>(NLOCAL) * this->n_descriptor;
+    double *tmp1 = new double[tmp_size];
+    double *tmp2 = new double[tmp_size];
+    const long PDM_size = static_cast<long>(this->n_descriptor) * this->n_descriptor;
+    delete[] this->PDM;
+    this->PDM = new double[PDM_size];
+
+    const double alpha = 1;
+    const double beta = 0;
+    dgemm_(&nt, &op_s, &NLOCAL, &n_descriptor, &NLOCAL, &alpha, sinv, &NLOCAL, ss, &ld_ss, &beta, tmp1, &NLOCAL); //inv(S)*SS
+    dgemm_(&nt, &nt, &NLOCAL, &n_descriptor, &NLOCAL, &alpha, dm, &NLOCAL, tmp1, &NLOCAL, &beta, tmp2, &NLOCAL); //DM*inv(S)*SS
+    dgemm_(&t, &nt, &NLOCAL, &n_descriptor, &NLOCAL, &alpha, sinv, &NLOCAL, tmp2, &NLOCAL, &beta, tmp1, &NLOCAL); //inv(S)^T*DM*inv(S)*SS
+    dgemm_(&op_st, &nt, &n_descriptor, &n_descriptor, &NLOCAL, &alpha, ss, &ld_ss, tmp1, &NLOCAL, &beta, this->PDM, &n_descriptor); //SS^T*inv(S)^T*DM*inv(S)*SS
+
+    delete[] tmp2;
+    delete[] tmp1;
+    delete[] dm;
+    delete[] sinv;
+}
+
+// Compute the descriptors d: for every atom and every (l, n), take the (2l+1) x
+// (2l+1) block of PDM, print it to projective_DM.dat and append its eigenvalues
+// to d; finally call print_descriptor(). Needs PDM from cal_projective_DM().
+void LCAO_Descriptor::cal_descriptor()
+{
+    delete[] d;
+    d = new double[this->n_descriptor];
+    //==========print preparation=============
+    ofs_running << " print out each DM_Inl" << endl;
+    ofstream ofs;
+    stringstream ss;
+    ss << winput::spillage_outdir << "/"
+       << "projective_DM.dat";
+    if (MY_RANK == 0) // the file is not opened on the other ranks
+    {
+        ofs.open(ss.str().c_str());
+    }
+    //==========print preparation=============
+    const int lmax = ORB.get_lmax_d();
+    int id = 0; // next free position in d
+    for (int it = 0; it < ucell.ntype; it++)
+    {
+        for (int ia = 0; ia < ucell.atoms[it].na; ia++)
+        {
+            ofs << ucell.atoms[it].label << " atom_index " << ia + 1 << " n_descriptor " << this->des_per_atom << endl;
+            for (int l = 0; l <= lmax; l++)
+            {
+                int nmax = ORB.Alpha[0].getNchi(l);
+                for (int n = 0; n < nmax; n++)
+                {
+                    const int dim = 2 * l + 1;
+                    // descriptor for atom (it, ia)
+                    ComplexMatrix des(dim, dim);
+                    for (int m = 0; m < dim; m++)
+                    {
+                        const int ii = mu_index[it](ia, l, n, m);
+                        for (int m2 = 0; m2 < dim; m2++)
+                        {
+                            const int jj = mu_index[it](ia, l, n, m2);
+                            long index = ii * this->n_descriptor + jj;
+                            des(m, m2) = this->PDM[index];
+                        }
+                    }
+
+                    this->print_projective_DM(ofs, des, it, ia, l, n);
+                    if (l == 0)
+                    {
+                        this->d[id] = des(0, 0).real();
+                        ++id;
+                    }
+                    else
+                    {
+                        // diagonalize, assuming that the des matrix is Hermitian
+                        char jobz = 'N'; // eigenvalues only
+                        char uplo = 'U'; // upper matrix is stored
+                        int ndim = des.nr;
+                        double *tmpd = new double[ndim]();
+                        const int lwork = 2 * ndim;
+                        complex<double> *work = new complex<double>[lwork]();
+                        double *rwork = new double[3 * ndim - 2]();
+                        int infor = 0;
+                        LapackConnector::zheev(jobz, uplo, ndim, des, ndim, tmpd, work, lwork, rwork, &infor);
+                        // zheev reports failure through infor; do not store unusable eigenvalues
+                        if (infor != 0)
+                        {
+                            cout << "info = " << infor << endl;
+                            WARNING_QUIT("LCAO_Descriptor", "Something wrong in diagonalizing the projective DM!");
+                        }
+                        // put the eigenvalues into d (descriptor)
+                        for (int idim = 0; idim < ndim; ++idim)
+                        {
+                            this->d[id] = tmpd[idim];
+                            ++id;
+                        }
+                        delete[] tmpd;
+                        delete[] rwork;
+                        delete[] work;
+                    }
+                } //n
+            }     //l
+        }         //ia
+    }             //it
+    if (ofs)
+        ofs.close();
+    this->print_descriptor();
+}
+
+// Fill mu_index[it](ia, l, n, m) with a running index over all descriptor
+// basis functions, numbered in the order type, atom, l, n, m.
+void LCAO_Descriptor::init_mu_index(void)
+{
+    ofs_running << " Initialize the mu index for deepks (lcao line)" << endl;
+    const int l_top = ORB.get_lmax_d();
+    const int n_top = ORB.get_nchimax_d();
+    assert(l_top >= 0);
+    assert(n_top >= 0);
+    ofs_running << " lmax = " << l_top << endl;
+    ofs_running << " nmax = " << n_top << endl;
+
+    delete[] this->mu_index;
+    this->mu_index = new IntArray[ucell.ntype];
+
+    int counter = 0;
+    for (int it = 0; it < ucell.ntype; ++it)
+    {
+        const int n_atoms = ucell.atoms[it].na;
+        IntArray &table = this->mu_index[it];
+        // dimensions: atom, l (starts from 0), n, m (2*lmax+1 slots)
+        table.create(n_atoms, l_top + 1, n_top, 2 * l_top + 1);
+
+        ofs_running << "Type " << it + 1
+                    << " number_of_atoms " << n_atoms << endl;
+
+        for (int ia = 0; ia < n_atoms; ++ia)
+        {
+            for (int l = 0; l <= l_top; ++l)
+            {
+                const int m_count = 2 * l + 1;
+                for (int n = 0; n < ORB.Alpha[0].getNchi(l); ++n)
+                {
+                    for (int m = 0; m < m_count; ++m)
+                    {
+                        table(ia, l, n, m) = counter++;
+                    }
+                }
+            }
+        }
+    }
+    // the total has to agree with n_descriptor, set in build_S_descriptor()
+    assert(this->n_descriptor == counter);
+    ofs_running << "descriptors_per_atom " << this->des_per_atom << endl;
+    ofs_running << "total_descriptors " << this->n_descriptor << endl;
+}
+
+// Print the real parts of block des to ofs under an "L= N=" title; it and ia are not used.
+void LCAO_Descriptor::print_projective_DM(ofstream &ofs, ComplexMatrix &des, const int &it, const int &ia, const int &l, const int &n)
+{
+    ofs << "L=" << l << "   N=" << n << endl;
+    for (int row = 0; row <= 2 * l; ++row)
+    {
+        for (int col = 0; col <= 2 * l; ++col)
+        {
+            ofs << des(row, col).real() << " ";
+        }
+        ofs << endl;
+    }
+}
+// Write all descriptors d to descriptor.dat, atom by atom, 8 values per line.
+void LCAO_Descriptor::print_descriptor()
+{
+    TITLE("LCAO_Descriptor", "print_descriptor");
+    // the parameter 'winput::spillage_outdir' is read from INPUTw.
+    stringstream path;
+    path << winput::spillage_outdir << "/"
+         << "descriptor.dat";
+    ofstream out;
+    if (MY_RANK == 0)
+    {
+        out.open(path.str().c_str());
+    }
+    for (int it = 0; it < ucell.ntype; ++it)
+    {
+        for (int ia = 0; ia < ucell.atoms[it].na; ++ia)
+        {
+            out << ucell.atoms[it].label << " atom_index " << ia + 1 << " n_descriptor " << this->des_per_atom << endl;
+            const int first = this->mu_index[it](ia, 0, 0, 0);
+            for (int k = 0; k < this->des_per_atom; ++k)
+            {
+                if (k > 0 && k % 8 == 0) // start a new line after every 8 values
+                    out << endl;
+                out << d[first + k] << " ";
+            }
+            out << endl;
+        }
+        out << endl;
+    }
+    ofs_running << "descriptors are printed" << endl;
+}
diff --git a/ABACUS.develop/source/src_lcao/LCAO_descriptor.h b/ABACUS.develop/source/src_lcao/LCAO_descriptor.h
new file mode 100644
index 0000000000..218d56738b
--- /dev/null
+++ b/ABACUS.develop/source/src_lcao/LCAO_descriptor.h
@@ -0,0 +1,31 @@
+//caoyu add 2021-03-29
+#include "../src_global/intarray.h"
+#include "../src_global/complexmatrix.h"
+
+#ifndef LCAO_MATRIX_DESCRIPTOR_H
+#define LCAO_MATRIX_DESCRIPTOR_H
+
+class LCAO_Descriptor // descriptors from the density matrix projected on the Alpha basis
+{
+public:
+    LCAO_Descriptor();
+    ~LCAO_Descriptor();
+    LCAO_Descriptor(const LCAO_Descriptor &) = delete;            // the object owns raw arrays:
+    LCAO_Descriptor &operator=(const LCAO_Descriptor &) = delete; // a copy would delete[] them twice
+    void build_S_descriptor(const bool &calc_deri); //cal S_alpha_mu: overlap between lcao basis Phi and descriptor basis Alpha
+    void cal_projective_DM();                       //cal PDM: S_alpha_mu * inv(Sloc) * DM * inv(Sloc) * S_nu_beta
+    void cal_descriptor();                          //cal d: EIGENVALUE of PDM in block of I_n_l
+    void print_descriptor();                        //write d to descriptor.dat
+private:
+    double *S_mu_alpha; //overlap between lcao and descriptor basis
+    double *PDM;        //projective density matrix
+    double *d;          //descriptors
+    int n_descriptor = 0; //total number of descriptors: ucell.nat * des_per_atom
+    int des_per_atom = 0; //\sum_L{Nchi(L)*(2L+1)}
+    IntArray *mu_index;   //one table per atom type: (ia, l, n, m) -> descriptor index
+    void init_mu_index(void);
+    void set_S_mu_alpha(const int &iw1_all, const int &iw2_all, const double &v);
+    void print_projective_DM(ofstream &ofs, ComplexMatrix &des, const int &it, const int &ia, const int &l, const int &n);
+};
+
+#endif
\ No newline at end of file
diff --git a/ABACUS.develop/source/src_lcao/LOOP_ions.cpp b/ABACUS.develop/source/src_lcao/LOOP_ions.cpp
index 8e48f8eec0..e9c942bbdb 100644
--- a/ABACUS.develop/source/src_lcao/LOOP_ions.cpp
+++ b/ABACUS.develop/source/src_lcao/LOOP_ions.cpp
@@ -13,6 +13,7 @@
 #include "ELEC_scf.h"
 #include "src_global/sltk_atom_arrange.h"
 #include "src_pw/vdwd2.h"
+#include "LCAO_descriptor.h"
 
 LOOP_ions::LOOP_ions()
 {}
@@ -145,6 +146,14 @@ void LOOP_ions::opt_ions(void)
 		{
 			this->output_HS_R(); //LiuXh add 2019-07-15
 		}
+        //caoyu add 2021-03-31
+        if (INPUT.out_descriptor)
+        {
+            LCAO_Descriptor ld;
+            ld.build_S_descriptor(0);  //derivation not needed yet
+            ld.cal_projective_DM();
+            ld.cal_descriptor();
+        }
 
         time_t fstart = time(NULL);
         if (CALCULATION=="scf" || CALCULATION=="relax" || CALCULATION=="cell-relax")

From 36c2c1a134de99809f65f480777abe1404072fae Mon Sep 17 00:00:00 2001
From: zdy <dyzheng@mail.ustc.edu.cn>
Date: Sat, 3 Apr 2021 23:08:17 +0800
Subject: [PATCH 26/60] add new input parameter read_file_dir

---
 ABACUS.develop/source/input.cpp               | 28 ++++++++++++++++---
 ABACUS.develop/source/input.h                 |  3 +-
 ABACUS.develop/source/input_conv.cpp          |  4 +--
 .../source/src_global/global_variable.cpp     |  1 +
 .../source/src_global/global_variable.h       |  1 +
 ABACUS.develop/source/src_io/write_input.cpp  |  3 +-
 ABACUS.develop/source/src_pw/potential.cpp    |  2 +-
 7 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/ABACUS.develop/source/input.cpp b/ABACUS.develop/source/input.cpp
index 91afa98ace..12b88882d3 100644
--- a/ABACUS.develop/source/input.cpp
+++ b/ABACUS.develop/source/input.cpp
@@ -115,6 +115,7 @@ void Input::Default(void)
     atom_file = "";//xiaohui modify 2015-02-01
     kpoint_file = "";//xiaohui modify 2015-02-01
     pseudo_dir = "";
+	read_file_dir = "auto";
     pseudo_type = "auto"; // mohan add 2013-05-20 (xiaohui add 2013-06-23)
 	wannier_card = "";
     latname = "test";
@@ -359,7 +360,7 @@ void Input::Default(void)
 	kernel_type="rpa";
 	eels_method=0;
 	absorption_method=0;
-	system="bulk";
+	system_type="bulk";
 	eta=0.05;
 	domega=0.01;
 	nomega=300;
@@ -949,6 +950,10 @@ bool Input::Read(const string &fn)
         {
             read_value(ifs, restart_mode);
         }
+		else if (strcmp("read_file_dir", word) == 0)
+		{
+			read_value(ifs, read_file_dir);
+		}
         else if (strcmp("start_wfc", word) == 0)
         {
             read_value(ifs, start_wfc);
@@ -1404,7 +1409,7 @@ bool Input::Read(const string &fn)
 	    }
 	    else if (strcmp("system", word) == 0)
 	    {
-	        read_value(ifs, system);
+	        read_value(ifs, system_type);
 	    }
 	    else if (strcmp("eta", word) == 0)
 	    {
@@ -2040,6 +2045,7 @@ void Input::Bcast()
     Parallel_Common::bcast_double( mixing_gg0 ); //mohan add 2014-09-27
 
     Parallel_Common::bcast_string( restart_mode );
+	Parallel_Common::bcast_string( read_file_dir);
     Parallel_Common::bcast_string( start_wfc );
 	Parallel_Common::bcast_int( mem_saver );
 	Parallel_Common::bcast_int( printe );
@@ -2161,7 +2167,7 @@ void Input::Bcast()
 	Parallel_Common::bcast_string( kernel_type );
 	Parallel_Common::bcast_int( eels_method );
 	Parallel_Common::bcast_int( absorption_method );
-    Parallel_Common::bcast_string( system );
+    Parallel_Common::bcast_string( system_type );
     Parallel_Common::bcast_double( eta );
     Parallel_Common::bcast_double( domega );
     Parallel_Common::bcast_int( nomega );
@@ -2815,7 +2821,7 @@ void Input::Check(void)
 	// pengfei 2016-12-14
 	if(spectral_type!="None")
 	{
-		if( system!="bulk" && system!="surface")
+		if( system_type!="bulk" && system_type!="surface")
 		{
 			WARNING_QUIT("Input","system must be bulk or surface");
 		}
@@ -2937,6 +2943,20 @@ void Input::Check(void)
 			}
 		}
 	}
+
+	const string ss = "test -d " + read_file_dir;
+	if(read_file_dir=="auto")
+	{
+		global_readin_dir = global_out_dir;
+	}
+	else if( system( ss.c_str() ))
+	{
+		WARNING_QUIT("Input","please set right files directory for reading in.");
+	}
+	else
+	{
+		global_readin_dir = read_file_dir + '/';
+	}
 	
     return;
 }
diff --git a/ABACUS.develop/source/input.h b/ABACUS.develop/source/input.h
index d0733bdba5..1181fadad4 100644
--- a/ABACUS.develop/source/input.h
+++ b/ABACUS.develop/source/input.h
@@ -29,6 +29,7 @@ class Input
     string suffix;			// suffix of out put dir
     string atom_file;		// file contains atomic positions -- xiaohui modify 2015-02-01
     string pseudo_dir;      // directory of pseudopotential
+	string read_file_dir;   // directory of files for reading
     string pseudo_type;     // the type of pseudopotential, mohan add 2013-05-20, ABACUS supports
 			    			// UPF format (default) and vwr format. (xiaohui add 2013-06-23)
     string kpoint_file;		// file contains k-points -- xiaohui modify 2015-02-01
@@ -296,7 +297,7 @@ class Input
 	int      absorption_method;      // 0: vasp's method  1: pwscf's method
 	//int		 epsilon_choice;         // 0: hilbert_transform method; 1: standard method
 	string   kernel_type;           // the kernel type: rpa, tdlda ...
-	string system;                 // bulk or surface
+	string system_type;                 // bulk or surface
 	double  eta;                   // unit(Ry)
 	double  domega;                // unit(Ry)
 	int     nomega;
diff --git a/ABACUS.develop/source/input_conv.cpp b/ABACUS.develop/source/input_conv.cpp
index 0cc6c51edb..f56cef2abd 100644
--- a/ABACUS.develop/source/input_conv.cpp
+++ b/ABACUS.develop/source/input_conv.cpp
@@ -274,7 +274,7 @@ void Input_Conv::Convert(void)
 		}
 		//chi0_hilbert.epsilon = INPUT.epsilon;
 		chi0_hilbert.kernel_type = INPUT.kernel_type;
-		chi0_hilbert.system = INPUT.system;
+		chi0_hilbert.system = INPUT.system_type;
 		chi0_hilbert.eta = INPUT.eta;
 		chi0_hilbert.domega = INPUT.domega;
 		chi0_hilbert.nomega = INPUT.nomega;
@@ -314,7 +314,7 @@ void Input_Conv::Convert(void)
 	{
 		//chi0_standard.epsilon = INPUT.epsilon;
 		chi0_standard.epsilon = true;
-		chi0_standard.system = INPUT.system;
+		chi0_standard.system = INPUT.system_type;
 		chi0_standard.eta = INPUT.eta;
 		chi0_standard.domega = INPUT.domega;
 		chi0_standard.nomega = INPUT.nomega;
diff --git a/ABACUS.develop/source/src_global/global_variable.cpp b/ABACUS.develop/source/src_global/global_variable.cpp
index e7d0a7c5de..7e1da67d44 100644
--- a/ABACUS.develop/source/src_global/global_variable.cpp
+++ b/ABACUS.develop/source/src_global/global_variable.cpp
@@ -122,6 +122,7 @@ string	global_pseudo_dir = "./";
 string  global_pseudo_type = "upf"; // mohan add 2013-05-20, default is UPF, we can also use VWR (xiaohui add 2013-06-23)
 string	global_epm_pseudo_card;
 string	global_out_dir;
+string  global_readin_dir; //zhengdy modified
 
 ofstream ofs_running;
 ofstream ofs_warning;
diff --git a/ABACUS.develop/source/src_global/global_variable.h b/ABACUS.develop/source/src_global/global_variable.h
index b8b0b2a3e1..e86b1b4c3d 100644
--- a/ABACUS.develop/source/src_global/global_variable.h
+++ b/ABACUS.develop/source/src_global/global_variable.h
@@ -149,6 +149,7 @@ extern string	global_wannier_card;
 extern string	global_pseudo_dir;
 extern string   global_pseudo_type; // mohan add 2013-05-20 (xiaohui add 2013-06-23)
 extern string 	global_out_dir;
+extern string   global_readin_dir; //zhengdy modified
 
 extern ofstream ofs_running;
 extern ofstream ofs_warning;
diff --git a/ABACUS.develop/source/src_io/write_input.cpp b/ABACUS.develop/source/src_io/write_input.cpp
index c715edc478..aa1cb447fd 100644
--- a/ABACUS.develop/source/src_io/write_input.cpp
+++ b/ABACUS.develop/source/src_io/write_input.cpp
@@ -57,6 +57,7 @@ void Input::Print(const string &fn)const
 	OUTP(ofs,"out_band",out_band,"output energy and band structure");
 	OUTP(ofs,"restart_save",restart_save,"print to disk every step for restart");
 	OUTP(ofs,"restart_load",restart_load,"restart from disk");
+	OUTP(ofs,"read_file_dir",read_file_dir,"directory of files for reading");
 	OUTP(ofs,"nx",nx,"number of points along x axis for FFT grid");
 	OUTP(ofs,"ny",ny,"number of points along y axis for FFT grid");
 	OUTP(ofs,"nz",nz,"number of points along z axis for FFT grid");	
@@ -208,7 +209,7 @@ void Input::Print(const string &fn)const
 	OUTP(ofs,"kernel_type",kernel_type,"the kernel type: rpa, tdlda ...");
 	OUTP(ofs,"eels_method",eels_method,"0: hilbert_transform method; 1: standard method");
 	OUTP(ofs,"absorption_method",absorption_method,"0: vasp's method  1: pwscf's method");
-	OUTP(ofs,"system",system,"the calculate system");
+	OUTP(ofs,"system",system_type,"the calculate system");
 	OUTP(ofs,"eta",eta,"eta(Ry)");
 	OUTP(ofs,"domega",domega,"domega(Ry)");
 	OUTP(ofs,"nomega",nomega,"nomega");
diff --git a/ABACUS.develop/source/src_pw/potential.cpp b/ABACUS.develop/source/src_pw/potential.cpp
index c95cffda30..8998db9ffe 100644
--- a/ABACUS.develop/source/src_pw/potential.cpp
+++ b/ABACUS.develop/source/src_pw/potential.cpp
@@ -117,7 +117,7 @@ void Potential::init_pot(
             for(int is=0; is<NSPIN; is++)
             {
                 stringstream ssc;
-                ssc << global_out_dir << "SPIN" << is + 1 << "_CHG";
+                ssc << global_readin_dir << "SPIN" << is + 1 << "_CHG";
                 ofs_running << ssc.str() << endl;
                 // mohan update 2012-02-10
                 if(CHR.read_rho( is, ssc.str(), CHR.rho[is] )) 

From 4682505637f9686b845acd0b8bc5f0b4c5eb1348 Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Mon, 5 Apr 2021 14:40:05 +0800
Subject: [PATCH 27/60] eliminate inclusion of global files in ORB* files

---
 ABACUS.develop/source/src_global/complexarray.cpp  | 8 --------
 ABACUS.develop/source/src_global/complexarray.h    | 7 -------
 ABACUS.develop/source/src_global/complexmatrix.h   | 5 -----
 ABACUS.develop/source/src_global/constants.h       | 4 ----
 ABACUS.develop/source/src_global/vector3.h         | 3 ---
 ABACUS.develop/source/src_lcao/ORB_atomic.h        | 8 +++++++-
 ABACUS.develop/source/src_lcao/ORB_gen_tables.h    | 6 ++++--
 ABACUS.develop/source/src_lcao/ORB_nonlocal.cpp    | 2 ++
 ABACUS.develop/source/src_lcao/ORB_nonlocal.h      | 5 ++++-
 ABACUS.develop/source/src_lcao/ORB_nonlocal_lm.h   | 5 ++++-
 ABACUS.develop/source/src_lcao/ORB_read.h          | 2 +-
 ABACUS.develop/source/src_lcao/ORB_table_phi.h     | 6 ++----
 ABACUS.develop/source/src_lcao/gint_gamma_env.cpp  | 1 +
 ABACUS.develop/source/src_lcao/gint_gamma_fvl.cpp  | 1 +
 ABACUS.develop/source/src_lcao/gint_gamma_mull.cpp | 1 +
 ABACUS.develop/source/src_lcao/gint_gamma_rho.cpp  | 1 +
 ABACUS.develop/source/src_lcao/gint_gamma_vl.cpp   | 1 +
 ABACUS.develop/source/src_lcao/gint_k_fvl.cpp      | 2 ++
 18 files changed, 31 insertions(+), 37 deletions(-)

diff --git a/ABACUS.develop/source/src_global/complexarray.cpp b/ABACUS.develop/source/src_global/complexarray.cpp
index b3826c499f..7637a6f159 100644
--- a/ABACUS.develop/source/src_global/complexarray.cpp
+++ b/ABACUS.develop/source/src_global/complexarray.cpp
@@ -1,11 +1,3 @@
-/***********************************************************
-    DFT++ is a density functional package developed
-	by the research group
-    of Professor Tomas Arias
-
-    Copyright 1996-2003 Sohrab Ismail-Beigi
-************************************************************/
-
 #include <iostream>
 #include <fstream>
 #include <iomanip>
diff --git a/ABACUS.develop/source/src_global/complexarray.h b/ABACUS.develop/source/src_global/complexarray.h
index 838091e65c..28dc04066d 100644
--- a/ABACUS.develop/source/src_global/complexarray.h
+++ b/ABACUS.develop/source/src_global/complexarray.h
@@ -1,10 +1,3 @@
-/*
-    DFT++ is a density functional package developed by the research group
-    of Professor Tomas Arias
-
-    Copyright 1996-2003 Sohrab Ismail-Beigi
-*/
-
 #ifndef COMPLEX_ARRAY_H
 #define COMPLEX_ARRAY_H
 
diff --git a/ABACUS.develop/source/src_global/complexmatrix.h b/ABACUS.develop/source/src_global/complexmatrix.h
index 3dd7a64994..d43afb658d 100644
--- a/ABACUS.develop/source/src_global/complexmatrix.h
+++ b/ABACUS.develop/source/src_global/complexmatrix.h
@@ -1,8 +1,3 @@
-//==========================================================
-// Author : Lixin He, Mohan Chen
-// Update : Peize Lin
-// Last Update : 2018-09-04
-//==========================================================
 #ifndef COMPLEXMATRIX_H
 #define COMPLEXMATRIX_H
 
diff --git a/ABACUS.develop/source/src_global/constants.h b/ABACUS.develop/source/src_global/constants.h
index 6e0b31b71a..258358a774 100644
--- a/ABACUS.develop/source/src_global/constants.h
+++ b/ABACUS.develop/source/src_global/constants.h
@@ -1,7 +1,3 @@
-//==========================================================
-// AUTHOR : Lixin He,mohan
-// DATE : 2008-11-07
-//==========================================================
 #ifndef CONSTANT_H
 #define CONSTANT_H
 #include <complex>
diff --git a/ABACUS.develop/source/src_global/vector3.h b/ABACUS.develop/source/src_global/vector3.h
index fbdbe11ee2..d321ad53e4 100644
--- a/ABACUS.develop/source/src_global/vector3.h
+++ b/ABACUS.develop/source/src_global/vector3.h
@@ -1,6 +1,3 @@
-//==========================================================
-//
-//==========================================================
 #ifndef VECTOR3_H
 #define VECTOR3_H
 
diff --git a/ABACUS.develop/source/src_lcao/ORB_atomic.h b/ABACUS.develop/source/src_lcao/ORB_atomic.h
index 6f4e2defea..48000ccbf8 100644
--- a/ABACUS.develop/source/src_lcao/ORB_atomic.h
+++ b/ABACUS.develop/source/src_lcao/ORB_atomic.h
@@ -6,7 +6,13 @@
 #ifndef NUMERICAL_ORBITAL_H
 #define NUMERICAL_ORBITAL_H
 
-#include "../src_pw/tools.h"
+#include <string>
+using namespace std;
+
+#include "../src_global/intarray.h"
+#include "../src_global/vector3.h"
+
+//#include "../src_pw/tools.h"
 #include "ORB_atomic_lm.h"
 
 //=========================================================
diff --git a/ABACUS.develop/source/src_lcao/ORB_gen_tables.h b/ABACUS.develop/source/src_lcao/ORB_gen_tables.h
index 176c2e5cf2..0e025875fc 100644
--- a/ABACUS.develop/source/src_lcao/ORB_gen_tables.h
+++ b/ABACUS.develop/source/src_lcao/ORB_gen_tables.h
@@ -1,13 +1,15 @@
 #ifndef ORB_GEN_TABLES_H
 #define ORB_GEN_TABLES_H
 
-#include "../src_pw/tools.h"
-#include "../src_global/ylm.h"
+//#include "../src_pw/tools.h"
+//#include "../src_global/ylm.h"
+
 #include "ORB_gaunt_table.h"
 #include "ORB_table_beta.h"
 #include "ORB_table_phi.h"
 #include "ORB_table_alpha.h"		//caoyu add 2020-3-18
 #include "ORB_read.h"
+#include "../src_global/vector3.h"
 
 //------------------------------------
 // used to be 'Use_Overlap_Table',
diff --git a/ABACUS.develop/source/src_lcao/ORB_nonlocal.cpp b/ABACUS.develop/source/src_lcao/ORB_nonlocal.cpp
index aa90a0150a..ec3a798f41 100644
--- a/ABACUS.develop/source/src_lcao/ORB_nonlocal.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_nonlocal.cpp
@@ -3,6 +3,8 @@
 //DATE : 2008-03-04
 //=========================================================
 #include "ORB_nonlocal.h"
+#include "../src_global/global_function.h"
+#include "../src_global/constants.h"
 
 Numerical_Nonlocal::Numerical_Nonlocal()
 {
diff --git a/ABACUS.develop/source/src_lcao/ORB_nonlocal.h b/ABACUS.develop/source/src_lcao/ORB_nonlocal.h
index b96690e7fc..9b01388415 100644
--- a/ABACUS.develop/source/src_lcao/ORB_nonlocal.h
+++ b/ABACUS.develop/source/src_lcao/ORB_nonlocal.h
@@ -1,7 +1,10 @@
 #ifndef NUMERICAL_NONLOCAL_H
 #define NUMERICAL_NONLOCAL_H
 
-#include "../src_pw/tools.h"
+//#include "../src_pw/tools.h"
+
+#include "../src_global/complexarray.h"
+#include "../src_global/complexmatrix.h"
 #include "ORB_nonlocal_lm.h"
 //=========================================================
 //CLASS  Numerical_Nonlocal
diff --git a/ABACUS.develop/source/src_lcao/ORB_nonlocal_lm.h b/ABACUS.develop/source/src_lcao/ORB_nonlocal_lm.h
index 9b6eb4e531..a6f10e6560 100644
--- a/ABACUS.develop/source/src_lcao/ORB_nonlocal_lm.h
+++ b/ABACUS.develop/source/src_lcao/ORB_nonlocal_lm.h
@@ -1,7 +1,10 @@
 #ifndef NUMERICAL_NONLOCAL_LM
 #define NUMERICAL_NONLOCAL_LM
 
-#include "../src_pw/tools.h"
+#include <string>
+using namespace std;
+
+//#include "../src_pw/tools.h"
 //=========================================================
 //CLASS Numerical_Nonlocal_Lm
 //Note : contain information about each projector
diff --git a/ABACUS.develop/source/src_lcao/ORB_read.h b/ABACUS.develop/source/src_lcao/ORB_read.h
index 84dc3b5ba0..b2b3e7a93a 100644
--- a/ABACUS.develop/source/src_lcao/ORB_read.h
+++ b/ABACUS.develop/source/src_lcao/ORB_read.h
@@ -1,7 +1,7 @@
 #ifndef LCAO_ORBITALS_H
 #define LCAO_ORBITALS_H
 
-#include "../src_pw/tools.h"
+//#include "../src_pw/tools.h"
 #include "ORB_atomic.h"
 #include "ORB_atomic_lm.h"
 #include "ORB_nonlocal.h"
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_phi.h b/ABACUS.develop/source/src_lcao/ORB_table_phi.h
index 57553a2d31..594d1ed260 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_phi.h
+++ b/ABACUS.develop/source/src_lcao/ORB_table_phi.h
@@ -3,7 +3,6 @@
 
 #include "ORB_read.h"
 #include "ORB_atomic_lm.h"
-#include "center2_orb.h"
 #include "../src_global/sph_bessel_recursive.h"
 #include <set>
 
@@ -90,7 +89,6 @@ class ORB_table_phi
 	static double dr;
 	int Rmesh;
 
-	private:
 
 	void cal_ST_Phi12_R(
 		const int &job,
@@ -111,6 +109,8 @@ class ORB_table_phi
 		double *rs,
 		double *drs) const;
 
+	private:
+
 	// variables
     int ntype;
 	int lmax;
@@ -122,7 +122,5 @@ class ORB_table_phi
 	double *r;
 	double *rab;
 	double *kab;	
-
-	friend class Center2_Orb::Orb11;			// Peize Lin add 2016-01-24
 };
 #endif
diff --git a/ABACUS.develop/source/src_lcao/gint_gamma_env.cpp b/ABACUS.develop/source/src_lcao/gint_gamma_env.cpp
index b0b883d2c5..b9e14831f1 100644
--- a/ABACUS.develop/source/src_lcao/gint_gamma_env.cpp
+++ b/ABACUS.develop/source/src_lcao/gint_gamma_env.cpp
@@ -2,6 +2,7 @@
 #include "grid_technique.h"
 #include "ORB_read.h"
 #include "../src_pw/global.h"
+#include "../src_global/ylm.h"
 
 void Gint_Gamma::cal_env(const double* wfc, double* rho)
 {
diff --git a/ABACUS.develop/source/src_lcao/gint_gamma_fvl.cpp b/ABACUS.develop/source/src_lcao/gint_gamma_fvl.cpp
index 782830c817..21cc803710 100644
--- a/ABACUS.develop/source/src_lcao/gint_gamma_fvl.cpp
+++ b/ABACUS.develop/source/src_lcao/gint_gamma_fvl.cpp
@@ -5,6 +5,7 @@
 #include "src_global/blas_connector.h"
 
 #include "global_fp.h" // mohan add 2021-01-30
+#include "../src_global/ylm.h"
 
 void Gint_Gamma::cal_force(const double* vlocal_in)
 {
diff --git a/ABACUS.develop/source/src_lcao/gint_gamma_mull.cpp b/ABACUS.develop/source/src_lcao/gint_gamma_mull.cpp
index 3f88752f27..e7b094ed80 100644
--- a/ABACUS.develop/source/src_lcao/gint_gamma_mull.cpp
+++ b/ABACUS.develop/source/src_lcao/gint_gamma_mull.cpp
@@ -4,6 +4,7 @@
 #include "../src_pw/global.h"
 
 #include "global_fp.h" // mohan add 2021-01-30
+#include "../src_global/ylm.h"
 
 void Gint_Gamma::cal_mulliken(double** mulliken)
 {
diff --git a/ABACUS.develop/source/src_lcao/gint_gamma_rho.cpp b/ABACUS.develop/source/src_lcao/gint_gamma_rho.cpp
index e594d00c35..3a042204b5 100644
--- a/ABACUS.develop/source/src_lcao/gint_gamma_rho.cpp
+++ b/ABACUS.develop/source/src_lcao/gint_gamma_rho.cpp
@@ -6,6 +6,7 @@
 #include <mkl_service.h>
 
 #include "global_fp.h" // mohan add 2021-01-30
+#include "../src_global/ylm.h"
 
 void Gint_Gamma::setVindex(const int ncyz, const int ibx, const int jby, const int kbz, int* vindex) const
 {
diff --git a/ABACUS.develop/source/src_lcao/gint_gamma_vl.cpp b/ABACUS.develop/source/src_lcao/gint_gamma_vl.cpp
index 207aa636bb..16bc32dd86 100644
--- a/ABACUS.develop/source/src_lcao/gint_gamma_vl.cpp
+++ b/ABACUS.develop/source/src_lcao/gint_gamma_vl.cpp
@@ -6,6 +6,7 @@
 #include <mkl_service.h>
 
 #include "global_fp.h" // mohan add 2021-01-30
+#include "../src_global/ylm.h"
 //#include <vector>
 
 extern "C"
diff --git a/ABACUS.develop/source/src_lcao/gint_k_fvl.cpp b/ABACUS.develop/source/src_lcao/gint_k_fvl.cpp
index ac0d6e0a6d..bfa8f85e68 100644
--- a/ABACUS.develop/source/src_lcao/gint_k_fvl.cpp
+++ b/ABACUS.develop/source/src_lcao/gint_k_fvl.cpp
@@ -3,6 +3,8 @@
 #include "LCAO_nnr.h"
 #include "global_fp.h" // mohan add 2021-01-30
 
+#include "../src_global/ylm.h"
+
 void Gint_k::fvl_k_RealSpace(matrix& fvl_dphi, const double *vl)
 {
 	TITLE("Gint_k","cal_force");

From 722437d12f7b4b20552039a5ba860e25508be520 Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Mon, 5 Apr 2021 14:48:59 +0800
Subject: [PATCH 28/60] add __NORMAL option in complex to avoid linking to
 lapack in some cases

---
 .../source/src_global/complexmatrix.cpp       | 44 +++++++++++--------
 1 file changed, 25 insertions(+), 19 deletions(-)

diff --git a/ABACUS.develop/source/src_global/complexmatrix.cpp b/ABACUS.develop/source/src_global/complexmatrix.cpp
index 3df5bea358..057a1dc046 100644
--- a/ABACUS.develop/source/src_global/complexmatrix.cpp
+++ b/ABACUS.develop/source/src_global/complexmatrix.cpp
@@ -1,15 +1,14 @@
-//==========================================================
-// AUTHOR : Lixin He, Mohan Chen
-// LAST UPDATE : 2009-03-23 modify "=" operator
-//==========================================================
-
 #include <cassert>
 #include <new>
 #include <cstdlib>
 #include <cstring>
 #include <iostream>
 #include "complexmatrix.h"
+
+#ifdef __NORMAL
+#else
 #include "lapack_connector.h"
+#endif
 
 // constructor with sizes
 ComplexMatrix::ComplexMatrix(const int nrows, const int ncols, const bool flag_zero)
@@ -75,7 +74,9 @@ ComplexMatrix::ComplexMatrix(const matrix &m)
 	{
 		c = new complex<double>[size];
 		for( int i=0; i<size; ++i)
+		{
 			c[i] = m.c[i];
+		}
 	}
 }
 
@@ -123,7 +124,7 @@ void ComplexMatrix::create(const int nr_in, const int nc_in, const bool flag_zer
 	}
 }
 
-void ComplexMatrix::set_as_identity_matrix()
+void ComplexMatrix::set_as_identity_matrix(void)
 {
 	for(int i=0; i<nr; i++)
 	{
@@ -165,23 +166,28 @@ ComplexMatrix operator*(const ComplexMatrix &m1, const ComplexMatrix &m2)
 	assert(m1.nc == m2.nr);
 	ComplexMatrix mprod(m1.nr, m2.nc);
 
+// mohan add 2021-04-05
+#ifdef __NORMAL
 	complex<double> z;
-//	for (int i = 0;i < m1.nr;i++)
-//	{
-//		for (int j = 0;j < m2.nc;j++)
-//		{
-//			z = complex<double>(0,0);
-//			for (int k = 0;k < m1.nc;k++)
-//			{
-//				z += m1(i, k) * m2(k, j);
-//			}
-//			mprod(i, j) = z;
-//		}
-//	}
+	for (int i = 0;i < m1.nr;i++)
+	{
+		for (int j = 0;j < m2.nc;j++)
+		{
+			z = complex<double>(0,0);
+			for (int k = 0;k < m1.nc;k++)
+			{
+				z += m1(i, k) * m2(k, j);
+			}
+			mprod(i, j) = z;
+		}
+	}
+#else
 	// Peize Lin accelerate 2017-10-27
 	LapackConnector::gemm('N', 'N', m1.nr, m2.nc, m1.nc,
 		1, m1.c, m1.nc, m2.c, m2.nc,
 		0, mprod.c, mprod.nc);
+#endif
+
 	return mprod;
 }
 
@@ -403,4 +409,4 @@ ComplexMatrix conj(const ComplexMatrix &m)
 	for(int i=0; i!=m.size; ++i)
 		cm.c[i] = conj(m.c[i]);
 	return cm;
-}
\ No newline at end of file
+}

From 75db64f843b3bc8747ad1079513ee173f24d5892 Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Mon, 5 Apr 2021 15:30:30 +0800
Subject: [PATCH 29/60] update matrix to make connecting to lapack unnecessary

---
 .../source/src_external/ORB_api/Makefile      |  3 +-
 .../src_external/ORB_api/Makefile.Objects     |  3 ++
 ABACUS.develop/source/src_global/matrix.cpp   | 47 ++++++++++++++++---
 .../source/src_lcao/ORB_atomic_lm.cpp         |  3 +-
 4 files changed, 47 insertions(+), 9 deletions(-)

diff --git a/ABACUS.develop/source/src_external/ORB_api/Makefile b/ABACUS.develop/source/src_external/ORB_api/Makefile
index 010843ff16..637c9b3617 100644
--- a/ABACUS.develop/source/src_external/ORB_api/Makefile
+++ b/ABACUS.develop/source/src_external/ORB_api/Makefile
@@ -10,7 +10,7 @@ VPATH=../../src_global\
 #==========================
 # Define HONG
 #==========================
-HONG= -DMETIS -DMKL_ILP64
+HONG= -DMETIS -DMKL_ILP64 -D__NORMAL
 
 #==========================
 # OPTIMIZE OPTIONS
@@ -26,6 +26,7 @@ OPTS_GDB = -g -W
 
 FP_OBJS_0=main.o\
 $(OBJS_TRY)\
+$(OBJS_ORBITAL)\
 
 FP_OBJS=$(patsubst %.o, ${OBJ_DIR}/%.o, ${FP_OBJS_0})
 PDIAG_OBJS=$(patsubst %.o, ${OBJ_DIR}/%.o, ${OBJS_PDIAG})
diff --git a/ABACUS.develop/source/src_external/ORB_api/Makefile.Objects b/ABACUS.develop/source/src_external/ORB_api/Makefile.Objects
index 7aed1ec43c..cd8c50d06d 100644
--- a/ABACUS.develop/source/src_external/ORB_api/Makefile.Objects
+++ b/ABACUS.develop/source/src_external/ORB_api/Makefile.Objects
@@ -13,6 +13,9 @@ VERSION= ABACUS-ORB
 HEADERS= *.h
 
 OBJS_TRY=math_integral.o\
+complexarray.o\
+complexmatrix.o\
+matrix.o\
 
 OBJS_ORBITAL=ORB_control.o\
 ORB_read.o\
diff --git a/ABACUS.develop/source/src_global/matrix.cpp b/ABACUS.develop/source/src_global/matrix.cpp
index db726d667e..7fbb7fc0a4 100644
--- a/ABACUS.develop/source/src_global/matrix.cpp
+++ b/ABACUS.develop/source/src_global/matrix.cpp
@@ -11,7 +11,11 @@
 
 using namespace std;
 #include "matrix.h"
+
+#ifdef __NORMAL
+#else
 #include "lapack_connector.h"
+#endif
 
 //*********************************************************
 // The init() function is the main initialization routine.
@@ -23,7 +27,13 @@ using namespace std;
 
 void matrixAlloc()
 {
+// mohan add 2021-04-05: __NORMAL build prints the error itself and exits with a failure status
+#ifdef __NORMAL
+	cout << "Allocation error for Matrix" << endl;
+	exit(1);
+#else
 	WARNING_QUIT("matrix","Allocation error for Matrix");
+#endif
 }
 
 matrix::matrix( const int nrows, const int ncols, const bool flag_zero )
@@ -189,19 +199,28 @@ matrix operator*(const matrix &m1, const matrix &m2)
     // allocate the result and zero it out
     matrix mprod( m1.nr, m2.nc, false );
 
+#ifdef __NORMAL
+	mprod.zero_out();
     // do the multiply and return
-//    for (int i = 0;i < m1.nr;i++)
-//        for (int j = 0;j < m2.nc;j++)
-//            for (int k = 0;k < m1.nc;k++)
-//                //mprod(i, j) += m2(i, k) * m1(k, j);
-//                mprod(i, j) += m1(i, k) * m2(k, j);
-	
+    for (int i = 0;i < m1.nr;i++)
+	{
+        for (int j = 0;j < m2.nc;j++)
+		{
+            for (int k = 0;k < m1.nc;k++)
+			{
+                mprod(i, j) += m1(i, k) * m2(k, j);
+			}
+		}
+	}
+#else
 	// Peize Lin accelerate 2017-10-27
 	LapackConnector::gemm(
 		'N', 'N', 
 		m1.nr, m2.nc, m1.nc,
 		1, m1.c, m1.nc, m2.c, m2.nc, 
 		0, mprod.c, mprod.nc);
+#endif
+
 	return mprod;
 }
 
@@ -377,7 +396,9 @@ double matrix::min() const
 	double value = std::numeric_limits<double>::max();
 	const int size = nr * nc;
 	for( int i=0; i<size; ++i )
+	{
 		value = std::min( value, c[i] );
+	}
 	return value;
 }
 
@@ -387,11 +408,23 @@ double matrix::absmax() const
 	double value = 0;
 	const int size = nr * nc;
 	for( int i=0; i<size; ++i )
+	{
 		value = std::max( value, std::abs(c[i]) );
+	}
 	return value;
 }
 
 double matrix::norm() const
 {
+// mohan add 2021-04-05, not yet tested.
+#ifdef  __NORMAL
+	double nn = 0.0;
+	for(int i=0; i<nr*nc; ++i)
+	{
+		nn += c[i]*c[i];
+	}	
+	return sqrt(nn);
+#else
 	return LapackConnector::nrm2(nr*nc,c,1);
-}
\ No newline at end of file
+#endif
+}
diff --git a/ABACUS.develop/source/src_lcao/ORB_atomic_lm.cpp b/ABACUS.develop/source/src_lcao/ORB_atomic_lm.cpp
index 82fca8474a..79949719f6 100644
--- a/ABACUS.develop/source/src_lcao/ORB_atomic_lm.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_atomic_lm.cpp
@@ -213,7 +213,8 @@ void Numerical_Orbital_Lm::extra_uniform(const double &dr_uniform_in)
 	#pragma omp parallel for schedule(static)
 	for (int ir = 0; ir < this->nr_uniform; ir++)
 	{
-		const double psi_uniform_tmp  = Mathzone_Add1::Uni_RadialF(VECTOR_TO_PTR(this->psi), this->nr, this->rab[0], ir * dr_uniform); 
+		const double psi_uniform_tmp  = 
+		Mathzone_Add1::Uni_RadialF(VECTOR_TO_PTR(this->psi), this->nr, this->rab[0], ir * dr_uniform); 
 		this->psi_uniform[ir] = psi_uniform_tmp;
 //    	this->psi_uniform[ir] = Mathzone::Polynomial_Interpolation(this->psi, this->nr, this->rab[0], ir * dr_uniform); 
     }

From 0c44bde2460468c06e4edbdb7e51bbc93d781513 Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Tue, 6 Apr 2021 11:01:19 +0800
Subject: [PATCH 30/60] update of ORB files

---
 .../source/src_lcao/LCAO_descriptor.cpp       | 68 +++++++++++++++----
 .../source/src_lcao/LCAO_descriptor.h         | 60 +++++++++++-----
 ABACUS.develop/source/src_lcao/ORB_atomic.cpp |  4 --
 .../source/src_lcao/ORB_atomic_lm.cpp         | 32 ++++++++-
 4 files changed, 127 insertions(+), 37 deletions(-)

diff --git a/ABACUS.develop/source/src_lcao/LCAO_descriptor.cpp b/ABACUS.develop/source/src_lcao/LCAO_descriptor.cpp
index 3dd7320c61..2bbdc0495a 100644
--- a/ABACUS.develop/source/src_lcao/LCAO_descriptor.cpp
+++ b/ABACUS.develop/source/src_lcao/LCAO_descriptor.cpp
@@ -1,4 +1,3 @@
-//caoyu add 2021-03-29
 #include "LCAO_descriptor.h"
 #include "LCAO_matrix.h"
 #include "../src_global/lapack_connector.h"
@@ -8,6 +7,8 @@
 #include "../src_pw/global.h"
 #include "../src_io/winput.h"
 
+//caoyu add 2021-03-29
+
 LCAO_Descriptor::LCAO_Descriptor()
 {
     S_mu_alpha = new double[1];
@@ -29,12 +30,19 @@ void LCAO_Descriptor::build_S_descriptor(const bool &calc_deri)
 
     // =======init==============
     // cal n(descriptor) per atom , related to Lmax, nchi(L) and m. (not total_nchi!)
+
+	this->des_per_atom=0;
     for (int l = 0; l <= ORB.get_lmax_d(); l++)
     {
         this->des_per_atom += ORB.Alpha[0].getNchi(l) * (2 * l + 1);
     }
+
+	// total number of descriptors
     this->n_descriptor = ucell.nat * this->des_per_atom;
+
+	// size of the full density matrix (DM)
     const long DMsize = this->n_descriptor * this->n_descriptor;
+
     delete[] S_mu_alpha;
     S_mu_alpha = new double[DMsize];
 
@@ -47,6 +55,7 @@ void LCAO_Descriptor::build_S_descriptor(const bool &calc_deri)
     //\sum{T} e**{ikT} <\phi_{ia}|d\phi_{k\beta}(T)>	//???
     Vector3<double> tau1, tau2, dtau;
     Vector3<double> dtau1, dtau2, tau0;
+
     for (int T1 = 0; T1 < ucell.ntype; ++T1)
     {
         Atom *atom1 = &ucell.atoms[T1];
@@ -63,10 +72,12 @@ void LCAO_Descriptor::build_S_descriptor(const bool &calc_deri)
                 tau2 = GridD.getAdjacentTau(ad);
                 dtau = tau2 - tau1;
                 double distance = dtau.norm() * ucell.lat0;
-                double rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut(); //Rcut is subject to ORB.Phi to keep dimension of S_mu_alpha same as Sloc
+				// rcut is subject to ORB.Phi to keep dimension of S_mu_alpha same as Sloc
+                double rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut();
                 if (distance < rcut)
                 {
-                    int iw1_all = ucell.itiaiw2iwt(T1, I1, 0); //iw1_all = combined index (it, ia, iw)
+					// iw1_all = combined index (it, ia, iw)
+                    int iw1_all = ucell.itiaiw2iwt(T1, I1, 0);
 
                     for (int jj = 0; jj < atom1->nw * NPOL; ++jj)
                     {
@@ -95,7 +106,9 @@ void LCAO_Descriptor::build_S_descriptor(const bool &calc_deri)
                             {
                                 for (int m2 = 0; m2 < 2 * L2 + 1; ++m2)
                                 {
-                                    olm[0] = olm[1] = olm[2] = 0.0;
+                                    olm[0] = 0.0;
+									olm[1] = 0.0;
+									olm[2] = 0.0;
 
                                     complex<double> olm1[4] = {ZERO, ZERO, ZERO, ZERO};
                                     complex<double> *olm2 = &olm1[0];
@@ -136,7 +149,8 @@ void LCAO_Descriptor::build_S_descriptor(const bool &calc_deri)
 
     if (!GAMMA_ONLY_LOCAL)
     {
-        WARNING_QUIT("LCAO_Descriptor::build_S_descriptor", "muti-kpoint method for descriptor is not implemented yet! ");
+        WARNING_QUIT("LCAO_Descriptor::build_S_descriptor", 
+		"muti-kpoint method for descriptor is not implemented yet! ");
     }
 
     return;
@@ -165,7 +179,7 @@ void LCAO_Descriptor::set_S_mu_alpha(const int &iw1_all, const int &iw2_all, con
     return;
 }
 
-void LCAO_Descriptor::cal_projective_DM()
+void LCAO_Descriptor::cal_projective_DM(void)
 {
     //step 1: cal inv of Sloc
     double *sinv = new double[NLOCAL * NLOCAL]; //size :NLOCAL*NLOCAL
@@ -216,6 +230,7 @@ void LCAO_Descriptor::cal_projective_DM()
     const long tmp_PDM_size = NLOCAL * this->n_descriptor;
     double *tmp_PDM = new double[tmp_PDM_size];
     const long PDM_size = this->n_descriptor * this->n_descriptor;
+
     delete[] this->PDM;
     this->PDM = new double[PDM_size];
 
@@ -226,19 +241,23 @@ void LCAO_Descriptor::cal_projective_DM()
     double *a = sinv;
     double *b = ss;
     double *c = tmp_PDM;
-    dgemm_(&nt, &nt, &NLOCAL, &n_descriptor, &NLOCAL, &alpha, a, &NLOCAL, b, &NLOCAL, &beta, c, &NLOCAL); //S_nu_nu*SS_nu_beta
+    dgemm_(&nt, &nt, &NLOCAL, &n_descriptor, &NLOCAL, &alpha, a, 
+	&NLOCAL, b, &NLOCAL, &beta, c, &NLOCAL); //S_nu_nu*SS_nu_beta
     a = dm;
     b = c;
     c = this->PDM;
-    dgemm_(&nt, &nt, &NLOCAL, &n_descriptor, &NLOCAL, &alpha, a, &NLOCAL, b, &NLOCAL, &beta, c, &NLOCAL); //DM*S_nu_nu*SS_nu_beta
+    dgemm_(&nt, &nt, &NLOCAL, &n_descriptor, &NLOCAL, &alpha, a, 
+	&NLOCAL, b, &NLOCAL, &beta, c, &NLOCAL); //DM*S_nu_nu*SS_nu_beta
     a = sinv;
     b = c;
     c = tmp_PDM;
-    dgemm_(&t, &nt, &NLOCAL, &n_descriptor, &NLOCAL, &alpha, a, &NLOCAL, b, &NLOCAL, &beta, c, &NLOCAL); //S_mu_mu*DM*S_nu_nu*SS_nu_beta
+    dgemm_(&t, &nt, &NLOCAL, &n_descriptor, &NLOCAL, &alpha, a, 
+	&NLOCAL, b, &NLOCAL, &beta, c, &NLOCAL); //S_mu_mu*DM*S_nu_nu*SS_nu_beta
     a = ss;
     b = c;
     c = this->PDM;
-    dgemm_(&t, &nt, &n_descriptor, &n_descriptor, &NLOCAL, &alpha, a, &NLOCAL, b, &NLOCAL, &beta, c, &n_descriptor); //SS_alpha_mu*S_mu_mu*DM*S_nu_nu*SS_nu_beta
+    dgemm_(&t, &nt, &n_descriptor, &n_descriptor, &NLOCAL, &alpha, a, 
+	&NLOCAL, b, &NLOCAL, &beta, c, &n_descriptor); //SS_alpha_mu*S_mu_mu*DM*S_nu_nu*SS_nu_beta
 
     delete[] dm;
     delete[] sinv;
@@ -246,10 +265,11 @@ void LCAO_Descriptor::cal_projective_DM()
     return;
 }
 
-void LCAO_Descriptor::cal_descriptor()
+void LCAO_Descriptor::cal_descriptor(void)
 {
     delete[] d;
     d = new double[this->n_descriptor];
+
     //==========print preparation=============
     ofs_running << " print out each DM_Inl" << endl;
     ofstream ofs;
@@ -267,7 +287,9 @@ void LCAO_Descriptor::cal_descriptor()
     {
         for (int ia = 0; ia < ucell.atoms[it].na; ia++)
         {
-            ofs << ucell.atoms[it].label << " atom_index " << ia + 1 << " n_descriptor " << this->des_per_atom << endl;
+            ofs << ucell.atoms[it].label << " atom_index " << ia + 1 
+			<< " n_descriptor " << this->des_per_atom << endl;
+
             for (int l = 0; l <= lmax; l++)
             {
                 int nmax = ORB.Alpha[0].getNchi(l);
@@ -327,12 +349,17 @@ void LCAO_Descriptor::cal_descriptor()
             } //l
         }     //ia
     }         //it
+
     if (ofs)
+	{
         ofs.close();
+	}
+
     this->print_descriptor();
     return;
 }
 
+
 void LCAO_Descriptor::init_mu_index(void)
 {
     ofs_running << " Initialize the mu index for deepks (lcao line)" << endl;
@@ -380,7 +407,14 @@ void LCAO_Descriptor::init_mu_index(void)
     return;
 }
 
-void LCAO_Descriptor::print_projective_DM(ofstream &ofs, ComplexMatrix &des, const int &it, const int &ia, const int &l, const int &n)
+
+void LCAO_Descriptor::print_projective_DM(
+	ofstream &ofs, 
+	ComplexMatrix &des, 
+	const int &it, 
+	const int &ia, 
+	const int &l, 
+	const int &n)
 {
     ofs << "L=" << l << "   N=" << n << endl;
     for (int i = 0; i < 2 * l + 1; i++)
@@ -393,7 +427,9 @@ void LCAO_Descriptor::print_projective_DM(ofstream &ofs, ComplexMatrix &des, con
     }
     return;
 }
-void LCAO_Descriptor::print_descriptor()
+
+
+void LCAO_Descriptor::print_descriptor(void)
 {
     TITLE("LCAO_Descriptor", "print_descriptor");
     ofstream ofs;
@@ -405,11 +441,13 @@ void LCAO_Descriptor::print_descriptor()
     {
         ofs.open(ss.str().c_str());
     }
+
     for (int it = 0; it < ucell.ntype; it++)
     {
         for (int ia = 0; ia < ucell.atoms[it].na; ia++)
         {
-            ofs << ucell.atoms[it].label << " atom_index " << ia + 1 << " n_descriptor " << this->des_per_atom << endl;
+            ofs << ucell.atoms[it].label << " atom_index " 
+			<< ia + 1 << " n_descriptor " << this->des_per_atom << endl;
             int id0 = this->mu_index[it](ia, 0, 0, 0);
             for (int id = id0; id < id0 + this->des_per_atom; ++id)
             {
diff --git a/ABACUS.develop/source/src_lcao/LCAO_descriptor.h b/ABACUS.develop/source/src_lcao/LCAO_descriptor.h
index 218d56738b..7ece689fd4 100644
--- a/ABACUS.develop/source/src_lcao/LCAO_descriptor.h
+++ b/ABACUS.develop/source/src_lcao/LCAO_descriptor.h
@@ -1,31 +1,59 @@
-//caoyu add 2021-03-29
+#ifndef LCAO_DESCRIPTOR_H
+#define LCAO_DESCRIPTOR_H
+
 #include "../src_global/intarray.h"
 #include "../src_global/complexmatrix.h"
 
-#ifndef LCAO_MATRIX_DESCRIPTOR_H
-#define LCAO_MATRIX_DESCRIPTOR_H
-
+//caoyu add 2021-03-29
 class LCAO_Descriptor
 {
 public:
+
     LCAO_Descriptor();
     ~LCAO_Descriptor();
 
-    void build_S_descriptor(const bool &calc_deri); //cal S_alpha_mu：overlap between lcao basis Phi and descriptor basis Alpha
-    void cal_projective_DM();                       //cal PDM: S_alpha_mu * inv(Sloc) * DM * inv(Sloc) * S_nu_beta
-    void cal_descriptor();                          //cal d: EIGENVALUE of PDM in block of I_n_l
-    void print_descriptor();
+	// cal S_alpha_mu: overlap between lcao basis Phi and descriptor basis Alpha
+    void build_S_descriptor(const bool &calc_deri); 
+
+	// cal PDM: S_alpha_mu * inv(Sloc) * DM * inv(Sloc) * S_nu_beta
+    void cal_projective_DM(void);
+
+	// cal d: EIGENVALUE of PDM in block of I_n_l
+    void cal_descriptor(void);
+    void print_descriptor(void);
 
 private:
-    double *S_mu_alpha; //overlap between lcao and descriptor basis
-    double *PDM;        //projective density matrix
-    double *d;          //descriptors
-    int n_descriptor = 0;
-    int des_per_atom = 0; //\sum_L{Nchi(L)*(2L+1)}
+
+	// overlap between lcao and descriptor basis
+    double *S_mu_alpha;
+
+	// projective density matrix
+    double *PDM;
+
+	// descriptors
+    double *d;
+
+    int n_descriptor;
+
+	// \sum_L{Nchi(L)*(2L+1)}
+    int des_per_atom;
+
     IntArray *mu_index;
+
     void init_mu_index(void);
-    void set_S_mu_alpha(const int &iw1_all, const int &iw2_all, const double &v);
-    void print_projective_DM(ofstream &ofs, ComplexMatrix &des, const int &it, const int &ia, const int &l, const int &n);
+    
+	void set_S_mu_alpha(
+		const int &iw1_all, 
+		const int &iw2_all, 
+		const double &v);
+
+    void print_projective_DM(
+		ofstream &ofs, 
+		ComplexMatrix &des, 
+		const int &it, 
+		const int &ia, 
+		const int &l, 
+		const int &n);
 };
 
-#endif
\ No newline at end of file
+#endif
diff --git a/ABACUS.develop/source/src_lcao/ORB_atomic.cpp b/ABACUS.develop/source/src_lcao/ORB_atomic.cpp
index 9d68cbfb44..2cd425c7f9 100644
--- a/ABACUS.develop/source/src_lcao/ORB_atomic.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_atomic.cpp
@@ -1,7 +1,3 @@
-//=========================================================
-//AUTHOR : liaochen
-//DATE : 2008-11-12
-//=========================================================
 #include "ORB_atomic.h"
 
 Vector3<double> Numerical_Orbital::R1;
diff --git a/ABACUS.develop/source/src_lcao/ORB_atomic_lm.cpp b/ABACUS.develop/source/src_lcao/ORB_atomic_lm.cpp
index 79949719f6..665ea33ba5 100644
--- a/ABACUS.develop/source/src_lcao/ORB_atomic_lm.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_atomic_lm.cpp
@@ -462,16 +462,22 @@ void Numerical_Orbital_Lm::cal_kradial_sbpool(void)
 
 	// dr must be all the same for Sph_Bessel_Recursive_Pool
 	const double dr = this->rab[0];
+	
 	for( int ir=1; ir<this->nr; ++ir )
+	{
 		assert( dr == this->rab[ir] );
+	}
 
 	Sph_Bessel_Recursive::D2* pSB = nullptr;
 	for( auto & sb : Sph_Bessel_Recursive_Pool::D2::sb_pool )
+	{
 		if( this->dk * dr == sb.get_dx() )
 		{
 			pSB = &sb;
 			break;
 		}
+	}
+
 	if(!pSB)
 	{
 		Sph_Bessel_Recursive_Pool::D2::sb_pool.push_back({});
@@ -485,20 +491,42 @@ void Numerical_Orbital_Lm::cal_kradial_sbpool(void)
 
 	vector<double> r_tmp(nr);
 	for( int ir=0; ir!=nr; ++ir )
+	{
 		r_tmp[ir] = this->psir[ir] * this->r_radial[ir] * this->rab[ir];
+	}
+
 	constexpr double one_three=1.0/3.0, two_three=2.0/3.0, four_three=4.0/3.0;
-	r_tmp[0]*=one_three;	r_tmp[nr-1]*=one_three;
+	r_tmp[0]*=one_three;	
+	r_tmp[nr-1]*=one_three;
+
 	for( int ir=1; ir!=nr-1; ++ir )
+	{
 		r_tmp[ir] *= (ir&1) ? four_three : two_three;
+	}
 
+#ifdef __NORMAL
+	// plain-loop dot product (avoids Lapack); fills psif/psik/psik2 like the #else branch
+	for(int ik=0; ik<nk; ++ik)
+	{
+		double psi_f_tmp = 0.0; 
+		for(int ir=0; ir<nr; ++ir)
+			psi_f_tmp += r_tmp[ir]*jl[ik][ir];
+		this->psif[ik] = pref * psi_f_tmp;
+		this->psik[ik] = this->psif[ik] * k_radial[ik];
+		this->psik2[ik] = this->psik[ik] * k_radial[ik];
+	}
+#else
 	#pragma omp parallel for schedule(static)
 	for (int ik = 0; ik < nk; ik++)
 	{
-		const double psi_f_tmp = pref * LapackConnector::dot( this->nr, VECTOR_TO_PTR(r_tmp), 1, VECTOR_TO_PTR(jl[ik]), 1 ) ;
+		const double psi_f_tmp = 
+		pref * LapackConnector::dot( this->nr, VECTOR_TO_PTR(r_tmp), 1, VECTOR_TO_PTR(jl[ik]), 1 ) ;
 		this->psif[ik] = psi_f_tmp;
 		this->psik[ik] = psi_f_tmp * k_radial[ik];
 		this->psik2[ik] = this->psik[ik] * k_radial[ik];
 	}
+#endif
+	return;
 }
 
 // Peize Lin add 2017-12-11

From 74efacfea9b09690bd28e0c6b05d95bc79d161dd Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Tue, 6 Apr 2021 15:32:06 +0800
Subject: [PATCH 31/60] optimize src_lcao/LCAO_gen_fixedH.cpp

---
 .../source/src_lcao/LCAO_gen_fixedH.cpp       | 425 +++++++++++-------
 1 file changed, 256 insertions(+), 169 deletions(-)

diff --git a/ABACUS.develop/source/src_lcao/LCAO_gen_fixedH.cpp b/ABACUS.develop/source/src_lcao/LCAO_gen_fixedH.cpp
index 3de837ddda..21694a4c85 100644
--- a/ABACUS.develop/source/src_lcao/LCAO_gen_fixedH.cpp
+++ b/ABACUS.develop/source/src_lcao/LCAO_gen_fixedH.cpp
@@ -15,12 +15,6 @@ void LCAO_gen_fixedH::calculate_NL_no(void)
 {
     TITLE("LCAO_gen_fixedH","calculate_NL_no");
 
-	// PLEASE rebuild the following two functions,
-	// 'build_Nonlocal_beta' and  'build_Nonlocal_mu',
-	// because the two functions are extremely time consuming
-	// for small systems, especially for multiple-k points
-	// mohan note 2021-03-23
-
 	if(GAMMA_ONLY_LOCAL)
 	{
 	  	//for gamma only.
@@ -123,8 +117,6 @@ void LCAO_gen_fixedH::build_ST_new(const char& dtype, const bool& calc_deri)
 							complex<double> *olm2 = &olm1[0];
 							if(!calc_deri)
 							{
-								// PLEASE use UOT as an input parameter of this subroutine
-								// mohan add 2021-03-30
 								UOT.snap_psipsi( olm, 0, dtype, tau1, 
 										T1, L1, m1, N1, GridD.getAdjacentTau(ad), 
 										T2, L2, m2, N2,
@@ -412,13 +404,24 @@ void LCAO_gen_fixedH::build_Nonlocal_mu(const bool &calc_deri)
 	// while beta is in the supercell.
 	// while phi2 is in the supercell.
 
-	int nnr = 0;
+	int nnr_temp, nnr = 0;
 	Vector3<double> tau1, tau2, dtau;
 	Vector3<double> dtau1, dtau2, tau0;
 	double distance = 0.0;
 	double distance1, distance2;
 	double rcut = 0.0;
 	double rcut1, rcut2;
+
+	matrix Rcut(ucell.ntype,ucell.ntype);
+	matrix Rcut_beta(ucell.ntype,ucell.ntype);
+	for(int i=0; i<ucell.ntype; ++i)
+	{
+		for(int j=0; j<ucell.ntype; ++j)
+		{
+			Rcut(i,j) = ORB.Phi[i].getRcut() + ORB.Phi[j].getRcut();
+			Rcut_beta(i,j) = ORB.Phi[i].getRcut() + ORB.Beta[j].get_rcut_max();
+		}
+	}
 		
 //	Record_adj RA;
 //	RA.for_2d();
@@ -427,6 +430,7 @@ void LCAO_gen_fixedH::build_Nonlocal_mu(const bool &calc_deri)
     for (int T1 = 0; T1 < ucell.ntype; ++T1)
     {
 		const Atom* atom1 = &ucell.atoms[T1];
+		const int nw_tot1 = atom1->nw*NPOL;
         for (int I1 =0; I1< atom1->na; ++I1)
         {
             //GridD.Find_atom( atom1->tau[I1] );
@@ -436,10 +440,13 @@ void LCAO_gen_fixedH::build_Nonlocal_mu(const bool &calc_deri)
             tau1 = atom1->tau[I1];
 
 			// psi2
-            for (int ad2=0; ad2<GridD.getAdjacentNum()+1; ++ad2)
+			int adjnumplus = GridD.getAdjacentNum()+1;
+            //for (int ad2=0; ad2<GridD.getAdjacentNum()+1; ++ad2)
+			for(int ad2=0; ad2<adjnumplus; ++ad2)
 			{
 				const int T2 = GridD.getType(ad2);
 				const Atom* atom2 = &ucell.atoms[T2];
+				const int nw_tot2 = atom2->nw*NPOL;
                 
 				const int I2 = GridD.getNatom(ad2);
 				//const int iat2 = ucell.itia2iat(T2, I2);
@@ -447,21 +454,23 @@ void LCAO_gen_fixedH::build_Nonlocal_mu(const bool &calc_deri)
                 tau2 = GridD.getAdjacentTau(ad2);
 
 				bool is_adj = false;
-					
-				dtau = tau2 - tau1;
-				distance = dtau.norm() * ucell.lat0;
-				// this rcut is in order to make nnr consistent 
-				// with other matrix.
-				rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut();
-				if(distance < rcut) is_adj = true;
-				else if(distance >= rcut)
-				{
-                    for (int ad0 = 0; ad0 < GridD.getAdjacentNum()+1; ++ad0)
+
+                dtau = tau2 - tau1;
+                distance = dtau.norm() * ucell.lat0;
+                // this rcut is in order to make nnr consistent
+                // with other matrix.
+                //rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut();
+                rcut = Rcut(T1,T2);
+                if(distance < rcut) is_adj = true;
+                /*else if(distance >= rcut)
+                {
+                    //for (int ad0 = 0; ad0 < GridD.getAdjacentNum()+1; ++ad0)
+                    for(int ad0 = 0; ad0 < adjnumplus; ++ad0)
                     {
-						const int T0 = GridD.getType(ad0);
-						//const int I0 = GridD.getNatom(ad0);
-						//const int T0 = RA.info[iat1][ad0][3];
-						//const int I0 = RA.info[iat1][ad0][4];
+                        const int T0 = GridD.getType(ad0);
+                        //const int I0 = GridD.getNatom(ad0);
+                        //const int T0 = RA.info[iat1][ad0][3];
+                        //const int I0 = RA.info[iat1][ad0][4];
                         //const int iat0 = ucell.itia2iat(T0, I0);
                         //const int start0 = ucell.itiaiw2iwt(T0, I0, 0);
 
@@ -472,175 +481,227 @@ void LCAO_gen_fixedH::build_Nonlocal_mu(const bool &calc_deri)
                         double distance1 = dtau1.norm() * ucell.lat0;
                         double distance2 = dtau2.norm() * ucell.lat0;
 
-                        rcut1 = ORB.Phi[T1].getRcut() + ORB.Beta[T0].get_rcut_max();
-                        rcut2 = ORB.Phi[T2].getRcut() + ORB.Beta[T0].get_rcut_max();
+                        //rcut1 = ORB.Phi[T1].getRcut() + ORB.Beta[T0].get_rcut_max();
+                        //rcut2 = ORB.Phi[T2].getRcut() + ORB.Beta[T0].get_rcut_max();
 
-                        if( distance1 < rcut1 && distance2 < rcut2 )
+                        //if( distance1 < rcut1 && distance2 < rcut2 )
+                        if( distance1 < Rcut_beta(T1,T0) && distance2 < Rcut_beta(T2,T0))
                         {
                             is_adj = true;
                             break;
                         }
                     }
-				}
+                }*/
 
 
-				if(is_adj)
+				//(3) run over all projectors in nonlocal pseudopotential.
+				//for (int ad0=0; ad0 < GridD.getAdjacentNum()+1 ; ++ad0)
+				for(int ad0=0; ad0 < adjnumplus; ++ad0)
 				{
-					// < psi1 | all projectors | psi2 >
-					// ----------------------------- enter the nnr increaing zone -------------------------
-					for (int j=0; j<atom1->nw*NPOL; j++)
-					{
-						const int j0 = j/NPOL;//added by zhengdy-soc
-						const int iw1_all = start1 + j;
-						const int mu = ParaO.trace_loc_row[iw1_all];
-						if(mu < 0)continue; 
+					const int T0 = GridD.getType(ad0);
 
-						// fix a serious bug: atom2[T2] -> atom2
-						// mohan 2010-12-20
-						for (int k=0; k<atom2->nw*NPOL; k++)
-						{
-							const int k0 = k/NPOL;
-							const int iw2_all = start2 + k;
-							const int nu = ParaO.trace_loc_col[iw2_all];						
-							if(nu < 0)continue;
+					// mohan add 2010-12-19
+					if( ORB.nproj[T0] == 0) continue; 
 
+					//const int I0 = GridD.getNatom(ad0);
+					//const int start0 = ucell.itiaiw2iwt(T0, I0, 0);
+					tau0 = GridD.getAdjacentTau(ad0);
 
-							//(3) run over all projectors in nonlocal pseudopotential.
-							for (int ad0=0; ad0 < GridD.getAdjacentNum()+1 ; ++ad0)
-							{
-								const int T0 = GridD.getType(ad0);
+					dtau1 = tau0 - tau1;
+					dtau2 = tau0 - tau2;
+					distance1 = dtau1.norm() * ucell.lat0;
+					distance2 = dtau2.norm() * ucell.lat0;
 
-								// mohan add 2010-12-19
-								if( ORB.nproj[T0] == 0) continue; 
+					// seems a bug here!! mohan 2011-06-17
+					//rcut1 = ORB.Phi[T1].getRcut() + ORB.Beta[T0].get_rcut_max();
+					//rcut2 = ORB.Phi[T2].getRcut() + ORB.Beta[T0].get_rcut_max();
 
-								//const int I0 = GridD.getNatom(ad0);
-								//const int start0 = ucell.itiaiw2iwt(T0, I0, 0);
-								tau0 = GridD.getAdjacentTau(ad0);
+					//if(distance1 < rcut1 && distance2 < rcut2)
+					if(distance1 < Rcut_beta(T1,T0) && distance2 < Rcut_beta(T2,T0))
+					{
+						is_adj = true;
+						// < psi1 | all projectors | psi2 >
+						// ----------------------------- enter the nnr increaing zone -------------------------
+						//for (int j=0; j<atom1->nw*NPOL; j++)
+						nnr_temp = 0;
+						for(int j=0; j<nw_tot1; j++)
+						{
+							//const int j0 = j/NPOL;//added by zhengdy-soc
+							const int iw1_all = start1 + j;
+							const int mu = ParaO.trace_loc_row[iw1_all];
+							if(mu < 0) continue;
+							const int j0 = j/NPOL;//added by zhengdy-soc
 
-								dtau1 = tau0 - tau1;
-								dtau2 = tau0 - tau2;
-								distance1 = dtau1.norm() * ucell.lat0;
-								distance2 = dtau2.norm() * ucell.lat0;
+							// fix a serious bug: atom2[T2] -> atom2
+							// mohan 2010-12-20
+							//for (int k=0; k<atom2->nw*NPOL; k++)
+							for(int k=0; k<nw_tot2; k++)
+							{
+								//const int k0 = k/NPOL;
+								const int iw2_all = start2 + k;
+								const int nu = ParaO.trace_loc_col[iw2_all];						
+								if(nu < 0) continue;
+								const int k0 = k/NPOL;
 
-								// seems a bug here!! mohan 2011-06-17
-								rcut1 = ORB.Phi[T1].getRcut() + ORB.Beta[T0].get_rcut_max();
-								rcut2 = ORB.Phi[T2].getRcut() + ORB.Beta[T0].get_rcut_max();
 
-								if(distance1 < rcut1 && distance2 < rcut2)
+								//const Atom* atom0 = &ucell.atoms[T0];
+								double nlm[3]={0,0,0};
+								complex<double> nlm1[4]={0,0,0,0};//modified by zhengdy-soc
+								complex<double> *nlm2 = NULL;
+								if(NSPIN==4) nlm2 = &nlm1[0];
+								if(!calc_deri)
 								{
-									//const Atom* atom0 = &ucell.atoms[T0];
-									double nlm[3]={0,0,0};
-									complex<double> nlm1[4]={0,0,0,0};//modified by zhengdy-soc
-									complex<double> *nlm2 = NULL;
-									if(NSPIN==4) nlm2 = &nlm1[0];
-									if(!calc_deri)
+									int is0 = (j-j0*NPOL) + (k-k0*NPOL)*2;
+									UOT.snap_psibeta(
+											nlm, 0, tau1, T1,
+											atom1->iw2l[ j0 ], // L1
+											atom1->iw2m[ j0 ], // m1
+											atom1->iw2n[ j0 ], // N1
+											tau2, T2,
+											atom2->iw2l[ k0 ], // L2
+											atom2->iw2m[ k0 ], // m2
+											atom2->iw2n[ k0 ], // n2
+											tau0, T0,
+											nlm2, is0 //for soc
+											);
+
+									if(NSPIN!=4) LM.Hloc_fixedR[nnr+nnr_temp] += nlm[0];
+									else
+									{
+										int is = (j-j0*NPOL) + (k-k0*NPOL)*2;
+										LM.Hloc_fixedR_soc[nnr+nnr_temp] += nlm1[is];
+									}
+
+									/*if(GAMMA_ONLY_LOCAL)
+									{
+										// mohan add 2010-12-20
+										if( nlm[0]!=0.0 )
+										{
+											// ofs_running << setw(10) << iw1_all << setw(10) 
+											// << iw2_all << setw(20) << nlm[0] << endl; 
+											LM.set_HSgamma(iw1_all,iw2_all,nlm[0],'N');//N stands for nonlocal.
+										}
+									}
+									else
+									{
+										if(NSPIN!=4) LM.Hloc_fixedR[nnr+nnr_temp] += nlm[0];
+										else
+										{
+											int is = (j-j0*NPOL) + (k-k0*NPOL)*2;
+											LM.Hloc_fixedR_soc[nnr+nnr_temp] += nlm1[is];
+										}
+									}*/
+								}// calc_deri
+								else // calculate the derivative
+								{
+									// mohan change the order on 2011-06-17
+									// origin: < psi1 | beta > < beta | dpsi2/dtau >
+									//now: < psi1/dtau | beta > < beta | psi2 >
+									UOT.snap_psibeta(
+											nlm, 1, 
+											tau2, 
+											T2,
+											atom2->iw2l[ k0 ], // L2
+											atom2->iw2m[ k0 ], // m2
+											atom2->iw2n[ k0 ], // n2
+											tau1, 
+											T1,
+											atom1->iw2l[ j0 ], // L1
+											atom1->iw2m[ j0 ], // m1
+											atom1->iw2n[ j0 ], // N1
+											tau0, T0
+											);
+
+
+									LM.DHloc_fixedR_x[nnr+nnr_temp] += nlm[0];
+									LM.DHloc_fixedR_y[nnr+nnr_temp] += nlm[1];
+									LM.DHloc_fixedR_z[nnr+nnr_temp] += nlm[2];
+									/*if(GAMMA_ONLY_LOCAL)
 									{
-										int is0 = (j-j0*NPOL) + (k-k0*NPOL)*2;
 										UOT.snap_psibeta(
-												nlm, 0, tau1, T1,
+												nlm, 1, 
+												tau1, 
+												T1,
 												atom1->iw2l[ j0 ], // L1
 												atom1->iw2m[ j0 ], // m1
 												atom1->iw2n[ j0 ], // N1
-												tau2, T2,
+												tau2, 
+												T2,
 												atom2->iw2l[ k0 ], // L2
 												atom2->iw2m[ k0 ], // m2
 												atom2->iw2n[ k0 ], // n2
-												tau0, T0,
-												nlm2, is0 //for soc
+												tau0, T0
 												);
 
-
-										if(GAMMA_ONLY_LOCAL)
-										{
-											// mohan add 2010-12-20
-											if( nlm[0]!=0.0 )
-											{
-												// ofs_running << setw(10) << iw1_all << setw(10) 
-												// << iw2_all << setw(20) << nlm[0] << endl; 
-												LM.set_HSgamma(iw1_all,iw2_all,nlm[0],'N');//N stands for nonlocal.
-											}
-										}
-										else
-										{
-											if(NSPIN!=4) LM.Hloc_fixedR[nnr] += nlm[0];
-											else
-											{
-												int is = (j-j0*NPOL) + (k-k0*NPOL)*2;
-												LM.Hloc_fixedR_soc[nnr] += nlm1[is];
-											}
-										}
-									}// calc_deri
-									else // calculate the derivative
+										// sum all projectors for one atom.
+										LM.set_force (iw1_all, iw2_all,	nlm[0], nlm[1], nlm[2], 'N');
+									}
+									else
 									{
-										if(GAMMA_ONLY_LOCAL)
-										{
-											UOT.snap_psibeta(
-													nlm, 1, 
-													tau1, 
-													T1,
-													atom1->iw2l[ j0 ], // L1
-													atom1->iw2m[ j0 ], // m1
-													atom1->iw2n[ j0 ], // N1
-													tau2, 
-													T2,
-													atom2->iw2l[ k0 ], // L2
-													atom2->iw2m[ k0 ], // m2
-													atom2->iw2n[ k0 ], // n2
-													tau0, T0
-													);
-
-											// sum all projectors for one atom.
-											LM.set_force (iw1_all, iw2_all,	nlm[0], nlm[1], nlm[2], 'N');
-										}
-										else
-										{
-											// mohan change the order on 2011-06-17
-											// origin: < psi1 | beta > < beta | dpsi2/dtau >
-											//now: < psi1/dtau | beta > < beta | psi2 >
-											UOT.snap_psibeta(
-													nlm, 1, 
-													tau2, 
-													T2,
-													atom2->iw2l[ k0 ], // L2
-													atom2->iw2m[ k0 ], // m2
-													atom2->iw2n[ k0 ], // n2
-													tau1, 
-													T1,
-													atom1->iw2l[ j0 ], // L1
-													atom1->iw2m[ j0 ], // m1
-													atom1->iw2n[ j0 ], // N1
-													tau0, T0
-													);
-
-
-											LM.DHloc_fixedR_x[nnr] += nlm[0];
-											LM.DHloc_fixedR_y[nnr] += nlm[1];
-											LM.DHloc_fixedR_z[nnr] += nlm[2];
-										}
-									}//!calc_deri
-								}// distance
-							} // ad0
+										// mohan change the order on 2011-06-17
+										// origin: < psi1 | beta > < beta | dpsi2/dtau >
+										//now: < psi1/dtau | beta > < beta | psi2 >
+										UOT.snap_psibeta(
+												nlm, 1, 
+												tau2, 
+												T2,
+												atom2->iw2l[ k0 ], // L2
+												atom2->iw2m[ k0 ], // m2
+												atom2->iw2n[ k0 ], // n2
+												tau1, 
+												T1,
+												atom1->iw2l[ j0 ], // L1
+												atom1->iw2m[ j0 ], // m1
+												atom1->iw2n[ j0 ], // N1
+												tau0, T0
+												);
+
+
+										LM.DHloc_fixedR_x[nnr+nnr_temp] += nlm[0];
+										LM.DHloc_fixedR_y[nnr+nnr_temp] += nlm[1];
+										LM.DHloc_fixedR_z[nnr+nnr_temp] += nlm[2];
+									}*/
+								}//!calc_deri
+								++nnr_temp;
+							}// k
+						} // j
+							//++nnr;
+					}// distance
+				} // ad0
+
+				if(is_adj)
+				{
+					for(int j=0; j<nw_tot1; j++)
+					{
+						const int iw1_all = start1 + j;
+						const int mu = ParaO.trace_loc_row[iw1_all];
+						if(mu < 0) continue;
+						for(int k=0; k<nw_tot2; k++)
+						{
+							const int iw2_all = start2 + k;
+							const int nu = ParaO.trace_loc_col[iw2_all];						
+							if(nu < 0) continue;
 							++nnr;
-						}// k
-					} // j 
-				}// end is_adj
+						}
+					}
+				}
 				//----------------------------------------------------------------------------------
 			} // ad2
 		} // I1
 	} // T1
 
 
-	if(!GAMMA_ONLY_LOCAL)
-	{
+	//if(!GAMMA_ONLY_LOCAL)
+	//{
 //		cout << " nr="  << nnr << endl;
 //		cout << " LNNR.nnr=" << LNNR.nnr << endl;
 //		ofs_running << " nr="  << nnr << endl;
 //		ofs_running << " LNNR.nnr=" << LNNR.nnr << endl;
-		if( nnr!=LNNR.nnr)
-		{
-			WARNING_QUIT("LCAO_gen_fixedH::build_Nonlocal_mu","nnr!=LNNR.nnr");
-		}
+	if( nnr!=LNNR.nnr)
+	{
+		WARNING_QUIT("LCAO_gen_fixedH::build_Nonlocal_mu","nnr!=LNNR.nnr");
 	}
+	//}
 
 //	cout << " build_Nonlocal_mu done" << endl;
 
@@ -654,6 +715,16 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
     TITLE("LCAO_gen_fixedH","build_Nonlocal_beta");
     timer::tick ("LCAO_gen_fixedH","build_Nonlocal_beta",'G');
 
+	matrix Rcut(ucell.ntype,ucell.ntype);
+	for(int i=0; i<ucell.ntype; ++i)
+	{
+		for(int j=i; j<ucell.ntype; ++j)
+		{
+			Rcut(i,j) = ORB.Phi[i].getRcut() + ORB.Phi[j].getRcut();
+			Rcut(j,i) = Rcut(i,j);
+		}
+	}
+
     for (int T0 = 0; T0 < ucell.ntype; T0++)
     {
 		Atom* atom0 = &ucell.atoms[T0]; 
@@ -664,48 +735,59 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
 
             //(2)
             //for each projector (T0, I0), one pair of ads are used
-            for (int ad=0; ad<GridD.getAdjacentNum()+1 ; ad++)
+			int adjnumplus = GridD.getAdjacentNum()+1;
+            //for (int ad=0; ad<GridD.getAdjacentNum()+1 ; ad++)
+			for(int ad=0; ad < adjnumplus; ad++)
             {
                 const int T1 = GridD.getType(ad);
                 const int I1 = GridD.getNatom(ad);
-				const int iat = ucell.itia2iat(T1, I1);
+				//const int iat = ucell.itia2iat(T1, I1);
                 const int start = ucell.itiaiw2iwt(T1, I1, 0);
                 const Vector3<double> tau1 = GridD.getAdjacentTau(ad);
 				const Atom* atom1 = &ucell.atoms[T1];
+				const int nw_tot1 = atom1->nw*NPOL;
 
 				// use to label < mu | H | nu(prime) >
-				int nnr = LNNR.nlocstart[iat];
+				//int nnr = LNNR.nlocstart[iat];
             
 				//(3)
-				for (int ad2=0; ad2 < GridD.getAdjacentNum()+1 ; ad2++)
+				//for (int ad2=0; ad2 < GridD.getAdjacentNum()+1 ; ad2++)
+				for(int ad2=0; ad2 < adjnumplus; ad2++)
 				{
+					if(ad2<ad && !calc_deri) continue; // add by liuyu 20210406
 					const int T2 = GridD.getType(ad2);
 					const int I2 = GridD.getNatom(ad2);
 					const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
 					const Vector3<double> tau2 = GridD.getAdjacentTau(ad2);
 					const Atom* atom2 = &ucell.atoms[T2];
+					const int nw_tot2 = atom2->nw*NPOL;
 
 					Vector3<double> dtau = tau2 - tau1;
 					double distance = dtau.norm() * ucell.lat0;
-					double rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut();
-					if(distance < rcut)
+					//double rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut();
+					//if(distance < rcut)
+					if(distance < Rcut(T1,T2))
 					{
 						// ------------- enter the nnr increaing zone --------------
-						for (int j=0; j<atom1->nw*NPOL; j++)
+						//for (int j=0; j<atom1->nw*NPOL; j++)
+						for(int j=0; j<nw_tot1; j++)
 						{
-							const int j0 = j/NPOL;
+							//const int j0 = j/NPOL;
 							const int iw1_all = start + j;
 							const int mu = ParaO.trace_loc_row[iw1_all];
-							if(mu < 0)continue; 
+							if(mu < 0)continue;
+							const int j0 = j/NPOL;
 
 							// mohan fix bug 2010-12-20
 							// atom2[T2] -> atom2.
-							for (int k=0; k<atom2->nw*NPOL; k++)
+							//for (int k=0; k<atom2->nw*NPOL; k++)
+							for(int k=0; k<nw_tot2; k++)
 							{
-								const int k0 = k/NPOL;
+								//const int k0 = k/NPOL;
 								const int iw2_all = start2 + k;
 								const int nu = ParaO.trace_loc_col[iw2_all];
 								if(nu < 0)continue;
+								const int k0 = k/NPOL;
 
 								double nlm[3];
 								nlm[0] = nlm[1] = nlm[2] = 0.0;
@@ -724,7 +806,9 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
 											ucell.atoms[T0].tau[I0], T0
 											);
 
-									if(GAMMA_ONLY_LOCAL)
+									LM.set_HSgamma(iw1_all,iw2_all,nlm[0],'N');//N stands for nonlocal.
+									if(ad!=ad2) LM.set_HSgamma(iw2_all,iw1_all,nlm[0],'N'); // add by liuyu 20210406
+									/*if(GAMMA_ONLY_LOCAL)
 									{
 										LM.set_HSgamma(iw1_all,iw2_all,nlm[0],'N');//N stands for nonlocal.
 									}
@@ -734,7 +818,7 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
 //										assert( nnr < LNNR.nnr );
 //										LM.Hloc_fixedR[ nnr ] += nlm[0];
 //										++nnr;
-									}
+									}*/
 								}
 								else  // calculate force
 								{
@@ -750,7 +834,8 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
 											ucell.atoms[T0].tau[I0], T0
 											);
 
-									if(GAMMA_ONLY_LOCAL)
+									LM.set_force(iw1_all, iw2_all, nlm[0], nlm[1], nlm[2], 'N');
+									/*if(GAMMA_ONLY_LOCAL)
 									{
 										//add part of nonlocal ps derivatives to T matrix
 										LM.set_force(iw1_all, iw2_all, nlm[0], nlm[1], nlm[2], 'N');
@@ -762,7 +847,7 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
 										//LM.DHloc_fixedR_y[ nnr ] += nlm[1];
 										//LM.DHloc_fixedR_z[ nnr ] += nlm[2];
 										++nnr;
-									}
+									}*/
 								}
 							}// end k
 						}// j 
@@ -770,7 +855,7 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
                 }// ad2
 				// mohan add 2011-06-16
 
-				if(!GAMMA_ONLY_LOCAL) // mohan fix bug 2011-06-26
+				/*if(!GAMMA_ONLY_LOCAL) // mohan fix bug 2011-06-26
 				{
 					if( iat < ucell.nat-1 )
 					{
@@ -782,12 +867,14 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
 							WARNING_QUIT("build_Nonlocal_beta","nnr");
 						}
 					}
-				}
+				}*/
             }// ad
         }// end I0
     }// end T0
 
     timer::tick ("LCAO_gen_fixedH","build_Nonlocal_beta",'G');
+	//test << "Time = " << 1.0*(clock()-start)/CLOCKS_PER_SEC << " s" << endl;
+	//test.close();
     return;
 }
 

From a87b95fd2818062ecb2222397a6967f8b8e2af6d Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Tue, 6 Apr 2021 15:36:31 +0800
Subject: [PATCH 32/60] delete useless variable itia* in ORB_read

---
 ABACUS.develop/source/src_lcao/ORB_read.cpp | 27 ---------------------
 ABACUS.develop/source/src_lcao/ORB_read.h   |  1 -
 2 files changed, 28 deletions(-)

diff --git a/ABACUS.develop/source/src_lcao/ORB_read.cpp b/ABACUS.develop/source/src_lcao/ORB_read.cpp
index eef6f4d19e..4a19f7ec57 100644
--- a/ABACUS.develop/source/src_lcao/ORB_read.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_read.cpp
@@ -960,33 +960,6 @@ void LCAO_Orbitals::set_nl_index(void)
 	}
 	
 
-	this->itiaib2ib_all.create(ntype, ucell.namax, this->nkb);
-
-	int ib_all = 0;
-	for(int it=0; it<ntype; it++)
-	{
-		for(int ia=0; ia<ucell.atoms[it].na; ia++)
-		{
-			for(int ib=0; ib<ucell.atoms[it].nh; ib++)
-			{
-				itiaib2ib_all(it,ia,ib) = ib_all;
-				++ib_all;
-			}
-			/*
-			for(int ib=0; ib< this->nproj[it]; ib++)
-			{
-				for(int m=0; m< 2*Beta[it].Proj[ib].getL()+1; m++)
-				{
-					itiaib2ib_all(it,ia,ib) = ib_all;
-					++ib_all;
-				}
-			}
-			*/	
-		}
-	}
-	assert(ib_all==nkb);
-
-
 	int nh_max = 0;
 	for(int it=0; it<ntype; it++)
 	{
diff --git a/ABACUS.develop/source/src_lcao/ORB_read.h b/ABACUS.develop/source/src_lcao/ORB_read.h
index b2b3e7a93a..0728158f59 100644
--- a/ABACUS.develop/source/src_lcao/ORB_read.h
+++ b/ABACUS.develop/source/src_lcao/ORB_read.h
@@ -67,7 +67,6 @@ class LCAO_Orbitals
 	int *nproj; //mohan add 2010-12-19
 	int nprojmax; // mohan add 2010-03-07
 	int nkb; // total number of projectors.
-	IntArray itiaib2ib_all;
 	IntArray ib2_ylm;
 	
 	double dr_uniform;

From 70bcce025c77d38e583e8276fbf6d37316c50a74 Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Tue, 6 Apr 2021 15:45:45 +0800
Subject: [PATCH 33/60] delete nkb variable in ORB_read, the variable has been
 defined in ppcell.nkb

---
 ABACUS.develop/source/src_lcao/ORB_read.cpp       | 15 ---------------
 ABACUS.develop/source/src_lcao/ORB_read.h         |  2 --
 ABACUS.develop/source/src_lcao/grid_technique.cpp |  6 +++++-
 3 files changed, 5 insertions(+), 18 deletions(-)

diff --git a/ABACUS.develop/source/src_lcao/ORB_read.cpp b/ABACUS.develop/source/src_lcao/ORB_read.cpp
index 4a19f7ec57..1eba3a72d8 100644
--- a/ABACUS.develop/source/src_lcao/ORB_read.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_read.cpp
@@ -945,21 +945,6 @@ void LCAO_Orbitals::set_nl_index(void)
 
 	assert(this->ntype>0);
 
-	this->nkb=0;
-	for(int it=0; it<ntype; it++)
-	{
-		nkb += ucell.atoms[it].na * ucell.atoms[it].nh;
-//		cout << " projectors for " << ucell.atoms[it].label << " is " << ucell.atoms[it].nh << endl;
-	}
-
-	// mohan update 2011-05-01
-	if(nkb==0)
-	{
-		WARNING("LCAO_Orbitals","No non-local projectos, it must all be H atoms.");
-		return;
-	}
-	
-
 	int nh_max = 0;
 	for(int it=0; it<ntype; it++)
 	{
diff --git a/ABACUS.develop/source/src_lcao/ORB_read.h b/ABACUS.develop/source/src_lcao/ORB_read.h
index 0728158f59..f35f2cff64 100644
--- a/ABACUS.develop/source/src_lcao/ORB_read.h
+++ b/ABACUS.develop/source/src_lcao/ORB_read.h
@@ -66,7 +66,6 @@ class LCAO_Orbitals
 	double Rmax;
 	int *nproj; //mohan add 2010-12-19
 	int nprojmax; // mohan add 2010-03-07
-	int nkb; // total number of projectors.
 	IntArray ib2_ylm;
 	
 	double dr_uniform;
@@ -87,7 +86,6 @@ class LCAO_Orbitals
 	int nchimax_d;	//caoyu add 2021-03-17
 	int ntype; // number of elements
 
-
 	void set_nl_index(void);
 
 };
diff --git a/ABACUS.develop/source/src_lcao/grid_technique.cpp b/ABACUS.develop/source/src_lcao/grid_technique.cpp
index 6fddc38f0d..57168c4ac9 100644
--- a/ABACUS.develop/source/src_lcao/grid_technique.cpp
+++ b/ABACUS.develop/source/src_lcao/grid_technique.cpp
@@ -352,7 +352,11 @@ void Grid_Technique::cal_trace_beta(void)
 {
 	// save the atom information in trace_beta//
 	delete[] trace_beta;
-	int nkb=ORB.nkb;
+
+	// mohan modify 2021-04-06
+	//int nkb=ORB.nkb;
+	int nkb=ppcell.nkb;
+
 	this->trace_beta = new int[nkb];
 	for(int i=0; i<nkb; i++)
 	{

From 220a6161b49c0f7af971179c99f43018057a4bca Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Tue, 6 Apr 2021 16:25:41 +0800
Subject: [PATCH 34/60] add GRID_api in src_external

---
 .../source/src_external/GRID_api/Makefile     | 63 +++++++++++++++++++
 .../src_external/GRID_api/Makefile.Objects    | 26 ++++++++
 .../src_external/GRID_api/Makefile.system     | 14 +++++
 .../src_external/GRID_api/Makefile.vars       | 29 +++++++++
 .../source/src_external/GRID_api/main.cpp     | 45 +++++++++++++
 ABACUS.develop/source/src_lcao/grid_bigcell.h |  1 +
 .../source/src_lcao/grid_meshball.h           |  1 +
 .../source/src_lcao/grid_technique.cpp        | 14 +++--
 .../source/src_lcao/grid_technique.h          |  4 +-
 9 files changed, 191 insertions(+), 6 deletions(-)
 create mode 100644 ABACUS.develop/source/src_external/GRID_api/Makefile
 create mode 100644 ABACUS.develop/source/src_external/GRID_api/Makefile.Objects
 create mode 100644 ABACUS.develop/source/src_external/GRID_api/Makefile.system
 create mode 100644 ABACUS.develop/source/src_external/GRID_api/Makefile.vars
 create mode 100644 ABACUS.develop/source/src_external/GRID_api/main.cpp

diff --git a/ABACUS.develop/source/src_external/GRID_api/Makefile b/ABACUS.develop/source/src_external/GRID_api/Makefile
new file mode 100644
index 0000000000..91366ca57b
--- /dev/null
+++ b/ABACUS.develop/source/src_external/GRID_api/Makefile
@@ -0,0 +1,63 @@
+# This is the Makefile of ABACUS-GRID API
+
+include Makefile.system
+include Makefile.Objects
+
+VPATH=../../src_global\
+:../../src_lcao\
+:./\
+
+#==========================
+# Define HONG
+#==========================
+HONG= -DMETIS -DMKL_ILP64
+
+#==========================
+# OPTIMIZE OPTIONS
+#==========================
+OPTS_GDB = -g -W
+
+#==========================
+# OBJECTS NEEDED
+#==========================
+#FP_OBJS_0=$(OBJS_ORBITAL)\
+#$(OBJS_GLOBAL)\
+#main.o\
+
+FP_OBJS_0=main.o\
+$(OBJS_TRY)\
+$(OBJS_ORBITAL)\
+
+FP_OBJS=$(patsubst %.o, ${OBJ_DIR}/%.o, ${FP_OBJS_0})
+PDIAG_OBJS=$(patsubst %.o, ${OBJ_DIR}/%.o, ${OBJS_PDIAG})
+PDIAG_MR=$(patsubst %.o, ${OBJ_DIR}/%.o, ${PDIAG_MR_0})
+
+#==========================
+# MAKING OPTIONS
+#==========================
+fp_mpi : 
+	@ make init
+	@ make -j $(NP) serial2 
+
+init :
+	@ if [ ! -d $(OBJ_DIR) ]; then mkdir $(OBJ_DIR); fi
+	@ if [ ! -d $(OBJ_DIR)/README ]; then echo "This directory contains all of the .o files" > $(OBJ_DIR)/README; fi
+	@ if [ ! -d ../bin ]; then mkdir ../bin; fi
+
+serial : ${FP_OBJS} ${HEADERS} 
+	${CPLUSPLUS} ${OPTS} $(FP_OBJS) ${LIBS} -o ${VERSION}.x 
+
+serial2 : ${FP_OBJS} 
+	${CPLUSPLUS} ${OPTS} $(FP_OBJS) ${LIBS} -o ${VERSION}.x 
+
+#==========================
+# rules
+#==========================
+${OBJ_DIR}/%.o:%.cpp
+	${CPLUSPLUS_MPI} ${OPTS} ${OPTS_MPI} -c ${HONG} $< -o $@
+${OBJ_DIR}/%.o:%.f
+	${FORTRAN} -c ${HONG} $< -o $@	 
+
+.PHONY:clean
+clean:
+	@ if [ -d $(OBJ_DIR) ]; then rm -rf $(OBJ_DIR); fi
diff --git a/ABACUS.develop/source/src_external/GRID_api/Makefile.Objects b/ABACUS.develop/source/src_external/GRID_api/Makefile.Objects
new file mode 100644
index 0000000000..57b3e6e618
--- /dev/null
+++ b/ABACUS.develop/source/src_external/GRID_api/Makefile.Objects
@@ -0,0 +1,26 @@
+#
+# This is a test makefile for Electronic-structure
+#
+# This particular makefile defines all the executables and object
+# files needed, what they depend on, and the compilation defaults.
+# The file makefile.local is included below.
+# That file defines the actual commands to use to run the C++
+# compiler, library options and directories, etc., all of which are
+# machine specific and depend on the local installation.  Hence the name.
+#
+
+VERSION= ABACUS-GRID
+HEADERS= *.h
+
+OBJS_TRY=math_integral.o\
+complexarray.o\
+complexmatrix.o\
+matrix.o\
+
+OBJS_GRID=grid_base.o\
+grid_base_beta.o\
+grid_bigcell.o\
+grid_meshball.o\
+grid_meshk.o\
+grid_technique.o\
+
diff --git a/ABACUS.develop/source/src_external/GRID_api/Makefile.system b/ABACUS.develop/source/src_external/GRID_api/Makefile.system
new file mode 100644
index 0000000000..c9fa3891cf
--- /dev/null
+++ b/ABACUS.develop/source/src_external/GRID_api/Makefile.system
@@ -0,0 +1,14 @@
+include Makefile.vars
+
+#==========================
+# LIBS and INCLUDES
+#==========================
+LIBS = -lifcore -lm -lpthread 
+
+INCLUDES = -I. -Icommands 
+
+#==========================
+# OPTIMIZE OPTIONS
+#==========================
+OPTS     = ${INCLUDES} -Ofast -std=c++11 -simd -march=native -m64 -Werror -Wall -pedantic -g
+#OPTS_MPI = -cxx=${CPLUSPLUS}
diff --git a/ABACUS.develop/source/src_external/GRID_api/Makefile.vars b/ABACUS.develop/source/src_external/GRID_api/Makefile.vars
new file mode 100644
index 0000000000..f0e5a56adc
--- /dev/null
+++ b/ABACUS.develop/source/src_external/GRID_api/Makefile.vars
@@ -0,0 +1,29 @@
+CPLUSPLUS      = icpc
+#CPLUSPLUS     = /public/intel2017/bin/icpc
+
+#CPLUSPLUS_MPI = mpiicpc
+CPLUSPLUS_MPI = icpc 
+
+LAPACK_DIR    = $(MKLROOT)
+#LAPACK_DIR = /public/intel2017/compilers_and_libraries_2017.1.132/linux/mkl
+#LAPACK_DIR = $(MKLROOT)
+#LAPACK_DIR    = /public/intel2017/mkl
+
+FFTW_DIR = /home/mohan/1_Software/impi_fftw-3.3.8
+#FFTW_DIR = /home/qianrui/intelcompile/impi_fftw
+#FFTW_DIR       = /public/udata/xiaohui/software/fftw2
+#FFTW_DIR       =/opt/fftw/3.3.6-p12/intel/2017.update4
+#FFTW_DIR      = /public/fftw-3.3.8
+
+BOOST_DIR = /home/mohan/1_Software/impi_boost-1.70.0
+#BOOST_DIR = /home/qianrui/intelcompile/impi_boost
+#BOOST_DIR      = /public/udata/xiaohui/software/boost_1_39_0
+#BOOST_DIR      = /opt/boost/1.64.0
+
+ELPA_DIR = /home/mohan/1_Software/impi_elpa-16.05.005
+#ELPA_DIR = /home/qianrui/intelcompile/impi_elpa
+#ELPA_DIR   = /public/udata/xiaohui/ELPA-2016.05.004
+#ELPA_DIR = /opt/elpa/intel_2017_update4
+
+OBJ_DIR = obj
+NP      = 14
diff --git a/ABACUS.develop/source/src_external/GRID_api/main.cpp b/ABACUS.develop/source/src_external/GRID_api/main.cpp
new file mode 100644
index 0000000000..4258a489ff
--- /dev/null
+++ b/ABACUS.develop/source/src_external/GRID_api/main.cpp
@@ -0,0 +1,45 @@
+//#include "timer.h"
+#include <ctime>
+
+void calculate();
+
+int main(int argc, char **argv)
+{
+
+    calculate();
+
+    return 0;
+}
+
+
+void calculate()
+{
+/*
+	time_t time_start = std::time(NULL);
+
+//	timer::start();
+
+	//----------------------------------------------------------
+	// main program for doing electronic structure calculations
+	//----------------------------------------------------------
+//	Driver DD;
+//	DD.init();
+
+	time_t	time_finish= std::time(NULL);
+
+	// print out information before ABACUS ends
+	cout << "\n START  Time  : " << ctime(&time_start);
+	cout << " FINISH Time  : " << ctime(&time_finish);
+	cout << " TOTAL  Time  : " << difftime(time_finish, time_start) << endl;
+
+	double total_time = difftime(time_finish, time_start);
+	int hour = total_time / 3600;
+	int mins = ( total_time - 3600 * hour ) / 60;
+	int secs = total_time - 3600 * hour - 60 * mins ;
+	cout << " Total  Time  : " << hour << " h "
+	            << mins << " mins "
+	            << secs << " secs "<< endl;
+*/
+
+    return;
+}
diff --git a/ABACUS.develop/source/src_lcao/grid_bigcell.h b/ABACUS.develop/source/src_lcao/grid_bigcell.h
index fa951fc136..37e7acea29 100644
--- a/ABACUS.develop/source/src_lcao/grid_bigcell.h
+++ b/ABACUS.develop/source/src_lcao/grid_bigcell.h
@@ -1,5 +1,6 @@
 #ifndef GRID_BIGCELL_H
 #define GRID_BIGCELL_H
+
 #include "../src_pw/tools.h"
 #include "grid_meshcell.h"
 
diff --git a/ABACUS.develop/source/src_lcao/grid_meshball.h b/ABACUS.develop/source/src_lcao/grid_meshball.h
index 67dd4b3cb4..72ccdcc1e1 100644
--- a/ABACUS.develop/source/src_lcao/grid_meshball.h
+++ b/ABACUS.develop/source/src_lcao/grid_meshball.h
@@ -30,6 +30,7 @@ class Grid_MeshBall : public Grid_BigCell
 	// init the meshball radius,
 	// search each meshcell of this meshball.
 	void init_meshball(void);
+
 	void delete_meshball_positions(void); //LiuXh add 2018-12-14
 
 	private:
diff --git a/ABACUS.develop/source/src_lcao/grid_technique.cpp b/ABACUS.develop/source/src_lcao/grid_technique.cpp
index 57168c4ac9..3d75ad84af 100644
--- a/ABACUS.develop/source/src_lcao/grid_technique.cpp
+++ b/ABACUS.develop/source/src_lcao/grid_technique.cpp
@@ -93,6 +93,10 @@ void Grid_Technique::set_pbc_grid(
 	return;
 }
 
+
+// PLEASE update this 'init_atoms_on_grid' to make
+// it adapted to 'cuboid' shape of grid
+// mohan add 2021-04-06
 void Grid_Technique::init_atoms_on_grid(void)
 {
 	TITLE("Grid_Technique","init_atoms_on_grid");
@@ -398,6 +402,7 @@ void Grid_Technique::cal_trace_beta(void)
 }
 
 
+// set 'lgd' variable
 void Grid_Technique::cal_trace_lo(void)
 {	
 	TITLE("Grid_Technique","cal_trace_lo");
@@ -429,10 +434,12 @@ void Grid_Technique::cal_trace_lo(void)
 				if(NSPIN==4)
 				{//added by zhengdy-soc, need to be double in soc
 					nw0 *= 2;
-					lgd += nw0;
+					this->lgd += nw0;
 				}
 				else
-					lgd += ucell.atoms[it].nw;
+				{
+					this->lgd += ucell.atoms[it].nw;
+				}
 				
 				for(int iw=0; iw<nw0; iw++)
 				{
@@ -443,6 +450,7 @@ void Grid_Technique::cal_trace_lo(void)
 			}
 			else
 			{
+				// global index of atomic orbitals
 				iw_all += ucell.atoms[it].nw;
 				if(NSPIN==4) iw_all += ucell.atoms[it].nw;
 			}
@@ -468,5 +476,3 @@ void Grid_Technique::cal_trace_lo(void)
 	assert(iw_all == NLOCAL);
 	return;
 }
-
-
diff --git a/ABACUS.develop/source/src_lcao/grid_technique.h b/ABACUS.develop/source/src_lcao/grid_technique.h
index a5dba02873..db183b1b02 100644
--- a/ABACUS.develop/source/src_lcao/grid_technique.h
+++ b/ABACUS.develop/source/src_lcao/grid_technique.h
@@ -1,10 +1,10 @@
-// Author: mohan
-// Date: 2009-10-17
 #ifndef GRID_TECHNIQUE_H
 #define GRID_TECHNIQUE_H
 
 #include "grid_meshball.h"
 
+// Author: mohan
+// Date: 2009-10-17
 class Grid_Technique : public Grid_MeshBall
 {
 	// public variables.

From 78e59d59d1e8ca1588d7639d1c587aa841d19f7b Mon Sep 17 00:00:00 2001
From: zdy <dyzheng@mail.ustc.edu.cn>
Date: Tue, 6 Apr 2021 17:30:35 +0800
Subject: [PATCH 35/60] fixed bug in cell-relax, start refactor after_vc part

---
 ABACUS.develop/source/src_lcao/LOOP_ions.cpp  | 44 ++---------------
 ABACUS.develop/source/src_lcao/run_md.cpp     | 43 ++--------------
 ABACUS.develop/source/src_pw/charge_extra.cpp | 49 +++++++++++++++++++
 ABACUS.develop/source/src_pw/charge_extra.h   |  6 +++
 ABACUS.develop/source/src_pw/ions.cpp         | 42 ++--------------
 ABACUS.develop/source/src_pw/stress_pw.cpp    |  6 +--
 ABACUS.develop/source/src_pw/stress_pw.h      |  2 +-
 7 files changed, 72 insertions(+), 120 deletions(-)

diff --git a/ABACUS.develop/source/src_lcao/LOOP_ions.cpp b/ABACUS.develop/source/src_lcao/LOOP_ions.cpp
index dab0326ae1..1dd1cbfc1c 100644
--- a/ABACUS.develop/source/src_lcao/LOOP_ions.cpp
+++ b/ABACUS.develop/source/src_lcao/LOOP_ions.cpp
@@ -109,32 +109,10 @@ void LOOP_ions::opt_ions(void)
 		
 		time_t eend = time(NULL);
 
-		// PLEASE move the details of CE to other places
-		// mohan add 2021-03-25
-        //xiaohui add 2014-07-07, for second-order extrapolation
-        int iat=0;
+		//for second-order extrapolation
         if(CALCULATION=="relax" || CALCULATION=="cell-relax")
         {
-            for(int it = 0;it < ucell.ntype;it++)
-            {
-                Atom* atom = &ucell.atoms[it];
-                for(int ia =0;ia< ucell.atoms[it].na;ia++)
-                {
-                    CE.pos_old2[3*iat  ] = CE.pos_old1[3*iat  ];
-                    CE.pos_old2[3*iat+1] = CE.pos_old1[3*iat+1];
-                    CE.pos_old2[3*iat+2] = CE.pos_old1[3*iat+2];
-
-                    CE.pos_old1[3*iat  ] = CE.pos_now[3*iat  ];
-                    CE.pos_old1[3*iat+1] = CE.pos_now[3*iat+1];
-                    CE.pos_old1[3*iat+2] = CE.pos_now[3*iat+2];
-
-                    CE.pos_now[3*iat  ] = atom->tau[ia].x*ucell.lat0;
-                    CE.pos_now[3*iat+1] = atom->tau[ia].y*ucell.lat0;
-                    CE.pos_now[3*iat+2] = atom->tau[ia].z*ucell.lat0;
-
-                    iat++;
-                }
-            }
+            CE.update_all_pos(ucell);
         }
 
 		// PLEASE design a proper interface to output potentials,
@@ -173,21 +151,9 @@ void LOOP_ions::opt_ions(void)
 		// PLEASE move the details of CE to other places
 		// mohan add 2021-03-25
         //xiaohui add 2014-07-07, for second-order extrapolation
-        iat=0;
         if(FORCE)
         {
-            for(int it = 0;it < ucell.ntype;it++)
-            {
-                Atom* atom = &ucell.atoms[it];
-                for(int ia =0;ia< ucell.atoms[it].na;ia++)
-                {
-                    CE.pos_next[3*iat  ] = atom->tau[ia].x*ucell.lat0;
-                    CE.pos_next[3*iat+1] = atom->tau[ia].y*ucell.lat0;
-                    CE.pos_next[3*iat+2] = atom->tau[ia].z*ucell.lat0;
-
-                    iat++;
-                }
-            }
+            CE.save_pos_next(ucell);
         }
 		
         if(OUT_LEVEL=="i")
@@ -282,7 +248,7 @@ bool LOOP_ions::force_stress(
             }
             else // ions are not converged
             {
-                CE.istep = istep;
+                CE.update_istep(istep); 
                 CE.extrapolate_charge();
 
                 if(pot.extra_pot=="dm")
@@ -396,7 +362,7 @@ xiaohui modify 2014-08-09*/
             }
             else
             {
-                CE.istep = force_step;
+                CE.update_istep(force_step);
                 CE.extrapolate_charge();
 
                 if(pot.extra_pot=="dm")
diff --git a/ABACUS.develop/source/src_lcao/run_md.cpp b/ABACUS.develop/source/src_lcao/run_md.cpp
index 94058284bb..92f0f20d52 100644
--- a/ABACUS.develop/source/src_lcao/run_md.cpp
+++ b/ABACUS.develop/source/src_lcao/run_md.cpp
@@ -128,28 +128,7 @@ void Run_MD::opt_ions(void)
 		time_t eend = time(NULL);
 
         //xiaohui add 2014-07-07, for second-order extrapolation
-		int iat=0;
-
-		for(int it = 0;it < ucell.ntype;it++)
-		{
-			Atom* atom = &ucell.atoms[it];
-			for(int ia =0;ia< ucell.atoms[it].na;ia++)
-			{
-				CE.pos_old2[3*iat  ] = CE.pos_old1[3*iat  ];
-				CE.pos_old2[3*iat+1] = CE.pos_old1[3*iat+1];
-				CE.pos_old2[3*iat+2] = CE.pos_old1[3*iat+2];
-
-				CE.pos_old1[3*iat  ] = CE.pos_now[3*iat  ];
-				CE.pos_old1[3*iat+1] = CE.pos_now[3*iat+1];
-				CE.pos_old1[3*iat+2] = CE.pos_now[3*iat+2];
-
-				CE.pos_now[3*iat  ] = atom->tau[ia].x*ucell.lat0;
-				CE.pos_now[3*iat+1] = atom->tau[ia].y*ucell.lat0;
-				CE.pos_now[3*iat+2] = atom->tau[ia].z*ucell.lat0;
-
-				iat++;
-			}
-		}
+		CE.update_all_pos(ucell);
 
 		if(mdtype==1||mdtype==2)   
 		{
@@ -181,22 +160,10 @@ void Run_MD::opt_ions(void)
         time_t fend = time(NULL);
 
         //xiaohui add 2014-07-07, for second-order extrapolation
-		iat=0;
-		for(int it = 0;it < ucell.ntype;it++)
-		{
-			Atom* atom = &ucell.atoms[it];
-			for(int ia =0;ia< ucell.atoms[it].na;ia++)
-			{
-				CE.pos_next[3*iat  ] = atom->tau[ia].x*ucell.lat0;
-				CE.pos_next[3*iat+1] = atom->tau[ia].y*ucell.lat0;
-				CE.pos_next[3*iat+2] = atom->tau[ia].z*ucell.lat0;
-
-				iat++;
-			}
-		}
+		CE.save_pos_next(ucell);
 
 		//xiaohui add CE.istep = istep 2014-07-07
-		CE.istep = istep;
+		CE.update_istep(istep);
 
 		// charge extrapolation if istep>0.
 		CE.extrapolate_charge();
@@ -276,7 +243,7 @@ bool Run_MD::force_stress(const int &istep, int &force_step, int &stress_step)
             }
             else // ions are not converged
             {
-                CE.istep = istep;
+                CE.update_istep(istep);
                 CE.extrapolate_charge();
 
                 if(pot.extra_pot=="dm")
@@ -400,7 +367,7 @@ xiaohui modify 2014-08-09*/
             //atom_arrange::delete_vector( SEARCH_RADIUS );
 #endif
                 //CE.istep = istep;
-                CE.istep = force_step;
+                CE.update_istep(force_step);
                 CE.extrapolate_charge();
 
                 if(pot.extra_pot=="dm")//xiaohui modify 2015-02-01
diff --git a/ABACUS.develop/source/src_pw/charge_extra.cpp b/ABACUS.develop/source/src_pw/charge_extra.cpp
index de5a355fde..3db6aee6b9 100644
--- a/ABACUS.develop/source/src_pw/charge_extra.cpp
+++ b/ABACUS.develop/source/src_pw/charge_extra.cpp
@@ -420,3 +420,52 @@ void Charge_Extra::find_alpha_and_beta(void)
 	}
 	return;
 }
+
+void Charge_Extra::save_pos_next(const UnitCell_pseudo& ucell) // store every atom's tau*lat0 from ucell into pos_next
+{
+	int iat=0; // flat atom counter running over all types; atom iat occupies pos_next[3*iat .. 3*iat+2]
+	for(int it = 0;it < ucell.ntype;it++) // loop over atom types
+    {
+        Atom* atom = &ucell.atoms[it];
+        for(int ia =0;ia< ucell.atoms[it].na;ia++) // loop over the atoms of type 'it'
+        {
+            this->pos_next[3*iat  ] = atom->tau[ia].x*ucell.lat0; // x component of tau scaled by lat0
+            this->pos_next[3*iat+1] = atom->tau[ia].y*ucell.lat0; // y component of tau scaled by lat0
+            this->pos_next[3*iat+2] = atom->tau[ia].z*ucell.lat0; // z component of tau scaled by lat0
+
+            iat++;
+        }
+    }
+	return;
+}
+
+void Charge_Extra::update_istep(const int &step) // setter for the istep member
+{
+	this->istep = step; // callers in this patch pass either istep or force_step right before extrapolate_charge()
+	return;
+}
+
+void Charge_Extra::update_all_pos(const UnitCell_pseudo& ucell) // shift the position history by one slot and record the positions from ucell
+{
+	int iat = 0; // flat atom counter running over all types; atom iat occupies entries 3*iat .. 3*iat+2
+	for(int it = 0;it < ucell.ntype;it++) // loop over atom types
+    {
+        Atom* atom = &ucell.atoms[it];
+        for(int ia =0;ia< ucell.atoms[it].na;ia++) // loop over the atoms of type 'it'
+        {
+            this->pos_old2[3*iat  ] = this->pos_old1[3*iat  ]; // pos_old1 -> pos_old2 (must happen before pos_old1 is overwritten below)
+            this->pos_old2[3*iat+1] = this->pos_old1[3*iat+1];
+            this->pos_old2[3*iat+2] = this->pos_old1[3*iat+2];
+
+            this->pos_old1[3*iat  ] = this->pos_now[3*iat  ]; // pos_now -> pos_old1 (must happen before pos_now is overwritten below)
+            this->pos_old1[3*iat+1] = this->pos_now[3*iat+1];
+            this->pos_old1[3*iat+2] = this->pos_now[3*iat+2];
+
+            this->pos_now[3*iat  ] = atom->tau[ia].x*ucell.lat0; // pos_now <- tau scaled by lat0, read from ucell
+            this->pos_now[3*iat+1] = atom->tau[ia].y*ucell.lat0;
+            this->pos_now[3*iat+2] = atom->tau[ia].z*ucell.lat0;
+
+            iat++;
+        }
+    }
+}
diff --git a/ABACUS.develop/source/src_pw/charge_extra.h b/ABACUS.develop/source/src_pw/charge_extra.h
index 67fffaaeb2..cd323f1514 100644
--- a/ABACUS.develop/source/src_pw/charge_extra.h
+++ b/ABACUS.develop/source/src_pw/charge_extra.h
@@ -1,5 +1,6 @@
 #ifndef CHARGE_EXTRA_H
 #define CHARGE_EXTRA_H
+#include "src_pw/unitcell_pseudo.h"
 
 using namespace std;
 
@@ -17,6 +18,11 @@ class Charge_Extra
 	void allocate_ions(void);
 	void extrapolate_charge(void);
 
+	void save_pos_next(const UnitCell_pseudo& ucell);
+	void update_istep(const int &step);
+	void update_all_pos(const UnitCell_pseudo& ucell);
+
+	private:
 	// use "istep = ions.istep"
 	int istep;
 
diff --git a/ABACUS.develop/source/src_pw/ions.cpp b/ABACUS.develop/source/src_pw/ions.cpp
index d5b629cd39..0ac0ddcdc1 100644
--- a/ABACUS.develop/source/src_pw/ions.cpp
+++ b/ABACUS.develop/source/src_pw/ions.cpp
@@ -156,29 +156,9 @@ void Ions::opt_ions_pw(void)
 		}
 	
 
-		int iat=0; //LiuXh add 20180619
 		if(CALCULATION=="relax"|| CALCULATION=="md" || CALCULATION=="cell-relax")
 		{
-			for(int it = 0;it < ucell.ntype;it++)
-			{
-				Atom* atom = &ucell.atoms[it];
-				for(int ia =0;ia< ucell.atoms[it].na;ia++)
-				{
-					CE.pos_old2[3*iat  ] = CE.pos_old1[3*iat  ];
-					CE.pos_old2[3*iat+1] = CE.pos_old1[3*iat+1];
-					CE.pos_old2[3*iat+2] = CE.pos_old1[3*iat+2];
-
-					CE.pos_old1[3*iat  ] = CE.pos_now[3*iat  ];
-					CE.pos_old1[3*iat+1] = CE.pos_now[3*iat+1];
-					CE.pos_old1[3*iat+2] = CE.pos_now[3*iat+2];
-
-					CE.pos_now[3*iat  ] = atom->tau[ia].x*ucell.lat0;
-					CE.pos_now[3*iat+1] = atom->tau[ia].y*ucell.lat0;
-					CE.pos_now[3*iat+2] = atom->tau[ia].z*ucell.lat0;
-
-					iat++;
-				}
-			}
+			CE.update_all_pos(ucell);
 		}
 
 		if(pot.out_potential == 2)
@@ -404,24 +384,10 @@ bool Ions::force_stress(const int &istep, int &force_step, int &stress_step)  //
             }
             else
             {
-                //stress_step = 1;
-                pw.setup_structure_factor();
-                int iat=0; //LiuXh add 20180619
-                for(int it = 0;it < ucell.ntype;it++)
-                {
-                    Atom* atom = &ucell.atoms[it];
-                    for(int ia =0;ia< ucell.atoms[it].na;ia++)
-                    {
-                        CE.pos_next[3*iat  ] = atom->tau[ia].x*ucell.lat0;
-                        CE.pos_next[3*iat+1] = atom->tau[ia].y*ucell.lat0;
-                        CE.pos_next[3*iat+2] = atom->tau[ia].z*ucell.lat0;
-
-                        iat++;
-                    }
-                }
-                CE.istep = force_step;
-
+                CE.save_pos_next(ucell);
+                CE.update_istep(force_step);
                 CE.extrapolate_charge();
+
                 pot.init_pot( istep, pw.strucFac );
                 wf.wfcinit();
                 ++force_step;
diff --git a/ABACUS.develop/source/src_pw/stress_pw.cpp b/ABACUS.develop/source/src_pw/stress_pw.cpp
index b66d0eae23..127f913b54 100644
--- a/ABACUS.develop/source/src_pw/stress_pw.cpp
+++ b/ABACUS.develop/source/src_pw/stress_pw.cpp
@@ -2,16 +2,14 @@
 #include "./H_XC_pw.h"
 #include "src_pw/vdwd2.h"
 
-void Stress_PW::cal_stress(matrix& sigma)
+void Stress_PW::cal_stress(matrix& sigmatot)
 {
 	TITLE("Stress_PW","cal_stress");
 	timer::tick("Stress_PW","cal_stress",'E');    
 
-	sigma.create(3,3);
+	sigmatot.create(3,3);
 	matrix sigmaxc;
 	sigmaxc.create(3,3);
-	matrix sigmatot;
-	sigmatot.create(3,3);
 	matrix sigmahar;
 	sigmahar.create(3,3);
 	matrix sigmakin;
diff --git a/ABACUS.develop/source/src_pw/stress_pw.h b/ABACUS.develop/source/src_pw/stress_pw.h
index 48e5ce1234..8a4e8f3174 100644
--- a/ABACUS.develop/source/src_pw/stress_pw.h
+++ b/ABACUS.develop/source/src_pw/stress_pw.h
@@ -11,7 +11,7 @@ class Stress_PW:public Stress_Func
 	~Stress_PW (){};
 
 	//calculate the stress in PW basis
-	void cal_stress(matrix& sigma);
+	void cal_stress(matrix& sigmatot);
 
 	private :
 	//call the vdw stress

From 7151cb74aafa9896fe90499b5730c37515184abe Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Tue, 6 Apr 2021 20:46:19 +0800
Subject: [PATCH 36/60] fix a segfault in src_lcao/LCAO_gen_fixedH.cpp

---
 .../source/src_lcao/LCAO_gen_fixedH.cpp       | 427 +++++++-----------
 1 file changed, 171 insertions(+), 256 deletions(-)

diff --git a/ABACUS.develop/source/src_lcao/LCAO_gen_fixedH.cpp b/ABACUS.develop/source/src_lcao/LCAO_gen_fixedH.cpp
index 21694a4c85..8de02c5ad8 100644
--- a/ABACUS.develop/source/src_lcao/LCAO_gen_fixedH.cpp
+++ b/ABACUS.develop/source/src_lcao/LCAO_gen_fixedH.cpp
@@ -15,6 +15,12 @@ void LCAO_gen_fixedH::calculate_NL_no(void)
 {
     TITLE("LCAO_gen_fixedH","calculate_NL_no");
 
+	// PLEASE rebuild the following two functions,
+	// 'build_Nonlocal_beta' and  'build_Nonlocal_mu',
+	// because the two functions are extremely time consuming
+	// for small systems, especially for multiple-k points
+	// mohan note 2021-03-23
+
 	if(GAMMA_ONLY_LOCAL)
 	{
 	  	//for gamma only.
@@ -117,6 +123,8 @@ void LCAO_gen_fixedH::build_ST_new(const char& dtype, const bool& calc_deri)
 							complex<double> *olm2 = &olm1[0];
 							if(!calc_deri)
 							{
+								// PLEASE use UOT as an input parameter of this subroutine
+								// mohan add 2021-03-30
 								UOT.snap_psipsi( olm, 0, dtype, tau1, 
 										T1, L1, m1, N1, GridD.getAdjacentTau(ad), 
 										T2, L2, m2, N2,
@@ -404,24 +412,13 @@ void LCAO_gen_fixedH::build_Nonlocal_mu(const bool &calc_deri)
 	// while beta is in the supercell.
 	// while phi2 is in the supercell.
 
-	int nnr_temp, nnr = 0;
+	int nnr = 0;
 	Vector3<double> tau1, tau2, dtau;
 	Vector3<double> dtau1, dtau2, tau0;
 	double distance = 0.0;
 	double distance1, distance2;
 	double rcut = 0.0;
 	double rcut1, rcut2;
-
-	matrix Rcut(ucell.ntype,ucell.ntype);
-	matrix Rcut_beta(ucell.ntype,ucell.ntype);
-	for(int i=0; i<ucell.ntype; ++i)
-	{
-		for(int j=0; j<ucell.ntype; ++j)
-		{
-			Rcut(i,j) = ORB.Phi[i].getRcut() + ORB.Phi[j].getRcut();
-			Rcut_beta(i,j) = ORB.Phi[i].getRcut() + ORB.Beta[j].get_rcut_max();
-		}
-	}
 		
 //	Record_adj RA;
 //	RA.for_2d();
@@ -430,7 +427,6 @@ void LCAO_gen_fixedH::build_Nonlocal_mu(const bool &calc_deri)
     for (int T1 = 0; T1 < ucell.ntype; ++T1)
     {
 		const Atom* atom1 = &ucell.atoms[T1];
-		const int nw_tot1 = atom1->nw*NPOL;
         for (int I1 =0; I1< atom1->na; ++I1)
         {
             //GridD.Find_atom( atom1->tau[I1] );
@@ -440,13 +436,10 @@ void LCAO_gen_fixedH::build_Nonlocal_mu(const bool &calc_deri)
             tau1 = atom1->tau[I1];
 
 			// psi2
-			int adjnumplus = GridD.getAdjacentNum()+1;
-            //for (int ad2=0; ad2<GridD.getAdjacentNum()+1; ++ad2)
-			for(int ad2=0; ad2<adjnumplus; ++ad2)
+            for (int ad2=0; ad2<GridD.getAdjacentNum()+1; ++ad2)
 			{
 				const int T2 = GridD.getType(ad2);
 				const Atom* atom2 = &ucell.atoms[T2];
-				const int nw_tot2 = atom2->nw*NPOL;
                 
 				const int I2 = GridD.getNatom(ad2);
 				//const int iat2 = ucell.itia2iat(T2, I2);
@@ -454,23 +447,21 @@ void LCAO_gen_fixedH::build_Nonlocal_mu(const bool &calc_deri)
                 tau2 = GridD.getAdjacentTau(ad2);
 
 				bool is_adj = false;
-
-                dtau = tau2 - tau1;
-                distance = dtau.norm() * ucell.lat0;
-                // this rcut is in order to make nnr consistent
-                // with other matrix.
-                //rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut();
-                rcut = Rcut(T1,T2);
-                if(distance < rcut) is_adj = true;
-                /*else if(distance >= rcut)
-                {
-                    //for (int ad0 = 0; ad0 < GridD.getAdjacentNum()+1; ++ad0)
-                    for(int ad0 = 0; ad0 < adjnumplus; ++ad0)
+					
+				dtau = tau2 - tau1;
+				distance = dtau.norm() * ucell.lat0;
+				// this rcut is in order to make nnr consistent 
+				// with other matrix.
+				rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut();
+				if(distance < rcut) is_adj = true;
+				else if(distance >= rcut)
+				{
+                    for (int ad0 = 0; ad0 < GridD.getAdjacentNum()+1; ++ad0)
                     {
-                        const int T0 = GridD.getType(ad0);
-                        //const int I0 = GridD.getNatom(ad0);
-                        //const int T0 = RA.info[iat1][ad0][3];
-                        //const int I0 = RA.info[iat1][ad0][4];
+						const int T0 = GridD.getType(ad0);
+						//const int I0 = GridD.getNatom(ad0);
+						//const int T0 = RA.info[iat1][ad0][3];
+						//const int I0 = RA.info[iat1][ad0][4];
                         //const int iat0 = ucell.itia2iat(T0, I0);
                         //const int start0 = ucell.itiaiw2iwt(T0, I0, 0);
 
@@ -481,227 +472,175 @@ void LCAO_gen_fixedH::build_Nonlocal_mu(const bool &calc_deri)
                         double distance1 = dtau1.norm() * ucell.lat0;
                         double distance2 = dtau2.norm() * ucell.lat0;
 
-                        //rcut1 = ORB.Phi[T1].getRcut() + ORB.Beta[T0].get_rcut_max();
-                        //rcut2 = ORB.Phi[T2].getRcut() + ORB.Beta[T0].get_rcut_max();
+                        rcut1 = ORB.Phi[T1].getRcut() + ORB.Beta[T0].get_rcut_max();
+                        rcut2 = ORB.Phi[T2].getRcut() + ORB.Beta[T0].get_rcut_max();
 
-                        //if( distance1 < rcut1 && distance2 < rcut2 )
-                        if( distance1 < Rcut_beta(T1,T0) && distance2 < Rcut_beta(T2,T0))
+                        if( distance1 < rcut1 && distance2 < rcut2 )
                         {
                             is_adj = true;
                             break;
                         }
                     }
-                }*/
+				}
 
 
-				//(3) run over all projectors in nonlocal pseudopotential.
-				//for (int ad0=0; ad0 < GridD.getAdjacentNum()+1 ; ++ad0)
-				for(int ad0=0; ad0 < adjnumplus; ++ad0)
+				if(is_adj)
 				{
-					const int T0 = GridD.getType(ad0);
-
-					// mohan add 2010-12-19
-					if( ORB.nproj[T0] == 0) continue; 
-
-					//const int I0 = GridD.getNatom(ad0);
-					//const int start0 = ucell.itiaiw2iwt(T0, I0, 0);
-					tau0 = GridD.getAdjacentTau(ad0);
-
-					dtau1 = tau0 - tau1;
-					dtau2 = tau0 - tau2;
-					distance1 = dtau1.norm() * ucell.lat0;
-					distance2 = dtau2.norm() * ucell.lat0;
-
-					// seems a bug here!! mohan 2011-06-17
-					//rcut1 = ORB.Phi[T1].getRcut() + ORB.Beta[T0].get_rcut_max();
-					//rcut2 = ORB.Phi[T2].getRcut() + ORB.Beta[T0].get_rcut_max();
-
-					//if(distance1 < rcut1 && distance2 < rcut2)
-					if(distance1 < Rcut_beta(T1,T0) && distance2 < Rcut_beta(T2,T0))
+					// < psi1 | all projectors | psi2 >
+					// ----------------------------- enter the nnr increaing zone -------------------------
+					for (int j=0; j<atom1->nw*NPOL; j++)
 					{
-						is_adj = true;
-						// < psi1 | all projectors | psi2 >
-						// ----------------------------- enter the nnr increaing zone -------------------------
-						//for (int j=0; j<atom1->nw*NPOL; j++)
-						nnr_temp = 0;
-						for(int j=0; j<nw_tot1; j++)
+						const int j0 = j/NPOL;//added by zhengdy-soc
+						const int iw1_all = start1 + j;
+						const int mu = ParaO.trace_loc_row[iw1_all];
+						if(mu < 0)continue; 
+
+						// fix a serious bug: atom2[T2] -> atom2
+						// mohan 2010-12-20
+						for (int k=0; k<atom2->nw*NPOL; k++)
 						{
-							//const int j0 = j/NPOL;//added by zhengdy-soc
-							const int iw1_all = start1 + j;
-							const int mu = ParaO.trace_loc_row[iw1_all];
-							if(mu < 0) continue;
-							const int j0 = j/NPOL;//added by zhengdy-soc
+							const int k0 = k/NPOL;
+							const int iw2_all = start2 + k;
+							const int nu = ParaO.trace_loc_col[iw2_all];						
+							if(nu < 0)continue;
+
 
-							// fix a serious bug: atom2[T2] -> atom2
-							// mohan 2010-12-20
-							//for (int k=0; k<atom2->nw*NPOL; k++)
-							for(int k=0; k<nw_tot2; k++)
+							//(3) run over all projectors in nonlocal pseudopotential.
+							for (int ad0=0; ad0 < GridD.getAdjacentNum()+1 ; ++ad0)
 							{
-								//const int k0 = k/NPOL;
-								const int iw2_all = start2 + k;
-								const int nu = ParaO.trace_loc_col[iw2_all];						
-								if(nu < 0) continue;
-								const int k0 = k/NPOL;
+								const int T0 = GridD.getType(ad0);
 
+								// mohan add 2010-12-19
+								if( ORB.nproj[T0] == 0) continue; 
 
-								//const Atom* atom0 = &ucell.atoms[T0];
-								double nlm[3]={0,0,0};
-								complex<double> nlm1[4]={0,0,0,0};//modified by zhengdy-soc
-								complex<double> *nlm2 = NULL;
-								if(NSPIN==4) nlm2 = &nlm1[0];
-								if(!calc_deri)
-								{
-									int is0 = (j-j0*NPOL) + (k-k0*NPOL)*2;
-									UOT.snap_psibeta(
-											nlm, 0, tau1, T1,
-											atom1->iw2l[ j0 ], // L1
-											atom1->iw2m[ j0 ], // m1
-											atom1->iw2n[ j0 ], // N1
-											tau2, T2,
-											atom2->iw2l[ k0 ], // L2
-											atom2->iw2m[ k0 ], // m2
-											atom2->iw2n[ k0 ], // n2
-											tau0, T0,
-											nlm2, is0 //for soc
-											);
+								//const int I0 = GridD.getNatom(ad0);
+								//const int start0 = ucell.itiaiw2iwt(T0, I0, 0);
+								tau0 = GridD.getAdjacentTau(ad0);
 
-									if(NSPIN!=4) LM.Hloc_fixedR[nnr+nnr_temp] += nlm[0];
-									else
-									{
-										int is = (j-j0*NPOL) + (k-k0*NPOL)*2;
-										LM.Hloc_fixedR_soc[nnr+nnr_temp] += nlm1[is];
-									}
-
-									/*if(GAMMA_ONLY_LOCAL)
-									{
-										// mohan add 2010-12-20
-										if( nlm[0]!=0.0 )
-										{
-											// ofs_running << setw(10) << iw1_all << setw(10) 
-											// << iw2_all << setw(20) << nlm[0] << endl; 
-											LM.set_HSgamma(iw1_all,iw2_all,nlm[0],'N');//N stands for nonlocal.
-										}
-									}
-									else
-									{
-										if(NSPIN!=4) LM.Hloc_fixedR[nnr+nnr_temp] += nlm[0];
-										else
-										{
-											int is = (j-j0*NPOL) + (k-k0*NPOL)*2;
-											LM.Hloc_fixedR_soc[nnr+nnr_temp] += nlm1[is];
-										}
-									}*/
-								}// calc_deri
-								else // calculate the derivative
-								{
-									// mohan change the order on 2011-06-17
-									// origin: < psi1 | beta > < beta | dpsi2/dtau >
-									//now: < psi1/dtau | beta > < beta | psi2 >
-									UOT.snap_psibeta(
-											nlm, 1, 
-											tau2, 
-											T2,
-											atom2->iw2l[ k0 ], // L2
-											atom2->iw2m[ k0 ], // m2
-											atom2->iw2n[ k0 ], // n2
-											tau1, 
-											T1,
-											atom1->iw2l[ j0 ], // L1
-											atom1->iw2m[ j0 ], // m1
-											atom1->iw2n[ j0 ], // N1
-											tau0, T0
-											);
+								dtau1 = tau0 - tau1;
+								dtau2 = tau0 - tau2;
+								distance1 = dtau1.norm() * ucell.lat0;
+								distance2 = dtau2.norm() * ucell.lat0;
 
+								// seems a bug here!! mohan 2011-06-17
+								rcut1 = ORB.Phi[T1].getRcut() + ORB.Beta[T0].get_rcut_max();
+								rcut2 = ORB.Phi[T2].getRcut() + ORB.Beta[T0].get_rcut_max();
 
-									LM.DHloc_fixedR_x[nnr+nnr_temp] += nlm[0];
-									LM.DHloc_fixedR_y[nnr+nnr_temp] += nlm[1];
-									LM.DHloc_fixedR_z[nnr+nnr_temp] += nlm[2];
-									/*if(GAMMA_ONLY_LOCAL)
+								if(distance1 < rcut1 && distance2 < rcut2)
+								{
+									//const Atom* atom0 = &ucell.atoms[T0];
+									double nlm[3]={0,0,0};
+									complex<double> nlm1[4]={0,0,0,0};//modified by zhengdy-soc
+									complex<double> *nlm2 = NULL;
+									if(NSPIN==4) nlm2 = &nlm1[0];
+									if(!calc_deri)
 									{
+										int is0 = (j-j0*NPOL) + (k-k0*NPOL)*2;
 										UOT.snap_psibeta(
-												nlm, 1, 
-												tau1, 
-												T1,
+												nlm, 0, tau1, T1,
 												atom1->iw2l[ j0 ], // L1
 												atom1->iw2m[ j0 ], // m1
 												atom1->iw2n[ j0 ], // N1
-												tau2, 
-												T2,
+												tau2, T2,
 												atom2->iw2l[ k0 ], // L2
 												atom2->iw2m[ k0 ], // m2
 												atom2->iw2n[ k0 ], // n2
-												tau0, T0
-												);
-
-										// sum all projectors for one atom.
-										LM.set_force (iw1_all, iw2_all,	nlm[0], nlm[1], nlm[2], 'N');
-									}
-									else
-									{
-										// mohan change the order on 2011-06-17
-										// origin: < psi1 | beta > < beta | dpsi2/dtau >
-										//now: < psi1/dtau | beta > < beta | psi2 >
-										UOT.snap_psibeta(
-												nlm, 1, 
-												tau2, 
-												T2,
-												atom2->iw2l[ k0 ], // L2
-												atom2->iw2m[ k0 ], // m2
-												atom2->iw2n[ k0 ], // n2
-												tau1, 
-												T1,
-												atom1->iw2l[ j0 ], // L1
-												atom1->iw2m[ j0 ], // m1
-												atom1->iw2n[ j0 ], // N1
-												tau0, T0
+												tau0, T0,
+												nlm2, is0 //for soc
 												);
 
 
-										LM.DHloc_fixedR_x[nnr+nnr_temp] += nlm[0];
-										LM.DHloc_fixedR_y[nnr+nnr_temp] += nlm[1];
-										LM.DHloc_fixedR_z[nnr+nnr_temp] += nlm[2];
-									}*/
-								}//!calc_deri
-								++nnr_temp;
-							}// k
-						} // j
-							//++nnr;
-					}// distance
-				} // ad0
-
-				if(is_adj)
-				{
-					for(int j=0; j<nw_tot1; j++)
-					{
-						const int iw1_all = start1 + j;
-						const int mu = ParaO.trace_loc_row[iw1_all];
-						if(mu < 0) continue;
-						for(int k=0; k<nw_tot2; k++)
-						{
-							const int iw2_all = start2 + k;
-							const int nu = ParaO.trace_loc_col[iw2_all];						
-							if(nu < 0) continue;
+										if(GAMMA_ONLY_LOCAL)
+										{
+											// mohan add 2010-12-20
+											if( nlm[0]!=0.0 )
+											{
+												// ofs_running << setw(10) << iw1_all << setw(10) 
+												// << iw2_all << setw(20) << nlm[0] << endl; 
+												LM.set_HSgamma(iw1_all,iw2_all,nlm[0],'N');//N stands for nonlocal.
+											}
+										}
+										else
+										{
+											if(NSPIN!=4) LM.Hloc_fixedR[nnr] += nlm[0];
+											else
+											{
+												int is = (j-j0*NPOL) + (k-k0*NPOL)*2;
+												LM.Hloc_fixedR_soc[nnr] += nlm1[is];
+											}
+										}
+									}// calc_deri
+									else // calculate the derivative
+									{
+										if(GAMMA_ONLY_LOCAL)
+										{
+											UOT.snap_psibeta(
+													nlm, 1, 
+													tau1, 
+													T1,
+													atom1->iw2l[ j0 ], // L1
+													atom1->iw2m[ j0 ], // m1
+													atom1->iw2n[ j0 ], // N1
+													tau2, 
+													T2,
+													atom2->iw2l[ k0 ], // L2
+													atom2->iw2m[ k0 ], // m2
+													atom2->iw2n[ k0 ], // n2
+													tau0, T0
+													);
+
+											// sum all projectors for one atom.
+											LM.set_force (iw1_all, iw2_all,	nlm[0], nlm[1], nlm[2], 'N');
+										}
+										else
+										{
+											// mohan change the order on 2011-06-17
+											// origin: < psi1 | beta > < beta | dpsi2/dtau >
+											//now: < psi1/dtau | beta > < beta | psi2 >
+											UOT.snap_psibeta(
+													nlm, 1, 
+													tau2, 
+													T2,
+													atom2->iw2l[ k0 ], // L2
+													atom2->iw2m[ k0 ], // m2
+													atom2->iw2n[ k0 ], // n2
+													tau1, 
+													T1,
+													atom1->iw2l[ j0 ], // L1
+													atom1->iw2m[ j0 ], // m1
+													atom1->iw2n[ j0 ], // N1
+													tau0, T0
+													);
+
+
+											LM.DHloc_fixedR_x[nnr] += nlm[0];
+											LM.DHloc_fixedR_y[nnr] += nlm[1];
+											LM.DHloc_fixedR_z[nnr] += nlm[2];
+										}
+									}//!calc_deri
+								}// distance
+							} // ad0
 							++nnr;
-						}
-					}
-				}
+						}// k
+					} // j 
+				}// end is_adj
 				//----------------------------------------------------------------------------------
 			} // ad2
 		} // I1
 	} // T1
 
 
-	//if(!GAMMA_ONLY_LOCAL)
-	//{
+	if(!GAMMA_ONLY_LOCAL)
+	{
 //		cout << " nr="  << nnr << endl;
 //		cout << " LNNR.nnr=" << LNNR.nnr << endl;
 //		ofs_running << " nr="  << nnr << endl;
 //		ofs_running << " LNNR.nnr=" << LNNR.nnr << endl;
-	if( nnr!=LNNR.nnr)
-	{
-		WARNING_QUIT("LCAO_gen_fixedH::build_Nonlocal_mu","nnr!=LNNR.nnr");
+		if( nnr!=LNNR.nnr)
+		{
+			WARNING_QUIT("LCAO_gen_fixedH::build_Nonlocal_mu","nnr!=LNNR.nnr");
+		}
 	}
-	//}
 
 //	cout << " build_Nonlocal_mu done" << endl;
 
@@ -715,16 +654,6 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
     TITLE("LCAO_gen_fixedH","build_Nonlocal_beta");
     timer::tick ("LCAO_gen_fixedH","build_Nonlocal_beta",'G');
 
-	matrix Rcut(ucell.ntype,ucell.ntype);
-	for(int i=0; i<ucell.ntype; ++i)
-	{
-		for(int j=i; j<ucell.ntype; ++j)
-		{
-			Rcut(i,j) = ORB.Phi[i].getRcut() + ORB.Phi[j].getRcut();
-			Rcut(j,i) = Rcut(i,j);
-		}
-	}
-
     for (int T0 = 0; T0 < ucell.ntype; T0++)
     {
 		Atom* atom0 = &ucell.atoms[T0]; 
@@ -735,59 +664,49 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
 
             //(2)
             //for each projector (T0, I0), one pair of ads are used
-			int adjnumplus = GridD.getAdjacentNum()+1;
-            //for (int ad=0; ad<GridD.getAdjacentNum()+1 ; ad++)
-			for(int ad=0; ad < adjnumplus; ad++)
+            for (int ad=0; ad<GridD.getAdjacentNum()+1 ; ad++)
             {
                 const int T1 = GridD.getType(ad);
                 const int I1 = GridD.getNatom(ad);
-				//const int iat = ucell.itia2iat(T1, I1);
+				const int iat = ucell.itia2iat(T1, I1);
                 const int start = ucell.itiaiw2iwt(T1, I1, 0);
                 const Vector3<double> tau1 = GridD.getAdjacentTau(ad);
 				const Atom* atom1 = &ucell.atoms[T1];
-				const int nw_tot1 = atom1->nw*NPOL;
 
 				// use to label < mu | H | nu(prime) >
-				//int nnr = LNNR.nlocstart[iat];
+				int nnr = LNNR.nlocstart[iat];
             
 				//(3)
-				//for (int ad2=0; ad2 < GridD.getAdjacentNum()+1 ; ad2++)
-				for(int ad2=0; ad2 < adjnumplus; ad2++)
+				for (int ad2=0; ad2 < GridD.getAdjacentNum()+1 ; ad2++)
 				{
-					if(ad2<ad && !calc_deri) continue; // add by liuyu 20210406
+					if(ad2<ad && !calc_deri) continue; //add by liuyu 20210406
 					const int T2 = GridD.getType(ad2);
 					const int I2 = GridD.getNatom(ad2);
 					const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
 					const Vector3<double> tau2 = GridD.getAdjacentTau(ad2);
 					const Atom* atom2 = &ucell.atoms[T2];
-					const int nw_tot2 = atom2->nw*NPOL;
 
 					Vector3<double> dtau = tau2 - tau1;
 					double distance = dtau.norm() * ucell.lat0;
-					//double rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut();
-					//if(distance < rcut)
-					if(distance < Rcut(T1,T2))
+					double rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut();
+					if(distance < rcut)
 					{
 						// ------------- enter the nnr increaing zone --------------
-						//for (int j=0; j<atom1->nw*NPOL; j++)
-						for(int j=0; j<nw_tot1; j++)
+						for (int j=0; j<atom1->nw*NPOL; j++)
 						{
-							//const int j0 = j/NPOL;
+							const int j0 = j/NPOL;
 							const int iw1_all = start + j;
 							const int mu = ParaO.trace_loc_row[iw1_all];
-							if(mu < 0)continue;
-							const int j0 = j/NPOL;
+							if(mu < 0)continue; 
 
 							// mohan fix bug 2010-12-20
 							// atom2[T2] -> atom2.
-							//for (int k=0; k<atom2->nw*NPOL; k++)
-							for(int k=0; k<nw_tot2; k++)
+							for (int k=0; k<atom2->nw*NPOL; k++)
 							{
-								//const int k0 = k/NPOL;
+								const int k0 = k/NPOL;
 								const int iw2_all = start2 + k;
 								const int nu = ParaO.trace_loc_col[iw2_all];
 								if(nu < 0)continue;
-								const int k0 = k/NPOL;
 
 								double nlm[3];
 								nlm[0] = nlm[1] = nlm[2] = 0.0;
@@ -806,11 +725,10 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
 											ucell.atoms[T0].tau[I0], T0
 											);
 
-									LM.set_HSgamma(iw1_all,iw2_all,nlm[0],'N');//N stands for nonlocal.
-									if(ad!=ad2) LM.set_HSgamma(iw2_all,iw1_all,nlm[0],'N'); // add by liuyu 20210406
-									/*if(GAMMA_ONLY_LOCAL)
+									if(GAMMA_ONLY_LOCAL)
 									{
 										LM.set_HSgamma(iw1_all,iw2_all,nlm[0],'N');//N stands for nonlocal.
+										if(ad!=ad2) LM.set_HSgamma(iw2_all,iw1_all,nlm[0],'N'); //add by liuyu 20210406
 									}
 									else
 									{
@@ -818,7 +736,7 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
 //										assert( nnr < LNNR.nnr );
 //										LM.Hloc_fixedR[ nnr ] += nlm[0];
 //										++nnr;
-									}*/
+									}
 								}
 								else  // calculate force
 								{
@@ -834,8 +752,7 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
 											ucell.atoms[T0].tau[I0], T0
 											);
 
-									LM.set_force(iw1_all, iw2_all, nlm[0], nlm[1], nlm[2], 'N');
-									/*if(GAMMA_ONLY_LOCAL)
+									if(GAMMA_ONLY_LOCAL)
 									{
 										//add part of nonlocal ps derivatives to T matrix
 										LM.set_force(iw1_all, iw2_all, nlm[0], nlm[1], nlm[2], 'N');
@@ -847,7 +764,7 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
 										//LM.DHloc_fixedR_y[ nnr ] += nlm[1];
 										//LM.DHloc_fixedR_z[ nnr ] += nlm[2];
 										++nnr;
-									}*/
+									}
 								}
 							}// end k
 						}// j 
@@ -855,7 +772,7 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
                 }// ad2
 				// mohan add 2011-06-16
 
-				/*if(!GAMMA_ONLY_LOCAL) // mohan fix bug 2011-06-26
+				if(!GAMMA_ONLY_LOCAL) // mohan fix bug 2011-06-26
 				{
 					if( iat < ucell.nat-1 )
 					{
@@ -867,14 +784,12 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
 							WARNING_QUIT("build_Nonlocal_beta","nnr");
 						}
 					}
-				}*/
+				}
             }// ad
         }// end I0
     }// end T0
 
     timer::tick ("LCAO_gen_fixedH","build_Nonlocal_beta",'G');
-	//test << "Time = " << 1.0*(clock()-start)/CLOCKS_PER_SEC << " s" << endl;
-	//test.close();
     return;
 }
 

From 2fb55330837a26ccad6ddeb06f12b73a76d502b7 Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Tue, 6 Apr 2021 21:59:49 +0800
Subject: [PATCH 37/60] delete NEW_DM global variable, use INPUT.new_dm instead,
 this will be modified further in future

---
 ABACUS.develop/source/input.cpp                |  6 +++---
 ABACUS.develop/source/input.h                  |  2 +-
 ABACUS.develop/source/input_conv.cpp           |  2 +-
 .../source/src_global/global_variable.cpp      |  2 --
 .../source/src_global/global_variable.h        |  1 -
 ABACUS.develop/source/src_io/write_input.cpp   |  2 +-
 ABACUS.develop/source/src_lcao/DM_gamma.cpp    | 10 +++++-----
 ABACUS.develop/source/src_lcao/FORCE_gamma.cpp |  2 +-
 .../source/src_lcao/FORCE_gamma_edm.cpp        |  2 +-
 ABACUS.develop/source/src_lcao/FORCE_k.cpp     |  6 ++++--
 .../source/src_lcao/local_orbital_charge.cpp   |  2 +-
 .../source/src_pdiag/pdiag_double.cpp          | 18 +++++++++---------
 12 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/ABACUS.develop/source/input.cpp b/ABACUS.develop/source/input.cpp
index 12b88882d3..25fa136250 100644
--- a/ABACUS.develop/source/input.cpp
+++ b/ABACUS.develop/source/input.cpp
@@ -457,7 +457,7 @@ void Input::Default(void)
 
 	cell_factor = 1.2; //LiuXh add 20180619
 
-	newDM=1; // Shen Yu add 2019/5/9
+	new_dm=1; // Shen Yu add 2019/5/9
 	mulliken=0;// qi feng add 2019/9/10
 
 //----------------------------------------------------------			//Peize Lin add 2020-04-04
@@ -1662,7 +1662,7 @@ bool Input::Read(const string &fn)
 		}
 		else if (strcmp("newdm", word) == 0)
 		{
-			read_value(ifs, newDM);
+			read_value(ifs, new_dm);
 		}
 //----------------------------------------------------------------------------------
 //         Xin Qu added on 2020-10-29 for DFT+U
@@ -2265,7 +2265,7 @@ void Input::Bcast()
 	
 		//Parallel_Common::bcast_int( epsilon0_choice );
     Parallel_Common::bcast_double( cell_factor); //LiuXh add 20180619
-    Parallel_Common::bcast_int( newDM ); // Shen Yu add 2019/5/9
+    Parallel_Common::bcast_int( new_dm ); // Shen Yu add 2019/5/9
     Parallel_Common::bcast_bool( restart_save ); // Peize Lin add 2020.04.04
     Parallel_Common::bcast_bool( restart_load ); // Peize Lin add 2020.04.04
 
diff --git a/ABACUS.develop/source/input.h b/ABACUS.develop/source/input.h
index 1181fadad4..3385c146ac 100644
--- a/ABACUS.develop/source/input.h
+++ b/ABACUS.develop/source/input.h
@@ -399,7 +399,7 @@ class Input
 //  2: use new DM algorithm and only show key debug information
 //  3: use new DM algorithm and show all detail debug information
 //==========================================================
-    int newDM;
+    int new_dm;
 
 //==========================================================
 //    DFT+U       Xin Qu added on 2020-10-29
diff --git a/ABACUS.develop/source/input_conv.cpp b/ABACUS.develop/source/input_conv.cpp
index f56cef2abd..39093d3816 100644
--- a/ABACUS.develop/source/input_conv.cpp
+++ b/ABACUS.develop/source/input_conv.cpp
@@ -578,7 +578,7 @@ void Input_Conv::Convert(void)
 
     ppcell.cell_factor = INPUT.cell_factor; //LiuXh add 20180619
 
-    NEW_DM=INPUT.newDM;  // Shen Yu add 2019/5/9
+//    NEW_DM=INPUT.new_dm;  // Shen Yu add 2019/5/9
 
 //----------------------------------------------------------
 // main parameters / electrons / spin ( 2/16 )
diff --git a/ABACUS.develop/source/src_global/global_variable.cpp b/ABACUS.develop/source/src_global/global_variable.cpp
index 7e1da67d44..3bffb172c2 100644
--- a/ABACUS.develop/source/src_global/global_variable.cpp
+++ b/ABACUS.develop/source/src_global/global_variable.cpp
@@ -177,5 +177,3 @@ int NPOL      = 1;
 int PRENSPIN  = 1;
 
 bool FINAL_SCF = false; //LiuXh add 20180619
-
-int NEW_DM=0;  // Shen Yu add 2019/5/9
diff --git a/ABACUS.develop/source/src_global/global_variable.h b/ABACUS.develop/source/src_global/global_variable.h
index e86b1b4c3d..b82760862f 100644
--- a/ABACUS.develop/source/src_global/global_variable.h
+++ b/ABACUS.develop/source/src_global/global_variable.h
@@ -196,6 +196,5 @@ extern int test_ion_dynamics;
 extern int test_deconstructor;
 
 extern bool FINAL_SCF; //LiuXh add 20180619
-extern int NEW_DM;  // Shen Yu add 2019/5/9
 
 #endif
diff --git a/ABACUS.develop/source/src_io/write_input.cpp b/ABACUS.develop/source/src_io/write_input.cpp
index aa1cb447fd..5db6d49b03 100644
--- a/ABACUS.develop/source/src_io/write_input.cpp
+++ b/ABACUS.develop/source/src_io/write_input.cpp
@@ -32,7 +32,6 @@ void Input::Print(const string &fn)const
 	OUTP(ofs,"nche_sto",nche_sto,"number of orders for Chebyshev expansion in stochastic DFT");
 	OUTP(ofs,"symmetry",symmetry,"turn symmetry on or off");	
 	OUTP(ofs,"nelec",nelec,"input number of electrons");
-	OUTP(ofs,"newdm",newDM,"");
 
 	ofs << "\n#Parameters (2.PW)" << endl;
 	OUTP(ofs,"ecutwfc",ecutwfc,"#energy cutoff for wave functions");
@@ -92,6 +91,7 @@ void Input::Print(const string &fn)const
 
 	ofs << "\n#Parameters (4.LCAO)" << endl;
 	OUTP(ofs,"basis_type",basis_type,"PW; LCAO in pw; LCAO");
+	OUTP(ofs,"new_dm",new_dm,"Type of density matrix; 0: old 1: new");
 	if(ks_solver=="HPSEPS" || ks_solver=="genelpa" || ks_solver=="scalapack_gvx")
 	{
 		OUTP(ofs,"nb2d",nb2d,"2d distribution of atoms");
diff --git a/ABACUS.develop/source/src_lcao/DM_gamma.cpp b/ABACUS.develop/source/src_lcao/DM_gamma.cpp
index f98f018642..81a89fe62d 100644
--- a/ABACUS.develop/source/src_lcao/DM_gamma.cpp
+++ b/ABACUS.develop/source/src_lcao/DM_gamma.cpp
@@ -108,7 +108,7 @@ int Local_Orbital_Charge::setAlltoallvParameter(MPI_Comm comm_2D, int blacs_ctxt
         int prow, pcol;
         Cblacs_pcoord(blacs_ctxt, pnum, &prow, &pcol);
         receiver_size_process[pnum]=nRow_in_proc[prow]*nCol_in_proc[pcol];
-        if(NEW_DM>1)
+        if(INPUT.new_dm>1)
         {
             OUT(ofs_running,"pnum",pnum);
             OUT(ofs_running,"prow",prow);
@@ -182,7 +182,7 @@ int Local_Orbital_Charge::setAlltoallvParameter(MPI_Comm comm_2D, int blacs_ctxt
                   sender_2D_index, sender_size_process, sender_displacement_process, MPI_INT, comm_2D);
 
 
-    if(NEW_DM>1)
+    if(INPUT.new_dm>1)
     {
         ofs_running<<"receiver_size is "<<receiver_size<<" ; receiver_size of each process is:\n";
         for(int i=0; i<nprocs; ++i)
@@ -281,7 +281,7 @@ void Local_Orbital_Charge::cal_dk_gamma_from_2D(void)
 
     for(int is=0; is<NSPIN; ++is)
     {
-        if(NEW_DM>1)
+        if(INPUT.new_dm>1)
         // outputDM( ParaO.blacs_ctxt, ParaO.nb);
         {
             // int myid;
@@ -319,7 +319,7 @@ void Local_Orbital_Charge::cal_dk_gamma_from_2D(void)
                                                                 // so the row and column index should be switched
             if(sender_buffer[i]!=0) ++nNONZERO;
         }
-        if(NEW_DM>1) 
+        if(INPUT.new_dm>1) 
         {
             OUT(ofs_running,"number of non-zero elements in sender_buffer",nNONZERO);
             OUT(ofs_running,"sender_size",sender_size);
@@ -348,7 +348,7 @@ void Local_Orbital_Charge::cal_dk_gamma_from_2D(void)
             if(receiver_buffer[i]!=0) ++nNONZERO;
         }
 
-        if(NEW_DM>1)
+        if(INPUT.new_dm>1)
         {
             OUT(ofs_running,"number of non-zero elements in receiver_buffer",nNONZERO);
             OUT(ofs_running,"receiver_size",receiver_size);
diff --git a/ABACUS.develop/source/src_lcao/FORCE_gamma.cpp b/ABACUS.develop/source/src_lcao/FORCE_gamma.cpp
index 9a06374dbe..f21b2e23e1 100644
--- a/ABACUS.develop/source/src_lcao/FORCE_gamma.cpp
+++ b/ABACUS.develop/source/src_lcao/FORCE_gamma.cpp
@@ -33,7 +33,7 @@ void Force_LCAO_gamma::ftable_gamma (
     // calculate the 'energy density matrix' here.
     this->cal_foverlap(isforce, isstress, foverlap, soverlap);
 
-    if(NEW_DM>0)
+    if(INPUT.new_dm>0)
     {
         this->cal_ftvnl_dphi(LOC.wfc_dm_2d.dm_gamma, isforce, isstress, ftvnl_dphi, stvnl_dphi);
         this->cal_fvnl_dbeta(LOC.wfc_dm_2d.dm_gamma, isforce, isstress, fvnl_dbeta, svnl_dbeta);
diff --git a/ABACUS.develop/source/src_lcao/FORCE_gamma_edm.cpp b/ABACUS.develop/source/src_lcao/FORCE_gamma_edm.cpp
index e82c9e9e1c..6b725fe446 100644
--- a/ABACUS.develop/source/src_lcao/FORCE_gamma_edm.cpp
+++ b/ABACUS.develop/source/src_lcao/FORCE_gamma_edm.cpp
@@ -655,7 +655,7 @@ void Force_LCAO_gamma::cal_foverlap(
     timer::tick("Force_LCAO_gamma","cal_foverlap",'G');
 
     // set energy density matrix.
-    if(NEW_DM>0)
+    if(INPUT.new_dm>0)
     {
         timer::tick("Force_LCAO_gamma","cal_edm_2d",'H');
 
diff --git a/ABACUS.develop/source/src_lcao/FORCE_k.cpp b/ABACUS.develop/source/src_lcao/FORCE_k.cpp
index 7ba5aa0cc8..3289020223 100644
--- a/ABACUS.develop/source/src_lcao/FORCE_k.cpp
+++ b/ABACUS.develop/source/src_lcao/FORCE_k.cpp
@@ -864,7 +864,8 @@ void Force_LCAO_k::cal_fvnl_dbeta_k(
 											{
 												for(int ipol=0;ipol<3;ipol++)
 												{
-													svnl_dbeta(jpol, ipol) += dm2d[is][iir] * (nlm[jpol] * r1[ipol] + nlm1[jpol] * r0[ipol]);
+													svnl_dbeta(jpol, ipol) += dm2d[is][iir] * 
+													(nlm[jpol] * r1[ipol] + nlm1[jpol] * r0[ipol]);
 												}
 											}
 										}
@@ -883,7 +884,8 @@ void Force_LCAO_k::cal_fvnl_dbeta_k(
 
 	assert( iir == LNNR.nnr );
 
-	if(isstress){
+	if(isstress)
+	{
 		for(int i=0;i<3;i++)
 		{
 			for(int j=0;j<3;j++)
diff --git a/ABACUS.develop/source/src_lcao/local_orbital_charge.cpp b/ABACUS.develop/source/src_lcao/local_orbital_charge.cpp
index 860915b6a1..92d35c2a7f 100644
--- a/ABACUS.develop/source/src_lcao/local_orbital_charge.cpp
+++ b/ABACUS.develop/source/src_lcao/local_orbital_charge.cpp
@@ -119,7 +119,7 @@ void Local_Orbital_Charge::sum_bands(void)
         }
         else if(KS_SOLVER=="genelpa" || KS_SOLVER=="scalapack_gvx")
         {
-            if(NEW_DM>0)
+            if(INPUT.new_dm>0)
             {
                 //density matrix has already been calcualted.
                 timer::tick("LCAO_Charge","cal_dm_2d",'F');
diff --git a/ABACUS.develop/source/src_pdiag/pdiag_double.cpp b/ABACUS.develop/source/src_pdiag/pdiag_double.cpp
index 6a3b6a3c7b..df36a8d0ec 100644
--- a/ABACUS.develop/source/src_pdiag/pdiag_double.cpp
+++ b/ABACUS.develop/source/src_pdiag/pdiag_double.cpp
@@ -549,7 +549,7 @@ void Pdiag_Double::diago_double_begin(
         delete[] eigen;
 	    OUT(ofs_running,"eigenvalues were copied to ekb");
 
-        if(NEW_DM==0)
+        if(INPUT.new_dm==0)
         {
             // convert wave function to band distribution 
 			// and calculate the density matrix in the tranditional way
@@ -655,9 +655,9 @@ void Pdiag_Double::diago_double_begin(
 		}
 		memcpy( ekb, ekb_tmp.data(), sizeof(double)*NBANDS ); 
 		
-		if(NEW_DM==0)
+		if(INPUT.new_dm==0)
 		{
-			throw domain_error("NEW_DM must be 1. "+TO_STRING(__FILE__)+" line "+TO_STRING(__LINE__));
+			throw domain_error("INPUT.new_dm must be 1. "+TO_STRING(__FILE__)+" line "+TO_STRING(__LINE__));
 		}
 	}
 	else if(KS_SOLVER=="lapack_gvx")
@@ -700,9 +700,9 @@ void Pdiag_Double::diago_double_begin(
 			throw runtime_error("M="+TO_STRING(M)+". NBANDS="+TO_STRING(NBANDS)+". "+TO_STRING(__FILE__)+" line "+TO_STRING(__LINE__));
 		}
 		
-		if(NEW_DM==0)
+		if(INPUT.new_dm==0)
 		{
-			throw domain_error("NEW_DM must be 1. "+TO_STRING(__FILE__)+" line "+TO_STRING(__LINE__));
+			throw domain_error("INPUT.new_dm must be 1. "+TO_STRING(__FILE__)+" line "+TO_STRING(__LINE__));
 		}
 	}
 	else if(KS_SOLVER=="scalapack_gvx")
@@ -755,9 +755,9 @@ void Pdiag_Double::diago_double_begin(
 		{
 			throw runtime_error("M="+TO_STRING(M)+". NZ="+TO_STRING(NZ)+". "+TO_STRING(__FILE__)+" line "+TO_STRING(__LINE__));
 		}
-		if(NEW_DM==0)
+		if(INPUT.new_dm==0)
 		{
-			throw domain_error("NEW_DM must be 1. "+TO_STRING(__FILE__)+" line "+TO_STRING(__LINE__));
+			throw domain_error("INPUT.new_dm must be 1. "+TO_STRING(__FILE__)+" line "+TO_STRING(__LINE__));
 		}
 	}	
     //delete[] Stmp; //LiuXh 20171109
@@ -1037,8 +1037,8 @@ void Pdiag_Double::diago_complex_begin(const int &ik, complex<double> **wfc, Com
 		if(M!=NZ)
 			throw runtime_error("M="+TO_STRING(M)+". NZ="+TO_STRING(NZ)+". "+TO_STRING(__FILE__)+" line "+TO_STRING(__LINE__));
 		
-//		if(NEW_DM==0)
-//			throw domain_error("NEW_DM must be 1. "+TO_STRING(__FILE__)+" line "+TO_STRING(__LINE__));
+//		if(INPUT.new_dm==0)
+//			throw domain_error("INPUT.new_dm must be 1. "+TO_STRING(__FILE__)+" line "+TO_STRING(__LINE__));
 		// the follow will be deleted after finish newdm
 		{
 			//change eigenvector matrix from block-cycle distribute matrix to column-divided distribute matrix

From 73631b3d34983076937da4409f0145ea28bff299 Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Tue, 6 Apr 2021 22:22:43 +0800
Subject: [PATCH 38/60] add some comments in ylm and ORB_gen_tables

---
 ABACUS.develop/source/src_global/ylm.cpp      | 12 ++++--
 .../source/src_lcao/ORB_gen_tables.cpp        | 38 +++++++++++--------
 2 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/ABACUS.develop/source/src_global/ylm.cpp b/ABACUS.develop/source/src_global/ylm.cpp
index 1b0c2b7088..e1284e9c4b 100644
--- a/ABACUS.develop/source/src_global/ylm.cpp
+++ b/ABACUS.develop/source/src_global/ylm.cpp
@@ -1045,7 +1045,7 @@ void Ylm::grad_rl_sph_harm
 	return;
 }
 	
-void Ylm::set_coefficients ()
+void Ylm::set_coefficients(void)
 {
 	Ylm::ylmcoef[0] = 1.0 / sqrt(FOUR_PI);
 	Ylm::ylmcoef[1] = sqrt (3.0 / FOUR_PI);
@@ -1086,6 +1086,7 @@ void Ylm::set_coefficients ()
 	return;
 }
 
+
 void Ylm::test1 (void)
 {
 	Vector3<double> R (20.0, 0.0, 0.0);
@@ -1121,6 +1122,7 @@ void Ylm::test1 (void)
 	return;
 }
 
+
 void Ylm::test2 (void)
 {
 	Vector3<double> R (0.1,-0.2,0.5);
@@ -1515,7 +1517,8 @@ void Ylm::rlylm
 	return;
 }
 
-void Ylm::test()
+
+void Ylm::test(void)
 {
 	Vector3<double> R(0.0, 0.0, 1.0);
 	
@@ -1609,6 +1612,7 @@ void Ylm::test()
 	return;
 }
 
+
 void Ylm::ZEROS(double u[], const int& n)
 {
 	for(int i = 0; i < n; i++)
@@ -1618,6 +1622,7 @@ void Ylm::ZEROS(double u[], const int& n)
 	return;
 }
 
+
 //==========================================================
 // MEMBER FUNCTION : 
 // NAME : Fact ( n! )
@@ -1637,6 +1642,7 @@ long double Ylm::Fact(const int n)
 	return f;
 }
 
+
 int Ylm::Semi_Fact(const int n)
 {
 	int semif = 1;
@@ -1647,10 +1653,10 @@ int Ylm::Semi_Fact(const int n)
 	return semif;
 }
 
+
 double Ylm::sgn(const double x)
 {
 	if(x < 0.0) return -1.0;
 	if(x > 0.0) return 1.0;
 	return 0.0;
 }
-
diff --git a/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp b/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp
index 13afa1b0a3..ad8301bf4b 100644
--- a/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp
@@ -134,10 +134,18 @@ void ORB_gen_tables::snap_psibeta(
 		return;
 	}
 
-	timer::tick ("ORB_gen_tables","snap_psibeta",'X');
+//	timer::tick ("ORB_gen_tables","snap_psibeta",'X');
 
+	// PLEASE update this option,
+	// has_so only needs to be determined once,
+	// has_so can be used as a static variable,
+	// or an input parameter
+	// mohan add 2021-04-06
 	bool has_so = 0;
-	if(ORB.Beta[T0].get_count_soc(0)>0 ) has_so = 1;
+	if(ORB.Beta[T0].get_count_soc(0)>0 ) 
+	{
+		has_so = 1;
+	}
 
 	const int nproj = ORB.nproj[T0];
 	bool *calproj = new bool[nproj];
@@ -163,7 +171,10 @@ void ORB_gen_tables::snap_psibeta(
 	bool all_out = true;
 	for(int ip=0; ip<nproj; ip++)
 	{
+
+		// PLEASE note that all projectors should share the same rcut
 		const double Rcut0 = ORB.Beta[T0].Proj[ip].getRcut();
+
 		if( distance10 > (Rcut1 + Rcut0) || distance20 > (Rcut2 + Rcut0) )  
 		{
 			calproj[ip] = false;
@@ -183,7 +194,7 @@ void ORB_gen_tables::snap_psibeta(
 		delete[] calproj;
 		delete[] rmesh1;
 		delete[] rmesh2;
-		timer::tick ("ORB_gen_tables","snap_psibeta",'X');
+//		timer::tick ("ORB_gen_tables","snap_psibeta",'X');
 		return;
 	}
 
@@ -194,7 +205,9 @@ void ORB_gen_tables::snap_psibeta(
 	double psa, psb;
 	double x0a,x1a,x2a,x3a,x123a,x120a,x032a,x031a;
 	double x0b,x1b,x2b,x3b,x123b,x120b,x032b,x031b;
-	
+
+	// PLEASE note that x1a*x2a is called twice, etc.
+	// mohan add 2021-04-06	
 	psa = distance10 / tbeta.dr;
 	iqa = static_cast<int>(psa);
    	x0a = psa - static_cast<double>(iqa);
@@ -218,12 +231,6 @@ void ORB_gen_tables::snap_psibeta(
 	x031b = x0b*x3b*x1b/2.0;
 	
 	//UNIT VECTOR
-			
-	//double unit_vec_dRa[3];
-	//unit_vec_dRa[0] = dRa.x;
-	//unit_vec_dRa[1] = dRa.y;
-	//unit_vec_dRa[2] = dRa.z;
-	
 	double unit_vec_dRb[3];
 	unit_vec_dRb[0] = dRb.x;
 	unit_vec_dRb[1] = dRb.y;
@@ -287,21 +294,18 @@ void ORB_gen_tables::snap_psibeta(
 		if( !calproj[nb] ) continue;
 
 		const int L0 = ORB.Beta[T0].getL_Beta(nb);
-		//const int next_ip = 2* L0 +1;
-	
+		//const int next_ip = 2* L0 +1;	
 
 //-------------------------------------------------------------------
 // move iterations for psi1 and psi2 from cal_fvnl_dbeta 
 // to here --- 2021/03/20 mohan chen
 //-------------------------------------------------------------------
 
-
 		// <psi1 | Beta>
 		const int Opair1 = tbeta.NL_Opair(Tpair1, L1, N1, nb); 
 		// <psi2 | Beta>
 		const int Opair2 = tbeta.NL_Opair(Tpair2, L2, N2, nb); 
 		
-			
 		for(int m0=0; m0<2*L0+1; m0++)
 		{
 			++ip;
@@ -512,8 +516,10 @@ void ORB_gen_tables::snap_psibeta(
 					}
 				}
 				break;
+
 			case 1://need to be added later
-			{break;}
+				{break;}
+
 			default: break;
 		}
 	}
@@ -522,7 +528,7 @@ void ORB_gen_tables::snap_psibeta(
 	delete[] rmesh1;
 	delete[] rmesh2;
 
-	timer::tick ("ORB_gen_tables","snap_psibeta",'X');
+//	timer::tick ("ORB_gen_tables","snap_psibeta",'X');
 	return;
 }
 

From 386695939edf7636867327e34d11c69d54a3a376 Mon Sep 17 00:00:00 2001
From: zdy <dyzheng@mail.ustc.edu.cn>
Date: Wed, 7 Apr 2021 14:30:02 +0800
Subject: [PATCH 39/60] fixed a bug in FORCE&&STRESS, but this part is still
 puzzling

---
 ABACUS.develop/source/src_pw/ions.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/ABACUS.develop/source/src_pw/ions.cpp b/ABACUS.develop/source/src_pw/ions.cpp
index 0ac0ddcdc1..57f5177d5c 100644
--- a/ABACUS.develop/source/src_pw/ions.cpp
+++ b/ABACUS.develop/source/src_pw/ions.cpp
@@ -384,11 +384,15 @@ bool Ions::force_stress(const int &istep, int &force_step, int &stress_step)  //
             }
             else
             {
+				ofs_running << " Setup the structure factor in plane wave basis." << endl;
+                pw.setup_structure_factor();
+				ofs_running << " Setup the extrapolated charge." << endl;
                 CE.save_pos_next(ucell);
                 CE.update_istep(force_step);
                 CE.extrapolate_charge();
-
+				ofs_running << " Setup the Vl+Vh+Vxc according to new structure factor and new charge." << endl;
                 pot.init_pot( istep, pw.strucFac );
+				ofs_running << " Setup the new wave functions?" << endl;
                 wf.wfcinit();
                 ++force_step;
                 return 0;

From e86582be427be47d8908cbeadb1f4a6175b46073 Mon Sep 17 00:00:00 2001
From: zdy <dyzheng@mail.ustc.edu.cn>
Date: Wed, 7 Apr 2021 17:00:22 +0800
Subject: [PATCH 40/60] refactor some repeated code in UnitCell

---
 .../source/src_ions/ions_move_basic.cpp       | 100 ++----------------
 ABACUS.develop/source/src_pw/charge_extra.cpp |  43 ++------
 ABACUS.develop/source/src_pw/unitcell.cpp     |  98 +++++++++++++++++
 ABACUS.develop/source/src_pw/unitcell.h       |   5 +
 4 files changed, 123 insertions(+), 123 deletions(-)

diff --git a/ABACUS.develop/source/src_ions/ions_move_basic.cpp b/ABACUS.develop/source/src_ions/ions_move_basic.cpp
index dbac3e1158..723b2e854a 100644
--- a/ABACUS.develop/source/src_ions/ions_move_basic.cpp
+++ b/ABACUS.develop/source/src_ions/ions_move_basic.cpp
@@ -36,19 +36,13 @@ void Ions_Move_Basic::setup_gradient(double* pos, double *grad, const matrix &fo
 	// the unit of pos: Bohr.
 	// the unit of force: Ry/Bohr.
 	// the unit of gradient: 
+	ucell.save_cartesian_position(pos);
 	int iat=0;
 	for(int it = 0;it < ucell.ntype;it++)
 	{
 		Atom* atom = &ucell.atoms[it];
 		for(int ia =0;ia< ucell.atoms[it].na;ia++)
 		{	
-			pos[3*iat  ] = atom->tau[ia].x*ucell.lat0;
-			pos[3*iat+1] = atom->tau[ia].y*ucell.lat0;
-			pos[3*iat+2] = atom->tau[ia].z*ucell.lat0;
-
-			// mohan remove mbl constrain 2010-04-26
-			// mohan add mbl constrain 2010-07-11
-			// mohan add ucell.lat0 2010-07-27
 			if(atom->mbl[ia].x == 1)
 			{
 				grad[3*iat  ] = -force(iat, 0)*ucell.lat0;
@@ -77,15 +71,12 @@ void Ions_Move_Basic::move_atoms(double *move, double *pos)
 	assert(move!=NULL);
 	assert(pos!=NULL);
 
-	// unit: Bohr
-	int iat=0;
-
 	//------------------------
 	// for test only
 	//------------------------
 	if(test_ion_dynamics)
 	{
-		iat=0;
+		int iat=0;
 		ofs_running << "\n movement of ions (unit is Bohr) : " << endl;
 		ofs_running << " " << setw(12) << "Atom" << setw(15) << "x" << setw(15) << "y" << setw(15) << "z" << endl;
 		for(int it = 0;it < ucell.ntype;it++)
@@ -105,89 +96,20 @@ void Ions_Move_Basic::move_atoms(double *move, double *pos)
 		assert( iat == ucell.nat );
 	}
 
-	iat = 0;
-	double move_threshold = 1.0e-10;
-	for(int it = 0;it < ucell.ntype;it++)
+	const double move_threshold = 1.0e-10;
+	const int total_freedom = ucell.nat * 3;
+	for(int i =0;i<total_freedom;i++)
 	{
-		Atom* atom = &ucell.atoms[it];
-		for(int ia =0;ia< atom->na;ia++)
+		if( abs(move[i]) > move_threshold )
 		{
-			// mohan add 2010-08-06
-			// otherwise, there might be bug for
-			// sltk_grid, on system CO when C
-			// atom is put on (0,0,0)
-			for(int i=0; i<3; i++)
-			{
-				if( abs(move[3*iat+i]) < move_threshold )
-				{
-					move[3*iat+i] = 0.0;
-				}
-			}
-		
-			// mohan modify 2010-04-26
-			if(atom->mbl[ia].x!=0)
-			{
-				atom->tau[ia].x = (move[3*iat]+pos[3*iat])/ucell.lat0;
-			}
-			if(atom->mbl[ia].y!=0)
-			{
-				atom->tau[ia].y = (move[3*iat+1]+pos[3*iat+1])/ucell.lat0;
-			}
-			if(atom->mbl[ia].z!=0)
-			{
-				atom->tau[ia].z = (move[3*iat+2]+pos[3*iat+2])/ucell.lat0;
-			}
-
-			// the direct coordinates also need to be updated.
-			atom->taud[ia] = atom->tau[ia] * ucell.GT;
-//			cout << " tau=" << atom->tau[ia].x << " " << atom->tau[ia].y << " " << atom->tau[ia].z << endl;
-			iat++;
+			pos[i] += move[i];
 		}
 	}
-	assert(iat == ucell.nat);
-
-	//----------------------------------------------
-	// because of the periodic boundary condition
-	// we need to adjust the atom positions,
-	// first adjust direct coordinates,
-	// then update them into cartesian coordinates,
-	//----------------------------------------------
-	for(int it=0; it<ucell.ntype; it++)
-	{
-		Atom* atom = &ucell.atoms[it];
-		for(int ia=0; ia<atom->na; ia++)
-		{
-			// mohan update 2011-03-21
-			if(atom->taud[ia].x<0) atom->taud[ia].x += 1.0;
-			if(atom->taud[ia].y<0) atom->taud[ia].y += 1.0;
-			if(atom->taud[ia].z<0) atom->taud[ia].z += 1.0;
-			if(atom->taud[ia].x>=1.0) atom->taud[ia].x -= 1.0;
-			if(atom->taud[ia].y>=1.0) atom->taud[ia].y -= 1.0;
-			if(atom->taud[ia].z>=1.0) atom->taud[ia].z -= 1.0;
-
-			if(atom->taud[ia].x<0 || atom->taud[ia].y<0
-				|| atom->taud[ia].z<0 ||
-				atom->taud[ia].x>=1.0 ||
-				atom->taud[ia].y>=1.0 ||
-				atom->taud[ia].z>=1.0)
-			{
-				ofs_warning << " it=" << it+1 << " ia=" << ia+1 << endl;
-				ofs_warning << "d=" << atom->taud[ia].x << " " << 
-				atom->taud[ia].y << " " << atom->taud[ia].z << endl;
-				WARNING_QUIT("Ions_Move_Basic::move_ions","the movement of atom is larger than the length of cell.");
-			}
+	ucell.update_pos_tau(pos);
 
-			atom->tau[ia] = atom->taud[ia] * ucell.latvec;
-		}
-	}
-//2015-09-16
-#ifdef __MPI
-    MPI_Barrier(MPI_COMM_WORLD);
-    for (int i=0;i<ucell.ntype;i++)
-    {
-        ucell.atoms[i].bcast_atom(); // bcast tau array
-    }
-#endif
+	ucell.periodic_boundary_adjustment();
+	
+	ucell.bcast_atoms_tau();
 
 	//--------------------------------------------
 	// Print out the structure file.
diff --git a/ABACUS.develop/source/src_pw/charge_extra.cpp b/ABACUS.develop/source/src_pw/charge_extra.cpp
index 3db6aee6b9..13e0cdac34 100644
--- a/ABACUS.develop/source/src_pw/charge_extra.cpp
+++ b/ABACUS.develop/source/src_pw/charge_extra.cpp
@@ -423,19 +423,7 @@ void Charge_Extra::find_alpha_and_beta(void)
 
 void Charge_Extra::save_pos_next(const UnitCell_pseudo& ucell)
 {
-	int iat=0;
-	for(int it = 0;it < ucell.ntype;it++)
-    {
-        Atom* atom = &ucell.atoms[it];
-        for(int ia =0;ia< ucell.atoms[it].na;ia++)
-        {
-            this->pos_next[3*iat  ] = atom->tau[ia].x*ucell.lat0;
-            this->pos_next[3*iat+1] = atom->tau[ia].y*ucell.lat0;
-            this->pos_next[3*iat+2] = atom->tau[ia].z*ucell.lat0;
-
-            iat++;
-        }
-    }
+	ucell.save_cartesian_position(this->pos_next);
 	return;
 }
 
@@ -447,25 +435,12 @@ void Charge_Extra::update_istep(const int &step)
 
 void Charge_Extra::update_all_pos(const UnitCell_pseudo& ucell)
 {
-	int iat = 0;
-	for(int it = 0;it < ucell.ntype;it++)
-    {
-        Atom* atom = &ucell.atoms[it];
-        for(int ia =0;ia< ucell.atoms[it].na;ia++)
-        {
-            this->pos_old2[3*iat  ] = this->pos_old1[3*iat  ];
-            this->pos_old2[3*iat+1] = this->pos_old1[3*iat+1];
-            this->pos_old2[3*iat+2] = this->pos_old1[3*iat+2];
-
-            this->pos_old1[3*iat  ] = this->pos_now[3*iat  ];
-            this->pos_old1[3*iat+1] = this->pos_now[3*iat+1];
-            this->pos_old1[3*iat+2] = this->pos_now[3*iat+2];
-
-            this->pos_now[3*iat  ] = atom->tau[ia].x*ucell.lat0;
-            this->pos_now[3*iat+1] = atom->tau[ia].y*ucell.lat0;
-            this->pos_now[3*iat+2] = atom->tau[ia].z*ucell.lat0;
-
-            iat++;
-        }
-    }
+	const int total_freedom = ucell.nat * 3;
+	for(int i=0;i<total_freedom;i++)
+	{
+		this->pos_old2[i] = this->pos_old1[i];
+		this->pos_old1[i] = this->pos_now[i];
+	}
+	ucell.save_cartesian_position(this->pos_now);
+	return;
 }
diff --git a/ABACUS.develop/source/src_pw/unitcell.cpp b/ABACUS.develop/source/src_pw/unitcell.cpp
index a31cadaa72..bdd55623a8 100644
--- a/ABACUS.develop/source/src_pw/unitcell.cpp
+++ b/ABACUS.develop/source/src_pw/unitcell.cpp
@@ -266,3 +266,101 @@ void UnitCell::set_iat2it(void)
 	}
 	return;
 }
+
+void UnitCell::update_pos_tau(const double* pos) // set tau = pos/lat0 for components whose mbl flag is non-zero
+{
+    int iat = 0; // running atom index over all types; pos is read 3 entries per atom
+	for(int it = 0;it < this->ntype;it++)
+	{
+		Atom* atom = &this->atoms[it];
+		for(int ia =0;ia< atom->na;ia++)
+		{		
+			if(atom->mbl[ia].x!=0) // components with mbl == 0 keep their old tau
+			{
+				atom->tau[ia].x = pos[3*iat] / this->lat0;
+			}
+			if(atom->mbl[ia].y!=0)
+			{
+				atom->tau[ia].y = pos[3*iat+1] / this->lat0;
+			}
+			if(atom->mbl[ia].z!=0)
+			{
+				atom->tau[ia].z = pos[3*iat+2] / this->lat0;
+			}
+
+			// the direct coordinates also need to be updated: taud = tau * GT.
+			atom->taud[ia] = atom->tau[ia] * this->GT;
+			iat++;
+		}
+	}
+	assert(iat == this->nat); // sanity check: number of atoms visited equals nat
+    return;
+}
+
+void UnitCell::periodic_boundary_adjustment()
+{
+    //----------------------------------------------
+	// Wrap every atom back into the cell: shift each direct
+	// coordinate (taud) by one lattice period towards [0,1),
+	// quit if it is still outside (moved more than one cell),
+	// then rebuild the cartesian tau = taud * latvec.
+	//----------------------------------------------
+	for(int it=0; it<this->ntype; it++)
+	{
+		Atom* atom = &this->atoms[it];
+		for(int ia=0; ia<atom->na; ia++)
+		{
+			// mohan update 2011-03-21: a single +/-1.0 shift per component
+			if(atom->taud[ia].x<0) atom->taud[ia].x += 1.0;
+			if(atom->taud[ia].y<0) atom->taud[ia].y += 1.0;
+			if(atom->taud[ia].z<0) atom->taud[ia].z += 1.0;
+			if(atom->taud[ia].x>=1.0) atom->taud[ia].x -= 1.0;
+			if(atom->taud[ia].y>=1.0) atom->taud[ia].y -= 1.0;
+			if(atom->taud[ia].z>=1.0) atom->taud[ia].z -= 1.0;
+
+			if(atom->taud[ia].x<0 || atom->taud[ia].y<0
+				|| atom->taud[ia].z<0 ||
+				atom->taud[ia].x>=1.0 ||
+				atom->taud[ia].y>=1.0 ||
+				atom->taud[ia].z>=1.0)
+			{
+				ofs_warning << " it=" << it+1 << " ia=" << ia+1 << endl;
+				ofs_warning << "d=" << atom->taud[ia].x << " " << 
+				atom->taud[ia].y << " " << atom->taud[ia].z << endl;
+				WARNING_QUIT("UnitCell::periodic_boundary_adjustment","the movement of atom is larger than the length of cell.");
+			}
+
+			atom->tau[ia] = atom->taud[ia] * this->latvec; // cartesian from wrapped direct coordinates
+		}
+	}
+    return;
+}
+
+void UnitCell::bcast_atoms_tau() // MPI builds only: barrier, then bcast_atom() for each atom type of THIS cell
+{
+#ifdef __MPI
+    MPI_Barrier(MPI_COMM_WORLD);
+    for (int i=0;i<this->ntype;i++) // operate on this object, not on the global ucell
+    {
+        this->atoms[i].bcast_atom(); // bcast tau array
+    }
+#endif
+}
+
+// Write the Cartesian position (tau*lat0) of every atom into pos, 3 entries per atom.
+void UnitCell::save_cartesian_position(double* pos)const
+{
+    int iat=0;
+	for(int it = 0;it < this->ntype;it++)
+	{
+		Atom* atom = &this->atoms[it];
+		for(int ia =0; ia<atom->na; ia++) // bound is THIS type's count; atoms->na would be type 0's
+		{	
+			pos[3*iat  ] = atom->tau[ia].x*this->lat0;
+			pos[3*iat+1] = atom->tau[ia].y*this->lat0;
+			pos[3*iat+2] = atom->tau[ia].z*this->lat0;
+            iat++;
+        }
+    }
+    assert(iat == this->nat); // all nat atoms were written (pos needs 3*nat entries)
+}
\ No newline at end of file
diff --git a/ABACUS.develop/source/src_pw/unitcell.h b/ABACUS.develop/source/src_pw/unitcell.h
index 02139b858a..d66af0d358 100644
--- a/ABACUS.develop/source/src_pw/unitcell.h
+++ b/ABACUS.develop/source/src_pw/unitcell.h
@@ -70,6 +70,11 @@ class UnitCell
     void print_cell_cif(const string &fn)const;
     const double& getNelec(void)const {return electrons_number;}
 
+    void update_pos_tau(const double* pos);
+    void periodic_boundary_adjustment();
+    void bcast_atoms_tau();
+    void save_cartesian_position(double* pos)const;
+
 protected:
 
     double electrons_number;

From 2e0ca29a8a6d1a0583c350fef22d42cf0a94dbba Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Thu, 8 Apr 2021 10:12:33 +0800
Subject: [PATCH 41/60] update grid base

---
 ABACUS.develop/source/src_lcao/grid_base.cpp  |  8 ++-
 ABACUS.develop/source/src_lcao/grid_base.h    | 62 ++++++++++++++-----
 .../source/src_lcao/grid_base_beta.h          |  9 ++-
 3 files changed, 57 insertions(+), 22 deletions(-)

diff --git a/ABACUS.develop/source/src_lcao/grid_base.cpp b/ABACUS.develop/source/src_lcao/grid_base.cpp
index 12c1264e2f..e8382fc421 100644
--- a/ABACUS.develop/source/src_lcao/grid_base.cpp
+++ b/ABACUS.develop/source/src_lcao/grid_base.cpp
@@ -143,6 +143,7 @@ void Grid_Base::init(
 	return;
 }
 
+
 void Grid_Base::get_rcut_max(void)
 {
 	assert( ORB.get_ntype() > 0 );
@@ -200,7 +201,9 @@ void Grid_Base::get_small_box(
 	    tau_dir.x, tau_dir.y, tau_dir.z
 	);
 
-	if (tau_dir.x < 0.0 || tau_dir.x > 1.0 || tau_dir.y < 0.0 || tau_dir.y > 1.0 || tau_dir.z < 0.0 || tau_dir.z > 1.0)
+	if (tau_dir.x < 0.0 || tau_dir.x > 1.0 
+	|| tau_dir.y < 0.0 || tau_dir.y > 1.0 
+	|| tau_dir.z < 0.0 || tau_dir.z > 1.0)
 	{
 		cout << "\n tau.x = " << tau.x;
 		cout << "\n tau.y = " << tau.y;
@@ -209,7 +212,8 @@ void Grid_Base::get_small_box(
 		cout << "\n tau_dir.x = " << tau_dir.x;
 		cout << "\n tau_dir.y = " << tau_dir.y;
 		cout << "\n tau_dir.z = " << tau_dir.z;
-		WARNING_QUIT("Grid_Base::get_small_box","Positions(x,y,z) Of tau and R2 in Direct Coordinates should be between 0 and 1!");
+		WARNING_QUIT("Grid_Base::get_small_box",
+		"Positions(x,y,z) Of tau and R2 in Direct Coordinates should be between 0 and 1!");
 	}
 
 	tau_max_direct = tau_dir + this->Rcut_max_direct[T];
diff --git a/ABACUS.develop/source/src_lcao/grid_base.h b/ABACUS.develop/source/src_lcao/grid_base.h
index dd81eb1266..2514a1597a 100644
--- a/ABACUS.develop/source/src_lcao/grid_base.h
+++ b/ABACUS.develop/source/src_lcao/grid_base.h
@@ -46,40 +46,72 @@ class Grid_Base
 
 	int* ijk_index;
 
-	Matrix3 latvec,latvec0;
-	Vector3<double> a1, a2, a3;
-	double a1_len, a2_len, a3_len;
+	Matrix3 latvec;
+	Matrix3 latvec0;
+
+	Vector3<double> a1;
+	Vector3<double> a2;
+	Vector3<double> a3;
+
+	double a1_len;
+	double a2_len;
+	double a3_len;
+
 	Vector3<double> da_d;
-	double da1, da2, da3;
-	int nx, ny, nz, nxyz;
+
+	double da1;
+	double da2;
+	double da3;
+
+	int nx;
+	int ny;
+	int nz;
+	int nxyz;
+
 	Vector3<double> *cartesian;
+
 	double lat0;
 
 	int test;
+
 	double *Rcut_max;
+
 	Vector3<double> *Rcut_max_direct;
-	int grid_number, grid_number_last;
+
+	int grid_number;
+	int grid_number_last;
 	
-	double *norm1, *norm2;
+	double *norm1;
+	double *norm2;
 	
-	double Rcut1, Rcut2;
+	double Rcut1;
+	double Rcut2;
 	
-	Vector3<double> *dR1, *dR2;
+	Vector3<double> *dR1;
+	Vector3<double> *dR2;
 
 	const Numerical_Orbital_Lm* pointer1;
 	const Numerical_Orbital_Lm* pointer2;
 
-	int iw1_all,iw2_all;
-	int index1, index2;
-	Vector3<int> edge_min, edge_max;
+	int iw1_all;
+	int iw2_all;
+	int index1;
+	int index2;
+
+	Vector3<int> edge_min;
+	Vector3<int> edge_max;
 
 	enum cal_type{ cal_charge, cal_local } job;
 	
 	double** yy1;
 	double** yy2;
-	int n1,n2; // (lmax+1)^2
-	int n1_last, n2_last;
-	int lmax1, lmax2;
+
+	int n1; // (lmax+1)^2
+	int n2;
+	int n1_last;
+	int n2_last;
+	int lmax1;
+	int lmax2;
 
 };
 
diff --git a/ABACUS.develop/source/src_lcao/grid_base_beta.h b/ABACUS.develop/source/src_lcao/grid_base_beta.h
index 3e5c8cc2c5..b4091dec95 100644
--- a/ABACUS.develop/source/src_lcao/grid_base_beta.h
+++ b/ABACUS.develop/source/src_lcao/grid_base_beta.h
@@ -1,13 +1,11 @@
-//=========================================================
-//AUTHOR : mohan
-//DATE : 2008-09-16
-//=========================================================
 #ifndef GRID_BASE_BETA_H
 #define GRID_BASE_BETA_H
 
 #include "../src_pw/tools.h"
 #include "ORB_atomic_lm.h"
 
+//AUTHOR : mohan
+//DATE : 2008-09-16
 // this class is inherited by Grid_Integral_Beta.h
 // this class provides basic Grid operation and the 
 // corresponding information.
@@ -34,7 +32,8 @@ class Grid_Base_Beta
 	double* rho1; // about charge
 	double **density_kernel;
 	double vfactor;
-	Matrix3 latvec,latvec0;
+	Matrix3 latvec;
+	Matrix3 latvec0;
 	int* nnn;
 	double lat0;
 	enum cal_type{ cal_charge, cal_local, cal_vnlb } job;

From ee20bf14610ec55b39417f6f5a381dc9b6bf59bb Mon Sep 17 00:00:00 2001
From: YuLiu <liuyu@stu.pku.edu.cn>
Date: Thu, 8 Apr 2021 10:59:15 +0800
Subject: [PATCH 42/60] seems a bug in src_lcao/LCAO_gen_fixedH.cpp when
 multiple cores are parallel

---
 .../source/src_lcao/LCAO_gen_fixedH.cpp       | 71 +++++++++++--------
 1 file changed, 43 insertions(+), 28 deletions(-)

diff --git a/ABACUS.develop/source/src_lcao/LCAO_gen_fixedH.cpp b/ABACUS.develop/source/src_lcao/LCAO_gen_fixedH.cpp
index 8de02c5ad8..3378918e5a 100644
--- a/ABACUS.develop/source/src_lcao/LCAO_gen_fixedH.cpp
+++ b/ABACUS.develop/source/src_lcao/LCAO_gen_fixedH.cpp
@@ -649,11 +649,21 @@ void LCAO_gen_fixedH::build_Nonlocal_mu(const bool &calc_deri)
 }
 
 
-void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
+void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri) //update by liuyu 2021-04-07
 {
     TITLE("LCAO_gen_fixedH","build_Nonlocal_beta");
     timer::tick ("LCAO_gen_fixedH","build_Nonlocal_beta",'G');
 
+	matrix Rcut;
+	Rcut.create(ucell.ntype, ucell.ntype);
+	for(int i=0; i<ucell.ntype; i++)
+	{
+        for(int j=0; j<ucell.ntype; j++)
+        {
+            Rcut(i,j) = ORB.Phi[i].getRcut() + ORB.Phi[j].getRcut();
+        }
+    }
+	
     for (int T0 = 0; T0 < ucell.ntype; T0++)
     {
 		Atom* atom0 = &ucell.atoms[T0]; 
@@ -668,45 +678,50 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
             {
                 const int T1 = GridD.getType(ad);
                 const int I1 = GridD.getNatom(ad);
-				const int iat = ucell.itia2iat(T1, I1);
+				//const int iat = ucell.itia2iat(T1, I1);
                 const int start = ucell.itiaiw2iwt(T1, I1, 0);
                 const Vector3<double> tau1 = GridD.getAdjacentTau(ad);
 				const Atom* atom1 = &ucell.atoms[T1];
+				const int nw1_tot = atom1->nw*NPOL;
 
 				// use to label < mu | H | nu(prime) >
-				int nnr = LNNR.nlocstart[iat];
+				//int nnr = LNNR.nlocstart[iat];
             
 				//(3)
 				for (int ad2=0; ad2 < GridD.getAdjacentNum()+1 ; ad2++)
 				{
-					if(ad2<ad && !calc_deri) continue; //add by liuyu 20210406
+					//if(ad2<ad && !calc_deri) continue; //add by liuyu 20210406
 					const int T2 = GridD.getType(ad2);
 					const int I2 = GridD.getNatom(ad2);
 					const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
 					const Vector3<double> tau2 = GridD.getAdjacentTau(ad2);
 					const Atom* atom2 = &ucell.atoms[T2];
+					const int nw2_tot = atom2->nw*NPOL;
 
 					Vector3<double> dtau = tau2 - tau1;
 					double distance = dtau.norm() * ucell.lat0;
-					double rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut();
+					double rcut = Rcut(T1,T2);
+					//double rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut();
 					if(distance < rcut)
 					{
 						// ------------- enter the nnr increaing zone --------------
-						for (int j=0; j<atom1->nw*NPOL; j++)
+						//for (int j=0; j<atom1->nw*NPOL; j++)
+						for (int j=0; j<nw1_tot; j++)
 						{
-							const int j0 = j/NPOL;
 							const int iw1_all = start + j;
 							const int mu = ParaO.trace_loc_row[iw1_all];
-							if(mu < 0)continue; 
+							if(mu < 0)continue;
+							const int j0 = j/NPOL;
 
 							// mohan fix bug 2010-12-20
 							// atom2[T2] -> atom2.
-							for (int k=0; k<atom2->nw*NPOL; k++)
+							//for (int k=0; k<atom2->nw*NPOL; k++)
+							for (int k=0; k<nw2_tot; k++)
 							{
-								const int k0 = k/NPOL;
 								const int iw2_all = start2 + k;
 								const int nu = ParaO.trace_loc_col[iw2_all];
 								if(nu < 0)continue;
+								const int k0 = k/NPOL;
 
 								double nlm[3];
 								nlm[0] = nlm[1] = nlm[2] = 0.0;
@@ -725,18 +740,18 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
 											ucell.atoms[T0].tau[I0], T0
 											);
 
-									if(GAMMA_ONLY_LOCAL)
-									{
+									//if(GAMMA_ONLY_LOCAL)
+									//{
 										LM.set_HSgamma(iw1_all,iw2_all,nlm[0],'N');//N stands for nonlocal.
-										if(ad!=ad2) LM.set_HSgamma(iw2_all,iw1_all,nlm[0],'N'); //add by liuyu 20210406
-									}
-									else
-									{
-										WARNING_QUIT("LCAO_gen_fixedH::build_Nonlocal_beta","not consistent with k point algorithm.");
+										//if(ad!=ad2) LM.set_HSgamma(iw2_all,iw1_all,nlm[0],'N'); //add by liuyu 20210406
+									//}
+								//	else
+								//	{
+								//		WARNING_QUIT("LCAO_gen_fixedH::build_Nonlocal_beta","not consistent with k point algorithm.");
 //										assert( nnr < LNNR.nnr );
 //										LM.Hloc_fixedR[ nnr ] += nlm[0];
 //										++nnr;
-									}
+								//	}
 								}
 								else  // calculate force
 								{
@@ -752,19 +767,19 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
 											ucell.atoms[T0].tau[I0], T0
 											);
 
-									if(GAMMA_ONLY_LOCAL)
-									{
+									//if(GAMMA_ONLY_LOCAL)
+									//{
 										//add part of nonlocal ps derivatives to T matrix
 										LM.set_force(iw1_all, iw2_all, nlm[0], nlm[1], nlm[2], 'N');
-									}
-									else
-									{
-										WARNING_QUIT("LCAO_gen_fixedH::build_Nonlocal_beta","not consistent with k point algorithm.");
+									//}
+									//else
+									//{
+										//WARNING_QUIT("LCAO_gen_fixedH::build_Nonlocal_beta","not consistent with k point algorithm.");
 										//LM.DHloc_fixedR_x[ nnr ] += nlm[0];
 										//LM.DHloc_fixedR_y[ nnr ] += nlm[1];
 										//LM.DHloc_fixedR_z[ nnr ] += nlm[2];
-										++nnr;
-									}
+										//++nnr;
+									//}
 								}
 							}// end k
 						}// j 
@@ -772,7 +787,7 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
                 }// ad2
 				// mohan add 2011-06-16
 
-				if(!GAMMA_ONLY_LOCAL) // mohan fix bug 2011-06-26
+				/*if(!GAMMA_ONLY_LOCAL) // mohan fix bug 2011-06-26
 				{
 					if( iat < ucell.nat-1 )
 					{
@@ -784,7 +799,7 @@ void LCAO_gen_fixedH::build_Nonlocal_beta(const bool& calc_deri)
 							WARNING_QUIT("build_Nonlocal_beta","nnr");
 						}
 					}
-				}
+				}*/
             }// ad
         }// end I0
     }// end T0

From 896c5995495768ed13e001bb58ad5af5f7688fe4 Mon Sep 17 00:00:00 2001
From: linpz <linpz@mail.ustc.edu.cn>
Date: Thu, 8 Apr 2021 18:09:55 +0800
Subject: [PATCH 43/60] 1. add BSSE for lcao 	1.1 src_pw/pseudopot_upf.h 
 1.2 src_pw/pseudopot_upf.cpp 	1.3 src_pw/atom_pseudo.h 	1.4
 src_io/read_atoms.cpp 	1.5 src_io/read_pseudopot.cpp 	1.6 src_pw/charge.cpp

2. refactor Charge::atomic_rho()
	2.1 src_pw/charge.cpp
---
 .../source/src_global/math_integral.cpp       |  18 +-
 .../source/src_global/math_integral.h         |  18 +-
 ABACUS.develop/source/src_io/read_atoms.cpp   |   5 +
 .../source/src_io/read_pseudopot.cpp          |   2 +
 ABACUS.develop/source/src_pw/atom_pseudo.h    |   1 +
 ABACUS.develop/source/src_pw/charge.cpp       | 431 +++++++++---------
 .../source/src_pw/pseudopot_upf.cpp           |  13 +
 ABACUS.develop/source/src_pw/pseudopot_upf.h  |   1 +
 8 files changed, 248 insertions(+), 241 deletions(-)

diff --git a/ABACUS.develop/source/src_global/math_integral.cpp b/ABACUS.develop/source/src_global/math_integral.cpp
index ed6a4bdd9c..1b7b994e63 100644
--- a/ABACUS.develop/source/src_global/math_integral.cpp
+++ b/ABACUS.develop/source/src_global/math_integral.cpp
@@ -58,8 +58,8 @@ void Integral::Simpson_Integral
 void Integral::Simpson_Integral
 (
     const int mesh,
-    const double *func,
-    const double *rab,
+    const double * const func,
+    const double * const rab,
     double &asum
 )
 {
@@ -99,7 +99,7 @@ void Integral::Simpson_Integral
 void Integral::Simpson_Integral
 (
     const int mesh,
-    const double *func,
+    const double * const func,
     const double dr,
     double &asum
 )
@@ -140,9 +140,9 @@ void Integral::Simpson_Integral
 void Integral::Simpson_Integral_0toall
 (
     const int mesh,
-    const double *func,
-    const double *rab,
-    double *asum
+    const double * const func,
+    const double * const rab,
+    double * const asum
 )
 {
     // asum(r) = \int_{r'=0}^{r} dr' f(r') 
@@ -204,9 +204,9 @@ void Integral::Simpson_Integral_0toall
 void Integral::Simpson_Integral_alltoinf
 (
     const int mesh,
-    const double *func,
-    const double *rab,
-    double *asum
+    const double * const func,
+    const double * const rab,
+    double * const asum
 )
 {
     Integral::Simpson_Integral_0toall( mesh, func, rab, asum );
diff --git a/ABACUS.develop/source/src_global/math_integral.h b/ABACUS.develop/source/src_global/math_integral.h
index 6dfd206f57..16c64f6a9e 100644
--- a/ABACUS.develop/source/src_global/math_integral.h
+++ b/ABACUS.develop/source/src_global/math_integral.h
@@ -15,8 +15,8 @@ class Integral
     static void Simpson_Integral
     (
         const int mesh,
-        const double *func,
-        const double *rab,
+        const double * const func,
+        const double * const rab,
         double &asum
     );
 
@@ -24,7 +24,7 @@ class Integral
 	static void Simpson_Integral
 	(
 		const int mesh,
-		const double *func,
+		const double * const func,
 		const double dr,
 		double &asum
 	);
@@ -33,18 +33,18 @@ class Integral
     static void Simpson_Integral_0toall
     (
         const int mesh,
-        const double *func,
-        const double *rab,
-        double *asum
+        const double * const func,
+        const double * const rab,
+        double * const asum
     );
 
     // Peize Lin add 2016-02-14
     static void Simpson_Integral_alltoinf
     (
         const int mesh,
-        const double *func,
-        const double *rab,
-        double *asum
+        const double * const func,
+        const double * const rab,
+        double * const asum
     );     
 
 };
diff --git a/ABACUS.develop/source/src_io/read_atoms.cpp b/ABACUS.develop/source/src_io/read_atoms.cpp
index 288a95426b..11f4251180 100644
--- a/ABACUS.develop/source/src_io/read_atoms.cpp
+++ b/ABACUS.develop/source/src_io/read_atoms.cpp
@@ -32,6 +32,11 @@ void UnitCell_pseudo::read_atom_species(ifstream &ifa)
 						<< setw(12) << atom_mass[i] 
 						<< setw(18) << pseudo_fn[i];
 			}
+
+			// Peize Lin test for bsse 2021.04.07
+			const string bsse_label = "empty";
+			if(search( atom_label[i].begin(), atom_label[i].end(), bsse_label.begin(), bsse_label.end() ) != atom_label[i].end())
+				this->atoms[i].flag_empty_element = true;
 		}
 	}
 
diff --git a/ABACUS.develop/source/src_io/read_pseudopot.cpp b/ABACUS.develop/source/src_io/read_pseudopot.cpp
index 99d3e4cbd1..a6a935abe9 100644
--- a/ABACUS.develop/source/src_io/read_pseudopot.cpp
+++ b/ABACUS.develop/source/src_io/read_pseudopot.cpp
@@ -39,6 +39,8 @@ void UnitCell_pseudo::read_pseudopot(const string &pp_dir)
 			pp_address = pp_dir + this->pseudo_fn[i];
 			//error = upf.read_pseudo_upf( pp_address ); xiaohui modify 2013-06-23
 			error = upf.init_pseudo_reader( pp_address ); //xiaohui add 2013-06-23
+			if(this->atoms[i].flag_empty_element)					// Peize Lin add for bsse 2021.04.07
+				upf.set_empty_element();			
 			//average pseudopotential if needed
 			error_ap = upf.average_p(); //added by zhengdy 2020-10-20
 		}
diff --git a/ABACUS.develop/source/src_pw/atom_pseudo.h b/ABACUS.develop/source/src_pw/atom_pseudo.h
index 54af751714..2536bae2e8 100644
--- a/ABACUS.develop/source/src_pw/atom_pseudo.h
+++ b/ABACUS.develop/source/src_pw/atom_pseudo.h
@@ -23,6 +23,7 @@ class Atom_pseudo : public pseudo_us
 	Vector3<int> *mbl; //If this atom can move
 	string pseudo_fn;// File name of pseudopotentia
 	double mass; // the mass of atom
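+	bool flag_empty_element;	// whether this is the empty element for bsse (NOTE(review): read_atom_species only ever sets it to true; confirm it is initialized to false).	Peize Lin add 2021.04.07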
 
 protected:
 
diff --git a/ABACUS.develop/source/src_pw/charge.cpp b/ABACUS.develop/source/src_pw/charge.cpp
index 9a93f688f3..6494728ae1 100644
--- a/ABACUS.develop/source/src_pw/charge.cpp
+++ b/ABACUS.develop/source/src_pw/charge.cpp
@@ -22,6 +22,7 @@
 #include "magnetism.h"
 #include "../src_parallel/parallel_grid.h"
 #include "../src_global/math_integral.h"
+#include <vector>
 
 Charge::Charge()
 {
@@ -169,268 +170,257 @@ void Charge::renormalize_rho(void)
 // rho_at (read from pseudopotential files)
 // allocate work space (psic must already be allocated)
 //-------------------------------------------------------
-void Charge::atomic_rho(const int spin_number_need, double** rho_in)const
+void Charge::atomic_rho(const int spin_number_need, double** rho_in)const		// Peize Lin refactor 2021.04.08
 {
     TITLE("Charge","atomic_rho");
     timer::tick("Charge","atomic_rho");
 
-	assert(ucell.meshx>0);
-    double *rho1d = new double[ucell.meshx];
-    
-	// one dimension of charge in G space.
-	double *rho_lgl= new double[ pw.nggm ];
-    ZEROS(rho1d, ucell.meshx);
-    ZEROS(rho_lgl, pw.nggm);
-
-	// use interpolation to get three dimension charge density.
-    ComplexMatrix rho_g3d( spin_number_need, pw.ngmc);
-
-
-	// check the start magnetization
-	int startmag_type = 1;
-	for(int it=0; it<ucell.ntype; it++)
+	const ComplexMatrix rho_g3d = [&]()->ComplexMatrix
 	{
-		for(int ia=0; ia<ucell.atoms[it].na; ia++)
+		// use interpolation to get three dimension charge density.
+		ComplexMatrix rho_g3d( spin_number_need, pw.ngmc);
+		
+		// check the start magnetization
+		const int startmag_type = [&]()->int
 		{
-			if(ucell.atoms[it].mag[ia]!=0.0)
-			{
-				startmag_type = 2;
-				break;
-			}
-		}
-	}
-
-	if(NSPIN==4) 
-	{
-		startmag_type = 1;//zhengdy-soc, type 2 is still wrong.
-	}
-	OUT(ofs_warning,"startmag_type",startmag_type);
-
-
-    for (int it = 0;it < ucell.ntype;it++)
-    {
-		Atom* atom = &ucell.atoms[it];
-
-		// mesh point of this element.
-        const int mesh = atom->msh;
-
-        //----------------------------------------------------------
-        // Here we check the electron number 
-        //----------------------------------------------------------
-		double* rhoatm = new double[mesh];
-		for(int ir=0; ir<mesh; ++ir)
+			if(NSPIN==4)		//zhengdy-soc, type 2 is still wrong.
+				return 1;
+			for(int it=0; it<ucell.ntype; it++)
+				for(int ia=0; ia<ucell.atoms[it].na; ia++)
+					if(ucell.atoms[it].mag[ia]!=0.0)
+						return 2;
+			return 1;
+		}();
+		OUT(ofs_warning,"startmag_type",startmag_type);
+
+		for (int it = 0;it < ucell.ntype;it++)
 		{
-			double r2=atom->r[ir]*atom->r[ir];
-			rhoatm[ir]=atom->rho_at[ir]/FOUR_PI/r2;
-		}
-		rhoatm[0] = pow( (rhoatm[2]/rhoatm[1]), 1./(atom->r[2]-atom->r[1]) );//zws add
-		rhoatm[0] = pow(rhoatm[0], atom->r[1]);
-		rhoatm[0] = rhoatm[1] / rhoatm[0];  
-
-		double charge = 0.0;
-		Integral::Simpson_Integral(atom->msh,atom->rho_at,atom->rab,charge);
+			const Atom* const atom = &ucell.atoms[it];
 
-		OUT(ofs_warning,"charge from rho_at",charge);
-		assert(charge!=0.0);
-		double scale=1.0;
+			if(!atom->flag_empty_element)		// Peize Lin add for bsse 2021.04.07
+			{		
+				const std::vector<double> rho_lgl = [&]()->std::vector<double>
+				{
+					// one dimension of charge in G space.
+					std::vector<double> rho_lgl(pw.nggm,0);
 
-		if(charge!=atom->zv)
-		{
-			OUT(ofs_warning,"charge should be",atom->zv);
-			scale = atom->zv/charge;
-		}
+					// mesh point of this element.
+					const int mesh = atom->msh;
 
-		for(int ir=0; ir<mesh; ++ir)
-		{
-			rhoatm[ir] *= scale;
-			rhoatm[ir] *= (FOUR_PI*atom->r[ir]*atom->r[ir]);
-		}
+					//----------------------------------------------------------
+					// Here we check the electron number 
+					//----------------------------------------------------------
+					const std::vector<double> rhoatm = [&]()->std::vector<double>
+					{
+						std::vector<double> rhoatm(mesh);		
+						for(int ir=0; ir<mesh; ++ir)
+						{
+							double r2=atom->r[ir]*atom->r[ir];
+							rhoatm[ir]=atom->rho_at[ir]/FOUR_PI/r2;
+						}
+						rhoatm[0] = pow( (rhoatm[2]/rhoatm[1]), 1./(atom->r[2]-atom->r[1]) );//zws add
+						rhoatm[0] = pow(rhoatm[0], atom->r[1]);
+						rhoatm[0] = rhoatm[1] / rhoatm[0];
 
-        //----------------------------------------------------------
-        // Here we compute the G=0 term
-        //----------------------------------------------------------
-        if (pw.gstart == 1)
-        {
-            for (int ir = 0;ir < mesh;ir++)
-            {
-//              rho1d [ir] = atom->rho_at[ir];
-				rho1d[ir] = rhoatm[ir];
-            }
-            Integral::Simpson_Integral(mesh, rho1d, atom->rab , rho_lgl[0]);
-        }
+						double charge = 0.0;
+						Integral::Simpson_Integral(atom->msh,atom->rho_at,atom->rab,charge);
+						OUT(ofs_warning,"charge from rho_at",charge);
+						assert(charge!=0.0 || charge==atom->zv);		// Peize Lin add charge==atom->zv for bsse 2021.04.07
 
+						double scale=1.0;
+						if(charge!=atom->zv)
+						{
+							OUT(ofs_warning,"charge should be",atom->zv);
+							scale = atom->zv/charge;
+						}
 
-        if (test_charge>0) cout<<"\n |G|=0 term done." <<endl;
-        //----------------------------------------------------------
-        // Here we compute the G<>0 term
-        // But if in parallel case
-        // G=0 term only belong to 1 cpu.
-        // Other processors start from '0'
-        //----------------------------------------------------------
-        for (int ig = pw.gstart; ig < pw.nggm ;ig++)
-        {
-            const double gx = sqrt(pw.ggs [ig]) * ucell.tpiba;
-            for (int ir = 0; ir < mesh;ir++)
-            {
-                if ( atom->r[ir] < 1.0e-8 )
-                {
-                    rho1d[ir] = rhoatm[ir];
-                    //rho1d[ir] = atom->rho_at[ir];
-                }
-                else
-                {
-                    const double gxx = gx * atom->r[ir];
-                    rho1d[ir] = rhoatm[ir] * sin(gxx) / gxx;
-                    rho1d[ir] = rhoatm[ir] * sin(gxx) / gxx;
-                }
-            }
-            Integral::Simpson_Integral(mesh , rho1d, atom->rab , rho_lgl [ig]);
-        }
-		delete[] rhoatm;
-        
-		
-		if (test_charge>0) cout<<" |G|>0 term done." <<endl;
-        //----------------------------------------------------------
-        // EXPLAIN : Complete the transfer of rho from real space to
-        // reciprocal space
-        //----------------------------------------------------------
-        for (int ig=0; ig< pw.nggm ; ig++)
-        {
-            rho_lgl[ig] /= ucell.omega;
-        }
-        //----------------------------------------------------------
-        // EXPLAIN : compute the 3D atomic charge in reciprocal space
-        //----------------------------------------------------------
-        if(spin_number_need==1)
-        {
-            for (int ig=0; ig< pw.ngmc ;ig++)
-            {
-                rho_g3d(0, ig) += pw.strucFac(it, ig) * rho_lgl[ pw.ig2ngg[ig] ];
-            }
-		}
-		// mohan add 2011-06-14, initialize the charge density according to each atom 
-		else if(spin_number_need==2)
-		{
-			if(startmag_type==1)
-			{
-				for (int ig = 0; ig < pw.ngmc ; ig++)
-				{
-					const complex<double> swap = pw.strucFac(it, ig)* rho_lgl[pw.ig2ngg[ig]];
-					//rho_g3d(0, ig) += swap * mag.nelup_percent(it);
-					//rho_g3d(1, ig) += swap * mag.neldw_percent(it);
-					const double up = 0.5 * ( 1 + mag.start_magnetization[it] / atom->zv );
-					const double dw = 0.5 * ( 1 - mag.start_magnetization[it] / atom->zv );
-					rho_g3d(0, ig) += swap * up;
-					rho_g3d(1, ig) += swap * dw;
-				}
-			}
-			// mohan add 2011-06-14
-			else if(startmag_type==2)
-			{
-				complex<double> swap = ZERO;
-				complex<double> ci_tpi = NEG_IMAG_UNIT * TWO_PI;
-				for (int ia = 0; ia < atom->na; ia++)
+						for(int ir=0; ir<mesh; ++ir)
+						{
+							rhoatm[ir] *= scale;
+							rhoatm[ir] *= (FOUR_PI*atom->r[ir]*atom->r[ir]);
+						}
+						return rhoatm;
+					}();
+
+					assert(ucell.meshx>0);
+					vector<double> rho1d(ucell.meshx);
+					//----------------------------------------------------------
+					// Here we compute the G=0 term
+					//----------------------------------------------------------
+					if (pw.gstart == 1)
+					{
+						for (int ir = 0;ir < mesh;ir++)
+						{
+			//              rho1d [ir] = atom->rho_at[ir];
+							rho1d[ir] = rhoatm[ir];
+						}
+						Integral::Simpson_Integral(mesh, rho1d.data(), atom->rab, rho_lgl[0]);
+					}
+					if (test_charge>0) cout<<"\n |G|=0 term done." <<endl;
+					//----------------------------------------------------------
+					// Here we compute the G<>0 terms.
+					// In the parallel case the G=0 term
+					// belongs to only one processor (pw.gstart==1);
+					// all other processors start from index 0.
+					//----------------------------------------------------------
+					for (int ig = pw.gstart; ig < pw.nggm ;ig++)
+					{
+						const double gx = sqrt(pw.ggs [ig]) * ucell.tpiba;
+						for (int ir = 0; ir < mesh;ir++)
+						{
+							if ( atom->r[ir] < 1.0e-8 )
+							{
+								rho1d[ir] = rhoatm[ir];
+								//rho1d[ir] = atom->rho_at[ir];
+							}
+							else
+							{
+								const double gxx = gx * atom->r[ir];
+								rho1d[ir] = rhoatm[ir] * sin(gxx) / gxx;
+								rho1d[ir] = rhoatm[ir] * sin(gxx) / gxx;
+							}
+						}
+						Integral::Simpson_Integral(mesh, rho1d.data(), atom->rab, rho_lgl[ig]);
+					}
+					
+					if (test_charge>0) cout<<" |G|>0 term done." <<endl;
+					//----------------------------------------------------------
+					// EXPLAIN : Complete the transfer of rho from real space to
+					// reciprocal space
+					//----------------------------------------------------------
+					for (int ig=0; ig< pw.nggm ; ig++)
+						rho_lgl[ig] /= ucell.omega;
+					return rho_lgl;
+				}();
+				//----------------------------------------------------------
+				// EXPLAIN : compute the 3D atomic charge in reciprocal space
+				//----------------------------------------------------------
+				if(spin_number_need==1)
 				{
-					//const double up = 0.5 * ( 1 + atom->mag[ia] );
-					//const double dw = 0.5 * ( 1 - atom->mag[ia] );
-					const double up = 0.5 * ( 1 + atom->mag[ia] / atom->zv );
-					const double dw = 0.5 * ( 1 - atom->mag[ia] / atom->zv );
-					//cout << " atom " << ia << " up=" << up << " dw=" << dw << endl;
-
-					for (int ig = 0; ig < pw.ngmc ; ig++)
+					for (int ig=0; ig< pw.ngmc ;ig++)
 					{
-						const double Gtau = 
-							pw.gcar[ig].x * atom->tau[ia].x
-							+ pw.gcar[ig].y * atom->tau[ia].y
-							+ pw.gcar[ig].z * atom->tau[ia].z; 
-
-						swap = exp(ci_tpi * Gtau) * rho_lgl[pw.ig2ngg[ig]];
-
-						rho_g3d(0, ig) += swap * up;
-						rho_g3d(1, ig) += swap * dw;
+						rho_g3d(0, ig) += pw.strucFac(it, ig) * rho_lgl[ pw.ig2ngg[ig] ];
 					}
 				}
-			}
-		}
-		else if(spin_number_need==4)
-		{
-			//noncolinear case
-			if(startmag_type == 1)
-			{
-				for (int ig = 0; ig < pw.ngmc ; ig++)
+				// mohan add 2011-06-14, initialize the charge density according to each atom 
+				else if(spin_number_need==2)
 				{
-					const complex<double> swap = pw.strucFac(it, ig)* rho_lgl[pw.ig2ngg[ig]];
-					rho_g3d(0, ig) += swap ;
-					if(DOMAG)
+					if(startmag_type==1)
 					{
-						rho_g3d(1, ig) += swap * (mag.start_magnetization[it] / atom->zv) 
-						* sin(soc.angle1[it]) * cos(soc.angle2[it]);
-						rho_g3d(2, ig) += swap * (mag.start_magnetization[it] / atom->zv) 
-						* sin(soc.angle1[it]) * sin(soc.angle2[it]);
-						rho_g3d(3, ig) += swap * (mag.start_magnetization[it] / atom->zv) 
-						* cos(soc.angle1[it]);
+						for (int ig = 0; ig < pw.ngmc ; ig++)
+						{
+							const complex<double> swap = pw.strucFac(it, ig)* rho_lgl[pw.ig2ngg[ig]];
+							//rho_g3d(0, ig) += swap * mag.nelup_percent(it);
+							//rho_g3d(1, ig) += swap * mag.neldw_percent(it);
+							const double up = 0.5 * ( 1 + mag.start_magnetization[it] / atom->zv );
+							const double dw = 0.5 * ( 1 - mag.start_magnetization[it] / atom->zv );
+							rho_g3d(0, ig) += swap * up;
+							rho_g3d(1, ig) += swap * dw;
+						}
 					}
-					else if(DOMAG_Z)
+					// mohan add 2011-06-14
+					else if(startmag_type==2)
 					{
-						//rho_g3d(3, ig) += swap * mag.start_magnetization[it];
-						rho_g3d(3, ig) += swap * (mag.start_magnetization[it] / atom->zv);
+						complex<double> swap = ZERO;
+						complex<double> ci_tpi = NEG_IMAG_UNIT * TWO_PI;
+						for (int ia = 0; ia < atom->na; ia++)
+						{
+							//const double up = 0.5 * ( 1 + atom->mag[ia] );
+							//const double dw = 0.5 * ( 1 - atom->mag[ia] );
+							const double up = 0.5 * ( 1 + atom->mag[ia] / atom->zv );
+							const double dw = 0.5 * ( 1 - atom->mag[ia] / atom->zv );
+							//cout << " atom " << ia << " up=" << up << " dw=" << dw << endl;
+
+							for (int ig = 0; ig < pw.ngmc ; ig++)
+							{
+								const double Gtau = 
+									pw.gcar[ig].x * atom->tau[ia].x
+									+ pw.gcar[ig].y * atom->tau[ia].y
+									+ pw.gcar[ig].z * atom->tau[ia].z; 
+
+								swap = exp(ci_tpi * Gtau) * rho_lgl[pw.ig2ngg[ig]];
+
+								rho_g3d(0, ig) += swap * up;
+								rho_g3d(1, ig) += swap * dw;
+							}
+						}
 					}
 				}
-			}
-			else if(startmag_type == 2)
-			{//zdy-warning-not-available
-				complex<double> swap = ZERO;
-				complex<double> ci_tpi = NEG_IMAG_UNIT * TWO_PI;
-				for(int ia = 0;ia<atom->na;ia++)
+				else if(spin_number_need==4)
 				{
-					for (int ig = 0; ig < pw.ngmc ; ig++)
+					//noncollinear case
+					if(startmag_type == 1)
 					{
-						const double Gtau =
-							pw.gcar[ig].x * atom->tau[ia].x
-							+ pw.gcar[ig].y * atom->tau[ia].y
-							+ pw.gcar[ig].z * atom->tau[ia].z;
-
-						swap = exp(ci_tpi * Gtau) * rho_lgl[pw.ig2ngg[ig]];
-
-						rho_g3d(0, ig) += swap;
-						if(DOMAG)
+						for (int ig = 0; ig < pw.ngmc ; ig++)
 						{
-							rho_g3d(1, ig) += swap * (atom->mag[ia] / atom->zv) 
+							const complex<double> swap = pw.strucFac(it, ig)* rho_lgl[pw.ig2ngg[ig]];
+							rho_g3d(0, ig) += swap ;
+							if(DOMAG)
+							{
+								rho_g3d(1, ig) += swap * (mag.start_magnetization[it] / atom->zv) 
 								* sin(soc.angle1[it]) * cos(soc.angle2[it]);
-							rho_g3d(2, ig) += swap * (atom->mag[ia] / atom->zv) 
+								rho_g3d(2, ig) += swap * (mag.start_magnetization[it] / atom->zv) 
 								* sin(soc.angle1[it]) * sin(soc.angle2[it]);
-							rho_g3d(3, ig) += swap * (atom->mag[ia] / atom->zv) 
+								rho_g3d(3, ig) += swap * (mag.start_magnetization[it] / atom->zv) 
 								* cos(soc.angle1[it]);
+							}
+							else if(DOMAG_Z)
+							{
+								//rho_g3d(3, ig) += swap * mag.start_magnetization[it];
+								rho_g3d(3, ig) += swap * (mag.start_magnetization[it] / atom->zv);
+							}
 						}
-						else if(DOMAG_Z)
+					}
+					else if(startmag_type == 2)
+					{//zdy-warning-not-available
+						complex<double> swap = ZERO;
+						complex<double> ci_tpi = NEG_IMAG_UNIT * TWO_PI;
+						for(int ia = 0;ia<atom->na;ia++)
 						{
-							rho_g3d(3, ig) += swap * (atom->mag[ia] / atom->zv);
+							for (int ig = 0; ig < pw.ngmc ; ig++)
+							{
+								const double Gtau =
+									pw.gcar[ig].x * atom->tau[ia].x
+									+ pw.gcar[ig].y * atom->tau[ia].y
+									+ pw.gcar[ig].z * atom->tau[ia].z;
+
+								swap = exp(ci_tpi * Gtau) * rho_lgl[pw.ig2ngg[ig]];
+
+								rho_g3d(0, ig) += swap;
+								if(DOMAG)
+								{
+									rho_g3d(1, ig) += swap * (atom->mag[ia] / atom->zv) 
+										* sin(soc.angle1[it]) * cos(soc.angle2[it]);
+									rho_g3d(2, ig) += swap * (atom->mag[ia] / atom->zv) 
+										* sin(soc.angle1[it]) * sin(soc.angle2[it]);
+									rho_g3d(3, ig) += swap * (atom->mag[ia] / atom->zv) 
+										* cos(soc.angle1[it]);
+								}
+								else if(DOMAG_Z)
+								{
+									rho_g3d(3, ig) += swap * (atom->mag[ia] / atom->zv);
+								}
+							}
 						}
 					}
 				}
+				else
+				{
+					WARNING_QUIT("Charge::spin_number_need"," Either 1 or 2 or 4, check SPIN number !");
+				}
 			}
 		}
-		else
-		{
-			WARNING_QUIT("Charge::spin_number_need"," Either 1 or 2 or 4, check SPIN number !");
-		}
-	}
-
-    delete [] rho_lgl;
-    delete [] rho1d;;
-
+		return rho_g3d;
+	}();
 
 	assert( spin_number_need > 0 );
-	double* ne = new double[spin_number_need];
-	ZEROS( ne, spin_number_need);
+	vector<double> ne(spin_number_need);
     for (int is = 0; is < spin_number_need;is++)
     {
         UFFT.ToRealSpace( is, rho_g3d, rho_in[is]);
 
 		for(int ir=0; ir<pw.nrxx; ++ir)
-		{
 			ne[is] += rho_in[is][ir];
-		}
 		ne[is] *= ucell.omega/(double)pw.ncxyz; 
 		Parallel_Reduce::reduce_double_pool( ne[is] );
 
@@ -488,12 +478,8 @@ void Charge::atomic_rho(const int spin_number_need, double** rho_in)const
 	OUT(ofs_warning,"total electron number from rho",ne_tot);
 	OUT(ofs_warning,"should be",ucell.nelec);
 	for(int is=0; is<spin_number_need; ++is)
-	{
 		for(int ir=0; ir<pw.nrxx; ++ir)
-		{
 			rho_in[is][ir] = rho_in[is][ir] / ne_tot * ucell.nelec;
-		}
-	}
 
 	// if TWO_EFEMI, 
 	// the total magnetism will affect the calculation of
@@ -502,7 +488,6 @@ void Charge::atomic_rho(const int spin_number_need, double** rho_in)const
 
 	//ofs_running << " Superposition of atomic wave function as First-Charge done." << endl;
 	//2014-06-22
-	delete[] ne;
 
     timer::tick("Charge","atomic_rho");
     return;
diff --git a/ABACUS.develop/source/src_pw/pseudopot_upf.cpp b/ABACUS.develop/source/src_pw/pseudopot_upf.cpp
index 2b6a76f8fa..e503e833b7 100644
--- a/ABACUS.develop/source/src_pw/pseudopot_upf.cpp
+++ b/ABACUS.develop/source/src_pw/pseudopot_upf.cpp
@@ -2296,3 +2296,16 @@ int Pseudopot_upf::average_p()
 	this->has_so = 0;	
 	return error;
 }
+
+// Peize Lin add for bsse 2021.04.07
+void Pseudopot_upf::set_empty_element()
+{
+	this->zp = 0;
+	for(int ir=0; ir<mesh; ++ir)
+		this->vloc[ir] = 0;
+	for(int i=0; i<nbeta; ++i)
+		for(int j=0; j<nbeta; ++j)
+			this->dion(i,j) = 0;
+	for(int ir=0; ir<mesh; ++ir)
+		this->rho_at[ir] = 0;
+}
\ No newline at end of file
diff --git a/ABACUS.develop/source/src_pw/pseudopot_upf.h b/ABACUS.develop/source/src_pw/pseudopot_upf.h
index 2d24d4f749..23c41007ef 100644
--- a/ABACUS.develop/source/src_pw/pseudopot_upf.h
+++ b/ABACUS.develop/source/src_pw/pseudopot_upf.h
@@ -89,6 +89,7 @@ class Pseudopot_upf
 
 	bool functional_error;//xiaohui add 2015-03-24
 	int average_p(); //zhengdy add 2020-10-20
+	void set_empty_element();		// Peize Lin add for bsse 2021.04.07
 
 private:
 

From f312d8f7f1e33b00b40f5e9240c0dee86334777e Mon Sep 17 00:00:00 2001
From: zdy <dyzheng@mail.ustc.edu.cn>
Date: Fri, 9 Apr 2021 21:22:24 +0800
Subject: [PATCH 44/60] fixed nspin=2 bug

---
 ABACUS.develop/source/input_conv.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ABACUS.develop/source/input_conv.cpp b/ABACUS.develop/source/input_conv.cpp
index f56cef2abd..d6f82662d5 100644
--- a/ABACUS.develop/source/input_conv.cpp
+++ b/ABACUS.develop/source/input_conv.cpp
@@ -397,6 +397,8 @@ void Input_Conv::Convert(void)
 #endif
 	}
 	else{
+		delete[] soc.m_loc;
+		soc.m_loc = new Vector3<double> [INPUT.ntype];
 		LSPINORB = false;
 		NONCOLIN = false;
 		DOMAG = false;

From 90cf8aba494df11b7ef4f90a948b9ed7ab31cafe Mon Sep 17 00:00:00 2001
From: maki49 <1579492865@qq.com>
Date: Sat, 10 Apr 2021 22:05:31 +0800
Subject: [PATCH 45/60] fix bugs in lcao-line descriptor

---
 .../source/src_lcao/LCAO_descriptor.cpp       |  231 +-
 .../source/src_lcao/LCAO_descriptor.h         |    8 +-
 ABACUS.develop/source/src_lcao/LOOP_ions.cpp  |    2 +-
 .../source/src_lcao/ORB_gen_tables.cpp        | 2020 +++++++++--------
 .../source/src_lcao/ORB_table_alpha.cpp       |  927 ++++----
 .../source/src_lcao/ORB_table_alpha.h         |  138 +-
 6 files changed, 1643 insertions(+), 1683 deletions(-)

diff --git a/ABACUS.develop/source/src_lcao/LCAO_descriptor.cpp b/ABACUS.develop/source/src_lcao/LCAO_descriptor.cpp
index 2bbdc0495a..f1a7538c2c 100644
--- a/ABACUS.develop/source/src_lcao/LCAO_descriptor.cpp
+++ b/ABACUS.develop/source/src_lcao/LCAO_descriptor.cpp
@@ -1,3 +1,4 @@
+//caoyu add 2021-03-29
 #include "LCAO_descriptor.h"
 #include "LCAO_matrix.h"
 #include "../src_global/lapack_connector.h"
@@ -7,8 +8,6 @@
 #include "../src_pw/global.h"
 #include "../src_io/winput.h"
 
-//caoyu add 2021-03-29
-
 LCAO_Descriptor::LCAO_Descriptor()
 {
     S_mu_alpha = new double[1];
@@ -30,21 +29,15 @@ void LCAO_Descriptor::build_S_descriptor(const bool &calc_deri)
 
     // =======init==============
     // cal n(descriptor) per atom , related to Lmax, nchi(L) and m. (not total_nchi!)
-
-	this->des_per_atom=0;
     for (int l = 0; l <= ORB.get_lmax_d(); l++)
     {
         this->des_per_atom += ORB.Alpha[0].getNchi(l) * (2 * l + 1);
     }
-
-	// total number of descriptors
     this->n_descriptor = ucell.nat * this->des_per_atom;
-
-	// size of the full density matrix (DM)
-    const long DMsize = this->n_descriptor * this->n_descriptor;
-
+    const long Ssize = this->n_descriptor * NLOCAL;
     delete[] S_mu_alpha;
-    S_mu_alpha = new double[DMsize];
+    S_mu_alpha = new double[Ssize];
+    ZEROS(S_mu_alpha, Ssize);
 
     this->init_mu_index();
     // =======init==============
@@ -55,7 +48,6 @@ void LCAO_Descriptor::build_S_descriptor(const bool &calc_deri)
     //\sum{T} e**{ikT} <\phi_{ia}|d\phi_{k\beta}(T)>	//???
     Vector3<double> tau1, tau2, dtau;
     Vector3<double> dtau1, dtau2, tau0;
-
     for (int T1 = 0; T1 < ucell.ntype; ++T1)
     {
         Atom *atom1 = &ucell.atoms[T1];
@@ -64,20 +56,25 @@ void LCAO_Descriptor::build_S_descriptor(const bool &calc_deri)
             tau1 = atom1->tau[I1];
             //GridD.Find_atom(tau1);
             GridD.Find_atom(tau1, T1, I1);
+            int *T2arr = new int[GridD.getAdjacentNum() + 1];
+            int *I2arr = new int[GridD.getAdjacentNum() + 1];
+            for (int ad = 0; ad < GridD.getAdjacentNum() + 1; ++ad)
+            {
+                T2arr[ad] = GridD.getType(ad);
+                I2arr[ad] = GridD.getNatom(ad);
+            }
             for (int ad = 0; ad < GridD.getAdjacentNum() + 1; ++ad)
             {
-                const int T2 = GridD.getType(ad);
-                const int I2 = GridD.getNatom(ad);
-                Atom *atom2 = &ucell.atoms[T2];
+                //const int T2 = GridD.getType(ad);
+                //const int I2 = GridD.getNatom(ad);
+                Atom *atom2 = &ucell.atoms[T2arr[ad]];
                 tau2 = GridD.getAdjacentTau(ad);
                 dtau = tau2 - tau1;
                 double distance = dtau.norm() * ucell.lat0;
-				// rcut is subject to ORB.Phi to keep dimension of S_mu_alpha same as Sloc
-                double rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut();
+                double rcut = ORB.Phi[T1].getRcut() + ORB.Alpha[0].getRcut(); //Rcut is subject to ORB.Phi to keep dimension of S_mu_alpha same as Sloc
                 if (distance < rcut)
                 {
-					// iw1_all = combined index (it, ia, iw)
-                    int iw1_all = ucell.itiaiw2iwt(T1, I1, 0);
+                    int iw1_all = ucell.itiaiw2iwt(T1, I1, 0); //iw1_all = combined index (it, ia, iw)
 
                     for (int jj = 0; jj < atom1->nw * NPOL; ++jj)
                     {
@@ -89,53 +86,33 @@ void LCAO_Descriptor::build_S_descriptor(const bool &calc_deri)
                         //init iw2_all
                         int iw2_all = 0;
                         int iatom = 0;
-                        for (int it = 0; it < T2; it++)
+                        for (int it = 0; it < T2arr[ad]; it++)
                         {
                             for (int ia = 0; ia < ucell.atoms[it].na; ia++)
                             {
                                 iatom++; // cal how many atoms before ad in ucell
                             }
                         }
-                        iatom += I2;
-
+                        iatom += I2arr[ad];
                         iw2_all = iatom * this->des_per_atom;
 
-                        for (int L2 = 0; L2 < ORB.Alpha[0].getLmax(); ++L2)
+                        for (int L2 = 0; L2 <= ORB.Alpha[0].getLmax(); ++L2)
                         {
                             for (int N2 = 0; N2 < ORB.Alpha[0].getNchi(L2); ++N2)
                             {
                                 for (int m2 = 0; m2 < 2 * L2 + 1; ++m2)
                                 {
-                                    olm[0] = 0.0;
-									olm[1] = 0.0;
-									olm[2] = 0.0;
-
-                                    complex<double> olm1[4] = {ZERO, ZERO, ZERO, ZERO};
-                                    complex<double> *olm2 = &olm1[0];
+                                    olm[0] = olm[1] = olm[2] = 0.0;
                                     if (!calc_deri)
                                     {
                                         UOT.snap_psialpha(olm, 0, tau1,
                                                           T1, L1, m1, N1, GridD.getAdjacentTau(ad),
-                                                          T2, L2, m2, N2);
-
+                                                          T2arr[ad], L2, m2, N2);
                                         if (GAMMA_ONLY_LOCAL)
                                         {
                                             this->set_S_mu_alpha(iw1_all, iw2_all, olm[0]);
                                         }
                                     }
-                                    /*derivation will be needed in next step
-									else // calculate the derivative
-									{
-										UOT.snap_psipsi( olm, 1, dtype, 
-											tau1, T1, L1, m1, N1,
-											GridD.getAdjacentTau(ad), T2, L2, m2, N2
-											);
-
-										if(GAMMA_ONLY_LOCAL)
-										{
-										}
-									}
-									*/
                                     ++iw2_all;
                                 } //m2
                             }     //N2
@@ -144,15 +121,14 @@ void LCAO_Descriptor::build_S_descriptor(const bool &calc_deri)
                     } // nw1
                 }     // distance
             }         // ad
-        }             // I1
-    }                 // T1
-
+            delete[] T2arr;
+            delete[] I2arr;
+        } // I1
+    }     // T1
     if (!GAMMA_ONLY_LOCAL)
     {
-        WARNING_QUIT("LCAO_Descriptor::build_S_descriptor", 
-		"muti-kpoint method for descriptor is not implemented yet! ");
+        WARNING_QUIT("LCAO_Descriptor::build_S_descriptor", "muti-kpoint method for descriptor is not implemented yet! ");
     }
-
     return;
 }
 
@@ -173,109 +149,101 @@ void LCAO_Descriptor::set_S_mu_alpha(const int &iw1_all, const int &iw2_all, con
     {
         index = ir * this->n_descriptor + ic; //row: lcao orbitals; col: descriptor basis
     }
-
     this->S_mu_alpha[index] += v;
-
     return;
 }
 
-void LCAO_Descriptor::cal_projective_DM(void)
+void LCAO_Descriptor::cal_projected_DM()
 {
-    //step 1: cal inv of Sloc
-    double *sinv = new double[NLOCAL * NLOCAL]; //size :NLOCAL*NLOCAL
-    for (int i = 0; i < NLOCAL; i++)
+    //step 1: get dm: the coefficient of wfc, not charge density
+    double *dm = new double[NLOCAL * NLOCAL];
+    ZEROS(dm, NLOCAL * NLOCAL);
+    for (int i = 0; i < LOC.wfc_dm_2d.dm_gamma[0].nr; i++)
     {
-        for (int j = 0; j < NLOCAL; j++)
+        for (int j = 0; j < LOC.wfc_dm_2d.dm_gamma[0].nc; j++)
         {
-            sinv[i * NLOCAL + j] = LM.Sloc[i * ParaO.ncol + j];
+            dm[i * NLOCAL + j] = LOC.wfc_dm_2d.dm_gamma[0](i, j); //only consider default NSPIN = 1
         }
     }
-    /*
-    //++++++++++++++test++++++++++++++
-    for (int i = 0; i < NLOCAL; i++)
+/*
+    //===============test==============
+    cout << "test: out wfc_dm_2d.dm_gamma[0](i, j)" << endl;
+    for (int ir = 0; ir < NLOCAL; ir++)
     {
-        for (int j = 0; j < NLOCAL; j++)
+        for (int ic = 0; ic < NLOCAL; ic++)
         {
-            cout << sinv[i * NLOCAL + j] << " ";
+            cout << dm[ir * NLOCAL + ic] << " ";
         }
         cout << endl;
     }
-    //++++++++++++++test++++++++++++++
-    */
-    int info;
-    const char uplo = 'L';
-    dpotrf_(&uplo, &NLOCAL, sinv, &NLOCAL, &info);
-    dpotri_(&uplo, &NLOCAL, sinv, &NLOCAL, &info);
-    if (info != 0)
-    {
-        cout << "info = " << info << endl;
-        WARNING_QUIT("LCAO_Descriptor", "Something wrong in calculating inverse of Sloc!");
-    }
-
-    //step 2: get lcao density matrix as array
-    double *dm = new double[NLOCAL * NLOCAL]; //size :NLOCAL*NLOCAL
-    for (int i = 0; i < NLOCAL; i++)
+    //===============\test==============
+*/
+    //step 2: get SS_alpha_mu and SS_nu_beta
+    double *ss = this->S_mu_alpha; //SS_nu_beta
+/*
+    //===============test==============
+    cout << "test: out S_nu_beta" << endl;
+    for (int ir = 0; ir < NLOCAL; ir++)
     {
-        for (int j = 0; j < NLOCAL; j++)
+        for (int ic = 0; ic < this->n_descriptor; ic++)
         {
-            dm[i * NLOCAL + j] = LOC.DM[0][i][j]; //only consider default NSPIN = 1
+            cout << ss[ir * this->n_descriptor + ic] << " ";
         }
+        cout << endl;
     }
-    //step 3: get SS_alpha_mu and SS_nu_beta
-    double *ss = this->S_mu_alpha; //SS_nu_beta
-
-    //step 4 : multiply
-    //cal ssT*sinvT*DM*sinv*ss
+    //===============\test==============
+*/
+    //step 3 : multiply
+    //cal ssT*DM*ss
 
     const long tmp_PDM_size = NLOCAL * this->n_descriptor;
     double *tmp_PDM = new double[tmp_PDM_size];
+    ZEROS(tmp_PDM, tmp_PDM_size);
     const long PDM_size = this->n_descriptor * this->n_descriptor;
-
     delete[] this->PDM;
     this->PDM = new double[PDM_size];
+    ZEROS(this->PDM, PDM_size);
 
     const char t = 'T';  //transpose
     const char nt = 'N'; //non transpose
     const double alpha = 1;
     const double beta = 0;
-    double *a = sinv;
+    double *a = dm;
     double *b = ss;
     double *c = tmp_PDM;
-    dgemm_(&nt, &nt, &NLOCAL, &n_descriptor, &NLOCAL, &alpha, a, 
-	&NLOCAL, b, &NLOCAL, &beta, c, &NLOCAL); //S_nu_nu*SS_nu_beta
-    a = dm;
-    b = c;
-    c = this->PDM;
-    dgemm_(&nt, &nt, &NLOCAL, &n_descriptor, &NLOCAL, &alpha, a, 
-	&NLOCAL, b, &NLOCAL, &beta, c, &NLOCAL); //DM*S_nu_nu*SS_nu_beta
-    a = sinv;
-    b = c;
-    c = tmp_PDM;
-    dgemm_(&t, &nt, &NLOCAL, &n_descriptor, &NLOCAL, &alpha, a, 
-	&NLOCAL, b, &NLOCAL, &beta, c, &NLOCAL); //S_mu_mu*DM*S_nu_nu*SS_nu_beta
+    dgemm_(&nt, &nt, &NLOCAL, &n_descriptor, &NLOCAL, &alpha, a, &NLOCAL, b, &NLOCAL, &beta, c, &NLOCAL); //DM*SS_nu_beta
     a = ss;
     b = c;
     c = this->PDM;
-    dgemm_(&t, &nt, &n_descriptor, &n_descriptor, &NLOCAL, &alpha, a, 
-	&NLOCAL, b, &NLOCAL, &beta, c, &n_descriptor); //SS_alpha_mu*S_mu_mu*DM*S_nu_nu*SS_nu_beta
-
-    delete[] dm;
-    delete[] sinv;
+    dgemm_(&t, &nt, &n_descriptor, &n_descriptor, &NLOCAL, &alpha, a, &NLOCAL, b, &NLOCAL, &beta, c, &n_descriptor); //SS_alpha_mu*DM*SS_nu_beta
+/*
+    //===============test==============
+    cout << "test: out PDM" << endl;
+    for (int ir = 0; ir < n_descriptor; ir++)
+    {
+        for (int ic = 0; ic < n_descriptor; ic++)
+        {
+            cout << this->PDM[ir * n_descriptor + ic] << " ";
+        }
+        cout << endl;
+    }
+    //===============\test==============
+*/
     delete[] tmp_PDM;
+    delete[] dm;
     return;
 }
 
-void LCAO_Descriptor::cal_descriptor(void)
+void LCAO_Descriptor::cal_descriptor()
 {
     delete[] d;
     d = new double[this->n_descriptor];
-
     //==========print preparation=============
     ofs_running << " print out each DM_Inl" << endl;
     ofstream ofs;
     stringstream ss;
     ss << winput::spillage_outdir << "/"
-       << "projective_DM.dat";
+       << "projected_DM.dat";
     if (MY_RANK == 0)
     {
         ofs.open(ss.str().c_str());
@@ -287,9 +255,7 @@ void LCAO_Descriptor::cal_descriptor(void)
     {
         for (int ia = 0; ia < ucell.atoms[it].na; ia++)
         {
-            ofs << ucell.atoms[it].label << " atom_index " << ia + 1 
-			<< " n_descriptor " << this->des_per_atom << endl;
-
+            ofs << ucell.atoms[it].label << " atom_index " << ia + 1 << " n_descriptor " << this->des_per_atom << endl;
             for (int l = 0; l <= lmax; l++)
             {
                 int nmax = ORB.Alpha[0].getNchi(l);
@@ -306,13 +272,14 @@ void LCAO_Descriptor::cal_descriptor(void)
                             const int jj = mu_index[it](ia, l, n, m2);
 
                             long index = ii * this->n_descriptor + jj;
-                            des(m, m2) = this->PDM[index];
+                            assert(index >= 0);
+                            assert(index < this->n_descriptor * this->n_descriptor);
+                            complex<double> tmp(this->PDM[index], 0);
+                            des(m, m2) += tmp;
                         }
-                        //					ofs_running << setw(15) << des(m,m2);
                     }
-                    //			ofs_running << endl;
 
-                    this->print_projective_DM(ofs, des, it, ia, l, n);
+                    this->print_projected_DM(ofs, des, it, ia, l, n);
 
                     //ofs_running << "dimension of des is " << 2 * l + 1 << endl;
                     if (l == 0)
@@ -344,22 +311,14 @@ void LCAO_Descriptor::cal_descriptor(void)
                         delete[] rwork;
                         delete[] work;
                     }
-                }
-
-            } //l
-        }     //ia
-    }         //it
-
-    if (ofs)
-	{
-        ofs.close();
-	}
-
+                } //n
+            }     //l
+        }         //ia
+    }             //it
     this->print_descriptor();
     return;
 }
 
-
 void LCAO_Descriptor::init_mu_index(void)
 {
     ofs_running << " Initialize the mu index for deepks (lcao line)" << endl;
@@ -407,14 +366,7 @@ void LCAO_Descriptor::init_mu_index(void)
     return;
 }
 
-
-void LCAO_Descriptor::print_projective_DM(
-	ofstream &ofs, 
-	ComplexMatrix &des, 
-	const int &it, 
-	const int &ia, 
-	const int &l, 
-	const int &n)
+void LCAO_Descriptor::print_projected_DM(ofstream &ofs, ComplexMatrix &des, const int &it, const int &ia, const int &l, const int &n)
 {
     ofs << "L=" << l << "   N=" << n << endl;
     for (int i = 0; i < 2 * l + 1; i++)
@@ -427,9 +379,7 @@ void LCAO_Descriptor::print_projective_DM(
     }
     return;
 }
-
-
-void LCAO_Descriptor::print_descriptor(void)
+void LCAO_Descriptor::print_descriptor()
 {
     TITLE("LCAO_Descriptor", "print_descriptor");
     ofstream ofs;
@@ -441,13 +391,11 @@ void LCAO_Descriptor::print_descriptor(void)
     {
         ofs.open(ss.str().c_str());
     }
-
     for (int it = 0; it < ucell.ntype; it++)
     {
         for (int ia = 0; ia < ucell.atoms[it].na; ia++)
         {
-            ofs << ucell.atoms[it].label << " atom_index " 
-			<< ia + 1 << " n_descriptor " << this->des_per_atom << endl;
+            ofs << ucell.atoms[it].label << " atom_index " << ia + 1 << " n_descriptor " << this->des_per_atom << endl;
             int id0 = this->mu_index[it](ia, 0, 0, 0);
             for (int id = id0; id < id0 + this->des_per_atom; ++id)
             {
@@ -455,9 +403,8 @@ void LCAO_Descriptor::print_descriptor(void)
                     ofs << endl;
                 ofs << d[id] << " ";
             }
-            ofs << endl;
+            ofs << endl << endl;
         }
-        ofs << endl;
     }
     ofs_running << "descriptors are printed" << endl;
     return;
diff --git a/ABACUS.develop/source/src_lcao/LCAO_descriptor.h b/ABACUS.develop/source/src_lcao/LCAO_descriptor.h
index 7ece689fd4..c92c1b67d1 100644
--- a/ABACUS.develop/source/src_lcao/LCAO_descriptor.h
+++ b/ABACUS.develop/source/src_lcao/LCAO_descriptor.h
@@ -12,11 +12,11 @@ class LCAO_Descriptor
     LCAO_Descriptor();
     ~LCAO_Descriptor();
 
-	// cal S_alpha_mu：overlap between lcao basis Phi and descriptor basis Al
+	// cal S_alpha_mu: overlap between lcao basis Phi and descriptor basis Al
     void build_S_descriptor(const bool &calc_deri); 
 
 	// cal PDM: S_alpha_mu * inv(Sloc) * DM * inv(Sloc) * S_nu_beta
-    void cal_projective_DM(void);
+    void cal_projected_DM(void);
 
 	// cal d: EIGENVALUE of PDM in block of I_n_l
     void cal_descriptor(void);
@@ -27,7 +27,7 @@ class LCAO_Descriptor
 	// overlap between lcao and descriptor basis
     double *S_mu_alpha;
 
-	// projective density matrix
+	// projected density matrix
     double *PDM;
 
 	// descriptors
@@ -47,7 +47,7 @@ class LCAO_Descriptor
 		const int &iw2_all, 
 		const double &v);
 
-    void print_projective_DM(
+    void print_projected_DM(
 		ofstream &ofs, 
 		ComplexMatrix &des, 
 		const int &it, 
diff --git a/ABACUS.develop/source/src_lcao/LOOP_ions.cpp b/ABACUS.develop/source/src_lcao/LOOP_ions.cpp
index 1dd1cbfc1c..779652d5cf 100644
--- a/ABACUS.develop/source/src_lcao/LOOP_ions.cpp
+++ b/ABACUS.develop/source/src_lcao/LOOP_ions.cpp
@@ -137,7 +137,7 @@ void LOOP_ions::opt_ions(void)
         {
             LCAO_Descriptor ld;
             ld.build_S_descriptor(0);  //derivation not needed yet
-            ld.cal_projective_DM();
+            ld.cal_projected_DM();
             ld.cal_descriptor();
         }
 
diff --git a/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp b/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp
index 13afa1b0a3..51e3e0ed45 100644
--- a/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp
@@ -1,1001 +1,1019 @@
-#include "src_pw/global.h"
-#include "ORB_read.h"
-#include "ORB_gen_tables.h"
-#include "src_global/ylm.h"
-
-// here is a member of ORB_gen_tables class
-ORB_gen_tables UOT;
-
-ORB_gen_tables::ORB_gen_tables(){}
-ORB_gen_tables::~ORB_gen_tables(){}
-
-// call in hamilt_linear::init_before_ions.
-void ORB_gen_tables::gen_tables( 
-	const int &job0, 
-	LCAO_Orbitals &orb, 
-	const int &Lmax_exx)
-{
-	TITLE("ORB_gen_tables","gen_tables");
-	timer::tick("ORB_gen_tables","gen_tables",'C');
-
-	ofs_running << "\n SETUP THE TWO-CENTER INTEGRATION TABLES" << endl;
-	
-	//=========================================
-	// (1) MOT: make overlap table.
-	//=========================================
-	MOT.allocate(
-		orb.get_ntype(),// number of atom types
-        orb.get_lmax(),// max L used to calculate overlap
-        orb.get_kmesh(), // kpoints, for integration in k space
-        orb.get_Rmax(),// max value of radial table
-        orb.get_dR(),// delta R, for making radial table
-        orb.get_dk() ); // delta k, for integration in k space
-
-	tbeta.allocate(
-		orb.get_ntype(),// number of atom types
-        orb.get_lmax(),// max L used to calculate overlap
-        orb.get_kmesh(), // kpoints, for integration in k space
-        orb.get_Rmax(),// max value of radial table
-        orb.get_dR(),// delta R, for making radial table
-        orb.get_dk() ); // delta k, for integration in k space
-
-	//caoyu add 2021-03-18
-	if (INPUT.out_descriptor && BASIS_TYPE == "lcao") {
-		talpha.allocate(
-			orb.get_ntype(),// number of atom types
-			orb.get_lmax(),// max L used to calculate overlap
-			orb.get_kmesh(), // kpoints, for integration in k space
-			orb.get_Rmax(),// max value of radial table
-			orb.get_dR(),// delta R, for making radial table
-			orb.get_dk()); // delta k, for integration in k space
-	}
-
-	// OV: overlap
-	MOT.init_OV_Tpair(orb);
-	MOT.init_OV_Opair(orb);
-
-	// NL: nonlocal
-	tbeta.init_NL_Tpair();
-	tbeta.init_NL_Opair(orb); // add 2009-5-8
-
-	//caoyu add 2021-03-18
-	// DS: Descriptor
-	if (INPUT.out_descriptor && BASIS_TYPE == "lcao") {
-		talpha.init_DS_Opair();
-		talpha.init_DS_2Lplus1();
-	}
-
-	//=========================================
-	// (2) init Ylm Coef
-	//=========================================
-	//liaochen add 2010/4/29
-	Ylm::set_coefficients ();
-
-	// PLEASE add explanations for all options of 'orb_num' and 'mode'
-	// mohan add 2021-04-03
-	// Peize Lin update 2016-01-26
-	int orb_num=2; //
-	int mode=1; // 1: <phi|phi> and <phi|beta>
-	int Lmax_used=0;
-	int Lmax=0;
-
-	MOT.init_Table_Spherical_Bessel (orb_num, mode, Lmax_used, Lmax, Lmax_exx);
-	
-	//calculate S(R) for interpolation
-	MOT.init_Table(job0, orb);
-	tbeta.init_Table_Beta( MOT.pSB );// add 2009-5-8
-
-	//caoyu add 2021-03-18
-	if (INPUT.out_descriptor && BASIS_TYPE == "lcao") 
-	{
-		talpha.init_Table_Alpha(MOT.pSB);
-		talpha.print_Table_DSR();	
-	}
-
-	//=========================================
-	// (3) make Gaunt coefficients table
-	//=========================================
-
-	const int lmax = (Lmax_used-1) / 2 ;
-	//MGT.init_Ylm_Gaunt(orb.get_lmax()+1, 0.0,PI,0.0,TWO_PI);
-	MGT.init_Gaunt_CH( lmax );
-	//MGT.init_Gaunt(orb.get_lmax()+1);
-	MGT.init_Gaunt( lmax );
-
-
-
-	timer::tick("ORB_gen_tables","gen_tables",'C');
-	return;
-}
-
-void ORB_gen_tables::snap_psibeta(
-	double nlm[],
-	const int& job,
-	const Vector3<double> &R1,
-	const int &T1,
-	const int &L1,
-	const int &m1,
-	const int &N1,
-	const Vector3<double> &R2,
-	const int &T2,
-	const int &L2,
-	const int &m2,
-	const int &N2,
-	const Vector3<double> &R0,// The projector.
-	const int &T0,
-	complex<double> *nlm1,
-	const int is) const
-{
-	//TITLE ("ORB_gen_tables","snap_psibeta");
-
-	//optimized by zhengdy-soc
-	if(NSPIN==4 && ORB.Beta[T0].get_count_soc(is)==0) 
-	{
-		return;
-	}
-
-	timer::tick ("ORB_gen_tables","snap_psibeta",'X');
-
-	bool has_so = 0;
-	if(ORB.Beta[T0].get_count_soc(0)>0 ) has_so = 1;
-
-	const int nproj = ORB.nproj[T0];
-	bool *calproj = new bool[nproj];
-	int* rmesh1 = new int[nproj];
-	int* rmesh2 = new int[nproj];
-
-	//rcut of orbtials and projectors
-	const double Rcut1 = ORB.Phi[T1].getRcut();
-	const double Rcut2 = ORB.Phi[T2].getRcut();
-	
-	//in our calculation, we always put orbital phi at the left side of <phi|beta>
-	//because <phi|beta> = <beta|phi>
-	const Vector3<double> dRa = (R0-R1)*this->lat0 ; 
-	const Vector3<double> dRb = (R0-R2)*this->lat0 ;
-	
-	double distance10 = dRa.norm();
-	double distance20 = dRb.norm();
-
-	// mohan add 2011-03-10
-	// because the table length is different accordint to each length
-	// of projector, so sometimes some shorter projectors need not be 
-	// calculated.
-	bool all_out = true;
-	for(int ip=0; ip<nproj; ip++)
-	{
-		const double Rcut0 = ORB.Beta[T0].Proj[ip].getRcut();
-		if( distance10 > (Rcut1 + Rcut0) || distance20 > (Rcut2 + Rcut0) )  
-		{
-			calproj[ip] = false;
-		}
-		else
-		{
-			all_out = false;
-			calproj[ip] = true;
-			//length of table for interpolation
-			rmesh1[ip] = tbeta.get_rmesh(Rcut1, Rcut0);
-			rmesh2[ip] = tbeta.get_rmesh(Rcut2, Rcut0);
-		}
-	}
-
-	if(all_out)
-	{
-		delete[] calproj;
-		delete[] rmesh1;
-		delete[] rmesh2;
-		timer::tick ("ORB_gen_tables","snap_psibeta",'X');
-		return;
-	}
-
-
-	//FOR INTERPOLATION
-	double* curr; //current pointer
-	int iqa, iqb;
-	double psa, psb;
-	double x0a,x1a,x2a,x3a,x123a,x120a,x032a,x031a;
-	double x0b,x1b,x2b,x3b,x123b,x120b,x032b,x031b;
-	
-	psa = distance10 / tbeta.dr;
-	iqa = static_cast<int>(psa);
-   	x0a = psa - static_cast<double>(iqa);
-  	x1a = 1.0 - x0a;
-   	x2a = 2.0 - x0a;
-    x3a = 3.0 - x0a;
-	x123a = x1a*x2a*x3a/6.0;
-	x120a = x1a*x2a*x0a/6.0;
-	x032a = x0a*x3a*x2a/2.0;
-	x031a = x0a*x3a*x1a/2.0;
-	
-	psb = distance20 / tbeta.dr;
-	iqb = (int) psb;
-   	x0b = psb - (double)iqb ;
-  	x1b = 1.0 - x0b;
-   	x2b = 2.0 - x0b;
-    x3b = 3.0 - x0b;
-	x123b = x1b*x2b*x3b/6.0;
-	x120b = x1b*x2b*x0b/6.0;
-	x032b = x0b*x3b*x2b/2.0;
-	x031b = x0b*x3b*x1b/2.0;
-	
-	//UNIT VECTOR
-			
-	//double unit_vec_dRa[3];
-	//unit_vec_dRa[0] = dRa.x;
-	//unit_vec_dRa[1] = dRa.y;
-	//unit_vec_dRa[2] = dRa.z;
-	
-	double unit_vec_dRb[3];
-	unit_vec_dRb[0] = dRb.x;
-	unit_vec_dRb[1] = dRb.y;
-	unit_vec_dRb[2] = dRb.z;
-	
-	//special case for R = 0;
-	const double tiny1 = 1e-12;
-	const double tiny2 = 1e-10;
-
-	if(distance10 < tiny1) distance10 += tiny1;
-	if(distance20 < tiny1) distance20 += tiny1;
-	
-
-	// Find three dimension of 'Table_NR' '
-	// Notice!!! T1 must be orbital, 
-	// T0 must be nonlocal orbital
-	// usage : pairs_nonlocal_type(T1 : orbital, T0 : projector);
-	const int Tpair1 = tbeta.NL_Tpair(T1, T0);
-	const int Tpair2 = tbeta.NL_Tpair(T2, T0);
-	const int T1_2Lplus1 = tbeta.NL_L2plus1(T1, T0);
-	const int T2_2Lplus1 = tbeta.NL_L2plus1(T2, T0);
-
-	//gaunt index
-	const int gindex1 = L1*L1+m1;
-	const int gindex2 = L2*L2+m2;
-
-	// Peize Lin change rlya, rlyb, grlyb 2016-08-26
-	vector<double> rlya;
-	vector<double> rlyb;
-	vector<vector<double>> grlyb;
-	
-	Ylm::rl_sph_harm (T1_2Lplus1-1, dRa.x, dRa.y, dRa.z, rlya);
-	if (job == 0) 
-	{
-		Ylm::rl_sph_harm (T2_2Lplus1-1, dRb.x, dRb.y, dRb.z, rlyb);
-	}
-	else 
-	{
-		Ylm::grad_rl_sph_harm (T2_2Lplus1-1, dRb.x, dRb.y, dRb.z, rlyb, grlyb);
-	}
-	//==============================================================================
-	// Formula :                         T1       T0          T0        T2
-	// sum_{L0}sum_{m0}
-	// 			D_{L0,L0} <psi1_{L1,N1}|Beta_{L0,m0}><Beta_{L0,m0}|psi2_{L2,N2}>
-	//==============================================================================
-	//double v = 0.0;
-
-	// mohan update 2011-03-07
-	int n_projection =1;
-	if(has_so) 
-	{
-		n_projection = ORB.Beta[T0].get_nproj_soc();
-	}
-
-	vector<complex<double>> term_a_nc(n_projection,{0,0});		// Peize Lin change ptr to vector at 2020.01.31
-	vector<complex<double>> term_b_nc(n_projection,{0,0});		// Peize Lin change ptr to vector at 2020.01.31
-	int ip = -1;
-
-	for(int nb=0; nb<nproj; nb++)
-	{
-		if( !calproj[nb] ) continue;
-
-		const int L0 = ORB.Beta[T0].getL_Beta(nb);
-		//const int next_ip = 2* L0 +1;
-	
-
-//-------------------------------------------------------------------
-// move iterations for psi1 and psi2 from cal_fvnl_dbeta 
-// to here --- 2021/03/20 mohan chen
-//-------------------------------------------------------------------
-
-
-		// <psi1 | Beta>
-		const int Opair1 = tbeta.NL_Opair(Tpair1, L1, N1, nb); 
-		// <psi2 | Beta>
-		const int Opair2 = tbeta.NL_Opair(Tpair2, L2, N2, nb); 
-		
-			
-		for(int m0=0; m0<2*L0+1; m0++)
-		{
-			++ip;
-			int gindex0 = L0*L0+m0;
-			
-			//loop of {lmn}
-			double term_a = 0.0;
-			double term_b = 0.0;
-			double term_c[3] = {0,0,0};	
-			
-			//=============
-			// FIRST PART	
-			//=============
-			for(int L=0; L<T1_2Lplus1; L++)
-			{
-				//triangle rule for gaunt coefficients
-				int AL = L1 + L0;
-				int SL = abs (L1 - L0);
-				if ((L > AL) || (L < SL) || ((L-SL) % 2 == 1)) continue;
-			
-				//prefac = (i)^{lphi - lbeta - l}
-				//R0-R1 ==> <phi|beta>
-				double i_exp = pow(-1.0, (L1-L0-L)/2);
-				double rl1 = pow(distance10, L);			
-				double Interp_Vnla = 0.0;
-				if (distance10 > tiny2)
-				{	
-					curr = tbeta.Table_NR[0][Tpair1][Opair1][L];
-					if( iqa >= rmesh1[nb]-4)
-					{
-						Interp_Vnla = 0.0;
-					}
-					else
-					{
-						Interp_Vnla = i_exp * (x123a*curr[iqa]+x120a*curr[iqa+3]+x032a*curr[iqa+1]-x031a*curr[iqa+2]);
-					}
-					Interp_Vnla /= rl1;
-				}
-				else 
-				{
-					Interp_Vnla = i_exp * tbeta.Table_NR[0][Tpair1][Opair1][L][0];
-				}
-	
-				//------------------------------------------
-				//  Overlap value = S_from_table * G * Ylm				
-				//------------------------------------------
-				for(int m=0; m<2*L+1; m++)
-				{
-					int gindexa = L*L+m;
-					//double tmpGaunt = this->MGT.Get_Gaunt_SH(L1, m1, L0, m0, L, m); 
-					double tmpGaunt = this->MGT.Gaunt_Coefficients (gindex1, gindex0, gindexa);
-					term_a += tmpGaunt * Interp_Vnla * rlya[ MGT.get_lm_index(L, m) ];
-				}
-			} //end L
-
-			//=============
-			// SECOND PART	
-			//=============
-			for(int L=0; L<T2_2Lplus1; L++)
-			{
-				//triangle rule for gaunt coefficients
-				int AL = L2 + L0;
-				int SL = abs (L2 - L0);
-				if ((L > AL) || (L < SL) || ((L-SL) % 2 == 1)) continue;
-
-				double Interp_Vnlb = 0.0;
-				double Interp_Vnlc = 0.0;
-				
-				//prefac
-				double i_exp = pow(-1.0, (L2-L0-L)/2);
-				double rl2 = pow (distance20, L);	
-				if (distance20 > tiny2)
-				{
-					curr = tbeta.Table_NR[0][Tpair2][Opair2][L];
-   					
-					if( iqb >= rmesh2[nb]-4) 
-					{
-						Interp_Vnlb = 0.0;
-					}
-					else 
-					{
-						Interp_Vnlb = i_exp * (x123b*curr[iqb]+x120b*curr[iqb+3]+x032b*curr[iqb+1]-curr[iqb+2]*x031b);
-					}
-					
-					Interp_Vnlb /= rl2;
-				}
-				else 
-				{
-					Interp_Vnlb = i_exp * tbeta.Table_NR[0][Tpair2][Opair2][L][0];
-				}
-
-				
-				if (job == 1) // 1 means calculate the derivative part.
-				{
-					if (distance20 > tiny2)
-					{
-						curr = tbeta.Table_NR[1][Tpair2][Opair2][L];
-   					
-						if( iqb >= rmesh2[nb]-4) 
-						{
-							Interp_Vnlc = 0.0;
-						}
-						else 
-						{
-							Interp_Vnlc = i_exp * (x123b*curr[iqb]+x120b*curr[iqb+3]+x032b*curr[iqb+1]-curr[iqb+2]*x031b);
-						}
-						Interp_Vnlc = Interp_Vnlc / pow(distance20, L) - Interp_Vnlb * L / distance20;
-					}
-					else 
-					{
-						Interp_Vnlc = 0.0;
-					}
-				}
-				
-				// sum up the second part.	
-				for(int m=0; m<2*L+1; m++)
-				{
-					int gindexb = L*L+m;
-					//double tmpGaunt = this->MGT.Get_Gaunt_SH(L0, m0, L2, m2, L, m);
-					double tmpGaunt = this->MGT.Gaunt_Coefficients (gindex0, gindex2, gindexb);
-					const int lm = MGT.get_lm_index(L, m);
-					
-					switch (job)
-					{
-						case 0:// calculate the overlap part.
-						{
-							term_b += tmpGaunt * Interp_Vnlb * rlyb[lm];
-							break;
-						}
-						case 1: // calculate the derivative part.
-						{
-							double tt1 = tmpGaunt * Interp_Vnlc * rlyb[lm] / distance20;
-							double tt2 = tmpGaunt * Interp_Vnlb;
-										
-							for(int ir = 0; ir < 3; ir++)
-							{
-								term_c[ir] += tt1 * unit_vec_dRb[ir] 
-											+ tt2 * grlyb[lm][ir];
-							}
-
-							break;
-						}
-						default: break;
-					}
-				}// end m of SECOND PART
-			}// end L of SECOND PART
-		
-		
-			//added by zhengdy-soc, store them for soc case
-			if(has_so)
-			{
-				term_a_nc[ip] = term_a;
-				term_b_nc[ip] = term_b;
-			}
-		
-			//===============================================
-			// THIRD PART: SUM THE VALUE FROM ALL PROJECTS.
-			//===============================================
-			switch (job)
-			{
-				case 0://calculate the overlap part.
-				{
-					//nlm[0] += term_a * term_b * ORB.Beta[T0].getCoefficient_D(L0, L0);//LiuXh 2016-01-14
-					if(!has_so) 
-					{
-						nlm[0] += term_a * term_b * ORB.Beta[T0].getCoefficient_D(nb, nb);//LiuXh 2016-01-14
-					}
-					break;
-				}
-				case 1: //calculate the derivative part.
-				{
-					for(int jr = 0; jr < 3; jr++) 
-					{
-						//nlm[jr] += term_c[jr] * term_a * ORB.Beta[T0].getCoefficient_D(L0, L0);//LiuXh 2016-01-14
-						if(!has_so) 
-						{
-							nlm[jr] += term_c[jr] * term_a * ORB.Beta[T0].getCoefficient_D(nb, nb);//LiuXh 2016-01-14
-						}
-					}
-					break;
-				}
-				default: break;
-			}
-		}//!m0
-	}//!L0
-
-	//zhengdy-soc, calculate non-local term
-	if(has_so)
-	{
-		switch (job)
-		{
-			case 0://overlap part
-				for(int no=0;no<ORB.Beta[T0].get_count_soc(is);no++)
-				{
-					const int p1 = ORB.Beta[T0].get_index1_soc(is, no);
-					const int p2 = ORB.Beta[T0].get_index2_soc(is, no);
-					if(NSPIN==4 && nlm1!=NULL)
-					{
-						nlm1[is] += term_a_nc[p1] * term_b_nc[p2] * ORB.Beta[T0].getCoefficient_D_so(is, p2, p1);
-					}
-					else if(NSPIN!=4)
-					{
-						nlm[0] += (term_a_nc[p1] * term_b_nc[p2] * ORB.Beta[T0].getCoefficient_D_so(0, p2, p1)).real();
-					}
-					else
-					{
-						WARNING_QUIT("ORB_gen_tables::snap_psibeta","Conflict! Didn't count non-local part");
-					}
-				}
-				break;
-			case 1://need to be added later
-			{break;}
-			default: break;
-		}
-	}
-
-	delete[] calproj;
-	delete[] rmesh1;
-	delete[] rmesh2;
-
-	timer::tick ("ORB_gen_tables","snap_psibeta",'X');
-	return;
-}
-
-void ORB_gen_tables::snap_psipsi(
-	double olm[],
-	const int &job, //0, 1
-	const char &dtype, // derivative type: S or T
-	const Vector3<double> &R1,
-    const int &T1,
-    const int &L1,
-    const int &m1,
-    const int &N1,
-    const Vector3<double> &R2,
-    const int &T2,
-    const int &L2,
-    const int &m2,
-    const int &N2,
-	complex<double> *olm1)const
-{
-	//TITLE("ORB_gen_tables","snap_psipsi");
-	//timer::tick ("ORB_gen_tables", "snap_psipsi");
-	if(job != 0 && job != 1)
-	{
-		WARNING_QUIT("ORB_gen_tables::snap_psipsi","job must be equal to 0 or 1!");
-	}
-	
-	Numerical_Orbital::set_position(R1, R2);
-	assert(this->lat0>0.0);
-
-	// (1) get distance between R1 and R2 (a.u.)
-	// judge if there exist overlap
-	double distance = Numerical_Orbital::get_distance()*this->lat0;
-	
-	const double Rcut1 = ORB.Phi[T1].getRcut();
-	const double Rcut2 = ORB.Phi[T2].getRcut();
-
-	if(job == 0) ZEROS(olm, 1);
-	else if(job == 1) ZEROS(olm, 3);
-	
-	if( distance > (Rcut1 + Rcut2) ) return;
-	
-	//if distance == 0
-	//\int psi(r) psi(r-R) dr independent of R if R == 0
-	//distance += tiny1 avoid overflow during calculation
-	const double tiny1 = 1e-12;
-	const double tiny2 = 1e-10;
-	if(distance < tiny1) distance += tiny1;
-	
-	// (2) if there exist overlap, calculate the mesh number
-	// between two atoms
-	const int rmesh = this->MOT.get_rmesh(Rcut1, Rcut2);
-	
-	// (3) Find three dimension of 'Table_S' or 'Table_T'
-	// dim1 : type pairs,
-	// dim2 : radial orbital pairs,
-	// dim3 : find lmax between T1 and T2, and get lmax*2+1
-	const int dim1 = this->MOT.OV_Tpair(T1, T2);
-	const int dim3 = this->MOT.OV_L2plus1(T1, T2); //2*lmax+1
-	
-	int dim2;
-	if (T1 <= T2) dim2 = this->MOT.OV_Opair(dim1, L1, L2, N1, N2); 
-	else dim2 = this->MOT.OV_Opair(dim1, L2, L1, N2, N1);
-		
-	// Find the needed Ylm(dR) dimension 
-	const int nlm = dim3 * dim3; //(2lmax+1)*(2lmax+!)
-
-	//Gaunt Index
-	const int gindex1 = L1*L1+m1;
-	const int gindex2 = L2*L2+m2;
-
-	assert(nlm < 400);
-	// Peize Lin change rly, grly 2016-08-26
-	vector<double> rly;			
-	vector<vector<double>> grly;
-	
-//	double *ylm = new double[nlm];
-//	dR = R1 - R2;
-	double arr_dR[3];
-	arr_dR[0] = Numerical_Orbital::getX() * this->lat0;
-	arr_dR[1] = Numerical_Orbital::getY() * this->lat0;
-	arr_dR[2] = Numerical_Orbital::getZ() * this->lat0;
-	
-	//double xdr = arr_dR[0] / distance;
-	//double ydr = arr_dR[1] / distance;
-	//double zdr = arr_dR[2] / distance;
-	
-	//=======================
-	// *r**l*Ylm_real
-	// include its derivations
-	//=======================
-	if (job == 0) 
-	{
-//		Ylm::rlylm(dim3, arr_dR[0], arr_dR[1], arr_dR[2], rly);
-//		Ylm::sph_harm (dim3-1, xdr, ydr, zdr, rly);
-		Ylm::rl_sph_harm (dim3-1, arr_dR[0], arr_dR[1], arr_dR[2], rly);
-	}
-	else 
-	{
-//		Ylm::rlylm(dim3, arr_dR[0], arr_dR[1], arr_dR[2], rly, grly);
-		Ylm::grad_rl_sph_harm (dim3-1, arr_dR[0], arr_dR[1], arr_dR[2], rly, grly);
-	}
-
-	switch( dtype )
-	{
-		case 'S':
-		for (int L = 0; L < dim3; L++) //maxL = dim3-1
-		{
-			//===========================================================
-			// triangle rule for L and sum of L, L1, L2 should be even
-			//===========================================================
-			int AL = L1 + L2;
-			int SL = abs (L1 - L2);
-
-			if ((L > AL) || (L < SL) || ((L-SL) % 2 == 1)) continue;
-			
-			double Interp_Slm = 0.0;
-			double Interp_dSlm = 0.0;
-			double tmpOlm0 = 0.0;
-			double tmpOlm1 = 0.0;
-			
-			// prefactor
-			double i_exp = pow(-1.0, (L1 - L2 - L) / 2);
-			double rl = pow (distance, L);
-
-			if (distance > tiny2)
-			{
-				Interp_Slm = i_exp * Mathzone::Polynomial_Interpolation(
-						MOT.Table_SR[0][dim1][dim2][L],	rmesh, MOT.dr, distance );
-				Interp_Slm /= rl;
-			}
-			else // distance = 0.0; 
-			{
-				Interp_Slm = i_exp * MOT.Table_SR[0][dim1][dim2][L][0];
-			}
-				
-			if (job == 1)//calculate the derivative.
-			{
-				if (distance > tiny2)
-				{
-					Interp_dSlm = i_exp * Mathzone::Polynomial_Interpolation(
-						MOT.Table_SR[1][dim1][dim2][L], rmesh, MOT.dr, distance );
-					Interp_dSlm = Interp_dSlm / pow (distance, L) - Interp_Slm * L / distance;
-				}
-				else 
-				{
-					Interp_dSlm = 0.0;
-				}
-			}
-			
-			for (int m = 0; m < 2*L+1; m++)
-			{
-				int gindex = L*L+m;
-	//			double tmpGaunt1 = MGT.Get_Gaunt_SH(L1, m1, L2, m2, L, m);
-				double tmpGaunt = MGT.Gaunt_Coefficients (gindex1, gindex2, gindex);	
-							
-				tmpOlm0 = Interp_Slm * tmpGaunt;
-	
-				if (job == 1) 
-				{
-					tmpOlm1 = Interp_dSlm * tmpGaunt;
-				}
-				
-				switch( job )
-				{
-					case 0: // calculate overlap.
-					{	
-						if(NSPIN!=4) olm[0] += tmpOlm0 * rly[ MGT.get_lm_index(L, m) ] ;
-						else if(olm1!= NULL)
-						{
-							olm1[0] += tmpOlm0 * rly[ MGT.get_lm_index(L,m) ] ;
-							olm1[1] += 0;//tmpOlm0 * (tmp(0,0)+tmp(0,1));
-							olm1[2] += 0;//tmpOlm0 * (tmp(1,0)+tmp(1,1));
-							olm1[3] += tmpOlm0 * rly[ MGT.get_lm_index(L,m) ] ;
-							
-						}
-						else
-						{
-							WARNING_QUIT("ORB_gen_tables::snap_psipsi","something wrong!");
-							
-						}
-					
-						/*		
-						if( abs ( tmpOlm0 * rly[ MGT.get_lm_index(L, m) ] ) > 1.0e-3 )
-						{
-						cout << " L=" << L << " m=" << m << " tmpOlm0=" << tmpOlm0 
-						<< " rly=" << rly[ MGT.get_lm_index(L, m) ] 
-						<< " r=" << olm[0]
-						<< endl;
-						}
-						*/
-						break;
-					}
-					case 1: // calculate gradient.
-					{
-						for(int ir = 0; ir < 3; ir++)
-						{
-							olm[ir] += tmpOlm0 * grly[ MGT.get_lm_index(L, m) ][ir]
-									 + tmpOlm1 * rly[ MGT.get_lm_index(L, m) ] * arr_dR[ir] / distance;
-						}
-						break;
-					}
-					default: break;
-				}
-			}//!m
-		}
-		break;
-
-		case 'T':
-		for (int L = 0; L < dim3; L++)
-		{
-			int AL = L1 + L2;
-			int SL = abs (L1 - L2);
-
-			if ((L > AL) || (L < SL) || ((L-SL) % 2 == 1)) continue;
-
-			double Interp_Tlm, Interp_dTlm, tmpKem0, tmpKem1;
-			Interp_Tlm = Interp_dTlm = tmpKem0 = tmpKem1 = 0.0;
-			
-			//pre-fac
-			double i_exp = pow(-1.0, (L1 - L2 - L) / 2);
-
-			double rl = pow (distance, L);
-			if (distance > tiny2)
-			{
-				Interp_Tlm = i_exp * Mathzone::Polynomial_Interpolation(
-						MOT.Table_TR[0][dim1][dim2][L],	rmesh, MOT.dr, distance );	
-				Interp_Tlm /= rl;
-			}
-			else Interp_Tlm = i_exp * MOT.Table_TR[0][dim1][dim2][L][0];
-				
-			
-			if (job == 1)
-			{
-				if (distance > tiny2)
-				{
-					Interp_dTlm = i_exp * Mathzone::Polynomial_Interpolation(
-						MOT.Table_TR[1][dim1][dim2][L], rmesh, MOT.dr, distance );
-					Interp_dTlm = Interp_dTlm / rl - Interp_Tlm * L / distance;
-				}
-				else Interp_dTlm = 0.0;
-			}
-			
-			for (int m = 0; m < 2*L+1; m++)
-			{
-				int gindex = L*L+m;
-			//	double tmpGaunt = MGT.Get_Gaunt_SH(L1, m1, L2, m2, L, m);
-				double tmpGaunt = MGT.Gaunt_Coefficients (gindex1, gindex2, gindex);
-					
-				tmpKem0 = Interp_Tlm * tmpGaunt;
-				if (job == 1) 
-				{
-					tmpKem1 = Interp_dTlm * tmpGaunt;
-				}
-				
-				switch( job )
-				{
-					case 0:
-					{
-						if(NSPIN!=4) olm[0] += tmpKem0 * rly[ MGT.get_lm_index(L, m) ];
-						else if(olm1 != NULL)
-						{
-							olm1[0] += tmpKem0 * rly[ MGT.get_lm_index(L,m) ];
-							olm1[1] += 0;//tmpKem0 * (tmp(0,0)+tmp(0,1));
-							olm1[2] += 0;//tmpKem0 * (tmp(1,0)+tmp(1,1));
-							olm1[3] += tmpKem0 * rly[ MGT.get_lm_index(L,m) ];
-						}
-						else
-						{
-							WARNING_QUIT("ORB_gen_tables::snap_psipsi","something wrong in T.");
-						}
-						break;
-					}
-					case 1: 
-					{
-						for(int ir = 0; ir < 3; ir++)
-						{
-							olm[ir] += tmpKem0 * grly[ MGT.get_lm_index(L, m) ][ir]
-								    + tmpKem1 * rly[ MGT.get_lm_index(L, m) ] * arr_dR[ir] / distance;
-						}
-						break;
-					}
-					default: break;
-				}
-			}// end T: m
-		}// end T: :
-		break;
-	}
-//	timer::tick ("ORB_gen_tables", "snap_psipsi");
-	return;
-}
-
-double ORB_gen_tables::get_distance( const Vector3<double> &R1, const Vector3<double> &R2)const
-{
-	assert( this->lat0 > 0.0);
-	Vector3<double> dR = R1 - R2;
-	return dR.norm() * this->lat0;	
-}
-
-//caoyu add 2021-03-17
-void ORB_gen_tables::snap_psialpha(
-	double olm[],
-	const int& job,
-	const Vector3<double>& R1,
-	const int& T1,
-	const int& L1,
-	const int& m1,
-	const int& N1,
-	const Vector3<double>& R2,
-	const int& T2,
-	const int& L2,
-	const int& m2,
-	const int& N2) const
-{
-
-	if (job != 0 && job != 1)
-	{
-		WARNING_QUIT("ORB_gen_tables::snap_psialpha", "job must be equal to 0 or 1!");
-	}
-
-	Numerical_Orbital::set_position(R1, R2);
-	assert(this->lat0 > 0.0);
-
-	// (1) get distance between R1 and R2 (a.u.)
-	// judge if there exist overlap
-	double distance = Numerical_Orbital::get_distance() * this->lat0;
-
-	const double Rcut1 = ORB.Phi[T1].getRcut();
-	const double Rcut2 = ORB.Alpha[0].getRcut();
-
-	if (job == 0) ZEROS(olm, 1);
-	else if (job == 1) ZEROS(olm, 3);
-
-	if (distance > (Rcut1 + Rcut2)) return;
-
-	//if distance == 0
-	//\int psi(r) psi(r-R) dr independent of R if R == 0
-	//distance += tiny1 avoid overflow during calculation
-	const double tiny1 = 1e-12;
-	const double tiny2 = 1e-10;
-	if (distance < tiny1) distance += tiny1;
-
-	// (2) if there exist overlap, calculate the mesh number
-	// between two atoms
-	const int rmesh = this->talpha.get_rmesh(Rcut1, Rcut2);
-
-	// (3) Find three dimension of 'Table_DS'
-	// dim1 : type pairs, equal to T1 here 
-	// dim2 : radial orbital pairs,
-	// dim3 : find lmax between T1 and T2, and get lmax*2+1
-	const int dim1 = T1;
-	int dim2 = this->talpha.DS_Opair(dim1, L1, L2, N1, N2);
-	int dim3 = this->talpha.DS_2Lplus1[T1];
-
-	//Gaunt Index
-		const int gindex1 = L1 * L1 + m1;
-	const int gindex2 = L2 * L2 + m2;
-
-	// Peize Lin change rly, grly 2016-08-26
-	vector<double> rly;
-	vector<vector<double>> grly;
-
-	double arr_dR[3];
-	arr_dR[0] = Numerical_Orbital::getX() * this->lat0;
-	arr_dR[1] = Numerical_Orbital::getY() * this->lat0;
-	arr_dR[2] = Numerical_Orbital::getZ() * this->lat0;
-
-	//double xdr = arr_dR[0] / distance;
-	//double ydr = arr_dR[1] / distance;
-	//double zdr = arr_dR[2] / distance;
-
-	//=======================
-	// *r**l*Ylm_real
-	// include its derivations
-	//=======================
-	if (job == 0)
-	{
-		Ylm::rl_sph_harm(dim3 - 1, arr_dR[0], arr_dR[1], arr_dR[2], rly);
-	}
-	else
-	{
-		Ylm::grad_rl_sph_harm(dim3 - 1, arr_dR[0], arr_dR[1], arr_dR[2], rly, grly);
-	}
-
-	for (int L = 0; L < dim3; L++) //maxL = dim3-1
-	{
-		//===========================================================
-		// triangle rule for L and sum of L, L1, L2 should be even
-		//===========================================================
-		int AL = L1 + L2;
-		int SL = abs(L1 - L2);
-
-		if ((L > AL) || (L < SL) || ((L - SL) % 2 == 1)) continue;
-
-		double Interp_Slm = 0.0;
-		double Interp_dSlm = 0.0;
-		double tmpOlm0 = 0.0;
-		double tmpOlm1 = 0.0;
-
-		// prefactor
-		double i_exp = pow(-1.0, (L1 - L2 - L) / 2);
-		double rl = pow(distance, L);
-
-		if (distance > tiny2)
-		{
-			Interp_Slm = i_exp * Mathzone::Polynomial_Interpolation(
-				talpha.Table_DSR[0][dim1][dim2][L], rmesh, MOT.dr, distance);
-			Interp_Slm /= rl;
-		}
-		else // distance = 0.0; 
-		{
-			Interp_Slm = i_exp * talpha.Table_DSR[0][dim1][dim2][L][0];
-		}
-
-		if (job == 1)//calculate the derivative.
-		{
-			if (distance > tiny2)
-			{
-				Interp_dSlm = i_exp * Mathzone::Polynomial_Interpolation(
-					talpha.Table_DSR[1][dim1][dim2][L], rmesh, MOT.dr, distance);
-				Interp_dSlm = Interp_dSlm / pow(distance, L) - Interp_Slm * L / distance;
-			}
-			else
-			{
-				Interp_dSlm = 0.0;
-			}
-		}
-
-		for (int m = 0; m < 2 * L + 1; m++)
-		{
-			int gindex = L * L + m;
-			//			double tmpGaunt1 = MGT.Get_Gaunt_SH(L1, m1, L2, m2, L, m);
-			double tmpGaunt = MGT.Gaunt_Coefficients(gindex1, gindex2, gindex);
-
-			tmpOlm0 = Interp_Slm * tmpGaunt;
-
-			if (job == 1)
-			{
-				tmpOlm1 = Interp_dSlm * tmpGaunt;
-			}
-
-			switch (job)
-			{
-			case 0: // calculate overlap.
-			{
-				if (NSPIN != 4) olm[0] += tmpOlm0 * rly[MGT.get_lm_index(L, m)];
-				else
-				{
-					WARNING_QUIT("ORB_gen_tables::snap_psialpha", "deepks with NSPIN>1 has not implemented yet!");
-				}
-				/*
-				if( abs ( tmpOlm0 * rly[ MGT.get_lm_index(L, m) ] ) > 1.0e-3 )
-				{
-				cout << " L=" << L << " m=" << m << " tmpOlm0=" << tmpOlm0
-				<< " rly=" << rly[ MGT.get_lm_index(L, m) ]
-				<< " r=" << olm[0]
-				<< endl;
-				}
-				*/
-				break;
-			}
-			case 1: // calculate gradient.
-			{
-				for (int ir = 0; ir < 3; ir++)
-				{
-					olm[ir] += tmpOlm0 * grly[MGT.get_lm_index(L, m)][ir]
-						+ tmpOlm1 * rly[MGT.get_lm_index(L, m)] * arr_dR[ir] / distance;
-				}
-				break;
-			}
-			default: break;
-			}
-		}//!m
-	}
-
-	return;
-}
+#include "src_pw/global.h"
+#include "ORB_read.h"
+#include "ORB_gen_tables.h"
+#include "src_global/ylm.h"
+
+// global instance of the ORB_gen_tables class
+ORB_gen_tables UOT;
+
+ORB_gen_tables::ORB_gen_tables() {}
+ORB_gen_tables::~ORB_gen_tables() {}
+
+// called in hamilt_linear::init_before_ions.
+void ORB_gen_tables::gen_tables(
+	const int &job0,
+	LCAO_Orbitals &orb,
+	const int &Lmax_exx)
+{
+	TITLE("ORB_gen_tables", "gen_tables");
+	timer::tick("ORB_gen_tables", "gen_tables", 'C');
+
+	ofs_running << "\n SETUP THE TWO-CENTER INTEGRATION TABLES" << endl;
+
+	//=========================================
+	// (1) MOT: make overlap table.
+	//=========================================
+	MOT.allocate(
+		orb.get_ntype(), // number of atom types
+		orb.get_lmax(),	 // max L used to calculate overlap
+		orb.get_kmesh(), // kpoints, for integration in k space
+		orb.get_Rmax(),	 // max value of radial table
+		orb.get_dR(),	 // delta R, for making radial table
+		orb.get_dk());	 // delta k, for integration in k space
+
+	tbeta.allocate(
+		orb.get_ntype(), // number of atom types
+		orb.get_lmax(),	 // max L used to calculate overlap
+		orb.get_kmesh(), // kpoints, for integration in k space
+		orb.get_Rmax(),	 // max value of radial table
+		orb.get_dR(),	 // delta R, for making radial table
+		orb.get_dk());	 // delta k, for integration in k space
+
+	//caoyu add 2021-03-18
+	if (INPUT.out_descriptor && BASIS_TYPE == "lcao")
+	{
+		talpha.allocate(
+			orb.get_ntype(), // number of atom types
+			orb.get_lmax(),	 // max L used to calculate overlap
+			orb.get_kmesh(), // kpoints, for integration in k space
+			orb.get_Rmax(),	 // max value of radial table
+			orb.get_dR(),	 // delta R, for making radial table
+			orb.get_dk());	 // delta k, for integration in k space
+	}
+
+	// OV: overlap
+	MOT.init_OV_Tpair(orb);
+	MOT.init_OV_Opair(orb);
+
+	// NL: nonlocal
+	tbeta.init_NL_Tpair();
+	tbeta.init_NL_Opair(orb); // add 2009-5-8
+
+	//caoyu add 2021-03-18
+	// DS: Descriptor
+	if (INPUT.out_descriptor && BASIS_TYPE == "lcao")
+	{
+		talpha.init_DS_Opair();
+		talpha.init_DS_2Lplus1();
+	}
+
+	//=========================================
+	// (2) init Ylm Coef
+	//=========================================
+	//liaochen add 2010/4/29
+	Ylm::set_coefficients();
+
+	// PLEASE add explanations for all options of 'orb_num' and 'mode'
+	// mohan add 2021-04-03
+	// Peize Lin update 2016-01-26
+	int orb_num = 2; //
+	int mode = 1;	 // 1: <phi|phi> and <phi|beta>
+	int Lmax_used = 0;
+	int Lmax = 0;
+
+	MOT.init_Table_Spherical_Bessel(orb_num, mode, Lmax_used, Lmax, Lmax_exx);
+
+	//calculate S(R) for interpolation
+	MOT.init_Table(job0, orb);
+	tbeta.init_Table_Beta(MOT.pSB); // add 2009-5-8
+
+	//caoyu add 2021-03-18
+	if (INPUT.out_descriptor && BASIS_TYPE == "lcao")
+	{
+		talpha.init_Table_Alpha(MOT.pSB);
+		//talpha.print_Table_DSR();
+	}
+
+	//=========================================
+	// (3) make Gaunt coefficients table
+	//=========================================
+
+	const int lmax = (Lmax_used - 1) / 2;
+	//MGT.init_Ylm_Gaunt(orb.get_lmax()+1, 0.0,PI,0.0,TWO_PI);
+	MGT.init_Gaunt_CH(lmax);
+	//MGT.init_Gaunt(orb.get_lmax()+1);
+	MGT.init_Gaunt(lmax);
+
+	timer::tick("ORB_gen_tables", "gen_tables", 'C');
+	return;
+}
+
+void ORB_gen_tables::snap_psibeta(
+	double nlm[],
+	const int &job,
+	const Vector3<double> &R1,
+	const int &T1,
+	const int &L1,
+	const int &m1,
+	const int &N1,
+	const Vector3<double> &R2,
+	const int &T2,
+	const int &L2,
+	const int &m2,
+	const int &N2,
+	const Vector3<double> &R0, // The projector.
+	const int &T0,
+	complex<double> *nlm1,
+	const int is) const
+{
+	//TITLE ("ORB_gen_tables","snap_psibeta");
+
+	//optimized by zhengdy-soc
+	if (NSPIN == 4 && ORB.Beta[T0].get_count_soc(is) == 0)
+	{
+		return;
+	}
+
+	timer::tick("ORB_gen_tables", "snap_psibeta", 'X');
+
+	bool has_so = 0;
+	if (ORB.Beta[T0].get_count_soc(0) > 0)
+		has_so = 1;
+
+	const int nproj = ORB.nproj[T0];
+	bool *calproj = new bool[nproj];
+	int *rmesh1 = new int[nproj];
+	int *rmesh2 = new int[nproj];
+
+	//rcut of orbitals and projectors
+	const double Rcut1 = ORB.Phi[T1].getRcut();
+	const double Rcut2 = ORB.Phi[T2].getRcut();
+
+	//in our calculation, we always put orbital phi at the left side of <phi|beta>
+	//because <phi|beta> = <beta|phi>
+	const Vector3<double> dRa = (R0 - R1) * this->lat0;
+	const Vector3<double> dRb = (R0 - R2) * this->lat0;
+
+	double distance10 = dRa.norm();
+	double distance20 = dRb.norm();
+
+	// mohan add 2011-03-10
+	// because the table length is different according to each length
+	// of projector, so sometimes some shorter projectors need not be
+	// calculated.
+	bool all_out = true;
+	for (int ip = 0; ip < nproj; ip++)
+	{
+		const double Rcut0 = ORB.Beta[T0].Proj[ip].getRcut();
+		if (distance10 > (Rcut1 + Rcut0) || distance20 > (Rcut2 + Rcut0))
+		{
+			calproj[ip] = false;
+		}
+		else
+		{
+			all_out = false;
+			calproj[ip] = true;
+			//length of table for interpolation
+			rmesh1[ip] = tbeta.get_rmesh(Rcut1, Rcut0);
+			rmesh2[ip] = tbeta.get_rmesh(Rcut2, Rcut0);
+		}
+	}
+
+	if (all_out)
+	{
+		delete[] calproj;
+		delete[] rmesh1;
+		delete[] rmesh2;
+		timer::tick("ORB_gen_tables", "snap_psibeta", 'X');
+		return;
+	}
+
+	//FOR INTERPOLATION
+	double *curr; //current pointer
+	int iqa, iqb;
+	double psa, psb;
+	double x0a, x1a, x2a, x3a, x123a, x120a, x032a, x031a;
+	double x0b, x1b, x2b, x3b, x123b, x120b, x032b, x031b;
+
+	psa = distance10 / tbeta.dr;
+	iqa = static_cast<int>(psa);
+	x0a = psa - static_cast<double>(iqa);
+	x1a = 1.0 - x0a;
+	x2a = 2.0 - x0a;
+	x3a = 3.0 - x0a;
+	x123a = x1a * x2a * x3a / 6.0;
+	x120a = x1a * x2a * x0a / 6.0;
+	x032a = x0a * x3a * x2a / 2.0;
+	x031a = x0a * x3a * x1a / 2.0;
+
+	psb = distance20 / tbeta.dr;
+	iqb = (int)psb;
+	x0b = psb - (double)iqb;
+	x1b = 1.0 - x0b;
+	x2b = 2.0 - x0b;
+	x3b = 3.0 - x0b;
+	x123b = x1b * x2b * x3b / 6.0;
+	x120b = x1b * x2b * x0b / 6.0;
+	x032b = x0b * x3b * x2b / 2.0;
+	x031b = x0b * x3b * x1b / 2.0;
+
+	//UNIT VECTOR
+
+	//double unit_vec_dRa[3];
+	//unit_vec_dRa[0] = dRa.x;
+	//unit_vec_dRa[1] = dRa.y;
+	//unit_vec_dRa[2] = dRa.z;
+
+	double unit_vec_dRb[3];
+	unit_vec_dRb[0] = dRb.x;
+	unit_vec_dRb[1] = dRb.y;
+	unit_vec_dRb[2] = dRb.z;
+
+	//special case for R = 0;
+	const double tiny1 = 1e-12;
+	const double tiny2 = 1e-10;
+
+	if (distance10 < tiny1)
+		distance10 += tiny1;
+	if (distance20 < tiny1)
+		distance20 += tiny1;
+
+	// Find the three dimensions of 'Table_NR'
+	// Notice!!! T1 must be orbital,
+	// T0 must be nonlocal orbital
+	// usage : NL_Tpair(T1 : orbital, T0 : projector);
+	const int Tpair1 = tbeta.NL_Tpair(T1, T0);
+	const int Tpair2 = tbeta.NL_Tpair(T2, T0);
+	const int T1_2Lplus1 = tbeta.NL_L2plus1(T1, T0);
+	const int T2_2Lplus1 = tbeta.NL_L2plus1(T2, T0);
+
+	//gaunt index
+	const int gindex1 = L1 * L1 + m1;
+	const int gindex2 = L2 * L2 + m2;
+
+	// Peize Lin change rlya, rlyb, grlyb 2016-08-26
+	vector<double> rlya;
+	vector<double> rlyb;
+	vector<vector<double>> grlyb;
+
+	Ylm::rl_sph_harm(T1_2Lplus1 - 1, dRa.x, dRa.y, dRa.z, rlya);
+	if (job == 0)
+	{
+		Ylm::rl_sph_harm(T2_2Lplus1 - 1, dRb.x, dRb.y, dRb.z, rlyb);
+	}
+	else
+	{
+		Ylm::grad_rl_sph_harm(T2_2Lplus1 - 1, dRb.x, dRb.y, dRb.z, rlyb, grlyb);
+	}
+	//==============================================================================
+	// Formula :                         T1       T0          T0        T2
+	// sum_{L0}sum_{m0}
+	// 			D_{L0,L0} <psi1_{L1,N1}|Beta_{L0,m0}><Beta_{L0,m0}|psi2_{L2,N2}>
+	//==============================================================================
+	//double v = 0.0;
+
+	// mohan update 2011-03-07
+	int n_projection = 1;
+	if (has_so)
+	{
+		n_projection = ORB.Beta[T0].get_nproj_soc();
+	}
+
+	vector<complex<double>> term_a_nc(n_projection, {0, 0}); // Peize Lin change ptr to vector at 2020.01.31
+	vector<complex<double>> term_b_nc(n_projection, {0, 0}); // Peize Lin change ptr to vector at 2020.01.31
+	int ip = -1;
+
+	for (int nb = 0; nb < nproj; nb++)
+	{
+		if (!calproj[nb])
+			continue;
+
+		const int L0 = ORB.Beta[T0].getL_Beta(nb);
+		//const int next_ip = 2* L0 +1;
+
+		//-------------------------------------------------------------------
+		// move iterations for psi1 and psi2 from cal_fvnl_dbeta
+		// to here --- 2021/03/20 mohan chen
+		//-------------------------------------------------------------------
+
+		// <psi1 | Beta>
+		const int Opair1 = tbeta.NL_Opair(Tpair1, L1, N1, nb);
+		// <psi2 | Beta>
+		const int Opair2 = tbeta.NL_Opair(Tpair2, L2, N2, nb);
+
+		for (int m0 = 0; m0 < 2 * L0 + 1; m0++)
+		{
+			++ip;
+			int gindex0 = L0 * L0 + m0;
+
+			//loop of {lmn}
+			double term_a = 0.0;
+			double term_b = 0.0;
+			double term_c[3] = {0, 0, 0};
+
+			//=============
+			// FIRST PART
+			//=============
+			for (int L = 0; L < T1_2Lplus1; L++)
+			{
+				//triangle rule for gaunt coefficients
+				int AL = L1 + L0;
+				int SL = abs(L1 - L0);
+				if ((L > AL) || (L < SL) || ((L - SL) % 2 == 1))
+					continue;
+
+				//prefac = (i)^{lphi - lbeta - l}
+				//R0-R1 ==> <phi|beta>
+				double i_exp = pow(-1.0, (L1 - L0 - L) / 2);
+				double rl1 = pow(distance10, L);
+				double Interp_Vnla = 0.0;
+				if (distance10 > tiny2)
+				{
+					curr = tbeta.Table_NR[0][Tpair1][Opair1][L];
+					if (iqa >= rmesh1[nb] - 4)
+					{
+						Interp_Vnla = 0.0;
+					}
+					else
+					{
+						Interp_Vnla = i_exp * (x123a * curr[iqa] + x120a * curr[iqa + 3] + x032a * curr[iqa + 1] - x031a * curr[iqa + 2]);
+					}
+					Interp_Vnla /= rl1;
+				}
+				else
+				{
+					Interp_Vnla = i_exp * tbeta.Table_NR[0][Tpair1][Opair1][L][0];
+				}
+
+				//------------------------------------------
+				//  Overlap value = S_from_table * G * Ylm
+				//------------------------------------------
+				for (int m = 0; m < 2 * L + 1; m++)
+				{
+					int gindexa = L * L + m;
+					//double tmpGaunt = this->MGT.Get_Gaunt_SH(L1, m1, L0, m0, L, m);
+					double tmpGaunt = this->MGT.Gaunt_Coefficients(gindex1, gindex0, gindexa);
+					term_a += tmpGaunt * Interp_Vnla * rlya[MGT.get_lm_index(L, m)];
+				}
+			} //end L
+
+			//=============
+			// SECOND PART
+			//=============
+			for (int L = 0; L < T2_2Lplus1; L++)
+			{
+				//triangle rule for gaunt coefficients
+				int AL = L2 + L0;
+				int SL = abs(L2 - L0);
+				if ((L > AL) || (L < SL) || ((L - SL) % 2 == 1))
+					continue;
+
+				double Interp_Vnlb = 0.0;
+				double Interp_Vnlc = 0.0;
+
+				//prefac
+				double i_exp = pow(-1.0, (L2 - L0 - L) / 2);
+				double rl2 = pow(distance20, L);
+				if (distance20 > tiny2)
+				{
+					curr = tbeta.Table_NR[0][Tpair2][Opair2][L];
+
+					if (iqb >= rmesh2[nb] - 4)
+					{
+						Interp_Vnlb = 0.0;
+					}
+					else
+					{
+						Interp_Vnlb = i_exp * (x123b * curr[iqb] + x120b * curr[iqb + 3] + x032b * curr[iqb + 1] - curr[iqb + 2] * x031b);
+					}
+
+					Interp_Vnlb /= rl2;
+				}
+				else
+				{
+					Interp_Vnlb = i_exp * tbeta.Table_NR[0][Tpair2][Opair2][L][0];
+				}
+
+				if (job == 1) // 1 means calculate the derivative part.
+				{
+					if (distance20 > tiny2)
+					{
+						curr = tbeta.Table_NR[1][Tpair2][Opair2][L];
+
+						if (iqb >= rmesh2[nb] - 4)
+						{
+							Interp_Vnlc = 0.0;
+						}
+						else
+						{
+							Interp_Vnlc = i_exp * (x123b * curr[iqb] + x120b * curr[iqb + 3] + x032b * curr[iqb + 1] - curr[iqb + 2] * x031b);
+						}
+						Interp_Vnlc = Interp_Vnlc / pow(distance20, L) - Interp_Vnlb * L / distance20;
+					}
+					else
+					{
+						Interp_Vnlc = 0.0;
+					}
+				}
+
+				// sum up the second part.
+				for (int m = 0; m < 2 * L + 1; m++)
+				{
+					int gindexb = L * L + m;
+					//double tmpGaunt = this->MGT.Get_Gaunt_SH(L0, m0, L2, m2, L, m);
+					double tmpGaunt = this->MGT.Gaunt_Coefficients(gindex0, gindex2, gindexb);
+					const int lm = MGT.get_lm_index(L, m);
+
+					switch (job)
+					{
+					case 0: // calculate the overlap part.
+					{
+						term_b += tmpGaunt * Interp_Vnlb * rlyb[lm];
+						break;
+					}
+					case 1: // calculate the derivative part.
+					{
+						double tt1 = tmpGaunt * Interp_Vnlc * rlyb[lm] / distance20;
+						double tt2 = tmpGaunt * Interp_Vnlb;
+
+						for (int ir = 0; ir < 3; ir++)
+						{
+							term_c[ir] += tt1 * unit_vec_dRb[ir] + tt2 * grlyb[lm][ir];
+						}
+
+						break;
+					}
+					default:
+						break;
+					}
+				} // end m of SECOND PART
+			}	  // end L of SECOND PART
+
+			//added by zhengdy-soc, store them for soc case
+			if (has_so)
+			{
+				term_a_nc[ip] = term_a;
+				term_b_nc[ip] = term_b;
+			}
+
+			//===============================================
+			// THIRD PART: SUM THE VALUE FROM ALL PROJECTORS.
+			//===============================================
+			switch (job)
+			{
+			case 0: //calculate the overlap part.
+			{
+				//nlm[0] += term_a * term_b * ORB.Beta[T0].getCoefficient_D(L0, L0);//LiuXh 2016-01-14
+				if (!has_so)
+				{
+					nlm[0] += term_a * term_b * ORB.Beta[T0].getCoefficient_D(nb, nb); //LiuXh 2016-01-14
+				}
+				break;
+			}
+			case 1: //calculate the derivative part.
+			{
+				for (int jr = 0; jr < 3; jr++)
+				{
+					//nlm[jr] += term_c[jr] * term_a * ORB.Beta[T0].getCoefficient_D(L0, L0);//LiuXh 2016-01-14
+					if (!has_so)
+					{
+						nlm[jr] += term_c[jr] * term_a * ORB.Beta[T0].getCoefficient_D(nb, nb); //LiuXh 2016-01-14
+					}
+				}
+				break;
+			}
+			default:
+				break;
+			}
+		} //!m0
+	}	  //!nb
+
+	//zhengdy-soc, calculate non-local term
+	if (has_so)
+	{
+		switch (job)
+		{
+		case 0: //overlap part
+			for (int no = 0; no < ORB.Beta[T0].get_count_soc(is); no++)
+			{
+				const int p1 = ORB.Beta[T0].get_index1_soc(is, no);
+				const int p2 = ORB.Beta[T0].get_index2_soc(is, no);
+				if (NSPIN == 4 && nlm1 != NULL)
+				{
+					nlm1[is] += term_a_nc[p1] * term_b_nc[p2] * ORB.Beta[T0].getCoefficient_D_so(is, p2, p1);
+				}
+				else if (NSPIN != 4)
+				{
+					nlm[0] += (term_a_nc[p1] * term_b_nc[p2] * ORB.Beta[T0].getCoefficient_D_so(0, p2, p1)).real();
+				}
+				else
+				{
+					WARNING_QUIT("ORB_gen_tables::snap_psibeta", "Conflict! Didn't count non-local part");
+				}
+			}
+			break;
+		case 1: //need to be added later
+		{
+			break;
+		}
+		default:
+			break;
+		}
+	}
+
+	delete[] calproj;
+	delete[] rmesh1;
+	delete[] rmesh2;
+
+	timer::tick("ORB_gen_tables", "snap_psibeta", 'X');
+	return;
+}
+
+void ORB_gen_tables::snap_psipsi(
+	double olm[],	   // out: zeroed here; receives olm[0] for job 0, olm[0..2] for job 1
+	const int &job,	   // 0: value only, 1: three gradient components
+	const char &dtype, // table selector: 'S' reads MOT.Table_SR, 'T' reads MOT.Table_TR
+	const Vector3<double> &R1,
+	const int &T1,
+	const int &L1,
+	const int &m1,
+	const int &N1,
+	const Vector3<double> &R2,
+	const int &T2,
+	const int &L2,
+	const int &m2,
+	const int &N2,
+	complex<double> *olm1) const // job 0 with NSPIN == 4: accumulated into olm1[0] and olm1[3]; not zeroed here
+{
+	// Interpolates the radial table chosen by dtype at the R1-R2 separation and accumulates
+	// the Gaunt-weighted sum over L and m into olm (or into olm1 for job 0 when NSPIN == 4).
+	if (job != 0 && job != 1)
+	{
+		WARNING_QUIT("ORB_gen_tables::snap_psipsi", "job must be equal to 0 or 1!");
+	}
+
+	Numerical_Orbital::set_position(R1, R2);
+	assert(this->lat0 > 0.0);
+
+	// (1) get distance between R1 and R2 (a.u.)
+	// judge if there exist overlap
+	double distance = Numerical_Orbital::get_distance() * this->lat0;
+
+	const double Rcut1 = ORB.Phi[T1].getRcut();
+	const double Rcut2 = ORB.Phi[T2].getRcut();
+
+	if (job == 0)
+		ZEROS(olm, 1);
+	else if (job == 1)
+		ZEROS(olm, 3);
+
+	if (distance > (Rcut1 + Rcut2))
+		return;
+
+	// Distances below tiny1 are nudged up by tiny1 so the later divisions by
+	// 'distance' stay finite. Anything still <= tiny2 is treated as R == 0 and
+	// reads the first table entry directly instead of interpolating.
+	const double tiny1 = 1e-12;
+	const double tiny2 = 1e-10;
+	if (distance < tiny1)
+		distance += tiny1;
+
+	// (2) if there exist overlap, calculate the mesh number
+	// between two atoms
+	const int rmesh = this->MOT.get_rmesh(Rcut1, Rcut2);
+
+	// (3) Find three dimension of 'Table_S' or 'Table_T'
+	// dim1 : type pairs,
+	// dim2 : radial orbital pairs,
+	// dim3 : find lmax between T1 and T2, and get lmax*2+1
+	const int dim1 = this->MOT.OV_Tpair(T1, T2);
+	const int dim3 = this->MOT.OV_L2plus1(T1, T2); //2*lmax+1
+
+	int dim2;
+	if (T1 <= T2)
+		dim2 = this->MOT.OV_Opair(dim1, L1, L2, N1, N2);
+	else
+		dim2 = this->MOT.OV_Opair(dim1, L2, L1, N2, N1);
+
+	// Find the needed Ylm(dR) dimension
+	const int nlm = dim3 * dim3; //(2lmax+1)*(2lmax+1)
+
+	//Gaunt Index
+	const int gindex1 = L1 * L1 + m1;
+	const int gindex2 = L2 * L2 + m2;
+
+	assert(nlm < 400);
+	// Peize Lin change rly, grly 2016-08-26
+	vector<double> rly;
+	vector<vector<double>> grly;
+
+	// displacement components reported by Numerical_Orbital after set_position(R1, R2), scaled by lat0
+	//	dR = R1 - R2;
+	double arr_dR[3];
+	arr_dR[0] = Numerical_Orbital::getX() * this->lat0;
+	arr_dR[1] = Numerical_Orbital::getY() * this->lat0;
+	arr_dR[2] = Numerical_Orbital::getZ() * this->lat0;
+
+	//double xdr = arr_dR[0] / distance;
+	//double ydr = arr_dR[1] / distance;
+	//double zdr = arr_dR[2] / distance;
+
+	//=======================
+	// r^l * Ylm_real;
+	// job 1 also needs its derivatives (grly)
+	//=======================
+	if (job == 0)
+	{
+		//		Ylm::rlylm(dim3, arr_dR[0], arr_dR[1], arr_dR[2], rly);
+		//		Ylm::sph_harm (dim3-1, xdr, ydr, zdr, rly);
+		Ylm::rl_sph_harm(dim3 - 1, arr_dR[0], arr_dR[1], arr_dR[2], rly);
+	}
+	else
+	{
+		//		Ylm::rlylm(dim3, arr_dR[0], arr_dR[1], arr_dR[2], rly, grly);
+		Ylm::grad_rl_sph_harm(dim3 - 1, arr_dR[0], arr_dR[1], arr_dR[2], rly, grly);
+	}
+
+	switch (dtype)
+	{
+	case 'S':
+		for (int L = 0; L < dim3; L++) //maxL = dim3-1
+		{
+			//===========================================================
+			// triangle rule for L and sum of L, L1, L2 should be even
+			//===========================================================
+			int AL = L1 + L2;
+			int SL = abs(L1 - L2);
+
+			if ((L > AL) || (L < SL) || ((L - SL) % 2 == 1))
+				continue;
+
+			double Interp_Slm = 0.0;
+			double Interp_dSlm = 0.0;
+			double tmpOlm0 = 0.0;
+			double tmpOlm1 = 0.0;
+
+			// prefactor: sign (-1)^((L1 - L2 - L) / 2) and distance^L
+			double i_exp = pow(-1.0, (L1 - L2 - L) / 2);
+			double rl = pow(distance, L);
+
+			if (distance > tiny2)
+			{
+				Interp_Slm = i_exp * Mathzone::Polynomial_Interpolation(
+										 MOT.Table_SR[0][dim1][dim2][L], rmesh, MOT.dr, distance);
+				Interp_Slm /= rl;
+			}
+			else // distance = 0.0;
+			{
+				Interp_Slm = i_exp * MOT.Table_SR[0][dim1][dim2][L][0];
+			}
+
+			if (job == 1) //calculate the derivative.
+			{
+				if (distance > tiny2)
+				{
+					Interp_dSlm = i_exp * Mathzone::Polynomial_Interpolation(
+											  MOT.Table_SR[1][dim1][dim2][L], rmesh, MOT.dr, distance);
+					Interp_dSlm = Interp_dSlm / rl - Interp_Slm * L / distance; // reuse rl, as the 'T' branch does
+				}
+				else
+				{
+					Interp_dSlm = 0.0;
+				}
+			}
+
+			for (int m = 0; m < 2 * L + 1; m++)
+			{
+				int gindex = L * L + m;
+				//			double tmpGaunt1 = MGT.Get_Gaunt_SH(L1, m1, L2, m2, L, m);
+				double tmpGaunt = MGT.Gaunt_Coefficients(gindex1, gindex2, gindex);
+
+				tmpOlm0 = Interp_Slm * tmpGaunt;
+
+				if (job == 1)
+				{
+					tmpOlm1 = Interp_dSlm * tmpGaunt;
+				}
+
+				switch (job)
+				{
+				case 0: // calculate overlap.
+				{
+					if (NSPIN != 4)
+						olm[0] += tmpOlm0 * rly[MGT.get_lm_index(L, m)];
+					else if (olm1 != NULL)
+					{
+						olm1[0] += tmpOlm0 * rly[MGT.get_lm_index(L, m)];
+						olm1[1] += 0; //tmpOlm0 * (tmp(0,0)+tmp(0,1));
+						olm1[2] += 0; //tmpOlm0 * (tmp(1,0)+tmp(1,1));
+						olm1[3] += tmpOlm0 * rly[MGT.get_lm_index(L, m)];
+					}
+					else
+					{
+						WARNING_QUIT("ORB_gen_tables::snap_psipsi", "something wrong!");
+					}
+
+					/*		
+						if( abs ( tmpOlm0 * rly[ MGT.get_lm_index(L, m) ] ) > 1.0e-3 )
+						{
+						cout << " L=" << L << " m=" << m << " tmpOlm0=" << tmpOlm0 
+						<< " rly=" << rly[ MGT.get_lm_index(L, m) ] 
+						<< " r=" << olm[0]
+						<< endl;
+						}
+						*/
+					break;
+				}
+				case 1: // calculate gradient.
+				{
+					for (int ir = 0; ir < 3; ir++)
+					{
+						olm[ir] += tmpOlm0 * grly[MGT.get_lm_index(L, m)][ir] + tmpOlm1 * rly[MGT.get_lm_index(L, m)] * arr_dR[ir] / distance;
+					}
+					break;
+				}
+				default:
+					break;
+				}
+			} //!m
+		}
+		break;
+
+	case 'T':
+		for (int L = 0; L < dim3; L++)
+		{
+			int AL = L1 + L2;
+			int SL = abs(L1 - L2);
+
+			if ((L > AL) || (L < SL) || ((L - SL) % 2 == 1))
+				continue;
+
+			double Interp_Tlm, Interp_dTlm, tmpKem0, tmpKem1;
+			Interp_Tlm = Interp_dTlm = tmpKem0 = tmpKem1 = 0.0;
+
+			//pre-fac
+			double i_exp = pow(-1.0, (L1 - L2 - L) / 2);
+
+			double rl = pow(distance, L);
+			if (distance > tiny2)
+			{
+				Interp_Tlm = i_exp * Mathzone::Polynomial_Interpolation(
+										 MOT.Table_TR[0][dim1][dim2][L], rmesh, MOT.dr, distance);
+				Interp_Tlm /= rl;
+			}
+			else
+				Interp_Tlm = i_exp * MOT.Table_TR[0][dim1][dim2][L][0];
+
+			if (job == 1)
+			{
+				if (distance > tiny2)
+				{
+					Interp_dTlm = i_exp * Mathzone::Polynomial_Interpolation(
+											  MOT.Table_TR[1][dim1][dim2][L], rmesh, MOT.dr, distance);
+					Interp_dTlm = Interp_dTlm / rl - Interp_Tlm * L / distance;
+				}
+				else
+					Interp_dTlm = 0.0;
+			}
+
+			for (int m = 0; m < 2 * L + 1; m++)
+			{
+				int gindex = L * L + m;
+				//	double tmpGaunt = MGT.Get_Gaunt_SH(L1, m1, L2, m2, L, m);
+				double tmpGaunt = MGT.Gaunt_Coefficients(gindex1, gindex2, gindex);
+
+				tmpKem0 = Interp_Tlm * tmpGaunt;
+				if (job == 1)
+				{
+					tmpKem1 = Interp_dTlm * tmpGaunt;
+				}
+
+				switch (job)
+				{
+				case 0:
+				{
+					if (NSPIN != 4)
+						olm[0] += tmpKem0 * rly[MGT.get_lm_index(L, m)];
+					else if (olm1 != NULL)
+					{
+						olm1[0] += tmpKem0 * rly[MGT.get_lm_index(L, m)];
+						olm1[1] += 0; //tmpKem0 * (tmp(0,0)+tmp(0,1));
+						olm1[2] += 0; //tmpKem0 * (tmp(1,0)+tmp(1,1));
+						olm1[3] += tmpKem0 * rly[MGT.get_lm_index(L, m)];
+					}
+					else
+					{
+						WARNING_QUIT("ORB_gen_tables::snap_psipsi", "something wrong in T.");
+					}
+					break;
+				}
+				case 1:
+				{
+					for (int ir = 0; ir < 3; ir++)
+					{
+						olm[ir] += tmpKem0 * grly[MGT.get_lm_index(L, m)][ir] + tmpKem1 * rly[MGT.get_lm_index(L, m)] * arr_dR[ir] / distance;
+					}
+					break;
+				}
+				default:
+					break;
+				}
+			} // end T: m
+		}	  // end T: L
+		break;
+	}
+	//	timer::tick ("ORB_gen_tables", "snap_psipsi");
+	return;
+}
+
+double ORB_gen_tables::get_distance(const Vector3<double> &R1, const Vector3<double> &R2) const
+{
+	// Norm of (R1 - R2) multiplied by lat0; lat0 must already hold a positive value.
+	assert(this->lat0 > 0.0);
+	return (R1 - R2).norm() * this->lat0;
+}
+
+//caoyu add 2021-03-17
+void ORB_gen_tables::snap_psialpha(
+	double olm[], // out: zeroed here; receives olm[0] for job 0, olm[0..2] for job 1
+	const int &job, // 0: value only, 1: three gradient components
+	const Vector3<double> &R1,
+	const int &T1,
+	const int &L1,
+	const int &m1,
+	const int &N1,
+	const Vector3<double> &R2,
+	const int &T2, // not read in this function: the second orbital always comes from ORB.Alpha[0]
+	const int &L2,
+	const int &m2,
+	const int &N2) const
+{
+	// Interpolates talpha.Table_DSR at the R1-R2 separation and accumulates the Gaunt-weighted sum over L and m into olm.
+	if (job != 0 && job != 1)
+	{
+		WARNING_QUIT("ORB_gen_tables::snap_psialpha", "job must be equal to 0 or 1!");
+	}
+
+	Numerical_Orbital::set_position(R1, R2);
+	assert(this->lat0 > 0.0);
+
+	// (1) get distance between R1 and R2 (a.u.)
+	// judge if there exist overlap
+	double distance = Numerical_Orbital::get_distance() * this->lat0;
+
+	const double Rcut1 = ORB.Phi[T1].getRcut();
+	const double Rcut2 = ORB.Alpha[0].getRcut();
+
+	if (job == 0)
+		ZEROS(olm, 1);
+	else if (job == 1)
+		ZEROS(olm, 3);
+
+	if (distance > (Rcut1 + Rcut2))
+		return;
+
+	// Distances below tiny1 are nudged up by tiny1 so the later divisions by
+	// 'distance' stay finite. Anything still <= tiny2 is treated as R == 0 and
+	// reads the first table entry directly instead of interpolating.
+	const double tiny1 = 1e-12;
+	const double tiny2 = 1e-10;
+	if (distance < tiny1)
+		distance += tiny1;
+
+	// (2) if there exist overlap, calculate the mesh number
+	// between two atoms
+	const int rmesh = this->talpha.get_rmesh(Rcut1, Rcut2);
+
+	// (3) Find three dimension of 'Table_DS'
+	// dim1 : type pairs, equal to T1 here
+	// dim2 : radial orbital pairs,
+	// dim3 : find lmax between T1 and T2, and get lmax*2+1
+	const int dim1 = T1;
+	int dim2 = this->talpha.DS_Opair(dim1, L1, L2, N1, N2);
+	int dim3 = this->talpha.DS_2Lplus1[T1];
+
+	//Gaunt Index
+	const int gindex1 = L1 * L1 + m1;
+	const int gindex2 = L2 * L2 + m2;
+
+	// Peize Lin change rly, grly 2016-08-26
+	vector<double> rly;
+	vector<vector<double>> grly;
+
+	double arr_dR[3];
+	arr_dR[0] = Numerical_Orbital::getX() * this->lat0;
+	arr_dR[1] = Numerical_Orbital::getY() * this->lat0;
+	arr_dR[2] = Numerical_Orbital::getZ() * this->lat0;
+
+	//double xdr = arr_dR[0] / distance;
+	//double ydr = arr_dR[1] / distance;
+	//double zdr = arr_dR[2] / distance;
+
+	//=======================
+	// r^l * Ylm_real;
+	// job 1 also needs its derivatives (grly)
+	//=======================
+	if (job == 0)
+	{
+		Ylm::rl_sph_harm(dim3 - 1, arr_dR[0], arr_dR[1], arr_dR[2], rly);
+	}
+	else
+	{
+		Ylm::grad_rl_sph_harm(dim3 - 1, arr_dR[0], arr_dR[1], arr_dR[2], rly, grly);
+	}
+
+	for (int L = 0; L < dim3; L++) //maxL = dim3-1
+	{
+		//===========================================================
+		// triangle rule for L and sum of L, L1, L2 should be even
+		//===========================================================
+		int AL = L1 + L2;
+		int SL = abs(L1 - L2);
+
+		if ((L > AL) || (L < SL) || ((L - SL) % 2 == 1))
+			continue;
+
+		double Interp_Slm = 0.0;
+		double Interp_dSlm = 0.0;
+		double tmpOlm0 = 0.0;
+		double tmpOlm1 = 0.0;
+
+		// prefactor: sign (-1)^((L1 - L2 - L) / 2) and distance^L
+		double i_exp = pow(-1.0, (L1 - L2 - L) / 2);
+		double rl = pow(distance, L);
+
+		if (distance > tiny2)
+		{
+			Interp_Slm = i_exp * Mathzone::Polynomial_Interpolation(
+									 talpha.Table_DSR[0][dim1][dim2][L], rmesh, MOT.dr, distance); // NOTE(review): step is MOT.dr while the table and rmesh come from talpha - confirm both use the same dr
+			Interp_Slm /= rl;
+		}
+		else // distance = 0.0;
+		{
+			Interp_Slm = i_exp * talpha.Table_DSR[0][dim1][dim2][L][0];
+		}
+
+		if (job == 1) //calculate the derivative.
+		{
+			if (distance > tiny2)
+			{
+				Interp_dSlm = i_exp * Mathzone::Polynomial_Interpolation(
+										  talpha.Table_DSR[1][dim1][dim2][L], rmesh, MOT.dr, distance);
+				Interp_dSlm = Interp_dSlm / rl - Interp_Slm * L / distance; // rl already holds pow(distance, L)
+			}
+			else
+			{
+				Interp_dSlm = 0.0;
+			}
+		}
+
+		for (int m = 0; m < 2 * L + 1; m++)
+		{
+			int gindex = L * L + m;
+			//			double tmpGaunt1 = MGT.Get_Gaunt_SH(L1, m1, L2, m2, L, m);
+			double tmpGaunt = MGT.Gaunt_Coefficients(gindex1, gindex2, gindex);
+
+			tmpOlm0 = Interp_Slm * tmpGaunt;
+
+			if (job == 1)
+			{
+				tmpOlm1 = Interp_dSlm * tmpGaunt;
+			}
+
+			switch (job)
+			{
+			case 0: // calculate overlap.
+			{
+				if (NSPIN != 4)
+					olm[0] += tmpOlm0 * rly[MGT.get_lm_index(L, m)];
+				else
+				{
+					WARNING_QUIT("ORB_gen_tables::snap_psialpha", "deepks with NSPIN>1 has not implemented yet!");
+				}
+				/*
+				if( abs ( tmpOlm0 * rly[ MGT.get_lm_index(L, m) ] ) > 1.0e-3 )
+				{
+				cout << " L=" << L << " m=" << m << " tmpOlm0=" << tmpOlm0
+				<< " rly=" << rly[ MGT.get_lm_index(L, m) ]
+				<< " r=" << olm[0]
+				<< endl;
+				}
+				*/
+				break;
+			}
+			case 1: // calculate gradient.
+			{
+				for (int ir = 0; ir < 3; ir++)
+				{
+					olm[ir] += tmpOlm0 * grly[MGT.get_lm_index(L, m)][ir] + tmpOlm1 * rly[MGT.get_lm_index(L, m)] * arr_dR[ir] / distance;
+				}
+				break;
+			}
+			default:
+				break;
+			}
+		} //!m
+	}
+
+	return;
+}
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_alpha.cpp b/ABACUS.develop/source/src_lcao/ORB_table_alpha.cpp
index ed6681a115..a160816d29 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_alpha.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_table_alpha.cpp
@@ -1,465 +1,462 @@
-//caoyu add 2021-03-17
-#include "ORB_table_alpha.h"
-#include "ORB_read.h"
-#include "../src_global/math_integral.h"
-#include <stdexcept>
-
-double ORB_table_alpha::dr = -1.0;
-
-ORB_table_alpha::ORB_table_alpha()
-{
-	destroy_nr = false;
-
-	ntype = 0;
-	lmax = 0;
-	kmesh = 0;
-	Rmax = 0.0;
-	dr = 0.0;
-	dk = 0.0;
-
-	nlm = 0;
-	Rmesh = 0;
-
-	kpoint = new double[1];
-	r = new double[1];
-	rab = new double[1];
-	kab = new double[1];
-	DS_2Lplus1 = new int[1];
-}
-
-ORB_table_alpha::~ORB_table_alpha()
-{
-	delete[] kpoint;
-	delete[] r;
-	delete[] rab;
-	delete[] kab;
-	delete[] DS_2Lplus1;
-}
-
-void ORB_table_alpha::allocate
-(
-	const int& ntype_in,
-	const int& lmax_in,
-	const int& kmesh_in,
-	const double& Rmax_in,
-	const double& dr_in,
-	const double& dk_in
-)
-{
-	TITLE("ORB_table_alpha", "allocate");
-
-	this->ntype = ntype_in;// type of elements.
-	this->lmax = lmax_in;
-	this->kmesh = kmesh_in;
-	this->Rmax = Rmax_in;
-	this->dr = dr_in;
-	this->dk = dk_in;
-
-	assert(ntype > 0);
-	assert(lmax >= 0);
-	assert(kmesh > 0.0);
-	assert(Rmax >= 0.0);
-	assert(dr > 0.0);
-	assert(dk > 0.0);
-
-	// calculated from input parameters
-	this->nlm = (2 * lmax + 1) * (2 * lmax + 1);
-	this->Rmesh = static_cast<int>(Rmax / dr) + 4;
-	if (Rmesh % 2 == 0)
-	{
-		++Rmesh;
-	}
-
-	//	OUT(ofs_running,"lmax",lmax);
-	//	OUT(ofs_running,"Rmax (Bohr)",Rmax);
-	//	OUT(ofs_running,"dr (Bohr)",dr);
-	//	OUT(ofs_running,"dk",dk);
-	//	OUT(ofs_running,"nlm",nlm);
-	//	OUT(ofs_running,"kmesh",kmesh);
-
-	delete[] kpoint;
-	delete[] r;
-	kpoint = new double[kmesh];
-	r = new double[Rmesh];
-
-	delete[] rab;
-	delete[] kab;
-	kab = new double[kmesh];
-	rab = new double[Rmesh];
-
-	for (int ik = 0; ik < kmesh; ik++)
-	{
-		kpoint[ik] = ik * dk_in;
-		kab[ik] = dk_in;
-	}
-
-	for (int ir = 0; ir < Rmesh; ir++)
-	{
-		r[ir] = ir * dr;
-		rab[ir] = dr;
-	}
-
-	//	OUT(ofs_running,"allocate kpoint, r, rab, kab","Done");
-	return;
-}
-
-
-int ORB_table_alpha::get_rmesh(const double& R1, const double& R2)
-{
-	int rmesh = static_cast<int>((R1 + R2) / ORB_table_alpha::dr) + 5;
-	//mohan update 2009-09-08 +1 ==> +5
-	//considering interpolation or so on...
-	if (rmesh % 2 == 0) rmesh++;
-
-	if (rmesh <= 0)
-	{
-		ofs_warning << "\n R1 = " << R1 << " R2 = " << R2;
-		ofs_warning << "\n rmesh = " << rmesh;
-		WARNING_QUIT("ORB_table_alpha::get_rmesh", "rmesh <= 0");
-	}
-	return rmesh;
-}
-
-
-
-void ORB_table_alpha::cal_S_PhiAlpha_R(
-	Sph_Bessel_Recursive::D2* pSB, // mohan add 2021-03-06
-	const int& l,
-	const Numerical_Orbital_Lm& n1,
-	const Numerical_Orbital_Lm& n2,
-	const int& rmesh,
-	double* rs,
-	double* drs)
-{
-	timer::tick("ORB_table_alpha", "S_PhiAlpha_R");
-
-	assert(kmesh > 0);
-
-	//start calc	
-	double* k1_dot_k2 = new double[kmesh];
-
-	for (int ik = 0; ik < kmesh; ik++)
-	{
-		k1_dot_k2[ik] = n1.getPsi_k(ik) * n2.getPsi_k(ik);
-	}
-
-	//previous version
-	double* integrated_func = new double[kmesh];
-
-	const vector<vector<double>>& jlm1 = pSB->get_jlx()[l - 1];
-	const vector<vector<double>>& jl = pSB->get_jlx()[l];
-	const vector<vector<double>>& jlp1 = pSB->get_jlx()[l + 1];
-
-	for (int ir = 0; ir < rmesh; ir++)
-	{
-		ZEROS(integrated_func, kmesh);
-		double temp = 0.0;
-
-		for (int ik = 0; ik < kmesh; ik++)
-		{
-			integrated_func[ik] = jl[ir][ik] * k1_dot_k2[ik];
-		}
-		// Call simpson integration
-		Integral::Simpson_Integral(kmesh, integrated_func, kab, temp);
-		rs[ir] = temp * FOUR_PI;
-
-		//drs
-		double temp1, temp2;
-
-		if (l > 0)
-		{
-			for (int ik = 0; ik < kmesh; ik++)
-			{
-				integrated_func[ik] = jlm1[ir][ik] * k1_dot_k2[ik] * kpoint[ik];
-			}
-
-			Integral::Simpson_Integral(kmesh, integrated_func, kab, temp1);
-		}
-
-
-		for (int ik = 0; ik < kmesh; ik++)
-		{
-			integrated_func[ik] = jlp1[ir][ik] * k1_dot_k2[ik] * kpoint[ik];
-		}
-
-		Integral::Simpson_Integral(kmesh, integrated_func, kab, temp2);
-
-		if (l == 0)
-		{
-			drs[ir] = -FOUR_PI * temp2;
-		}
-		else
-		{
-			drs[ir] = FOUR_PI * (temp1 * l - (l + 1) * temp2) / (2.0 * l + 1);
-		}
-	}
-
-	//liaochen modify on 2010/4/22
-	//special case for R=0
-	//we store Slm(R) / R**l at the fisrt point, rather than Slm(R)
-	if (l > 0)
-	{
-		ZEROS(integrated_func, kmesh);
-		double temp = 0.0;
-
-		for (int ik = 0; ik < kmesh; ik++)
-		{
-			integrated_func[ik] = k1_dot_k2[ik] * pow(kpoint[ik], l);
-		}
-
-		// Call simpson integration
-		Integral::Simpson_Integral(kmesh, integrated_func, kab, temp);
-		rs[0] = FOUR_PI / Mathzone_Add1::dualfac(2 * l + 1) * temp;
-	}
-
-	delete[] integrated_func;
-	delete[] k1_dot_k2;
-
-	timer::tick("ORB_table_alpha", "S_PhiAlpha_R");
-	return;
-}
-
-
-void ORB_table_alpha::init_Table_Alpha(Sph_Bessel_Recursive::D2* pSB)
-{
-	TITLE("ORB_table_alpha", "init_Table_Alpha");
-	timer::tick("ORB_table_alpha", "init_Table_Alpha", 'D');
-
-	assert(ntype>0);
-
-	// (1) allocate 1st dimension ( overlap, derivative)
-	this->Table_DSR = new double**** [2];
-	// (2) allocate 2nd dimension ( overlap, derivative)
-	this->Table_DSR[0] = new double*** [this->ntype];
-	this->Table_DSR[1] = new double*** [this->ntype];
-
-	// <1Phi|2Alpha> 
-	for (int T1 = 0; T1 < ntype; T1++) // type 1 is orbital
-	{
-		const int Lmax1 = ORB.Phi[T1].getLmax();
-		const int Lmax2 = ORB.Alpha[0].getLmax();
-		const int lmax_now = std::max(Lmax1, Lmax2);
-		int L2plus1 = 2 * lmax_now + 1;
-		//-------------------------------------------------------------
-		// how many <psi|alpha_l>
-		// here we count all possible psi with (L,N) index for type T1.
-		//-------------------------------------------------------------
-		const int pairs_chi = ORB.Phi[T1].getTotal_nchi() * ORB.Alpha[0].getTotal_nchi();
-
-		if (pairs_chi == 0)continue;
-
-		// init 2nd dimension
-		this->Table_DSR[0][T1] = new double** [pairs_chi];
-		this->Table_DSR[1][T1] = new double** [pairs_chi];
-
-		const double Rcut1 = ORB.Phi[T1].getRcut();
-		for (int L1 = 0; L1 < Lmax1 + 1; L1++)
-		{
-			for (int N1 = 0; N1 < ORB.Phi[T1].getNchi(L1); N1++)
-			{
-				for (int L2 = 0; L2 < Lmax2 + 1; L2++)
-				{
-					for (int N2 = 0; N2 < ORB.Alpha[0].getNchi(L2); N2++)
-					{
-						// get the second index.
-						const int Opair = this->DS_Opair(T1, L1, L2, N1, N2);
-
-						// init 3rd dimension
-						this->Table_DSR[0][T1][Opair] = new double* [L2plus1];
-						this->Table_DSR[1][T1][Opair] = new double* [L2plus1];
-
-						const double Rcut1 = ORB.Phi[T1].getRcut();
-						const double Rcut2 = ORB.Alpha[0].getRcut();
-						assert(Rcut1 > 0.0 && Rcut1 < 100);
-						assert(Rcut2 > 0.0 && Rcut2 < 100);
-
-						const int rmesh = this->get_rmesh(Rcut1, Rcut2);
-						assert(rmesh < this->Rmesh);
-
-						//L=|L1-L2|,|L1-L2|+2,...,L1+L2
-						const int SL = abs(L1 - L2);
-						const int AL = L1 + L2;
-
-						for (int L = 0; L < L2plus1; L++)
-						{
-							//Allocation
-							this->Table_DSR[0][T1][Opair][L] = new double[rmesh];
-							this->Table_DSR[1][T1][Opair][L] = new double[rmesh];
-
-							Memory::record("ORB_table_alpha", "Table_DSR",
-								2 * this->ntype * pairs_chi * rmesh, "double");
-
-							//for those L whose Gaunt Coefficients = 0, we
-							//assign every element in Table_DSR as zero
-							if ((L > AL) || (L < SL) || ((L - SL) % 2 == 1))
-							{
-								ZEROS(Table_DSR[0][T1][Opair][L], rmesh);
-								ZEROS(Table_DSR[1][T1][Opair][L], rmesh);
-
-								continue;
-							}
-
-							this->cal_S_PhiAlpha_R(
-								pSB, // mohan add 2021-03-06
-								L,
-								ORB.Phi[T1].PhiLN(L1, N1),
-								ORB.Alpha[0].PhiLN(L2, N2), // mohan update 2011-03-07
-								rmesh,
-								this->Table_DSR[0][T1][Opair][L],
-								this->Table_DSR[1][T1][Opair][L]);
-						}// end L2plus1
-					}// end N2
-				}// end L2
-			}// end N1
-		}// end L1
-	}// end T1
-	destroy_nr = true;
-
-
-	//	OUT(ofs_running,"allocate non-local potential matrix","Done");
-	timer::tick("ORB_table_alpha", "init_Table_Alpha", 'D');
-	return;
-}
-
-
-void ORB_table_alpha::Destroy_Table_Alpha(void)
-{
-	if (!destroy_nr) return;
-
-	const int ntype = ORB.get_ntype();
-	for (int ir = 0; ir < 2; ir++)
-	{
-		for (int T1 = 0; T1 < ntype; T1++)
-		{
-			const int Lmax1 = ORB.Phi[T1].getLmax();
-			const int Lmax2 = ORB.Alpha[0].getLmax();
-			const int lmax_now = std::max(Lmax1, Lmax2);
-			const int pairs = ORB.Phi[T1].getTotal_nchi() * ORB.Alpha[0].getTotal_nchi();
-
-				// mohan fix bug 2011-03-30
-				if (pairs == 0) continue;
-			for (int dim2 = 0; dim2 < pairs; dim2++)
-			{
-				for (int L = 0; L < 2*lmax_now + 1; L++)
-				{
-					delete[] Table_DSR[ir][T1][dim2][L];
-				}
-				delete[] Table_DSR[ir][T1][dim2];
-			}
-			delete[] Table_DSR[ir][T1];
-		}
-		delete[] Table_DSR[ir];
-	}
-	delete[] Table_DSR;
-	return;
-}
-
-void ORB_table_alpha::init_DS_2Lplus1(void)
-{
-	TITLE("Make_Overlap_Table", "init_DS_2Lplus1");
-	assert(this->ntype > 0);
-	delete[] DS_2Lplus1;
-	DS_2Lplus1=new int[ntype]; // 2Lmax+1 for each T1
-
-	int index = 0;
-	for (int T1 = 0; T1 < ntype; T1++)
-	{
-			this->DS_2Lplus1[T1] = max(ORB.Phi[T1].getLmax(), ORB.Alpha[0].getLmax()) * 2 + 1;
-	}
-	return;
-}
-
-void ORB_table_alpha::init_DS_Opair(void)
-{
-	const int lmax = ORB.get_lmax();
-	const int nchimax = ORB.get_nchimax();
-	const int lmax_d = ORB.get_lmax_d();
-	const int nchimax_d = ORB.get_nchimax_d();
-	assert(lmax + 1 > 0);
-	assert(lmax_d + 1 > 0);
-	assert(nchimax > 0);
-	assert(nchimax_d > 0);
-
-	this->DS_Opair.create(this->ntype, lmax+1, lmax_d+1, nchimax, nchimax_d);
-
-	// <1psi|2beta>
-	// 1. orbital
-	for (int T1 = 0; T1 < ntype; T1++)	//alpha is not related to atom type !
-	{
-		int index = 0;
-		for (int L1 = 0; L1 < ORB.Phi[T1].getLmax() + 1; L1++)
-		{
-			for (int N1 = 0; N1 < ORB.Phi[T1].getNchi(L1); N1++)
-			{
-				for (int L2 = 0; L2 < ORB.Alpha[0].getLmax() + 1; L2++)
-				{
-					for (int N2 = 0; N2 < ORB.Alpha[0].getNchi(L2); N2++)
-					{
-						this->DS_Opair(T1, L1, L2, N1, N2) = index;
-						++index;
-					}
-				}
-			}
-		}
-	}
-	return;
-}
-
-//caoyu add 2021-03-20
-void ORB_table_alpha::print_Table_DSR(void)
-{
-	TITLE("ORB_table_alpha", "print_Table_DSR");
-	NEW_PART("Overlap table S between lcao orbital and descriptor basis : S_{I_mu_alpha}");
-
-	ofstream ofs;
-	stringstream ss;
-	// the parameter 'winput::spillage_outdir' is read from INPUTw.
-	ss << "./S_I_mu_alpha.dat";
-	if (MY_RANK == 0)
-	{
-		ofs.open(ss.str().c_str());
-	}
-
-	for (int T1 = 0; T1 < this->ntype; T1++)	//T1
-	{
-		const int Lmax1 = ORB.Phi[T1].getLmax();
-		const int Lmax2 = ORB.Alpha[0].getLmax();
-		for (int L1 = 0; L1 < Lmax1 + 1; L1++)
-		{
-			for (int N1 = 0; N1 < ORB.Phi[T1].getNchi(L1); N1++)
-			{
-				for (int L2 = 0; L2 < Lmax2 + 1; L2++)
-				{
-					for (int N2 = 0; N2 < ORB.Alpha[0].getNchi(L2); N2++)
-					{
-						const int Opair = this->DS_Opair(T1, L1, L2, N1, N2);	//Opair
-						//ofs <<setw(20)<< "atom_type: " << ucell.atoms[T1].label << endl;
-						ofs <<setw(20)<< "lcao basis: " << "L1=" << L1 << ", N1=" << N1 << endl;
-						ofs <<setw(20)<< "descriptor basis: " << "L2=" << L2 << ", N2=" << N2 << endl;
-						for (int il = 0; il < this-> DS_2Lplus1[T1]; il++)
-						{
-							ofs << "L=" << il << endl;
-							const double Rcut1 = ORB.Phi[T1].getRcut();
-							const double Rcut2 = ORB.Alpha[0].getRcut();
-							const int rmesh = this->get_rmesh(Rcut1, Rcut2);
-							
-							if (Table_DSR[0][T1][Opair][il][1]==0)	//remain to be discussed
-							{
-								ofs << "S(R)=0"<<endl<<endl;
-								continue;
-							}
-							ofs << "Rcut1="<<Rcut1<<", Rcut2="<<Rcut2<<", rmesh="<<rmesh<<", dr="<<this->dr<<";"<<endl;
-							for (int ir = 0; ir < rmesh; ir++)
-							{
-								ofs << Table_DSR[0][T1][Opair][il][ir] << " ";
-								if ( (ir+1) % 8 == 0) ofs << endl;
-							}
-							ofs << endl <<endl;
-						}// il
-					}// N2
-				}// L2
-			}// N1
-		}// L1
-	}// T1
-	return;
-}
+//caoyu add 2021-03-17
+#include "ORB_table_alpha.h"
+#include "ORB_read.h"
+#include "../src_global/math_integral.h"
+#include <stdexcept>
+
+double ORB_table_alpha::dr = -1.0;
+
+ORB_table_alpha::ORB_table_alpha()
+{
+	destroy_nr = false; // no Table_DSR to release yet; Destroy_Table_Alpha() returns early while this is false
+	// grid parameters start at zero; allocate() stores the real values
+	ntype = 0;
+	lmax = 0;
+	kmesh = 0;
+	Rmax = 0.0;
+	dr = 0.0; // dr is the static member defined above this constructor, so this assignment is shared by all instances
+	dk = 0.0;
+
+	nlm = 0;
+	Rmesh = 0;
+	// one-element placeholders so the delete[] calls in allocate(), init_DS_2Lplus1() and the destructor are valid
+	kpoint = new double[1];
+	r = new double[1];
+	rab = new double[1];
+	kab = new double[1];
+	DS_2Lplus1 = new int[1];
+}
+
+ORB_table_alpha::~ORB_table_alpha()
+{
+	delete[] kpoint; // releases the four grid arrays and DS_2Lplus1
+	delete[] r;
+	delete[] rab;
+	delete[] kab;
+	delete[] DS_2Lplus1; // Table_DSR is not released here; Destroy_Table_Alpha() does that
+}
+
+void ORB_table_alpha::allocate(
+	const int &ntype_in, // stored as ntype; must be > 0
+	const int &lmax_in, // stored as lmax; must be >= 0
+	const int &kmesh_in, // number of k grid points; must be > 0
+	const double &Rmax_in, // radial extent used to size the r grid; must be >= 0
+	const double &dr_in, // r grid spacing; must be > 0
+	const double &dk_in) // k grid spacing; must be > 0
+{
+	TITLE("ORB_table_alpha", "allocate");
+	// Stores the grid parameters, then rebuilds the uniform k grid (kpoint, kab) and r grid (r, rab).
+	this->ntype = ntype_in; // type of elements.
+	this->lmax = lmax_in;
+	this->kmesh = kmesh_in;
+	this->Rmax = Rmax_in;
+	this->dr = dr_in;
+	this->dk = dk_in;
+
+	assert(ntype > 0);
+	assert(lmax >= 0);
+	assert(kmesh > 0.0);
+	assert(Rmax >= 0.0);
+	assert(dr > 0.0);
+	assert(dk > 0.0);
+
+	// calculated from input parameters
+	this->nlm = (2 * lmax + 1) * (2 * lmax + 1);
+	this->Rmesh = static_cast<int>(Rmax / dr) + 4;
+	if (Rmesh % 2 == 0) // keep the number of r points odd
+	{
+		++Rmesh;
+	}
+
+	//	OUT(ofs_running,"lmax",lmax);
+	//	OUT(ofs_running,"Rmax (Bohr)",Rmax);
+	//	OUT(ofs_running,"dr (Bohr)",dr);
+	//	OUT(ofs_running,"dk",dk);
+	//	OUT(ofs_running,"nlm",nlm);
+	//	OUT(ofs_running,"kmesh",kmesh);
+	// drop the previous arrays (placeholders after construction) before allocating at the new sizes
+	delete[] kpoint;
+	delete[] r;
+	kpoint = new double[kmesh];
+	r = new double[Rmesh];
+
+	delete[] rab;
+	delete[] kab;
+	kab = new double[kmesh];
+	rab = new double[Rmesh];
+
+	for (int ik = 0; ik < kmesh; ik++)
+	{
+		kpoint[ik] = ik * dk_in; // uniform k grid starting at 0
+		kab[ik] = dk_in; // constant step, passed as the weight array to Simpson_Integral in cal_S_PhiAlpha_R
+	}
+
+	for (int ir = 0; ir < Rmesh; ir++)
+	{
+		r[ir] = ir * dr; // uniform r grid starting at 0
+		rab[ir] = dr;
+	}
+
+	//	OUT(ofs_running,"allocate kpoint, r, rab, kab","Done");
+	return;
+}
+
+int ORB_table_alpha::get_rmesh(const double &R1, const double &R2)
+{
+	int rmesh = static_cast<int>((R1 + R2) / ORB_table_alpha::dr) + 5;
+	// Number of grid points covering R1 + R2 at spacing dr, plus 5 extra points
+	// (mohan raised the margin from +1 to +5 on 2009-09-08, with interpolation in mind).
+	if (rmesh % 2 == 0)
+		rmesh++; // make the count odd
+
+	if (rmesh <= 0)
+	{
+		ofs_warning << "\n R1 = " << R1 << " R2 = " << R2;
+		ofs_warning << "\n rmesh = " << rmesh;
+		WARNING_QUIT("ORB_table_alpha::get_rmesh", "rmesh <= 0");
+	}
+	return rmesh;
+}
+
+void ORB_table_alpha::cal_S_PhiAlpha_R(
+	Sph_Bessel_Recursive::D2 *pSB, // source of the tabulated values read below as get_jlx()[l][ir][ik]
+	const int &l, // order of the table being filled; l - 1 (when l > 0) and l + 1 are read as well
+	const Numerical_Orbital_Lm &n1, // first radial function, read through getPsi_k()
+	const Numerical_Orbital_Lm &n2, // second radial function, read through getPsi_k()
+	const int &rmesh, // number of entries written to rs and drs
+	double *rs, // out: table values, rmesh entries
+	double *drs) // out: derivative table, rmesh entries
+{
+	timer::tick("ORB_table_alpha", "S_PhiAlpha_R");
+	// For every ir < rmesh: integrate jl * psi1(k) * psi2(k) over the k grid for rs, and the l-1 / l+1 analogues for drs.
+	assert(kmesh > 0);
+
+	// product of the two k-space radial functions, reused by every integral below
+	double *k1_dot_k2 = new double[kmesh];
+
+	for (int ik = 0; ik < kmesh; ik++)
+	{
+		k1_dot_k2[ik] = n1.getPsi_k(ik) * n2.getPsi_k(ik);
+	}
+
+	// scratch buffer holding the integrand on the k grid
+	double *integrated_func = new double[kmesh];
+	// jlm1 is only read when l > 0; for l == 0 it is bound to entry 0 so the vector is never indexed with -1
+	const vector<vector<double>> &jlm1 = pSB->get_jlx()[l > 0 ? l - 1 : 0];
+	const vector<vector<double>> &jl = pSB->get_jlx()[l];
+	const vector<vector<double>> &jlp1 = pSB->get_jlx()[l + 1];
+
+	for (int ir = 0; ir < rmesh; ir++)
+	{
+		ZEROS(integrated_func, kmesh);
+		double temp = 0.0;
+
+		for (int ik = 0; ik < kmesh; ik++)
+		{
+			integrated_func[ik] = jl[ir][ik] * k1_dot_k2[ik];
+		}
+		// Call simpson integration
+		Integral::Simpson_Integral(kmesh, integrated_func, kab, temp);
+		rs[ir] = temp * FOUR_PI;
+
+		// drs: temp1 is the integral with the l-1 table, temp2 the one with the l+1 table, each weighted by k
+		double temp1 = 0.0, temp2 = 0.0;
+
+		if (l > 0)
+		{
+			for (int ik = 0; ik < kmesh; ik++)
+			{
+				integrated_func[ik] = jlm1[ir][ik] * k1_dot_k2[ik] * kpoint[ik];
+			}
+
+			Integral::Simpson_Integral(kmesh, integrated_func, kab, temp1);
+		}
+
+		for (int ik = 0; ik < kmesh; ik++)
+		{
+			integrated_func[ik] = jlp1[ir][ik] * k1_dot_k2[ik] * kpoint[ik];
+		}
+
+		Integral::Simpson_Integral(kmesh, integrated_func, kab, temp2);
+
+		if (l == 0)
+		{
+			drs[ir] = -FOUR_PI * temp2;
+		}
+		else
+		{
+			drs[ir] = FOUR_PI * (temp1 * l - (l + 1) * temp2) / (2.0 * l + 1);
+		}
+	}
+
+	//liaochen modify on 2010/4/22
+	//special case for R=0
+	//we store Slm(R) / R**l at the first point, rather than Slm(R)
+	if (l > 0)
+	{
+		ZEROS(integrated_func, kmesh);
+		double temp = 0.0;
+
+		for (int ik = 0; ik < kmesh; ik++)
+		{
+			integrated_func[ik] = k1_dot_k2[ik] * pow(kpoint[ik], l);
+		}
+
+		// Call simpson integration
+		Integral::Simpson_Integral(kmesh, integrated_func, kab, temp);
+		rs[0] = FOUR_PI / Mathzone_Add1::dualfac(2 * l + 1) * temp;
+	}
+
+	delete[] integrated_func;
+	delete[] k1_dot_k2;
+
+	timer::tick("ORB_table_alpha", "S_PhiAlpha_R");
+	return;
+}
+
+void ORB_table_alpha::init_Table_Alpha(Sph_Bessel_Recursive::D2 *pSB)
+{
+	TITLE("ORB_table_alpha", "init_Table_Alpha");
+	timer::tick("ORB_table_alpha", "init_Table_Alpha", 'D');
+	// Allocates Table_DSR[2][ntype][pair][L][rmesh] and fills it through cal_S_PhiAlpha_R; pSB is forwarded unchanged.
+	assert(ntype > 0);
+
+	// (1) allocate 1st dimension ( overlap, derivative)
+	this->Table_DSR = new double ****[2];
+	// (2) allocate 2nd dimension ( overlap, derivative)
+	this->Table_DSR[0] = new double ***[this->ntype];
+	this->Table_DSR[1] = new double ***[this->ntype];
+
+	// <1Phi|2Alpha>
+	for (int T1 = 0; T1 < ntype; T1++) // type 1 is orbital
+	{
+		const int Lmax1 = ORB.Phi[T1].getLmax();
+		const int Lmax2 = ORB.Alpha[0].getLmax();
+		const int lmax_now = std::max(Lmax1, Lmax2);
+		int L2plus1 = 2 * lmax_now + 1;
+		//-------------------------------------------------------------
+		// how many <psi|alpha_l>
+		// here we count all possible psi with (L,N) index for type T1.
+		//-------------------------------------------------------------
+		const int pairs_chi = ORB.Phi[T1].getTotal_nchi() * ORB.Alpha[0].getTotal_nchi();
+
+		if (pairs_chi == 0) // nothing is allocated for this type; Destroy_Table_Alpha skips it under the same test
+			continue;
+
+		// init 2nd dimension
+		this->Table_DSR[0][T1] = new double **[pairs_chi];
+		this->Table_DSR[1][T1] = new double **[pairs_chi];
+
+		// Rcut1 and Rcut2 are read inside the innermost pair loop, where they are used.
+		for (int L1 = 0; L1 < Lmax1 + 1; L1++)
+		{
+			for (int N1 = 0; N1 < ORB.Phi[T1].getNchi(L1); N1++)
+			{
+				for (int L2 = 0; L2 < Lmax2 + 1; L2++)
+				{
+					for (int N2 = 0; N2 < ORB.Alpha[0].getNchi(L2); N2++)
+					{
+						// get the second index.
+						const int Opair = this->DS_Opair(T1, L1, L2, N1, N2);
+
+						// init 3rd dimension
+						this->Table_DSR[0][T1][Opair] = new double *[L2plus1];
+						this->Table_DSR[1][T1][Opair] = new double *[L2plus1];
+
+						const double Rcut1 = ORB.Phi[T1].getRcut();
+						const double Rcut2 = ORB.Alpha[0].getRcut();
+						assert(Rcut1 > 0.0 && Rcut1 < 100);
+						assert(Rcut2 > 0.0 && Rcut2 < 100);
+
+						const int rmesh = this->get_rmesh(Rcut1, Rcut2);
+						assert(rmesh < this->Rmesh);
+
+						//L=|L1-L2|,|L1-L2|+2,...,L1+L2
+						const int SL = abs(L1 - L2);
+						const int AL = L1 + L2;
+
+						for (int L = 0; L < L2plus1; L++)
+						{
+							//Allocation
+							this->Table_DSR[0][T1][Opair][L] = new double[rmesh];
+							this->Table_DSR[1][T1][Opair][L] = new double[rmesh];
+
+							Memory::record("ORB_table_alpha", "Table_DSR",
+										   2 * this->ntype * pairs_chi * rmesh, "double"); // NOTE(review): called once per L with a whole-table size - confirm record() does not accumulate
+
+							//for those L whose Gaunt Coefficients = 0, we
+							//assign every element in Table_DSR as zero
+							if ((L > AL) || (L < SL) || ((L - SL) % 2 == 1))
+							{
+								ZEROS(Table_DSR[0][T1][Opair][L], rmesh);
+								ZEROS(Table_DSR[1][T1][Opair][L], rmesh);
+
+								continue;
+							}
+
+							this->cal_S_PhiAlpha_R(
+								pSB, // mohan add 2021-03-06
+								L,
+								ORB.Phi[T1].PhiLN(L1, N1),
+								ORB.Alpha[0].PhiLN(L2, N2), // mohan update 2011-03-07
+								rmesh,
+								this->Table_DSR[0][T1][Opair][L],
+								this->Table_DSR[1][T1][Opair][L]);
+						} // end L2plus1
+					}	  // end N2
+				}		  // end L2
+			}			  // end N1
+		}				  // end L1
+	}					  // end T1
+	destroy_nr = true; // tells Destroy_Table_Alpha() that the table now needs freeing
+
+	//	OUT(ofs_running,"allocate non-local potential matrix","Done");
+	timer::tick("ORB_table_alpha", "init_Table_Alpha", 'D');
+	return;
+}
+
+void ORB_table_alpha::Destroy_Table_Alpha(void)
+{
+	// Free Table_DSR level by level; does nothing unless init_Table_Alpha() has set destroy_nr.
+	if (!destroy_nr)
+		return;
+	const int ntype = ORB.get_ntype();
+	for (int ir = 0; ir < 2; ir++)
+	{
+		for (int T1 = 0; T1 < ntype; T1++)
+		{
+			const int Lmax1 = ORB.Phi[T1].getLmax();
+			const int Lmax2 = ORB.Alpha[0].getLmax();
+			const int lmax_now = std::max(Lmax1, Lmax2);
+			const int pairs = ORB.Phi[T1].getTotal_nchi() * ORB.Alpha[0].getTotal_nchi();
+			// mohan fix bug 2011-03-30: nothing is freed for a type that has no orbital pairs
+			if (pairs == 0)
+				continue;
+			for (int dim2 = 0; dim2 < pairs; dim2++)
+			{
+				for (int L = 0; L < 2 * lmax_now + 1; L++)
+				{
+					delete[] Table_DSR[ir][T1][dim2][L];
+				}
+				delete[] Table_DSR[ir][T1][dim2];
+			}
+			delete[] Table_DSR[ir][T1];
+		}
+		delete[] Table_DSR[ir];
+	}
+	delete[] Table_DSR;
+	Table_DSR = NULL;   // do not leave a dangling pointer behind
+	destroy_nr = false; // a repeated call must return early instead of freeing the tables again
+}
+
+void ORB_table_alpha::init_DS_2Lplus1(void)
+{
+	// Fill DS_2Lplus1[T1] with 2 * max(Lmax of Phi[T1], Lmax of Alpha[0]) + 1 for every atom type T1.
+	TITLE("ORB_table_alpha", "init_DS_2Lplus1");
+	assert(this->ntype > 0);
+	delete[] DS_2Lplus1; // drops the array of an earlier call; presumably NULL-initialised in the constructor -- TODO confirm
+	DS_2Lplus1 = new int[ntype]; // 2Lmax+1 for each T1
+
+	for (int T1 = 0; T1 < ntype; T1++)
+	{
+		this->DS_2Lplus1[T1] = max(ORB.Phi[T1].getLmax(), ORB.Alpha[0].getLmax()) * 2 + 1;
+	}
+	return;
+}
+
+void ORB_table_alpha::init_DS_Opair(void)
+{
+	// Assign a running index to every radial-function pair (L1, N1, L2, N2)
+	// between Phi[T1] and Alpha[0], restarting from zero for each atom type T1.
+	const int n_l_phi = ORB.get_lmax() + 1;
+	const int n_chi_phi = ORB.get_nchimax();
+	const int n_l_alpha = ORB.get_lmax_d() + 1;
+	const int n_chi_alpha = ORB.get_nchimax_d();
+	assert(n_l_phi > 0);
+	assert(n_l_alpha > 0);
+	assert(n_chi_phi > 0);
+	assert(n_chi_alpha > 0);
+
+	this->DS_Opair.create(this->ntype, n_l_phi, n_l_alpha, n_chi_phi, n_chi_alpha);
+
+	// Only the orbital side depends on the atom type; Alpha[0] is used for all of them.
+	for (int T1 = 0; T1 < ntype; ++T1)
+	{
+		int pair_id = 0;
+		for (int L1 = 0; L1 <= ORB.Phi[T1].getLmax(); ++L1)
+		{
+			for (int N1 = 0; N1 < ORB.Phi[T1].getNchi(L1); ++N1)
+			{
+				for (int L2 = 0; L2 <= ORB.Alpha[0].getLmax(); ++L2)
+				{
+					for (int N2 = 0; N2 < ORB.Alpha[0].getNchi(L2); ++N2)
+					{
+						this->DS_Opair(T1, L1, L2, N1, N2) = pair_id++;
+					}
+				}
+			}
+		}
+	}
+	return;
+}
+
+/*
+//caoyu add 2021-03-20
+void ORB_table_alpha::print_Table_DSR(void)
+{
+	TITLE("ORB_table_alpha", "print_Table_DSR");
+	NEW_PART("Overlap table S between lcao orbital and descriptor basis : S_{I_mu_alpha}");
+
+	ofstream ofs;
+	stringstream ss;
+	// the parameter 'winput::spillage_outdir' is read from INPUTw.
+	ss << "./S_I_mu_alpha.dat";
+	if (MY_RANK == 0)
+	{
+		ofs.open(ss.str().c_str());
+	}
+
+	for (int T1 = 0; T1 < this->ntype; T1++)	//T1
+	{
+		const int Lmax1 = ORB.Phi[T1].getLmax();
+		const int Lmax2 = ORB.Alpha[0].getLmax();
+		for (int L1 = 0; L1 < Lmax1 + 1; L1++)
+		{
+			for (int N1 = 0; N1 < ORB.Phi[T1].getNchi(L1); N1++)
+			{
+				for (int L2 = 0; L2 < Lmax2 + 1; L2++)
+				{
+					for (int N2 = 0; N2 < ORB.Alpha[0].getNchi(L2); N2++)
+					{
+						const int Opair = this->DS_Opair(T1, L1, L2, N1, N2);	//Opair
+						//ofs <<setw(20)<< "atom_type: " << ucell.atoms[T1].label << endl;
+						ofs <<setw(20)<< "lcao basis: " << "L1=" << L1 << ", N1=" << N1 << endl;
+						ofs <<setw(20)<< "descriptor basis: " << "L2=" << L2 << ", N2=" << N2 << endl;
+						for (int il = 0; il < this-> DS_2Lplus1[T1]; il++)
+						{
+							ofs << "L=" << il << endl;
+							const double Rcut1 = ORB.Phi[T1].getRcut();
+							const double Rcut2 = ORB.Alpha[0].getRcut();
+							const int rmesh = this->get_rmesh(Rcut1, Rcut2);
+							
+							if (Table_DSR[0][T1][Opair][il][1]==0)	//remain to be discussed
+							{
+								ofs << "S(R)=0"<<endl<<endl;
+								continue;
+							}
+							ofs << "Rcut1="<<Rcut1<<", Rcut2="<<Rcut2<<", rmesh="<<rmesh<<", dr="<<this->dr<<";"<<endl;
+							for (int ir = 0; ir < rmesh; ir++)
+							{
+								ofs << Table_DSR[0][T1][Opair][il][ir] << " ";
+								if ( (ir+1) % 8 == 0) ofs << endl;
+							}
+							ofs << endl <<endl;
+						}// il
+					}// N2
+				}// L2
+			}// N1
+		}// L1
+	}// T1
+	return;
+}
+*/
diff --git a/ABACUS.develop/source/src_lcao/ORB_table_alpha.h b/ABACUS.develop/source/src_lcao/ORB_table_alpha.h
index b6c6eacf0c..eca622e497 100644
--- a/ABACUS.develop/source/src_lcao/ORB_table_alpha.h
+++ b/ABACUS.develop/source/src_lcao/ORB_table_alpha.h
@@ -1,70 +1,68 @@
-#ifndef ORB_TABLE_ALPHA_H 
-#define ORB_TABLE_ALPHA_H 
-
-#include "ORB_atomic_lm.h"
-#include "../src_global/sph_bessel_recursive.h"
-
-//caoyu add 2021-03-17
-
-class ORB_table_alpha
-{
-	public:
-
-	ORB_table_alpha();
-	~ORB_table_alpha();
-
-	void allocate(
-		const int& ntype,
-		const int& lmax_in,
-		const int& kmesh_in,
-		const double& Rmax_in,
-		const double& dR_in,
-		const double& dk_in);
-
-	double***** Table_DSR;//overlap between lcao basis phi and descriptor basis alpha
-	bool destroy_nr;
-
-	//-------------------------
-	// O stands for orbitals.
-	//-------------------------
-
-	void init_DS_Opair(void);
-	void init_DS_2Lplus1(void);
-	IntArray DS_Opair;
-	int* DS_2Lplus1;
-
-	void init_Table_Alpha(Sph_Bessel_Recursive::D2* pSB);
-
-	void Destroy_Table_Alpha(void);
-
-	static int get_rmesh(const double& R1, const double& R2);
-
-	static double dr;
-	int Rmesh;
-	int ntype;
-	int lmax;
-
-	void print_Table_DSR(void);		//caoyu add 2021-03-20
-
-	private:
-
-	void cal_S_PhiAlpha_R(
-		Sph_Bessel_Recursive::D2* pSB, // mohan add 2021-03-06
-		const int& l,	
-		const Numerical_Orbital_Lm& n1,
-		const Numerical_Orbital_Lm& n2,
-		const int& rmesh,
-		double* rs,
-		double* drs);
-
-	// variables
-	double Rmax;
-	double dk;
-	int nlm;
-	int kmesh;
-	double* kpoint;
-	double* r;
-	double* rab;
-	double* kab;
-};
-#endif
+#ifndef ORB_TABLE_ALPHA_H
+#define ORB_TABLE_ALPHA_H
+
+#include "ORB_atomic_lm.h"
+#include "../src_global/sph_bessel_recursive.h"
+
+//caoyu add 2021-03-17
+// Radial tables between the LCAO orbitals (ORB.Phi) and the descriptor basis (ORB.Alpha[0]).
+class ORB_table_alpha
+{
+public:
+	ORB_table_alpha();
+	~ORB_table_alpha();
+
+	void allocate(
+		const int &ntype,
+		const int &lmax_in,
+		const int &kmesh_in,
+		const double &Rmax_in,
+		const double &dR_in,
+		const double &dk_in);
+
+	double *****Table_DSR; //overlap between lcao basis phi and descriptor basis alpha; indexed [0|1][T1][Opair][L][ir]
+	bool destroy_nr; // set to true by init_Table_Alpha(); Destroy_Table_Alpha() frees nothing while it is false
+
+	//-------------------------
+	// O stands for orbitals.
+	//-------------------------
+	// DS_Opair(T1, L1, L2, N1, N2) holds the running index of a radial-function pair within atom type T1.
+	void init_DS_Opair(void);
+	void init_DS_2Lplus1(void);
+	IntArray DS_Opair;
+	int *DS_2Lplus1; // per atom type T1: 2 * max(Lmax of Phi[T1], Lmax of Alpha[0]) + 1
+
+	void init_Table_Alpha(Sph_Bessel_Recursive::D2 *pSB);
+	// Frees every level of Table_DSR; the loop bounds are recomputed from ORB.
+	void Destroy_Table_Alpha(void);
+
+	static int get_rmesh(const double &R1, const double &R2);
+
+	static double dr;
+	int Rmesh; // init_Table_Alpha() asserts that each per-pair rmesh is smaller than this
+	int ntype;
+	int lmax;
+
+	//void print_Table_DSR(void);		//caoyu add 2021-03-20
+
+private:
+	void cal_S_PhiAlpha_R(
+		Sph_Bessel_Recursive::D2 *pSB, // mohan add 2021-03-06
+		const int &l,
+		const Numerical_Orbital_Lm &n1,
+		const Numerical_Orbital_Lm &n2,
+		const int &rmesh,
+		double *rs, // init_Table_Alpha() passes Table_DSR[0][T1][Opair][L] here
+		double *drs); // init_Table_Alpha() passes Table_DSR[1][T1][Opair][L] here
+
+	// variables
+	double Rmax;
+	double dk;
+	int nlm;
+	int kmesh;
+	double *kpoint;
+	double *r;
+	double *rab;
+	double *kab;
+};
+#endif

From b462c0140308499af448ff7e1e8944dccdb1258e Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Sun, 11 Apr 2021 13:22:35 +0800
Subject: [PATCH 46/60] fix ORB_gen_tables conflicts

---
 .../source/src_lcao/ORB_gen_tables.cpp        | 1010 -----------------
 1 file changed, 1010 deletions(-)

diff --git a/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp b/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp
index aeeee031b9..51e3e0ed45 100644
--- a/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_gen_tables.cpp
@@ -1,1012 +1,3 @@
-<<<<<<< HEAD
-#include "src_pw/global.h"
-#include "ORB_read.h"
-#include "ORB_gen_tables.h"
-#include "src_global/ylm.h"
-
-// here is a member of ORB_gen_tables class
-ORB_gen_tables UOT;
-
-ORB_gen_tables::ORB_gen_tables(){}
-ORB_gen_tables::~ORB_gen_tables(){}
-
-// call in hamilt_linear::init_before_ions.
-void ORB_gen_tables::gen_tables( 
-	const int &job0, 
-	LCAO_Orbitals &orb, 
-	const int &Lmax_exx)
-{
-	TITLE("ORB_gen_tables","gen_tables");
-	timer::tick("ORB_gen_tables","gen_tables",'C');
-
-	ofs_running << "\n SETUP THE TWO-CENTER INTEGRATION TABLES" << endl;
-	
-	//=========================================
-	// (1) MOT: make overlap table.
-	//=========================================
-	MOT.allocate(
-		orb.get_ntype(),// number of atom types
-        orb.get_lmax(),// max L used to calculate overlap
-        orb.get_kmesh(), // kpoints, for integration in k space
-        orb.get_Rmax(),// max value of radial table
-        orb.get_dR(),// delta R, for making radial table
-        orb.get_dk() ); // delta k, for integration in k space
-
-	tbeta.allocate(
-		orb.get_ntype(),// number of atom types
-        orb.get_lmax(),// max L used to calculate overlap
-        orb.get_kmesh(), // kpoints, for integration in k space
-        orb.get_Rmax(),// max value of radial table
-        orb.get_dR(),// delta R, for making radial table
-        orb.get_dk() ); // delta k, for integration in k space
-
-	//caoyu add 2021-03-18
-	if (INPUT.out_descriptor && BASIS_TYPE == "lcao") {
-		talpha.allocate(
-			orb.get_ntype(),// number of atom types
-			orb.get_lmax(),// max L used to calculate overlap
-			orb.get_kmesh(), // kpoints, for integration in k space
-			orb.get_Rmax(),// max value of radial table
-			orb.get_dR(),// delta R, for making radial table
-			orb.get_dk()); // delta k, for integration in k space
-	}
-
-	// OV: overlap
-	MOT.init_OV_Tpair(orb);
-	MOT.init_OV_Opair(orb);
-
-	// NL: nonlocal
-	tbeta.init_NL_Tpair();
-	tbeta.init_NL_Opair(orb); // add 2009-5-8
-
-	//caoyu add 2021-03-18
-	// DS: Descriptor
-	if (INPUT.out_descriptor && BASIS_TYPE == "lcao") {
-		talpha.init_DS_Opair();
-		talpha.init_DS_2Lplus1();
-	}
-
-	//=========================================
-	// (2) init Ylm Coef
-	//=========================================
-	//liaochen add 2010/4/29
-	Ylm::set_coefficients ();
-
-	// PLEASE add explanations for all options of 'orb_num' and 'mode'
-	// mohan add 2021-04-03
-	// Peize Lin update 2016-01-26
-	int orb_num=2; //
-	int mode=1; // 1: <phi|phi> and <phi|beta>
-	int Lmax_used=0;
-	int Lmax=0;
-
-	MOT.init_Table_Spherical_Bessel (orb_num, mode, Lmax_used, Lmax, Lmax_exx);
-	
-	//calculate S(R) for interpolation
-	MOT.init_Table(job0, orb);
-	tbeta.init_Table_Beta( MOT.pSB );// add 2009-5-8
-
-	//caoyu add 2021-03-18
-	if (INPUT.out_descriptor && BASIS_TYPE == "lcao") 
-	{
-		talpha.init_Table_Alpha(MOT.pSB);
-		talpha.print_Table_DSR();	
-	}
-
-	//=========================================
-	// (3) make Gaunt coefficients table
-	//=========================================
-
-	const int lmax = (Lmax_used-1) / 2 ;
-	//MGT.init_Ylm_Gaunt(orb.get_lmax()+1, 0.0,PI,0.0,TWO_PI);
-	MGT.init_Gaunt_CH( lmax );
-	//MGT.init_Gaunt(orb.get_lmax()+1);
-	MGT.init_Gaunt( lmax );
-
-
-
-	timer::tick("ORB_gen_tables","gen_tables",'C');
-	return;
-}
-
-void ORB_gen_tables::snap_psibeta(
-	double nlm[],
-	const int& job,
-	const Vector3<double> &R1,
-	const int &T1,
-	const int &L1,
-	const int &m1,
-	const int &N1,
-	const Vector3<double> &R2,
-	const int &T2,
-	const int &L2,
-	const int &m2,
-	const int &N2,
-	const Vector3<double> &R0,// The projector.
-	const int &T0,
-	complex<double> *nlm1,
-	const int is) const
-{
-	//TITLE ("ORB_gen_tables","snap_psibeta");
-
-	//optimized by zhengdy-soc
-	if(NSPIN==4 && ORB.Beta[T0].get_count_soc(is)==0) 
-	{
-		return;
-	}
-
-//	timer::tick ("ORB_gen_tables","snap_psibeta",'X');
-
-	// PLEASE update this option,
-	// has_so only needs to be determined once,
-	// has_so can be used as a static variable,
-	// or an input parameter
-	// mohan add 2021-04-06
-	bool has_so = 0;
-	if(ORB.Beta[T0].get_count_soc(0)>0 ) 
-	{
-		has_so = 1;
-	}
-
-	const int nproj = ORB.nproj[T0];
-	bool *calproj = new bool[nproj];
-	int* rmesh1 = new int[nproj];
-	int* rmesh2 = new int[nproj];
-
-	//rcut of orbtials and projectors
-	const double Rcut1 = ORB.Phi[T1].getRcut();
-	const double Rcut2 = ORB.Phi[T2].getRcut();
-	
-	//in our calculation, we always put orbital phi at the left side of <phi|beta>
-	//because <phi|beta> = <beta|phi>
-	const Vector3<double> dRa = (R0-R1)*this->lat0 ; 
-	const Vector3<double> dRb = (R0-R2)*this->lat0 ;
-	
-	double distance10 = dRa.norm();
-	double distance20 = dRb.norm();
-
-	// mohan add 2011-03-10
-	// because the table length is different accordint to each length
-	// of projector, so sometimes some shorter projectors need not be 
-	// calculated.
-	bool all_out = true;
-	for(int ip=0; ip<nproj; ip++)
-	{
-
-		// PLEASE note that all projectors should share the same rcut
-		const double Rcut0 = ORB.Beta[T0].Proj[ip].getRcut();
-
-		if( distance10 > (Rcut1 + Rcut0) || distance20 > (Rcut2 + Rcut0) )  
-		{
-			calproj[ip] = false;
-		}
-		else
-		{
-			all_out = false;
-			calproj[ip] = true;
-			//length of table for interpolation
-			rmesh1[ip] = tbeta.get_rmesh(Rcut1, Rcut0);
-			rmesh2[ip] = tbeta.get_rmesh(Rcut2, Rcut0);
-		}
-	}
-
-	if(all_out)
-	{
-		delete[] calproj;
-		delete[] rmesh1;
-		delete[] rmesh2;
-//		timer::tick ("ORB_gen_tables","snap_psibeta",'X');
-		return;
-	}
-
-
-	//FOR INTERPOLATION
-	double* curr; //current pointer
-	int iqa, iqb;
-	double psa, psb;
-	double x0a,x1a,x2a,x3a,x123a,x120a,x032a,x031a;
-	double x0b,x1b,x2b,x3b,x123b,x120b,x032b,x031b;
-
-	// PLEASE note that x1a*x2a is called twice, etc.
-	// mohan add 2021-04-06	
-	psa = distance10 / tbeta.dr;
-	iqa = static_cast<int>(psa);
-   	x0a = psa - static_cast<double>(iqa);
-  	x1a = 1.0 - x0a;
-   	x2a = 2.0 - x0a;
-    x3a = 3.0 - x0a;
-	x123a = x1a*x2a*x3a/6.0;
-	x120a = x1a*x2a*x0a/6.0;
-	x032a = x0a*x3a*x2a/2.0;
-	x031a = x0a*x3a*x1a/2.0;
-	
-	psb = distance20 / tbeta.dr;
-	iqb = (int) psb;
-   	x0b = psb - (double)iqb ;
-  	x1b = 1.0 - x0b;
-   	x2b = 2.0 - x0b;
-    x3b = 3.0 - x0b;
-	x123b = x1b*x2b*x3b/6.0;
-	x120b = x1b*x2b*x0b/6.0;
-	x032b = x0b*x3b*x2b/2.0;
-	x031b = x0b*x3b*x1b/2.0;
-	
-	//UNIT VECTOR
-	double unit_vec_dRb[3];
-	unit_vec_dRb[0] = dRb.x;
-	unit_vec_dRb[1] = dRb.y;
-	unit_vec_dRb[2] = dRb.z;
-	
-	//special case for R = 0;
-	const double tiny1 = 1e-12;
-	const double tiny2 = 1e-10;
-
-	if(distance10 < tiny1) distance10 += tiny1;
-	if(distance20 < tiny1) distance20 += tiny1;
-	
-
-	// Find three dimension of 'Table_NR' '
-	// Notice!!! T1 must be orbital, 
-	// T0 must be nonlocal orbital
-	// usage : pairs_nonlocal_type(T1 : orbital, T0 : projector);
-	const int Tpair1 = tbeta.NL_Tpair(T1, T0);
-	const int Tpair2 = tbeta.NL_Tpair(T2, T0);
-	const int T1_2Lplus1 = tbeta.NL_L2plus1(T1, T0);
-	const int T2_2Lplus1 = tbeta.NL_L2plus1(T2, T0);
-
-	//gaunt index
-	const int gindex1 = L1*L1+m1;
-	const int gindex2 = L2*L2+m2;
-
-	// Peize Lin change rlya, rlyb, grlyb 2016-08-26
-	vector<double> rlya;
-	vector<double> rlyb;
-	vector<vector<double>> grlyb;
-	
-	Ylm::rl_sph_harm (T1_2Lplus1-1, dRa.x, dRa.y, dRa.z, rlya);
-	if (job == 0) 
-	{
-		Ylm::rl_sph_harm (T2_2Lplus1-1, dRb.x, dRb.y, dRb.z, rlyb);
-	}
-	else 
-	{
-		Ylm::grad_rl_sph_harm (T2_2Lplus1-1, dRb.x, dRb.y, dRb.z, rlyb, grlyb);
-	}
-	//==============================================================================
-	// Formula :                         T1       T0          T0        T2
-	// sum_{L0}sum_{m0}
-	// 			D_{L0,L0} <psi1_{L1,N1}|Beta_{L0,m0}><Beta_{L0,m0}|psi2_{L2,N2}>
-	//==============================================================================
-	//double v = 0.0;
-
-	// mohan update 2011-03-07
-	int n_projection =1;
-	if(has_so) 
-	{
-		n_projection = ORB.Beta[T0].get_nproj_soc();
-	}
-
-	vector<complex<double>> term_a_nc(n_projection,{0,0});		// Peize Lin change ptr to vector at 2020.01.31
-	vector<complex<double>> term_b_nc(n_projection,{0,0});		// Peize Lin change ptr to vector at 2020.01.31
-	int ip = -1;
-
-	for(int nb=0; nb<nproj; nb++)
-	{
-		if( !calproj[nb] ) continue;
-
-		const int L0 = ORB.Beta[T0].getL_Beta(nb);
-		//const int next_ip = 2* L0 +1;	
-
-//-------------------------------------------------------------------
-// move iterations for psi1 and psi2 from cal_fvnl_dbeta 
-// to here --- 2021/03/20 mohan chen
-//-------------------------------------------------------------------
-
-		// <psi1 | Beta>
-		const int Opair1 = tbeta.NL_Opair(Tpair1, L1, N1, nb); 
-		// <psi2 | Beta>
-		const int Opair2 = tbeta.NL_Opair(Tpair2, L2, N2, nb); 
-		
-		for(int m0=0; m0<2*L0+1; m0++)
-		{
-			++ip;
-			int gindex0 = L0*L0+m0;
-			
-			//loop of {lmn}
-			double term_a = 0.0;
-			double term_b = 0.0;
-			double term_c[3] = {0,0,0};	
-			
-			//=============
-			// FIRST PART	
-			//=============
-			for(int L=0; L<T1_2Lplus1; L++)
-			{
-				//triangle rule for gaunt coefficients
-				int AL = L1 + L0;
-				int SL = abs (L1 - L0);
-				if ((L > AL) || (L < SL) || ((L-SL) % 2 == 1)) continue;
-			
-				//prefac = (i)^{lphi - lbeta - l}
-				//R0-R1 ==> <phi|beta>
-				double i_exp = pow(-1.0, (L1-L0-L)/2);
-				double rl1 = pow(distance10, L);			
-				double Interp_Vnla = 0.0;
-				if (distance10 > tiny2)
-				{	
-					curr = tbeta.Table_NR[0][Tpair1][Opair1][L];
-					if( iqa >= rmesh1[nb]-4)
-					{
-						Interp_Vnla = 0.0;
-					}
-					else
-					{
-						Interp_Vnla = i_exp * (x123a*curr[iqa]+x120a*curr[iqa+3]+x032a*curr[iqa+1]-x031a*curr[iqa+2]);
-					}
-					Interp_Vnla /= rl1;
-				}
-				else 
-				{
-					Interp_Vnla = i_exp * tbeta.Table_NR[0][Tpair1][Opair1][L][0];
-				}
-	
-				//------------------------------------------
-				//  Overlap value = S_from_table * G * Ylm				
-				//------------------------------------------
-				for(int m=0; m<2*L+1; m++)
-				{
-					int gindexa = L*L+m;
-					//double tmpGaunt = this->MGT.Get_Gaunt_SH(L1, m1, L0, m0, L, m); 
-					double tmpGaunt = this->MGT.Gaunt_Coefficients (gindex1, gindex0, gindexa);
-					term_a += tmpGaunt * Interp_Vnla * rlya[ MGT.get_lm_index(L, m) ];
-				}
-			} //end L
-
-			//=============
-			// SECOND PART	
-			//=============
-			for(int L=0; L<T2_2Lplus1; L++)
-			{
-				//triangle rule for gaunt coefficients
-				int AL = L2 + L0;
-				int SL = abs (L2 - L0);
-				if ((L > AL) || (L < SL) || ((L-SL) % 2 == 1)) continue;
-
-				double Interp_Vnlb = 0.0;
-				double Interp_Vnlc = 0.0;
-				
-				//prefac
-				double i_exp = pow(-1.0, (L2-L0-L)/2);
-				double rl2 = pow (distance20, L);	
-				if (distance20 > tiny2)
-				{
-					curr = tbeta.Table_NR[0][Tpair2][Opair2][L];
-   					
-					if( iqb >= rmesh2[nb]-4) 
-					{
-						Interp_Vnlb = 0.0;
-					}
-					else 
-					{
-						Interp_Vnlb = i_exp * (x123b*curr[iqb]+x120b*curr[iqb+3]+x032b*curr[iqb+1]-curr[iqb+2]*x031b);
-					}
-					
-					Interp_Vnlb /= rl2;
-				}
-				else 
-				{
-					Interp_Vnlb = i_exp * tbeta.Table_NR[0][Tpair2][Opair2][L][0];
-				}
-
-				
-				if (job == 1) // 1 means calculate the derivative part.
-				{
-					if (distance20 > tiny2)
-					{
-						curr = tbeta.Table_NR[1][Tpair2][Opair2][L];
-   					
-						if( iqb >= rmesh2[nb]-4) 
-						{
-							Interp_Vnlc = 0.0;
-						}
-						else 
-						{
-							Interp_Vnlc = i_exp * (x123b*curr[iqb]+x120b*curr[iqb+3]+x032b*curr[iqb+1]-curr[iqb+2]*x031b);
-						}
-						Interp_Vnlc = Interp_Vnlc / pow(distance20, L) - Interp_Vnlb * L / distance20;
-					}
-					else 
-					{
-						Interp_Vnlc = 0.0;
-					}
-				}
-				
-				// sum up the second part.	
-				for(int m=0; m<2*L+1; m++)
-				{
-					int gindexb = L*L+m;
-					//double tmpGaunt = this->MGT.Get_Gaunt_SH(L0, m0, L2, m2, L, m);
-					double tmpGaunt = this->MGT.Gaunt_Coefficients (gindex0, gindex2, gindexb);
-					const int lm = MGT.get_lm_index(L, m);
-					
-					switch (job)
-					{
-						case 0:// calculate the overlap part.
-						{
-							term_b += tmpGaunt * Interp_Vnlb * rlyb[lm];
-							break;
-						}
-						case 1: // calculate the derivative part.
-						{
-							double tt1 = tmpGaunt * Interp_Vnlc * rlyb[lm] / distance20;
-							double tt2 = tmpGaunt * Interp_Vnlb;
-										
-							for(int ir = 0; ir < 3; ir++)
-							{
-								term_c[ir] += tt1 * unit_vec_dRb[ir] 
-											+ tt2 * grlyb[lm][ir];
-							}
-
-							break;
-						}
-						default: break;
-					}
-				}// end m of SECOND PART
-			}// end L of SECOND PART
-		
-		
-			//added by zhengdy-soc, store them for soc case
-			if(has_so)
-			{
-				term_a_nc[ip] = term_a;
-				term_b_nc[ip] = term_b;
-			}
-		
-			//===============================================
-			// THIRD PART: SUM THE VALUE FROM ALL PROJECTS.
-			//===============================================
-			switch (job)
-			{
-				case 0://calculate the overlap part.
-				{
-					//nlm[0] += term_a * term_b * ORB.Beta[T0].getCoefficient_D(L0, L0);//LiuXh 2016-01-14
-					if(!has_so) 
-					{
-						nlm[0] += term_a * term_b * ORB.Beta[T0].getCoefficient_D(nb, nb);//LiuXh 2016-01-14
-					}
-					break;
-				}
-				case 1: //calculate the derivative part.
-				{
-					for(int jr = 0; jr < 3; jr++) 
-					{
-						//nlm[jr] += term_c[jr] * term_a * ORB.Beta[T0].getCoefficient_D(L0, L0);//LiuXh 2016-01-14
-						if(!has_so) 
-						{
-							nlm[jr] += term_c[jr] * term_a * ORB.Beta[T0].getCoefficient_D(nb, nb);//LiuXh 2016-01-14
-						}
-					}
-					break;
-				}
-				default: break;
-			}
-		}//!m0
-	}//!L0
-
-	//zhengdy-soc, calculate non-local term
-	if(has_so)
-	{
-		switch (job)
-		{
-			case 0://overlap part
-				for(int no=0;no<ORB.Beta[T0].get_count_soc(is);no++)
-				{
-					const int p1 = ORB.Beta[T0].get_index1_soc(is, no);
-					const int p2 = ORB.Beta[T0].get_index2_soc(is, no);
-					if(NSPIN==4 && nlm1!=NULL)
-					{
-						nlm1[is] += term_a_nc[p1] * term_b_nc[p2] * ORB.Beta[T0].getCoefficient_D_so(is, p2, p1);
-					}
-					else if(NSPIN!=4)
-					{
-						nlm[0] += (term_a_nc[p1] * term_b_nc[p2] * ORB.Beta[T0].getCoefficient_D_so(0, p2, p1)).real();
-					}
-					else
-					{
-						WARNING_QUIT("ORB_gen_tables::snap_psibeta","Conflict! Didn't count non-local part");
-					}
-				}
-				break;
-
-			case 1://need to be added later
-				{break;}
-
-			default: break;
-		}
-	}
-
-	delete[] calproj;
-	delete[] rmesh1;
-	delete[] rmesh2;
-
-//	timer::tick ("ORB_gen_tables","snap_psibeta",'X');
-	return;
-}
-
-void ORB_gen_tables::snap_psipsi(
-	double olm[],
-	const int &job, //0, 1
-	const char &dtype, // derivative type: S or T
-	const Vector3<double> &R1,
-    const int &T1,
-    const int &L1,
-    const int &m1,
-    const int &N1,
-    const Vector3<double> &R2,
-    const int &T2,
-    const int &L2,
-    const int &m2,
-    const int &N2,
-	complex<double> *olm1)const
-{
-	//TITLE("ORB_gen_tables","snap_psipsi");
-	//timer::tick ("ORB_gen_tables", "snap_psipsi");
-	if(job != 0 && job != 1)
-	{
-		WARNING_QUIT("ORB_gen_tables::snap_psipsi","job must be equal to 0 or 1!");
-	}
-	
-	Numerical_Orbital::set_position(R1, R2);
-	assert(this->lat0>0.0);
-
-	// (1) get distance between R1 and R2 (a.u.)
-	// judge if there exist overlap
-	double distance = Numerical_Orbital::get_distance()*this->lat0;
-	
-	const double Rcut1 = ORB.Phi[T1].getRcut();
-	const double Rcut2 = ORB.Phi[T2].getRcut();
-
-	if(job == 0) ZEROS(olm, 1);
-	else if(job == 1) ZEROS(olm, 3);
-	
-	if( distance > (Rcut1 + Rcut2) ) return;
-	
-	//if distance == 0
-	//\int psi(r) psi(r-R) dr independent of R if R == 0
-	//distance += tiny1 avoid overflow during calculation
-	const double tiny1 = 1e-12;
-	const double tiny2 = 1e-10;
-	if(distance < tiny1) distance += tiny1;
-	
-	// (2) if there exist overlap, calculate the mesh number
-	// between two atoms
-	const int rmesh = this->MOT.get_rmesh(Rcut1, Rcut2);
-	
-	// (3) Find three dimension of 'Table_S' or 'Table_T'
-	// dim1 : type pairs,
-	// dim2 : radial orbital pairs,
-	// dim3 : find lmax between T1 and T2, and get lmax*2+1
-	const int dim1 = this->MOT.OV_Tpair(T1, T2);
-	const int dim3 = this->MOT.OV_L2plus1(T1, T2); //2*lmax+1
-	
-	int dim2;
-	if (T1 <= T2) dim2 = this->MOT.OV_Opair(dim1, L1, L2, N1, N2); 
-	else dim2 = this->MOT.OV_Opair(dim1, L2, L1, N2, N1);
-		
-	// Find the needed Ylm(dR) dimension 
-	const int nlm = dim3 * dim3; //(2lmax+1)*(2lmax+!)
-
-	//Gaunt Index
-	const int gindex1 = L1*L1+m1;
-	const int gindex2 = L2*L2+m2;
-
-	assert(nlm < 400);
-	// Peize Lin change rly, grly 2016-08-26
-	vector<double> rly;			
-	vector<vector<double>> grly;
-	
-//	double *ylm = new double[nlm];
-//	dR = R1 - R2;
-	double arr_dR[3];
-	arr_dR[0] = Numerical_Orbital::getX() * this->lat0;
-	arr_dR[1] = Numerical_Orbital::getY() * this->lat0;
-	arr_dR[2] = Numerical_Orbital::getZ() * this->lat0;
-	
-	//double xdr = arr_dR[0] / distance;
-	//double ydr = arr_dR[1] / distance;
-	//double zdr = arr_dR[2] / distance;
-	
-	//=======================
-	// *r**l*Ylm_real
-	// include its derivations
-	//=======================
-	if (job == 0) 
-	{
-//		Ylm::rlylm(dim3, arr_dR[0], arr_dR[1], arr_dR[2], rly);
-//		Ylm::sph_harm (dim3-1, xdr, ydr, zdr, rly);
-		Ylm::rl_sph_harm (dim3-1, arr_dR[0], arr_dR[1], arr_dR[2], rly);
-	}
-	else 
-	{
-//		Ylm::rlylm(dim3, arr_dR[0], arr_dR[1], arr_dR[2], rly, grly);
-		Ylm::grad_rl_sph_harm (dim3-1, arr_dR[0], arr_dR[1], arr_dR[2], rly, grly);
-	}
-
-	switch( dtype )
-	{
-		case 'S':
-		for (int L = 0; L < dim3; L++) //maxL = dim3-1
-		{
-			//===========================================================
-			// triangle rule for L and sum of L, L1, L2 should be even
-			//===========================================================
-			int AL = L1 + L2;
-			int SL = abs (L1 - L2);
-
-			if ((L > AL) || (L < SL) || ((L-SL) % 2 == 1)) continue;
-			
-			double Interp_Slm = 0.0;
-			double Interp_dSlm = 0.0;
-			double tmpOlm0 = 0.0;
-			double tmpOlm1 = 0.0;
-			
-			// prefactor
-			double i_exp = pow(-1.0, (L1 - L2 - L) / 2);
-			double rl = pow (distance, L);
-
-			if (distance > tiny2)
-			{
-				Interp_Slm = i_exp * Mathzone::Polynomial_Interpolation(
-						MOT.Table_SR[0][dim1][dim2][L],	rmesh, MOT.dr, distance );
-				Interp_Slm /= rl;
-			}
-			else // distance = 0.0; 
-			{
-				Interp_Slm = i_exp * MOT.Table_SR[0][dim1][dim2][L][0];
-			}
-				
-			if (job == 1)//calculate the derivative.
-			{
-				if (distance > tiny2)
-				{
-					Interp_dSlm = i_exp * Mathzone::Polynomial_Interpolation(
-						MOT.Table_SR[1][dim1][dim2][L], rmesh, MOT.dr, distance );
-					Interp_dSlm = Interp_dSlm / pow (distance, L) - Interp_Slm * L / distance;
-				}
-				else 
-				{
-					Interp_dSlm = 0.0;
-				}
-			}
-			
-			for (int m = 0; m < 2*L+1; m++)
-			{
-				int gindex = L*L+m;
-	//			double tmpGaunt1 = MGT.Get_Gaunt_SH(L1, m1, L2, m2, L, m);
-				double tmpGaunt = MGT.Gaunt_Coefficients (gindex1, gindex2, gindex);	
-							
-				tmpOlm0 = Interp_Slm * tmpGaunt;
-	
-				if (job == 1) 
-				{
-					tmpOlm1 = Interp_dSlm * tmpGaunt;
-				}
-				
-				switch( job )
-				{
-					case 0: // calculate overlap.
-					{	
-						if(NSPIN!=4) olm[0] += tmpOlm0 * rly[ MGT.get_lm_index(L, m) ] ;
-						else if(olm1!= NULL)
-						{
-							olm1[0] += tmpOlm0 * rly[ MGT.get_lm_index(L,m) ] ;
-							olm1[1] += 0;//tmpOlm0 * (tmp(0,0)+tmp(0,1));
-							olm1[2] += 0;//tmpOlm0 * (tmp(1,0)+tmp(1,1));
-							olm1[3] += tmpOlm0 * rly[ MGT.get_lm_index(L,m) ] ;
-							
-						}
-						else
-						{
-							WARNING_QUIT("ORB_gen_tables::snap_psipsi","something wrong!");
-							
-						}
-					
-						/*		
-						if( abs ( tmpOlm0 * rly[ MGT.get_lm_index(L, m) ] ) > 1.0e-3 )
-						{
-						cout << " L=" << L << " m=" << m << " tmpOlm0=" << tmpOlm0 
-						<< " rly=" << rly[ MGT.get_lm_index(L, m) ] 
-						<< " r=" << olm[0]
-						<< endl;
-						}
-						*/
-						break;
-					}
-					case 1: // calculate gradient.
-					{
-						for(int ir = 0; ir < 3; ir++)
-						{
-							olm[ir] += tmpOlm0 * grly[ MGT.get_lm_index(L, m) ][ir]
-									 + tmpOlm1 * rly[ MGT.get_lm_index(L, m) ] * arr_dR[ir] / distance;
-						}
-						break;
-					}
-					default: break;
-				}
-			}//!m
-		}
-		break;
-
-		case 'T':
-		for (int L = 0; L < dim3; L++)
-		{
-			int AL = L1 + L2;
-			int SL = abs (L1 - L2);
-
-			if ((L > AL) || (L < SL) || ((L-SL) % 2 == 1)) continue;
-
-			double Interp_Tlm, Interp_dTlm, tmpKem0, tmpKem1;
-			Interp_Tlm = Interp_dTlm = tmpKem0 = tmpKem1 = 0.0;
-			
-			//pre-fac
-			double i_exp = pow(-1.0, (L1 - L2 - L) / 2);
-
-			double rl = pow (distance, L);
-			if (distance > tiny2)
-			{
-				Interp_Tlm = i_exp * Mathzone::Polynomial_Interpolation(
-						MOT.Table_TR[0][dim1][dim2][L],	rmesh, MOT.dr, distance );	
-				Interp_Tlm /= rl;
-			}
-			else Interp_Tlm = i_exp * MOT.Table_TR[0][dim1][dim2][L][0];
-				
-			
-			if (job == 1)
-			{
-				if (distance > tiny2)
-				{
-					Interp_dTlm = i_exp * Mathzone::Polynomial_Interpolation(
-						MOT.Table_TR[1][dim1][dim2][L], rmesh, MOT.dr, distance );
-					Interp_dTlm = Interp_dTlm / rl - Interp_Tlm * L / distance;
-				}
-				else Interp_dTlm = 0.0;
-			}
-			
-			for (int m = 0; m < 2*L+1; m++)
-			{
-				int gindex = L*L+m;
-			//	double tmpGaunt = MGT.Get_Gaunt_SH(L1, m1, L2, m2, L, m);
-				double tmpGaunt = MGT.Gaunt_Coefficients (gindex1, gindex2, gindex);
-					
-				tmpKem0 = Interp_Tlm * tmpGaunt;
-				if (job == 1) 
-				{
-					tmpKem1 = Interp_dTlm * tmpGaunt;
-				}
-				
-				switch( job )
-				{
-					case 0:
-					{
-						if(NSPIN!=4) olm[0] += tmpKem0 * rly[ MGT.get_lm_index(L, m) ];
-						else if(olm1 != NULL)
-						{
-							olm1[0] += tmpKem0 * rly[ MGT.get_lm_index(L,m) ];
-							olm1[1] += 0;//tmpKem0 * (tmp(0,0)+tmp(0,1));
-							olm1[2] += 0;//tmpKem0 * (tmp(1,0)+tmp(1,1));
-							olm1[3] += tmpKem0 * rly[ MGT.get_lm_index(L,m) ];
-						}
-						else
-						{
-							WARNING_QUIT("ORB_gen_tables::snap_psipsi","something wrong in T.");
-						}
-						break;
-					}
-					case 1: 
-					{
-						for(int ir = 0; ir < 3; ir++)
-						{
-							olm[ir] += tmpKem0 * grly[ MGT.get_lm_index(L, m) ][ir]
-								    + tmpKem1 * rly[ MGT.get_lm_index(L, m) ] * arr_dR[ir] / distance;
-						}
-						break;
-					}
-					default: break;
-				}
-			}// end T: m
-		}// end T: :
-		break;
-	}
-//	timer::tick ("ORB_gen_tables", "snap_psipsi");
-	return;
-}
-
-double ORB_gen_tables::get_distance( const Vector3<double> &R1, const Vector3<double> &R2)const
-{
-	assert( this->lat0 > 0.0);
-	Vector3<double> dR = R1 - R2;
-	return dR.norm() * this->lat0;	
-}
-
-//caoyu add 2021-03-17
-void ORB_gen_tables::snap_psialpha(
-	double olm[],
-	const int& job,
-	const Vector3<double>& R1,
-	const int& T1,
-	const int& L1,
-	const int& m1,
-	const int& N1,
-	const Vector3<double>& R2,
-	const int& T2,
-	const int& L2,
-	const int& m2,
-	const int& N2) const
-{
-
-	if (job != 0 && job != 1)
-	{
-		WARNING_QUIT("ORB_gen_tables::snap_psialpha", "job must be equal to 0 or 1!");
-	}
-
-	Numerical_Orbital::set_position(R1, R2);
-	assert(this->lat0 > 0.0);
-
-	// (1) get distance between R1 and R2 (a.u.)
-	// judge if there exist overlap
-	double distance = Numerical_Orbital::get_distance() * this->lat0;
-
-	const double Rcut1 = ORB.Phi[T1].getRcut();
-	const double Rcut2 = ORB.Alpha[0].getRcut();
-
-	if (job == 0) ZEROS(olm, 1);
-	else if (job == 1) ZEROS(olm, 3);
-
-	if (distance > (Rcut1 + Rcut2)) return;
-
-	//if distance == 0
-	//\int psi(r) psi(r-R) dr independent of R if R == 0
-	//distance += tiny1 avoid overflow during calculation
-	const double tiny1 = 1e-12;
-	const double tiny2 = 1e-10;
-	if (distance < tiny1) distance += tiny1;
-
-	// (2) if there exist overlap, calculate the mesh number
-	// between two atoms
-	const int rmesh = this->talpha.get_rmesh(Rcut1, Rcut2);
-
-	// (3) Find three dimension of 'Table_DS'
-	// dim1 : type pairs, equal to T1 here 
-	// dim2 : radial orbital pairs,
-	// dim3 : find lmax between T1 and T2, and get lmax*2+1
-	const int dim1 = T1;
-	int dim2 = this->talpha.DS_Opair(dim1, L1, L2, N1, N2);
-	int dim3 = this->talpha.DS_2Lplus1[T1];
-
-	//Gaunt Index
-		const int gindex1 = L1 * L1 + m1;
-	const int gindex2 = L2 * L2 + m2;
-
-	// Peize Lin change rly, grly 2016-08-26
-	vector<double> rly;
-	vector<vector<double>> grly;
-
-	double arr_dR[3];
-	arr_dR[0] = Numerical_Orbital::getX() * this->lat0;
-	arr_dR[1] = Numerical_Orbital::getY() * this->lat0;
-	arr_dR[2] = Numerical_Orbital::getZ() * this->lat0;
-
-	//double xdr = arr_dR[0] / distance;
-	//double ydr = arr_dR[1] / distance;
-	//double zdr = arr_dR[2] / distance;
-
-	//=======================
-	// *r**l*Ylm_real
-	// include its derivations
-	//=======================
-	if (job == 0)
-	{
-		Ylm::rl_sph_harm(dim3 - 1, arr_dR[0], arr_dR[1], arr_dR[2], rly);
-	}
-	else
-	{
-		Ylm::grad_rl_sph_harm(dim3 - 1, arr_dR[0], arr_dR[1], arr_dR[2], rly, grly);
-	}
-
-	for (int L = 0; L < dim3; L++) //maxL = dim3-1
-	{
-		//===========================================================
-		// triangle rule for L and sum of L, L1, L2 should be even
-		//===========================================================
-		int AL = L1 + L2;
-		int SL = abs(L1 - L2);
-
-		if ((L > AL) || (L < SL) || ((L - SL) % 2 == 1)) continue;
-
-		double Interp_Slm = 0.0;
-		double Interp_dSlm = 0.0;
-		double tmpOlm0 = 0.0;
-		double tmpOlm1 = 0.0;
-
-		// prefactor
-		double i_exp = pow(-1.0, (L1 - L2 - L) / 2);
-		double rl = pow(distance, L);
-
-		if (distance > tiny2)
-		{
-			Interp_Slm = i_exp * Mathzone::Polynomial_Interpolation(
-				talpha.Table_DSR[0][dim1][dim2][L], rmesh, MOT.dr, distance);
-			Interp_Slm /= rl;
-		}
-		else // distance = 0.0; 
-		{
-			Interp_Slm = i_exp * talpha.Table_DSR[0][dim1][dim2][L][0];
-		}
-
-		if (job == 1)//calculate the derivative.
-		{
-			if (distance > tiny2)
-			{
-				Interp_dSlm = i_exp * Mathzone::Polynomial_Interpolation(
-					talpha.Table_DSR[1][dim1][dim2][L], rmesh, MOT.dr, distance);
-				Interp_dSlm = Interp_dSlm / pow(distance, L) - Interp_Slm * L / distance;
-			}
-			else
-			{
-				Interp_dSlm = 0.0;
-			}
-		}
-
-		for (int m = 0; m < 2 * L + 1; m++)
-		{
-			int gindex = L * L + m;
-			//			double tmpGaunt1 = MGT.Get_Gaunt_SH(L1, m1, L2, m2, L, m);
-			double tmpGaunt = MGT.Gaunt_Coefficients(gindex1, gindex2, gindex);
-
-			tmpOlm0 = Interp_Slm * tmpGaunt;
-
-			if (job == 1)
-			{
-				tmpOlm1 = Interp_dSlm * tmpGaunt;
-			}
-
-			switch (job)
-			{
-			case 0: // calculate overlap.
-			{
-				if (NSPIN != 4) olm[0] += tmpOlm0 * rly[MGT.get_lm_index(L, m)];
-				else
-				{
-					WARNING_QUIT("ORB_gen_tables::snap_psialpha", "deepks with NSPIN>1 has not implemented yet!");
-				}
-				/*
-				if( abs ( tmpOlm0 * rly[ MGT.get_lm_index(L, m) ] ) > 1.0e-3 )
-				{
-				cout << " L=" << L << " m=" << m << " tmpOlm0=" << tmpOlm0
-				<< " rly=" << rly[ MGT.get_lm_index(L, m) ]
-				<< " r=" << olm[0]
-				<< endl;
-				}
-				*/
-				break;
-			}
-			case 1: // calculate gradient.
-			{
-				for (int ir = 0; ir < 3; ir++)
-				{
-					olm[ir] += tmpOlm0 * grly[MGT.get_lm_index(L, m)][ir]
-						+ tmpOlm1 * rly[MGT.get_lm_index(L, m)] * arr_dR[ir] / distance;
-				}
-				break;
-			}
-			default: break;
-			}
-		}//!m
-	}
-
-	return;
-}
-=======
 #include "src_pw/global.h"
 #include "ORB_read.h"
 #include "ORB_gen_tables.h"
@@ -2026,4 +1017,3 @@ void ORB_gen_tables::snap_psialpha(
 
 	return;
 }
->>>>>>> bb6b769f65ba8d15bff47d34656568675dea341e

From 17ab4b8fc9ee1bc31381cbf745092d6d1f6195c5 Mon Sep 17 00:00:00 2001
From: maki49 <1579492865@qq.com>
Date: Sun, 11 Apr 2021 16:37:08 +0800
Subject: [PATCH 47/60] revert

---
 ABACUS.develop/examples/H2O-deepks-lcao/INPUT | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ABACUS.develop/examples/H2O-deepks-lcao/INPUT b/ABACUS.develop/examples/H2O-deepks-lcao/INPUT
index e9253e2028..94db0eb32e 100644
--- a/ABACUS.develop/examples/H2O-deepks-lcao/INPUT
+++ b/ABACUS.develop/examples/H2O-deepks-lcao/INPUT
@@ -27,3 +27,4 @@ mixing_beta             0.4
 out_band                0
 out_descriptor          1 
 lmax_descriptor			2
+newdm		1
\ No newline at end of file

From 7622d7369dd84ccec2d72d9a331452310fe8f9dc Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Mon, 12 Apr 2021 17:48:53 +0800
Subject: [PATCH 48/60] fix a bug about no initialization of variable

---
 ABACUS.develop/source/src_lcao/LCAO_descriptor.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ABACUS.develop/source/src_lcao/LCAO_descriptor.cpp b/ABACUS.develop/source/src_lcao/LCAO_descriptor.cpp
index f1a7538c2c..c2ced8db93 100644
--- a/ABACUS.develop/source/src_lcao/LCAO_descriptor.cpp
+++ b/ABACUS.develop/source/src_lcao/LCAO_descriptor.cpp
@@ -29,6 +29,7 @@ void LCAO_Descriptor::build_S_descriptor(const bool &calc_deri)
 
     // =======init==============
     // cal n(descriptor) per atom , related to Lmax, nchi(L) and m. (not total_nchi!)
+	this->des_per_atom=0; // mohan add 2021-04-21
     for (int l = 0; l <= ORB.get_lmax_d(); l++)
     {
         this->des_per_atom += ORB.Alpha[0].getNchi(l) * (2 * l + 1);

From 2d8b148c748f457e27175de66b7dd8b0b576bd42 Mon Sep 17 00:00:00 2001
From: 80610702-git <quxin@mail.ustc.edu.cn>
Date: Wed, 14 Apr 2021 09:38:08 +0800
Subject: [PATCH 49/60] DFT+U stress

Fix bug of stress of DFT+U; Change fortran code lscc.f90 to C++
---
 ABACUS.develop/source/Makefile.Objects        |   1 -
 ABACUS.develop/source/src_lcao/dftu_relax.cpp | 395 ++++++++++--------
 .../source/src_lcao/dftu_yukawa.cpp           |  17 +-
 ABACUS.develop/source/src_lcao/dftu_yukawa.h  |   3 +
 4 files changed, 236 insertions(+), 180 deletions(-)

diff --git a/ABACUS.develop/source/Makefile.Objects b/ABACUS.develop/source/Makefile.Objects
index a8c5084e35..99e184b16c 100644
--- a/ABACUS.develop/source/Makefile.Objects
+++ b/ABACUS.develop/source/Makefile.Objects
@@ -227,7 +227,6 @@ variable_cell.o\
 dftu.o\
 dftu_yukawa.o\
 dftu_relax.o\
-lscc.o\
 
 OBJS_COMMON=atom_spec.o \
 unitcell.o \
diff --git a/ABACUS.develop/source/src_lcao/dftu_relax.cpp b/ABACUS.develop/source/src_lcao/dftu_relax.cpp
index 16e3a4d1fb..31424f7d6d 100644
--- a/ABACUS.develop/source/src_lcao/dftu_relax.cpp
+++ b/ABACUS.develop/source/src_lcao/dftu_relax.cpp
@@ -16,15 +16,35 @@
 #include "../src_pw/global.h"
 #include "global_fp.h"
 #include "../src_global/global_function.h"
-#include "../src_global/scalapack_connector.h"
+//#include "../src_global/scalapack_connector.h"
 #include "../src_global/lapack_connector.h"
-#include "../src_pw/inverse_matrix.h"
-#include "local_orbital_ions.h"
-#include "lcao_matrix.h"
+#include "../src_global/inverse_matrix.h"
+#include "LOOP_ions.h"
+#include "LCAO_matrix.h"
 #include "../src_pw/magnetism.h"
-#include "use_overlap_table.h"
+#include "ORB_gen_tables.h"
 #include "../src_pw/charge.h"
 
+extern "C"
+{
+  void pzgemm_(
+		const char *transa, const char *transb,
+		const int *M, const int *N, const int *K,
+		const std::complex<double> *alpha,
+		const std::complex<double> *A, const int *IA, const int *JA, const int *DESCA,
+		const std::complex<double> *B, const int *IB, const int *JB, const int *DESCB,
+		const std::complex<double> *beta,
+		std::complex<double> *C, const int *IC, const int *JC, const int *DESCC);
+  
+  void pdgemm_(
+		const char *transa, const char *transb,
+		const int *M, const int *N, const int *K,
+		const double *alpha,
+		const double *A, const int *IA, const int *JA, const int *DESCA,
+		const double *B, const int *IB, const int *JB, const int *DESCB,
+		const double *beta,
+		double *C, const int *IC, const int *JC, const int *DESCC);
+}
 
 DFTU_RELAX::DFTU_RELAX(){}
 
@@ -144,7 +164,7 @@ void DFTU_RELAX::force_stress()
 				}
 				else
 				{
-					if(NSPIN==1 || NSPIN==4)
+					if(NSPIN==1)
 					{
 						double val = get_onebody_eff_pot(T1, iat1, L1, n1, 0, m1, m2, cal_type, false);
 						VU_k.at(0).at(irc) = complex<double>(val, 0.0);
@@ -191,13 +211,15 @@ void DFTU_RELAX::force_stress()
 }
 
 
-void DFTU_RELAX::cal_force_k(vector<vector<complex<double>>> &VU)
+void DFTU_RELAX::cal_force_k(const vector<vector<complex<double>>>& VU)
 {
 	TITLE("DFTU_RELAX", "cal_force_k");
 
-	const char transN = 'N', transT = 'T';
+	const char transN = 'N', transT = 'T', transC='C';
 	const int  one_int = 1;
-	const double alpha = 1.0, beta = 0.0;
+	// const double alpha = 1.0, beta = 0.0;
+  const complex<double> alpha(1.0,0.0), beta(0.0,0.0);
+  const complex<double> zero(0.0,0.0);
 	
 	vector<vector<complex<double>>> ftmp(ucell.nat);
 	for(int ia=0; ia<ucell.nat; ia++)
@@ -208,7 +230,7 @@ void DFTU_RELAX::cal_force_k(vector<vector<complex<double>>> &VU)
 	vector<vector<complex<double>>> dm_VU_dSm(3);
 	for(int dim=0; dim<3; dim++)
 	{
-		dm_VU_dSm.at(dim).resize(ParaO.nloc, complex<double>(0.0, 0.0));
+		dm_VU_dSm.at(dim).resize(ParaO.nloc, zero);
 	}
 	
 	for(int ik=0; ik<kv.nks; ik++)	
@@ -217,8 +239,8 @@ void DFTU_RELAX::cal_force_k(vector<vector<complex<double>>> &VU)
 
 		for(int dim=0; dim<3; dim++)
 		{
-			vector<complex<double>> mat_tmp(ParaO.nloc, complex<double>(0.0, 0.0));
-			vector<complex<double>> force_tmp(ParaO.nloc, complex<double>(0.0, 0.0));
+			vector<complex<double>> mat_tmp(ParaO.nloc);
+			vector<complex<double>> force_tmp(ParaO.nloc);
 
 			if(dim==0) //dim=1,2 are same as dim=0
 			{
@@ -251,7 +273,7 @@ void DFTU_RELAX::cal_force_k(vector<vector<complex<double>>> &VU)
 			//=========================================
 			ZEROS(VECTOR_TO_PTR(force_tmp), ParaO.nloc);
 
-			pzgemm_(&transN, &transT,
+			pzgemm_(&transN, &transC,
 				&NLOCAL, &NLOCAL, &NLOCAL,
 				&alpha, 
 				this->dSm_k[ik][dim], &one_int, &one_int, ParaO.desc, 
@@ -263,7 +285,7 @@ void DFTU_RELAX::cal_force_k(vector<vector<complex<double>>> &VU)
 			{
 				dm_VU_dSm.at(dim).at(irc) -= force_tmp.at(irc);
 			}
-		}//end dim				
+		}//end dim
 	}//end ik
 
 	for(int dim=0; dim<3; dim++)
@@ -296,35 +318,36 @@ void DFTU_RELAX::cal_force_k(vector<vector<complex<double>>> &VU)
 		}
 	}
 
-
 	return;
 }
 
 
-void DFTU_RELAX::cal_stress_k(vector<vector<complex<double>>> &VU)
+void DFTU_RELAX::cal_stress_k(const vector<vector<complex<double>>>& VU)
 {
 	TITLE("DFTU_RELAX", "cal_stress_k");
 
-	const char transN = 'N', transT = 'T';
+	const char transN = 'N', transT = 'T', transC='C';
 	const int  one_int = 1;
-	const double alpha = 1.0, beta = 0.0;
-	
+	//const double alpha = 1.0, beta = 0.0;
+	const complex<double> alpha(1.0,0.0), beta(0.0,0.0);
+  const complex<double> zero(0.0,0.0);
+  
 	int count = 0;
 	for(int dim1=0; dim1<3; dim1++)
 	{
 		for(int dim2=dim1; dim2<3; dim2++)
 		{
-			vector<complex<double>> dm_VU_sover(ParaO.nloc, complex<double>(0.0, 0.0));
+			vector<complex<double>> dm_VU_sover(ParaO.nloc, zero);
 
 			for(int ik=0; ik<kv.nks; ik++)
 			{
 				const int spin = kv.isk[ik];
 				
 				// The first term
-				vector<complex<double>> stress_tmp(ParaO.nloc, complex<double>(0.0, 0.0));
+				vector<complex<double>> stress_tmp(ParaO.nloc);
 
 				//Calculate mat_tmp=dm*VU
-				vector<complex<double>> mat_tmp(ParaO.nloc, complex<double>(0.0, 0.0));
+				vector<complex<double>> mat_tmp(ParaO.nloc);
 
 				pzgemm_(&transT, &transN,
 					&NLOCAL, &NLOCAL, &NLOCAL,
@@ -344,13 +367,14 @@ void DFTU_RELAX::cal_stress_k(vector<vector<complex<double>>> &VU)
 
 				for(int irc=0; irc<ParaO.nloc; irc++)
 				{
-					dm_VU_sover.at(irc) -= 0.5*stress_tmp.at(irc);
+					// dm_VU_sover.at(irc) -= 0.5*stress_tmp.at(irc);
+          dm_VU_sover.at(irc) -= 0.5*stress_tmp.at(irc);
 				}
 
 				// The second term
 				ZEROS(VECTOR_TO_PTR(stress_tmp), ParaO.nloc);
 
-				pzgemm_(&transN, &transT,
+				pzgemm_(&transN, &transC,
 					&NLOCAL, &NLOCAL, &NLOCAL,
 					&alpha, 
 					this->soverlap_k[ik][count], &one_int, &one_int, ParaO.desc, 
@@ -360,7 +384,8 @@ void DFTU_RELAX::cal_stress_k(vector<vector<complex<double>>> &VU)
 
 				for(int irc=0; irc<ParaO.nloc; irc++)
 				{
-					dm_VU_sover.at(irc) += 0.5*stress_tmp.at(irc);
+					// dm_VU_sover.at(irc) += 0.5*stress_tmp.at(irc);
+          dm_VU_sover.at(irc) -= 0.5*stress_tmp.at(irc);
 				}
 
 			}//end ik
@@ -385,8 +410,8 @@ void DFTU_RELAX::cal_stress_k(vector<vector<complex<double>>> &VU)
 			double val = stmp.real();
 			MPI_Allreduce(&val, &stress_dftu.at(dim1).at(dim2), 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 
-			complex<double> tmp;
-			MPI_Allreduce(&stmp, &tmp, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+			// complex<double> tmp;
+			// MPI_Allreduce(&stmp, &tmp, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 						
 			count++;
 		}//end dim2
@@ -413,7 +438,7 @@ void DFTU_RELAX::cal_stress_k(vector<vector<complex<double>>> &VU)
 }
 
 
-void DFTU_RELAX::cal_force_gamma(vector<vector<double>> &VU)
+void DFTU_RELAX::cal_force_gamma(const vector<vector<double>> &VU)
 {
 	TITLE("DFTU_RELAX", "cal_force_gamma");
 
@@ -578,7 +603,7 @@ void DFTU_RELAX::cal_force_gamma(vector<vector<double>> &VU)
 }
 
 
-void DFTU_RELAX::cal_stress_gamma(vector<vector<double>> &VU)
+void DFTU_RELAX::cal_stress_gamma(const vector<vector<double>> &VU)
 {
 	TITLE("DFTU_RELAX", "cal_stress_gamma");
 
@@ -637,7 +662,7 @@ void DFTU_RELAX::cal_stress_gamma(vector<vector<double>> &VU)
 
 				for(int irc=0; irc<ParaO.nloc; irc++)
 				{
-					dm_VU_sover.at(irc) += 0.5*stress_tmp.at(irc);
+					dm_VU_sover.at(irc) -= 0.5*stress_tmp.at(irc);
 				}
 
 			}//end ik
@@ -727,145 +752,179 @@ void DFTU_RELAX::folding_dSm_soverlap()
 	}
 	
 
-	Vector3<double> tau1, tau2, dtau;
-	Vector3<double> dtau1, dtau2, tau0;
+	  Vector3<double> tau1, tau2, dtau;
+	  Vector3<double> dtau1, dtau2, tau0;
     for(int T1=0; T1<ucell.ntype; ++T1)
     {
-		Atom* atom1 = &ucell.atoms[T1];
-        for(int I1=0; I1<atom1->na; ++I1)
+		  Atom* atom1 = &ucell.atoms[T1];
+      for(int I1=0; I1<atom1->na; ++I1)
+      {
+			  tau1 = atom1->tau[I1];
+        const int start1 = ucell.itiaiw2iwt(T1,I1,0);    
+
+        GridD.Find_atom(tau1, T1, I1);
+        for(int ad=0; ad<GridD.getAdjacentNum()+1; ++ad)
         {
-			tau1 = atom1->tau[I1];
-            
-            GridD.Find_atom(tau1, T1, I1);
-            for(int ad=0; ad<GridD.getAdjacentNum()+1; ++ad)
-            {
-                const int T2 = GridD.getType(ad);
-				const int I2 = GridD.getNatom(ad);
-
-				Atom* atom2 = &ucell.atoms[T2];
-
-				tau2 = GridD.getAdjacentTau(ad);
-				dtau = tau2 - tau1;
-
-				double distance = dtau.norm() * ucell.lat0;
-				double rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut();				
-
-				if(distance < rcut)
-				{
-					int iw1_all = ucell.itiaiw2iwt( T1, I1, 0) ; //iw1_all = combined index (it, ia, iw)
-
-					for(int jj=0; jj<atom1->nw*NPOL; ++jj)
-					{
-						const int jj0 = jj/NPOL;
-						const int L1 = atom1->iw2l[jj0];
-						const int N1 = atom1->iw2n[jj0];
-						const int m1 = atom1->iw2m[jj0];
-						int iw2_all = ucell.itiaiw2iwt( T2, I2, 0);
-
-						for(int kk=0; kk<atom2->nw*NPOL; ++kk)
-						{
-							const int kk0 = kk/NPOL;
-							const int L2 = atom2->iw2l[kk0];
-							const int N2 = atom2->iw2n[kk0];
-							const int m2 = atom2->iw2m[kk0];
-							
-							if ( !ParaO.in_this_processor(iw1_all,iw2_all) )
-							{
-								++iw2_all;
-								continue;
-							}
-
-							int mu = ParaO.trace_loc_row[iw1_all];
-							int nu = ParaO.trace_loc_col[iw2_all];
-							int irc = nu*ParaO.nrow + mu;
-														
-							if(GAMMA_ONLY_LOCAL)
-							{
-								if(STRESS)
-								{
-									this->soverlap_gamma[0][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+0];
-									this->soverlap_gamma[1][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+1];
-									this->soverlap_gamma[2][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+2];
-									this->soverlap_gamma[3][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+1];
-									this->soverlap_gamma[4][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+2];
-									this->soverlap_gamma[5][irc] += LM.DSloc_Rz[nnr]*LM.DH_r[nnr*3+2];
-								}
-							}
-							else
-							{
-								Vector3<double> dR(GridD.getBox(ad).x, GridD.getBox(ad).y, GridD.getBox(ad).z); 
-							
-								for(int ik=0; ik<kv.nks; ik++)
-								{								
-									const double arg = ( kv.kvec_d[ik] * dR ) * TWO_PI;
-									const complex<double> kphase = complex <double> ( cos(arg),  sin(arg) );
-
-									this->dSm_k[ik][0][irc] += LM.DSloc_Rx[nnr]*kphase;
-									this->dSm_k[ik][1][irc] += LM.DSloc_Ry[nnr]*kphase;
-									this->dSm_k[ik][2][irc] += LM.DSloc_Rz[nnr]*kphase;
-
-									if(STRESS)
-									{																												
-										this->soverlap_k[ik][0][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+0]*kphase;
-										this->soverlap_k[ik][1][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+1]*kphase;
-										this->soverlap_k[ik][2][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+2]*kphase;
-										this->soverlap_k[ik][3][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+1]*kphase;
-										this->soverlap_k[ik][4][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+2]*kphase;
-										this->soverlap_k[ik][5][irc] += LM.DSloc_Rz[nnr]*LM.DH_r[nnr*3+2]*kphase;																
-									}
-								}	
-							}
-																																																																				
-							++nnr;													
-							++iw2_all;
-						}// nw2 
-
-						++iw1_all;
-						
-					}// nw1
-				}// distance
-				else if(distance>=rcut)
-				{
-					int start1 = ucell.itiaiw2iwt( T1, I1, 0);
-					int start2 = ucell.itiaiw2iwt( T2, I2, 0);
-					bool is_adj = false;
-					for (int ad0=0; ad0<GridD.getAdjacentNum()+1; ++ad0)
-					{
-						const int T0 = GridD.getType(ad0);
-						
-						tau0 = GridD.getAdjacentTau(ad0);
-						dtau1 = tau0 - tau1;
-						double distance1 = dtau1.norm() * ucell.lat0;
-						double rcut1 = ORB.Phi[T1].getRcut() + ORB.Beta[T0].get_rcut_max();
-						dtau2 = tau0 - tau2;
-						double distance2 = dtau2.norm() * ucell.lat0;
-						double rcut2 = ORB.Phi[T2].getRcut() + ORB.Beta[T0].get_rcut_max();
-						if(distance1<rcut1 && distance2<rcut2)
-						{
-							is_adj = true;
-							break;
-						}
-					}//ad0
-					if( is_adj )
-					{
-						for(int jj=0; jj<atom1->nw * NPOL; ++jj)
-						{
-							const int mu = ParaO.trace_loc_row[start1+jj];
-							if(mu<0)continue; 
-
-							for(int kk=0; kk<atom2->nw * NPOL; ++kk)
-							{
-								const int nu = ParaO.trace_loc_col[start2+kk];
-								if(nu<0)continue;
-
-								++nnr;
-							}//kk
-						}//jj
-					}
-				}//distance
-			}// ad
-		}// I1
-	}// T1
+          const int T2 = GridD.getType(ad);
+				  const int I2 = GridD.getNatom(ad);
+          const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
+
+				  Atom* atom2 = &ucell.atoms[T2];
+
+				  tau2 = GridD.getAdjacentTau(ad);
+				  dtau = tau2 - tau1;
+
+				  double distance = dtau.norm() * ucell.lat0;
+				  double rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut();
+
+          bool adj = false;
+				  if(distance < rcut) adj = true;
+				  else if(distance >= rcut)
+				  {
+				  	for (int ad0 = 0; ad0 < GridD.getAdjacentNum()+1; ++ad0)
+				  	{
+				  		const int T0 = GridD.getType(ad0); 
+				  		const int I0 = GridD.getNatom(ad0); 
+				  		const int iat0 = ucell.itia2iat(T0, I0);
+				  		const int start0 = ucell.itiaiw2iwt(T0, I0, 0);
+
+				  		tau0 = GridD.getAdjacentTau(ad0);
+				  		dtau1 = tau0 - tau1;
+				  		dtau2 = tau0 - tau2;
+
+				  		double distance1 = dtau1.norm() * ucell.lat0;
+				  		double distance2 = dtau2.norm() * ucell.lat0;
+
+				  		double rcut1 = ORB.Phi[T1].getRcut() + ORB.Beta[T0].get_rcut_max();
+				  		double rcut2 = ORB.Phi[T2].getRcut() + ORB.Beta[T0].get_rcut_max();
+
+				  		if( distance1 < rcut1 && distance2 < rcut2 )
+				  		{
+				  			adj = true;
+				  			break;
+				  		}
+				  	}
+				  }				
+
+				  if(adj)
+				  {
+				  	for(int jj=0; jj<atom1->nw*NPOL; ++jj)
+				  	{
+              const int jj0 = jj/NPOL;
+
+              const int iw1_all = start1 + jj0; 
+              const int mu = ParaO.trace_loc_row[iw1_all];
+					    if(mu<0)continue;
+
+				  		const int L1 = atom1->iw2l[jj0];
+				  		const int N1 = atom1->iw2n[jj0];
+				  		const int m1 = atom1->iw2m[jj0];
+
+
+				  		for(int kk=0; kk<atom2->nw*NPOL; ++kk)
+				  		{
+                const int kk0 = kk/NPOL;
+
+                const int iw2_all = start2 + kk0;
+						    const int nu = ParaO.trace_loc_col[iw2_all];
+						    if(nu<0)continue;
+
+				  			const int L2 = atom2->iw2l[kk0];
+				  			const int N2 = atom2->iw2n[kk0];
+				  			const int m2 = atom2->iw2m[kk0];
+  
+				  			// if ( !ParaO.in_this_processor(iw1_all,iw2_all) )
+				  			// {
+				  				// ++iw2_all;
+				  				// continue;
+				  			// }
+
+				  			// int mu = ParaO.trace_loc_row[iw1_all];
+				  			// int nu = ParaO.trace_loc_col[iw2_all];
+				  			int irc = nu*ParaO.nrow + mu;
+  
+				  			if(GAMMA_ONLY_LOCAL)
+							  {
+							  	if(STRESS)
+							  	{
+							  		this->soverlap_gamma[0][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+0];
+							  		this->soverlap_gamma[1][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+1];
+							  		this->soverlap_gamma[2][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+2];
+							  		this->soverlap_gamma[3][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+1];
+							  		this->soverlap_gamma[4][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+2];
+							  		this->soverlap_gamma[5][irc] += LM.DSloc_Rz[nnr]*LM.DH_r[nnr*3+2];
+							  	}
+							  }
+				  			else
+				  			{
+				  				Vector3<double> dR(GridD.getBox(ad).x, GridD.getBox(ad).y, GridD.getBox(ad).z); 
+  
+				  				for(int ik=0; ik<kv.nks; ik++)
+				  				{
+				  					const double arg = ( kv.kvec_d[ik] * dR ) * TWO_PI;
+				  					const complex<double> kphase( cos(arg),  sin(arg) );
+
+				  					this->dSm_k[ik][0][irc] += LM.DSloc_Rx[nnr]*kphase;
+				  					this->dSm_k[ik][1][irc] += LM.DSloc_Ry[nnr]*kphase;
+				  					this->dSm_k[ik][2][irc] += LM.DSloc_Rz[nnr]*kphase;
+
+				  					if(STRESS)
+				  					{		
+				  						this->soverlap_k[ik][0][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+0]*kphase;
+				  						this->soverlap_k[ik][1][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+1]*kphase;
+				  						this->soverlap_k[ik][2][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+2]*kphase;
+				  						this->soverlap_k[ik][3][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+1]*kphase;
+				  						this->soverlap_k[ik][4][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+2]*kphase;
+				  						this->soverlap_k[ik][5][irc] += LM.DSloc_Rz[nnr]*LM.DH_r[nnr*3+2]*kphase;																
+				  					}
+				  				}
+				  			}
+				  			++nnr;
+				  		}// kk
+				    }// jj
+				  }// adj
+				  // else if(distance>=rcut)
+				  // {
+				  	// int start1 = ucell.itiaiw2iwt( T1, I1, 0);
+				  	// int start2 = ucell.itiaiw2iwt( T2, I2, 0);
+				  	// bool is_adj = false;
+				  	// for (int ad0=0; ad0<GridD.getAdjacentNum()+1; ++ad0)
+				  	// {
+				  	// 	const int T0 = GridD.getType(ad0);
+				  		
+				  	// 	tau0 = GridD.getAdjacentTau(ad0);
+				  	// 	dtau1 = tau0 - tau1;
+				  	// 	double distance1 = dtau1.norm() * ucell.lat0;
+				  	// 	double rcut1 = ORB.Phi[T1].getRcut() + ORB.Beta[T0].get_rcut_max();
+				  	// 	dtau2 = tau0 - tau2;
+				  	// 	double distance2 = dtau2.norm() * ucell.lat0;
+				  	// 	double rcut2 = ORB.Phi[T2].getRcut() + ORB.Beta[T0].get_rcut_max();
+				  	// 	if(distance1<rcut1 && distance2<rcut2)
+				  	// 	{
+				  	// 		is_adj = true;
+				  	// 		break;
+				  	// 	}
+				  	// }//ad0
+				  	// if( is_adj )
+				  	// {
+				  // 		for(int jj=0; jj<atom1->nw * NPOL; ++jj)
+				  // 		{
+				  // 			const int mu = ParaO.trace_loc_row[start1+jj];
+				  // 			if(mu<0) continue; 
+
+				  // 			for(int kk=0; kk<atom2->nw * NPOL; ++kk)
+				  // 			{
+				  // 				const int nu = ParaO.trace_loc_col[start2+kk];
+				  // 				if(nu<0) continue;
+
+				  // 				++nnr;
+				  // 			}//kk
+				  // 		}//jj
+				  // 	// }
+				  // }//distance
+			  }// ad
+		  }// I1
+	  }// T1
 
 	return;
 }
@@ -944,7 +1003,7 @@ void DFTU_RELAX::erase_force_stress()
 				delete [] soverlap_gamma[i];
 			}
 			delete [] soverlap_gamma;
-
+      soverlap_gamma=nullptr;
 		}
 	}
 	else
@@ -962,6 +1021,7 @@ void DFTU_RELAX::erase_force_stress()
 			delete [] dSm_k[ik];
 		}
 		delete [] dSm_k;
+    dSm_k = nullptr;
 
 		if(STRESS)
 		{
@@ -978,6 +1038,7 @@ void DFTU_RELAX::erase_force_stress()
 				delete [] soverlap_k[ik];
 			}
 			delete [] soverlap_k;
+      soverlap_k = nullptr;
 		}
 	}
 			
diff --git a/ABACUS.develop/source/src_lcao/dftu_yukawa.cpp b/ABACUS.develop/source/src_lcao/dftu_yukawa.cpp
index c6707678fb..fd214580d0 100644
--- a/ABACUS.develop/source/src_lcao/dftu_yukawa.cpp
+++ b/ABACUS.develop/source/src_lcao/dftu_yukawa.cpp
@@ -20,13 +20,6 @@
 #include "local_orbital_ions.h"
 #include "lcao_matrix.h"
 
-
-extern "C"
-{
-	void sphbsl_(int *n, double *r, double *A, double *val);
-	void sphhnk_(int *n, double *r, double *A, double *val);
-}
-
 DFTU_Yukawa::DFTU_Yukawa(){}
 
 DFTU_Yukawa::~DFTU_Yukawa(){}
@@ -95,14 +88,14 @@ void DFTU_Yukawa::cal_slater_Fk(const int L, const int T)
 						int l = 2*k;
 						if(ir0<ir1)  //less than
 						{
-						 	sphbsl_(&l, &r0, &lambda, &bslval);
-							sphhnk_(&l, &r1, &lambda, &hnkval);
+						 	bslval=this->spherical_Bessel(l, r0, lambda);
+							hnkval=this->spherical_Hankel(l, r1, lambda);
 						}
 						else //greater than
 						{
-						 	sphbsl_(&l, &r1, &lambda, &bslval);
-							sphhnk_(&l, &r0, &lambda, &hnkval);
-						}					
+						 	bslval=this->spherical_Bessel(l, r1, lambda);
+							hnkval=this->spherical_Hankel(l, r0, lambda);
+						}				
 						this->Fk.at(T).at(L).at(chi).at(k) -= (4*k+1)*lambda*pow(R_L0,2)*bslval*hnkval*pow(R_L1,2)*pow(r0,2)*pow(r1,2)*rab0*rab1;					
 					}
 				}
diff --git a/ABACUS.develop/source/src_lcao/dftu_yukawa.h b/ABACUS.develop/source/src_lcao/dftu_yukawa.h
index 8ed5791cd2..49c7813a1f 100644
--- a/ABACUS.develop/source/src_lcao/dftu_yukawa.h
+++ b/ABACUS.develop/source/src_lcao/dftu_yukawa.h
@@ -23,6 +23,9 @@ class DFTU_Yukawa
     void cal_yukawa_lambda();
     void cal_slater_UJ(const int istep, const int iter);
 
+    double spherical_Bessel(const int k, const double r, const double lambda);
+    double spherical_Hankel(const int k, const double r, const double lambda);
+
     //void cal_unscreened_slater_Fk(const int L, const int T); //L:angular momnet, T:atom type
     //void cal_slater_Vsc(const int T, const int L);
 

From e2db1f09dba8dac91fa63b9424a71960b0595d6a Mon Sep 17 00:00:00 2001
From: 80610702-git <quxin@mail.ustc.edu.cn>
Date: Wed, 14 Apr 2021 09:39:02 +0800
Subject: [PATCH 50/60] Delete lscc.f

---
 ABACUS.develop/source/src_lcao/lscc.f | 87 ---------------------------
 1 file changed, 87 deletions(-)
 delete mode 100644 ABACUS.develop/source/src_lcao/lscc.f

diff --git a/ABACUS.develop/source/src_lcao/lscc.f b/ABACUS.develop/source/src_lcao/lscc.f
deleted file mode 100644
index 492c941ca9..0000000000
--- a/ABACUS.develop/source/src_lcao/lscc.f
+++ /dev/null
@@ -1,87 +0,0 @@
-      subroutine sphbsl(n,r,A,val) 
-        integer :: n
-        real*8 :: r,A
-        real*8 :: x,val
-        x = r*A
-        if (n .eq. 0) then
-        
-          if ( x .lt. 1.d-3 ) then
-            val = 1 + x**2/6
-          else
-            val = dsinh(x)/x
-          end if
-        else if (n .eq. 2) then
-        
-          if ( x .lt. 1.d-2 ) then
-            val = -x**2/15 -x**4/210 - x**6/7560
-          else
-            val = 3*dcosh(x)/x**2 + (-3-x**2)*dsinh(x)/x**3
-          end if
-        
-        else if (n .eq. 4) then
-
-          if( x .lt. 5.d-1)then
-            val = x**4/945 + x**6/20790 + x**8/1081080 + x**10/97297200
-          else
-            val = -5*(21+2*x**2)*dcosh(x)/x**4+(105+45*x**2+x**4)*
-     &       dsinh(x)/x**5
-          end if
-        
-        else if (n .eq. 6) then
-        
-          if ( x .lt. 9.d-1) then
-            val = -x**6/135135-x**8/4054050-x**10/275675400
-          else
-            val = 21*(495+60*x**2+x**4)*dcosh(x)/x**6 +
-     &       (-10395-4725*x**2-210*x**4-x**6)*dsinh(x)/x**7
-          end if
-        
-        else
-        end if
-      END subroutine sphbsl
-
-      subroutine sphhnk(n,r,A,val)
-        integer :: n
-        real*8 :: r,A
-        real*8 :: x,val
-        x = r*A
-        if (n .eq. 0) then
-        
-          if ( x .lt. 1.d-3 ) then
-            val = -1/x + 1 -x/2 + x**2/6
-          else
-            val = -dexp(-x)/x
-          endif
-        
-        else if (n .eq. 2) then
-        
-          if ( x .lt. 1.d-2) then
-            val = 3/x**3-1/(2*x)+x/8-x**2/15+x**3/48
-          else
-            val = dexp(-x)*(3+3*x+x**2)/x**3
-          endif
-        
-        else if (n .eq. 4) then
-        
-          if (x .lt. 5.d-1) then
-            val = -105/x**5 + 15/(2*x**3) - 3/(8*x) + x/48 - x**3/384 
-     &        +x**4/945
-          else
-            val = -dexp(-x)*(105+105*x+45*x**2+10*x**3+x**4)/x**5
-          endif
-        
-        else if (n .eq. 6) then
-
-          if (x .lt. 9.d-1) then
-            val = 10395/x**7 - 945/(2*x**5) + 105/(8*x**3) -5/(16*x) + 
-     &            x/128-x**3/3840 + x**5/46080 - x**6/135135
-          else
-            val = dexp(-x)*(10395+10395*x+4725*x**2+1260*x**3+210*x**
-     &       4+21*x**5+x**6)/x**7
-          endif
-        
-        else
-        endif
-      END SUBROUTINE sphhnk
-
-

From 5d8b8389e8c72651632732edf5020772606fed6c Mon Sep 17 00:00:00 2001
From: dyzheng <49852742+dyzheng@users.noreply.github.com>
Date: Fri, 16 Apr 2021 13:02:33 +0800
Subject: [PATCH 51/60] Revert "DFT+U"

---
 ABACUS.develop/source/Makefile.Objects        |   1 +
 ABACUS.develop/source/src_lcao/dftu_relax.cpp | 387 ++++++++----------
 .../source/src_lcao/dftu_yukawa.cpp           |  17 +-
 ABACUS.develop/source/src_lcao/dftu_yukawa.h  |   3 -
 ABACUS.develop/source/src_lcao/lscc.f         |  87 ++++
 5 files changed, 263 insertions(+), 232 deletions(-)
 create mode 100644 ABACUS.develop/source/src_lcao/lscc.f

diff --git a/ABACUS.develop/source/Makefile.Objects b/ABACUS.develop/source/Makefile.Objects
index 10825db039..b60de00435 100644
--- a/ABACUS.develop/source/Makefile.Objects
+++ b/ABACUS.develop/source/Makefile.Objects
@@ -264,6 +264,7 @@ variable_cell.o\
 dftu.o\
 dftu_yukawa.o\
 dftu_relax.o\
+lscc.o\
 
 OBJS_COMMON=atom_spec.o \
 unitcell.o \
diff --git a/ABACUS.develop/source/src_lcao/dftu_relax.cpp b/ABACUS.develop/source/src_lcao/dftu_relax.cpp
index 31424f7d6d..9051fd32a1 100644
--- a/ABACUS.develop/source/src_lcao/dftu_relax.cpp
+++ b/ABACUS.develop/source/src_lcao/dftu_relax.cpp
@@ -16,7 +16,7 @@
 #include "../src_pw/global.h"
 #include "global_fp.h"
 #include "../src_global/global_function.h"
-//#include "../src_global/scalapack_connector.h"
+#include "../src_global/scalapack_connector.h"
 #include "../src_global/lapack_connector.h"
 #include "../src_global/inverse_matrix.h"
 #include "LOOP_ions.h"
@@ -25,26 +25,6 @@
 #include "ORB_gen_tables.h"
 #include "../src_pw/charge.h"
 
-extern "C"
-{
-  void pzgemm_(
-		const char *transa, const char *transb,
-		const int *M, const int *N, const int *K,
-		const std::complex<double> *alpha,
-		const std::complex<double> *A, const int *IA, const int *JA, const int *DESCA,
-		const std::complex<double> *B, const int *IB, const int *JB, const int *DESCB,
-		const std::complex<double> *beta,
-		std::complex<double> *C, const int *IC, const int *JC, const int *DESCC);
-  
-  void pdgemm_(
-		const char *transa, const char *transb,
-		const int *M, const int *N, const int *K,
-		const double *alpha,
-		const double *A, const int *IA, const int *JA, const int *DESCA,
-		const double *B, const int *IB, const int *JB, const int *DESCB,
-		const double *beta,
-		double *C, const int *IC, const int *JC, const int *DESCC);
-}
 
 DFTU_RELAX::DFTU_RELAX(){}
 
@@ -164,7 +144,7 @@ void DFTU_RELAX::force_stress()
 				}
 				else
 				{
-					if(NSPIN==1)
+					if(NSPIN==1 || NSPIN==4)
 					{
 						double val = get_onebody_eff_pot(T1, iat1, L1, n1, 0, m1, m2, cal_type, false);
 						VU_k.at(0).at(irc) = complex<double>(val, 0.0);
@@ -211,15 +191,13 @@ void DFTU_RELAX::force_stress()
 }
 
 
-void DFTU_RELAX::cal_force_k(const vector<vector<complex<double>>>& VU)
+void DFTU_RELAX::cal_force_k(vector<vector<complex<double>>> &VU)
 {
 	TITLE("DFTU_RELAX", "cal_force_k");
 
-	const char transN = 'N', transT = 'T', transC='C';
+	const char transN = 'N', transT = 'T';
 	const int  one_int = 1;
-	// const double alpha = 1.0, beta = 0.0;
-  const complex<double> alpha(1.0,0.0), beta(0.0,0.0);
-  const complex<double> zero(0.0,0.0);
+	const double alpha = 1.0, beta = 0.0;
 	
 	vector<vector<complex<double>>> ftmp(ucell.nat);
 	for(int ia=0; ia<ucell.nat; ia++)
@@ -230,7 +208,7 @@ void DFTU_RELAX::cal_force_k(const vector<vector<complex<double>>>& VU)
 	vector<vector<complex<double>>> dm_VU_dSm(3);
 	for(int dim=0; dim<3; dim++)
 	{
-		dm_VU_dSm.at(dim).resize(ParaO.nloc, zero);
+		dm_VU_dSm.at(dim).resize(ParaO.nloc, complex<double>(0.0, 0.0));
 	}
 	
 	for(int ik=0; ik<kv.nks; ik++)	
@@ -239,8 +217,8 @@ void DFTU_RELAX::cal_force_k(const vector<vector<complex<double>>>& VU)
 
 		for(int dim=0; dim<3; dim++)
 		{
-			vector<complex<double>> mat_tmp(ParaO.nloc);
-			vector<complex<double>> force_tmp(ParaO.nloc);
+			vector<complex<double>> mat_tmp(ParaO.nloc, complex<double>(0.0, 0.0));
+			vector<complex<double>> force_tmp(ParaO.nloc, complex<double>(0.0, 0.0));
 
 			if(dim==0) //dim=1,2 are same as dim=0
 			{
@@ -273,7 +251,7 @@ void DFTU_RELAX::cal_force_k(const vector<vector<complex<double>>>& VU)
 			//=========================================
 			ZEROS(VECTOR_TO_PTR(force_tmp), ParaO.nloc);
 
-			pzgemm_(&transN, &transC,
+			pzgemm_(&transN, &transT,
 				&NLOCAL, &NLOCAL, &NLOCAL,
 				&alpha, 
 				this->dSm_k[ik][dim], &one_int, &one_int, ParaO.desc, 
@@ -285,7 +263,7 @@ void DFTU_RELAX::cal_force_k(const vector<vector<complex<double>>>& VU)
 			{
 				dm_VU_dSm.at(dim).at(irc) -= force_tmp.at(irc);
 			}
-		}//end dim
+		}//end dim				
 	}//end ik
 
 	for(int dim=0; dim<3; dim++)
@@ -318,36 +296,35 @@ void DFTU_RELAX::cal_force_k(const vector<vector<complex<double>>>& VU)
 		}
 	}
 
+
 	return;
 }
 
 
-void DFTU_RELAX::cal_stress_k(const vector<vector<complex<double>>>& VU)
+void DFTU_RELAX::cal_stress_k(vector<vector<complex<double>>> &VU)
 {
 	TITLE("DFTU_RELAX", "cal_stress_k");
 
-	const char transN = 'N', transT = 'T', transC='C';
+	const char transN = 'N', transT = 'T';
 	const int  one_int = 1;
-	//const double alpha = 1.0, beta = 0.0;
-	const complex<double> alpha(1.0,0.0), beta(0.0,0.0);
-  const complex<double> zero(0.0,0.0);
-  
+	const double alpha = 1.0, beta = 0.0;
+	
 	int count = 0;
 	for(int dim1=0; dim1<3; dim1++)
 	{
 		for(int dim2=dim1; dim2<3; dim2++)
 		{
-			vector<complex<double>> dm_VU_sover(ParaO.nloc, zero);
+			vector<complex<double>> dm_VU_sover(ParaO.nloc, complex<double>(0.0, 0.0));
 
 			for(int ik=0; ik<kv.nks; ik++)
 			{
 				const int spin = kv.isk[ik];
 				
 				// The first term
-				vector<complex<double>> stress_tmp(ParaO.nloc);
+				vector<complex<double>> stress_tmp(ParaO.nloc, complex<double>(0.0, 0.0));
 
 				//Calculate mat_tmp=dm*VU
-				vector<complex<double>> mat_tmp(ParaO.nloc);
+				vector<complex<double>> mat_tmp(ParaO.nloc, complex<double>(0.0, 0.0));
 
 				pzgemm_(&transT, &transN,
 					&NLOCAL, &NLOCAL, &NLOCAL,
@@ -367,14 +344,13 @@ void DFTU_RELAX::cal_stress_k(const vector<vector<complex<double>>>& VU)
 
 				for(int irc=0; irc<ParaO.nloc; irc++)
 				{
-					// dm_VU_sover.at(irc) -= 0.5*stress_tmp.at(irc);
-          dm_VU_sover.at(irc) -= 0.5*stress_tmp.at(irc);
+					dm_VU_sover.at(irc) -= 0.5*stress_tmp.at(irc);
 				}
 
 				// The second term
 				ZEROS(VECTOR_TO_PTR(stress_tmp), ParaO.nloc);
 
-				pzgemm_(&transN, &transC,
+				pzgemm_(&transN, &transT,
 					&NLOCAL, &NLOCAL, &NLOCAL,
 					&alpha, 
 					this->soverlap_k[ik][count], &one_int, &one_int, ParaO.desc, 
@@ -384,8 +360,7 @@ void DFTU_RELAX::cal_stress_k(const vector<vector<complex<double>>>& VU)
 
 				for(int irc=0; irc<ParaO.nloc; irc++)
 				{
-					// dm_VU_sover.at(irc) += 0.5*stress_tmp.at(irc);
-          dm_VU_sover.at(irc) -= 0.5*stress_tmp.at(irc);
+					dm_VU_sover.at(irc) += 0.5*stress_tmp.at(irc);
 				}
 
 			}//end ik
@@ -410,8 +385,8 @@ void DFTU_RELAX::cal_stress_k(const vector<vector<complex<double>>>& VU)
 			double val = stmp.real();
 			MPI_Allreduce(&val, &stress_dftu.at(dim1).at(dim2), 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 
-			// complex<double> tmp;
-			// MPI_Allreduce(&stmp, &tmp, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+			complex<double> tmp;
+			MPI_Allreduce(&stmp, &tmp, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 						
 			count++;
 		}//end dim2
@@ -438,7 +413,7 @@ void DFTU_RELAX::cal_stress_k(const vector<vector<complex<double>>>& VU)
 }
 
 
-void DFTU_RELAX::cal_force_gamma(const vector<vector<double>> &VU)
+void DFTU_RELAX::cal_force_gamma(vector<vector<double>> &VU)
 {
 	TITLE("DFTU_RELAX", "cal_force_gamma");
 
@@ -603,7 +578,7 @@ void DFTU_RELAX::cal_force_gamma(const vector<vector<double>> &VU)
 }
 
 
-void DFTU_RELAX::cal_stress_gamma(const vector<vector<double>> &VU)
+void DFTU_RELAX::cal_stress_gamma(vector<vector<double>> &VU)
 {
 	TITLE("DFTU_RELAX", "cal_stress_gamma");
 
@@ -662,7 +637,7 @@ void DFTU_RELAX::cal_stress_gamma(const vector<vector<double>> &VU)
 
 				for(int irc=0; irc<ParaO.nloc; irc++)
 				{
-					dm_VU_sover.at(irc) -= 0.5*stress_tmp.at(irc);
+					dm_VU_sover.at(irc) += 0.5*stress_tmp.at(irc);
 				}
 
 			}//end ik
@@ -752,179 +727,145 @@ void DFTU_RELAX::folding_dSm_soverlap()
 	}
 	
 
-	  Vector3<double> tau1, tau2, dtau;
-	  Vector3<double> dtau1, dtau2, tau0;
+	Vector3<double> tau1, tau2, dtau;
+	Vector3<double> dtau1, dtau2, tau0;
     for(int T1=0; T1<ucell.ntype; ++T1)
     {
-		  Atom* atom1 = &ucell.atoms[T1];
-      for(int I1=0; I1<atom1->na; ++I1)
-      {
-			  tau1 = atom1->tau[I1];
-        const int start1 = ucell.itiaiw2iwt(T1,I1,0);    
-
-        GridD.Find_atom(tau1, T1, I1);
-        for(int ad=0; ad<GridD.getAdjacentNum()+1; ++ad)
+		Atom* atom1 = &ucell.atoms[T1];
+        for(int I1=0; I1<atom1->na; ++I1)
         {
-          const int T2 = GridD.getType(ad);
-				  const int I2 = GridD.getNatom(ad);
-          const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
-
-				  Atom* atom2 = &ucell.atoms[T2];
-
-				  tau2 = GridD.getAdjacentTau(ad);
-				  dtau = tau2 - tau1;
-
-				  double distance = dtau.norm() * ucell.lat0;
-				  double rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut();
-
-          bool adj = false;
-				  if(distance < rcut) adj = true;
-				  else if(distance >= rcut)
-				  {
-				  	for (int ad0 = 0; ad0 < GridD.getAdjacentNum()+1; ++ad0)
-				  	{
-				  		const int T0 = GridD.getType(ad0); 
-				  		const int I0 = GridD.getNatom(ad0); 
-				  		const int iat0 = ucell.itia2iat(T0, I0);
-				  		const int start0 = ucell.itiaiw2iwt(T0, I0, 0);
-
-				  		tau0 = GridD.getAdjacentTau(ad0);
-				  		dtau1 = tau0 - tau1;
-				  		dtau2 = tau0 - tau2;
-
-				  		double distance1 = dtau1.norm() * ucell.lat0;
-				  		double distance2 = dtau2.norm() * ucell.lat0;
-
-				  		double rcut1 = ORB.Phi[T1].getRcut() + ORB.Beta[T0].get_rcut_max();
-				  		double rcut2 = ORB.Phi[T2].getRcut() + ORB.Beta[T0].get_rcut_max();
-
-				  		if( distance1 < rcut1 && distance2 < rcut2 )
-				  		{
-				  			adj = true;
-				  			break;
-				  		}
-				  	}
-				  }				
-
-				  if(adj)
-				  {
-				  	for(int jj=0; jj<atom1->nw*NPOL; ++jj)
-				  	{
-              const int jj0 = jj/NPOL;
-
-              const int iw1_all = start1 + jj0; 
-              const int mu = ParaO.trace_loc_row[iw1_all];
-					    if(mu<0)continue;
-
-				  		const int L1 = atom1->iw2l[jj0];
-				  		const int N1 = atom1->iw2n[jj0];
-				  		const int m1 = atom1->iw2m[jj0];
-
-
-				  		for(int kk=0; kk<atom2->nw*NPOL; ++kk)
-				  		{
-                const int kk0 = kk/NPOL;
-
-                const int iw2_all = start2 + kk0;
-						    const int nu = ParaO.trace_loc_col[iw2_all];
-						    if(nu<0)continue;
-
-				  			const int L2 = atom2->iw2l[kk0];
-				  			const int N2 = atom2->iw2n[kk0];
-				  			const int m2 = atom2->iw2m[kk0];
-  
-				  			// if ( !ParaO.in_this_processor(iw1_all,iw2_all) )
-				  			// {
-				  				// ++iw2_all;
-				  				// continue;
-				  			// }
-
-				  			// int mu = ParaO.trace_loc_row[iw1_all];
-				  			// int nu = ParaO.trace_loc_col[iw2_all];
-				  			int irc = nu*ParaO.nrow + mu;
-  
-				  			if(GAMMA_ONLY_LOCAL)
-							  {
-							  	if(STRESS)
-							  	{
-							  		this->soverlap_gamma[0][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+0];
-							  		this->soverlap_gamma[1][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+1];
-							  		this->soverlap_gamma[2][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+2];
-							  		this->soverlap_gamma[3][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+1];
-							  		this->soverlap_gamma[4][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+2];
-							  		this->soverlap_gamma[5][irc] += LM.DSloc_Rz[nnr]*LM.DH_r[nnr*3+2];
-							  	}
-							  }
-				  			else
-				  			{
-				  				Vector3<double> dR(GridD.getBox(ad).x, GridD.getBox(ad).y, GridD.getBox(ad).z); 
-  
-				  				for(int ik=0; ik<kv.nks; ik++)
-				  				{
-				  					const double arg = ( kv.kvec_d[ik] * dR ) * TWO_PI;
-				  					const complex<double> kphase( cos(arg),  sin(arg) );
-
-				  					this->dSm_k[ik][0][irc] += LM.DSloc_Rx[nnr]*kphase;
-				  					this->dSm_k[ik][1][irc] += LM.DSloc_Ry[nnr]*kphase;
-				  					this->dSm_k[ik][2][irc] += LM.DSloc_Rz[nnr]*kphase;
-
-				  					if(STRESS)
-				  					{		
-				  						this->soverlap_k[ik][0][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+0]*kphase;
-				  						this->soverlap_k[ik][1][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+1]*kphase;
-				  						this->soverlap_k[ik][2][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+2]*kphase;
-				  						this->soverlap_k[ik][3][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+1]*kphase;
-				  						this->soverlap_k[ik][4][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+2]*kphase;
-				  						this->soverlap_k[ik][5][irc] += LM.DSloc_Rz[nnr]*LM.DH_r[nnr*3+2]*kphase;																
-				  					}
-				  				}
-				  			}
-				  			++nnr;
-				  		}// kk
-				    }// jj
-				  }// adj
-				  // else if(distance>=rcut)
-				  // {
-				  	// int start1 = ucell.itiaiw2iwt( T1, I1, 0);
-				  	// int start2 = ucell.itiaiw2iwt( T2, I2, 0);
-				  	// bool is_adj = false;
-				  	// for (int ad0=0; ad0<GridD.getAdjacentNum()+1; ++ad0)
-				  	// {
-				  	// 	const int T0 = GridD.getType(ad0);
-				  		
-				  	// 	tau0 = GridD.getAdjacentTau(ad0);
-				  	// 	dtau1 = tau0 - tau1;
-				  	// 	double distance1 = dtau1.norm() * ucell.lat0;
-				  	// 	double rcut1 = ORB.Phi[T1].getRcut() + ORB.Beta[T0].get_rcut_max();
-				  	// 	dtau2 = tau0 - tau2;
-				  	// 	double distance2 = dtau2.norm() * ucell.lat0;
-				  	// 	double rcut2 = ORB.Phi[T2].getRcut() + ORB.Beta[T0].get_rcut_max();
-				  	// 	if(distance1<rcut1 && distance2<rcut2)
-				  	// 	{
-				  	// 		is_adj = true;
-				  	// 		break;
-				  	// 	}
-				  	// }//ad0
-				  	// if( is_adj )
-				  	// {
-				  // 		for(int jj=0; jj<atom1->nw * NPOL; ++jj)
-				  // 		{
-				  // 			const int mu = ParaO.trace_loc_row[start1+jj];
-				  // 			if(mu<0) continue; 
-
-				  // 			for(int kk=0; kk<atom2->nw * NPOL; ++kk)
-				  // 			{
-				  // 				const int nu = ParaO.trace_loc_col[start2+kk];
-				  // 				if(nu<0) continue;
-
-				  // 				++nnr;
-				  // 			}//kk
-				  // 		}//jj
-				  // 	// }
-				  // }//distance
-			  }// ad
-		  }// I1
-	  }// T1
+			tau1 = atom1->tau[I1];
+            
+            GridD.Find_atom(tau1, T1, I1);
+            for(int ad=0; ad<GridD.getAdjacentNum()+1; ++ad)
+            {
+                const int T2 = GridD.getType(ad);
+				const int I2 = GridD.getNatom(ad);
+
+				Atom* atom2 = &ucell.atoms[T2];
+
+				tau2 = GridD.getAdjacentTau(ad);
+				dtau = tau2 - tau1;
+
+				double distance = dtau.norm() * ucell.lat0;
+				double rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut();				
+
+				if(distance < rcut)
+				{
+					int iw1_all = ucell.itiaiw2iwt( T1, I1, 0) ; //iw1_all = combined index (it, ia, iw)
+
+					for(int jj=0; jj<atom1->nw*NPOL; ++jj)
+					{
+						const int jj0 = jj/NPOL;
+						const int L1 = atom1->iw2l[jj0];
+						const int N1 = atom1->iw2n[jj0];
+						const int m1 = atom1->iw2m[jj0];
+						int iw2_all = ucell.itiaiw2iwt( T2, I2, 0);
+
+						for(int kk=0; kk<atom2->nw*NPOL; ++kk)
+						{
+							const int kk0 = kk/NPOL;
+							const int L2 = atom2->iw2l[kk0];
+							const int N2 = atom2->iw2n[kk0];
+							const int m2 = atom2->iw2m[kk0];
+							
+							if ( !ParaO.in_this_processor(iw1_all,iw2_all) )
+							{
+								++iw2_all;
+								continue;
+							}
+
+							int mu = ParaO.trace_loc_row[iw1_all];
+							int nu = ParaO.trace_loc_col[iw2_all];
+							int irc = nu*ParaO.nrow + mu;
+														
+							if(GAMMA_ONLY_LOCAL)
+							{
+								if(STRESS)
+								{
+									this->soverlap_gamma[0][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+0];
+									this->soverlap_gamma[1][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+1];
+									this->soverlap_gamma[2][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+2];
+									this->soverlap_gamma[3][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+1];
+									this->soverlap_gamma[4][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+2];
+									this->soverlap_gamma[5][irc] += LM.DSloc_Rz[nnr]*LM.DH_r[nnr*3+2];
+								}
+							}
+							else
+							{
+								Vector3<double> dR(GridD.getBox(ad).x, GridD.getBox(ad).y, GridD.getBox(ad).z); 
+							
+								for(int ik=0; ik<kv.nks; ik++)
+								{								
+									const double arg = ( kv.kvec_d[ik] * dR ) * TWO_PI;
+									const complex<double> kphase = complex <double> ( cos(arg),  sin(arg) );
+
+									this->dSm_k[ik][0][irc] += LM.DSloc_Rx[nnr]*kphase;
+									this->dSm_k[ik][1][irc] += LM.DSloc_Ry[nnr]*kphase;
+									this->dSm_k[ik][2][irc] += LM.DSloc_Rz[nnr]*kphase;
+
+									if(STRESS)
+									{																												
+										this->soverlap_k[ik][0][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+0]*kphase;
+										this->soverlap_k[ik][1][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+1]*kphase;
+										this->soverlap_k[ik][2][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+2]*kphase;
+										this->soverlap_k[ik][3][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+1]*kphase;
+										this->soverlap_k[ik][4][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+2]*kphase;
+										this->soverlap_k[ik][5][irc] += LM.DSloc_Rz[nnr]*LM.DH_r[nnr*3+2]*kphase;																
+									}
+								}	
+							}
+																																																																				
+							++nnr;													
+							++iw2_all;
+						}// nw2 
+
+						++iw1_all;
+						
+					}// nw1
+				}// distance
+				else if(distance>=rcut)
+				{
+					int start1 = ucell.itiaiw2iwt( T1, I1, 0);
+					int start2 = ucell.itiaiw2iwt( T2, I2, 0);
+					bool is_adj = false;
+					for (int ad0=0; ad0<GridD.getAdjacentNum()+1; ++ad0)
+					{
+						const int T0 = GridD.getType(ad0);
+						
+						tau0 = GridD.getAdjacentTau(ad0);
+						dtau1 = tau0 - tau1;
+						double distance1 = dtau1.norm() * ucell.lat0;
+						double rcut1 = ORB.Phi[T1].getRcut() + ORB.Beta[T0].get_rcut_max();
+						dtau2 = tau0 - tau2;
+						double distance2 = dtau2.norm() * ucell.lat0;
+						double rcut2 = ORB.Phi[T2].getRcut() + ORB.Beta[T0].get_rcut_max();
+						if(distance1<rcut1 && distance2<rcut2)
+						{
+							is_adj = true;
+							break;
+						}
+					}//ad0
+					if( is_adj )
+					{
+						for(int jj=0; jj<atom1->nw * NPOL; ++jj)
+						{
+							const int mu = ParaO.trace_loc_row[start1+jj];
+							if(mu<0)continue; 
+
+							for(int kk=0; kk<atom2->nw * NPOL; ++kk)
+							{
+								const int nu = ParaO.trace_loc_col[start2+kk];
+								if(nu<0)continue;
+
+								++nnr;
+							}//kk
+						}//jj
+					}
+				}//distance
+			}// ad
+		}// I1
+	}// T1
 
 	return;
 }
@@ -1003,7 +944,7 @@ void DFTU_RELAX::erase_force_stress()
 				delete [] soverlap_gamma[i];
 			}
 			delete [] soverlap_gamma;
-      soverlap_gamma=nullptr;
+
 		}
 	}
 	else
@@ -1021,7 +962,6 @@ void DFTU_RELAX::erase_force_stress()
 			delete [] dSm_k[ik];
 		}
 		delete [] dSm_k;
-    dSm_k = nullptr;
 
 		if(STRESS)
 		{
@@ -1038,7 +978,6 @@ void DFTU_RELAX::erase_force_stress()
 				delete [] soverlap_k[ik];
 			}
 			delete [] soverlap_k;
-      soverlap_k = nullptr;
 		}
 	}
 			
diff --git a/ABACUS.develop/source/src_lcao/dftu_yukawa.cpp b/ABACUS.develop/source/src_lcao/dftu_yukawa.cpp
index 3d754a65fb..f789958888 100644
--- a/ABACUS.develop/source/src_lcao/dftu_yukawa.cpp
+++ b/ABACUS.develop/source/src_lcao/dftu_yukawa.cpp
@@ -20,6 +20,13 @@
 #include "LOOP_ions.h"
 #include "LCAO_matrix.h"
 
+
+extern "C" // Fortran subroutines sphbsl/sphhnk added in lscc.f; presumably linked via trailing-underscore name mangling
+{
+	void sphbsl_(int *n, double *r, double *A, double *val); // all arguments by address; *val receives the order-n result evaluated at (*r)*(*A)
+	void sphhnk_(int *n, double *r, double *A, double *val); // same calling convention as sphbsl_; *val receives the result
+}
+
 DFTU_Yukawa::DFTU_Yukawa(){}
 
 DFTU_Yukawa::~DFTU_Yukawa(){}
@@ -88,14 +95,14 @@ void DFTU_Yukawa::cal_slater_Fk(const int L, const int T)
 						int l = 2*k;
 						if(ir0<ir1)  //less than
 						{
-						 	bslval=this->spherical_Bessel(l, r0, lambda);
-							hnkval=this->spherical_Hankel(l, r1, lambda);
+						 	sphbsl_(&l, &r0, &lambda, &bslval);
+							sphhnk_(&l, &r1, &lambda, &hnkval);
 						}
 						else //greater than
 						{
-						 	bslval=this->spherical_Bessel(l, r1, lambda);
-							hnkval=this->spherical_Hankel(l, r0, lambda);
-						}				
+						 	sphbsl_(&l, &r1, &lambda, &bslval);
+							sphhnk_(&l, &r0, &lambda, &hnkval);
+						}					
 						this->Fk.at(T).at(L).at(chi).at(k) -= (4*k+1)*lambda*pow(R_L0,2)*bslval*hnkval*pow(R_L1,2)*pow(r0,2)*pow(r1,2)*rab0*rab1;					
 					}
 				}
diff --git a/ABACUS.develop/source/src_lcao/dftu_yukawa.h b/ABACUS.develop/source/src_lcao/dftu_yukawa.h
index 49c7813a1f..8ed5791cd2 100644
--- a/ABACUS.develop/source/src_lcao/dftu_yukawa.h
+++ b/ABACUS.develop/source/src_lcao/dftu_yukawa.h
@@ -23,9 +23,6 @@ class DFTU_Yukawa
     void cal_yukawa_lambda();
     void cal_slater_UJ(const int istep, const int iter);
 
-    double spherical_Bessel(const int k, const double r, const double lambda);
-    double spherical_Hankel(const int k, const double r, const double lambda);
-
     //void cal_unscreened_slater_Fk(const int L, const int T); //L:angular momnet, T:atom type
     //void cal_slater_Vsc(const int T, const int L);
 
diff --git a/ABACUS.develop/source/src_lcao/lscc.f b/ABACUS.develop/source/src_lcao/lscc.f
new file mode 100644
index 0000000000..492c941ca9
--- /dev/null
+++ b/ABACUS.develop/source/src_lcao/lscc.f
@@ -0,0 +1,87 @@
+      subroutine sphbsl(n,r,A,val)
+! Spherical Bessel function of the first kind at the imaginary
+! argument i*x with x = r*A: val = j_n(i*x), real for even n.
+! Implemented for n = 0, 2, 4, 6; any other n returns val = 0.
+! For small x a truncated power series replaces the closed form,
+! which is ill-conditioned there (0/0 or cancelling cosh/sinh).
+        implicit none
+        integer, intent(in) :: n
+        real*8, intent(in) :: r,A
+        real*8, intent(out) :: val
+        real*8 :: x
+        x = r*A
+        val = 0.d0
+        if (n .eq. 0) then
+          if ( x .lt. 1.d-3 ) then
+            val = 1 + x**2/6
+          else
+            val = dsinh(x)/x
+          end if
+        else if (n .eq. 2) then
+          if ( x .lt. 1.d-2 ) then
+            val = -x**2/15 -x**4/210 - x**6/7560
+          else
+            val = 3*dcosh(x)/x**2 + (-3-x**2)*dsinh(x)/x**3
+          end if
+        else if (n .eq. 4) then
+          if( x .lt. 5.d-1)then
+            val = x**4/945 + x**6/20790 + x**8/1081080 + x**10/97297200
+          else
+            val = -5*(21+2*x**2)*dcosh(x)/x**4+(105+45*x**2+x**4)*
+     &       dsinh(x)/x**5
+          end if
+        else if (n .eq. 6) then
+          if ( x .lt. 9.d-1) then
+            val = -x**6/135135-x**8/4054050-x**10/275675400
+          else
+            val = 21*(495+60*x**2+x**4)*dcosh(x)/x**6 +
+     &       (-10395-4725*x**2-210*x**4-x**6)*dsinh(x)/x**7
+          end if
+        end if
+      END subroutine sphbsl
+
+      subroutine sphhnk(n,r,A,val)
+! Spherical Hankel function of the first kind at the imaginary
+! argument i*x with x = r*A: val = h_n(i*x), real for even n.
+! Implemented for n = 0, 2, 4, 6; any other n returns val = 0.
+! Only the product x = r*A of the two inputs is used.
+! For small x a truncated power series replaces the closed form.
+! Every branch divides by x, so x = 0 is not a valid input.
+        implicit none
+        integer, intent(in) :: n
+        real*8, intent(in) :: r,A
+        real*8, intent(out) :: val
+        real*8 :: x
+        x = r*A
+        val = 0.d0
+        if (n .eq. 0) then
+          if ( x .lt. 1.d-3 ) then
+            val = -1/x + 1 -x/2 + x**2/6
+          else
+            val = -dexp(-x)/x
+          endif
+        else if (n .eq. 2) then
+          if ( x .lt. 1.d-2) then
+            val = 3/x**3-1/(2*x)+x/8-x**2/15+x**3/48
+          else
+            val = dexp(-x)*(3+3*x+x**2)/x**3
+          endif
+        else if (n .eq. 4) then
+          if (x .lt. 5.d-1) then
+            val = -105/x**5 + 15/(2*x**3) - 3/(8*x) + x/48 - x**3/384
+     &        +x**4/945
+          else
+            val = -dexp(-x)*(105+105*x+45*x**2+10*x**3+x**4)/x**5
+          endif
+        else if (n .eq. 6) then
+          if (x .lt. 9.d-1) then
+            val = 10395/x**7 - 945/(2*x**5) + 105/(8*x**3) -5/(16*x) +
+     &            x/128-x**3/3840 + x**5/46080 - x**6/135135
+          else
+            val = dexp(-x)*(10395+10395*x+4725*x**2+1260*x**3
+     &            +210*x**4+21*x**5+x**6)/x**7
+          endif
+        endif
+      END SUBROUTINE sphhnk
+
+

From deb8acf8fd93d6954dd9827a8109191f4eadbef9 Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Fri, 16 Apr 2021 21:32:53 +0800
Subject: [PATCH 52/60] fix a bug when the pseudopotential dir is incorrect

---
 .../source/src_io/read_pseudopot.cpp          | 30 ++++++++--------
 .../source/src_pw/pseudopot_upf.cpp           | 36 +++++++++++--------
 2 files changed, 35 insertions(+), 31 deletions(-)

diff --git a/ABACUS.develop/source/src_io/read_pseudopot.cpp b/ABACUS.develop/source/src_io/read_pseudopot.cpp
index a6a935abe9..645fdc3781 100644
--- a/ABACUS.develop/source/src_io/read_pseudopot.cpp
+++ b/ABACUS.develop/source/src_io/read_pseudopot.cpp
@@ -9,20 +9,13 @@
 //==========================================================
 void UnitCell_pseudo::read_pseudopot(const string &pp_dir)
 {
-	if(test_pseudo_cell) TITLE("UnitCell_pseudo","read_pseudopot");
+	TITLE("UnitCell_pseudo","read_pseudopot");
 //----------------------------------------------------------
 // EXPLAIN : setup reading log for pseudopot_upf
 //----------------------------------------------------------
 	stringstream ss;
 	ss << global_out_dir << "atom_pseudo.log";
 	
-//	ofstream ofs;
-	
-//	if(MY_RANK==0)
-//	{
-//		ofs.open( ss.str().c_str(), ios::out);
-//	}
-
 //----------------------------------------------------------
 // EXPLAIN : Read in the atomic pseudo potential
 //----------------------------------------------------------
@@ -37,12 +30,17 @@ void UnitCell_pseudo::read_pseudopot(const string &pp_dir)
 		if(MY_RANK==0)
 		{
 			pp_address = pp_dir + this->pseudo_fn[i];
-			//error = upf.read_pseudo_upf( pp_address ); xiaohui modify 2013-06-23
 			error = upf.init_pseudo_reader( pp_address ); //xiaohui add 2013-06-23
-			if(this->atoms[i].flag_empty_element)					// Peize Lin add for bsse 2021.04.07
-				upf.set_empty_element();			
-			//average pseudopotential if needed
-			error_ap = upf.average_p(); //added by zhengdy 2020-10-20
+
+			if(error==0) // mohan add 2021-04-16
+			{
+				if(this->atoms[i].flag_empty_element)	// Peize Lin add for bsse 2021.04.07
+				{
+					upf.set_empty_element();			
+				}
+				//average pseudopotential if needed
+				error_ap = upf.average_p(); //added by zhengdy 2020-10-20
+			}
 		}
 
 #ifdef __MPI
@@ -56,15 +54,15 @@ void UnitCell_pseudo::read_pseudopot(const string &pp_dir)
 		{
 			cout << " Pseudopotential directory now is : " << pp_address << endl;
 			ofs_warning << " Pseudopotential directory now is : " << pp_address << endl;
-			WARNING_QUIT("UnitCell_pseudo::read_pseudopot","Couldn't find pseudopotential file.");
+			WARNING_QUIT("read_pseudopot","Couldn't find pseudopotential file.");
 		}
 		else if(error==2)
 		{
-			WARNING_QUIT("UnitCell_pseudo::read_pseudopot","Something in pseudopotential not match.");
+			WARNING_QUIT("read_pseudopot","Pseudopotential data do not match.");
 		}
 		else if(error==3)
 		{
-			WARNING_QUIT("UnitCell_pseudo::read_pseudopot","Please check the reference states in pseudopotential .vwr file.\n Also the norm of the read in pseudo wave functions\n explicitly please check S, P and D channels.\n If the norm of the wave function is \n unreasonable large (should be near 1.0), ABACUS would quit. \n The solution is to turn off the wave functions  \n and the corresponding non-local projectors together\n in .vwr pseudopotential file.");
+			WARNING_QUIT("read_pseudopot","Check the reference states in pseudopotential .vwr file.\n Also the norm of the read in pseudo wave functions\n explicitly please check S, P and D channels.\n If the norm of the wave function is \n unreasonable large (should be near 1.0), ABACUS would quit. \n The solution is to turn off the wave functions  \n and the corresponding non-local projectors together\n in .vwr pseudopotential file.");
 		}
 //		OUT(ofs_running,"PP_ERRROR",error);
 
diff --git a/ABACUS.develop/source/src_pw/pseudopot_upf.cpp b/ABACUS.develop/source/src_pw/pseudopot_upf.cpp
index e503e833b7..235e53bbb3 100644
--- a/ABACUS.develop/source/src_pw/pseudopot_upf.cpp
+++ b/ABACUS.develop/source/src_pw/pseudopot_upf.cpp
@@ -9,7 +9,7 @@
 #include <math.h>
 #include <string>
 #include <sstream>
-#include <cstring>		// Peize Lin fix bug about strcpy 2016-08-02
+#include <cstring> // Peize Lin fix bug about strcpy 2016-08-02
 
 int Number[2];
 
@@ -35,14 +35,14 @@ Pseudopot_upf::Pseudopot_upf()
 	this->jchi = new double[1];
 	this->jjj = new double[1];
 
-        functional_error = 0;//xiaohui add 2015-03-24
+	functional_error = 0;//xiaohui add 2015-03-24
 }
 
 Pseudopot_upf::~Pseudopot_upf()
 {
-	delete [] els;  //header_15
-	delete [] lchi; //header_16
-	delete [] oc;   //header_17
+	delete [] els; 
+	delete [] lchi;
+	delete [] oc;
 
 	delete [] r;    //mesh_1
 	delete [] rab;  //mesh_2
@@ -61,7 +61,7 @@ Pseudopot_upf::~Pseudopot_upf()
 
 int Pseudopot_upf::init_pseudo_reader(const string &fn)
 {
-    if(test_pp) TITLE("Pseudopot_upf","init");
+    TITLE("Pseudopot_upf","init");
     // First check if this pseudo-potential has spin-orbit information
     ifstream ifs(fn.c_str(), ios::in);
 
@@ -72,10 +72,10 @@ int Pseudopot_upf::init_pseudo_reader(const string &fn)
     }
 
     //cout << "global_pseudo_type =" << global_pseudo_type << endl;
-    if(global_pseudo_type=="auto") //{zws
+    if(global_pseudo_type=="auto") //zws
 	{
 		set_pseudo_type(fn);
-	} //}
+	}
 
 	// read in the .UPF type of pseudopotentials
 	if(global_pseudo_type=="upf")
@@ -98,9 +98,6 @@ int Pseudopot_upf::init_pseudo_reader(const string &fn)
 		return info;
 	}
 
-
-
-
 	return 0;
 }
 
@@ -108,7 +105,7 @@ int Pseudopot_upf::init_pseudo_reader(const string &fn)
 //----------------------------------------------------------
 // setting the type of the pseudopotential file
 //----------------------------------------------------------
-int Pseudopot_upf::set_pseudo_type(const string &fn) //{zws add
+int Pseudopot_upf::set_pseudo_type(const string &fn) //zws add
 {
     ifstream pptype_ifs(fn.c_str(), ios::in);
     string dummy, strversion;
@@ -202,7 +199,7 @@ int Pseudopot_upf::read_pseudo_vwr(ifstream &ifs)
 	if(mesh%2==0) 
 	{
 		mesh=mesh-1;
-		ofs_running << " Mesh number - 1, because we need odd number, \n this may affect some polar atomic orbitals." << endl;
+		ofs_running << " Mesh number - 1, we need odd number, \n this may affect some polar atomic orbitals." << endl;
 	}
 	ofs_running << setw(15) << "MESH" << setw(15) << mesh << endl;
 	// (2) read in nlcc: nonlinear core correction
@@ -2298,14 +2295,23 @@ int Pseudopot_upf::average_p()
 }
 
 // Peize Lin add for bsse 2021.04.07
-void Pseudopot_upf::set_empty_element()
+void Pseudopot_upf::set_empty_element(void) // resets zp, vloc, dion and rho_at of this object to zero
 {
 	this->zp = 0;
 	for(int ir=0; ir<mesh; ++ir)
+	{ // clears vloc[0..mesh-1]
 		this->vloc[ir] = 0;
+	}
 	for(int i=0; i<nbeta; ++i)
+	{ // clears the full nbeta x nbeta range of dion
 		for(int j=0; j<nbeta; ++j)
+		{
 			this->dion(i,j) = 0;
+		}
+	}
 	for(int ir=0; ir<mesh; ++ir)
+	{ // clears rho_at[0..mesh-1]
 		this->rho_at[ir] = 0;
-}
\ No newline at end of file
+	}
+	return;
+}

From ddfc1d008294e75a4211e9227b1caae5f5da7c6e Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Fri, 16 Apr 2021 22:17:03 +0800
Subject: [PATCH 53/60] delete ucell.lmax in ORB*

---
 .../source/src_lcao/ORB_control.cpp           |  2 +-
 ABACUS.develop/source/src_lcao/ORB_read.cpp   | 35 ++++++++++++++-----
 ABACUS.develop/source/src_lcao/ORB_read.h     |  4 ++-
 3 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/ABACUS.develop/source/src_lcao/ORB_control.cpp b/ABACUS.develop/source/src_lcao/ORB_control.cpp
index fc02da0b04..77df730d8f 100644
--- a/ABACUS.develop/source/src_lcao/ORB_control.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_control.cpp
@@ -25,7 +25,7 @@ void ORB_control::set_orb_tables(
     // (1) FUNCTION : use 'info' to generate 'Numerical Orbital'
     // (1) RESULT : We have 'Numerical Orbital' for calculate S-table and T-table.
 	//=============================================================================
-    orb.Read_Orbitals(ucell.ntype);
+    orb.Read_Orbitals(ucell.ntype, ucell.lmax);
 
 	if(CALCULATION=="test")
 	{
diff --git a/ABACUS.develop/source/src_lcao/ORB_read.cpp b/ABACUS.develop/source/src_lcao/ORB_read.cpp
index 1eba3a72d8..61d9782678 100644
--- a/ABACUS.develop/source/src_lcao/ORB_read.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_read.cpp
@@ -90,15 +90,17 @@ void LCAO_Orbitals::bcast_files(void)
 //			nonlocal_file.push_back ( nfile );
 		}
 
-		ofs_running << " " << ucell.atoms[it].label << " orbital file: " << orbital_file[it] << endl;
-//		ofs_running << " " << ucell.atoms[it].label << " nonlocal file: " << nonlocal_file[it] << endl;
+		ofs_running << " orbital file: " << orbital_file[it] << endl;
+//		ofs_running << " nonlocal file: " << nonlocal_file[it] << endl;
 	}
 	return;
 }
 #endif
 
 
-void LCAO_Orbitals::Read_Orbitals(const int &ntype_in)
+void LCAO_Orbitals::Read_Orbitals(
+	const int &ntype_in, 
+	const int &lmax_in)
 {
 	TITLE("LCAO_Orbitals", "Read_Orbitals");
 	timer::tick("LCAO_Orbitals","Read_Orbitals",'C');
@@ -144,11 +146,14 @@ void LCAO_Orbitals::Read_Orbitals(const int &ntype_in)
 	this->ntype = ntype_in; 
 	assert(ntype>0);
 
-	this->lmax = ucell.lmax;
-	for(int i=0; i<ntype; i++)
-	{
-		OUT(ofs_running,"atom label",ucell.atoms[i].label);
-	}
+	assert(lmax_in>=0); // mohan add 2021-04-16
+	this->lmax = lmax_in;
+
+// mohan comment out 2021-04-16
+//	for(int i=0; i<ntype; i++)
+//	{
+//		OUT(ofs_running,"atom label",ucell.atoms[i].label);
+//	}
 
 	//-------------------------------------------------
 	//(2) set the kmesh according to ecutwfc and dk. 
@@ -682,9 +687,21 @@ void LCAO_Orbitals::Read_NonLocal(const int &it, int &n_projectors)
 		}
 	}// end projectors.
 	
-	this->Beta[it].set_type_info(it, label, ps_type, nlmax, Coefficient_D_in, Coefficient_D_in_so, n_projectors, 0, LfromBeta, tmpBeta_lm, ucell.atoms[it].has_so);
+	this->Beta[it].set_type_info(
+		it, 
+		label, 
+		ps_type, 
+		nlmax, 
+		Coefficient_D_in, 
+		Coefficient_D_in_so, 
+		n_projectors, 
+		0, 
+		LfromBeta, 
+		tmpBeta_lm, 
+		ucell.atoms[it].has_so);
 		
 	ifs.close();
+
 	delete[] LfromBeta;
 	delete[] tmpBeta_lm;
 
diff --git a/ABACUS.develop/source/src_lcao/ORB_read.h b/ABACUS.develop/source/src_lcao/ORB_read.h
index f35f2cff64..2a3aea4e04 100644
--- a/ABACUS.develop/source/src_lcao/ORB_read.h
+++ b/ABACUS.develop/source/src_lcao/ORB_read.h
@@ -20,7 +20,9 @@ class LCAO_Orbitals
 	LCAO_Orbitals();
 	~LCAO_Orbitals();
 
-	void Read_Orbitals(const int &ntype_in);
+	void Read_Orbitals( // both values are now passed in by the caller
+		const int &ntype_in, // stored as ntype; the definition asserts ntype > 0
+		const int &lmax_in); // stored as lmax; the definition asserts lmax_in >= 0
 
 	void Read_PAO(const int& it);
 

From ef9654ee33e3c01dd25d60e5a61f643436637f6f Mon Sep 17 00:00:00 2001
From: mohan <mohan.chen.chen.mohan@gmail.com>
Date: Fri, 16 Apr 2021 22:46:33 +0800
Subject: [PATCH 54/60] set ORB parameters in ORB codes, not through global
 variables

---
 ABACUS.develop/source/input_conv.cpp          |  9 ++++----
 ABACUS.develop/source/run_lcao.cpp            | 10 ++++++++-
 ABACUS.develop/source/src_io/energy_dos.cpp   | 13 +++++++++--
 .../source/src_io/mulliken_charge.cpp         | 15 ++++++++++++-
 .../source/src_lcao/ORB_control.cpp           | 17 ++++++++++++++
 ABACUS.develop/source/src_lcao/ORB_control.h  |  4 ++++
 ABACUS.develop/source/src_lcao/ORB_read.cpp   | 22 +++++++++----------
 7 files changed, 70 insertions(+), 20 deletions(-)

diff --git a/ABACUS.develop/source/input_conv.cpp b/ABACUS.develop/source/input_conv.cpp
index c51fb80b2d..8903aaba06 100644
--- a/ABACUS.develop/source/input_conv.cpp
+++ b/ABACUS.develop/source/input_conv.cpp
@@ -627,10 +627,11 @@ void Input_Conv::Convert(void)
 //----------------------------------------------------------
 // About LCAO
 //----------------------------------------------------------
-	ORB.ecutwfc = INPUT.lcao_ecut;
-	ORB.dk = INPUT.lcao_dk;
-	ORB.dR = INPUT.lcao_dr;
-	ORB.Rmax = INPUT.lcao_rmax; 
+// mohan add 2021-04-16
+//	ORB.ecutwfc = INPUT.lcao_ecut;
+//	ORB.dk = INPUT.lcao_dk;
+//	ORB.dR = INPUT.lcao_dr;
+//	ORB.Rmax = INPUT.lcao_rmax; 
 
 	// mohan add 2021-02-16
 	berryphase::berry_phase_flag = INPUT.berry_phase;
diff --git a/ABACUS.develop/source/run_lcao.cpp b/ABACUS.develop/source/run_lcao.cpp
index d5f04d0967..d1e758f19e 100644
--- a/ABACUS.develop/source/run_lcao.cpp
+++ b/ABACUS.develop/source/run_lcao.cpp
@@ -45,7 +45,15 @@ void Run_lcao::lcao_line(void)
 
     // * reading the localized orbitals/projectors 
 	// * construct the interpolation tables.
-	hm.orb_con.set_orb_tables(UOT, ORB, ucell.lat0, Exx_Abfs::Lmax);
+	hm.orb_con.set_orb_tables(
+		UOT, 
+		ORB,
+		INPUT.lcao_ecut,
+		INPUT.lcao_dk,
+		INPUT.lcao_dr,
+		INPUT.lcao_rmax, 
+		ucell.lat0, 
+		Exx_Abfs::Lmax);
 
 	// * allocate H and S matrices according to computational resources
 	// * set the 'trace' between local H/S and global H/S
diff --git a/ABACUS.develop/source/src_io/energy_dos.cpp b/ABACUS.develop/source/src_io/energy_dos.cpp
index 42d760b93a..47f91e2950 100644
--- a/ABACUS.develop/source/src_io/energy_dos.cpp
+++ b/ABACUS.develop/source/src_io/energy_dos.cpp
@@ -326,8 +326,17 @@ void energy::perform_dos(void)
 				atom_arrange::set_sr_NL();
 				atom_arrange::search( SEARCH_RADIUS );//qifeng-2019-01-21
 
-				// mohan update 2021-02-10
-				hm.orb_con.set_orb_tables(UOT, ORB, ucell.lat0, Exx_Abfs::Lmax);
+				// mohan update 2021-04-16
+				hm.orb_con.set_orb_tables(
+						UOT, 
+						ORB,
+						INPUT.lcao_ecut,
+						INPUT.lcao_dk,
+						INPUT.lcao_dr,
+						INPUT.lcao_rmax, 
+						ucell.lat0, 
+						Exx_Abfs::Lmax);
+
 				LM.allocate_HS_R(LNNR.nnr);
 				LM.zeros_HSR('S', LNNR.nnr);
 				UHM.genH.calculate_S_no();
diff --git a/ABACUS.develop/source/src_io/mulliken_charge.cpp b/ABACUS.develop/source/src_io/mulliken_charge.cpp
index b5959b9d0e..1df4861cfc 100644
--- a/ABACUS.develop/source/src_io/mulliken_charge.cpp
+++ b/ABACUS.develop/source/src_io/mulliken_charge.cpp
@@ -166,7 +166,20 @@ void Mulliken_Charge::cal_mulliken(void)
 			mud[0].create(ParaO.ncol,ParaO.nrow);
 			atom_arrange::set_sr_NL();
 			atom_arrange::search( SEARCH_RADIUS );//qifeng-2019-01-21
-			hm.orb_con.set_orb_tables(UOT, ORB, ucell.lat0, Exx_Abfs::Lmax);
+
+			// 2021-04-16
+			hm.orb_con.set_orb_tables(
+					UOT, 
+					ORB,
+					INPUT.lcao_ecut,
+					INPUT.lcao_dk,
+					INPUT.lcao_dr,
+					INPUT.lcao_rmax, 
+					ucell.lat0, 
+					Exx_Abfs::Lmax);
+
+
+
 			LM.allocate_HS_R(LNNR.nnr);
 			LM.zeros_HSR('S', LNNR.nnr);
 			UHM.genH.calculate_S_no();
diff --git a/ABACUS.develop/source/src_lcao/ORB_control.cpp b/ABACUS.develop/source/src_lcao/ORB_control.cpp
index 77df730d8f..57661b7108 100644
--- a/ABACUS.develop/source/src_lcao/ORB_control.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_control.cpp
@@ -15,6 +15,10 @@ ORB_control::~ORB_control()
 void ORB_control::set_orb_tables(
 	ORB_gen_tables &OGT, 
 	LCAO_Orbitals &orb,
+	const double &lcao_ecut_in, // mohan add 2021-04-16
+	const double &lcao_dk_in, // mohan add 2021-04-16
+	const double &lcao_dr_in, // mohan add 2021-04-16
+	const double &lcao_rmax_in, // mohan add 2021-04-16
 	const double &lat0,
 	const int &Lmax_exx)
 {
@@ -25,6 +29,19 @@ void ORB_control::set_orb_tables(
     // (1) FUNCTION : use 'info' to generate 'Numerical Orbital'
     // (1) RESULT : We have 'Numerical Orbital' for calculate S-table and T-table.
 	//=============================================================================
+
+	// mohan add 2021-04-16
+	assert(lcao_ecut_in>0.0);
+	assert(lcao_dk_in>0.0);
+	assert(lcao_dr_in>0.0);
+	assert(lcao_rmax_in>0.0);
+
+	// mohan add 2021-04-16
+	orb.ecutwfc = lcao_ecut_in;
+	orb.dk = lcao_dk_in;
+	orb.dR = lcao_dr_in;
+	orb.Rmax = lcao_rmax_in;
+	
     orb.Read_Orbitals(ucell.ntype, ucell.lmax);
 
 	if(CALCULATION=="test")
diff --git a/ABACUS.develop/source/src_lcao/ORB_control.h b/ABACUS.develop/source/src_lcao/ORB_control.h
index 7fdc9d6779..40536b0d05 100644
--- a/ABACUS.develop/source/src_lcao/ORB_control.h
+++ b/ABACUS.develop/source/src_lcao/ORB_control.h
@@ -16,6 +16,10 @@ class ORB_control
     void set_orb_tables(
 		ORB_gen_tables &OGT, 
 		LCAO_Orbitals &orb,
+		const double &lcao_ecut_in, // mohan add 2021-04-16
+		const double &lcao_dk_in, // mohan add 2021-04-16
+		const double &lcao_dr_in, // mohan add 2021-04-16
+		const double &lcao_rmax_in, // mohan add 2021-04-16
 		const double &lat0,
 		const int &Lmax_exx);
 
diff --git a/ABACUS.develop/source/src_lcao/ORB_read.cpp b/ABACUS.develop/source/src_lcao/ORB_read.cpp
index 61d9782678..6377f76957 100644
--- a/ABACUS.develop/source/src_lcao/ORB_read.cpp
+++ b/ABACUS.develop/source/src_lcao/ORB_read.cpp
@@ -273,18 +273,15 @@ void LCAO_Orbitals::Set_NonLocal(const int &it, int &n_projectors)
 	// get the number of non-local projectors
 	n_projectors = atom->nbeta;
 
-
 // PLEASE avoid using capital letters for local variables
 // mohan note 2021-03-23 
-	const int N_PROJECTORS = atom->nh;//zhengdy-soc
+	const int nh = atom->nh;//zhengdy-soc
 
 	// set the nonlocal projector objects
 	Numerical_Nonlocal_Lm* tmpBeta_lm = new Numerical_Nonlocal_Lm[n_projectors];
 
-	//const int nproj_allowed = atom->lmax + 1;	
-	//matrix Coefficient_D_in(nproj_allowed, nproj_allowed); //LiuXh 2016-01-14
 	matrix Coefficient_D_in(n_projectors, n_projectors); //LiuXh 2016-01-14
-	ComplexMatrix Coefficient_D_in_so(N_PROJECTORS*2, N_PROJECTORS*2);//zhengdy-soc
+	ComplexMatrix Coefficient_D_in_so(nh*2, nh*2);//zhengdy-soc
 
 	if(!atom->has_so)
 	{
@@ -388,7 +385,7 @@ void LCAO_Orbitals::Set_NonLocal(const int &it, int &n_projectors)
 											conj(soc.rotylm(m2,mj))*soc.spinor(l2,j2,m,is2);
 									}
 									soc.fcoef(it,is1,is2,ip1,ip2) = coeff;
-									Coefficient_D_in_so(ip1 + N_PROJECTORS*is1, ip2 + N_PROJECTORS*is2) = atom->dion(p1,p2) * soc.fcoef(it, is1, is2, ip1, ip2);
+									Coefficient_D_in_so(ip1 + nh*is1, ip2 + nh*is2) = atom->dion(p1,p2) * soc.fcoef(it, is1, is2, ip1, ip2);
 									if(p1 != p2) soc.fcoef(it, is1, is2, ip1, ip2) = complex<double>(0.0,0.0);
 								}
 							}
@@ -396,7 +393,7 @@ void LCAO_Orbitals::Set_NonLocal(const int &it, int &n_projectors)
 						ip2++;
 					}
 				}
-				assert(ip2==N_PROJECTORS);
+				assert(ip2==nh);
 				ip1++;
 			}
 		// only keep the nonzero part.
@@ -434,7 +431,7 @@ void LCAO_Orbitals::Set_NonLocal(const int &it, int &n_projectors)
 			delete[] beta_r;
 		}
 
-		assert(ip1==N_PROJECTORS);
+		assert(ip1==nh);
 
 		this->Beta[it].set_type_info(
 			it, 
@@ -444,7 +441,7 @@ void LCAO_Orbitals::Set_NonLocal(const int &it, int &n_projectors)
 			Coefficient_D_in, 
 			Coefficient_D_in_so, 
 			n_projectors, 
-			N_PROJECTORS, 
+			nh, 
 			atom->lll, 
 			tmpBeta_lm, 
 			1);//zhengdy-soc 2018-09-10
@@ -1042,10 +1039,11 @@ void LCAO_Orbitals::Read_Descriptor(void)	//read descriptor basis
 	}
 
 #ifdef __MPI
-		Parallel_Common::bcast_int(lmax);
-		Parallel_Common::bcast_int(nchimax);
-		Parallel_Common::bcast_int(nchi, lmax + 1);
+	Parallel_Common::bcast_int(lmax);
+	Parallel_Common::bcast_int(nchimax);
+	Parallel_Common::bcast_int(nchi, lmax + 1);
 #endif		
+
 	this->lmax_d = lmax;
 	this->nchimax_d = nchimax;
 	// calculate total number of chi

From d4705f2154806702ce1a7f03158a6e2524504b6f Mon Sep 17 00:00:00 2001
From: linpz <linpz@mail.ustc.edu.cn>
Date: Sat, 17 Apr 2021 15:21:21 +0800
Subject: [PATCH 55/60] 1. fix bug of flag_empty_element

---
 ABACUS.develop/source/src_io/read_atoms.cpp | 7 +++++--
 ABACUS.develop/source/src_pw/atom_pseudo.h  | 2 +-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/ABACUS.develop/source/src_io/read_atoms.cpp b/ABACUS.develop/source/src_io/read_atoms.cpp
index 11f4251180..1797c0f4fe 100644
--- a/ABACUS.develop/source/src_io/read_atoms.cpp
+++ b/ABACUS.develop/source/src_io/read_atoms.cpp
@@ -3,6 +3,8 @@
 #include "src_pw/global.h"
 #include <cstring>		// Peize Lin fix bug about strcmp 2016-08-02
 
+#include "src_external/src_test/icecream.hpp"
+
 void UnitCell_pseudo::read_atom_species(ifstream &ifa)
 {
 	TITLE("UnitCell_pseudo","read_atom_species");
@@ -35,8 +37,9 @@ void UnitCell_pseudo::read_atom_species(ifstream &ifa)
 
 			// Peize Lin test for bsse 2021.04.07
 			const string bsse_label = "empty";
-			if(search( atom_label[i].begin(), atom_label[i].end(), bsse_label.begin(), bsse_label.end() ) != atom_label[i].end())
-				this->atoms[i].flag_empty_element = true;
+			this->atoms[i].flag_empty_element = 
+				(search( atom_label[i].begin(), atom_label[i].end(), bsse_label.begin(), bsse_label.end() ) != atom_label[i].end())
+				? true : false;
 		}
 	}
 
diff --git a/ABACUS.develop/source/src_pw/atom_pseudo.h b/ABACUS.develop/source/src_pw/atom_pseudo.h
index 2536bae2e8..f66cc4f323 100644
--- a/ABACUS.develop/source/src_pw/atom_pseudo.h
+++ b/ABACUS.develop/source/src_pw/atom_pseudo.h
@@ -23,7 +23,7 @@ class Atom_pseudo : public pseudo_us
 	Vector3<int> *mbl; //If this atom can move
 	string pseudo_fn;// File name of pseudopotentia
 	double mass; // the mass of atom
-	bool flag_empty_element;	// whether is the empty element for bsse.	Peize Lin add 2021.04.07
+	bool flag_empty_element = false;	// whether is the empty element for bsse.	Peize Lin add 2021.04.07
 
 protected:
 

From 0815376c899006c28c6de589138bf17c2b126dec Mon Sep 17 00:00:00 2001
From: linpz <linpz@mail.ustc.edu.cn>
Date: Sat, 17 Apr 2021 16:08:52 +0800
Subject: [PATCH 56/60] 1. delete testing header file

---
 ABACUS.develop/source/src_io/read_atoms.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/ABACUS.develop/source/src_io/read_atoms.cpp b/ABACUS.develop/source/src_io/read_atoms.cpp
index 1797c0f4fe..e568172bb7 100644
--- a/ABACUS.develop/source/src_io/read_atoms.cpp
+++ b/ABACUS.develop/source/src_io/read_atoms.cpp
@@ -3,8 +3,6 @@
 #include "src_pw/global.h"
 #include <cstring>		// Peize Lin fix bug about strcmp 2016-08-02
 
-#include "src_external/src_test/icecream.hpp"
-
 void UnitCell_pseudo::read_atom_species(ifstream &ifa)
 {
 	TITLE("UnitCell_pseudo","read_atom_species");

From 5a6a8491b7288ae8671acc8fa99b2dc67d3ec198 Mon Sep 17 00:00:00 2001
From: zdy <dyzheng@mail.ustc.edu.cn>
Date: Tue, 20 Apr 2021 17:53:57 +0800
Subject: [PATCH 57/60] fixed bug of relax

---
 ABACUS.develop/source/src_pw/unitcell.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ABACUS.develop/source/src_pw/unitcell.cpp b/ABACUS.develop/source/src_pw/unitcell.cpp
index bdd55623a8..5a47691423 100644
--- a/ABACUS.develop/source/src_pw/unitcell.cpp
+++ b/ABACUS.develop/source/src_pw/unitcell.cpp
@@ -353,7 +353,7 @@ void UnitCell::save_cartesian_position(double* pos)const
 	for(int it = 0;it < this->ntype;it++)
 	{
 		Atom* atom = &this->atoms[it];
-		for(int ia =0; ia<atoms->na; ia++)
+		for(int ia =0; ia<atom->na; ia++)
 		{	
 			pos[3*iat  ] = atom->tau[ia].x*this->lat0;
 			pos[3*iat+1] = atom->tau[ia].y*this->lat0;
@@ -363,4 +363,4 @@ void UnitCell::save_cartesian_position(double* pos)const
     }
     assert(iat == this->nat);
     return;
-}
\ No newline at end of file
+}

From e2149c14bccb01fc66d742d9934eb3a9db8865ba Mon Sep 17 00:00:00 2001
From: linpz <linpz@mail.ustc.edu.cn>
Date: Wed, 21 Apr 2021 18:04:49 +0800
Subject: [PATCH 58/60] 1. add Matrix3::Zero()

---
 ABACUS.develop/source/src_global/matrix3.cpp | 7 +++++++
 ABACUS.develop/source/src_global/matrix3.h   | 1 +
 2 files changed, 8 insertions(+)

diff --git a/ABACUS.develop/source/src_global/matrix3.cpp b/ABACUS.develop/source/src_global/matrix3.cpp
index 5495eb1de9..29371c0a77 100644
--- a/ABACUS.develop/source/src_global/matrix3.cpp
+++ b/ABACUS.develop/source/src_global/matrix3.cpp
@@ -23,6 +23,13 @@ void Matrix3::Identity(void)
 	e31 = 0;e32 = 0;e33 = 1;
 }
 
+void Matrix3::Zero(void) // overwrite all nine elements e11..e33 with zero
+{
+	e11 = e12 = e13 = 0;
+	e21 = e22 = e23 = 0;
+	e31 = e32 = e33 = 0;
+}
+
 double Matrix3::Det(void) const 
 {
 	return	e11*e22*e33 -
diff --git a/ABACUS.develop/source/src_global/matrix3.h b/ABACUS.develop/source/src_global/matrix3.h
index a126bbdeb5..7729fe5cde 100644
--- a/ABACUS.develop/source/src_global/matrix3.h
+++ b/ABACUS.develop/source/src_global/matrix3.h
@@ -26,6 +26,7 @@ class Matrix3
 
 	void Reset(void);
 	void Identity(void);
+	void Zero(void);
 	double Det(void) const ;
 	Matrix3	Transpose(void) const ;
 	Matrix3	Inverse(void) const ;

From ddfbcf78fcee228a6479db52ee2cc56d15367ad6 Mon Sep 17 00:00:00 2001
From: linpz <linpz@mail.ustc.edu.cn>
Date: Wed, 21 Apr 2021 18:05:15 +0800
Subject: [PATCH 59/60] 1. fix bug in Vdwd2::cal_stress()

---
 ABACUS.develop/source/src_pw/vdwd2.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ABACUS.develop/source/src_pw/vdwd2.cpp b/ABACUS.develop/source/src_pw/vdwd2.cpp
index 77a355ebaa..b3159576fc 100644
--- a/ABACUS.develop/source/src_pw/vdwd2.cpp
+++ b/ABACUS.develop/source/src_pw/vdwd2.cpp
@@ -103,7 +103,7 @@ void Vdwd2::cal_stress()
     TITLE("Vdwd2","stress");
 	para.initset(ucell);
 
-	stress.Reset();
+	stress.Zero();
 	
 	for( int it1=0; it1!=ucell.ntype; ++it1 )
 	{

From 397f78c97c019dff51b3908e30de2df64860f304 Mon Sep 17 00:00:00 2001
From: Quxin <78459762+80610702-git@users.noreply.github.com>
Date: Wed, 21 Apr 2021 22:43:48 +0800
Subject: [PATCH 60/60] DFT+U (#25)

* DFT+U

fix the problem of dftu_relax.h header file

* DFT+U

Fix bug in stress calculation; delete lscc.f
---
 ABACUS.develop/source/Makefile.Objects        |   1 -
 ABACUS.develop/source/src_lcao/dftu_relax.cpp | 387 ++++++++++--------
 ABACUS.develop/source/src_lcao/dftu_relax.h   |   8 +-
 .../source/src_lcao/dftu_yukawa.cpp           |  80 +++-
 ABACUS.develop/source/src_lcao/dftu_yukawa.h  |   3 +
 ABACUS.develop/source/src_lcao/lscc.f         |  87 ----
 6 files changed, 299 insertions(+), 267 deletions(-)
 delete mode 100644 ABACUS.develop/source/src_lcao/lscc.f

diff --git a/ABACUS.develop/source/Makefile.Objects b/ABACUS.develop/source/Makefile.Objects
index b60de00435..10825db039 100644
--- a/ABACUS.develop/source/Makefile.Objects
+++ b/ABACUS.develop/source/Makefile.Objects
@@ -264,7 +264,6 @@ variable_cell.o\
 dftu.o\
 dftu_yukawa.o\
 dftu_relax.o\
-lscc.o\
 
 OBJS_COMMON=atom_spec.o \
 unitcell.o \
diff --git a/ABACUS.develop/source/src_lcao/dftu_relax.cpp b/ABACUS.develop/source/src_lcao/dftu_relax.cpp
index 9051fd32a1..31424f7d6d 100644
--- a/ABACUS.develop/source/src_lcao/dftu_relax.cpp
+++ b/ABACUS.develop/source/src_lcao/dftu_relax.cpp
@@ -16,7 +16,7 @@
 #include "../src_pw/global.h"
 #include "global_fp.h"
 #include "../src_global/global_function.h"
-#include "../src_global/scalapack_connector.h"
+//#include "../src_global/scalapack_connector.h"
 #include "../src_global/lapack_connector.h"
 #include "../src_global/inverse_matrix.h"
 #include "LOOP_ions.h"
@@ -25,6 +25,26 @@
 #include "ORB_gen_tables.h"
 #include "../src_pw/charge.h"
 
+extern "C" // C-linkage prototypes declared locally; the scalapack_connector.h include in this file is commented out
+{
+  void pzgemm_( // complex<double> variant; called from cal_force_k and cal_stress_k (presumably the PBLAS distributed GEMM - confirm against the linked library)
+		const char *transa, const char *transb,
+		const int *M, const int *N, const int *K,
+		const std::complex<double> *alpha,
+		const std::complex<double> *A, const int *IA, const int *JA, const int *DESCA,
+		const std::complex<double> *B, const int *IB, const int *JB, const int *DESCB,
+		const std::complex<double> *beta,
+		std::complex<double> *C, const int *IC, const int *JC, const int *DESCC);
+  
+  void pdgemm_( // double-precision real counterpart with the same argument layout as pzgemm_
+		const char *transa, const char *transb,
+		const int *M, const int *N, const int *K,
+		const double *alpha,
+		const double *A, const int *IA, const int *JA, const int *DESCA,
+		const double *B, const int *IB, const int *JB, const int *DESCB,
+		const double *beta,
+		double *C, const int *IC, const int *JC, const int *DESCC);
+}
 
 DFTU_RELAX::DFTU_RELAX(){}
 
@@ -144,7 +164,7 @@ void DFTU_RELAX::force_stress()
 				}
 				else
 				{
-					if(NSPIN==1 || NSPIN==4)
+					if(NSPIN==1)
 					{
 						double val = get_onebody_eff_pot(T1, iat1, L1, n1, 0, m1, m2, cal_type, false);
 						VU_k.at(0).at(irc) = complex<double>(val, 0.0);
@@ -191,13 +211,15 @@ void DFTU_RELAX::force_stress()
 }
 
 
-void DFTU_RELAX::cal_force_k(vector<vector<complex<double>>> &VU)
+void DFTU_RELAX::cal_force_k(const vector<vector<complex<double>>>& VU)
 {
 	TITLE("DFTU_RELAX", "cal_force_k");
 
-	const char transN = 'N', transT = 'T';
+	const char transN = 'N', transT = 'T', transC='C';
 	const int  one_int = 1;
-	const double alpha = 1.0, beta = 0.0;
+	// const double alpha = 1.0, beta = 0.0;
+  const complex<double> alpha(1.0,0.0), beta(0.0,0.0);
+  const complex<double> zero(0.0,0.0);
 	
 	vector<vector<complex<double>>> ftmp(ucell.nat);
 	for(int ia=0; ia<ucell.nat; ia++)
@@ -208,7 +230,7 @@ void DFTU_RELAX::cal_force_k(vector<vector<complex<double>>> &VU)
 	vector<vector<complex<double>>> dm_VU_dSm(3);
 	for(int dim=0; dim<3; dim++)
 	{
-		dm_VU_dSm.at(dim).resize(ParaO.nloc, complex<double>(0.0, 0.0));
+		dm_VU_dSm.at(dim).resize(ParaO.nloc, zero);
 	}
 	
 	for(int ik=0; ik<kv.nks; ik++)	
@@ -217,8 +239,8 @@ void DFTU_RELAX::cal_force_k(vector<vector<complex<double>>> &VU)
 
 		for(int dim=0; dim<3; dim++)
 		{
-			vector<complex<double>> mat_tmp(ParaO.nloc, complex<double>(0.0, 0.0));
-			vector<complex<double>> force_tmp(ParaO.nloc, complex<double>(0.0, 0.0));
+			vector<complex<double>> mat_tmp(ParaO.nloc);
+			vector<complex<double>> force_tmp(ParaO.nloc);
 
 			if(dim==0) //dim=1,2 are same as dim=0
 			{
@@ -251,7 +273,7 @@ void DFTU_RELAX::cal_force_k(vector<vector<complex<double>>> &VU)
 			//=========================================
 			ZEROS(VECTOR_TO_PTR(force_tmp), ParaO.nloc);
 
-			pzgemm_(&transN, &transT,
+			pzgemm_(&transN, &transC,
 				&NLOCAL, &NLOCAL, &NLOCAL,
 				&alpha, 
 				this->dSm_k[ik][dim], &one_int, &one_int, ParaO.desc, 
@@ -263,7 +285,7 @@ void DFTU_RELAX::cal_force_k(vector<vector<complex<double>>> &VU)
 			{
 				dm_VU_dSm.at(dim).at(irc) -= force_tmp.at(irc);
 			}
-		}//end dim				
+		}//end dim
 	}//end ik
 
 	for(int dim=0; dim<3; dim++)
@@ -296,35 +318,36 @@ void DFTU_RELAX::cal_force_k(vector<vector<complex<double>>> &VU)
 		}
 	}
 
-
 	return;
 }
 
 
-void DFTU_RELAX::cal_stress_k(vector<vector<complex<double>>> &VU)
+void DFTU_RELAX::cal_stress_k(const vector<vector<complex<double>>>& VU)
 {
 	TITLE("DFTU_RELAX", "cal_stress_k");
 
-	const char transN = 'N', transT = 'T';
+	const char transN = 'N', transT = 'T', transC='C';
 	const int  one_int = 1;
-	const double alpha = 1.0, beta = 0.0;
-	
+	//const double alpha = 1.0, beta = 0.0;
+	const complex<double> alpha(1.0,0.0), beta(0.0,0.0);
+  const complex<double> zero(0.0,0.0);
+  
 	int count = 0;
 	for(int dim1=0; dim1<3; dim1++)
 	{
 		for(int dim2=dim1; dim2<3; dim2++)
 		{
-			vector<complex<double>> dm_VU_sover(ParaO.nloc, complex<double>(0.0, 0.0));
+			vector<complex<double>> dm_VU_sover(ParaO.nloc, zero);
 
 			for(int ik=0; ik<kv.nks; ik++)
 			{
 				const int spin = kv.isk[ik];
 				
 				// The first term
-				vector<complex<double>> stress_tmp(ParaO.nloc, complex<double>(0.0, 0.0));
+				vector<complex<double>> stress_tmp(ParaO.nloc);
 
 				//Calculate mat_tmp=dm*VU
-				vector<complex<double>> mat_tmp(ParaO.nloc, complex<double>(0.0, 0.0));
+				vector<complex<double>> mat_tmp(ParaO.nloc);
 
 				pzgemm_(&transT, &transN,
 					&NLOCAL, &NLOCAL, &NLOCAL,
@@ -344,13 +367,14 @@ void DFTU_RELAX::cal_stress_k(vector<vector<complex<double>>> &VU)
 
 				for(int irc=0; irc<ParaO.nloc; irc++)
 				{
-					dm_VU_sover.at(irc) -= 0.5*stress_tmp.at(irc);
+					// dm_VU_sover.at(irc) -= 0.5*stress_tmp.at(irc);
+          dm_VU_sover.at(irc) -= 0.5*stress_tmp.at(irc);
 				}
 
 				// The second term
 				ZEROS(VECTOR_TO_PTR(stress_tmp), ParaO.nloc);
 
-				pzgemm_(&transN, &transT,
+				pzgemm_(&transN, &transC,
 					&NLOCAL, &NLOCAL, &NLOCAL,
 					&alpha, 
 					this->soverlap_k[ik][count], &one_int, &one_int, ParaO.desc, 
@@ -360,7 +384,8 @@ void DFTU_RELAX::cal_stress_k(vector<vector<complex<double>>> &VU)
 
 				for(int irc=0; irc<ParaO.nloc; irc++)
 				{
-					dm_VU_sover.at(irc) += 0.5*stress_tmp.at(irc);
+					// dm_VU_sover.at(irc) += 0.5*stress_tmp.at(irc);
+          dm_VU_sover.at(irc) -= 0.5*stress_tmp.at(irc);
 				}
 
 			}//end ik
@@ -385,8 +410,8 @@ void DFTU_RELAX::cal_stress_k(vector<vector<complex<double>>> &VU)
 			double val = stmp.real();
 			MPI_Allreduce(&val, &stress_dftu.at(dim1).at(dim2), 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 
-			complex<double> tmp;
-			MPI_Allreduce(&stmp, &tmp, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+			// complex<double> tmp;
+			// MPI_Allreduce(&stmp, &tmp, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 						
 			count++;
 		}//end dim2
@@ -413,7 +438,7 @@ void DFTU_RELAX::cal_stress_k(vector<vector<complex<double>>> &VU)
 }
 
 
-void DFTU_RELAX::cal_force_gamma(vector<vector<double>> &VU)
+void DFTU_RELAX::cal_force_gamma(const vector<vector<double>> &VU)
 {
 	TITLE("DFTU_RELAX", "cal_force_gamma");
 
@@ -578,7 +603,7 @@ void DFTU_RELAX::cal_force_gamma(vector<vector<double>> &VU)
 }
 
 
-void DFTU_RELAX::cal_stress_gamma(vector<vector<double>> &VU)
+void DFTU_RELAX::cal_stress_gamma(const vector<vector<double>> &VU)
 {
 	TITLE("DFTU_RELAX", "cal_stress_gamma");
 
@@ -637,7 +662,7 @@ void DFTU_RELAX::cal_stress_gamma(vector<vector<double>> &VU)
 
 				for(int irc=0; irc<ParaO.nloc; irc++)
 				{
-					dm_VU_sover.at(irc) += 0.5*stress_tmp.at(irc);
+					dm_VU_sover.at(irc) -= 0.5*stress_tmp.at(irc);
 				}
 
 			}//end ik
@@ -727,145 +752,179 @@ void DFTU_RELAX::folding_dSm_soverlap()
 	}
 	
 
-	Vector3<double> tau1, tau2, dtau;
-	Vector3<double> dtau1, dtau2, tau0;
+	  Vector3<double> tau1, tau2, dtau;
+	  Vector3<double> dtau1, dtau2, tau0;
     for(int T1=0; T1<ucell.ntype; ++T1)
     {
-		Atom* atom1 = &ucell.atoms[T1];
-        for(int I1=0; I1<atom1->na; ++I1)
+		  Atom* atom1 = &ucell.atoms[T1];
+      for(int I1=0; I1<atom1->na; ++I1)
+      {
+			  tau1 = atom1->tau[I1];
+        const int start1 = ucell.itiaiw2iwt(T1,I1,0);    
+
+        GridD.Find_atom(tau1, T1, I1);
+        for(int ad=0; ad<GridD.getAdjacentNum()+1; ++ad)
         {
-			tau1 = atom1->tau[I1];
-            
-            GridD.Find_atom(tau1, T1, I1);
-            for(int ad=0; ad<GridD.getAdjacentNum()+1; ++ad)
-            {
-                const int T2 = GridD.getType(ad);
-				const int I2 = GridD.getNatom(ad);
-
-				Atom* atom2 = &ucell.atoms[T2];
-
-				tau2 = GridD.getAdjacentTau(ad);
-				dtau = tau2 - tau1;
-
-				double distance = dtau.norm() * ucell.lat0;
-				double rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut();				
-
-				if(distance < rcut)
-				{
-					int iw1_all = ucell.itiaiw2iwt( T1, I1, 0) ; //iw1_all = combined index (it, ia, iw)
-
-					for(int jj=0; jj<atom1->nw*NPOL; ++jj)
-					{
-						const int jj0 = jj/NPOL;
-						const int L1 = atom1->iw2l[jj0];
-						const int N1 = atom1->iw2n[jj0];
-						const int m1 = atom1->iw2m[jj0];
-						int iw2_all = ucell.itiaiw2iwt( T2, I2, 0);
-
-						for(int kk=0; kk<atom2->nw*NPOL; ++kk)
-						{
-							const int kk0 = kk/NPOL;
-							const int L2 = atom2->iw2l[kk0];
-							const int N2 = atom2->iw2n[kk0];
-							const int m2 = atom2->iw2m[kk0];
-							
-							if ( !ParaO.in_this_processor(iw1_all,iw2_all) )
-							{
-								++iw2_all;
-								continue;
-							}
-
-							int mu = ParaO.trace_loc_row[iw1_all];
-							int nu = ParaO.trace_loc_col[iw2_all];
-							int irc = nu*ParaO.nrow + mu;
-														
-							if(GAMMA_ONLY_LOCAL)
-							{
-								if(STRESS)
-								{
-									this->soverlap_gamma[0][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+0];
-									this->soverlap_gamma[1][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+1];
-									this->soverlap_gamma[2][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+2];
-									this->soverlap_gamma[3][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+1];
-									this->soverlap_gamma[4][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+2];
-									this->soverlap_gamma[5][irc] += LM.DSloc_Rz[nnr]*LM.DH_r[nnr*3+2];
-								}
-							}
-							else
-							{
-								Vector3<double> dR(GridD.getBox(ad).x, GridD.getBox(ad).y, GridD.getBox(ad).z); 
-							
-								for(int ik=0; ik<kv.nks; ik++)
-								{								
-									const double arg = ( kv.kvec_d[ik] * dR ) * TWO_PI;
-									const complex<double> kphase = complex <double> ( cos(arg),  sin(arg) );
-
-									this->dSm_k[ik][0][irc] += LM.DSloc_Rx[nnr]*kphase;
-									this->dSm_k[ik][1][irc] += LM.DSloc_Ry[nnr]*kphase;
-									this->dSm_k[ik][2][irc] += LM.DSloc_Rz[nnr]*kphase;
-
-									if(STRESS)
-									{																												
-										this->soverlap_k[ik][0][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+0]*kphase;
-										this->soverlap_k[ik][1][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+1]*kphase;
-										this->soverlap_k[ik][2][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+2]*kphase;
-										this->soverlap_k[ik][3][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+1]*kphase;
-										this->soverlap_k[ik][4][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+2]*kphase;
-										this->soverlap_k[ik][5][irc] += LM.DSloc_Rz[nnr]*LM.DH_r[nnr*3+2]*kphase;																
-									}
-								}	
-							}
-																																																																				
-							++nnr;													
-							++iw2_all;
-						}// nw2 
-
-						++iw1_all;
-						
-					}// nw1
-				}// distance
-				else if(distance>=rcut)
-				{
-					int start1 = ucell.itiaiw2iwt( T1, I1, 0);
-					int start2 = ucell.itiaiw2iwt( T2, I2, 0);
-					bool is_adj = false;
-					for (int ad0=0; ad0<GridD.getAdjacentNum()+1; ++ad0)
-					{
-						const int T0 = GridD.getType(ad0);
-						
-						tau0 = GridD.getAdjacentTau(ad0);
-						dtau1 = tau0 - tau1;
-						double distance1 = dtau1.norm() * ucell.lat0;
-						double rcut1 = ORB.Phi[T1].getRcut() + ORB.Beta[T0].get_rcut_max();
-						dtau2 = tau0 - tau2;
-						double distance2 = dtau2.norm() * ucell.lat0;
-						double rcut2 = ORB.Phi[T2].getRcut() + ORB.Beta[T0].get_rcut_max();
-						if(distance1<rcut1 && distance2<rcut2)
-						{
-							is_adj = true;
-							break;
-						}
-					}//ad0
-					if( is_adj )
-					{
-						for(int jj=0; jj<atom1->nw * NPOL; ++jj)
-						{
-							const int mu = ParaO.trace_loc_row[start1+jj];
-							if(mu<0)continue; 
-
-							for(int kk=0; kk<atom2->nw * NPOL; ++kk)
-							{
-								const int nu = ParaO.trace_loc_col[start2+kk];
-								if(nu<0)continue;
-
-								++nnr;
-							}//kk
-						}//jj
-					}
-				}//distance
-			}// ad
-		}// I1
-	}// T1
+          const int T2 = GridD.getType(ad);
+				  const int I2 = GridD.getNatom(ad);
+          const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
+
+				  Atom* atom2 = &ucell.atoms[T2];
+
+				  tau2 = GridD.getAdjacentTau(ad);
+				  dtau = tau2 - tau1;
+
+				  double distance = dtau.norm() * ucell.lat0;
+				  double rcut = ORB.Phi[T1].getRcut() + ORB.Phi[T2].getRcut();
+
+          bool adj = false;
+				  if(distance < rcut) adj = true;
+				  else if(distance >= rcut)
+				  {
+				  	for (int ad0 = 0; ad0 < GridD.getAdjacentNum()+1; ++ad0)
+				  	{
+				  		const int T0 = GridD.getType(ad0); 
+				  		const int I0 = GridD.getNatom(ad0); 
+				  		const int iat0 = ucell.itia2iat(T0, I0);
+				  		const int start0 = ucell.itiaiw2iwt(T0, I0, 0);
+
+				  		tau0 = GridD.getAdjacentTau(ad0);
+				  		dtau1 = tau0 - tau1;
+				  		dtau2 = tau0 - tau2;
+
+				  		double distance1 = dtau1.norm() * ucell.lat0;
+				  		double distance2 = dtau2.norm() * ucell.lat0;
+
+				  		double rcut1 = ORB.Phi[T1].getRcut() + ORB.Beta[T0].get_rcut_max();
+				  		double rcut2 = ORB.Phi[T2].getRcut() + ORB.Beta[T0].get_rcut_max();
+
+				  		if( distance1 < rcut1 && distance2 < rcut2 )
+				  		{
+				  			adj = true;
+				  			break;
+				  		}
+				  	}
+				  }				
+
+				  if(adj)
+				  {
+				  	for(int jj=0; jj<atom1->nw*NPOL; ++jj)
+				  	{
+              const int jj0 = jj/NPOL;
+
+              const int iw1_all = start1 + jj0; 
+              const int mu = ParaO.trace_loc_row[iw1_all];
+					    if(mu<0)continue;
+
+				  		const int L1 = atom1->iw2l[jj0];
+				  		const int N1 = atom1->iw2n[jj0];
+				  		const int m1 = atom1->iw2m[jj0];
+
+
+				  		for(int kk=0; kk<atom2->nw*NPOL; ++kk)
+				  		{
+                const int kk0 = kk/NPOL;
+
+                const int iw2_all = start2 + kk0;
+						    const int nu = ParaO.trace_loc_col[iw2_all];
+						    if(nu<0)continue;
+
+				  			const int L2 = atom2->iw2l[kk0];
+				  			const int N2 = atom2->iw2n[kk0];
+				  			const int m2 = atom2->iw2m[kk0];
+  
+				  			// if ( !ParaO.in_this_processor(iw1_all,iw2_all) )
+				  			// {
+				  				// ++iw2_all;
+				  				// continue;
+				  			// }
+
+				  			// int mu = ParaO.trace_loc_row[iw1_all];
+				  			// int nu = ParaO.trace_loc_col[iw2_all];
+				  			int irc = nu*ParaO.nrow + mu;
+  
+				  			if(GAMMA_ONLY_LOCAL)
+							  {
+							  	if(STRESS)
+							  	{
+							  		this->soverlap_gamma[0][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+0];
+							  		this->soverlap_gamma[1][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+1];
+							  		this->soverlap_gamma[2][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+2];
+							  		this->soverlap_gamma[3][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+1];
+							  		this->soverlap_gamma[4][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+2];
+							  		this->soverlap_gamma[5][irc] += LM.DSloc_Rz[nnr]*LM.DH_r[nnr*3+2];
+							  	}
+							  }
+				  			else
+				  			{
+				  				Vector3<double> dR(GridD.getBox(ad).x, GridD.getBox(ad).y, GridD.getBox(ad).z); 
+  
+				  				for(int ik=0; ik<kv.nks; ik++)
+				  				{
+				  					const double arg = ( kv.kvec_d[ik] * dR ) * TWO_PI;
+				  					const complex<double> kphase( cos(arg),  sin(arg) );
+
+				  					this->dSm_k[ik][0][irc] += LM.DSloc_Rx[nnr]*kphase;
+				  					this->dSm_k[ik][1][irc] += LM.DSloc_Ry[nnr]*kphase;
+				  					this->dSm_k[ik][2][irc] += LM.DSloc_Rz[nnr]*kphase;
+
+				  					if(STRESS)
+				  					{		
+				  						this->soverlap_k[ik][0][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+0]*kphase;
+				  						this->soverlap_k[ik][1][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+1]*kphase;
+				  						this->soverlap_k[ik][2][irc] += LM.DSloc_Rx[nnr]*LM.DH_r[nnr*3+2]*kphase;
+				  						this->soverlap_k[ik][3][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+1]*kphase;
+				  						this->soverlap_k[ik][4][irc] += LM.DSloc_Ry[nnr]*LM.DH_r[nnr*3+2]*kphase;
+				  						this->soverlap_k[ik][5][irc] += LM.DSloc_Rz[nnr]*LM.DH_r[nnr*3+2]*kphase;																
+				  					}
+				  				}
+				  			}
+				  			++nnr;
+				  		}// kk
+				    }// jj
+				  }// adj
+				  // else if(distance>=rcut)
+				  // {
+				  	// int start1 = ucell.itiaiw2iwt( T1, I1, 0);
+				  	// int start2 = ucell.itiaiw2iwt( T2, I2, 0);
+				  	// bool is_adj = false;
+				  	// for (int ad0=0; ad0<GridD.getAdjacentNum()+1; ++ad0)
+				  	// {
+				  	// 	const int T0 = GridD.getType(ad0);
+				  		
+				  	// 	tau0 = GridD.getAdjacentTau(ad0);
+				  	// 	dtau1 = tau0 - tau1;
+				  	// 	double distance1 = dtau1.norm() * ucell.lat0;
+				  	// 	double rcut1 = ORB.Phi[T1].getRcut() + ORB.Beta[T0].get_rcut_max();
+				  	// 	dtau2 = tau0 - tau2;
+				  	// 	double distance2 = dtau2.norm() * ucell.lat0;
+				  	// 	double rcut2 = ORB.Phi[T2].getRcut() + ORB.Beta[T0].get_rcut_max();
+				  	// 	if(distance1<rcut1 && distance2<rcut2)
+				  	// 	{
+				  	// 		is_adj = true;
+				  	// 		break;
+				  	// 	}
+				  	// }//ad0
+				  	// if( is_adj )
+				  	// {
+				  // 		for(int jj=0; jj<atom1->nw * NPOL; ++jj)
+				  // 		{
+				  // 			const int mu = ParaO.trace_loc_row[start1+jj];
+				  // 			if(mu<0) continue; 
+
+				  // 			for(int kk=0; kk<atom2->nw * NPOL; ++kk)
+				  // 			{
+				  // 				const int nu = ParaO.trace_loc_col[start2+kk];
+				  // 				if(nu<0) continue;
+
+				  // 				++nnr;
+				  // 			}//kk
+				  // 		}//jj
+				  // 	// }
+				  // }//distance
+			  }// ad
+		  }// I1
+	  }// T1
 
 	return;
 }
@@ -944,7 +1003,7 @@ void DFTU_RELAX::erase_force_stress()
 				delete [] soverlap_gamma[i];
 			}
 			delete [] soverlap_gamma;
-
+      soverlap_gamma=nullptr;
 		}
 	}
 	else
@@ -962,6 +1021,7 @@ void DFTU_RELAX::erase_force_stress()
 			delete [] dSm_k[ik];
 		}
 		delete [] dSm_k;
+    dSm_k = nullptr;
 
 		if(STRESS)
 		{
@@ -978,6 +1038,7 @@ void DFTU_RELAX::erase_force_stress()
 				delete [] soverlap_k[ik];
 			}
 			delete [] soverlap_k;
+      soverlap_k = nullptr;
 		}
 	}
 			
diff --git a/ABACUS.develop/source/src_lcao/dftu_relax.h b/ABACUS.develop/source/src_lcao/dftu_relax.h
index 9908c7478d..9550359ac9 100644
--- a/ABACUS.develop/source/src_lcao/dftu_relax.h
+++ b/ABACUS.develop/source/src_lcao/dftu_relax.h
@@ -27,10 +27,10 @@ class DFTU_RELAX : public DFTU_Yukawa
     void folding_dSm_soverlap();
     void allocate_force_stress();
     void erase_force_stress();
-    void cal_force_k(vector<vector<complex<double>>> &VU);
-    void cal_force_gamma(vector<vector<double>> &VU);
-    void cal_stress_k(vector<vector<complex<double>>> &VU);
-    void cal_stress_gamma(vector<vector<double>> &VU);
+    void cal_force_k(const vector<vector<complex<double>>> &VU);
+    void cal_force_gamma(const vector<vector<double>> &VU);
+    void cal_stress_k(const vector<vector<complex<double>>> &VU);
+    void cal_stress_gamma(const vector<vector<double>> &VU);
 
     double get_onebody_eff_pot
     (
diff --git a/ABACUS.develop/source/src_lcao/dftu_yukawa.cpp b/ABACUS.develop/source/src_lcao/dftu_yukawa.cpp
index f789958888..9a502bc730 100644
--- a/ABACUS.develop/source/src_lcao/dftu_yukawa.cpp
+++ b/ABACUS.develop/source/src_lcao/dftu_yukawa.cpp
@@ -20,13 +20,6 @@
 #include "LOOP_ions.h"
 #include "LCAO_matrix.h"
 
-
-extern "C"
-{
-	void sphbsl_(int *n, double *r, double *A, double *val);
-	void sphhnk_(int *n, double *r, double *A, double *val);
-}
-
 DFTU_Yukawa::DFTU_Yukawa(){}
 
 DFTU_Yukawa::~DFTU_Yukawa(){}
@@ -95,14 +88,14 @@ void DFTU_Yukawa::cal_slater_Fk(const int L, const int T)
 						int l = 2*k;
 						if(ir0<ir1)  //less than
 						{
-						 	sphbsl_(&l, &r0, &lambda, &bslval);
-							sphhnk_(&l, &r1, &lambda, &hnkval);
+						 	bslval=this->spherical_Bessel(l, r0, lambda);
+							hnkval=this->spherical_Hankel(l, r1, lambda);
 						}
 						else //greater than
 						{
-						 	sphbsl_(&l, &r1, &lambda, &bslval);
-							sphhnk_(&l, &r0, &lambda, &hnkval);
-						}					
+						 	bslval=this->spherical_Bessel(l, r1, lambda);
+							hnkval=this->spherical_Hankel(l, r0, lambda);
+						}				
 						this->Fk.at(T).at(L).at(chi).at(k) -= (4*k+1)*lambda*pow(R_L0,2)*bslval*hnkval*pow(R_L1,2)*pow(r0,2)*pow(r1,2)*rab0*rab1;					
 					}
 				}
@@ -219,6 +212,69 @@ void DFTU_Yukawa::cal_slater_UJ(const int istep, const int iter)
 	return;
 }
 
+
+double DFTU_Yukawa::spherical_Bessel(const int k, const double r, const double lambda)
+{
+  TITLE("DFTU_Yukawa", "spherical_Bessel");
+  // Order-k value at x = r*lambda for k = 0, 2, 4, 6: truncated power series below a per-order cutoff, sinh/cosh closed form above it.
+  double val = 0.0; // any other k matches no branch below; return 0 instead of an indeterminate value
+  double x=r*lambda;
+  if(k==0)
+  {
+    if(x < 1.0e-3) val=1+pow(x,2)/6.0;
+    else val = sinh(x)/x;
+  }
+  else if(k==2)
+  {
+    if(x < 1.0e-2) val=-pow(x,2)/15.0 -pow(x,4)/210.0 - pow(x,6)/7560.0;
+    else val = 3*cosh(x)/pow(x,2) + (-3-pow(x,2))*sinh(x)/pow(x,3);
+  }
+  else if(k==4)
+  {
+    if(x < 5.0e-1) val=pow(x,4)/945.0 + pow(x,6)/20790.0 + pow(x,8)/1081080.0 + pow(x,10)/97297200.0;
+    else val = -5*(21+2*pow(x,2))*cosh(x)/pow(x,4)+(105+45*pow(x,2)+pow(x,4))*sinh(x)/pow(x,5);
+  }
+  else if(k==6)
+  {
+    if(x < 9.0e-1) val=-pow(x,6)/135135.0-pow(x,8)/4054050.0-pow(x,10)/275675400.0;
+    else val = 21*(495+60*pow(x,2)+pow(x,4))*cosh(x)/pow(x,6) + 
+              (-10395-4725*pow(x,2)-210*pow(x,4)-pow(x,6))*sinh(x)/pow(x,7);
+  }
+  return val;
+}
+
+
+double DFTU_Yukawa::spherical_Hankel(const int k, const double r, const double lambda)
+{
+  TITLE("DFTU_Yukawa", "spherical_Hankel");
+  // Order-k value at x = r*lambda for k = 0, 2, 4, 6: truncated series below a per-order cutoff, exp(-x) closed form above it. Every branch divides by a power of x, so x must be nonzero.
+  double val = 0.0; // any other k matches no branch below; return 0 instead of an indeterminate value
+  double x=r*lambda;
+  if(k==0)
+  {
+    if(x < 1.0e-3) val=-1/x + 1 -x/2.0 + pow(x,2)/6.0;
+    else val = -exp(-x)/x;
+  }
+  else if(k==2)
+  {
+    if(x < 1.0e-2) val=3/pow(x,3)-1/(2*x)+x/8-pow(x,2)/15.0+pow(x,3)/48.0;
+    else val = exp(-x)*(3+3*x+pow(x,2))/pow(x,3);
+  }
+  else if(k==4)
+  {
+    if(x < 5.0e-1) val=-105/pow(x,5) + 15/(2*pow(x,3)) - 3/(8*x) + x/48 - pow(x,3)/384.0+pow(x,4)/945.0;
+    else val = -exp(-x)*(105+105*x+45*pow(x,2)+10*pow(x,3)+pow(x,4))/pow(x,5);
+  }
+  else if(k==6)
+  {
+    if(x < 9.0e-1) val=10395/pow(x,7) - 945/(2*pow(x,5)) + 105/(8*pow(x,3)) -5/(16*x)
+                        +x/128.0-pow(x,3)/3840.0 + pow(x,5)/46080.0 - pow(x,6)/135135.0;
+    else val = exp(-x)*(10395+10395*x+4725*pow(x,2)+1260*pow(x,3)+210*pow(x,4) + 
+                21*pow(x,5)+pow(x,6))/pow(x,7);
+  }
+  return val;
+}
+
 /*
 void DFTU::cal_unscreened_slater_Fk(const int L, const int T)
 {
diff --git a/ABACUS.develop/source/src_lcao/dftu_yukawa.h b/ABACUS.develop/source/src_lcao/dftu_yukawa.h
index 8ed5791cd2..49c7813a1f 100644
--- a/ABACUS.develop/source/src_lcao/dftu_yukawa.h
+++ b/ABACUS.develop/source/src_lcao/dftu_yukawa.h
@@ -23,6 +23,9 @@ class DFTU_Yukawa
     void cal_yukawa_lambda();
     void cal_slater_UJ(const int istep, const int iter);
 
+    double spherical_Bessel(const int k, const double r, const double lambda);
+    double spherical_Hankel(const int k, const double r, const double lambda);
+
     //void cal_unscreened_slater_Fk(const int L, const int T); //L:angular momnet, T:atom type
     //void cal_slater_Vsc(const int T, const int L);
 
diff --git a/ABACUS.develop/source/src_lcao/lscc.f b/ABACUS.develop/source/src_lcao/lscc.f
deleted file mode 100644
index 492c941ca9..0000000000
--- a/ABACUS.develop/source/src_lcao/lscc.f
+++ /dev/null
@@ -1,87 +0,0 @@
-      subroutine sphbsl(n,r,A,val) 
-        integer :: n
-        real*8 :: r,A
-        real*8 :: x,val
-        x = r*A
-        if (n .eq. 0) then
-        
-          if ( x .lt. 1.d-3 ) then
-            val = 1 + x**2/6
-          else
-            val = dsinh(x)/x
-          end if
-        else if (n .eq. 2) then
-        
-          if ( x .lt. 1.d-2 ) then
-            val = -x**2/15 -x**4/210 - x**6/7560
-          else
-            val = 3*dcosh(x)/x**2 + (-3-x**2)*dsinh(x)/x**3
-          end if
-        
-        else if (n .eq. 4) then
-
-          if( x .lt. 5.d-1)then
-            val = x**4/945 + x**6/20790 + x**8/1081080 + x**10/97297200
-          else
-            val = -5*(21+2*x**2)*dcosh(x)/x**4+(105+45*x**2+x**4)*
-     &       dsinh(x)/x**5
-          end if
-        
-        else if (n .eq. 6) then
-        
-          if ( x .lt. 9.d-1) then
-            val = -x**6/135135-x**8/4054050-x**10/275675400
-          else
-            val = 21*(495+60*x**2+x**4)*dcosh(x)/x**6 +
-     &       (-10395-4725*x**2-210*x**4-x**6)*dsinh(x)/x**7
-          end if
-        
-        else
-        end if
-      END subroutine sphbsl
-
-      subroutine sphhnk(n,r,A,val)
-        integer :: n
-        real*8 :: r,A
-        real*8 :: x,val
-        x = r*A
-        if (n .eq. 0) then
-        
-          if ( x .lt. 1.d-3 ) then
-            val = -1/x + 1 -x/2 + x**2/6
-          else
-            val = -dexp(-x)/x
-          endif
-        
-        else if (n .eq. 2) then
-        
-          if ( x .lt. 1.d-2) then
-            val = 3/x**3-1/(2*x)+x/8-x**2/15+x**3/48
-          else
-            val = dexp(-x)*(3+3*x+x**2)/x**3
-          endif
-        
-        else if (n .eq. 4) then
-        
-          if (x .lt. 5.d-1) then
-            val = -105/x**5 + 15/(2*x**3) - 3/(8*x) + x/48 - x**3/384 
-     &        +x**4/945
-          else
-            val = -dexp(-x)*(105+105*x+45*x**2+10*x**3+x**4)/x**5
-          endif
-        
-        else if (n .eq. 6) then
-
-          if (x .lt. 9.d-1) then
-            val = 10395/x**7 - 945/(2*x**5) + 105/(8*x**3) -5/(16*x) + 
-     &            x/128-x**3/3840 + x**5/46080 - x**6/135135
-          else
-            val = dexp(-x)*(10395+10395*x+4725*x**2+1260*x**3+210*x**
-     &       4+21*x**5+x**6)/x**7
-          endif
-        
-        else
-        endif
-      END SUBROUTINE sphhnk
-
-