From 3017ab7c7c2ec305ec7e97f64d7a188c0838e53e Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Wed, 17 Mar 2021 08:46:48 +0800
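Subject: [PATCH 1/7] add deepmd namespace for lib

Put the lib declarations (e.g. Region, InputNlist, the *_cpu kernels
and the switcher functions) into namespace deepmd, and qualify the
matching definitions and explicit template instantiations in
source/lib/src with deepmd::. Existing unqualified callers such as
ComputeDescriptor.h and the legacy env_mat_a/env_mat_r now call
deepmd::spline5_switch.

The SimulationRegion-based functions stay in the global namespace,
grouped under a "legacy code" banner in env_mat.h, fmt_nlist.h and
neighbor_list.h.

Cleanups included in the same change:
- delete the Stopwatch.h header
- move struct NeighborInfo from fmt_nlist.h into fmt_nlist.cc
- drop the commented-out spline5_switch template from switcher.h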
---
 source/lib/include/ComputeDescriptor.h        |  6 +-
 source/lib/include/Stopwatch.h                | 54 --------------
 source/lib/include/coord.h                    |  8 +-
 source/lib/include/env_mat.h                  | 61 ++++++++-------
 source/lib/include/ewald.h                    |  5 +-
 source/lib/include/fmt_nlist.h                | 33 +++------
 source/lib/include/gelu.h                     |  4 +
 source/lib/include/map_aparam.h               |  3 +
 source/lib/include/neighbor_list.h            | 28 ++++---
 source/lib/include/pair_tab.h                 |  3 +
 source/lib/include/prod_env_mat.h             |  4 +
 source/lib/include/prod_force.h               |  6 +-
 source/lib/include/prod_force_grad.h          |  3 +
 source/lib/include/prod_virial.h              |  4 +
 source/lib/include/prod_virial_grad.h         |  3 +
 source/lib/include/region.h                   |  4 +
 source/lib/include/soft_min_switch.h          |  4 +
 source/lib/include/soft_min_switch_force.h    |  4 +
 .../lib/include/soft_min_switch_force_grad.h  |  4 +
 source/lib/include/soft_min_switch_virial.h   |  3 +
 .../lib/include/soft_min_switch_virial_grad.h |  3 +
 source/lib/include/switcher.h                 | 28 +------
 source/lib/src/coord.cc                       | 16 +++-
 source/lib/src/env_mat.cc                     | 32 +++++---
 source/lib/src/ewald.cc                       |  5 ++
 source/lib/src/fmt_nlist.cc                   | 40 ++++++++--
 source/lib/src/gelu.cc                        | 18 ++---
 source/lib/src/map_aparam.cc                  |  6 +-
 source/lib/src/neighbor_list.cc               |  8 +-
 source/lib/src/pair_tab.cc                    |  7 +-
 source/lib/src/prod_env_mat.cc                | 26 +++++--
 source/lib/src/prod_force.cc                  | 24 ++++--
 source/lib/src/prod_force_grad.cc             | 24 ++++--
 source/lib/src/prod_virial.cc                 | 24 ++++--
 source/lib/src/prod_virial_grad.cc            | 24 ++++--
 source/lib/src/region.cc                      | 36 ++++++---
 source/lib/src/soft_min_switch.cc             |  8 +-
 source/lib/src/soft_min_switch_force.cc       |  6 +-
 source/lib/src/soft_min_switch_force_grad.cc  |  6 +-
 source/lib/src/soft_min_switch_virial.cc      |  6 +-
 source/lib/src/soft_min_switch_virial_grad.cc |  6 +-
 source/lib/tests/test_coord.cc                | 10 +--
 source/lib/tests/test_env_mat_a.cc            | 26 +++----
 source/lib/tests/test_env_mat_r.cc            | 18 ++---
 source/lib/tests/test_ewald.cc                |  4 +-
 source/lib/tests/test_fmt_nlist.cc            |  4 +-
 source/lib/tests/test_gelu.cc                 |  8 +-
 source/lib/tests/test_map_aparam.cc           |  2 +-
 source/lib/tests/test_neighbor_list.cc        |  4 +-
 source/lib/tests/test_pair_tab.cc             | 74 +++++++++----------
 source/lib/tests/test_prod_force_a.cc         |  6 +-
 source/lib/tests/test_prod_force_grad_a.cc    |  4 +-
 source/lib/tests/test_prod_force_grad_r.cc    |  4 +-
 source/lib/tests/test_prod_force_r.cc         |  4 +-
 source/lib/tests/test_prod_virial_a.cc        |  6 +-
 source/lib/tests/test_prod_virial_grad_a.cc   |  4 +-
 source/lib/tests/test_prod_virial_grad_r.cc   |  4 +-
 source/lib/tests/test_prod_virial_r.cc        |  6 +-
 source/lib/tests/test_simulation_region.cc    |  2 +-
 source/lib/tests/test_soft_min_switch.cc      | 14 ++--
 .../lib/tests/test_soft_min_switch_force.cc   |  6 +-
 .../tests/test_soft_min_switch_force_grad.cc  |  6 +-
 .../lib/tests/test_soft_min_switch_virial.cc  |  6 +-
 .../tests/test_soft_min_switch_virial_grad.cc |  6 +-
 source/lmp/fix_dplr.cpp                       |  4 +-
 source/lmp/pair_deepmd.cpp                    |  2 +-
 source/lmp/pair_deepmd.h.in                   | 13 ----
 source/lmp/pppm_dplr.cpp                      | 13 ----
 source/lmp/pppm_dplr.h                        | 13 ----
 source/op/ewald_recp.cc                       |  4 +-
 source/op/gelu_multi_device.cc                |  8 +-
 source/op/map_aparam.cc                       |  2 +-
 source/op/pair_tab.cc                         |  2 +-
 source/op/prod_env_mat_multi_device.cc        | 20 ++---
 source/op/prod_force_se_a.cc                  | 15 ++--
 source/op/prod_force_se_a_grad.cc             | 13 ++--
 source/op/prod_force_se_r.cc                  | 15 ++--
 source/op/prod_force_se_r_grad.cc             |  2 +-
 source/op/prod_virial_se_a.cc                 | 19 ++---
 source/op/prod_virial_se_a_grad.cc            | 15 ++--
 source/op/prod_virial_se_r.cc                 |  2 +-
 source/op/prod_virial_se_r_grad.cc            |  2 +-
 source/op/soft_min.cc                         |  2 +-
 source/op/soft_min_force.cc                   |  2 +-
 source/op/soft_min_force_grad.cc              |  2 +-
 source/op/soft_min_virial.cc                  |  2 +-
 source/op/soft_min_virial_grad.cc             |  2 +-
 87 files changed, 547 insertions(+), 452 deletions(-)
 delete mode 100644 source/lib/include/Stopwatch.h

diff --git a/source/lib/include/ComputeDescriptor.h b/source/lib/include/ComputeDescriptor.h
index aaac53b657..7d4d6e0c3a 100644
--- a/source/lib/include/ComputeDescriptor.h
+++ b/source/lib/include/ComputeDescriptor.h
@@ -826,7 +826,7 @@ void compute_descriptor_se_a_extf (std::vector<double > &		descrpt_a,
       double inr4 = inr2 * inr2;
       double inr3 = inr4 * nr;
       double sw, dsw;
-      spline5_switch(sw, dsw, nr, rmin, rmax);
+      deepmd::spline5_switch(sw, dsw, nr, rmin, rmax);
       int idx_deriv = nei_iter * 4 * 3;	// 4 components time 3 directions
       int idx_value = nei_iter * 4;	// 4 components
       // projections
@@ -935,7 +935,7 @@ void compute_descriptor_se_a_ef_para (std::vector<double > &		descrpt_a,
       double inr4 = inr2 * inr2;
       double inr3 = inr4 * nr;
       double sw, dsw;
-      spline5_switch(sw, dsw, nr, rmin, rmax);
+      deepmd::spline5_switch(sw, dsw, nr, rmin, rmax);
       int idx_deriv = nei_iter * 4 * 3;	// 4 components time 3 directions
       int idx_value = nei_iter * 4;	// 4 components
       // projections
@@ -1043,7 +1043,7 @@ void compute_descriptor_se_a_ef_vert (std::vector<double > &		descrpt_a,
       double inr4 = inr2 * inr2;
       double inr3 = inr4 * nr;
       double sw, dsw;
-      spline5_switch(sw, dsw, nr, rmin, rmax);
+      deepmd::spline5_switch(sw, dsw, nr, rmin, rmax);
       int idx_deriv = nei_iter * 4 * 3;	// 4 components time 3 directions
       int idx_value = nei_iter * 4;	// 4 components
       // projections
diff --git a/source/lib/include/Stopwatch.h b/source/lib/include/Stopwatch.h
deleted file mode 100644
index 0abd73bcf1..0000000000
--- a/source/lib/include/Stopwatch.h
+++ /dev/null
@@ -1,54 +0,0 @@
-#ifndef __Stopwatch_h_wanghan__
-#define __Stopwatch_h_wanghan__
-
-#include <sys/param.h>
-#include <sys/times.h>
-#include <sys/types.h>
-
-class Stopwatch 
-{ 
-public: 
-  Stopwatch(): HZi (1./HZ) {}; 
- 
-  void              start(); 
-  void              stop(); 
-  double            system() const; 
-  double            user() const; 
-  double            real() const; 
- 
-  static double     resolution() {return 1./HZ;}; 
-private:
-  struct tms tic, toc;
-  long r1, r0;
-  double HZi;
-};               
-
-inline double Stopwatch::user () const
-{
-  return (double)(toc.tms_utime - tic.tms_utime) * HZi;
-}
-
-inline double Stopwatch::system () const
-{
-  return (double)(toc.tms_stime - tic.tms_stime) * HZi;
-}
-
-inline double Stopwatch::real () const
-{
-  return (double)(r1 - r0) * HZi;
-}
-
-inline void Stopwatch::stop ()
-{
-  r1 = times (&toc);
-}
-
-inline void Stopwatch::start() 
-{
-  r0 = times (&tic);
-}
-
-
-#endif
-// end of file 
-
diff --git a/source/lib/include/coord.h b/source/lib/include/coord.h
index dc81efc1b2..0da8eedd23 100644
--- a/source/lib/include/coord.h
+++ b/source/lib/include/coord.h
@@ -2,13 +2,15 @@
 
 #include "region.h"
 
+namespace deepmd{
+
 // normalize coords
 template <typename FPTYPE>
 void
 normalize_coord_cpu(
     FPTYPE * coord,
     const int natom,
-    const Region<FPTYPE> & region);
+    const deepmd::Region<FPTYPE> & region);
 
 // copy coordinates
 // outputs:
@@ -32,4 +34,6 @@ copy_coord_cpu(
     const int & nloc,
     const int & mem_nall,
     const float & rcut,
-    const Region<FPTYPE> & region);
+    const deepmd::Region<FPTYPE> & region);
+
+}
diff --git a/source/lib/include/env_mat.h b/source/lib/include/env_mat.h
index 1217051f70..b94e683027 100644
--- a/source/lib/include/env_mat.h
+++ b/source/lib/include/env_mat.h
@@ -1,6 +1,41 @@
 #pragma once
 
 #include <vector>
+
+namespace deepmd{
+
+template<typename FPTYPE> 
+void env_mat_a_cpu (
+    std::vector<FPTYPE > &	        descrpt_a,
+    std::vector<FPTYPE > &	        descrpt_a_deriv,
+    std::vector<FPTYPE > &	        rij_a,
+    const std::vector<FPTYPE > &	posi,
+    const std::vector<int > &		type,
+    const int &				i_idx,
+    const std::vector<int > &		fmt_nlist,
+    const std::vector<int > &		sec, 
+    const float &			rmin,
+    const float &			rmax) ;
+
+template<typename FPTYPE> 
+void env_mat_r_cpu (
+    std::vector<FPTYPE > &	        descrpt_a,
+    std::vector<FPTYPE > &	        descrpt_a_deriv,
+    std::vector<FPTYPE > &	        rij_a,
+    const std::vector<FPTYPE > &	posi,
+    const std::vector<int > &		type,
+    const int &				i_idx,
+    const std::vector<int > &		fmt_nlist_a,
+    const std::vector<int > &		sec_a, 
+    const float &			rmin,
+    const float &			rmax);
+
+}
+
+////////////////////////////////////////////////////////
+// legacy code
+////////////////////////////////////////////////////////
+
 #include "SimulationRegion.h"
 
 void env_mat_a (
@@ -18,19 +53,6 @@ void env_mat_a (
     const double &			rmin,
     const double &			rmax);
 
-template<typename FPTYPE> 
-void env_mat_a_cpu (
-    std::vector<FPTYPE > &	        descrpt_a,
-    std::vector<FPTYPE > &	        descrpt_a_deriv,
-    std::vector<FPTYPE > &	        rij_a,
-    const std::vector<FPTYPE > &	posi,
-    const std::vector<int > &		type,
-    const int &				i_idx,
-    const std::vector<int > &		fmt_nlist,
-    const std::vector<int > &		sec, 
-    const float &			rmin,
-    const float &			rmax) ;
-
 void env_mat_r (
     std::vector<double > &		descrpt_r,
     std::vector<double > &		descrpt_r_deriv,
@@ -46,16 +68,3 @@ void env_mat_r (
     const double &			rmin, 
     const double &			rmax);
 
-template<typename FPTYPE> 
-void env_mat_r_cpu (
-    std::vector<FPTYPE > &	        descrpt_a,
-    std::vector<FPTYPE > &	        descrpt_a_deriv,
-    std::vector<FPTYPE > &	        rij_a,
-    const std::vector<FPTYPE > &	posi,
-    const std::vector<int > &		type,
-    const int &				i_idx,
-    const std::vector<int > &		fmt_nlist_a,
-    const std::vector<int > &		sec_a, 
-    const float &			rmin,
-    const float &			rmax);
-
diff --git a/source/lib/include/ewald.h b/source/lib/include/ewald.h
index 10d400e62a..bce151365a 100644
--- a/source/lib/include/ewald.h
+++ b/source/lib/include/ewald.h
@@ -7,6 +7,8 @@
 #include "utilities.h"
 #include "region.h"
 
+namespace deepmd{
+
 // 8.988e9 / pc.electron_volt / pc.angstrom * (1.602e-19)**2
 const double ElectrostaticConvertion = 14.39964535475696995031;
 
@@ -29,6 +31,7 @@ ewald_recp(
     std::vector<VALUETYPE> &		virial,
     const std::vector<VALUETYPE>&	coord,
     const std::vector<VALUETYPE>&	charge,
-    const Region<VALUETYPE>&		region, 
+    const deepmd::Region<VALUETYPE>&	region, 
     const EwaldParameters<VALUETYPE>&	param);
 
+}
diff --git a/source/lib/include/fmt_nlist.h b/source/lib/include/fmt_nlist.h
index 3b8a30eefe..13c9082240 100644
--- a/source/lib/include/fmt_nlist.h
+++ b/source/lib/include/fmt_nlist.h
@@ -2,7 +2,8 @@
 
 #include <vector>
 #include "neighbor_list.h"
-#include "SimulationRegion.h"
+
+namespace deepmd{
 
 template <typename FPTYPE>
 void format_nlist_cpu(
@@ -15,6 +16,15 @@ void format_nlist_cpu(
     const float rcut, 
     const std::vector<int> sec);
 
+}
+
+
+////////////////////////////////////////////////////////
+// legacy code
+////////////////////////////////////////////////////////
+
+#include "SimulationRegion.h"
+
 // return:	-1	OK
 //		> 0	the type of unsuccessful neighbor list
 int format_nlist_i_fill_a (
@@ -45,24 +55,3 @@ int format_nlist_i_cpu (
 
 
 
-struct NeighborInfo 
-{
-  int type;
-  double dist;
-  int index;
-  NeighborInfo () 
-      : type (0), dist(0), index(0) 
-      {
-      }
-  NeighborInfo (int tt, double dd, int ii) 
-      : type (tt), dist(dd), index(ii) 
-      {
-      }
-  bool operator < (const NeighborInfo & b) const 
-      {
-	return (type < b.type || 
-		(type == b.type && 
-		 (dist < b.dist || 
-		  (dist == b.dist && index < b.index) ) ) );
-      }
-};
diff --git a/source/lib/include/gelu.h b/source/lib/include/gelu.h
index edf1729c4f..20f2d96de5 100644
--- a/source/lib/include/gelu.h
+++ b/source/lib/include/gelu.h
@@ -1,5 +1,7 @@
 #pragma once
 
+namespace deepmd{
+
 template<typename FPTYPE>
 void gelu_cpu(
     FPTYPE * out, 
@@ -43,3 +45,5 @@ void gelu_grad_grad_gpu_cuda(
     const FPTYPE * dy_2,
     const int size);
 #endif // GOOGLE_CUDA
+
+}
diff --git a/source/lib/include/map_aparam.h b/source/lib/include/map_aparam.h
index b209d8314a..3ee3d1dc12 100644
--- a/source/lib/include/map_aparam.h
+++ b/source/lib/include/map_aparam.h
@@ -1,5 +1,7 @@
 #pragma once
 
+namespace deepmd{
+  
 template <typename FPTYPE>
 void map_aparam_cpu (
     FPTYPE * output,
@@ -10,3 +12,4 @@ void map_aparam_cpu (
     const int & numb_aparam
     );
 
+}
diff --git a/source/lib/include/neighbor_list.h b/source/lib/include/neighbor_list.h
index ef4013c7d7..ddcf64846c 100644
--- a/source/lib/include/neighbor_list.h
+++ b/source/lib/include/neighbor_list.h
@@ -9,6 +9,8 @@
 #include "utilities.h"
 #include "SimulationRegion.h"
 
+namespace deepmd{
+
 // format of the input neighbor list
 struct InputNlist
 {
@@ -61,6 +63,22 @@ build_nlist_cpu(
     const int & mem_size,
     const float & rcut);
 
+#if GOOGLE_CUDA
+void convert_nlist_gpu_cuda(
+    InputNlist & gpu_nlist,
+    InputNlist & cpu_nlist,
+    int* & gpu_memory,
+    const int & max_nbor_size);
+
+void free_nlist_gpu_cuda(InputNlist & gpu_nlist);
+#endif // GOOGLE_CUDA
+
+} // namespace deepmd
+
+
+////////////////////////////////////////////////////////
+// legacy code
+////////////////////////////////////////////////////////
 
 // build nlist by an extended grid
 void
@@ -121,13 +139,3 @@ copy_coord (std::vector<double > &		out_c,
 	    const std::vector<int > &		in_t,
 	    const double &			rc,
 	    const SimulationRegion<double > &	region);
-
-#if GOOGLE_CUDA
-void convert_nlist_gpu_cuda(
-    InputNlist & gpu_nlist,
-    InputNlist & cpu_nlist,
-    int* & gpu_memory,
-    const int & max_nbor_size);
-
-void free_nlist_gpu_cuda(InputNlist & gpu_nlist);
-#endif // GOOGLE_CUDA
diff --git a/source/lib/include/pair_tab.h b/source/lib/include/pair_tab.h
index 9176b43c0c..db05b68df2 100644
--- a/source/lib/include/pair_tab.h
+++ b/source/lib/include/pair_tab.h
@@ -1,5 +1,7 @@
 #pragma once
 
+namespace deepmd{
+  
 template<typename FPTYPE>
 void pair_tab_cpu(
     FPTYPE * energy,
@@ -16,3 +18,4 @@ void pair_tab_cpu(
     const std::vector<int> & sel_r
     );
 
+}
diff --git a/source/lib/include/prod_env_mat.h b/source/lib/include/prod_env_mat.h
index 6b866c5590..0789bf75a6 100644
--- a/source/lib/include/prod_env_mat.h
+++ b/source/lib/include/prod_env_mat.h
@@ -3,6 +3,8 @@
 #include "device.h"
 #include "neighbor_list.h"
 
+namespace deepmd{
+
 template<typename FPTYPE>
 void prod_env_mat_a_cpu(
     FPTYPE * em, 
@@ -89,3 +91,5 @@ void env_mat_nbor_update(
     const int size);
 #endif // GOOGLE_CUDA
 
+}
+
diff --git a/source/lib/include/prod_force.h b/source/lib/include/prod_force.h
index 9236f7802f..1667edb61c 100644
--- a/source/lib/include/prod_force.h
+++ b/source/lib/include/prod_force.h
@@ -1,5 +1,7 @@
 #pragma once
 
+namespace deepmd{
+
 template<typename FPTYPE>
 void prod_force_a_cpu(
     FPTYPE * force, 
@@ -40,4 +42,6 @@ void prod_force_r_gpu_cuda(
     const int nloc, 
     const int nall, 
     const int nnei);
-#endif // GOOGLE_CUDA
\ No newline at end of file
+#endif // GOOGLE_CUDA
+
+}
diff --git a/source/lib/include/prod_force_grad.h b/source/lib/include/prod_force_grad.h
index d191fe76fa..b4b95f2ac3 100644
--- a/source/lib/include/prod_force_grad.h
+++ b/source/lib/include/prod_force_grad.h
@@ -1,5 +1,7 @@
 #pragma once
 
+namespace deepmd{
+  
 template<typename FPTYPE>
 void prod_force_grad_a_cpu(
     FPTYPE * grad_net, 
@@ -18,3 +20,4 @@ void prod_force_grad_r_cpu(
     const int nloc, 
     const int nnei);
 
+}
diff --git a/source/lib/include/prod_virial.h b/source/lib/include/prod_virial.h
index 5a5c9ad996..6655059e12 100644
--- a/source/lib/include/prod_virial.h
+++ b/source/lib/include/prod_virial.h
@@ -1,5 +1,7 @@
 #pragma once
 
+namespace deepmd{
+
 template<typename FPTYPE>
 void prod_virial_a_cpu(
     FPTYPE * virial, 
@@ -49,3 +51,5 @@ void prod_virial_r_gpu_cuda(
     const int nall, 
     const int nnei);
 #endif // GOOGLE_CUDA
+
+} //namespace deepmd
diff --git a/source/lib/include/prod_virial_grad.h b/source/lib/include/prod_virial_grad.h
index 2ba8c01a44..ab0f84ffec 100644
--- a/source/lib/include/prod_virial_grad.h
+++ b/source/lib/include/prod_virial_grad.h
@@ -1,5 +1,7 @@
 #pragma once
 
+namespace deepmd{
+  
 template<typename FPTYPE>
 void prod_virial_grad_a_cpu(
     FPTYPE * grad_net,
@@ -20,3 +22,4 @@ void prod_virial_grad_r_cpu(
     const int nloc,
     const int nnei);
 
+}
diff --git a/source/lib/include/region.h b/source/lib/include/region.h
index 049ef882cc..fd24bd6b4d 100644
--- a/source/lib/include/region.h
+++ b/source/lib/include/region.h
@@ -1,5 +1,7 @@
 #pragma once
 
+namespace deepmd{
+
 template<typename FPTYPE>
 struct Region
 {
@@ -34,3 +36,5 @@ convert_to_phys_cpu(
     const Region<FPTYPE> & region,
     const FPTYPE * ri);
 
+}
+
diff --git a/source/lib/include/soft_min_switch.h b/source/lib/include/soft_min_switch.h
index 9d0b20a1c5..4b382cde93 100644
--- a/source/lib/include/soft_min_switch.h
+++ b/source/lib/include/soft_min_switch.h
@@ -1,5 +1,7 @@
 #pragma once
 
+namespace deepmd{
+  
 template <typename FPTYPE>
 void soft_min_switch_cpu(
     FPTYPE * sw_value,
@@ -11,3 +13,5 @@ void soft_min_switch_cpu(
     const FPTYPE & alpha,
     const FPTYPE & rmin,
     const FPTYPE & rmax);
+
+}
diff --git a/source/lib/include/soft_min_switch_force.h b/source/lib/include/soft_min_switch_force.h
index dfcb47ca52..854458a3c7 100644
--- a/source/lib/include/soft_min_switch_force.h
+++ b/source/lib/include/soft_min_switch_force.h
@@ -1,5 +1,7 @@
 #pragma once
 
+namespace deepmd{
+  
 template<typename FPTYPE>
 void soft_min_switch_force_cpu(
     FPTYPE * force, 
@@ -9,3 +11,5 @@ void soft_min_switch_force_cpu(
     const int nloc, 
     const int nall, 
     const int nnei);
+
+}
diff --git a/source/lib/include/soft_min_switch_force_grad.h b/source/lib/include/soft_min_switch_force_grad.h
index 329ca6e66d..afe4c3b36e 100644
--- a/source/lib/include/soft_min_switch_force_grad.h
+++ b/source/lib/include/soft_min_switch_force_grad.h
@@ -1,5 +1,7 @@
 #pragma once
 
+namespace deepmd{
+  
 template<typename FPTYPE>
 void soft_min_switch_force_grad_cpu(
     FPTYPE * grad_net, 
@@ -8,3 +10,5 @@ void soft_min_switch_force_grad_cpu(
     const int * nlist, 
     const int nloc, 
     const int nnei);
+
+}
diff --git a/source/lib/include/soft_min_switch_virial.h b/source/lib/include/soft_min_switch_virial.h
index 8e2dd1de04..4833eec262 100644
--- a/source/lib/include/soft_min_switch_virial.h
+++ b/source/lib/include/soft_min_switch_virial.h
@@ -1,5 +1,7 @@
 #pragma once
 
+namespace deepmd{
+  
 template<typename FPTYPE>
 void soft_min_switch_virial_cpu(
     FPTYPE * virial, 
@@ -12,3 +14,4 @@ void soft_min_switch_virial_cpu(
     const int nall, 
     const int nnei);
 
+}
diff --git a/source/lib/include/soft_min_switch_virial_grad.h b/source/lib/include/soft_min_switch_virial_grad.h
index 4e4ed1514e..1b1ec0da44 100644
--- a/source/lib/include/soft_min_switch_virial_grad.h
+++ b/source/lib/include/soft_min_switch_virial_grad.h
@@ -1,5 +1,7 @@
 #pragma once
 
+namespace deepmd{
+  
 template<typename FPTYPE>
 void soft_min_switch_virial_grad_cpu(
     FPTYPE * grad_net, 
@@ -10,3 +12,4 @@ void soft_min_switch_virial_grad_cpu(
     const int nloc, 
     const int nnei);
 
+}
diff --git a/source/lib/include/switcher.h b/source/lib/include/switcher.h
index 671c8be137..606b3c3196 100644
--- a/source/lib/include/switcher.h
+++ b/source/lib/include/switcher.h
@@ -1,5 +1,7 @@
 #pragma once
 
+namespace deepmd{
+
 inline double
 cos_switch (const double & xx, 
 	    const double & rmin, 
@@ -64,30 +66,6 @@ spline3_switch (double & vv,
   }
 }
 
-// template <typename TYPE>
-// inline void
-// spline5_switch (TYPE & vv,
-// 		TYPE & dd,
-// 		const TYPE & xx, 
-// 		const TYPE & rmin, 
-// 		const TYPE & rmax) 
-// {
-//   if (xx < rmin) {
-//     dd = 0;
-//     vv = 1;
-//   }
-//   else if (xx < rmax) {
-//     double uu = (xx - rmin) / (rmax - rmin) ;
-//     double du = 1. / (rmax - rmin) ;
-//     vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1;
-//     dd = ( 3 * uu*uu * (-6 * uu*uu + 15 * uu - 10) + uu*uu*uu * (-12 * uu + 15) ) * du;
-//   }
-//   else {
-//     dd = 0;
-//     vv = 0;
-//   }
-// }
-
 template <typename FPTYPE>
 inline void 
 spline5_switch (
@@ -112,3 +90,5 @@ spline5_switch (
     vv = 0;
   }
 }
+
+}
diff --git a/source/lib/src/coord.cc b/source/lib/src/coord.cc
index fb57b2c403..81c44f4925 100644
--- a/source/lib/src/coord.cc
+++ b/source/lib/src/coord.cc
@@ -3,9 +3,12 @@
 #include "SimulationRegion.h"
 #include <vector>
 
+using namespace deepmd;
+
 // normalize coords
 template <typename FPTYPE>
 void
+deepmd::
 normalize_coord_cpu(
     FPTYPE * coord,
     const int natom,
@@ -25,6 +28,7 @@ normalize_coord_cpu(
 
 template <typename FPTYPE>
 int
+deepmd::
 copy_coord_cpu(
     FPTYPE * out_c,
     int * out_t,
@@ -67,20 +71,23 @@ copy_coord_cpu(
 
 template
 void
+deepmd::
 normalize_coord_cpu<double>(
     double * coord,
     const int natom,
-    const Region<double> & region);
+    const deepmd::Region<double> & region);
 
 template
 void
+deepmd::
 normalize_coord_cpu<float>(
     float * coord,
     const int natom,
-    const Region<float> & region);
+    const deepmd::Region<float> & region);
 
 template
 int
+deepmd::
 copy_coord_cpu<double>(
     double * out_c,
     int * out_t,
@@ -91,10 +98,11 @@ copy_coord_cpu<double>(
     const int & nloc,
     const int & mem_nall,
     const float & rcut,
-    const Region<double> & region);
+    const deepmd::Region<double> & region);
 
 template
 int
+deepmd::
 copy_coord_cpu<float>(
     float * out_c,
     int * out_t,
@@ -105,7 +113,7 @@ copy_coord_cpu<float>(
     const int & nloc,
     const int & mem_nall,
     const float & rcut,
-    const Region<float> & region);
+    const deepmd::Region<float> & region);
 
 
 
diff --git a/source/lib/src/env_mat.cc b/source/lib/src/env_mat.cc
index c9cadb4b0d..52398a17a8 100644
--- a/source/lib/src/env_mat.cc
+++ b/source/lib/src/env_mat.cc
@@ -57,7 +57,7 @@ void env_mat_a (
       double inr4 = inr2 * inr2;
       double inr3 = inr4 * nr;
       double sw, dsw;
-      spline5_switch(sw, dsw, nr, rmin, rmax);
+      deepmd::spline5_switch(sw, dsw, nr, rmin, rmax);
       int idx_deriv = nei_iter * 4 * 3;	// 4 components time 3 directions
       int idx_value = nei_iter * 4;	// 4 components
       // 4 value components
@@ -92,7 +92,9 @@ void env_mat_a (
 
 
 template<typename FPTYPE> 
-void env_mat_a_cpu (
+void 
+deepmd::
+env_mat_a_cpu (
     std::vector<FPTYPE > &	        descrpt_a,
     std::vector<FPTYPE > &	        descrpt_a_deriv,
     std::vector<FPTYPE > &	        rij_a,
@@ -134,7 +136,7 @@ void env_mat_a_cpu (
             FPTYPE inr4 = inr2 * inr2;
             FPTYPE inr3 = inr4 * nr;
             FPTYPE sw, dsw;
-            spline5_switch(sw, dsw, nr, rmin, rmax);
+            deepmd::spline5_switch(sw, dsw, nr, rmin, rmax);
             int idx_deriv = nei_iter * 4 * 3;	// 4 components time 3 directions
             int idx_value = nei_iter * 4;	// 4 components
             // 4 value components
@@ -222,7 +224,7 @@ void env_mat_r (
       double inr4 = inr2 * inr2;
       double inr3 = inr4 * nr;
       double sw, dsw;
-      spline5_switch(sw, dsw, nr, rmin, rmax);
+      deepmd::spline5_switch(sw, dsw, nr, rmin, rmax);
       int idx_deriv = nei_iter * 3;	// 1 components time 3 directions
       int idx_value = nei_iter;		// 1 components
       // value components
@@ -238,7 +240,9 @@ void env_mat_r (
 }
 
 template<typename FPTYPE> 
-void env_mat_r_cpu (
+void 
+deepmd::
+env_mat_r_cpu (
     std::vector<FPTYPE > &		descrpt_a,
     std::vector<FPTYPE > &	        descrpt_a_deriv,
     std::vector<FPTYPE > &	        rij_a,
@@ -281,7 +285,7 @@ void env_mat_r_cpu (
             FPTYPE inr4 = inr2 * inr2;
             FPTYPE inr3 = inr4 * nr;
             FPTYPE sw, dsw;
-            spline5_switch(sw, dsw, nr, rmin, rmax);
+            deepmd::spline5_switch(sw, dsw, nr, rmin, rmax);
             int idx_deriv = nei_iter * 3;	// 1 components time 3 directions
             int idx_value = nei_iter;	    // 1 components
             // 4 value components
@@ -298,7 +302,9 @@ void env_mat_r_cpu (
 
 
 template
-void env_mat_a_cpu<double> (
+void 
+deepmd::
+env_mat_a_cpu<double> (
     std::vector<double > &	        descrpt_a,
     std::vector<double > &	        descrpt_a_deriv,
     std::vector<double > &	        rij_a,
@@ -312,7 +318,9 @@ void env_mat_a_cpu<double> (
 
 
 template
-void env_mat_a_cpu<float> (
+void 
+deepmd::
+env_mat_a_cpu<float> (
     std::vector<float > &	        descrpt_a,
     std::vector<float > &	        descrpt_a_deriv,
     std::vector<float > &	        rij_a,
@@ -326,7 +334,9 @@ void env_mat_a_cpu<float> (
 
 
 template
-void env_mat_r_cpu<double> (
+void 
+deepmd::
+env_mat_r_cpu<double> (
     std::vector<double > &	        descrpt_r,
     std::vector<double > &	        descrpt_r_deriv,
     std::vector<double > &	        rij_r,
@@ -340,7 +350,9 @@ void env_mat_r_cpu<double> (
 
 
 template
-void env_mat_r_cpu<float> (
+void 
+deepmd::
+env_mat_r_cpu<float> (
     std::vector<float > &	        descrpt_r,
     std::vector<float > &	        descrpt_r_deriv,
     std::vector<float > &	        rij_r,
diff --git a/source/lib/src/ewald.cc b/source/lib/src/ewald.cc
index b12a4757e7..5942f6fedc 100644
--- a/source/lib/src/ewald.cc
+++ b/source/lib/src/ewald.cc
@@ -1,6 +1,8 @@
 #include "ewald.h"
 #include "SimulationRegion.h"
 
+using namespace deepmd;
+
 template<typename VALUETYPE> 
 VALUETYPE
 dir_err_esti(const VALUETYPE & test_q,
@@ -86,6 +88,7 @@ cmpt_k(std::vector<int> & KK,
 // inputs: coordinates charges region
 template <typename VALUETYPE>
 void 
+deepmd::
 ewald_recp(
     VALUETYPE &				ener, 
     std::vector<VALUETYPE> &		force,
@@ -266,6 +269,7 @@ ewald_recp(
 
 template
 void 
+deepmd::
 ewald_recp<float>(
     float &				ener, 
     std::vector<float> &		force,
@@ -277,6 +281,7 @@ ewald_recp<float>(
 
 template
 void 
+deepmd::
 ewald_recp<double>(
     double &				ener, 
     std::vector<double> &		force,
diff --git a/source/lib/src/fmt_nlist.cc b/source/lib/src/fmt_nlist.cc
index 0d9b921b1b..2c577c2f05 100644
--- a/source/lib/src/fmt_nlist.cc
+++ b/source/lib/src/fmt_nlist.cc
@@ -5,6 +5,30 @@
 #include "SimulationRegion.h"
 #include <iostream>
 
+using namespace deepmd;
+
+struct NeighborInfo 
+{
+  int type;
+  double dist;
+  int index;
+  NeighborInfo () 
+      : type (0), dist(0), index(0) 
+      {
+      }
+  NeighborInfo (int tt, double dd, int ii) 
+      : type (tt), dist(dd), index(ii) 
+      {
+      }
+  bool operator < (const NeighborInfo & b) const 
+      {
+	return (type < b.type || 
+		(type == b.type && 
+		 (dist < b.dist || 
+		  (dist == b.dist && index < b.index) ) ) );
+      }
+};
+
 int format_nlist_i_fill_a (
     std::vector<int > &			fmt_nei_idx_a,
     std::vector<int > &			fmt_nei_idx_r,
@@ -123,7 +147,9 @@ int format_nlist_i_cpu (
 }
 
 template<typename FPTYPE> 
-void format_nlist_cpu (
+void 
+deepmd::
+format_nlist_cpu (
     int * nlist,
     const InputNlist & in_nlist,
     const FPTYPE * coord, 
@@ -187,9 +213,11 @@ int format_nlist_i_cpu<float> (
     const std::vector<int > &   sec_a);
 
 template
-void format_nlist_cpu<double> (
+void 
+deepmd::
+format_nlist_cpu<double> (
     int * nlist,
-    const InputNlist & in_nlist,
+    const deepmd::InputNlist & in_nlist,
     const double * coord, 
     const int * type, 
     const int nloc, 
@@ -199,9 +227,11 @@ void format_nlist_cpu<double> (
 
 
 template
-void format_nlist_cpu<float> (
+void 
+deepmd::
+format_nlist_cpu<float> (
     int * nlist,
-    const InputNlist & in_nlist,
+    const deepmd::InputNlist & in_nlist,
     const float * coord, 
     const int * type, 
     const int nloc, 
diff --git a/source/lib/src/gelu.cc b/source/lib/src/gelu.cc
index b887df89de..c554da0578 100644
--- a/source/lib/src/gelu.cc
+++ b/source/lib/src/gelu.cc
@@ -3,7 +3,7 @@
 #include "device.h"
 
 template<typename FPTYPE>
-void gelu_cpu(
+void deepmd::gelu_cpu(
     FPTYPE * out, 
     const FPTYPE * xx, 
     const int size)
@@ -14,7 +14,7 @@ void gelu_cpu(
 }
 
 template<typename FPTYPE>
-void gelu_grad_cpu(
+void deepmd::gelu_grad_cpu(
     FPTYPE * out, 
     const FPTYPE * xx,
     const FPTYPE * dy, 
@@ -27,7 +27,7 @@ void gelu_grad_cpu(
 }
 
 template<typename FPTYPE>
-void gelu_grad_grad_cpu(
+void deepmd::gelu_grad_grad_cpu(
     FPTYPE * out,
     const FPTYPE * xx,
     const FPTYPE * dy, 
@@ -41,9 +41,9 @@ void gelu_grad_grad_cpu(
   }
 }
 
-template void gelu_cpu<float>(float * out, const float * x, const int size);
-template void gelu_cpu<double>(double * out, const double * x, const int size);
-template void gelu_grad_cpu<float>(float * out, const float * x, const float * dy, const int size);
-template void gelu_grad_cpu<double>(double * out, const double * x, const double * dy, const int size);
-template void gelu_grad_grad_cpu<float>(float * out, const float * x, const float * dy, const float * dy_2, const int size);
-template void gelu_grad_grad_cpu<double>(double * out, const double * x, const double * dy, const double * dy_2, const int size);
+template void deepmd::gelu_cpu<float>(float * out, const float * x, const int size);
+template void deepmd::gelu_cpu<double>(double * out, const double * x, const int size);
+template void deepmd::gelu_grad_cpu<float>(float * out, const float * x, const float * dy, const int size);
+template void deepmd::gelu_grad_cpu<double>(double * out, const double * x, const double * dy, const int size);
+template void deepmd::gelu_grad_grad_cpu<float>(float * out, const float * x, const float * dy, const float * dy_2, const int size);
+template void deepmd::gelu_grad_grad_cpu<double>(double * out, const double * x, const double * dy, const double * dy_2, const int size);
diff --git a/source/lib/src/map_aparam.cc b/source/lib/src/map_aparam.cc
index b7e9973d5f..7e60f1c3b8 100644
--- a/source/lib/src/map_aparam.cc
+++ b/source/lib/src/map_aparam.cc
@@ -1,7 +1,7 @@
 #include "map_aparam.h"
 
 template <typename FPTYPE>
-void map_aparam_cpu (
+void deepmd::map_aparam_cpu (
     FPTYPE * output,
     const FPTYPE * aparam,
     const int * nlist,
@@ -38,7 +38,7 @@ void map_aparam_cpu (
 }
 
 template
-void map_aparam_cpu<double> (
+void deepmd::map_aparam_cpu<double> (
     double * output,
     const double * aparam,
     const int * nlist,
@@ -48,7 +48,7 @@ void map_aparam_cpu<double> (
     );
 
 template
-void map_aparam_cpu<float> (
+void deepmd::map_aparam_cpu<float> (
     float * output,
     const float * aparam,
     const int * nlist,
diff --git a/source/lib/src/neighbor_list.cc b/source/lib/src/neighbor_list.cc
index 4223b50fe6..e426d63906 100644
--- a/source/lib/src/neighbor_list.cc
+++ b/source/lib/src/neighbor_list.cc
@@ -743,8 +743,10 @@ copy_coord (std::vector<double > & out_c,
   }
 }
 
+using namespace deepmd;
 
 void
+deepmd::
 convert_nlist(
     InputNlist & to_nlist,
     std::vector<std::vector<int> > & from_nlist
@@ -759,6 +761,7 @@ convert_nlist(
 }
 
 int
+deepmd::
 max_numneigh(
     const InputNlist & nlist
     )
@@ -772,6 +775,7 @@ max_numneigh(
 
 template <typename FPTYPE>
 int
+deepmd::
 build_nlist_cpu(
     InputNlist & nlist,
     int * max_list_size,
@@ -817,6 +821,7 @@ build_nlist_cpu(
 
 template
 int
+deepmd::
 build_nlist_cpu<double>(
     InputNlist & nlist,
     int * max_list_size,
@@ -828,6 +833,7 @@ build_nlist_cpu<double>(
 
 template
 int
+deepmd::
 build_nlist_cpu<float>(
     InputNlist & nlist,
     int * max_list_size,
@@ -868,4 +874,4 @@ void free_nlist_gpu_cuda(
   delete_device_memory(gpu_nlist.numneigh);
   delete_device_memory(gpu_nlist.firstneigh);
 }
-#endif // GOOGLE_CUDA
\ No newline at end of file
+#endif // GOOGLE_CUDA
diff --git a/source/lib/src/pair_tab.cc b/source/lib/src/pair_tab.cc
index d0a435b1b1..5137e17ac9 100644
--- a/source/lib/src/pair_tab.cc
+++ b/source/lib/src/pair_tab.cc
@@ -122,7 +122,8 @@ _cum_sum (
 }
 
 template<typename FPTYPE>
-void pair_tab_cpu(
+void 
+deepmd::pair_tab_cpu(
     FPTYPE * energy,
     FPTYPE * force,
     FPTYPE * virial,
@@ -211,7 +212,7 @@ void pair_tab_cpu(
 
 
 template
-void pair_tab_cpu<float>(
+void deepmd::pair_tab_cpu<float>(
     float * energy,
     float * force,
     float * virial,
@@ -227,7 +228,7 @@ void pair_tab_cpu<float>(
     );
 
 template
-void pair_tab_cpu<double>(
+void deepmd::pair_tab_cpu<double>(
     double * energy,
     double * force,
     double * virial,
diff --git a/source/lib/src/prod_env_mat.cc b/source/lib/src/prod_env_mat.cc
index a900b99a5c..597473021d 100644
--- a/source/lib/src/prod_env_mat.cc
+++ b/source/lib/src/prod_env_mat.cc
@@ -5,8 +5,12 @@
 #include "fmt_nlist.h"
 #include "env_mat.h"
 
+using namespace deepmd;
+
 template<typename FPTYPE>
-void prod_env_mat_a_cpu(
+void
+deepmd::
+prod_env_mat_a_cpu(
     FPTYPE * em, 
     FPTYPE * em_deriv, 
     FPTYPE * rij, 
@@ -88,7 +92,9 @@ void prod_env_mat_a_cpu(
 }
 
 template<typename FPTYPE>
-void prod_env_mat_r_cpu(
+void 
+deepmd::
+prod_env_mat_r_cpu(
     FPTYPE * em, 
     FPTYPE * em_deriv, 
     FPTYPE * rij, 
@@ -171,7 +177,9 @@ void prod_env_mat_r_cpu(
 
 
 template
-void prod_env_mat_a_cpu<double>(
+void 
+deepmd::
+prod_env_mat_a_cpu<double>(
     double * em, 
     double * em_deriv, 
     double * rij, 
@@ -189,7 +197,9 @@ void prod_env_mat_a_cpu<double>(
     const std::vector<int> sec);
 
 template
-void prod_env_mat_a_cpu<float>(
+void
+deepmd::
+prod_env_mat_a_cpu<float>(
     float * em, 
     float * em_deriv, 
     float * rij, 
@@ -207,7 +217,9 @@ void prod_env_mat_a_cpu<float>(
     const std::vector<int> sec);
 
 template
-void prod_env_mat_r_cpu<double>(
+void
+deepmd::
+prod_env_mat_r_cpu<double>(
     double * em, 
     double * em_deriv, 
     double * rij, 
@@ -225,7 +237,9 @@ void prod_env_mat_r_cpu<double>(
     const std::vector<int> sec);
 
 template
-void prod_env_mat_r_cpu<float>(
+void 
+deepmd::
+prod_env_mat_r_cpu<float>(
     float * em, 
     float * em_deriv, 
     float * rij, 
diff --git a/source/lib/src/prod_force.cc b/source/lib/src/prod_force.cc
index 16d153f9e1..ffe177e16c 100644
--- a/source/lib/src/prod_force.cc
+++ b/source/lib/src/prod_force.cc
@@ -20,7 +20,9 @@ make_index_range (
 
 
 template<typename FPTYPE>
-void prod_force_a_cpu(
+void 
+deepmd::
+prod_force_a_cpu(
     FPTYPE * force, 
     const FPTYPE * net_deriv, 
     const FPTYPE * env_deriv, 
@@ -56,7 +58,9 @@ void prod_force_a_cpu(
 }
 
 template
-void prod_force_a_cpu<double>(
+void 
+deepmd::
+prod_force_a_cpu<double>(
     double * force, 
     const double * net_deriv, 
     const double * env_deriv, 
@@ -66,7 +70,9 @@ void prod_force_a_cpu<double>(
     const int nnei);
 
 template
-void prod_force_a_cpu<float>(
+void 
+deepmd::
+prod_force_a_cpu<float>(
     float * force, 
     const float * net_deriv, 
     const float * env_deriv, 
@@ -77,7 +83,9 @@ void prod_force_a_cpu<float>(
 
 
 template<typename FPTYPE>
-void prod_force_r_cpu(
+void 
+deepmd::
+prod_force_r_cpu(
     FPTYPE * force, 
     const FPTYPE * net_deriv, 
     const FPTYPE * env_deriv, 
@@ -117,7 +125,9 @@ void prod_force_r_cpu(
 }
 
 template
-void prod_force_r_cpu<double>(
+void 
+deepmd::
+prod_force_r_cpu<double>(
     double * force, 
     const double * net_deriv, 
     const double * env_deriv, 
@@ -127,7 +137,9 @@ void prod_force_r_cpu<double>(
     const int nnei);
 
 template
-void prod_force_r_cpu<float>(
+void 
+deepmd::
+prod_force_r_cpu<float>(
     float * force, 
     const float * net_deriv, 
     const float * env_deriv, 
diff --git a/source/lib/src/prod_force_grad.cc b/source/lib/src/prod_force_grad.cc
index 49c510c8b5..7872ea5c55 100644
--- a/source/lib/src/prod_force_grad.cc
+++ b/source/lib/src/prod_force_grad.cc
@@ -21,7 +21,9 @@ make_index_range (
 
 
 template<typename FPTYPE>
-void prod_force_grad_a_cpu(
+void 
+deepmd::
+prod_force_grad_a_cpu(
     FPTYPE * grad_net, 
     const FPTYPE * grad, 
     const FPTYPE * env_deriv, 
@@ -67,7 +69,9 @@ void prod_force_grad_a_cpu(
 
 
 template
-void prod_force_grad_a_cpu<double>(
+void 
+deepmd::
+prod_force_grad_a_cpu<double>(
     double * grad_net, 
     const double * grad, 
     const double * env_deriv, 
@@ -76,7 +80,9 @@ void prod_force_grad_a_cpu<double>(
     const int nnei) ;
 
 template
-void prod_force_grad_a_cpu<float>(
+void 
+deepmd::
+prod_force_grad_a_cpu<float>(
     float * grad_net, 
     const float * grad, 
     const float * env_deriv, 
@@ -87,7 +93,9 @@ void prod_force_grad_a_cpu<float>(
 
 
 template<typename FPTYPE>
-void prod_force_grad_r_cpu(
+void 
+deepmd::
+prod_force_grad_r_cpu(
     FPTYPE * grad_net, 
     const FPTYPE * grad, 
     const FPTYPE * env_deriv, 
@@ -134,7 +142,9 @@ void prod_force_grad_r_cpu(
 }
 
 template
-void prod_force_grad_r_cpu<double>(
+void 
+deepmd::
+prod_force_grad_r_cpu<double>(
     double * grad_net, 
     const double * grad, 
     const double * env_deriv, 
@@ -143,7 +153,9 @@ void prod_force_grad_r_cpu<double>(
     const int nnei) ;
 
 template
-void prod_force_grad_r_cpu<float>(
+void
+deepmd::
+prod_force_grad_r_cpu<float>(
     float * grad_net, 
     const float * grad, 
     const float * env_deriv, 
diff --git a/source/lib/src/prod_virial.cc b/source/lib/src/prod_virial.cc
index 89c8f46185..086bc94245 100644
--- a/source/lib/src/prod_virial.cc
+++ b/source/lib/src/prod_virial.cc
@@ -20,7 +20,9 @@ make_index_range (
 }
 
 template<typename FPTYPE>
-void prod_virial_a_cpu(
+void 
+deepmd::
+prod_virial_a_cpu(
     FPTYPE * virial, 
     FPTYPE * atom_virial, 
     const FPTYPE * net_deriv, 
@@ -65,7 +67,9 @@ void prod_virial_a_cpu(
 }
 
 template
-void prod_virial_a_cpu<double>(
+void 
+deepmd::
+prod_virial_a_cpu<double>(
     double * virial, 
     double * atom_virial, 
     const double * net_deriv, 
@@ -77,7 +81,9 @@ void prod_virial_a_cpu<double>(
     const int nnei) ;
 
 template
-void prod_virial_a_cpu<float>(
+void 
+deepmd::
+prod_virial_a_cpu<float>(
     float * virial, 
     float * atom_virial, 
     const float * net_deriv, 
@@ -90,7 +96,9 @@ void prod_virial_a_cpu<float>(
 
 
 template<typename FPTYPE>
-void prod_virial_r_cpu(
+void 
+deepmd::
+prod_virial_r_cpu(
     FPTYPE * virial, 
     FPTYPE * atom_virial, 
     const FPTYPE * net_deriv, 
@@ -131,7 +139,9 @@ void prod_virial_r_cpu(
 }
 
 template
-void prod_virial_r_cpu<double>(
+void 
+deepmd::
+prod_virial_r_cpu<double>(
     double * virial, 
     double * atom_virial, 
     const double * net_deriv, 
@@ -143,7 +153,9 @@ void prod_virial_r_cpu<double>(
     const int nnei);
 
 template
-void prod_virial_r_cpu<float>(
+void 
+deepmd::
+prod_virial_r_cpu<float>(
     float * virial, 
     float * atom_virial, 
     const float * net_deriv, 
diff --git a/source/lib/src/prod_virial_grad.cc b/source/lib/src/prod_virial_grad.cc
index 3a53692417..59c3192fc0 100644
--- a/source/lib/src/prod_virial_grad.cc
+++ b/source/lib/src/prod_virial_grad.cc
@@ -19,7 +19,9 @@ make_index_range (
 }
 
 template<typename FPTYPE>
-void prod_virial_grad_a_cpu(
+void 
+deepmd::
+prod_virial_grad_a_cpu(
     FPTYPE * grad_net,
     const FPTYPE * grad,
     const FPTYPE * env_deriv,
@@ -61,7 +63,9 @@ void prod_virial_grad_a_cpu(
 
 
 template
-void prod_virial_grad_a_cpu<double>(
+void 
+deepmd::
+prod_virial_grad_a_cpu<double>(
     double * grad_net,
     const double * grad,
     const double * env_deriv,
@@ -71,7 +75,9 @@ void prod_virial_grad_a_cpu<double>(
     const int nnei);
 
 template
-void prod_virial_grad_a_cpu<float>(
+void 
+deepmd::
+prod_virial_grad_a_cpu<float>(
     float * grad_net,
     const float * grad,
     const float * env_deriv,
@@ -82,7 +88,9 @@ void prod_virial_grad_a_cpu<float>(
 
 
 template<typename FPTYPE>
-void prod_virial_grad_r_cpu(
+void 
+deepmd::
+prod_virial_grad_r_cpu(
     FPTYPE * grad_net,
     const FPTYPE * grad,
     const FPTYPE * env_deriv,
@@ -127,7 +135,9 @@ void prod_virial_grad_r_cpu(
 
 
 template
-void prod_virial_grad_r_cpu<double>(
+void 
+deepmd::
+prod_virial_grad_r_cpu<double>(
     double * grad_net,
     const double * grad,
     const double * env_deriv,
@@ -137,7 +147,9 @@ void prod_virial_grad_r_cpu<double>(
     const int nnei);
 
 template
-void prod_virial_grad_r_cpu<float>(
+void 
+deepmd::
+prod_virial_grad_r_cpu<float>(
     float * grad_net,
     const float * grad,
     const float * env_deriv,
diff --git a/source/lib/src/region.cc b/source/lib/src/region.cc
index b8ab55d1a8..62dcdb9b68 100644
--- a/source/lib/src/region.cc
+++ b/source/lib/src/region.cc
@@ -3,6 +3,8 @@
 #include "region.h"
 #define BOXT_DIM 9
 
+using namespace deepmd;
+
 template<typename FPTYPE>
 Region<FPTYPE>::
 Region()
@@ -80,6 +82,7 @@ tensor_t_dot_vec (
 
 template<typename FPTYPE>
 void
+deepmd::
 init_region_cpu(
     Region<FPTYPE> & region,
     const FPTYPE * boxt)
@@ -90,6 +93,7 @@ init_region_cpu(
 
 template<typename FPTYPE>
 void
+deepmd::
 convert_to_inter_cpu(
     FPTYPE * ri, 
     const Region<FPTYPE> & region,
@@ -100,6 +104,7 @@ convert_to_inter_cpu(
 
 template<typename FPTYPE>
 void
+deepmd::
 convert_to_phys_cpu(
     FPTYPE * rp, 
     const Region<FPTYPE> & region,
@@ -110,6 +115,7 @@ convert_to_phys_cpu(
 
 template<typename FPTYPE>
 FPTYPE
+deepmd::
 volume_cpu(
     const Region<FPTYPE> & region)
 {
@@ -117,49 +123,59 @@ volume_cpu(
 }
 
 template
-void init_region_cpu<double>(
-    Region<double> & region,
+void 
+deepmd::
+init_region_cpu<double>(
+    deepmd::Region<double> & region,
     const double * boxt);
 
 template
-void init_region_cpu<float>(
-    Region<float> & region,
+void 
+deepmd::
+init_region_cpu<float>(
+    deepmd::Region<float> & region,
     const float * boxt);
 
 template
 void
+deepmd::
 convert_to_inter_cpu<double>(
     double * ri, 
-    const Region<double> & region,
+    const deepmd::Region<double> & region,
     const double * rp);
 
 template
 void
+deepmd::
 convert_to_inter_cpu<float>(
     float * ri, 
-    const Region<float> & region,
+    const deepmd::Region<float> & region,
     const float * rp);
 
 template
 void
+deepmd::
 convert_to_phys_cpu<double>(
     double * ri, 
-    const Region<double> & region,
+    const deepmd::Region<double> & region,
     const double * rp);
 
 template
 void
+deepmd::
 convert_to_phys_cpu<float>(
     float * ri, 
-    const Region<float> & region,
+    const deepmd::Region<float> & region,
     const float * rp);
 
 template
 double
+deepmd::
 volume_cpu<double>(
-    const Region<double> & region);
+    const deepmd::Region<double> & region);
 
 template
 float
+deepmd::
 volume_cpu<float>(
-    const Region<float> & region);
+    const deepmd::Region<float> & region);
diff --git a/source/lib/src/soft_min_switch.cc b/source/lib/src/soft_min_switch.cc
index fbbf8bbbdd..88471a3d4b 100644
--- a/source/lib/src/soft_min_switch.cc
+++ b/source/lib/src/soft_min_switch.cc
@@ -4,7 +4,7 @@
 #include "switcher.h"
 
 template <typename FPTYPE>
-void soft_min_switch_cpu(
+void deepmd::soft_min_switch_cpu(
     FPTYPE * sw_value,
     FPTYPE * sw_deriv,
     const FPTYPE * rij,
@@ -46,7 +46,7 @@ void soft_min_switch_cpu(
     }
     FPTYPE smin = bb / aa;
     FPTYPE vv, dd;
-    spline5_switch(vv, dd, smin, static_cast<FPTYPE>(rmin), static_cast<FPTYPE>(rmax));
+    spline5_switch(vv, dd, smin, rmin, rmax);
     // value of switch
     sw_value[i_idx] = vv;
     // deriv of switch distributed as force
@@ -80,7 +80,7 @@ void soft_min_switch_cpu(
 }
 
 template
-void soft_min_switch_cpu<double>(
+void deepmd::soft_min_switch_cpu<double>(
     double * sw_value,
     double * sw_deriv,
     const double * rij,
@@ -92,7 +92,7 @@ void soft_min_switch_cpu<double>(
     const double & rmax);
 
 template
-void soft_min_switch_cpu<float>(
+void deepmd::soft_min_switch_cpu<float>(
     float * sw_value,
     float * sw_deriv,
     const float * rij,
diff --git a/source/lib/src/soft_min_switch_force.cc b/source/lib/src/soft_min_switch_force.cc
index e189276c6e..724952493d 100644
--- a/source/lib/src/soft_min_switch_force.cc
+++ b/source/lib/src/soft_min_switch_force.cc
@@ -2,7 +2,7 @@
 #include <iostream>
 
 template<typename FPTYPE>
-void soft_min_switch_force_cpu(
+void deepmd::soft_min_switch_force_cpu(
     FPTYPE * force, 
     const FPTYPE * du, 
     const FPTYPE * sw_deriv, 
@@ -41,7 +41,7 @@ void soft_min_switch_force_cpu(
 }
 
 template
-void soft_min_switch_force_cpu<double>(
+void deepmd::soft_min_switch_force_cpu<double>(
     double * force, 
     const double * du, 
     const double * sw_deriv, 
@@ -51,7 +51,7 @@ void soft_min_switch_force_cpu<double>(
     const int nnei);
 
 template
-void soft_min_switch_force_cpu<float>(
+void deepmd::soft_min_switch_force_cpu<float>(
     float * force, 
     const float * du, 
     const float * sw_deriv, 
diff --git a/source/lib/src/soft_min_switch_force_grad.cc b/source/lib/src/soft_min_switch_force_grad.cc
index 63d388b174..31e46e9d6d 100644
--- a/source/lib/src/soft_min_switch_force_grad.cc
+++ b/source/lib/src/soft_min_switch_force_grad.cc
@@ -2,7 +2,7 @@
 #include <iostream>
 
 template<typename FPTYPE>
-void soft_min_switch_force_grad_cpu(
+void deepmd::soft_min_switch_force_grad_cpu(
     FPTYPE * grad_net, 
     const FPTYPE * grad,
     const FPTYPE * sw_deriv, 
@@ -41,7 +41,7 @@ void soft_min_switch_force_grad_cpu(
 }
 
 template
-void soft_min_switch_force_grad_cpu<double>(
+void deepmd::soft_min_switch_force_grad_cpu<double>(
     double * grad_net, 
     const double * grad,
     const double * sw_deriv, 
@@ -50,7 +50,7 @@ void soft_min_switch_force_grad_cpu<double>(
     const int nnei);
 
 template
-void soft_min_switch_force_grad_cpu<float>(
+void deepmd::soft_min_switch_force_grad_cpu<float>(
     float * grad_net, 
     const float * grad,
     const float * sw_deriv, 
diff --git a/source/lib/src/soft_min_switch_virial.cc b/source/lib/src/soft_min_switch_virial.cc
index 9ceacc5d72..a93ab3c1fb 100644
--- a/source/lib/src/soft_min_switch_virial.cc
+++ b/source/lib/src/soft_min_switch_virial.cc
@@ -2,7 +2,7 @@
 #include <iostream>
 
 template<typename FPTYPE>
-void soft_min_switch_virial_cpu(
+void deepmd::soft_min_switch_virial_cpu(
     FPTYPE * virial, 
     FPTYPE * atom_virial, 
     const FPTYPE * du, 
@@ -47,7 +47,7 @@ void soft_min_switch_virial_cpu(
 
 
 template
-void soft_min_switch_virial_cpu<double>(
+void deepmd::soft_min_switch_virial_cpu<double>(
     double * virial, 
     double * atom_virial, 
     const double * du, 
@@ -59,7 +59,7 @@ void soft_min_switch_virial_cpu<double>(
     const int nnei);
 
 template
-void soft_min_switch_virial_cpu<float>(
+void deepmd::soft_min_switch_virial_cpu<float>(
     float * virial, 
     float * atom_virial, 
     const float * du, 
diff --git a/source/lib/src/soft_min_switch_virial_grad.cc b/source/lib/src/soft_min_switch_virial_grad.cc
index 3718810dfd..1bb28a7c63 100644
--- a/source/lib/src/soft_min_switch_virial_grad.cc
+++ b/source/lib/src/soft_min_switch_virial_grad.cc
@@ -1,7 +1,7 @@
 #include "soft_min_switch_virial_grad.h"
 
 template<typename FPTYPE>
-void soft_min_switch_virial_grad_cpu(
+void deepmd::soft_min_switch_virial_grad_cpu(
     FPTYPE * grad_net, 
     const FPTYPE * grad,
     const FPTYPE * sw_deriv, 
@@ -41,7 +41,7 @@ void soft_min_switch_virial_grad_cpu(
 }
 
 template
-void soft_min_switch_virial_grad_cpu<double>(
+void deepmd::soft_min_switch_virial_grad_cpu<double>(
     double * grad_net, 
     const double * grad,
     const double * sw_deriv, 
@@ -51,7 +51,7 @@ void soft_min_switch_virial_grad_cpu<double>(
     const int nnei);
 
 template
-void soft_min_switch_virial_grad_cpu<float>(
+void deepmd::soft_min_switch_virial_grad_cpu<float>(
     float * grad_net, 
     const float * grad,
     const float * sw_deriv, 
diff --git a/source/lib/tests/test_coord.cc b/source/lib/tests/test_coord.cc
index 111b66bb06..3706874b55 100644
--- a/source/lib/tests/test_coord.cc
+++ b/source/lib/tests/test_coord.cc
@@ -37,7 +37,7 @@ class TestNormCoord : public ::testing::Test
 
 TEST_F(TestNormCoord, cpu_case0)
 {
-  Region<double> region;
+  deepmd::Region<double> region;
   init_region_cpu(region, &boxt[0]);
   std::vector<double > out_c(r0);
   normalize_coord_cpu(&out_c[0], natoms, region);
@@ -48,7 +48,7 @@ TEST_F(TestNormCoord, cpu_case0)
 
 TEST_F(TestNormCoord, cpu_case1)
 {
-  Region<double> region;
+  deepmd::Region<double> region;
   init_region_cpu(region, &boxt[0]);
   std::vector<double > out_c(r1);
   normalize_coord_cpu(&out_c[0], natoms, region);
@@ -59,7 +59,7 @@ TEST_F(TestNormCoord, cpu_case1)
 
 TEST_F(TestNormCoord, cpu_case2)
 {
-  Region<double> region;
+  deepmd::Region<double> region;
   init_region_cpu(region, &boxt[0]);
   std::vector<double > out_c(r2);
   normalize_coord_cpu(&out_c[0], natoms, region);
@@ -167,7 +167,7 @@ TEST_F(TestCopyCoord, cpu)
   std::vector<int > out_t(mem_size);
   std::vector<int > mapping(mem_size);
   int nall;
-  Region<double> region;
+  deepmd::Region<double> region;
   init_region_cpu(region, &boxt[0]);
   
   int ret = copy_coord_cpu(
@@ -211,7 +211,7 @@ TEST_F(TestCopyCoord, cpu_lessmem)
   std::vector<int > out_t(mem_size);
   std::vector<int > mapping(mem_size);
   int nall;
-  Region<double> region;
+  deepmd::Region<double> region;
   init_region_cpu(region, &boxt[0]);
   
   int ret = copy_coord_cpu(
diff --git a/source/lib/tests/test_env_mat_a.cc b/source/lib/tests/test_env_mat_a.cc
index 25fab3e59d..c08a4f4705 100644
--- a/source/lib/tests/test_env_mat_a.cc
+++ b/source/lib/tests/test_env_mat_a.cc
@@ -239,7 +239,7 @@ TEST_F(TestEnvMatA, cpu)
   for(int ii = 0; ii < nloc; ++ii){
     int ret = format_nlist_i_cpu<double>(fmt_nlist_a, posi_cpy, atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a);    
     EXPECT_EQ(ret, -1);
-    env_mat_a_cpu<double>(env, env_deriv, rij_a, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
+    deepmd::env_mat_a_cpu<double>(env, env_deriv, rij_a, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
     EXPECT_EQ(env.size(), sec_a[2]*4);
     EXPECT_EQ(env.size(), env_deriv.size()/3);
     EXPECT_EQ(rij_a.size(), sec_a[2]*3);
@@ -265,7 +265,7 @@ TEST_F(TestEnvMatA, cpu_equal_orig_cpy)
     int ret_1 = format_nlist_i_cpu<double>(fmt_nlist_a_1, posi_cpy, atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a);  
   
     EXPECT_EQ(ret_1, -1);
-    env_mat_a_cpu<double>(env_1, env_deriv_1, rij_a_1, posi_cpy, atype_cpy, ii, fmt_nlist_a_1, sec_a, rc_smth, rc);
+    deepmd::env_mat_a_cpu<double>(env_1, env_deriv_1, rij_a_1, posi_cpy, atype_cpy, ii, fmt_nlist_a_1, sec_a, rc_smth, rc);
 
     EXPECT_EQ(env_0.size(), env_1.size());
     EXPECT_EQ(env_deriv_0.size(), env_deriv_1.size());
@@ -291,7 +291,7 @@ TEST_F(TestEnvMatA, cpu_num_deriv)
   for(int ii = 0; ii < nloc; ++ii){
     int ret = format_nlist_i_cpu<double>(fmt_nlist_a, posi_cpy, atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a);    
     EXPECT_EQ(ret, -1);
-    env_mat_a_cpu<double>(env, env_deriv, rij_a, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
+    deepmd::env_mat_a_cpu<double>(env, env_deriv, rij_a, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
     EXPECT_EQ(env.size(), sec_a[2]*4);
     EXPECT_EQ(env.size(), env_deriv.size()/3);
     EXPECT_EQ(rij_a.size(), sec_a[2]*3);
@@ -373,7 +373,7 @@ TEST_F(TestEnvMatAShortSel, cpu)
   for(int ii = 0; ii < nloc; ++ii){
     int ret = format_nlist_i_cpu<double>(fmt_nlist_a, posi_cpy, atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a);    
     EXPECT_EQ(ret, 1);
-    env_mat_a_cpu<double>(env, env_deriv, rij_a, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
+    deepmd::env_mat_a_cpu<double>(env, env_deriv, rij_a, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
     EXPECT_EQ(env.size(), sec_a[2]*4);
     EXPECT_EQ(env.size(), env_deriv.size()/3);
     EXPECT_EQ(rij_a.size(), sec_a[2]*3);
@@ -399,14 +399,14 @@ TEST_F(TestEnvMatA, prod_cpu)
   }
   std::vector<int> ilist(nloc), numneigh(nloc);
   std::vector<int*> firstneigh(nloc);
-  InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
-  convert_nlist(inlist, nlist_a_cpy);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  deepmd::convert_nlist(inlist, nlist_a_cpy);
   
   std::vector<double > em(nloc * ndescrpt), em_deriv(nloc * ndescrpt * 3), rij(nloc * nnei * 3);
   std::vector<int> nlist(nloc * nnei);
   std::vector<double > avg(ntypes * ndescrpt, 0);
   std::vector<double > std(ntypes * ndescrpt, 1);
-  prod_env_mat_a_cpu(
+  deepmd::prod_env_mat_a_cpu(
       &em[0],
       &em_deriv[0],
       &rij[0],
@@ -448,13 +448,13 @@ TEST_F(TestEnvMatA, prod_cpu_equal_cpu)
   }
   std::vector<int> ilist(nloc), numneigh(nloc);
   std::vector<int*> firstneigh(nloc);
-  InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
   convert_nlist(inlist, nlist_a_cpy);
   std::vector<double > em(nloc * ndescrpt), em_deriv(nloc * ndescrpt * 3), rij(nloc * nnei * 3);
   std::vector<int> nlist(nloc * nnei);
   std::vector<double > avg(ntypes * ndescrpt, 0);
   std::vector<double > std(ntypes * ndescrpt, 1);
-  prod_env_mat_a_cpu(
+  deepmd::prod_env_mat_a_cpu(
       &em[0],
       &em_deriv[0],
       &rij[0],
@@ -476,7 +476,7 @@ TEST_F(TestEnvMatA, prod_cpu_equal_cpu)
   for(int ii = 0; ii < nloc; ++ii){
     int ret_1 = format_nlist_i_cpu<double>(fmt_nlist_a_1, posi_cpy, atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a);  
     EXPECT_EQ(ret_1, -1);
-    env_mat_a_cpu<double>(env_1, env_deriv_1, rij_a_1, posi_cpy, atype_cpy, ii, fmt_nlist_a_1, sec_a, rc_smth, rc);
+    deepmd::env_mat_a_cpu<double>(env_1, env_deriv_1, rij_a_1, posi_cpy, atype_cpy, ii, fmt_nlist_a_1, sec_a, rc_smth, rc);
     EXPECT_EQ(env_1.size(), nnei * 4);
     EXPECT_EQ(env_deriv_1.size(), nnei * 4 * 3);
     EXPECT_EQ(rij_a_1.size(), nnei * 3);
@@ -535,7 +535,7 @@ TEST_F(TestEnvMatA, prod_gpu_cuda)
   }
   std::vector<int> ilist(nloc), numneigh(nloc);
   std::vector<int*> firstneigh(nloc);
-  InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist;
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist;
   convert_nlist(inlist, nlist_a_cpy);
   std::vector<double > em(nloc * ndescrpt, 0.0), em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * nnei * 3, 0.0);
   std::vector<int> nlist(nloc * nnei, 0);
@@ -625,7 +625,7 @@ TEST_F(TestEnvMatA, prod_gpu_cuda_equal_cpu)
   }
   std::vector<int> ilist(nloc), numneigh(nloc);
   std::vector<int*> firstneigh(nloc);
-  InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist;
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist;
   convert_nlist(inlist, nlist_a_cpy);
   std::vector<double > em(nloc * ndescrpt, 0.0), em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * nnei * 3, 0.0);
   std::vector<int> nlist(nloc * nnei, 0);
@@ -689,7 +689,7 @@ TEST_F(TestEnvMatA, prod_gpu_cuda_equal_cpu)
   for(int ii = 0; ii < nloc; ++ii){
     int ret_1 = format_nlist_i_cpu<double>(fmt_nlist_a_1, posi_cpy, atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a);  
     EXPECT_EQ(ret_1, -1);
-    env_mat_a_cpu<double>(env_1, env_deriv_1, rij_a_1, posi_cpy, atype_cpy, ii, fmt_nlist_a_1, sec_a, rc_smth, rc);
+    deepmd::env_mat_a_cpu<double>(env_1, env_deriv_1, rij_a_1, posi_cpy, atype_cpy, ii, fmt_nlist_a_1, sec_a, rc_smth, rc);
     EXPECT_EQ(env_1.size(), nnei * 4);
     EXPECT_EQ(env_deriv_1.size(), nnei * 4 * 3);
     EXPECT_EQ(rij_a_1.size(), nnei * 3);
diff --git a/source/lib/tests/test_env_mat_r.cc b/source/lib/tests/test_env_mat_r.cc
index 4dfd369c1a..f571dbdaf1 100644
--- a/source/lib/tests/test_env_mat_r.cc
+++ b/source/lib/tests/test_env_mat_r.cc
@@ -168,7 +168,7 @@ TEST_F(TestEnvMatR, cpu)
   for(int ii = 0; ii < nloc; ++ii){
     int ret = format_nlist_i_cpu<double>(fmt_nlist_a, posi_cpy, atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a);    
     EXPECT_EQ(ret, -1);
-    env_mat_r_cpu<double>(env, env_deriv, rij_a, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
+    deepmd::env_mat_r_cpu<double>(env, env_deriv, rij_a, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
     for (int jj = 0; jj < sec_a[2]; ++jj){
       EXPECT_LT(fabs(env[jj] - expected_env[ii*sec_a[2] + jj]) , 1e-5);
     }    
@@ -188,7 +188,7 @@ TEST_F(TestEnvMatR, cpu_equal_orig_cpy)
 
     int ret_1 = format_nlist_i_cpu<double>(fmt_nlist_a_1, posi_cpy, atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a);  
     EXPECT_EQ(ret_1, -1);
-    env_mat_r_cpu<double>(env_1, env_deriv_1, rij_a_1, posi_cpy, atype_cpy, ii, fmt_nlist_a_1, sec_a, rc_smth, rc);
+    deepmd::env_mat_r_cpu<double>(env_1, env_deriv_1, rij_a_1, posi_cpy, atype_cpy, ii, fmt_nlist_a_1, sec_a, rc_smth, rc);
 
     EXPECT_EQ(env_0.size(), env_1.size());
     EXPECT_EQ(env_deriv_0.size(), env_deriv_1.size());
@@ -214,7 +214,7 @@ TEST_F(TestEnvMatR, cpu_num_deriv)
   for(int ii = 0; ii < nloc; ++ii){
     int ret = format_nlist_i_cpu<double>(fmt_nlist_a, posi_cpy, atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a);    
     EXPECT_EQ(ret, -1);
-    env_mat_r_cpu<double>(env, env_deriv, rij_a, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
+    deepmd::env_mat_r_cpu<double>(env, env_deriv, rij_a, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
 
     for (int jj = 0; jj < sec_a[2]; ++jj){
       int j_idx = fmt_nlist_a[jj];
@@ -251,14 +251,14 @@ TEST_F(TestEnvMatR, prod_cpu)
   }
   std::vector<int> ilist(nloc), numneigh(nloc);
   std::vector<int*> firstneigh(nloc);
-  InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
   convert_nlist(inlist, nlist_a_cpy);
   
   std::vector<double > em(nloc * ndescrpt), em_deriv(nloc * ndescrpt * 3), rij(nloc * nnei * 3);
   std::vector<int> nlist(nloc * nnei);
   std::vector<double > avg(ntypes * ndescrpt, 0);
   std::vector<double > std(ntypes * ndescrpt, 1);
-  prod_env_mat_r_cpu(
+  deepmd::prod_env_mat_r_cpu(
       &em[0],
       &em_deriv[0],
       &rij[0],
@@ -300,13 +300,13 @@ TEST_F(TestEnvMatR, prod_cpu_equal_cpu)
   }
   std::vector<int> ilist(nloc), numneigh(nloc);
   std::vector<int*> firstneigh(nloc);
-  InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
   convert_nlist(inlist, nlist_a_cpy);
   std::vector<double > em(nloc * ndescrpt), em_deriv(nloc * ndescrpt * 3), rij(nloc * nnei * 3);
   std::vector<int> nlist(nloc * nnei);
   std::vector<double > avg(ntypes * ndescrpt, 0);
   std::vector<double > std(ntypes * ndescrpt, 1);
-  prod_env_mat_r_cpu(
+  deepmd::prod_env_mat_r_cpu(
       &em[0],
       &em_deriv[0],
       &rij[0],
@@ -328,7 +328,7 @@ TEST_F(TestEnvMatR, prod_cpu_equal_cpu)
   for(int ii = 0; ii < nloc; ++ii){
     int ret_1 = format_nlist_i_cpu<double>(fmt_nlist_a_1, posi_cpy, atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a);
     EXPECT_EQ(ret_1, -1);
-    env_mat_r_cpu<double>(env_1, env_deriv_1, rij_a_1, posi_cpy, atype_cpy, ii, fmt_nlist_a_1, sec_a, rc_smth, rc);
+    deepmd::env_mat_r_cpu<double>(env_1, env_deriv_1, rij_a_1, posi_cpy, atype_cpy, ii, fmt_nlist_a_1, sec_a, rc_smth, rc);
     EXPECT_EQ(env_1.size(), nnei * 1);
     EXPECT_EQ(env_deriv_1.size(), nnei * 1 * 3);
     EXPECT_EQ(rij_a_1.size(), nnei * 3);
@@ -531,7 +531,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda_equal_cpu)
   for(int ii = 0; ii < nloc; ++ii){
     int ret_1 = format_nlist_i_cpu<double>(fmt_nlist_a_1, posi_cpy, atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a);  
     EXPECT_EQ(ret_1, -1);
-    env_mat_r_cpu<double>(env_1, env_deriv_1, rij_a_1, posi_cpy, atype_cpy, ii, fmt_nlist_a_1, sec_a, rc_smth, rc);
+    deepmd::env_mat_r_cpu<double>(env_1, env_deriv_1, rij_a_1, posi_cpy, atype_cpy, ii, fmt_nlist_a_1, sec_a, rc_smth, rc);
     EXPECT_EQ(env_1.size(), nnei * 1);
     EXPECT_EQ(env_deriv_1.size(), nnei * 1 * 3);
     EXPECT_EQ(rij_a_1.size(), nnei * 3);
diff --git a/source/lib/tests/test_ewald.cc b/source/lib/tests/test_ewald.cc
index 44cb758273..cb8e59e1d9 100644
--- a/source/lib/tests/test_ewald.cc
+++ b/source/lib/tests/test_ewald.cc
@@ -20,7 +20,7 @@ class TestEwald : public ::testing::Test
   std::vector<double > boxt = {
     13., 0., 0., 0., 13., 0., 0., 0., 13.
   };
-  EwaldParameters<double> eparam;
+  deepmd::EwaldParameters<double> eparam;
   double expected_e = 4.7215808340392229e+00;
   std::vector<double> expected_f = {
     -5.4937025715874448e+00,5.6659817006308417e+00,3.8059426028301313e-01,2.5210962791915938e+00,-2.6383552457553545e+00,-4.8998411247787405e-01,2.7390037416771147e+00,-3.2890571945143514e+00,3.8057620258450320e-01,6.7561832843578351e+00,-1.3707287681111919e+00,2.7733203842981604e+00,-3.3297964389679557e+00,1.0404967238120841e+00,-1.8035649784287722e+00,-3.1927842946711418e+00,5.9166278393797123e-01,-1.2409417562590299e+00,
@@ -38,7 +38,7 @@ TEST_F(TestEwald, cpu)
 {
   double ener;
   std::vector<double > force, virial;
-  Region<double> region;
+  deepmd::Region<double> region;
   init_region_cpu(region, &boxt[0]);
   ewald_recp(ener, force, virial, coord, charge, region, eparam);
   EXPECT_LT(fabs(ener - expected_e), 1e-10);
diff --git a/source/lib/tests/test_fmt_nlist.cc b/source/lib/tests/test_fmt_nlist.cc
index a37562a230..6dc539b7d6 100644
--- a/source/lib/tests/test_fmt_nlist.cc
+++ b/source/lib/tests/test_fmt_nlist.cc
@@ -212,7 +212,7 @@ TEST_F(TestFormatNlist, cpu)
   std::vector<int > ilist(inum);
   std::vector<int > numneigh(inum);
   std::vector<int* > firstneigh(inum);
-  InputNlist in_nlist(inum, &ilist[0], &numneigh[0], &firstneigh[0]);
+  deepmd::InputNlist in_nlist(inum, &ilist[0], &numneigh[0], &firstneigh[0]);
   convert_nlist(in_nlist, nlist_a_0);
   // allocate the mem for the result
   std::vector<int> nlist(inum * sec_a.back());
@@ -280,7 +280,7 @@ TEST_F(TestFormatNlistShortSel, cpu)
   std::vector<int > ilist(inum);
   std::vector<int > numneigh(inum);
   std::vector<int* > firstneigh(inum);
-  InputNlist in_nlist(inum, &ilist[0], &numneigh[0], &firstneigh[0]);
+  deepmd::InputNlist in_nlist(inum, &ilist[0], &numneigh[0], &firstneigh[0]);
   convert_nlist(in_nlist, nlist_a_0);  
   // mem
   std::vector<int> nlist(inum * sec_a.back());
diff --git a/source/lib/tests/test_gelu.cc b/source/lib/tests/test_gelu.cc
index f8a65ae308..0b05dd71d5 100644
--- a/source/lib/tests/test_gelu.cc
+++ b/source/lib/tests/test_gelu.cc
@@ -112,7 +112,7 @@ class TestGelu : public ::testing::Test
 TEST_F(TestGelu, gelu_cpu)
 {
   std::vector<double> gelu(nloc);
-  gelu_cpu<double> (&gelu[0], &xx[0], nloc);
+  deepmd::gelu_cpu<double> (&gelu[0], &xx[0], nloc);
   EXPECT_EQ(gelu.size(), nloc);
   EXPECT_EQ(gelu.size(), expected_gelu.size());
   for (int jj = 0; jj < gelu.size(); ++jj){
@@ -124,7 +124,7 @@ TEST_F(TestGelu, gelu_grad_cpu)
 {
   std::vector<double> dy(100, 1.0);
   std::vector<double> gelu_grad(nloc);
-  gelu_grad_cpu<double> (&gelu_grad[0], &xx[0], &dy[0], nloc);
+  deepmd::gelu_grad_cpu<double> (&gelu_grad[0], &xx[0], &dy[0], nloc);
   EXPECT_EQ(gelu_grad.size(), nloc);
   EXPECT_EQ(gelu_grad.size(), expected_gelu_grad.size());
   for (int jj = 0; jj < gelu_grad.size(); ++jj){
@@ -137,7 +137,7 @@ TEST_F(TestGelu, gelu_grad_grad_cpu)
   std::vector<double> dy(100, 1.0);
   std::vector<double> dy_2(100, 1.0);
   std::vector<double> gelu_grad_grad(nloc);
-  gelu_grad_grad_cpu<double> (&gelu_grad_grad[0], &xx[0], &dy[0], &dy_2[0], nloc);
+  deepmd::gelu_grad_grad_cpu<double> (&gelu_grad_grad[0], &xx[0], &dy[0], &dy_2[0], nloc);
   EXPECT_EQ(gelu_grad_grad.size(), nloc);
   EXPECT_EQ(gelu_grad_grad.size(), expected_gelu_grad_grad.size());
   for (int jj = 0; jj < gelu_grad_grad.size(); ++jj){
@@ -211,4 +211,4 @@ TEST_F(TestGelu, gelu_grad_grad_gpu_cuda)
     EXPECT_LT(fabs(gelu_grad_grad[jj] - expected_gelu_grad_grad[jj]) , 1e-5);
   }  
 }
-#endif // GOOGLE_CUDA
\ No newline at end of file
+#endif // GOOGLE_CUDA
diff --git a/source/lib/tests/test_map_aparam.cc b/source/lib/tests/test_map_aparam.cc
index 286f878ffe..a393345eb0 100644
--- a/source/lib/tests/test_map_aparam.cc
+++ b/source/lib/tests/test_map_aparam.cc
@@ -71,7 +71,7 @@ class TestMapAparam : public ::testing::Test
 TEST_F(TestMapAparam, cpu)
 {
   std::vector<double> output(nloc * nnei * numb_aparam);
-  map_aparam_cpu(
+  deepmd::map_aparam_cpu(
       &output[0],
       &aparam[0],
       &nlist[0],
diff --git a/source/lib/tests/test_neighbor_list.cc b/source/lib/tests/test_neighbor_list.cc
index a0962cecbf..cd8211fb09 100644
--- a/source/lib/tests/test_neighbor_list.cc
+++ b/source/lib/tests/test_neighbor_list.cc
@@ -54,7 +54,7 @@ TEST_F(TestNeighborList, cpu)
     firstneigh[ii] = new int[mem_size];
   }
 
-  InputNlist nlist(nloc, ilist, numneigh, firstneigh);
+  deepmd::InputNlist nlist(nloc, ilist, numneigh, firstneigh);
   int max_list_size;
   int ret = build_nlist_cpu(
       nlist,
@@ -94,7 +94,7 @@ TEST_F(TestNeighborList, cpu_lessmem)
     firstneigh[ii] = new int[mem_size];
   }
 
-  InputNlist nlist(nloc, ilist, numneigh, firstneigh);
+  deepmd::InputNlist nlist(nloc, ilist, numneigh, firstneigh);
   int max_list_size;
   int ret = build_nlist_cpu(
       nlist,
diff --git a/source/lib/tests/test_pair_tab.cc b/source/lib/tests/test_pair_tab.cc
index 0c7cf5ea6f..c55e96369f 100644
--- a/source/lib/tests/test_pair_tab.cc
+++ b/source/lib/tests/test_pair_tab.cc
@@ -98,7 +98,7 @@ class TestPairTab : public ::testing::Test
       }
       std::vector<double > t_env, t_env_deriv, t_rij;
       // compute env_mat and its deriv, record
-      env_mat_a_cpu<double>(t_env, t_env_deriv, t_rij, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
+      deepmd::env_mat_a_cpu<double>(t_env, t_env_deriv, t_rij, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
       for (int jj = 0; jj < ndescrpt; ++jj){
 	env[ii*ndescrpt+jj] = t_env[jj];
 	for (int dd = 0; dd < 3; ++dd){
@@ -134,7 +134,7 @@ TEST_F(TestPairTab, cpu)
   std::vector<double > virial(nall * 9);
   std::vector<double > scale(nloc, 1.0);
 
-  pair_tab_cpu(
+  deepmd::pair_tab_cpu(
       &energy[0],
       &force[0],
       &virial[0],
@@ -197,7 +197,7 @@ TEST_F(TestPairTab, cpu_f_num_deriv)
   std::vector<double > virial(9, 0.);
   std::vector<double > atom_virial(nall * 9);
   std::vector<double > scale(nloc, 1.0);
-  pair_tab_cpu(
+  deepmd::pair_tab_cpu(
       &energy[0],
       &force[0],
       &atom_virial[0],
@@ -242,8 +242,8 @@ TEST_F(TestPairTab, cpu_f_num_deriv)
       build_nlist(nlist_cpy_1, t_nlist, posi_cpy_1, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region, ncell);
       std::vector<int> ilist_0(nloc), numneigh_0(nloc), ilist_1(nloc), numneigh_1(nloc);;
       std::vector<int*> firstneigh_0(nloc), firstneigh_1(nloc);
-      InputNlist inlist_0(nloc, &ilist_0[0], &numneigh_0[0], &firstneigh_0[0]);
-      InputNlist inlist_1(nloc, &ilist_1[0], &numneigh_1[0], &firstneigh_1[0]);
+      deepmd::InputNlist inlist_0(nloc, &ilist_0[0], &numneigh_0[0], &firstneigh_0[0]);
+      deepmd::InputNlist inlist_1(nloc, &ilist_1[0], &numneigh_1[0], &firstneigh_1[0]);
       convert_nlist(inlist_0, nlist_cpy_0);
       convert_nlist(inlist_1, nlist_cpy_1);
       int max_nnei_0 = max_numneigh(inlist_0);
@@ -254,11 +254,11 @@ TEST_F(TestPairTab, cpu_f_num_deriv)
       std::vector<int> nlist_0(nloc * nnei), nlist_1(nloc * nnei);
       std::vector<double > avg(ntypes * ndescrpt, 0);
       std::vector<double > std(ntypes * ndescrpt, 1);
-      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_0[0], &nlist_0[0], &posi_cpy_0[0], &atype_cpy_0[0], inlist_0, max_nnei_0, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
-      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_1[0], &nlist_1[0], &posi_cpy_1[0], &atype_cpy_1[0], inlist_1, max_nnei_1, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
+      deepmd::prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_0[0], &nlist_0[0], &posi_cpy_0[0], &atype_cpy_0[0], inlist_0, max_nnei_0, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
+      deepmd::prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_1[0], &nlist_1[0], &posi_cpy_1[0], &atype_cpy_1[0], inlist_1, max_nnei_1, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
       std::vector<double> energy_0(nloc), energy_1(nloc);
       std::vector<double> t_force(nall * 3), t_virial(nall * 9);
-      pair_tab_cpu(
+      deepmd::pair_tab_cpu(
 	  &energy_0[0],
 	  &t_force[0],
 	  &t_virial[0],
@@ -271,7 +271,7 @@ TEST_F(TestPairTab, cpu_f_num_deriv)
 	  &natoms[0],
 	  sel_a,
 	  sel_r);  
-      pair_tab_cpu(
+      deepmd::pair_tab_cpu(
 	  &energy_1[0],
 	  &t_force[0],
 	  &t_virial[0],
@@ -305,7 +305,7 @@ TEST_F(TestPairTab, cpu_f_num_deriv_scale)
   std::vector<double > virial(9, 0.);
   std::vector<double > atom_virial(nall * 9);
   std::vector<double > scale(nloc, new_scale);
-  pair_tab_cpu(
+  deepmd::pair_tab_cpu(
       &energy[0],
       &force[0],
       &atom_virial[0],
@@ -350,8 +350,8 @@ TEST_F(TestPairTab, cpu_f_num_deriv_scale)
       build_nlist(nlist_cpy_1, t_nlist, posi_cpy_1, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region, ncell);
       std::vector<int> ilist_0(nloc), numneigh_0(nloc), ilist_1(nloc), numneigh_1(nloc);;
       std::vector<int*> firstneigh_0(nloc), firstneigh_1(nloc);
-      InputNlist inlist_0(nloc, &ilist_0[0], &numneigh_0[0], &firstneigh_0[0]);
-      InputNlist inlist_1(nloc, &ilist_1[0], &numneigh_1[0], &firstneigh_1[0]);
+      deepmd::InputNlist inlist_0(nloc, &ilist_0[0], &numneigh_0[0], &firstneigh_0[0]);
+      deepmd::InputNlist inlist_1(nloc, &ilist_1[0], &numneigh_1[0], &firstneigh_1[0]);
       convert_nlist(inlist_0, nlist_cpy_0);
       convert_nlist(inlist_1, nlist_cpy_1);
       int max_nnei_0 = max_numneigh(inlist_0);
@@ -362,11 +362,11 @@ TEST_F(TestPairTab, cpu_f_num_deriv_scale)
       std::vector<int> nlist_0(nloc * nnei), nlist_1(nloc * nnei);
       std::vector<double > avg(ntypes * ndescrpt, 0);
       std::vector<double > std(ntypes * ndescrpt, 1);
-      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_0[0], &nlist_0[0], &posi_cpy_0[0], &atype_cpy_0[0], inlist_0, max_nnei_0, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
-      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_1[0], &nlist_1[0], &posi_cpy_1[0], &atype_cpy_1[0], inlist_1, max_nnei_1, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
+      deepmd::prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_0[0], &nlist_0[0], &posi_cpy_0[0], &atype_cpy_0[0], inlist_0, max_nnei_0, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
+      deepmd::prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_1[0], &nlist_1[0], &posi_cpy_1[0], &atype_cpy_1[0], inlist_1, max_nnei_1, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
       std::vector<double> energy_0(nloc), energy_1(nloc);
       std::vector<double> t_force(nall * 3), t_virial(nall * 9);
-      pair_tab_cpu(
+      deepmd::pair_tab_cpu(
 	  &energy_0[0],
 	  &t_force[0],
 	  &t_virial[0],
@@ -379,7 +379,7 @@ TEST_F(TestPairTab, cpu_f_num_deriv_scale)
 	  &natoms[0],
 	  sel_a,
 	  sel_r);  
-      pair_tab_cpu(
+      deepmd::pair_tab_cpu(
 	  &energy_1[0],
 	  &t_force[0],
 	  &t_virial[0],
@@ -411,7 +411,7 @@ TEST_F(TestPairTab, cpu_v_num_deriv)
   std::vector<double > virial(9, 0.);
   std::vector<double > atom_virial(nall * 9);
   std::vector<double > scale(nloc, 1.0);
-  pair_tab_cpu(
+  deepmd::pair_tab_cpu(
       &energy[0],
       &force[0],
       &atom_virial[0],
@@ -471,8 +471,8 @@ TEST_F(TestPairTab, cpu_v_num_deriv)
       build_nlist(nlist_cpy_1, t_nlist, posi_cpy_1, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region_1, ncell);
       std::vector<int> ilist_0(nloc), numneigh_0(nloc), ilist_1(nloc), numneigh_1(nloc);;
       std::vector<int*> firstneigh_0(nloc), firstneigh_1(nloc);
-      InputNlist inlist_0(nloc, &ilist_0[0], &numneigh_0[0], &firstneigh_0[0]);
-      InputNlist inlist_1(nloc, &ilist_1[0], &numneigh_1[0], &firstneigh_1[0]);
+      deepmd::InputNlist inlist_0(nloc, &ilist_0[0], &numneigh_0[0], &firstneigh_0[0]);
+      deepmd::InputNlist inlist_1(nloc, &ilist_1[0], &numneigh_1[0], &firstneigh_1[0]);
       convert_nlist(inlist_0, nlist_cpy_0);
       convert_nlist(inlist_1, nlist_cpy_1);
       int max_nnei_0 = max_numneigh(inlist_0);
@@ -483,11 +483,11 @@ TEST_F(TestPairTab, cpu_v_num_deriv)
       std::vector<int> nlist_0(nloc * nnei), nlist_1(nloc * nnei);
       std::vector<double > avg(ntypes * ndescrpt, 0);
       std::vector<double > std(ntypes * ndescrpt, 1);
-      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_0[0], &nlist_0[0], &posi_cpy_0[0], &atype_cpy_0[0], inlist_0, max_nnei_0, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
-      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_1[0], &nlist_1[0], &posi_cpy_1[0], &atype_cpy_1[0], inlist_1, max_nnei_1, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
+      deepmd::prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_0[0], &nlist_0[0], &posi_cpy_0[0], &atype_cpy_0[0], inlist_0, max_nnei_0, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
+      deepmd::prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_1[0], &nlist_1[0], &posi_cpy_1[0], &atype_cpy_1[0], inlist_1, max_nnei_1, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
       std::vector<double> energy_0(nloc), energy_1(nloc);
       std::vector<double> t_force(nall * 3), t_virial(nall * 9);
-      pair_tab_cpu(
+      deepmd::pair_tab_cpu(
 	  &energy_0[0],
 	  &t_force[0],
 	  &t_virial[0],
@@ -500,7 +500,7 @@ TEST_F(TestPairTab, cpu_v_num_deriv)
 	  &natoms[0],
 	  sel_a,
 	  sel_r);  
-      pair_tab_cpu(
+      deepmd::pair_tab_cpu(
 	  &energy_1[0],
 	  &t_force[0],
 	  &t_virial[0],
@@ -543,7 +543,7 @@ TEST_F(TestPairTab, cpu_v_num_deriv_scale)
   std::vector<double > virial(9, 0.);
   std::vector<double > atom_virial(nall * 9);
   std::vector<double > scale(nloc, new_scale);
-  pair_tab_cpu(
+  deepmd::pair_tab_cpu(
       &energy[0],
       &force[0],
       &atom_virial[0],
@@ -603,8 +603,8 @@ TEST_F(TestPairTab, cpu_v_num_deriv_scale)
       build_nlist(nlist_cpy_1, t_nlist, posi_cpy_1, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region_1, ncell);
       std::vector<int> ilist_0(nloc), numneigh_0(nloc), ilist_1(nloc), numneigh_1(nloc);;
       std::vector<int*> firstneigh_0(nloc), firstneigh_1(nloc);
-      InputNlist inlist_0(nloc, &ilist_0[0], &numneigh_0[0], &firstneigh_0[0]);
-      InputNlist inlist_1(nloc, &ilist_1[0], &numneigh_1[0], &firstneigh_1[0]);
+      deepmd::InputNlist inlist_0(nloc, &ilist_0[0], &numneigh_0[0], &firstneigh_0[0]);
+      deepmd::InputNlist inlist_1(nloc, &ilist_1[0], &numneigh_1[0], &firstneigh_1[0]);
       convert_nlist(inlist_0, nlist_cpy_0);
       convert_nlist(inlist_1, nlist_cpy_1);
       int max_nnei_0 = max_numneigh(inlist_0);
@@ -615,11 +615,11 @@ TEST_F(TestPairTab, cpu_v_num_deriv_scale)
       std::vector<int> nlist_0(nloc * nnei), nlist_1(nloc * nnei);
       std::vector<double > avg(ntypes * ndescrpt, 0);
       std::vector<double > std(ntypes * ndescrpt, 1);
-      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_0[0], &nlist_0[0], &posi_cpy_0[0], &atype_cpy_0[0], inlist_0, max_nnei_0, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
-      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_1[0], &nlist_1[0], &posi_cpy_1[0], &atype_cpy_1[0], inlist_1, max_nnei_1, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
+      deepmd::prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_0[0], &nlist_0[0], &posi_cpy_0[0], &atype_cpy_0[0], inlist_0, max_nnei_0, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
+      deepmd::prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_1[0], &nlist_1[0], &posi_cpy_1[0], &atype_cpy_1[0], inlist_1, max_nnei_1, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
       std::vector<double> energy_0(nloc), energy_1(nloc);
       std::vector<double> t_force(nall * 3), t_virial(nall * 9);
-      pair_tab_cpu(
+      deepmd::pair_tab_cpu(
 	  &energy_0[0],
 	  &t_force[0],
 	  &t_virial[0],
@@ -632,7 +632,7 @@ TEST_F(TestPairTab, cpu_v_num_deriv_scale)
 	  &natoms[0],
 	  sel_a,
 	  sel_r);  
-      pair_tab_cpu(
+      deepmd::pair_tab_cpu(
 	  &energy_1[0],
 	  &t_force[0],
 	  &t_virial[0],
@@ -675,7 +675,7 @@ TEST_F(TestPairTabTriBox, cpu_v_num_deriv)
   std::vector<double > virial(9, 0.);
   std::vector<double > atom_virial(nall * 9);
   std::vector<double > scale(nloc, 1.0);
-  pair_tab_cpu(
+  deepmd::pair_tab_cpu(
       &energy[0],
       &force[0],
       &atom_virial[0],
@@ -735,8 +735,8 @@ TEST_F(TestPairTabTriBox, cpu_v_num_deriv)
       build_nlist(nlist_cpy_1, t_nlist, posi_cpy_1, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region_1, ncell);
       std::vector<int> ilist_0(nloc), numneigh_0(nloc), ilist_1(nloc), numneigh_1(nloc);;
       std::vector<int*> firstneigh_0(nloc), firstneigh_1(nloc);
-      InputNlist inlist_0(nloc, &ilist_0[0], &numneigh_0[0], &firstneigh_0[0]);
-      InputNlist inlist_1(nloc, &ilist_1[0], &numneigh_1[0], &firstneigh_1[0]);
+      deepmd::InputNlist inlist_0(nloc, &ilist_0[0], &numneigh_0[0], &firstneigh_0[0]);
+      deepmd::InputNlist inlist_1(nloc, &ilist_1[0], &numneigh_1[0], &firstneigh_1[0]);
       convert_nlist(inlist_0, nlist_cpy_0);
       convert_nlist(inlist_1, nlist_cpy_1);
       int max_nnei_0 = max_numneigh(inlist_0);
@@ -747,11 +747,11 @@ TEST_F(TestPairTabTriBox, cpu_v_num_deriv)
       std::vector<int> nlist_0(nloc * nnei), nlist_1(nloc * nnei);
       std::vector<double > avg(ntypes * ndescrpt, 0);
       std::vector<double > std(ntypes * ndescrpt, 1);
-      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_0[0], &nlist_0[0], &posi_cpy_0[0], &atype_cpy_0[0], inlist_0, max_nnei_0, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
-      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_1[0], &nlist_1[0], &posi_cpy_1[0], &atype_cpy_1[0], inlist_1, max_nnei_1, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
+      deepmd::prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_0[0], &nlist_0[0], &posi_cpy_0[0], &atype_cpy_0[0], inlist_0, max_nnei_0, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
+      deepmd::prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_1[0], &nlist_1[0], &posi_cpy_1[0], &atype_cpy_1[0], inlist_1, max_nnei_1, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
       std::vector<double> energy_0(nloc), energy_1(nloc);
       std::vector<double> t_force(nall * 3), t_virial(nall * 9);
-      pair_tab_cpu(
+      deepmd::pair_tab_cpu(
 	  &energy_0[0],
 	  &t_force[0],
 	  &t_virial[0],
@@ -764,7 +764,7 @@ TEST_F(TestPairTabTriBox, cpu_v_num_deriv)
 	  &natoms[0],
 	  sel_a,
 	  sel_r);  
-      pair_tab_cpu(
+      deepmd::pair_tab_cpu(
 	  &energy_1[0],
 	  &t_force[0],
 	  &t_virial[0],
diff --git a/source/lib/tests/test_prod_force_a.cc b/source/lib/tests/test_prod_force_a.cc
index 7cd76e6ed7..7aeb9ca99f 100644
--- a/source/lib/tests/test_prod_force_a.cc
+++ b/source/lib/tests/test_prod_force_a.cc
@@ -65,7 +65,7 @@ class TestProdForceA : public ::testing::Test
       }
       std::vector<double > t_env, t_env_deriv, t_rij_a;
       // compute env_mat and its deriv, record
-      env_mat_a_cpu<double>(t_env, t_env_deriv, t_rij_a, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
+      deepmd::env_mat_a_cpu<double>(t_env, t_env_deriv, t_rij_a, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
       for (int jj = 0; jj < ndescrpt; ++jj){
 	env[ii*ndescrpt+jj] = t_env[jj];
 	for (int dd = 0; dd < 3; ++dd){
@@ -86,7 +86,7 @@ TEST_F(TestProdForceA, cpu)
 {
   std::vector<double> force(nall * 3);
   int n_a_sel = nnei;
-  prod_force_a_cpu<double> (&force[0], &net_deriv[0], &env_deriv[0], &nlist[0], nloc, nall, nnei);
+  deepmd::prod_force_a_cpu<double> (&force[0], &net_deriv[0], &env_deriv[0], &nlist[0], nloc, nall, nnei);
   EXPECT_EQ(force.size(), nall * 3);
   EXPECT_EQ(force.size(), expected_force.size());
   for (int jj = 0; jj < force.size(); ++jj){
@@ -126,4 +126,4 @@ TEST_F(TestProdForceA, gpu_cuda)
     EXPECT_LT(fabs(force[jj] - expected_force[jj]) , 1e-5);
   }
 }
-#endif // GOOGLE_CUDA
\ No newline at end of file
+#endif // GOOGLE_CUDA
diff --git a/source/lib/tests/test_prod_force_grad_a.cc b/source/lib/tests/test_prod_force_grad_a.cc
index b52c1c951e..82ab484616 100644
--- a/source/lib/tests/test_prod_force_grad_a.cc
+++ b/source/lib/tests/test_prod_force_grad_a.cc
@@ -64,7 +64,7 @@ class TestProdForceGradA : public ::testing::Test
       }
       std::vector<double > t_env, t_env_deriv, t_rij_a;
       // compute env_mat and its deriv, record
-      env_mat_a_cpu<double>(t_env, t_env_deriv, t_rij_a, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
+      deepmd::env_mat_a_cpu<double>(t_env, t_env_deriv, t_rij_a, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
       for (int jj = 0; jj < ndescrpt; ++jj){
 	env[ii*ndescrpt+jj] = t_env[jj];
 	for (int dd = 0; dd < 3; ++dd){
@@ -84,7 +84,7 @@ class TestProdForceGradA : public ::testing::Test
 TEST_F(TestProdForceGradA, cpu)
 {
   std::vector<double> grad_net(nloc * ndescrpt);
-  prod_force_grad_a_cpu<double>(&grad_net[0], &grad[0], &env_deriv[0], &nlist[0], nloc, nnei);
+  deepmd::prod_force_grad_a_cpu<double>(&grad_net[0], &grad[0], &env_deriv[0], &nlist[0], nloc, nnei);
   EXPECT_EQ(grad_net.size(), nloc * ndescrpt);
   EXPECT_EQ(grad_net.size(), expected_grad_net.size());
   for (int jj = 0; jj < grad_net.size(); ++jj){
diff --git a/source/lib/tests/test_prod_force_grad_r.cc b/source/lib/tests/test_prod_force_grad_r.cc
index c32ce150e8..37534db7f8 100644
--- a/source/lib/tests/test_prod_force_grad_r.cc
+++ b/source/lib/tests/test_prod_force_grad_r.cc
@@ -64,7 +64,7 @@ class TestProdForceGradR : public ::testing::Test
       }
       std::vector<double > t_env, t_env_deriv, t_rij_a;
       // compute env_mat and its deriv, record
-      env_mat_a_cpu<double>(t_env, t_env_deriv, t_rij_a, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
+      deepmd::env_mat_a_cpu<double>(t_env, t_env_deriv, t_rij_a, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
       for (int jj = 0; jj < ndescrpt; ++jj){
 	env[ii*ndescrpt+jj] = t_env[jj];
 	for (int dd = 0; dd < 3; ++dd){
@@ -84,7 +84,7 @@ class TestProdForceGradR : public ::testing::Test
 TEST_F(TestProdForceGradR, cpu)
 {
   std::vector<double> grad_net(nloc * ndescrpt);
-  prod_force_grad_r_cpu<double>(&grad_net[0], &grad[0], &env_deriv[0], &nlist[0], nloc, nnei);
+  deepmd::prod_force_grad_r_cpu<double>(&grad_net[0], &grad[0], &env_deriv[0], &nlist[0], nloc, nnei);
   EXPECT_EQ(grad_net.size(), nloc * ndescrpt);
   EXPECT_EQ(grad_net.size(), expected_grad_net.size());
   for (int jj = 0; jj < grad_net.size(); ++jj){
diff --git a/source/lib/tests/test_prod_force_r.cc b/source/lib/tests/test_prod_force_r.cc
index 6a3d68576b..033c41a7fe 100644
--- a/source/lib/tests/test_prod_force_r.cc
+++ b/source/lib/tests/test_prod_force_r.cc
@@ -65,7 +65,7 @@ class TestProdForceR : public ::testing::Test
       }
       std::vector<double > t_env, t_env_deriv, t_rij_a;
       // compute env_mat and its deriv, record
-      env_mat_r_cpu<double>(t_env, t_env_deriv, t_rij_a, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
+      deepmd::env_mat_r_cpu<double>(t_env, t_env_deriv, t_rij_a, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
       for (int jj = 0; jj < ndescrpt; ++jj){
 	env[ii*ndescrpt+jj] = t_env[jj];
 	for (int dd = 0; dd < 3; ++dd){
@@ -86,7 +86,7 @@ TEST_F(TestProdForceR, cpu)
 {
   std::vector<double> force(nall * 3);
   int n_a_sel = nnei;
-  prod_force_r_cpu<double> (&force[0], &net_deriv[0], &env_deriv[0], &nlist[0], nloc, nall, nnei);
+  deepmd::prod_force_r_cpu<double> (&force[0], &net_deriv[0], &env_deriv[0], &nlist[0], nloc, nall, nnei);
   EXPECT_EQ(force.size(), nall * 3);
   EXPECT_EQ(force.size(), expected_force.size());
   for (int jj = 0; jj < force.size(); ++jj){
diff --git a/source/lib/tests/test_prod_virial_a.cc b/source/lib/tests/test_prod_virial_a.cc
index aa0e7bfac9..f1d4ee619a 100644
--- a/source/lib/tests/test_prod_virial_a.cc
+++ b/source/lib/tests/test_prod_virial_a.cc
@@ -68,7 +68,7 @@ class TestProdVirialA : public ::testing::Test
       }
       std::vector<double > t_env, t_env_deriv, t_rij;
       // compute env_mat and its deriv, record
-      env_mat_a_cpu<double>(t_env, t_env_deriv, t_rij, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
+      deepmd::env_mat_a_cpu<double>(t_env, t_env_deriv, t_rij, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
       for (int jj = 0; jj < ndescrpt; ++jj){
 	env[ii*ndescrpt+jj] = t_env[jj];
 	for (int dd = 0; dd < 3; ++dd){
@@ -93,7 +93,7 @@ TEST_F(TestProdVirialA, cpu)
   std::vector<double> virial(9);
   std::vector<double> atom_virial(nall * 9);
   int n_a_sel = nnei;
-  prod_virial_a_cpu<double> (&virial[0], &atom_virial[0], &net_deriv[0], &env_deriv[0], &rij[0], &nlist[0], nloc, nall, nnei);
+  deepmd::prod_virial_a_cpu<double> (&virial[0], &atom_virial[0], &net_deriv[0], &env_deriv[0], &rij[0], &nlist[0], nloc, nall, nnei);
   EXPECT_EQ(virial.size(), 9);
   EXPECT_EQ(virial.size(), expected_virial.size());
   EXPECT_EQ(atom_virial.size(), nall * 9);  
@@ -159,4 +159,4 @@ TEST_F(TestProdVirialA, gpu_cuda)
     EXPECT_LT(fabs(atom_virial[jj] - expected_atom_virial[jj]) , 1e-5);
   }  
 }
-#endif // GOOGLE_CUDA
\ No newline at end of file
+#endif // GOOGLE_CUDA
diff --git a/source/lib/tests/test_prod_virial_grad_a.cc b/source/lib/tests/test_prod_virial_grad_a.cc
index cba31a10a8..53ad63e965 100644
--- a/source/lib/tests/test_prod_virial_grad_a.cc
+++ b/source/lib/tests/test_prod_virial_grad_a.cc
@@ -64,7 +64,7 @@ class TestProdVirialGradA : public ::testing::Test
       }
       std::vector<double > t_env, t_env_deriv, t_rij;
       // compute env_mat and its deriv, record
-      env_mat_a_cpu<double>(t_env, t_env_deriv, t_rij, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
+      deepmd::env_mat_a_cpu<double>(t_env, t_env_deriv, t_rij, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
       for (int jj = 0; jj < ndescrpt; ++jj){
 	env[ii*ndescrpt+jj] = t_env[jj];
 	for (int dd = 0; dd < 3; ++dd){
@@ -88,7 +88,7 @@ TEST_F(TestProdVirialGradA, cpu)
 {
   std::vector<double> grad_net(nloc * ndescrpt);
   int n_a_sel = nnei;
-  prod_virial_grad_a_cpu<double> (&grad_net[0], &grad[0], &env_deriv[0], &rij[0], &nlist[0], nloc, nnei);
+  deepmd::prod_virial_grad_a_cpu<double> (&grad_net[0], &grad[0], &env_deriv[0], &rij[0], &nlist[0], nloc, nnei);
   EXPECT_EQ(grad_net.size(), nloc * ndescrpt);
   EXPECT_EQ(grad_net.size(), expected_grad_net.size());
   for (int jj = 0; jj < grad_net.size(); ++jj){
diff --git a/source/lib/tests/test_prod_virial_grad_r.cc b/source/lib/tests/test_prod_virial_grad_r.cc
index 45e6944590..2cb0c91038 100644
--- a/source/lib/tests/test_prod_virial_grad_r.cc
+++ b/source/lib/tests/test_prod_virial_grad_r.cc
@@ -64,7 +64,7 @@ class TestProdVirialGradR : public ::testing::Test
       }
       std::vector<double > t_env, t_env_deriv, t_rij;
       // compute env_mat and its deriv, record
-      env_mat_a_cpu<double>(t_env, t_env_deriv, t_rij, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
+      deepmd::env_mat_a_cpu<double>(t_env, t_env_deriv, t_rij, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
       for (int jj = 0; jj < ndescrpt; ++jj){
 	env[ii*ndescrpt+jj] = t_env[jj];
 	for (int dd = 0; dd < 3; ++dd){
@@ -88,7 +88,7 @@ TEST_F(TestProdVirialGradR, cpu)
 {
   std::vector<double> grad_net(nloc * ndescrpt);
   int n_a_sel = nnei;
-  prod_virial_grad_r_cpu<double> (&grad_net[0], &grad[0], &env_deriv[0], &rij[0], &nlist[0], nloc, nnei);
+  deepmd::prod_virial_grad_r_cpu<double> (&grad_net[0], &grad[0], &env_deriv[0], &rij[0], &nlist[0], nloc, nnei);
   EXPECT_EQ(grad_net.size(), nloc * ndescrpt);
   EXPECT_EQ(grad_net.size(), expected_grad_net.size());
   for (int jj = 0; jj < grad_net.size(); ++jj){
diff --git a/source/lib/tests/test_prod_virial_r.cc b/source/lib/tests/test_prod_virial_r.cc
index c09a2f04ad..101b1659f8 100644
--- a/source/lib/tests/test_prod_virial_r.cc
+++ b/source/lib/tests/test_prod_virial_r.cc
@@ -68,7 +68,7 @@ class TestProdVirialR : public ::testing::Test
       }
       std::vector<double > t_env, t_env_deriv, t_rij;
       // compute env_mat and its deriv, record
-      env_mat_r_cpu<double>(t_env, t_env_deriv, t_rij, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
+      deepmd::env_mat_r_cpu<double>(t_env, t_env_deriv, t_rij, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
       for (int jj = 0; jj < ndescrpt; ++jj){
 	env[ii*ndescrpt+jj] = t_env[jj];
 	for (int dd = 0; dd < 3; ++dd){
@@ -93,7 +93,7 @@ TEST_F(TestProdVirialR, cpu)
   std::vector<double> virial(9);
   std::vector<double> atom_virial(nall * 9);
   int n_a_sel = nnei;
-  prod_virial_r_cpu<double> (&virial[0], &atom_virial[0], &net_deriv[0], &env_deriv[0], &rij[0], &nlist[0], nloc, nall, nnei);
+  deepmd::prod_virial_r_cpu<double> (&virial[0], &atom_virial[0], &net_deriv[0], &env_deriv[0], &rij[0], &nlist[0], nloc, nall, nnei);
   EXPECT_EQ(virial.size(), 9);
   EXPECT_EQ(atom_virial.size(), nall * 9);  
   EXPECT_EQ(virial.size(), expected_virial.size());
@@ -159,4 +159,4 @@ TEST_F(TestProdVirialR, gpu_cuda)
     EXPECT_LT(fabs(atom_virial[jj] - expected_atom_virial[jj]) , 1e-5);
   }
 }
-#endif // GOOGLE_CUDA
\ No newline at end of file
+#endif // GOOGLE_CUDA
diff --git a/source/lib/tests/test_simulation_region.cc b/source/lib/tests/test_simulation_region.cc
index dbd65d2883..674c505c02 100644
--- a/source/lib/tests/test_simulation_region.cc
+++ b/source/lib/tests/test_simulation_region.cc
@@ -41,7 +41,7 @@ TEST_F(TestRegion, orig)
 TEST_F(TestRegion, cpu)
 {
   // check rec_box
-  Region<double> region;
+  deepmd::Region<double> region;
   init_region_cpu(region, &ref_boxt[0]);
   for(int ii = 0; ii < 9; ++ii){
     EXPECT_LT(fabs(region.rec_boxt[ii] - ref_rec_boxt[ii]), 1e-10);
diff --git a/source/lib/tests/test_soft_min_switch.cc b/source/lib/tests/test_soft_min_switch.cc
index 0e9319c183..e8e1a0eddc 100644
--- a/source/lib/tests/test_soft_min_switch.cc
+++ b/source/lib/tests/test_soft_min_switch.cc
@@ -65,7 +65,7 @@ class TestSoftMinSwitch : public ::testing::Test
       }
       std::vector<double > t_env, t_env_deriv, t_rij;
       // compute env_mat and its deriv, record
-      env_mat_a_cpu<double>(t_env, t_env_deriv, t_rij, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
+      deepmd::env_mat_a_cpu<double>(t_env, t_env_deriv, t_rij, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
       for (int jj = 0; jj < nnei * 3; ++jj){
 	rij[ii*nnei*3 + jj] = t_rij[jj];
       }      
@@ -79,7 +79,7 @@ TEST_F(TestSoftMinSwitch, cpu)
 {
   std::vector<double> sw_value(nloc);
   std::vector<double> sw_deriv(nloc * nnei * 3);
-  soft_min_switch_cpu<double> (&sw_value[0], &sw_deriv[0], &rij[0], &nlist[0], nloc, nnei, alpha, rmin, rmax);
+  deepmd::soft_min_switch_cpu<double> (&sw_value[0], &sw_deriv[0], &rij[0], &nlist[0], nloc, nnei, alpha, rmin, rmax);
   EXPECT_EQ(sw_value.size(), nloc);
   EXPECT_EQ(sw_value.size(), expected_value.size());
   EXPECT_EQ(sw_deriv.size(), nloc * nnei * 3);
@@ -106,7 +106,7 @@ TEST_F(TestSoftMinSwitch, cpu_num_deriv)
   std::vector<int> fmt_nlist_a;
   double hh = 1e-5;
   
-  soft_min_switch_cpu<double> (&sw_value[0], &sw_deriv[0], &rij[0], &nlist[0], nloc, nnei, alpha, rmin, rmax);
+  deepmd::soft_min_switch_cpu<double> (&sw_value[0], &sw_deriv[0], &rij[0], &nlist[0], nloc, nnei, alpha, rmin, rmax);
   EXPECT_EQ(sw_value.size(), nloc);
   EXPECT_EQ(sw_deriv.size(), nloc * nnei * 3);
 
@@ -123,8 +123,8 @@ TEST_F(TestSoftMinSwitch, cpu_num_deriv)
 	std::vector<double> posi_1 = posi_cpy;
 	posi_0[j_idx*3+dd] -= hh;
 	posi_1[j_idx*3+dd] += hh;
-	env_mat_a_cpu<double>(env, env_deriv, t_rij_0, posi_0, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);
-	env_mat_a_cpu<double>(env, env_deriv, t_rij_1, posi_1, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);
+	deepmd::env_mat_a_cpu<double>(env, env_deriv, t_rij_0, posi_0, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);
+	deepmd::env_mat_a_cpu<double>(env, env_deriv, t_rij_1, posi_1, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);
 	EXPECT_EQ(t_rij_0.size(), nnei * 3);
 	EXPECT_EQ(t_rij_1.size(), nnei * 3);
 	rij_0 = rij;
@@ -133,8 +133,8 @@ TEST_F(TestSoftMinSwitch, cpu_num_deriv)
 	  rij_0[ii*nnei*3 + jj*3 + dd] = t_rij_0[jj*3 + dd];
 	  rij_1[ii*nnei*3 + jj*3 + dd] = t_rij_1[jj*3 + dd];
 	}      
-	soft_min_switch_cpu<double> (&sw_value_0[0], &sw_deriv_0[0], &rij_0[0], &nlist[0], nloc, nnei, alpha, rmin, rmax);
-	soft_min_switch_cpu<double> (&sw_value_1[0], &sw_deriv_1[0], &rij_1[0], &nlist[0], nloc, nnei, alpha, rmin, rmax);
+	deepmd::soft_min_switch_cpu<double> (&sw_value_0[0], &sw_deriv_0[0], &rij_0[0], &nlist[0], nloc, nnei, alpha, rmin, rmax);
+	deepmd::soft_min_switch_cpu<double> (&sw_value_1[0], &sw_deriv_1[0], &rij_1[0], &nlist[0], nloc, nnei, alpha, rmin, rmax);
 	double ana_deriv = sw_deriv[ii*nnei*3 + jj*3 + dd];
 	double num_deriv = (sw_value_1[ii] - sw_value_0[ii]) / (2. * hh);
 	EXPECT_LT(fabs(num_deriv - ana_deriv), 1e-5);
diff --git a/source/lib/tests/test_soft_min_switch_force.cc b/source/lib/tests/test_soft_min_switch_force.cc
index ebe1b62dfe..da40ab662b 100644
--- a/source/lib/tests/test_soft_min_switch_force.cc
+++ b/source/lib/tests/test_soft_min_switch_force.cc
@@ -66,14 +66,14 @@ class TestSoftMinSwitchForce : public ::testing::Test
       }
       std::vector<double > t_env, t_env_deriv, t_rij;
       // compute env_mat and its deriv, record
-      env_mat_a_cpu<double>(t_env, t_env_deriv, t_rij, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
+      deepmd::env_mat_a_cpu<double>(t_env, t_env_deriv, t_rij, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
       for (int jj = 0; jj < nnei * 3; ++jj){
 	rij[ii*nnei*3 + jj] = t_rij[jj];
       }      
     }
     sw_value.resize(nloc);
     sw_deriv.resize(nloc * nnei * 3);
-    soft_min_switch_cpu<double> (&sw_value[0], &sw_deriv[0], &rij[0], &nlist[0], nloc, 
+    deepmd::soft_min_switch_cpu<double> (&sw_value[0], &sw_deriv[0], &rij[0], &nlist[0], nloc, 
 				 nnei, alpha, rmin, rmax);
     du.resize(nloc);
     for (int ii = 0; ii < nloc; ++ii){
@@ -87,7 +87,7 @@ class TestSoftMinSwitchForce : public ::testing::Test
 TEST_F(TestSoftMinSwitchForce, cpu)
 {
   std::vector<double> force(nall * 3);
-  soft_min_switch_force_cpu(
+  deepmd::soft_min_switch_force_cpu(
       &force[0],
       &du[0],
       &sw_deriv[0],
diff --git a/source/lib/tests/test_soft_min_switch_force_grad.cc b/source/lib/tests/test_soft_min_switch_force_grad.cc
index 66faf0801a..0591b91e3f 100644
--- a/source/lib/tests/test_soft_min_switch_force_grad.cc
+++ b/source/lib/tests/test_soft_min_switch_force_grad.cc
@@ -66,14 +66,14 @@ class TestSoftMinSwitchForceGrad : public ::testing::Test
       }
       std::vector<double > t_env, t_env_deriv, t_rij;
       // compute env_mat and its deriv, record
-      env_mat_a_cpu<double>(t_env, t_env_deriv, t_rij, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
+      deepmd::env_mat_a_cpu<double>(t_env, t_env_deriv, t_rij, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
       for (int jj = 0; jj < nnei * 3; ++jj){
 	rij[ii*nnei*3 + jj] = t_rij[jj];
       }      
     }
     sw_value.resize(nloc);
     sw_deriv.resize(nloc * nnei * 3);
-    soft_min_switch_cpu<double> (&sw_value[0], &sw_deriv[0], &rij[0], &nlist[0], nloc, 
+    deepmd::soft_min_switch_cpu<double> (&sw_value[0], &sw_deriv[0], &rij[0], &nlist[0], nloc, 
 				 nnei, alpha, rmin, rmax);
     grad.resize(nloc * 3);
     for (int ii = 0; ii < nloc; ++ii){
@@ -87,7 +87,7 @@ class TestSoftMinSwitchForceGrad : public ::testing::Test
 TEST_F(TestSoftMinSwitchForceGrad, cpu)
 {
   std::vector<double> grad_net(nloc);
-  soft_min_switch_force_grad_cpu(
+  deepmd::soft_min_switch_force_grad_cpu(
       &grad_net[0],
       &grad[0],
       &sw_deriv[0],
diff --git a/source/lib/tests/test_soft_min_switch_virial.cc b/source/lib/tests/test_soft_min_switch_virial.cc
index 6132590adf..69471eb9ce 100644
--- a/source/lib/tests/test_soft_min_switch_virial.cc
+++ b/source/lib/tests/test_soft_min_switch_virial.cc
@@ -69,14 +69,14 @@ class TestSoftMinSwitchVirial : public ::testing::Test
       }
       std::vector<double > t_env, t_env_deriv, t_rij;
       // compute env_mat and its deriv, record
-      env_mat_a_cpu<double>(t_env, t_env_deriv, t_rij, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
+      deepmd::env_mat_a_cpu<double>(t_env, t_env_deriv, t_rij, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
       for (int jj = 0; jj < nnei * 3; ++jj){
 	rij[ii*nnei*3 + jj] = t_rij[jj];
       }      
     }
     sw_value.resize(nloc);
     sw_deriv.resize(nloc * nnei * 3);
-    soft_min_switch_cpu<double> (&sw_value[0], &sw_deriv[0], &rij[0], &nlist[0], nloc, 
+    deepmd::soft_min_switch_cpu<double> (&sw_value[0], &sw_deriv[0], &rij[0], &nlist[0], nloc, 
 				 nnei, alpha, rmin, rmax);
     du.resize(nloc);
     for (int ii = 0; ii < nloc; ++ii){
@@ -91,7 +91,7 @@ TEST_F(TestSoftMinSwitchVirial, cpu)
 {
   std::vector<double> virial(9);
   std::vector<double> atom_virial(nall * 9);
-  soft_min_switch_virial_cpu(
+  deepmd::soft_min_switch_virial_cpu(
       &virial[0],
       &atom_virial[0],
       &du[0],
diff --git a/source/lib/tests/test_soft_min_switch_virial_grad.cc b/source/lib/tests/test_soft_min_switch_virial_grad.cc
index 540e846f73..db5b05fe26 100644
--- a/source/lib/tests/test_soft_min_switch_virial_grad.cc
+++ b/source/lib/tests/test_soft_min_switch_virial_grad.cc
@@ -66,14 +66,14 @@ class TestSoftMinSwitchVirialGrad : public ::testing::Test
       }
       std::vector<double > t_env, t_env_deriv, t_rij;
       // compute env_mat and its deriv, record
-      env_mat_a_cpu<double>(t_env, t_env_deriv, t_rij, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
+      deepmd::env_mat_a_cpu<double>(t_env, t_env_deriv, t_rij, posi_cpy, atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc);    
       for (int jj = 0; jj < nnei * 3; ++jj){
 	rij[ii*nnei*3 + jj] = t_rij[jj];
       }      
     }
     sw_value.resize(nloc);
     sw_deriv.resize(nloc * nnei * 3);
-    soft_min_switch_cpu<double> (&sw_value[0], &sw_deriv[0], &rij[0], &nlist[0], nloc, 
+    deepmd::soft_min_switch_cpu<double> (&sw_value[0], &sw_deriv[0], &rij[0], &nlist[0], nloc, 
 				 nnei, alpha, rmin, rmax);
     grad.resize(nloc * 3);
     for (int ii = 0; ii < nloc; ++ii){
@@ -87,7 +87,7 @@ class TestSoftMinSwitchVirialGrad : public ::testing::Test
 TEST_F(TestSoftMinSwitchVirialGrad, cpu)
 {
   std::vector<double> grad_net(nloc);
-  soft_min_switch_virial_grad_cpu(
+  deepmd::soft_min_switch_virial_grad_cpu(
       &grad_net[0],
       &grad[0],
       &sw_deriv[0],
diff --git a/source/lmp/fix_dplr.cpp b/source/lmp/fix_dplr.cpp
index 2a5826aa84..5ce85490fc 100644
--- a/source/lmp/fix_dplr.cpp
+++ b/source/lmp/fix_dplr.cpp
@@ -263,7 +263,7 @@ void FixDPLR::pre_force(int vflag)
   }
   // get lammps nlist
   NeighList * list = pair_deepmd->list;
-  InputNlist lmp_list (list->inum, list->ilist, list->numneigh, list->firstneigh);
+  deepmd::InputNlist lmp_list (list->inum, list->ilist, list->numneigh, list->firstneigh);
   // declear output
   vector<FLOAT_PREC> tensor;
   // compute
@@ -417,7 +417,7 @@ void FixDPLR::post_force(int vflag)
   }
   // lmp nlist
   NeighList * list = pair_deepmd->list;
-  InputNlist lmp_list (list->inum, list->ilist, list->numneigh, list->firstneigh);
+  deepmd::InputNlist lmp_list (list->inum, list->ilist, list->numneigh, list->firstneigh);
   // bonded pairs
   vector<pair<int,int> > valid_pairs;
   get_valid_pairs(valid_pairs);  
diff --git a/source/lmp/pair_deepmd.cpp b/source/lmp/pair_deepmd.cpp
index 593e6a4246..8e2d691562 100644
--- a/source/lmp/pair_deepmd.cpp
+++ b/source/lmp/pair_deepmd.cpp
@@ -360,7 +360,7 @@ void PairDeepMD::compute(int eflag, int vflag)
   multi_models_no_mod_devi = (numb_models > 1 && (out_freq == 0 || update->ntimestep % out_freq != 0));
   multi_models_mod_devi = (numb_models > 1 && (out_freq > 0 && update->ntimestep % out_freq == 0));
   if (do_ghost) {
-    InputNlist lmp_list (list->inum, list->ilist, list->numneigh, list->firstneigh);
+    deepmd::InputNlist lmp_list (list->inum, list->ilist, list->numneigh, list->firstneigh);
     if (single_model || multi_models_no_mod_devi) {
       if ( ! (eflag_atom || vflag_atom) ) {      
 #ifdef HIGH_PREC
diff --git a/source/lmp/pair_deepmd.h.in b/source/lmp/pair_deepmd.h.in
index a54f45c2f3..ee20de13c6 100644
--- a/source/lmp/pair_deepmd.h.in
+++ b/source/lmp/pair_deepmd.h.in
@@ -1,16 +1,3 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
 #ifdef PAIR_CLASS
 
 PairStyle(deepmd,PairDeepMD)
diff --git a/source/lmp/pppm_dplr.cpp b/source/lmp/pppm_dplr.cpp
index da95f58c9d..7b2fbe6e65 100644
--- a/source/lmp/pppm_dplr.cpp
+++ b/source/lmp/pppm_dplr.cpp
@@ -1,16 +1,3 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   https://lammps.sandia.gov/, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
 #include <math.h>
 #include "pppm_dplr.h"
 #include "atom.h"
diff --git a/source/lmp/pppm_dplr.h b/source/lmp/pppm_dplr.h
index d9752583d5..4f22a9c621 100644
--- a/source/lmp/pppm_dplr.h
+++ b/source/lmp/pppm_dplr.h
@@ -1,16 +1,3 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
 #ifdef KSPACE_CLASS
 
 KSpaceStyle(pppm/dplr,PPPMDPLR)
diff --git a/source/op/ewald_recp.cc b/source/op/ewald_recp.cc
index 21ac163b5c..ae3aa84bc1 100644
--- a/source/op/ewald_recp.cc
+++ b/source/op/ewald_recp.cc
@@ -79,7 +79,7 @@ class EwaldRecpOp : public OpKernel {
       int coord_iter = kk * nloc * 3;
       int charge_iter = kk * nloc;
       // set region
-      Region<FPTYPE> region;
+      deepmd::Region<FPTYPE> region;
       init_region_cpu(region, &box(box_iter));
 
       // set & normalize coord
@@ -117,7 +117,7 @@ class EwaldRecpOp : public OpKernel {
     }
   }
 private:
-  EwaldParameters<FPTYPE> ep;
+  deepmd::EwaldParameters<FPTYPE> ep;
 };
 
 #define REGISTER_CPU(T)                                                                 \
diff --git a/source/op/gelu_multi_device.cc b/source/op/gelu_multi_device.cc
index 4ea7afad17..5923ee5675 100644
--- a/source/op/gelu_multi_device.cc
+++ b/source/op/gelu_multi_device.cc
@@ -51,7 +51,7 @@ class GeluOp : public OpKernel {
       #endif // GOOGLE_CUDA
     }
     else if (device == "CPU") {
-      gelu_cpu(
+      deepmd::gelu_cpu(
           out, 
           x, size);
     }
@@ -94,7 +94,7 @@ class GeluGradOp : public OpKernel {
       #endif // GOOGLE_CUDA
     }
     else if (device == "CPU") {
-      gelu_grad_cpu(
+      deepmd::gelu_grad_cpu(
           out, 
           x, dy, size);
     }
@@ -135,7 +135,7 @@ class GeluGradGradOp : public OpKernel {
       #endif // GOOGLE_CUDA
     }
     else if (device == "CPU") {
-      gelu_grad_grad_cpu(
+      deepmd::gelu_grad_grad_cpu(
           out, 
           x, dy, dy_2, size);
     }
@@ -170,4 +170,4 @@ REGISTER_KERNEL_BUILDER(                                                \
     GeluGradGradOp<GPUDevice, T>);                                      
 REGISTER_GPU(float);
 REGISTER_GPU(double);
-#endif // GOOGLE_CUDA
\ No newline at end of file
+#endif // GOOGLE_CUDA
diff --git a/source/op/map_aparam.cc b/source/op/map_aparam.cc
index 1a08b678c3..3eba13990a 100644
--- a/source/op/map_aparam.cc
+++ b/source/op/map_aparam.cc
@@ -62,7 +62,7 @@ class MapAparamOp : public OpKernel {
       int output_iter	= kk * nloc * nnei * numb_aparam;
       int aparam_iter	= kk * nall * numb_aparam;
       int nlist_iter	= kk * nloc * nnei;
-      map_aparam_cpu(
+      deepmd::map_aparam_cpu(
 	  &output(output_iter),
 	  &aparam(aparam_iter),
 	  &nlist(nlist_iter),
diff --git a/source/op/pair_tab.cc b/source/op/pair_tab.cc
index 31c2083cf7..fb3689b5a8 100644
--- a/source/op/pair_tab.cc
+++ b/source/op/pair_tab.cc
@@ -126,7 +126,7 @@ class PairTabOp : public OpKernel {
     // loop over samples
 #pragma omp parallel for 
     for (int kk = 0; kk < nframes; ++kk){
-      pair_tab_cpu<FPTYPE>(
+      deepmd::pair_tab_cpu<FPTYPE>(
 	  &energy(kk,0),
 	  &force(kk,0),
 	  &virial(kk,0),
diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc
index e12fa73017..c0423db358 100644
--- a/source/op/prod_env_mat_multi_device.cc
+++ b/source/op/prod_env_mat_multi_device.cc
@@ -89,7 +89,7 @@ _prepare_coord_nlist_cpu(
     int const** type,
     std::vector<int> & type_cpy,
     std::vector<int> & idx_mapping,
-    InputNlist & inlist,
+    deepmd::InputNlist & inlist,
     std::vector<int> & ilist,
     std::vector<int> & numneigh,
     std::vector<int*> & firstneigh,
@@ -271,7 +271,7 @@ class ProdEnvMatAOp : public OpKernel {
       array_longlong = uint64_temp.flat<unsigned long long>().data();
 
       // update nbor list
-      InputNlist inlist;
+      deepmd::InputNlist inlist;
       inlist.inum = nloc;
       env_mat_nbor_update(
           inlist, gpu_inlist, max_nbor_size, nbor_list_dev,
@@ -284,7 +284,7 @@ class ProdEnvMatAOp : public OpKernel {
       #endif //GOOGLE_CUDA
     }
     else if (device == "CPU") {
-      InputNlist inlist;
+      deepmd::InputNlist inlist;
       // some buffers, be freed after the evaluation of this frame
       std::vector<int> idx_mapping;
       std::vector<int> ilist(nloc), numneigh(nloc);
@@ -325,7 +325,7 @@ class ProdEnvMatAOp : public OpKernel {
   std::string device;
   int * array_int = NULL;
   unsigned long long * array_longlong = NULL;
-  InputNlist gpu_inlist;
+  deepmd::InputNlist gpu_inlist;
   int * nbor_list_dev = NULL;
 };
 
@@ -481,7 +481,7 @@ class ProdEnvMatROp : public OpKernel {
       array_longlong = uint64_temp.flat<unsigned long long>().data();
       
       // update nbor list
-      InputNlist inlist;
+      deepmd::InputNlist inlist;
       inlist.inum = nloc;
       env_mat_nbor_update(
           inlist, gpu_inlist, max_nbor_size, nbor_list_dev,
@@ -494,7 +494,7 @@ class ProdEnvMatROp : public OpKernel {
       #endif //GOOGLE_CUDA
     }
     else if (device == "CPU") {
-      InputNlist inlist;
+      deepmd::InputNlist inlist;
       // some buffers, be freed after the evaluation of this frame
       std::vector<int> idx_mapping;
       std::vector<int> ilist(nloc), numneigh(nloc);
@@ -533,7 +533,7 @@ class ProdEnvMatROp : public OpKernel {
   std::string device;
   int * array_int = NULL;
   unsigned long long * array_longlong = NULL;
-  InputNlist gpu_inlist;
+  deepmd::InputNlist gpu_inlist;
   int * nbor_list_dev = NULL;
 };
 
@@ -557,7 +557,7 @@ _norm_copy_coord_cpu(
 {
   std::vector<FPTYPE> tmp_coord(nall*3);
   std::copy(coord, coord+nall*3, tmp_coord.begin());
-  Region<FPTYPE> region;
+  deepmd::Region<FPTYPE> region;
   init_region_cpu(region, box);
   normalize_coord_cpu(&tmp_coord[0], nall, region);
   int tt;
@@ -599,7 +599,7 @@ _build_nlist_cpu(
       jlist[ii].resize(mem_nnei);
       firstneigh[ii] = &jlist[ii][0];
     }
-    InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+    deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
     int ret = build_nlist_cpu(
 	inlist, &max_nnei, 
 	coord, nloc, new_nall, mem_nnei, rcut_r);
@@ -639,7 +639,7 @@ _prepare_coord_nlist_cpu(
     int const** type,
     std::vector<int> & type_cpy,
     std::vector<int> & idx_mapping,
-    InputNlist & inlist,
+    deepmd::InputNlist & inlist,
     std::vector<int> & ilist,
     std::vector<int> & numneigh,
     std::vector<int*> & firstneigh,
diff --git a/source/op/prod_force_se_a.cc b/source/op/prod_force_se_a.cc
index d286326c76..a07cd1d02d 100644
--- a/source/op/prod_force_se_a.cc
+++ b/source/op/prod_force_se_a.cc
@@ -90,13 +90,14 @@ class ProdForceSeAOp : public OpKernel {
       int in_iter	= kk * nloc * ndescrpt * 3;
       int nlist_iter	= kk * nloc * nnei;
 
-      prod_force_a_cpu<FPTYPE>(&force(force_iter),
-			       &net_deriv(net_iter),
-			       &in_deriv(in_iter),
-			       &nlist(nlist_iter),
-			       nloc, 
-			       nall,
-			       nnei);
+      deepmd::prod_force_a_cpu<FPTYPE>(
+	  &force(force_iter),
+	  &net_deriv(net_iter),
+	  &in_deriv(in_iter),
+	  &nlist(nlist_iter),
+	  nloc, 
+	  nall,
+	  nnei);
     }
   }
 private:
diff --git a/source/op/prod_force_se_a_grad.cc b/source/op/prod_force_se_a_grad.cc
index aa8dd4e62b..59878bf7d9 100644
--- a/source/op/prod_force_se_a_grad.cc
+++ b/source/op/prod_force_se_a_grad.cc
@@ -87,12 +87,13 @@ class ProdForceSeAGradOp : public OpKernel
       int nlist_iter	= kk * nloc * nnei;
       int grad_net_iter	= kk * nloc * ndescrpt;
 
-      prod_force_grad_a_cpu(&grad_net(grad_net_iter),
-			    &grad(grad_iter),
-			    &in_deriv(in_iter),
-			    &nlist(nlist_iter),
-			    nloc, 
-			    nnei);
+      deepmd::prod_force_grad_a_cpu(
+	  &grad_net(grad_net_iter),
+	  &grad(grad_iter),
+	  &in_deriv(in_iter),
+	  &nlist(nlist_iter),
+	  nloc, 
+	  nnei);
     }
   }
 private:
diff --git a/source/op/prod_force_se_r.cc b/source/op/prod_force_se_r.cc
index 8ce34a1489..1aa6d76760 100644
--- a/source/op/prod_force_se_r.cc
+++ b/source/op/prod_force_se_r.cc
@@ -81,13 +81,14 @@ class ProdForceSeROp : public OpKernel {
       int in_iter	= kk * nloc * ndescrpt * 3;
       int nlist_iter	= kk * nloc * nnei;
 
-      prod_force_r_cpu<FPTYPE>(&force(force_iter),
-			       &net_deriv(net_iter),
-			       &in_deriv(in_iter),
-			       &nlist(nlist_iter),
-			       nloc, 
-			       nall,
-			       nnei);
+      deepmd::prod_force_r_cpu<FPTYPE>(
+	  &force(force_iter),
+	  &net_deriv(net_iter),
+	  &in_deriv(in_iter),
+	  &nlist(nlist_iter),
+	  nloc, 
+	  nall,
+	  nnei);
     }
   }
 };
diff --git a/source/op/prod_force_se_r_grad.cc b/source/op/prod_force_se_r_grad.cc
index 039a452e94..be8ebec213 100644
--- a/source/op/prod_force_se_r_grad.cc
+++ b/source/op/prod_force_se_r_grad.cc
@@ -81,7 +81,7 @@ class ProdForceSeRGradOp : public OpKernel
       int nlist_iter	= kk * nloc * nnei;
       int grad_net_iter	= kk * nloc * ndescrpt;
       
-      prod_force_grad_r_cpu(
+      deepmd::prod_force_grad_r_cpu(
 	  &grad_net(grad_net_iter),
 	  &grad(grad_iter),
 	  &in_deriv(in_iter),
diff --git a/source/op/prod_virial_se_a.cc b/source/op/prod_virial_se_a.cc
index 454edb2fbd..80223f5e67 100644
--- a/source/op/prod_virial_se_a.cc
+++ b/source/op/prod_virial_se_a.cc
@@ -91,15 +91,16 @@ class ProdVirialSeAOp : public OpKernel {
       int virial_iter	= kk * 9;
       int atom_virial_iter	= kk * nall * 9;
 
-      prod_virial_a_cpu<FPTYPE>(&virial(virial_iter),
-				&atom_virial(atom_virial_iter),
-				&net_deriv(net_iter),
-				&in_deriv(in_iter),
-				&rij(rij_iter),
-				&nlist(nlist_iter),
-				nloc,
-				nall,
-				nnei);
+      deepmd::prod_virial_a_cpu<FPTYPE>(
+	  &virial(virial_iter),
+	  &atom_virial(atom_virial_iter),
+	  &net_deriv(net_iter),
+	  &in_deriv(in_iter),
+	  &rij(rij_iter),
+	  &nlist(nlist_iter),
+	  nloc,
+	  nall,
+	  nnei);
     }
   }
 private:
diff --git a/source/op/prod_virial_se_a_grad.cc b/source/op/prod_virial_se_a_grad.cc
index d1366baf6f..2e6056c09c 100644
--- a/source/op/prod_virial_se_a_grad.cc
+++ b/source/op/prod_virial_se_a_grad.cc
@@ -95,13 +95,14 @@ class ProdVirialSeAGradOp : public OpKernel
       int nlist_iter	= kk * nloc * nnei;
       int grad_net_iter	= kk * nloc * ndescrpt;
 
-      prod_virial_grad_a_cpu(&grad_net(grad_net_iter),
-			     &grad(grad_iter),
-			     &in_deriv(in_iter),
-			     &rij(rij_iter),
-			     &nlist(nlist_iter),
-			     nloc,
-			     nnei);
+      deepmd::prod_virial_grad_a_cpu(
+	  &grad_net(grad_net_iter),
+	  &grad(grad_iter),
+	  &in_deriv(in_iter),
+	  &rij(rij_iter),
+	  &nlist(nlist_iter),
+	  nloc,
+	  nnei);
     }
   }
 private:
diff --git a/source/op/prod_virial_se_r.cc b/source/op/prod_virial_se_r.cc
index 4b6285ff48..d063de03a3 100644
--- a/source/op/prod_virial_se_r.cc
+++ b/source/op/prod_virial_se_r.cc
@@ -84,7 +84,7 @@ class ProdVirialSeROp : public OpKernel {
       int virial_iter	= kk * 9;
       int atom_virial_iter	= kk * nall * 9;
 
-      prod_virial_r_cpu(
+      deepmd::prod_virial_r_cpu(
 	  &virial(virial_iter),
 	  &atom_virial(atom_virial_iter),
 	  &net_deriv(net_iter),
diff --git a/source/op/prod_virial_se_r_grad.cc b/source/op/prod_virial_se_r_grad.cc
index d900781709..57482f0f8a 100644
--- a/source/op/prod_virial_se_r_grad.cc
+++ b/source/op/prod_virial_se_r_grad.cc
@@ -89,7 +89,7 @@ class ProdVirialSeRGradOp : public OpKernel
       int nlist_iter	= kk * nloc * nnei;
       int grad_net_iter	= kk * nloc * ndescrpt;
 
-      prod_virial_grad_r_cpu(
+      deepmd::prod_virial_grad_r_cpu(
 	  &grad_net(grad_net_iter),
 	  &grad(grad_iter),
 	  &in_deriv(in_iter),
diff --git a/source/op/soft_min.cc b/source/op/soft_min.cc
index 4dcedbaea1..cae371fc70 100644
--- a/source/op/soft_min.cc
+++ b/source/op/soft_min.cc
@@ -91,7 +91,7 @@ class SoftMinSwitchOp : public OpKernel {
     // loop over samples
 #pragma omp parallel for 
     for (int kk = 0; kk < nframes; ++kk){
-      soft_min_switch_cpu<FPTYPE>(
+      deepmd::soft_min_switch_cpu<FPTYPE>(
 	  &sw_value(kk, 0),
 	  &sw_deriv(kk, 0),
 	  &rij(kk, 0),
diff --git a/source/op/soft_min_force.cc b/source/op/soft_min_force.cc
index 143d70d5ae..15e5e3b41d 100644
--- a/source/op/soft_min_force.cc
+++ b/source/op/soft_min_force.cc
@@ -68,7 +68,7 @@ class SoftMinForceOp : public OpKernel {
     // loop over samples
 #pragma omp parallel for 
     for (int kk = 0; kk < nframes; ++kk){
-      soft_min_switch_force_cpu(
+      deepmd::soft_min_switch_force_cpu(
 	  &force(kk,0),
 	  &du(kk,0),
 	  &sw_deriv(kk,0),
diff --git a/source/op/soft_min_force_grad.cc b/source/op/soft_min_force_grad.cc
index 59ccb32351..6a161e4f4d 100644
--- a/source/op/soft_min_force_grad.cc
+++ b/source/op/soft_min_force_grad.cc
@@ -80,7 +80,7 @@ class SoftMinForceGradOp : public OpKernel
     // loop over frames
 #pragma omp parallel for
     for (int kk = 0; kk < nframes; ++kk){
-      soft_min_switch_force_grad_cpu(
+      deepmd::soft_min_switch_force_grad_cpu(
 	  &grad_net(kk,0),
 	  &grad(kk,0),
 	  &sw_deriv(kk,0),
diff --git a/source/op/soft_min_virial.cc b/source/op/soft_min_virial.cc
index 05e76d2741..3dcc2e6daa 100644
--- a/source/op/soft_min_virial.cc
+++ b/source/op/soft_min_virial.cc
@@ -82,7 +82,7 @@ class SoftMinVirialOp : public OpKernel {
     // loop over samples
 #pragma omp parallel for
     for (int kk = 0; kk < nframes; ++kk){
-      soft_min_switch_virial_cpu(
+      deepmd::soft_min_switch_virial_cpu(
 	  &virial(kk,0),
 	  &atom_virial(kk,0),
 	  &du(kk,0),
diff --git a/source/op/soft_min_virial_grad.cc b/source/op/soft_min_virial_grad.cc
index a115c461e3..c5c5399195 100644
--- a/source/op/soft_min_virial_grad.cc
+++ b/source/op/soft_min_virial_grad.cc
@@ -87,7 +87,7 @@ class SoftMinVirialGradOp : public OpKernel
     // loop over frames
 #pragma omp parallel for
     for (int kk = 0; kk < nframes; ++kk){
-      soft_min_switch_virial_grad_cpu(
+      deepmd::soft_min_switch_virial_grad_cpu(
 	  &grad_net(kk, 0),
 	  &grad(kk, 0),
 	  &sw_deriv(kk, 0),

From cccefd7ca957e1617e89c105c8bfc21601c19be6 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Wed, 17 Mar 2021 08:56:52 +0800
Subject: [PATCH 2/7] fix bug in tests

---
 source/api_cc/tests/test_deepdipole.cc         | 2 +-
 source/api_cc/tests/test_deeppolar.cc          | 2 +-
 source/api_cc/tests/test_deeppot_a.cc          | 8 ++++----
 source/api_cc/tests/test_deeppot_model_devi.cc | 4 ++--
 source/api_cc/tests/test_deeppot_r.cc          | 8 ++++----
 source/api_cc/tests/test_dipolecharge.cc       | 6 +++---
 source/api_cc/tests/test_ewald.cc              | 4 ++--
 source/api_cc/tests/test_utils.h               | 4 ++--
 8 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/source/api_cc/tests/test_deepdipole.cc b/source/api_cc/tests/test_deepdipole.cc
index bfdcc1de17..fbc7388f7a 100644
--- a/source/api_cc/tests/test_deepdipole.cc
+++ b/source/api_cc/tests/test_deepdipole.cc
@@ -88,7 +88,7 @@ TEST_F(TestInferDeepDipole, cpu_lmp_nlist)
   std::vector<int> ilist(nloc), numneigh(nloc);
   std::vector<int*> firstneigh(nloc);
   std::vector<std::vector<int > > nlist_data;
-  InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
   _build_nlist(nlist_data, coord_cpy, atype_cpy, mapping,
 	       coord, atype, box, rc);
   int nall = coord_cpy.size() / 3;
diff --git a/source/api_cc/tests/test_deeppolar.cc b/source/api_cc/tests/test_deeppolar.cc
index 19e9b8af7c..a76e84179a 100644
--- a/source/api_cc/tests/test_deeppolar.cc
+++ b/source/api_cc/tests/test_deeppolar.cc
@@ -88,7 +88,7 @@ TEST_F(TestInferDeepPolar, cpu_lmp_nlist)
   std::vector<int> ilist(nloc), numneigh(nloc);
   std::vector<int*> firstneigh(nloc);
   std::vector<std::vector<int > > nlist_data;
-  InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
   _build_nlist(nlist_data, coord_cpy, atype_cpy, mapping,
 	       coord, atype, box, rc);
   int nall = coord_cpy.size() / 3;
diff --git a/source/api_cc/tests/test_deeppot_a.cc b/source/api_cc/tests/test_deeppot_a.cc
index 77b343f91e..61336c4e33 100644
--- a/source/api_cc/tests/test_deeppot_a.cc
+++ b/source/api_cc/tests/test_deeppot_a.cc
@@ -183,7 +183,7 @@ TEST_F(TestInferDeepPotA, cpu_lmp_nlist)
   int nall = coord_cpy.size() / 3;
   std::vector<int> ilist(nloc), numneigh(nloc);
   std::vector<int*> firstneigh(nloc);
-  InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
   convert_nlist(inlist, nlist_data);  
   
   double ener;
@@ -234,7 +234,7 @@ TEST_F(TestInferDeepPotA, cpu_lmp_nlist_atomic)
   int nall = coord_cpy.size() / 3;
   std::vector<int> ilist(nloc), numneigh(nloc);
   std::vector<int*> firstneigh(nloc);
-  InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
   convert_nlist(inlist, nlist_data);  
   
   double ener;
@@ -307,7 +307,7 @@ TEST_F(TestInferDeepPotA, cpu_lmp_nlist_2rc)
   int nall = coord_cpy.size() / 3;
   std::vector<int> ilist(nloc), numneigh(nloc);
   std::vector<int*> firstneigh(nloc);
-  InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
   convert_nlist(inlist, nlist_data);  
   
   double ener;
@@ -373,7 +373,7 @@ TEST_F(TestInferDeepPotA, cpu_lmp_nlist_type_sel)
   int nall = coord_cpy.size() / 3;
   std::vector<int> ilist(nloc), numneigh(nloc);
   std::vector<int*> firstneigh(nloc);
-  InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
   convert_nlist(inlist, nlist_data);  
 
   // dp compute
diff --git a/source/api_cc/tests/test_deeppot_model_devi.cc b/source/api_cc/tests/test_deeppot_model_devi.cc
index f70b66a9eb..678b30a2ea 100644
--- a/source/api_cc/tests/test_deeppot_model_devi.cc
+++ b/source/api_cc/tests/test_deeppot_model_devi.cc
@@ -93,7 +93,7 @@ TEST_F(TestInferDeepPotModeDevi, cpu_lmp_list)
   int nall = coord_cpy.size() / 3;
   std::vector<int> ilist(nloc), numneigh(nloc);
   std::vector<int*> firstneigh(nloc);
-  InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
   convert_nlist(inlist, nlist_data);  
 
   int nmodel = 2;
@@ -170,7 +170,7 @@ TEST_F(TestInferDeepPotModeDevi, cpu_lmp_list_atomic)
   int nall = coord_cpy.size() / 3;
   std::vector<int> ilist(nloc), numneigh(nloc);
   std::vector<int*> firstneigh(nloc);
-  InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
   convert_nlist(inlist, nlist_data);  
 
   int nmodel = 2;
diff --git a/source/api_cc/tests/test_deeppot_r.cc b/source/api_cc/tests/test_deeppot_r.cc
index e9b1c45c85..2d6af9f6ae 100644
--- a/source/api_cc/tests/test_deeppot_r.cc
+++ b/source/api_cc/tests/test_deeppot_r.cc
@@ -183,7 +183,7 @@ TEST_F(TestInferDeepPotR, cpu_lmp_nlist)
   int nall = coord_cpy.size() / 3;
   std::vector<int> ilist(nloc), numneigh(nloc);
   std::vector<int*> firstneigh(nloc);
-  InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
   convert_nlist(inlist, nlist_data);  
   
   double ener;
@@ -234,7 +234,7 @@ TEST_F(TestInferDeepPotR, cpu_lmp_nlist_atomic)
   int nall = coord_cpy.size() / 3;
   std::vector<int> ilist(nloc), numneigh(nloc);
   std::vector<int*> firstneigh(nloc);
-  InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
   convert_nlist(inlist, nlist_data);  
   
   double ener;
@@ -307,7 +307,7 @@ TEST_F(TestInferDeepPotR, cpu_lmp_nlist_2rc)
   int nall = coord_cpy.size() / 3;
   std::vector<int> ilist(nloc), numneigh(nloc);
   std::vector<int*> firstneigh(nloc);
-  InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
   convert_nlist(inlist, nlist_data);  
   
   double ener;
@@ -373,7 +373,7 @@ TEST_F(TestInferDeepPotR, cpu_lmp_nlist_type_sel)
   int nall = coord_cpy.size() / 3;
   std::vector<int> ilist(nloc), numneigh(nloc);
   std::vector<int*> firstneigh(nloc);
-  InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
   convert_nlist(inlist, nlist_data);  
 
   // dp compute
diff --git a/source/api_cc/tests/test_dipolecharge.cc b/source/api_cc/tests/test_dipolecharge.cc
index 9ce16b9442..b13c3c7d7a 100644
--- a/source/api_cc/tests/test_dipolecharge.cc
+++ b/source/api_cc/tests/test_dipolecharge.cc
@@ -116,7 +116,7 @@ TEST_F(TestDipoleCharge, cpu_lmp_nlist)
   int nghost = nall - nloc;
   std::vector<int> ilist(nloc), numneigh(nloc);
   std::vector<int*> firstneigh(nloc);
-  InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
   convert_nlist(inlist, nlist_data);  
 
   // evaluate dipole
@@ -174,9 +174,9 @@ TEST_F(TestDipoleCharge, cpu_lmp_nlist)
   // compute the recp part of the ele interaction
   double eener;
   std::vector<double> eforce, evirial;
-  Region<double> region;
+  deepmd::Region<double> region;
   init_region_cpu(region, &box[0]);
-  EwaldParameters<double> eparam;
+  deepmd::EwaldParameters<double> eparam;
   eparam.beta = 0.2;
   eparam.spacing = 4;
   ewald_recp(eener, eforce, evirial, coord, charge, region, eparam);
diff --git a/source/api_cc/tests/test_ewald.cc b/source/api_cc/tests/test_ewald.cc
index 73dcbe686e..e27b1087e3 100644
--- a/source/api_cc/tests/test_ewald.cc
+++ b/source/api_cc/tests/test_ewald.cc
@@ -35,7 +35,7 @@ TEST_F(TestInferEwald, cpu_numfv)
   class MyModel : public EnergyModelTest<double>
   {
     const std::vector<double > & charge;
-    EwaldParameters<double> eparam;    
+    deepmd::EwaldParameters<double> eparam;    
 public:
     MyModel(
 	const std::vector<double> & charge_
@@ -48,7 +48,7 @@ TEST_F(TestInferEwald, cpu_numfv)
 	std::vector<double> &	virial,
 	const std::vector<double> & coord,
 	const std::vector<double> & box) {
-      Region<double> region;
+      deepmd::Region<double> region;
       init_region_cpu(region, &box[0]);
       ewald_recp(ener, force, virial, coord, charge, region, eparam);
     }
diff --git a/source/api_cc/tests/test_utils.h b/source/api_cc/tests/test_utils.h
index ed4b40b5ff..e6a496e374 100644
--- a/source/api_cc/tests/test_utils.h
+++ b/source/api_cc/tests/test_utils.h
@@ -90,13 +90,13 @@ class EnergyModelTest
     VALUETYPE ener;
     std::vector<VALUETYPE> force, virial;
     compute(ener, force, virial, coord, box);
-    Region<VALUETYPE> region;
+    deepmd::Region<VALUETYPE> region;
     init_region_cpu(region, &box[0]);
     for(int ii = 0; ii < 9; ++ii){
       std::vector<VALUETYPE> box0(box), box1(box);
       box0[ii] += hh;
       box1[ii] -= hh;
-      Region<VALUETYPE> region0, region1;
+      deepmd::Region<VALUETYPE> region0, region1;
       init_region_cpu(region0, &box0[0]);
       init_region_cpu(region1, &box1[0]);
       std::vector<VALUETYPE> coord0(coord), coord1(coord);

From 122875771b81ea0663ac538507c27791f88b8efa Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Wed, 17 Mar 2021 09:52:38 +0800
Subject: [PATCH 3/7] fix bugs in multi_device ops

---
 source/op/gelu_multi_device.cc         |  6 +++---
 source/op/prod_env_mat_multi_device.cc | 10 +++++-----
 source/op/prod_force_multi_device.cc   |  8 ++++----
 source/op/prod_virial_multi_device.cc  | 10 +++++-----
 source/op/tabulate_multi_device.cc     |  8 ++++----
 5 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/source/op/gelu_multi_device.cc b/source/op/gelu_multi_device.cc
index 5923ee5675..ece0d07ab3 100644
--- a/source/op/gelu_multi_device.cc
+++ b/source/op/gelu_multi_device.cc
@@ -45,7 +45,7 @@ class GeluOp : public OpKernel {
 
     if (device == "GPU") {
       #if GOOGLE_CUDA
-      gelu_gpu_cuda(
+      deepmd::gelu_gpu_cuda(
           out, 
           x, size);
       #endif // GOOGLE_CUDA
@@ -88,7 +88,7 @@ class GeluGradOp : public OpKernel {
 
     if (device == "GPU") {
       #if GOOGLE_CUDA
-      gelu_grad_gpu_cuda(
+      deepmd::gelu_grad_gpu_cuda(
           out, 
           x, dy, size);
       #endif // GOOGLE_CUDA
@@ -129,7 +129,7 @@ class GeluGradGradOp : public OpKernel {
 
     if (device == "GPU") {
       #if GOOGLE_CUDA
-      gelu_grad_grad_gpu_cuda(
+      deepmd::gelu_grad_grad_gpu_cuda(
           out, 
           x, dy, dy_2, size);
       #endif // GOOGLE_CUDA
diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc
index c0423db358..10b3f837b0 100644
--- a/source/op/prod_env_mat_multi_device.cc
+++ b/source/op/prod_env_mat_multi_device.cc
@@ -273,12 +273,12 @@ class ProdEnvMatAOp : public OpKernel {
       // update nbor list
       deepmd::InputNlist inlist;
       inlist.inum = nloc;
-      env_mat_nbor_update(
+      deepmd::env_mat_nbor_update(
           inlist, gpu_inlist, max_nbor_size, nbor_list_dev,
           mesh_tensor.flat<int>().data(), static_cast<int>(mesh_tensor.NumElements()));
       OP_REQUIRES (context, (max_numneigh(inlist) <= GPU_MAX_NBOR_SIZE), errors::InvalidArgument ("Assert failed, max neighbor size of atom(lammps) " + std::to_string(max_numneigh(inlist)) + " is larger than " + std::to_string(GPU_MAX_NBOR_SIZE) + ", which currently is not supported by deepmd-kit."));
       // launch the gpu(nv) compute function
-      prod_env_mat_a_gpu_cuda(
+      deepmd::prod_env_mat_a_gpu_cuda(
           em, em_deriv, rij, nlist, 
           coord, type, gpu_inlist, array_int, array_longlong, max_nbor_size, avg, std, nloc, nall, rcut_r, rcut_r_smth, sec_a);
       #endif //GOOGLE_CUDA
@@ -300,7 +300,7 @@ class ProdEnvMatAOp : public OpKernel {
 	  frame_nall, mem_cpy, mem_nnei, max_nbor_size,
 	  box, mesh_tensor.flat<int>().data(), nloc, nei_mode, rcut_r, max_cpy_trial, max_nnei_trial);
       // launch the cpu compute function
-      prod_env_mat_a_cpu(
+      deepmd::prod_env_mat_a_cpu(
 	  em, em_deriv, rij, nlist, 
 	  coord, type, inlist, max_nbor_size, avg, std, nloc, frame_nall, rcut_r, rcut_r_smth, sec_a);
       // do nlist mapping if coords were copied
@@ -483,12 +483,12 @@ class ProdEnvMatROp : public OpKernel {
       // update nbor list
       deepmd::InputNlist inlist;
       inlist.inum = nloc;
-      env_mat_nbor_update(
+      deepmd::env_mat_nbor_update(
           inlist, gpu_inlist, max_nbor_size, nbor_list_dev,
           mesh_tensor.flat<int>().data(), static_cast<int>(mesh_tensor.NumElements()));
       OP_REQUIRES (context, (max_numneigh(inlist) <= GPU_MAX_NBOR_SIZE), errors::InvalidArgument ("Assert failed, max neighbor size of atom(lammps) " + std::to_string(max_numneigh(inlist)) + " is larger than " + std::to_string(GPU_MAX_NBOR_SIZE) + ", which currently is not supported by deepmd-kit."));
       // launch the gpu(nv) compute function
-      prod_env_mat_r_gpu_cuda(
+      deepmd::prod_env_mat_r_gpu_cuda(
           em, em_deriv, rij, nlist, 
           coord, type, gpu_inlist, array_int, array_longlong, max_nbor_size, avg, std, nloc, nall, rcut, rcut_smth, sec);
       #endif //GOOGLE_CUDA
diff --git a/source/op/prod_force_multi_device.cc b/source/op/prod_force_multi_device.cc
index 209a12639f..29af42379a 100644
--- a/source/op/prod_force_multi_device.cc
+++ b/source/op/prod_force_multi_device.cc
@@ -78,13 +78,13 @@ class ProdForceSeAOp : public OpKernel {
 
     if (device == "GPU") {
       #if GOOGLE_CUDA
-      prod_force_a_gpu_cuda(    
+      deepmd::prod_force_a_gpu_cuda(    
           force, 
           net_deriv, in_deriv, nlist, nloc, nall, nnei);
       #endif // GOOGLE_CUDA
     }
     else if (device == "CPU") {
-      prod_force_a_cpu(    
+      deepmd::prod_force_a_cpu(    
           force, 
           net_deriv, in_deriv, nlist, nloc, nall, nnei);
     }
@@ -151,13 +151,13 @@ class ProdForceSeROp : public OpKernel {
 
     if (device == "GPU") {
       #if GOOGLE_CUDA
-      prod_force_r_gpu_cuda(    
+      deepmd::prod_force_r_gpu_cuda(    
           force, 
           net_deriv, in_deriv, nlist, nloc, nall, nnei);
       #endif // GOOGLE_CUDA
     }
     else if (device == "CPU") {
-      prod_force_r_cpu(    
+      deepmd::prod_force_r_cpu(    
           force, 
           net_deriv, in_deriv, nlist, nloc, nall, nnei);
     }
diff --git a/source/op/prod_virial_multi_device.cc b/source/op/prod_virial_multi_device.cc
index 9aab65443c..9e16437b44 100644
--- a/source/op/prod_virial_multi_device.cc
+++ b/source/op/prod_virial_multi_device.cc
@@ -86,13 +86,13 @@ class ProdVirialSeAOp : public OpKernel {
     
     if (device == "GPU") {
       #if GOOGLE_CUDA
-      prod_virial_a_gpu_cuda(    
+      deepmd::prod_virial_a_gpu_cuda(    
           virial, atom_virial,
           net_deriv, in_deriv, rij, nlist, nloc, nall, nnei);
       #endif // GOOGLE_CUDA
     }
     else if (device == "CPU") {
-      prod_virial_a_cpu(    
+      deepmd::prod_virial_a_cpu(    
           virial, atom_virial,
           net_deriv, in_deriv, rij, nlist, nloc, nall, nnei);
     }
@@ -164,13 +164,13 @@ class ProdVirialSeROp : public OpKernel {
     
     if (device == "GPU") {
       #if GOOGLE_CUDA
-      prod_virial_r_gpu_cuda(    
+      deepmd::prod_virial_r_gpu_cuda(    
           virial, atom_virial,
           net_deriv, in_deriv, rij, nlist, nloc, nall, nnei);
       #endif // GOOGLE_CUDA
     }
     else if (device == "CPU") {
-      prod_virial_r_cpu(    
+      deepmd::prod_virial_r_cpu(    
           virial, atom_virial,
           net_deriv, in_deriv, rij, nlist, nloc, nall, nnei);
     }
@@ -200,4 +200,4 @@ REGISTER_KERNEL_BUILDER(
     ProdVirialSeROp<GPUDevice, T>);
 REGISTER_GPU(float);
 REGISTER_GPU(double);
-#endif  // GOOGLE_CUDA
\ No newline at end of file
+#endif  // GOOGLE_CUDA
diff --git a/source/op/tabulate_multi_device.cc b/source/op/tabulate_multi_device.cc
index b4e891518d..8d8f9e82d6 100644
--- a/source/op/tabulate_multi_device.cc
+++ b/source/op/tabulate_multi_device.cc
@@ -63,13 +63,13 @@ class TabulateFusionOp : public OpKernel {
 
     if (device == "GPU") {
       #if GOOGLE_CUDA
-      tabulate_fusion_gpu_cuda(    
+      deepmd::tabulate_fusion_gpu_cuda(    
           descriptor,
           table, table_info, em_x, em, nloc, nnei, last_layer_size);
       #endif // GOOGLE_CUDA
     }
     else if (device == "CPU") {
-      tabulate_fusion_cpu(    
+      deepmd::tabulate_fusion_cpu(    
           descriptor,
           table, table_info, em_x, em, nloc, nnei, last_layer_size);
     }
@@ -125,13 +125,13 @@ class TabulateFusionGradOp : public OpKernel {
 
     if (device == "GPU") {
       #if GOOGLE_CUDA
-      tabulate_fusion_grad_gpu_cuda(    
+      deepmd::tabulate_fusion_grad_gpu_cuda(    
           dy_dem_x, dy_dem,
           table, table_info, em_x, em, dy, nloc, nnei, last_layer_size);
       #endif // GOOGLE_CUDA
     }
     else if (device == "CPU") {
-      tabulate_fusion_grad_cpu(    
+      deepmd::tabulate_fusion_grad_cpu(    
           dy_dem_x, dy_dem,
           table, table_info, em_x, em, dy, nloc, nnei, last_layer_size);
     }

From 9ec178631ea4475f053da657c6fa7247a2c9c491 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Wed, 17 Mar 2021 10:19:00 +0800
Subject: [PATCH 4/7] namespace for tabulate and utilities

---
 source/api_cc/src/DeepPot.cc               |  4 +-
 source/lib/include/ComputeDescriptor.h     | 92 +++++++++++-----------
 source/lib/include/SimulationRegion_Impl.h | 12 +--
 source/lib/include/tabulate.h              |  5 ++
 source/lib/include/utilities.h             |  3 +
 source/lib/src/env_mat.cc                  |  8 +-
 source/lib/src/ewald.cc                    |  2 +-
 source/lib/src/fmt_nlist.cc                |  4 +-
 source/lib/src/neighbor_list.cc            |  8 +-
 source/lib/src/tabulate.cc                 | 12 +--
 source/lib/src/utilities.cc                |  4 +-
 source/lib/tests/test_tabulate.cc          | 10 +--
 source/op/descrpt.cc                       | 10 +--
 source/op/prod_env_mat_multi_device.cc     |  8 +-
 14 files changed, 95 insertions(+), 87 deletions(-)

diff --git a/source/api_cc/src/DeepPot.cc b/source/api_cc/src/DeepPot.cc
index 9a3e8d7a10..b9412a505b 100644
--- a/source/api_cc/src/DeepPot.cc
+++ b/source/api_cc/src/DeepPot.cc
@@ -908,7 +908,7 @@ compute_std_f (std::vector<VALUETYPE> &		std,
       vdiff[0] = tmp_f[0] - tmp_avg[0];
       vdiff[1] = tmp_f[1] - tmp_avg[1];
       vdiff[2] = tmp_f[2] - tmp_avg[2];
-      std[jj] += dot3(vdiff, vdiff);
+      std[jj] += deepmd::dot3(vdiff, vdiff);
     }
   }
 
@@ -931,7 +931,7 @@ compute_relative_std_f (std::vector<VALUETYPE> &std,
       vdiff[0] = tmp_avg[0];
       vdiff[1] = tmp_avg[1];
       vdiff[2] = tmp_avg[2];
-      VALUETYPE f_norm = sqrt(dot3(vdiff, vdiff));
+      VALUETYPE f_norm = sqrt(deepmd::dot3(vdiff, vdiff));
       // relative std = std/(abs(f)+eps)
       std[ii] /= f_norm + eps;
   }
diff --git a/source/lib/include/ComputeDescriptor.h b/source/lib/include/ComputeDescriptor.h
index 7d4d6e0c3a..8bc246881a 100644
--- a/source/lib/include/ComputeDescriptor.h
+++ b/source/lib/include/ComputeDescriptor.h
@@ -110,11 +110,11 @@ compute_dRdT (double (* dRdT)[9],
   const double *xx = rot;
   const double *yy = rot+3;
 
-  double nr1 = sqrt(dot3(r1, r1));
+  double nr1 = sqrt(deepmd::dot3(r1, r1));
   double nr12 = nr1  * nr1;
   double nr13 = nr1  * nr12;
   double nr14 = nr12 * nr12;
-  double r1dr2 = dot3(r1, r2);
+  double r1dr2 = deepmd::dot3(r1, r2);
 
   // dRdT0
   for (int ii = 0; ii < 3; ++ii){
@@ -137,7 +137,7 @@ compute_dRdT (double (* dRdT)[9],
   }
   double tmpy[3];
   for (int dd = 0; dd < 3; ++dd) tmpy[dd] = r2[dd] - r1dr2 / nr12 * r1[dd];
-  double ntmpy = sqrt(dot3(tmpy, tmpy));
+  double ntmpy = sqrt(deepmd::dot3(tmpy, tmpy));
   double ydRdy [3] = {0};
   for (int ii = 0; ii < 3; ++ii){
     for (int jj = 0; jj < 3; ++jj){
@@ -153,8 +153,8 @@ compute_dRdT (double (* dRdT)[9],
   // dRdT2
   for (int ii = 0; ii < 3; ++ii){
     double res[3];
-    cprod(dRdT0 + ii*3, yy, dRdT2 + ii*3);
-    cprod(xx, dRdT1 + ii*3, res);
+    deepmd::cprod(dRdT0 + ii*3, yy, dRdT2 + ii*3);
+    deepmd::cprod(xx, dRdT1 + ii*3, res);
     for (int dd = 0; dd < 3; ++dd) dRdT2[ii*3+dd] += res[dd];
   }
 }
@@ -171,11 +171,11 @@ compute_dRdT_1 (double (* dRdT)[9],
   const double *xx = rot;
   const double *yy = rot+3;
 
-  double nr1 = sqrt(dot3(r1, r1));
+  double nr1 = sqrt(deepmd::dot3(r1, r1));
   double nr12 = nr1  * nr1;
   double nr13 = nr1  * nr12;
   double nr14 = nr12 * nr12;
-  double r1dr2 = dot3(r1, r2);
+  double r1dr2 = deepmd::dot3(r1, r2);
 
   // dRdT0
   for (int ii = 0; ii < 3; ++ii){
@@ -198,7 +198,7 @@ compute_dRdT_1 (double (* dRdT)[9],
   }
   double tmpy[3];
   for (int dd = 0; dd < 3; ++dd) tmpy[dd] = r2[dd] - r1dr2 / nr12 * r1[dd];
-  double ntmpy = sqrt(dot3(tmpy, tmpy));
+  double ntmpy = sqrt(deepmd::dot3(tmpy, tmpy));
   double ydRdy [3] = {0};
   for (int ii = 0; ii < 3; ++ii){
     for (int jj = 0; jj < 3; ++jj){
@@ -214,8 +214,8 @@ compute_dRdT_1 (double (* dRdT)[9],
   // dRdT2
   for (int ii = 0; ii < 3; ++ii){
     double res[3];
-    cprod(dRdT0 + ii*3, yy, dRdT2 + ii*3);
-    cprod(xx, dRdT1 + ii*3, res);
+    deepmd::cprod(dRdT0 + ii*3, yy, dRdT2 + ii*3);
+    deepmd::cprod(xx, dRdT1 + ii*3, res);
     for (int dd = 0; dd < 3; ++dd) dRdT2[ii*3+dd] += res[dd];
   }
 }
@@ -233,9 +233,9 @@ compute_dRdT_2 (double (* dRdT)[9],
   const double *xx = rot;
   const double *yy = rot+3;
 
-  double nr1 = sqrt(dot3(r1, r1));
+  double nr1 = sqrt(deepmd::dot3(r1, r1));
   double nr12 = nr1  * nr1;
-  double r1dr2 = dot3(r1, r2);
+  double r1dr2 = deepmd::dot3(r1, r2);
 
   // dRdT0
   for (int ii = 0; ii < 3; ++ii){
@@ -256,7 +256,7 @@ compute_dRdT_2 (double (* dRdT)[9],
   }
   double tmpy[3];
   for (int dd = 0; dd < 3; ++dd) tmpy[dd] = r2[dd] - r1dr2 / nr12 * r1[dd];
-  double ntmpy = sqrt(dot3(tmpy, tmpy));
+  double ntmpy = sqrt(deepmd::dot3(tmpy, tmpy));
   double ydRdy [3] = {0};
   for (int ii = 0; ii < 3; ++ii){
     for (int jj = 0; jj < 3; ++jj){
@@ -272,8 +272,8 @@ compute_dRdT_2 (double (* dRdT)[9],
   // dRdT2
   for (int ii = 0; ii < 3; ++ii){
     double res[3];
-    cprod(dRdT0 + ii*3, yy, dRdT2 + ii*3);
-    cprod(xx, dRdT1 + ii*3, res);
+    deepmd::cprod(dRdT0 + ii*3, yy, dRdT2 + ii*3);
+    deepmd::cprod(xx, dRdT1 + ii*3, res);
     for (int dd = 0; dd < 3; ++dd) dRdT2[ii*3+dd] += res[dd];
   }
 }
@@ -353,7 +353,7 @@ void compute_descriptor (std::vector<double > &			descrpt_a,
   // 	cout << jj << "\t  jidx " << j_idx;
   // 	if (j_idx >= 0){
   // 	  cout << "\t type " << type[j_idx];
-  // 	  cout << "\t " << sqrt(dot3(&sel_a_diff[jj][0], &sel_a_diff[jj][0]));
+  // 	  cout << "\t " << sqrt(deepmd::dot3(&sel_a_diff[jj][0], &sel_a_diff[jj][0]));
   // 	}
   // 	cout << endl;
   //     }
@@ -365,7 +365,7 @@ void compute_descriptor (std::vector<double > &			descrpt_a,
   // 	cout << jj << "\t  jidx " << j_idx;
   // 	if (j_idx >= 0){
   // 	  cout << "\t type " << type[j_idx];
-  // 	  cout << "\t " << sqrt(dot3(&sel_r_diff[jj][0], &sel_r_diff[jj][0]));
+  // 	  cout << "\t " << sqrt(deepmd::dot3(&sel_r_diff[jj][0], &sel_r_diff[jj][0]));
   // 	}
   // 	cout << endl;
   //     }
@@ -402,13 +402,13 @@ void compute_descriptor (std::vector<double > &			descrpt_a,
     xx[dd] = r1[dd];
     yy[dd] = r2[dd];
   }
-  double norm_xx = sqrt(dot3(xx, xx));
+  double norm_xx = sqrt(deepmd::dot3(xx, xx));
   for (unsigned dd = 0; dd < 3; ++dd) xx[dd] /= norm_xx;
-  double dxy = dot3(xx, yy);
+  double dxy = deepmd::dot3(xx, yy);
   for (unsigned dd = 0; dd < 3; ++dd) yy[dd] -= dxy * xx[dd];
-  double norm_yy = sqrt(dot3(yy, yy));
+  double norm_yy = sqrt(deepmd::dot3(yy, yy));
   for (unsigned dd = 0; dd < 3; ++dd) yy[dd] /= norm_yy;
-  cprod(xx, yy, zz);  
+  deepmd::cprod(xx, yy, zz);  
   rot_mat.resize (9);
   for (int dd = 0; dd < 9; ++dd) rot_mat[dd] = rot[dd];
 
@@ -419,8 +419,8 @@ void compute_descriptor (std::vector<double > &			descrpt_a,
     for (int jj = sec_a[ii]; jj < sec_a[ii+1]; ++jj){
       if (fmt_nlist_a[jj] < 0) break;
       double rdiff[3] ;
-      dotmv3(rdiff, rot, &sel_a_diff[jj][0]);
-      double rr2 = dot3(rdiff, rdiff);
+      deepmd::dotmv3(rdiff, rot, &sel_a_diff[jj][0]);
+      double rr2 = deepmd::dot3(rdiff, rdiff);
       double rr = sqrt(rr2);
 #ifdef DESCRPT_THETAPHI
       double cos_theta = rdiff[2] / rr;
@@ -445,7 +445,7 @@ void compute_descriptor (std::vector<double > &			descrpt_a,
     for (int jj = sec_r[ii]; jj < sec_r[ii+1]; ++jj){
       if (fmt_nlist_r[jj] < 0) break;
       const double *rdiff = &sel_r_diff[jj][0];
-      double rr = sqrt (dot3(rdiff, rdiff));
+      double rr = sqrt (deepmd::dot3(rdiff, rdiff));
       descrpt_r[jj] = 1./rr;      
     }
   }
@@ -474,8 +474,8 @@ void compute_descriptor (std::vector<double > &			descrpt_a,
       double dtrdST[4][3];
       double * rr = &sel_a_diff[nei_iter][0];
       double tr[3] ;
-      dotmv3(tr, rot, rr);
-      double nr2 = dot3(tr, tr);
+      deepmd::dotmv3(tr, rot, rr);
+      double nr2 = deepmd::dot3(tr, tr);
       double nr = sqrt(nr2);
       double nr3 = nr * nr2;
       for (int dd = 0; dd < 3; ++dd){
@@ -601,7 +601,7 @@ void compute_descriptor (std::vector<double > &			descrpt_a,
       if (fmt_nlist_r[nei_iter] < 0) break;      
 
       const double * rr = &sel_r_diff[nei_iter][0];
-      double nr = sqrt(dot3(rr, rr));
+      double nr = sqrt(deepmd::dot3(rr, rr));
       double nr3 = nr * nr * nr;
       int idx = nei_iter * 12;
 
@@ -699,13 +699,13 @@ void compute_descriptor (std::vector<double > &			descrpt_a,
     xx[dd] = r1[dd];
     yy[dd] = r2[dd];
   }
-  double norm_xx = sqrt(dot3(xx, xx));
+  double norm_xx = sqrt(deepmd::dot3(xx, xx));
   for (unsigned dd = 0; dd < 3; ++dd) xx[dd] /= norm_xx;
-  double dxy = dot3(xx, yy);
+  double dxy = deepmd::dot3(xx, yy);
   for (unsigned dd = 0; dd < 3; ++dd) yy[dd] -= dxy * xx[dd];
-  double norm_yy = sqrt(dot3(yy, yy));
+  double norm_yy = sqrt(deepmd::dot3(yy, yy));
   for (unsigned dd = 0; dd < 3; ++dd) yy[dd] /= norm_yy;
-  cprod(xx, yy, zz);  
+  deepmd::cprod(xx, yy, zz);  
   rot_mat.resize (9);
   for (int dd = 0; dd < 9; ++dd) rot_mat[dd] = rot[dd];
 
@@ -716,8 +716,8 @@ void compute_descriptor (std::vector<double > &			descrpt_a,
     for (int jj = sec_a[ii]; jj < sec_a[ii+1]; ++jj){
       if (fmt_nlist_a[jj] < 0) break;
       double rdiff[3] ;
-      dotmv3(rdiff, rot, &sel_a_diff[jj][0]);
-      double rr2 = dot3(rdiff, rdiff);
+      deepmd::dotmv3(rdiff, rot, &sel_a_diff[jj][0]);
+      double rr2 = deepmd::dot3(rdiff, rdiff);
       double rr = sqrt(rr2);
 #ifdef DESCRPT_THETAPHI
       double cos_theta = rdiff[2] / rr;
@@ -742,8 +742,8 @@ void compute_descriptor (std::vector<double > &			descrpt_a,
     for (int jj = sec_r[ii]; jj < sec_r[ii+1]; ++jj){
       if (fmt_nlist_r[jj] < 0) break;
       double rdiff[3] ;
-      dotmv3(rdiff, rot, &sel_r_diff[jj][0]);
-      double rr = sqrt (dot3(rdiff, rdiff));
+      deepmd::dotmv3(rdiff, rot, &sel_r_diff[jj][0]);
+      double rr = sqrt (deepmd::dot3(rdiff, rdiff));
       descrpt_r[jj] = 1./rr;
     }
   }  
@@ -784,7 +784,7 @@ void compute_descriptor_se_a_extf (std::vector<double > &		descrpt_a,
       ef[ii] = ef_[ii];
     }
   }
-  assert( fabs(dot3(ef, ef) - 1.0) < 1e-12 ), "ef should be a normalized std::vector";
+  assert( fabs(deepmd::dot3(ef, ef) - 1.0) < 1e-12 && "ef should be a normalized vector" );  // message is part of the asserted expression so it is shown on failure
 
   // compute the diff of the neighbors
   std::vector<std::vector<double > > sel_a_diff (sec_a.back());
@@ -819,7 +819,7 @@ void compute_descriptor_se_a_extf (std::vector<double > &		descrpt_a,
       if (fmt_nlist_a[nei_iter] < 0) break;
       const double * rr = &sel_a_diff[nei_iter][0];
       // check validity of ef
-      double nr2 = dot3(rr, rr);
+      double nr2 = deepmd::dot3(rr, rr);
       double inr = 1./sqrt(nr2);
       double nr = nr2 * inr;
       double inr2 = inr * inr;
@@ -830,7 +830,7 @@ void compute_descriptor_se_a_extf (std::vector<double > &		descrpt_a,
       int idx_deriv = nei_iter * 4 * 3;	// 4 components time 3 directions
       int idx_value = nei_iter * 4;	// 4 components
       // projections
-      double rp = dot3(rr, ef);
+      double rp = deepmd::dot3(rr, ef);
       double rv[3];
       rv[0] = rr[0] - rp * ef[0];
       rv[1] = rr[1] - rp * ef[1];
@@ -893,7 +893,7 @@ void compute_descriptor_se_a_ef_para (std::vector<double > &		descrpt_a,
       ef[ii] = ef_[ii];
     }
   }
-  assert( fabs(dot3(ef, ef) - 1.0) < 1e-12 ), "ef should be a normalized vector";
+  assert( fabs(deepmd::dot3(ef, ef) - 1.0) < 1e-12 && "ef should be a normalized vector" );  // message is part of the asserted expression so it is shown on failure
 
   // compute the diff of the neighbors
   std::vector<std::vector<double > > sel_a_diff (sec_a.back());
@@ -928,7 +928,7 @@ void compute_descriptor_se_a_ef_para (std::vector<double > &		descrpt_a,
       if (fmt_nlist_a[nei_iter] < 0) break;
       const double * rr = &sel_a_diff[nei_iter][0];
       // check validity of ef
-      double nr2 = dot3(rr, rr);
+      double nr2 = deepmd::dot3(rr, rr);
       double inr = 1./sqrt(nr2);
       double nr = nr2 * inr;
       double inr2 = inr * inr;
@@ -940,9 +940,9 @@ void compute_descriptor_se_a_ef_para (std::vector<double > &		descrpt_a,
       int idx_value = nei_iter * 4;	// 4 components
       // projections
       double rp[3];
-      rp[0] = dot3(rr, ef) * ef[0];
-      rp[1] = dot3(rr, ef) * ef[1];
-      rp[2] = dot3(rr, ef) * ef[2];
+      rp[0] = deepmd::dot3(rr, ef) * ef[0];
+      rp[1] = deepmd::dot3(rr, ef) * ef[1];
+      rp[2] = deepmd::dot3(rr, ef) * ef[2];
       // 4 value components
       descrpt_a[idx_value + 0] = 1 / nr;
       descrpt_a[idx_value + 1] = rp[0] / nr2;
@@ -1001,7 +1001,7 @@ void compute_descriptor_se_a_ef_vert (std::vector<double > &		descrpt_a,
       ef[ii] = ef_[ii];
     }
   }
-  assert( fabs(dot3(ef, ef) - 1.0) < 1e-12 ), "ef should be a normalized vector";
+  assert( fabs(deepmd::dot3(ef, ef) - 1.0) < 1e-12 && "ef should be a normalized vector" );  // message is part of the asserted expression so it is shown on failure
 
   // compute the diff of the neighbors
   std::vector<std::vector<double > > sel_a_diff (sec_a.back());
@@ -1036,7 +1036,7 @@ void compute_descriptor_se_a_ef_vert (std::vector<double > &		descrpt_a,
       if (fmt_nlist_a[nei_iter] < 0) break;
       const double * rr = &sel_a_diff[nei_iter][0];
       // check validity of ef
-      double nr2 = dot3(rr, rr);
+      double nr2 = deepmd::dot3(rr, rr);
       double inr = 1./sqrt(nr2);
       double nr = nr2 * inr;
       double inr2 = inr * inr;
@@ -1047,7 +1047,7 @@ void compute_descriptor_se_a_ef_vert (std::vector<double > &		descrpt_a,
       int idx_deriv = nei_iter * 4 * 3;	// 4 components time 3 directions
       int idx_value = nei_iter * 4;	// 4 components
       // projections
-      double rp = dot3(rr, ef);
+      double rp = deepmd::dot3(rr, ef);
       double rv[3];
       rv[0] = rr[0] - rp * ef[0];
       rv[1] = rr[1] - rp * ef[1];
diff --git a/source/lib/include/SimulationRegion_Impl.h b/source/lib/include/SimulationRegion_Impl.h
index d19f1a5650..5b7b8248fd 100644
--- a/source/lib/include/SimulationRegion_Impl.h
+++ b/source/lib/include/SimulationRegion_Impl.h
@@ -417,12 +417,12 @@ SimulationRegion<VALUETYPE>::
 toFaceDistance	(double * dd) const
 {
   double tmp[3];
-  cprod(boxt+3, boxt+6, tmp);
-  dd[0] = volume * invsqrt(dot3(tmp,tmp));
-  cprod(boxt+6, boxt+0, tmp);
-  dd[1] = volume * invsqrt(dot3(tmp,tmp));
-  cprod(boxt+0, boxt+3, tmp);
-  dd[2] = volume * invsqrt(dot3(tmp,tmp));
+  deepmd::cprod(boxt+3, boxt+6, tmp);
+  dd[0] = volume * deepmd::invsqrt(deepmd::dot3(tmp,tmp));
+  deepmd::cprod(boxt+6, boxt+0, tmp);
+  dd[1] = volume * deepmd::invsqrt(deepmd::dot3(tmp,tmp));
+  deepmd::cprod(boxt+0, boxt+3, tmp);
+  dd[2] = volume * deepmd::invsqrt(deepmd::dot3(tmp,tmp));
 }
 
 // static int tmp_count = 0;
diff --git a/source/lib/include/tabulate.h b/source/lib/include/tabulate.h
index 77507ec8eb..b684be3c4c 100644
--- a/source/lib/include/tabulate.h
+++ b/source/lib/include/tabulate.h
@@ -1,5 +1,7 @@
 #pragma once
 
+namespace deepmd{
+
 template<typename FPTYPE>
 void tabulate_fusion_cpu(
     FPTYPE * out,
@@ -49,3 +51,6 @@ void tabulate_fusion_grad_gpu_cuda(
     const int nnei, 
     const int last_layer_size);
 #endif // GOOGLE_CUDA
+
+}
+
diff --git a/source/lib/include/utilities.h b/source/lib/include/utilities.h
index 08a808ac11..e95ca3e684 100644
--- a/source/lib/include/utilities.h
+++ b/source/lib/include/utilities.h
@@ -5,6 +5,8 @@
 #include <iostream>
 #include <cmath>
 
+namespace deepmd{
+
 void cum_sum(
     std::vector<int> & sec, 
     const std::vector<int> & n_sel);
@@ -74,3 +76,4 @@ invsqrt<float> (const float x)
   return 1./sqrtf (x);
 }
 
+}
diff --git a/source/lib/src/env_mat.cc b/source/lib/src/env_mat.cc
index 52398a17a8..7b0d3e4140 100644
--- a/source/lib/src/env_mat.cc
+++ b/source/lib/src/env_mat.cc
@@ -50,7 +50,7 @@ void env_mat_a (
     for (int nei_iter = sec_a[sec_iter]; nei_iter < sec_a[sec_iter+1]; ++nei_iter) {      
       if (fmt_nlist_a[nei_iter] < 0) break;
       const double * rr = &sel_a_diff[nei_iter][0];
-      double nr2 = dot3(rr, rr);
+      double nr2 = deepmd::dot3(rr, rr);
       double inr = 1./sqrt(nr2);
       double nr = nr2 * inr;
       double inr2 = inr * inr;
@@ -129,7 +129,7 @@ env_mat_a_cpu (
         for (int nei_iter = sec_a[sec_iter]; nei_iter < sec_a[sec_iter+1]; ++nei_iter) {      
             if (fmt_nlist_a[nei_iter] < 0) break;
             const FPTYPE * rr = &rij_a[nei_iter * 3];
-            FPTYPE nr2 = dot3(rr, rr);
+            FPTYPE nr2 = deepmd::dot3(rr, rr);
             FPTYPE inr = 1./sqrt(nr2);
             FPTYPE nr = nr2 * inr;
             FPTYPE inr2 = inr * inr;
@@ -217,7 +217,7 @@ void env_mat_r (
     for (int nei_iter = sec[sec_iter]; nei_iter < sec[sec_iter+1]; ++nei_iter) {      
       if (fmt_nlist[nei_iter] < 0) break;
       const double * rr = &sel_diff[nei_iter][0];
-      double nr2 = dot3(rr, rr);
+      double nr2 = deepmd::dot3(rr, rr);
       double inr = 1./sqrt(nr2);
       double nr = nr2 * inr;
       double inr2 = inr * inr;
@@ -278,7 +278,7 @@ env_mat_r_cpu (
         for (int nei_iter = sec[sec_iter]; nei_iter < sec[sec_iter+1]; ++nei_iter) {      
             if (fmt_nlist[nei_iter] < 0) break;
             const FPTYPE * rr = &rij_a[nei_iter * 3];
-            FPTYPE nr2 = dot3(rr, rr);
+            FPTYPE nr2 = deepmd::dot3(rr, rr);
             FPTYPE inr = 1./sqrt(nr2);
             FPTYPE nr = nr2 * inr;
             FPTYPE inr2 = inr * inr;
diff --git a/source/lib/src/ewald.cc b/source/lib/src/ewald.cc
index 5942f6fedc..486d2cbb73 100644
--- a/source/lib/src/ewald.cc
+++ b/source/lib/src/ewald.cc
@@ -72,7 +72,7 @@ cmpt_k(std::vector<int> & KK,
 {
   KK.resize(3);
   for (int dd = 0; dd < 3; ++dd){
-    VALUETYPE ll = sqrt(dot3(boxt+dd*3, boxt+dd*3));
+    VALUETYPE ll = sqrt(deepmd::dot3(boxt+dd*3, boxt+dd*3));
     KK[dd] = ll / param.spacing;
     // KK[dd] should be large enough 
     if (KK[dd] * param.spacing < ll) KK[dd] += 1;
diff --git a/source/lib/src/fmt_nlist.cc b/source/lib/src/fmt_nlist.cc
index 2c577c2f05..add83dadcf 100644
--- a/source/lib/src/fmt_nlist.cc
+++ b/source/lib/src/fmt_nlist.cc
@@ -72,7 +72,7 @@ int format_nlist_i_fill_a (
     else {
       for (int dd = 0; dd < 3; ++dd) diff[dd] = posi[j_idx*3+dd] - posi[i_idx*3+dd];
     }
-    double rr = sqrt(dot3(diff, diff));    
+    double rr = sqrt(deepmd::dot3(diff, diff));    
     if (rr <= rcut) {
       sel_nei.push_back(NeighborInfo (type[j_idx], rr, j_idx));
     }
@@ -125,7 +125,7 @@ int format_nlist_i_cpu (
         for (int dd = 0; dd < 3; ++dd) {
             diff[dd] = posi[j_idx * 3 + dd] - posi[i_idx * 3 + dd];
         }
-        FPTYPE rr = sqrt(dot3(diff, diff));    
+        FPTYPE rr = sqrt(deepmd::dot3(diff, diff));    
         if (rr <= rcut) {
             sel_nei.push_back(NeighborInfo(type[j_idx], rr, j_idx));
         }
diff --git a/source/lib/src/neighbor_list.cc b/source/lib/src/neighbor_list.cc
index e426d63906..e0ceae4a38 100644
--- a/source/lib/src/neighbor_list.cc
+++ b/source/lib/src/neighbor_list.cc
@@ -213,7 +213,7 @@ build_nlist_cell (std::vector<std::vector<int> > &	nlist0,
 	  diff[dd0] += shift[dd1] * boxt[3*dd1+dd0];
 	}
       }
-      double r2 = dot3(diff, diff);
+      double r2 = deepmd::dot3(diff, diff);
       if (r2 < rc02) {
 	if (i_idx < nloc) nlist0[i_idx].push_back (j_idx);
 	if (j_idx < nloc) nlist0[j_idx].push_back (i_idx);
@@ -254,7 +254,7 @@ build_nlist_cell (std::vector<std::vector<int> > &	nlist0,
 	  diff[dd0] += shift[dd1] * boxt[3*dd1+dd0];
 	}
       }
-      double r2 = dot3(diff, diff);
+      double r2 = deepmd::dot3(diff, diff);
       if (r2 < rc02) {
 	nlist0[i_idx].push_back (j_idx);
       }
@@ -612,7 +612,7 @@ build_nlist (std::vector<std::vector<int > > & nlist0,
 	diff[1] = posi3[jj*3+1] - posi3[ii*3+1];
 	diff[2] = posi3[jj*3+2] - posi3[ii*3+2];
       }
-      double r2 = dot3(diff, diff);
+      double r2 = deepmd::dot3(diff, diff);
       if (r2 < rc02) {
 	nlist0[ii].push_back (jj);
 	nlist0[jj].push_back (ii);
@@ -800,7 +800,7 @@ build_nlist_cpu(
       for(int dd = 0; dd < 3; ++dd){
 	diff[dd] = c_cpy[ii*3+dd] - c_cpy[jj*3+dd];
       }
-      FPTYPE diff2 = dot3(diff, diff);
+      FPTYPE diff2 = deepmd::dot3(diff, diff);
       if(diff2 < rcut2){
 	jlist.push_back(jj);
       }
diff --git a/source/lib/src/tabulate.cc b/source/lib/src/tabulate.cc
index 98b561a348..b1049226d3 100644
--- a/source/lib/src/tabulate.cc
+++ b/source/lib/src/tabulate.cc
@@ -51,7 +51,7 @@ inline FPTYPE dot(
 }
 
 template<typename FPTYPE>
-void tabulate_fusion_cpu(
+void deepmd::tabulate_fusion_cpu(
     FPTYPE * out,
     const FPTYPE * table, 
     const FPTYPE * table_info, 
@@ -112,7 +112,7 @@ void tabulate_fusion_cpu(
 }
 
 template<typename FPTYPE>
-void tabulate_fusion_grad_cpu(
+void deepmd::tabulate_fusion_grad_cpu(
     FPTYPE * dy_dem_x, 
     FPTYPE * dy_dem,
     const FPTYPE * table, 
@@ -186,7 +186,7 @@ void tabulate_fusion_grad_cpu(
   }
 }
 
-template void tabulate_fusion_cpu<float>(float * out, const float * table, const float * table_info, const float * em_x, const float * em, const int nloc, const int nnei, const int last_layer_size);
-template void tabulate_fusion_cpu<double>(double * out, const double * table, const double * table_info, const double * em_x, const double * em, const int nloc, const int nnei, const int last_layer_size);
-template void tabulate_fusion_grad_cpu<float> (float * dy_dem_x, float * dy_dem, const float * table, const float * table_info, const float * em_x, const float * em, const float * dy, const int nloc, const int nnei, const int last_layer_size); 
-template void tabulate_fusion_grad_cpu<double> (double * dy_dem_x, double * dy_dem, const double * table, const double * table_info, const double * em_x, const double * em, const double * dy, const int nloc, const int nnei, const int last_layer_size);
+template void deepmd::tabulate_fusion_cpu<float>(float * out, const float * table, const float * table_info, const float * em_x, const float * em, const int nloc, const int nnei, const int last_layer_size);
+template void deepmd::tabulate_fusion_cpu<double>(double * out, const double * table, const double * table_info, const double * em_x, const double * em, const int nloc, const int nnei, const int last_layer_size);
+template void deepmd::tabulate_fusion_grad_cpu<float> (float * dy_dem_x, float * dy_dem, const float * table, const float * table_info, const float * em_x, const float * em, const float * dy, const int nloc, const int nnei, const int last_layer_size); 
+template void deepmd::tabulate_fusion_grad_cpu<double> (double * dy_dem_x, double * dy_dem, const double * table, const double * table_info, const double * em_x, const double * em, const double * dy, const int nloc, const int nnei, const int last_layer_size);
diff --git a/source/lib/src/utilities.cc b/source/lib/src/utilities.cc
index 2176715938..df7bac98f5 100644
--- a/source/lib/src/utilities.cc
+++ b/source/lib/src/utilities.cc
@@ -1,7 +1,7 @@
 #include "utilities.h"
 
 // functions used in custom ops
-void cum_sum(
+void deepmd::cum_sum(
     std::vector<int> & sec, 
     const std::vector<int> & n_sel) 
 {
@@ -10,4 +10,4 @@ void cum_sum(
   for (int ii = 1; ii < sec.size(); ++ii) {
     sec[ii] = sec[ii-1] + n_sel[ii-1];
   }
-}
\ No newline at end of file
+}
diff --git a/source/lib/tests/test_tabulate.cc b/source/lib/tests/test_tabulate.cc
index a17f2474b7..f92629efb4 100644
--- a/source/lib/tests/test_tabulate.cc
+++ b/source/lib/tests/test_tabulate.cc
@@ -147,7 +147,7 @@ class TestTabulate : public ::testing::Test
 TEST_F(TestTabulate, tabulate_fusion_cpu)
 {
   std::vector<double> xyz_scatter(nloc * nnei * last_layer_size);
-  tabulate_fusion_cpu<double>(&xyz_scatter[0], &table[0], &info[0], &em_x[0], &em[0], nloc, nnei, last_layer_size);
+  deepmd::tabulate_fusion_cpu<double>(&xyz_scatter[0], &table[0], &info[0], &em_x[0], &em[0], nloc, nnei, last_layer_size);
   EXPECT_EQ(xyz_scatter.size(), nloc * nnei * last_layer_size);
   EXPECT_EQ(xyz_scatter.size(), expected_xyz_scatter.size());
   for (int jj = 0; jj < xyz_scatter.size(); ++jj){
@@ -160,7 +160,7 @@ TEST_F(TestTabulate, tabulate_fusion_grad_cpu)
   std::vector<double> dy_dem_x(em_x.size());
   std::vector<double> dy_dem(em.size());
   std::vector<double> dy(nloc * nnei * last_layer_size, 1.0);
-  tabulate_fusion_grad_cpu<double>(&dy_dem_x[0], &dy_dem[0], &table[0], &info[0], &em_x[0], &em[0], &dy[0], nloc, nnei, last_layer_size);
+  deepmd::tabulate_fusion_grad_cpu<double>(&dy_dem_x[0], &dy_dem[0], &table[0], &info[0], &em_x[0], &em[0], &dy[0], nloc, nnei, last_layer_size);
   EXPECT_EQ(dy_dem_x.size(), nloc * nnei);
   EXPECT_EQ(dy_dem.size(), nloc * nnei * 4);
   EXPECT_EQ(dy_dem_x.size(), expected_dy_dem_x.size());
@@ -183,7 +183,7 @@ TEST_F(TestTabulate, tabulate_fusion_gpu_cuda)
   malloc_device_memory_sync(table_dev, table);
   malloc_device_memory_sync(em_x_dev, em_x);
   malloc_device_memory_sync(em_dev, em);
-  tabulate_fusion_gpu_cuda<double>(xyz_scatter_dev, table_dev, &info[0], em_x_dev, em_dev, nloc, nnei, last_layer_size);
+  deepmd::tabulate_fusion_gpu_cuda<double>(xyz_scatter_dev, table_dev, &info[0], em_x_dev, em_dev, nloc, nnei, last_layer_size);
   memcpy_device_to_host(xyz_scatter_dev, xyz_scatter);
   delete_device_memory(xyz_scatter_dev);
   delete_device_memory(table_dev);
@@ -210,7 +210,7 @@ TEST_F(TestTabulate, tabulate_fusion_grad_gpu_cuda)
   malloc_device_memory_sync(em_x_dev, em_x);
   malloc_device_memory_sync(em_dev, em);
   malloc_device_memory_sync(dy_dev, dy);
-  tabulate_fusion_grad_gpu_cuda<double>(dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev, dy_dev, nloc, nnei, last_layer_size);
+  deepmd::tabulate_fusion_grad_gpu_cuda<double>(dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev, dy_dev, nloc, nnei, last_layer_size);
   memcpy_device_to_host(dy_dem_x_dev, dy_dem_x);
   memcpy_device_to_host(dy_dem_dev, dy_dem);
   delete_device_memory(dy_dem_x_dev);
@@ -231,4 +231,4 @@ TEST_F(TestTabulate, tabulate_fusion_grad_gpu_cuda)
     EXPECT_LT(fabs(dy_dem[jj] - expected_dy_dem[jj]) , 1e-5);
   }
 }
-#endif // GOOGLE_CUDA
\ No newline at end of file
+#endif // GOOGLE_CUDA
diff --git a/source/op/descrpt.cc b/source/op/descrpt.cc
index 1cbfb96574..48fc4f3943 100644
--- a/source/op/descrpt.cc
+++ b/source/op/descrpt.cc
@@ -495,7 +495,7 @@ class DescrptOp : public OpKernel {
 	    }
 	  }
 	  sort_info.push_back (std::pair<compute_t, int> 
-			       (dot3(diff, diff), list_idx) );
+			       (deepmd::dot3(diff, diff), list_idx) );
 	}
       }
       sort (sort_info.begin(), sort_info.end());
@@ -527,7 +527,7 @@ class DescrptOp : public OpKernel {
 	    }
 	  }
 	  sort_info.push_back (std::pair<compute_t, int> 
-			       (dot3(diff, diff), list_idx) );
+			       (deepmd::dot3(diff, diff), list_idx) );
 	}
       }
       sort (sort_info.begin(), sort_info.end());
@@ -580,9 +580,9 @@ class DescrptOp : public OpKernel {
 	}
       }
     }
-    compute_t rij = dot3(diff[0], diff[1]);
-    compute_t rii = dot3(diff[0], diff[0]);
-    compute_t rjj = dot3(diff[1], diff[1]);
+    compute_t rij = deepmd::dot3(diff[0], diff[1]);
+    compute_t rii = deepmd::dot3(diff[0], diff[0]);
+    compute_t rjj = deepmd::dot3(diff[1], diff[1]);
     if ( fabs (rij / sqrt(rii * rjj) + 1) < 1e-4  ) {
       return false;
     }
diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc
index 10b3f837b0..7a26ce2b41 100644
--- a/source/op/prod_env_mat_multi_device.cc
+++ b/source/op/prod_env_mat_multi_device.cc
@@ -120,8 +120,8 @@ class ProdEnvMatAOp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("sel_r", &sel_r));
     // OP_REQUIRES_OK(context, context->GetAttr("nloc", &nloc_f));
     // OP_REQUIRES_OK(context, context->GetAttr("nall", &nall_f));
-    cum_sum (sec_a, sel_a);
-    cum_sum (sec_r, sel_r);
+    deepmd::cum_sum (sec_a, sel_a);
+    deepmd::cum_sum (sec_r, sel_r);
     ndescrpt_a = sec_a.back() * 4;
     ndescrpt_r = sec_r.back() * 1;
     ndescrpt = ndescrpt_a + ndescrpt_r;
@@ -336,9 +336,9 @@ class ProdEnvMatROp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("rcut", &rcut));
     OP_REQUIRES_OK(context, context->GetAttr("rcut_smth", &rcut_smth));
     OP_REQUIRES_OK(context, context->GetAttr("sel", &sel));
-    cum_sum (sec, sel);
+    deepmd::cum_sum (sec, sel);
     sel_null.resize(3, 0);
-    cum_sum (sec_null, sel_null);
+    deepmd::cum_sum (sec_null, sel_null);
     ndescrpt = sec.back() * 1;
     nnei = sec.back();
     max_nbor_size = 1024;

From f38678f320a76e463893e898cb18c9fea6030b3f Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Wed, 17 Mar 2021 10:27:36 +0800
Subject: [PATCH 5/7] fix bug : namespace in cuda compiling

---
 source/lib/src/cuda/prod_env_mat.cu | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/source/lib/src/cuda/prod_env_mat.cu b/source/lib/src/cuda/prod_env_mat.cu
index d26e089efa..febdc093f4 100644
--- a/source/lib/src/cuda/prod_env_mat.cu
+++ b/source/lib/src/cuda/prod_env_mat.cu
@@ -4,6 +4,8 @@
 #include <cub/block/block_store.cuh>
 #include <cub/block/block_radix_sort.cuh>
 
+using namespace deepmd;
+
 // common part of prod_env_mat
 template <
     typename    Key,

From 53de560b81cd6ed81d2fdfc41f2ec4a799b94c87 Mon Sep 17 00:00:00 2001
From: denghuilu <denghuilu@pku.edu.cn>
Date: Fri, 19 Mar 2021 16:47:26 +0800
Subject: [PATCH 6/7] fix bug of gpu namespace

---
 source/lib/include/gpu_cuda.h          | 26 ++++++++++++++++++++++++++
 source/lib/src/cuda/gelu.cu            |  3 ++-
 source/lib/src/cuda/prod_env_mat.cu    | 12 ++++++------
 source/lib/src/cuda/prod_force.cu      | 19 ++-----------------
 source/lib/src/cuda/prod_virial.cu     | 19 ++-----------------
 source/lib/src/cuda/tabulate.cu        |  4 ++--
 source/lib/src/neighbor_list.cc        |  4 ++--
 source/lib/src/prod_env_mat.cc         |  2 +-
 source/lib/tests/test_env_mat_a.cc     |  8 ++++----
 source/lib/tests/test_env_mat_r.cc     |  8 ++++----
 source/lib/tests/test_gelu.cc          |  6 +++---
 source/lib/tests/test_prod_force_a.cc  |  2 +-
 source/lib/tests/test_prod_force_r.cc  |  2 +-
 source/lib/tests/test_prod_virial_a.cc |  2 +-
 source/lib/tests/test_prod_virial_r.cc |  2 +-
 15 files changed, 58 insertions(+), 61 deletions(-)

diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h
index 845e0b9d9f..6ccbbb9356 100644
--- a/source/lib/include/gpu_cuda.h
+++ b/source/lib/include/gpu_cuda.h
@@ -13,6 +13,23 @@ inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=
   }
 }
 
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600  // fallback compiled only when __CUDA_ARCH__ is defined and below 600
+static __inline__ __device__ double atomicAdd(  // adds val to *address through a compare-and-swap retry loop
+    double* address, 
+    double val) 
+{
+  unsigned long long int* address_as_ull = (unsigned long long int*)address;  // reinterpret the double's storage as a 64-bit integer for atomicCAS
+  unsigned long long int old = *address_as_ull, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull, assumed,
+          __double_as_longlong(val + __longlong_as_double(assumed)));
+  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+  } while (assumed != old);
+  return __longlong_as_double(old);  // bit pattern last returned by atomicCAS, converted back to double
+}
+#endif
+
 template <typename FPTYPE>
 void memcpy_host_to_device(
     FPTYPE * device, 
@@ -70,4 +87,13 @@ void delete_device_memory(
   if (device != NULL) {
     cudaErrcheck(cudaFree(device));
   }
+}
+
+template <typename FPTYPE>  // FPTYPE: element type of the buffer pointed to by `device`
+void memset_device_memory(  // calls cudaMemset over sizeof(FPTYPE) * size bytes starting at `device`
+    FPTYPE * device, 
+    const FPTYPE var,  // NOTE(review): cudaMemset takes an int and writes it byte-wise, so only var == 0 is certain to produce elements equal to var -- confirm callers
+    const int size) 
+{
+  cudaErrcheck(cudaMemset(device, var, sizeof(FPTYPE) * size));  
 }
\ No newline at end of file
diff --git a/source/lib/src/cuda/gelu.cu b/source/lib/src/cuda/gelu.cu
index cd4cfa0541..2b5b3074bb 100644
--- a/source/lib/src/cuda/gelu.cu
+++ b/source/lib/src/cuda/gelu.cu
@@ -1,6 +1,5 @@
 #include "gelu.h"
 #include "device.h"
-#include "gpu_cuda.h"
 
 template <typename FPTYPE>
 __global__ void gelu(
@@ -49,6 +48,7 @@ __global__ void gelu_grad_grad(
   out[idx] = dy[idx] * dy_2[idx] * (0.134145 * SQRT_2_PI * xx[idx] * xx[idx] * (1 - var1 * var1) - SQRT_2_PI * xx[idx] * var2 * (0.134145 * xx[idx] * xx[idx] + 1) * var1 + var2);
 }
 
+namespace deepmd {
 template<typename FPTYPE>
 void gelu_gpu_cuda(
     FPTYPE * out, 
@@ -94,3 +94,4 @@ template void gelu_grad_gpu_cuda<float>(float * out, const float * x, const floa
 template void gelu_grad_gpu_cuda<double>(double * out, const double * x, const double * dy, const int size);
 template void gelu_grad_grad_gpu_cuda<float>(float * out, const float * x, const float * dy, const float * dy_2, const int size);
 template void gelu_grad_grad_gpu_cuda<double>(double * out, const double * x, const double * dy, const double * dy_2, const int size);
+}
\ No newline at end of file
diff --git a/source/lib/src/cuda/prod_env_mat.cu b/source/lib/src/cuda/prod_env_mat.cu
index febdc093f4..00a7401f85 100644
--- a/source/lib/src/cuda/prod_env_mat.cu
+++ b/source/lib/src/cuda/prod_env_mat.cu
@@ -4,8 +4,6 @@
 #include <cub/block/block_store.cuh>
 #include <cub/block/block_radix_sort.cuh>
 
-using namespace deepmd;
-
 // common part of prod_env_mat
 template <
     typename    Key,
@@ -156,7 +154,7 @@ void format_nbor_list_1024 (
     int_64 * key,
     const FPTYPE* coord,
     const int* type,
-    const InputNlist & gpu_inlist,
+    const deepmd::InputNlist & gpu_inlist,
     const int& nloc,       
     const float& rcut, 
     int * i_idx) 
@@ -182,7 +180,7 @@ void format_nbor_list_2048 (
     int_64 * key,
     const FPTYPE* coord,
     const int* type,
-    const InputNlist & gpu_inlist,
+    const deepmd::InputNlist & gpu_inlist,
     const int& nloc,       
     const float& rcut, 
     int * i_idx) 
@@ -208,7 +206,7 @@ void format_nbor_list_4096 (
     int_64 * key,
     const FPTYPE* coord,
     const int* type,
-    const InputNlist & gpu_inlist,
+    const deepmd::InputNlist & gpu_inlist,
     const int& nloc,       
     const float& rcut, 
     int * i_idx)
@@ -234,7 +232,7 @@ void format_nbor_list(
     int * nlist, 
     const FPTYPE * coord, 
     const int * type, 
-    const InputNlist & gpu_inlist,
+    const deepmd::InputNlist & gpu_inlist,
     int * array_int,
     int_64 * array_longlong,
     const int max_nbor_size,
@@ -433,6 +431,7 @@ __global__ void compute_env_mat_r(
   }
 }
 
+namespace deepmd {
 template <typename FPTYPE>
 void prod_env_mat_a_gpu_cuda(    
     FPTYPE * em, 
@@ -505,3 +504,4 @@ template void prod_env_mat_a_gpu_cuda<float>(float * em, float * em_deriv, float
 template void prod_env_mat_a_gpu_cuda<double>(double * em, double * em_deriv, double * rij, int * nlist, const double * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const double * avg, const double * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector<int> sec);
 template void prod_env_mat_r_gpu_cuda<float>(float * em, float * em_deriv, float * rij, int * nlist, const float * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const float * avg, const float * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector<int> sec);
 template void prod_env_mat_r_gpu_cuda<double>(double * em, double * em_deriv, double * rij, int * nlist, const double * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const double * avg, const double * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector<int> sec);
+}
diff --git a/source/lib/src/cuda/prod_force.cu b/source/lib/src/cuda/prod_force.cu
index d440d3b153..97321e74e8 100644
--- a/source/lib/src/cuda/prod_force.cu
+++ b/source/lib/src/cuda/prod_force.cu
@@ -2,23 +2,6 @@
 #include "gpu_cuda.h"
 #include "prod_force.h"
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
-static __inline__ __device__ double atomicAdd(
-    double* address, 
-    double val) 
-{
-  unsigned long long int* address_as_ull = (unsigned long long int*)address;
-  unsigned long long int old = *address_as_ull, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull, assumed,
-          __double_as_longlong(val + __longlong_as_double(assumed)));
-  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old);
-  } while (assumed != old);
-  return __longlong_as_double(old);
-}
-#endif
-
 template <
     typename FPTYPE,
     int      THREADS_PER_BLOCK>
@@ -112,6 +95,7 @@ __global__ void force_deriv_wrt_neighbors_r(
         net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]);
 }
 
+namespace deepmd {
 template<typename FPTYPE> 
 void prod_force_a_gpu_cuda(    
     FPTYPE * force, 
@@ -172,3 +156,4 @@ template void prod_force_a_gpu_cuda<float>(float * force, const float * net_deri
 template void prod_force_a_gpu_cuda<double>(double * force, const double * net_deriv, const double * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei);
 template void prod_force_r_gpu_cuda<float>(float * force, const float * net_deriv, const float * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei);
 template void prod_force_r_gpu_cuda<double>(double * force, const double * net_deriv, const double * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei);
+}
diff --git a/source/lib/src/cuda/prod_virial.cu b/source/lib/src/cuda/prod_virial.cu
index 19fb6c1b2f..032e1b1c09 100644
--- a/source/lib/src/cuda/prod_virial.cu
+++ b/source/lib/src/cuda/prod_virial.cu
@@ -1,23 +1,6 @@
 #include "gpu_cuda.h"
 #include "prod_virial.h"
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
-static __inline__ __device__ double atomicAdd(
-    double* address, 
-    double val) 
-{
-  unsigned long long int* address_as_ull = (unsigned long long int*)address;
-  unsigned long long int old = *address_as_ull, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull, assumed,
-          __double_as_longlong(val + __longlong_as_double(assumed)));
-  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old);
-  } while (assumed != old);
-  return __longlong_as_double(old);
-}
-#endif
-
 template<typename FPTYPE>
 __global__ void virial_deriv_wrt_neighbors_a(
     FPTYPE * virial, 
@@ -90,6 +73,7 @@ __global__ void virial_deriv_wrt_neighbors_r(
         net_deriv[idx * ndescrpt + idy] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz / 3]);
 }
 
+namespace deepmd {
 template<typename FPTYPE>
 void prod_virial_a_gpu_cuda(
     FPTYPE * virial, 
@@ -152,3 +136,4 @@ template void prod_virial_a_gpu_cuda<float>(float * virial, float * atom_virial,
 template void prod_virial_a_gpu_cuda<double>(double * virial, double * atom_virial, const double * net_deriv, const double * in_deriv, const double * rij, const int * nlist, const int nloc, const int nall, const int nnei);
 template void prod_virial_r_gpu_cuda<float>(float * virial, float * atom_virial, const float * net_deriv, const float * in_deriv, const float * rij, const int * nlist, const int nloc, const int nall, const int nnei);
 template void prod_virial_r_gpu_cuda<double>(double * virial, double * atom_virial, const double * net_deriv, const double * in_deriv, const double * rij, const int * nlist, const int nloc, const int nall, const int nnei);
+}
diff --git a/source/lib/src/cuda/tabulate.cu b/source/lib/src/cuda/tabulate.cu
index 83803733ff..281c8adbc2 100644
--- a/source/lib/src/cuda/tabulate.cu
+++ b/source/lib/src/cuda/tabulate.cu
@@ -1,5 +1,3 @@
-#include <cub/cub.cuh>
-#include <cub/device/device_radix_sort.cuh>
 #include "tabulate.h"
 #include "gpu_cuda.h"
 
@@ -193,6 +191,7 @@ __global__ void tabulate_fusion_grad_fifth_order_polynomial(
   }
 }
 
+namespace deepmd {
 template<typename FPTYPE>
 void tabulate_fusion_gpu_cuda(
     FPTYPE * out,
@@ -238,3 +237,4 @@ template void tabulate_fusion_gpu_cuda<float>(float * out, const float * table,
 template void tabulate_fusion_gpu_cuda<double>(double * out, const double * table, const double * table_info, const double * em_x, const double * em, const int nloc, const int nnei, const int last_layer_size);
 template void tabulate_fusion_grad_gpu_cuda<float> (float * dy_dem_x, float * dy_dem, const float * table, const float * table_info, const float * em_x, const float * em, const float * dy, const int nloc, const int nnei, const int last_layer_size); 
 template void tabulate_fusion_grad_gpu_cuda<double> (double * dy_dem_x, double * dy_dem, const double * table, const double * table_info, const double * em_x, const double * em, const double * dy, const int nloc, const int nnei, const int last_layer_size);
+}
diff --git a/source/lib/src/neighbor_list.cc b/source/lib/src/neighbor_list.cc
index e0ceae4a38..89e8552524 100644
--- a/source/lib/src/neighbor_list.cc
+++ b/source/lib/src/neighbor_list.cc
@@ -844,7 +844,7 @@ build_nlist_cpu<float>(
     const float & rcut);
 
 #if GOOGLE_CUDA
-void convert_nlist_gpu_cuda(
+void deepmd::convert_nlist_gpu_cuda(
     InputNlist & gpu_nlist,
     InputNlist & cpu_nlist,
     int* & gpu_memory,
@@ -867,7 +867,7 @@ void convert_nlist_gpu_cuda(
   free(_firstneigh);
 }
 
-void free_nlist_gpu_cuda(
+void deepmd::free_nlist_gpu_cuda(
     InputNlist & gpu_nlist)
 {
   delete_device_memory(gpu_nlist.ilist);
diff --git a/source/lib/src/prod_env_mat.cc b/source/lib/src/prod_env_mat.cc
index 597473021d..c5e3223e9e 100644
--- a/source/lib/src/prod_env_mat.cc
+++ b/source/lib/src/prod_env_mat.cc
@@ -257,7 +257,7 @@ prod_env_mat_r_cpu<float>(
     const std::vector<int> sec);
 
 #if GOOGLE_CUDA
-void env_mat_nbor_update(
+void deepmd::env_mat_nbor_update(
     InputNlist &inlist,
     InputNlist &gpu_inlist,
     int &max_nbor_size,
diff --git a/source/lib/tests/test_env_mat_a.cc b/source/lib/tests/test_env_mat_a.cc
index c08a4f4705..369d09d872 100644
--- a/source/lib/tests/test_env_mat_a.cc
+++ b/source/lib/tests/test_env_mat_a.cc
@@ -557,9 +557,9 @@ TEST_F(TestEnvMatA, prod_gpu_cuda)
   malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
   malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
   malloc_device_memory(memory_dev, nloc * max_nbor_size);
-  convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);
+  deepmd::convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);
 
-  prod_env_mat_a_gpu_cuda(    
+  deepmd::prod_env_mat_a_gpu_cuda(    
       em_dev, 
       em_deriv_dev, 
       rij_dev, 
@@ -648,9 +648,9 @@ TEST_F(TestEnvMatA, prod_gpu_cuda_equal_cpu)
   malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
   malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
   malloc_device_memory(memory_dev, nloc * max_nbor_size);
-  convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);
+  deepmd::convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);
 
-  prod_env_mat_a_gpu_cuda(    
+  deepmd::prod_env_mat_a_gpu_cuda(    
       em_dev, 
       em_deriv_dev, 
       rij_dev, 
diff --git a/source/lib/tests/test_env_mat_r.cc b/source/lib/tests/test_env_mat_r.cc
index f571dbdaf1..3a50a892ff 100644
--- a/source/lib/tests/test_env_mat_r.cc
+++ b/source/lib/tests/test_env_mat_r.cc
@@ -377,7 +377,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda)
   }
   std::vector<int> ilist(nloc), numneigh(nloc);
   std::vector<int*> firstneigh(nloc);
-  InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist;
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist;
   convert_nlist(inlist, nlist_a_cpy);
   std::vector<double > em(nloc * ndescrpt, 0.0), em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * nnei * 3, 0.0);
   std::vector<int> nlist(nloc * nnei, 0);
@@ -402,7 +402,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda)
   malloc_device_memory(memory_dev, nloc * max_nbor_size);
   convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);
 
-  prod_env_mat_r_gpu_cuda(    
+  deepmd::prod_env_mat_r_gpu_cuda(    
       em_dev, 
       em_deriv_dev, 
       rij_dev, 
@@ -467,7 +467,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda_equal_cpu)
   }
   std::vector<int> ilist(nloc), numneigh(nloc);
   std::vector<int*> firstneigh(nloc);
-  InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist;
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist;
   convert_nlist(inlist, nlist_a_cpy);
   std::vector<double > em(nloc * ndescrpt, 0.0), em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * nnei * 3, 0.0);
   std::vector<int> nlist(nloc * nnei, 0);
@@ -492,7 +492,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda_equal_cpu)
   malloc_device_memory(memory_dev, nloc * max_nbor_size);
   convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);
 
-  prod_env_mat_r_gpu_cuda(    
+  deepmd::prod_env_mat_r_gpu_cuda(    
       em_dev, 
       em_deriv_dev, 
       rij_dev, 
diff --git a/source/lib/tests/test_gelu.cc b/source/lib/tests/test_gelu.cc
index 0b05dd71d5..9becbcac47 100644
--- a/source/lib/tests/test_gelu.cc
+++ b/source/lib/tests/test_gelu.cc
@@ -153,7 +153,7 @@ TEST_F(TestGelu, gelu_gpu_cuda)
   double * gelu_dev = NULL, * xx_dev = NULL;
   malloc_device_memory_sync(gelu_dev, gelu);
   malloc_device_memory_sync(xx_dev, xx);
-  gelu_gpu_cuda<double> (gelu_dev, xx_dev, nloc);
+  deepmd::gelu_gpu_cuda<double> (gelu_dev, xx_dev, nloc);
   memcpy_device_to_host(gelu_dev, gelu);
   delete_device_memory(gelu_dev);
   delete_device_memory(xx_dev);
@@ -174,7 +174,7 @@ TEST_F(TestGelu, gelu_grad_gpu_cuda)
   malloc_device_memory_sync(gelu_grad_dev, gelu_grad);
   malloc_device_memory_sync(xx_dev, xx);
   malloc_device_memory_sync(dy_dev, dy);
-  gelu_grad_gpu_cuda<double> (gelu_grad_dev, xx_dev, dy_dev, nloc);
+  deepmd::gelu_grad_gpu_cuda<double> (gelu_grad_dev, xx_dev, dy_dev, nloc);
   memcpy_device_to_host(gelu_grad_dev, gelu_grad);
   delete_device_memory(gelu_grad_dev);
   delete_device_memory(xx_dev);
@@ -198,7 +198,7 @@ TEST_F(TestGelu, gelu_grad_grad_gpu_cuda)
   malloc_device_memory_sync(xx_dev, xx);
   malloc_device_memory_sync(dy_dev, dy);
   malloc_device_memory_sync(dy_2_dev, dy_2);
-  gelu_grad_grad_gpu_cuda<double> (gelu_grad_grad_dev, xx_dev, dy_dev, dy_2_dev, nloc);
+  deepmd::gelu_grad_grad_gpu_cuda<double> (gelu_grad_grad_dev, xx_dev, dy_dev, dy_2_dev, nloc);
   memcpy_device_to_host(gelu_grad_grad_dev, gelu_grad_grad);
   delete_device_memory(gelu_grad_grad_dev);
   delete_device_memory(xx_dev);
diff --git a/source/lib/tests/test_prod_force_a.cc b/source/lib/tests/test_prod_force_a.cc
index 7aeb9ca99f..3318714aff 100644
--- a/source/lib/tests/test_prod_force_a.cc
+++ b/source/lib/tests/test_prod_force_a.cc
@@ -112,7 +112,7 @@ TEST_F(TestProdForceA, gpu_cuda)
   malloc_device_memory_sync(net_deriv_dev, net_deriv);
   malloc_device_memory_sync(env_deriv_dev, env_deriv);
 
-  prod_force_a_gpu_cuda<double> (force_dev, net_deriv_dev, env_deriv_dev, nlist_dev, nloc, nall, nnei);
+  deepmd::prod_force_a_gpu_cuda<double> (force_dev, net_deriv_dev, env_deriv_dev, nlist_dev, nloc, nall, nnei);
   
   memcpy_device_to_host(force_dev, force);
   delete_device_memory(nlist_dev);
diff --git a/source/lib/tests/test_prod_force_r.cc b/source/lib/tests/test_prod_force_r.cc
index 033c41a7fe..1247a94dda 100644
--- a/source/lib/tests/test_prod_force_r.cc
+++ b/source/lib/tests/test_prod_force_r.cc
@@ -112,7 +112,7 @@ TEST_F(TestProdForceR, gpu_cuda)
   malloc_device_memory_sync(net_deriv_dev, net_deriv);
   malloc_device_memory_sync(env_deriv_dev, env_deriv);
 
-  prod_force_r_gpu_cuda<double> (force_dev, net_deriv_dev, env_deriv_dev, nlist_dev, nloc, nall, nnei);
+  deepmd::prod_force_r_gpu_cuda<double> (force_dev, net_deriv_dev, env_deriv_dev, nlist_dev, nloc, nall, nnei);
 
   memcpy_device_to_host(force_dev, force);
   delete_device_memory(nlist_dev);
diff --git a/source/lib/tests/test_prod_virial_a.cc b/source/lib/tests/test_prod_virial_a.cc
index f1d4ee619a..1eb7d0f0f9 100644
--- a/source/lib/tests/test_prod_virial_a.cc
+++ b/source/lib/tests/test_prod_virial_a.cc
@@ -130,7 +130,7 @@ TEST_F(TestProdVirialA, gpu_cuda)
   malloc_device_memory_sync(env_deriv_dev, env_deriv);  
   malloc_device_memory_sync(rij_dev, rij);  
 
-  prod_virial_a_gpu_cuda<double> (virial_dev, atom_virial_dev, net_deriv_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nall, nnei);
+  deepmd::prod_virial_a_gpu_cuda<double> (virial_dev, atom_virial_dev, net_deriv_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nall, nnei);
   
   memcpy_device_to_host(virial_dev, virial);
   memcpy_device_to_host(atom_virial_dev, atom_virial);
diff --git a/source/lib/tests/test_prod_virial_r.cc b/source/lib/tests/test_prod_virial_r.cc
index 101b1659f8..4780d9358d 100644
--- a/source/lib/tests/test_prod_virial_r.cc
+++ b/source/lib/tests/test_prod_virial_r.cc
@@ -130,7 +130,7 @@ TEST_F(TestProdVirialR, gpu_cuda)
   malloc_device_memory_sync(env_deriv_dev, env_deriv);  
   malloc_device_memory_sync(rij_dev, rij);  
 
-  prod_virial_r_gpu_cuda<double> (virial_dev, atom_virial_dev, net_deriv_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nall, nnei);
+  deepmd::prod_virial_r_gpu_cuda<double> (virial_dev, atom_virial_dev, net_deriv_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nall, nnei);
   
   memcpy_device_to_host(virial_dev, virial);
   memcpy_device_to_host(atom_virial_dev, atom_virial);

From a8ada896734b9443b3dbeb5e371870ea2ad28d99 Mon Sep 17 00:00:00 2001
From: denghuilu <denghuilu@pku.edu.cn>
Date: Fri, 19 Mar 2021 18:33:33 +0800
Subject: [PATCH 7/7] add namespace deepmd for gpu_cuda.h

---
 source/lib/include/gpu_cuda.h          |   4 +-
 source/lib/tests/test_env_mat_a.cc     | 100 ++++++++++++------------
 source/lib/tests/test_env_mat_r.cc     | 102 ++++++++++++-------------
 source/lib/tests/test_gelu.cc          |  42 +++++-----
 source/lib/tests/test_prod_force_a.cc  |  18 ++---
 source/lib/tests/test_prod_force_r.cc  |  18 ++---
 source/lib/tests/test_prod_virial_a.cc |  28 +++----
 source/lib/tests/test_prod_virial_r.cc |  28 +++----
 source/lib/tests/test_tabulate.cc      |  46 +++++------
 9 files changed, 194 insertions(+), 192 deletions(-)

diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h
index 6ccbbb9356..48db721436 100644
--- a/source/lib/include/gpu_cuda.h
+++ b/source/lib/include/gpu_cuda.h
@@ -30,6 +30,7 @@ static __inline__ __device__ double atomicAdd(
 }
 #endif
 
+namespace deepmd {
 template <typename FPTYPE>
 void memcpy_host_to_device(
     FPTYPE * device, 
@@ -96,4 +97,5 @@ void memset_device_memory(
     const int size) 
 {
   cudaErrcheck(cudaMemset(device, var, sizeof(FPTYPE) * size));  
-}
\ No newline at end of file
+}
+} // end of namespace deepmd
\ No newline at end of file
diff --git a/source/lib/tests/test_env_mat_a.cc b/source/lib/tests/test_env_mat_a.cc
index 369d09d872..d32203c692 100644
--- a/source/lib/tests/test_env_mat_a.cc
+++ b/source/lib/tests/test_env_mat_a.cc
@@ -546,17 +546,17 @@ TEST_F(TestEnvMatA, prod_gpu_cuda)
   double * posi_cpy_dev = NULL, * avg_dev = NULL, * std_dev = NULL;
   int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL;
   int_64 * array_longlong_dev = NULL;
-  malloc_device_memory_sync(em_dev, em);
-  malloc_device_memory_sync(em_deriv_dev, em_deriv);
-  malloc_device_memory_sync(rij_dev, rij);
-  malloc_device_memory_sync(posi_cpy_dev, posi_cpy);
-  malloc_device_memory_sync(avg_dev, avg);
-  malloc_device_memory_sync(std_dev, std);
-  malloc_device_memory_sync(atype_cpy_dev, atype_cpy);
-  malloc_device_memory_sync(nlist_dev, nlist);
-  malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
-  malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
-  malloc_device_memory(memory_dev, nloc * max_nbor_size);
+  deepmd::malloc_device_memory_sync(em_dev, em);
+  deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv);
+  deepmd::malloc_device_memory_sync(rij_dev, rij);
+  deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy);
+  deepmd::malloc_device_memory_sync(avg_dev, avg);
+  deepmd::malloc_device_memory_sync(std_dev, std);
+  deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy);
+  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
+  deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
+  deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
+  deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
   deepmd::convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);
 
   deepmd::prod_env_mat_a_gpu_cuda(    
@@ -577,18 +577,18 @@ TEST_F(TestEnvMatA, prod_gpu_cuda)
       rc, 
       rc_smth,
       sec_a);
-  memcpy_device_to_host(em_dev, em);
-  delete_device_memory(em_dev);
-  delete_device_memory(em_deriv_dev);
-  delete_device_memory(nlist_dev);
-  delete_device_memory(posi_cpy_dev);
-  delete_device_memory(atype_cpy_dev);
-  delete_device_memory(array_int_dev);
-  delete_device_memory(array_longlong_dev);
-  delete_device_memory(avg_dev);
-  delete_device_memory(std_dev);
-  delete_device_memory(memory_dev);
-  free_nlist_gpu_cuda(gpu_inlist);
+  deepmd::memcpy_device_to_host(em_dev, em);
+  deepmd::delete_device_memory(em_dev);
+  deepmd::delete_device_memory(em_deriv_dev);
+  deepmd::delete_device_memory(nlist_dev);
+  deepmd::delete_device_memory(posi_cpy_dev);
+  deepmd::delete_device_memory(atype_cpy_dev);
+  deepmd::delete_device_memory(array_int_dev);
+  deepmd::delete_device_memory(array_longlong_dev);
+  deepmd::delete_device_memory(avg_dev);
+  deepmd::delete_device_memory(std_dev);
+  deepmd::delete_device_memory(memory_dev);
+  deepmd::free_nlist_gpu_cuda(gpu_inlist);
 
   for(int ii = 0; ii < nloc; ++ii){
     for (int jj = 0; jj < nnei; ++jj){
@@ -636,18 +636,18 @@ TEST_F(TestEnvMatA, prod_gpu_cuda_equal_cpu)
   double * posi_cpy_dev = NULL, * avg_dev = NULL, * std_dev = NULL;
   int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL;
   int_64 * array_longlong_dev = NULL;
-  malloc_device_memory_sync(em_dev, em);
-  malloc_device_memory_sync(em_deriv_dev, em_deriv);
-  malloc_device_memory_sync(rij_dev, rij);
-  malloc_device_memory_sync(posi_cpy_dev, posi_cpy);
-  malloc_device_memory_sync(avg_dev, avg);
-  malloc_device_memory_sync(std_dev, std);
-
-  malloc_device_memory_sync(atype_cpy_dev, atype_cpy);
-  malloc_device_memory_sync(nlist_dev, nlist);
-  malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
-  malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
-  malloc_device_memory(memory_dev, nloc * max_nbor_size);
+  deepmd::malloc_device_memory_sync(em_dev, em);
+  deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv);
+  deepmd::malloc_device_memory_sync(rij_dev, rij);
+  deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy);
+  deepmd::malloc_device_memory_sync(avg_dev, avg);
+  deepmd::malloc_device_memory_sync(std_dev, std);
+
+  deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy);
+  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
+  deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
+  deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
+  deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
   deepmd::convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);
 
   deepmd::prod_env_mat_a_gpu_cuda(    
@@ -668,21 +668,21 @@ TEST_F(TestEnvMatA, prod_gpu_cuda_equal_cpu)
       rc, 
       rc_smth,
       sec_a);
-  memcpy_device_to_host(em_dev, em);
-  memcpy_device_to_host(em_deriv_dev, em_deriv);
-  memcpy_device_to_host(rij_dev, rij);
-  memcpy_device_to_host(nlist_dev, nlist);
-  delete_device_memory(em_dev);
-  delete_device_memory(em_deriv_dev);
-  delete_device_memory(nlist_dev);
-  delete_device_memory(posi_cpy_dev);
-  delete_device_memory(atype_cpy_dev);
-  delete_device_memory(array_int_dev);
-  delete_device_memory(array_longlong_dev);
-  delete_device_memory(avg_dev);
-  delete_device_memory(std_dev);
-  delete_device_memory(memory_dev);
-  free_nlist_gpu_cuda(gpu_inlist);
+  deepmd::memcpy_device_to_host(em_dev, em);
+  deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv);
+  deepmd::memcpy_device_to_host(rij_dev, rij);
+  deepmd::memcpy_device_to_host(nlist_dev, nlist);
+  deepmd::delete_device_memory(em_dev);
+  deepmd::delete_device_memory(em_deriv_dev);
+  deepmd::delete_device_memory(nlist_dev);
+  deepmd::delete_device_memory(posi_cpy_dev);
+  deepmd::delete_device_memory(atype_cpy_dev);
+  deepmd::delete_device_memory(array_int_dev);
+  deepmd::delete_device_memory(array_longlong_dev);
+  deepmd::delete_device_memory(avg_dev);
+  deepmd::delete_device_memory(std_dev);
+  deepmd::delete_device_memory(memory_dev);
+  deepmd::free_nlist_gpu_cuda(gpu_inlist);
 
   std::vector<int> fmt_nlist_a_1, fmt_nlist_r_1;
   std::vector<double> env_1, env_deriv_1, rij_a_1;
diff --git a/source/lib/tests/test_env_mat_r.cc b/source/lib/tests/test_env_mat_r.cc
index 3a50a892ff..aac71eff8c 100644
--- a/source/lib/tests/test_env_mat_r.cc
+++ b/source/lib/tests/test_env_mat_r.cc
@@ -388,19 +388,19 @@ TEST_F(TestEnvMatR, prod_gpu_cuda)
   double * posi_cpy_dev = NULL, * avg_dev = NULL, * std_dev = NULL;
   int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL;
   int_64 * array_longlong_dev = NULL;
-  malloc_device_memory_sync(em_dev, em);
-  malloc_device_memory_sync(em_deriv_dev, em_deriv);
-  malloc_device_memory_sync(rij_dev, rij);
-  malloc_device_memory_sync(posi_cpy_dev, posi_cpy);
-  malloc_device_memory_sync(avg_dev, avg);
-  malloc_device_memory_sync(std_dev, std);
+  deepmd::malloc_device_memory_sync(em_dev, em);
+  deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv);
+  deepmd::malloc_device_memory_sync(rij_dev, rij);
+  deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy);
+  deepmd::malloc_device_memory_sync(avg_dev, avg);
+  deepmd::malloc_device_memory_sync(std_dev, std);
 
-  malloc_device_memory_sync(atype_cpy_dev, atype_cpy);
-  malloc_device_memory_sync(nlist_dev, nlist);
-  malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
-  malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
-  malloc_device_memory(memory_dev, nloc * max_nbor_size);
-  convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);
+  deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy);
+  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
+  deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
+  deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
+  deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
+  deepmd::convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);
 
   deepmd::prod_env_mat_r_gpu_cuda(    
       em_dev, 
@@ -420,18 +420,18 @@ TEST_F(TestEnvMatR, prod_gpu_cuda)
       rc, 
       rc_smth,
       sec_a);
-  memcpy_device_to_host(em_dev, em);
-  delete_device_memory(em_dev);
-  delete_device_memory(em_deriv_dev);
-  delete_device_memory(nlist_dev);
-  delete_device_memory(posi_cpy_dev);
-  delete_device_memory(atype_cpy_dev);
-  delete_device_memory(array_int_dev);
-  delete_device_memory(array_longlong_dev);
-  delete_device_memory(avg_dev);
-  delete_device_memory(std_dev);
-  delete_device_memory(memory_dev);
-  free_nlist_gpu_cuda(gpu_inlist);
+  deepmd::memcpy_device_to_host(em_dev, em);
+  deepmd::delete_device_memory(em_dev);
+  deepmd::delete_device_memory(em_deriv_dev);
+  deepmd::delete_device_memory(nlist_dev);
+  deepmd::delete_device_memory(posi_cpy_dev);
+  deepmd::delete_device_memory(atype_cpy_dev);
+  deepmd::delete_device_memory(array_int_dev);
+  deepmd::delete_device_memory(array_longlong_dev);
+  deepmd::delete_device_memory(avg_dev);
+  deepmd::delete_device_memory(std_dev);
+  deepmd::delete_device_memory(memory_dev);
+  deepmd::free_nlist_gpu_cuda(gpu_inlist);
 
   for(int ii = 0; ii < nloc; ++ii){
     for (int jj = 0; jj < nnei; ++jj){
@@ -478,19 +478,19 @@ TEST_F(TestEnvMatR, prod_gpu_cuda_equal_cpu)
   double * posi_cpy_dev = NULL, * avg_dev = NULL, * std_dev = NULL;
   int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL;
   int_64 * array_longlong_dev = NULL;
-  malloc_device_memory_sync(em_dev, em);
-  malloc_device_memory_sync(em_deriv_dev, em_deriv);
-  malloc_device_memory_sync(rij_dev, rij);
-  malloc_device_memory_sync(posi_cpy_dev, posi_cpy);
-  malloc_device_memory_sync(avg_dev, avg);
-  malloc_device_memory_sync(std_dev, std);
+  deepmd::malloc_device_memory_sync(em_dev, em);
+  deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv);
+  deepmd::malloc_device_memory_sync(rij_dev, rij);
+  deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy);
+  deepmd::malloc_device_memory_sync(avg_dev, avg);
+  deepmd::malloc_device_memory_sync(std_dev, std);
 
-  malloc_device_memory_sync(atype_cpy_dev, atype_cpy);
-  malloc_device_memory_sync(nlist_dev, nlist);
-  malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
-  malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
-  malloc_device_memory(memory_dev, nloc * max_nbor_size);
-  convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);
+  deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy);
+  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
+  deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
+  deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
+  deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
+  deepmd::convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);
 
   deepmd::prod_env_mat_r_gpu_cuda(    
       em_dev, 
@@ -510,21 +510,21 @@ TEST_F(TestEnvMatR, prod_gpu_cuda_equal_cpu)
       rc, 
       rc_smth,
       sec_a);
-  memcpy_device_to_host(em_dev, em);
-  memcpy_device_to_host(em_deriv_dev, em_deriv);
-  memcpy_device_to_host(rij_dev, rij);
-  memcpy_device_to_host(nlist_dev, nlist);
-  delete_device_memory(em_dev);
-  delete_device_memory(em_deriv_dev);
-  delete_device_memory(nlist_dev);
-  delete_device_memory(posi_cpy_dev);
-  delete_device_memory(atype_cpy_dev);
-  delete_device_memory(array_int_dev);
-  delete_device_memory(array_longlong_dev);
-  delete_device_memory(avg_dev);
-  delete_device_memory(std_dev);
-  delete_device_memory(memory_dev);
-  free_nlist_gpu_cuda(gpu_inlist);
+  deepmd::memcpy_device_to_host(em_dev, em);
+  deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv);
+  deepmd::memcpy_device_to_host(rij_dev, rij);
+  deepmd::memcpy_device_to_host(nlist_dev, nlist);
+  deepmd::delete_device_memory(em_dev);
+  deepmd::delete_device_memory(em_deriv_dev);
+  deepmd::delete_device_memory(nlist_dev);
+  deepmd::delete_device_memory(posi_cpy_dev);
+  deepmd::delete_device_memory(atype_cpy_dev);
+  deepmd::delete_device_memory(array_int_dev);
+  deepmd::delete_device_memory(array_longlong_dev);
+  deepmd::delete_device_memory(avg_dev);
+  deepmd::delete_device_memory(std_dev);
+  deepmd::delete_device_memory(memory_dev);
+  deepmd::free_nlist_gpu_cuda(gpu_inlist);
 
   std::vector<int> fmt_nlist_a_1, fmt_nlist_r_1;
   std::vector<double> env_1, env_deriv_1, rij_a_1;
diff --git a/source/lib/tests/test_gelu.cc b/source/lib/tests/test_gelu.cc
index 9becbcac47..4d85b2dd27 100644
--- a/source/lib/tests/test_gelu.cc
+++ b/source/lib/tests/test_gelu.cc
@@ -151,12 +151,12 @@ TEST_F(TestGelu, gelu_gpu_cuda)
   std::vector<double> gelu(nloc, 0.0);
   
   double * gelu_dev = NULL, * xx_dev = NULL;
-  malloc_device_memory_sync(gelu_dev, gelu);
-  malloc_device_memory_sync(xx_dev, xx);
+  deepmd::malloc_device_memory_sync(gelu_dev, gelu);
+  deepmd::malloc_device_memory_sync(xx_dev, xx);
   deepmd::gelu_gpu_cuda<double> (gelu_dev, xx_dev, nloc);
-  memcpy_device_to_host(gelu_dev, gelu);
-  delete_device_memory(gelu_dev);
-  delete_device_memory(xx_dev);
+  deepmd::memcpy_device_to_host(gelu_dev, gelu);
+  deepmd::delete_device_memory(gelu_dev);
+  deepmd::delete_device_memory(xx_dev);
 
   EXPECT_EQ(gelu.size(), nloc);
   EXPECT_EQ(gelu.size(), expected_gelu.size());
@@ -171,14 +171,14 @@ TEST_F(TestGelu, gelu_grad_gpu_cuda)
   std::vector<double> gelu_grad(nloc, 0.0);
 
   double * gelu_grad_dev = NULL, * xx_dev = NULL, * dy_dev = NULL;
-  malloc_device_memory_sync(gelu_grad_dev, gelu_grad);
-  malloc_device_memory_sync(xx_dev, xx);
-  malloc_device_memory_sync(dy_dev, dy);
+  deepmd::malloc_device_memory_sync(gelu_grad_dev, gelu_grad);
+  deepmd::malloc_device_memory_sync(xx_dev, xx);
+  deepmd::malloc_device_memory_sync(dy_dev, dy);
   deepmd::gelu_grad_gpu_cuda<double> (gelu_grad_dev, xx_dev, dy_dev, nloc);
-  memcpy_device_to_host(gelu_grad_dev, gelu_grad);
-  delete_device_memory(gelu_grad_dev);
-  delete_device_memory(xx_dev);
-  delete_device_memory(dy_dev);
+  deepmd::memcpy_device_to_host(gelu_grad_dev, gelu_grad);
+  deepmd::delete_device_memory(gelu_grad_dev);
+  deepmd::delete_device_memory(xx_dev);
+  deepmd::delete_device_memory(dy_dev);
 
   EXPECT_EQ(gelu_grad.size(), nloc);
   EXPECT_EQ(gelu_grad.size(), expected_gelu_grad.size());
@@ -194,16 +194,16 @@ TEST_F(TestGelu, gelu_grad_grad_gpu_cuda)
   std::vector<double> gelu_grad_grad(nloc, 0.0);
 
   double * gelu_grad_grad_dev = NULL, * xx_dev = NULL, * dy_dev = NULL, * dy_2_dev = NULL;
-  malloc_device_memory_sync(gelu_grad_grad_dev, gelu_grad_grad);
-  malloc_device_memory_sync(xx_dev, xx);
-  malloc_device_memory_sync(dy_dev, dy);
-  malloc_device_memory_sync(dy_2_dev, dy_2);
+  deepmd::malloc_device_memory_sync(gelu_grad_grad_dev, gelu_grad_grad);
+  deepmd::malloc_device_memory_sync(xx_dev, xx);
+  deepmd::malloc_device_memory_sync(dy_dev, dy);
+  deepmd::malloc_device_memory_sync(dy_2_dev, dy_2);
   deepmd::gelu_grad_grad_gpu_cuda<double> (gelu_grad_grad_dev, xx_dev, dy_dev, dy_2_dev, nloc);
-  memcpy_device_to_host(gelu_grad_grad_dev, gelu_grad_grad);
-  delete_device_memory(gelu_grad_grad_dev);
-  delete_device_memory(xx_dev);
-  delete_device_memory(dy_dev);
-  delete_device_memory(dy_2_dev);
+  deepmd::memcpy_device_to_host(gelu_grad_grad_dev, gelu_grad_grad);
+  deepmd::delete_device_memory(gelu_grad_grad_dev);
+  deepmd::delete_device_memory(xx_dev);
+  deepmd::delete_device_memory(dy_dev);
+  deepmd::delete_device_memory(dy_2_dev);
 
   EXPECT_EQ(gelu_grad_grad.size(), nloc);
   EXPECT_EQ(gelu_grad_grad.size(), expected_gelu_grad_grad.size());
diff --git a/source/lib/tests/test_prod_force_a.cc b/source/lib/tests/test_prod_force_a.cc
index 3318714aff..d9c7c1319d 100644
--- a/source/lib/tests/test_prod_force_a.cc
+++ b/source/lib/tests/test_prod_force_a.cc
@@ -107,18 +107,18 @@ TEST_F(TestProdForceA, gpu_cuda)
   int * nlist_dev = NULL;
   double * force_dev = NULL, * net_deriv_dev = NULL, * env_deriv_dev = NULL;
 
-  malloc_device_memory_sync(nlist_dev, nlist);
-  malloc_device_memory_sync(force_dev, force);
-  malloc_device_memory_sync(net_deriv_dev, net_deriv);
-  malloc_device_memory_sync(env_deriv_dev, env_deriv);
+  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
+  deepmd::malloc_device_memory_sync(force_dev, force);
+  deepmd::malloc_device_memory_sync(net_deriv_dev, net_deriv);
+  deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
 
   deepmd::prod_force_a_gpu_cuda<double> (force_dev, net_deriv_dev, env_deriv_dev, nlist_dev, nloc, nall, nnei);
   
-  memcpy_device_to_host(force_dev, force);
-  delete_device_memory(nlist_dev);
-  delete_device_memory(force_dev);
-  delete_device_memory(net_deriv_dev);
-  delete_device_memory(env_deriv_dev);
+  deepmd::memcpy_device_to_host(force_dev, force);
+  deepmd::delete_device_memory(nlist_dev);
+  deepmd::delete_device_memory(force_dev);
+  deepmd::delete_device_memory(net_deriv_dev);
+  deepmd::delete_device_memory(env_deriv_dev);
 
   EXPECT_EQ(force.size(), nall * 3);
   EXPECT_EQ(force.size(), expected_force.size());
diff --git a/source/lib/tests/test_prod_force_r.cc b/source/lib/tests/test_prod_force_r.cc
index 1247a94dda..e77cafdace 100644
--- a/source/lib/tests/test_prod_force_r.cc
+++ b/source/lib/tests/test_prod_force_r.cc
@@ -107,18 +107,18 @@ TEST_F(TestProdForceR, gpu_cuda)
   int * nlist_dev = NULL;
   double * force_dev = NULL, * net_deriv_dev = NULL, * env_deriv_dev = NULL;
 
-  malloc_device_memory_sync(nlist_dev, nlist);
-  malloc_device_memory_sync(force_dev, force);
-  malloc_device_memory_sync(net_deriv_dev, net_deriv);
-  malloc_device_memory_sync(env_deriv_dev, env_deriv);
+  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
+  deepmd::malloc_device_memory_sync(force_dev, force);
+  deepmd::malloc_device_memory_sync(net_deriv_dev, net_deriv);
+  deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
 
   deepmd::prod_force_r_gpu_cuda<double> (force_dev, net_deriv_dev, env_deriv_dev, nlist_dev, nloc, nall, nnei);
 
-  memcpy_device_to_host(force_dev, force);
-  delete_device_memory(nlist_dev);
-  delete_device_memory(force_dev);
-  delete_device_memory(net_deriv_dev);
-  delete_device_memory(env_deriv_dev);
+  deepmd::memcpy_device_to_host(force_dev, force);
+  deepmd::delete_device_memory(nlist_dev);
+  deepmd::delete_device_memory(force_dev);
+  deepmd::delete_device_memory(net_deriv_dev);
+  deepmd::delete_device_memory(env_deriv_dev);
 
   EXPECT_EQ(force.size(), nall * 3);
   EXPECT_EQ(force.size(), expected_force.size());
diff --git a/source/lib/tests/test_prod_virial_a.cc b/source/lib/tests/test_prod_virial_a.cc
index 1eb7d0f0f9..4cade7c771 100644
--- a/source/lib/tests/test_prod_virial_a.cc
+++ b/source/lib/tests/test_prod_virial_a.cc
@@ -123,23 +123,23 @@ TEST_F(TestProdVirialA, gpu_cuda)
   int * nlist_dev = NULL;
   double * virial_dev = NULL, *atom_virial_dev = NULL, * net_deriv_dev = NULL, * env_deriv_dev = NULL, * rij_dev = NULL;
 
-  malloc_device_memory_sync(nlist_dev, nlist);
-  malloc_device_memory_sync(virial_dev, virial);
-  malloc_device_memory_sync(atom_virial_dev, atom_virial);
-  malloc_device_memory_sync(net_deriv_dev, net_deriv);  
-  malloc_device_memory_sync(env_deriv_dev, env_deriv);  
-  malloc_device_memory_sync(rij_dev, rij);  
+  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
+  deepmd::malloc_device_memory_sync(virial_dev, virial);
+  deepmd::malloc_device_memory_sync(atom_virial_dev, atom_virial);
+  deepmd::malloc_device_memory_sync(net_deriv_dev, net_deriv);  
+  deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);  
+  deepmd::malloc_device_memory_sync(rij_dev, rij);  
 
   deepmd::prod_virial_a_gpu_cuda<double> (virial_dev, atom_virial_dev, net_deriv_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nall, nnei);
   
-  memcpy_device_to_host(virial_dev, virial);
-  memcpy_device_to_host(atom_virial_dev, atom_virial);
-  delete_device_memory(nlist_dev);
-  delete_device_memory(virial_dev);
-  delete_device_memory(atom_virial_dev);
-  delete_device_memory(net_deriv_dev);
-  delete_device_memory(env_deriv_dev);
-  delete_device_memory(rij_dev);
+  deepmd::memcpy_device_to_host(virial_dev, virial);
+  deepmd::memcpy_device_to_host(atom_virial_dev, atom_virial);
+  deepmd::delete_device_memory(nlist_dev);
+  deepmd::delete_device_memory(virial_dev);
+  deepmd::delete_device_memory(atom_virial_dev);
+  deepmd::delete_device_memory(net_deriv_dev);
+  deepmd::delete_device_memory(env_deriv_dev);
+  deepmd::delete_device_memory(rij_dev);
   // virial are not calculated in gpu currently;
   for (int ii = 0; ii < 9; ii++) {
     virial[ii] = 0;
diff --git a/source/lib/tests/test_prod_virial_r.cc b/source/lib/tests/test_prod_virial_r.cc
index 4780d9358d..b321454b8e 100644
--- a/source/lib/tests/test_prod_virial_r.cc
+++ b/source/lib/tests/test_prod_virial_r.cc
@@ -123,23 +123,23 @@ TEST_F(TestProdVirialR, gpu_cuda)
   int * nlist_dev = NULL;
   double * virial_dev = NULL, *atom_virial_dev = NULL, * net_deriv_dev = NULL, * env_deriv_dev = NULL, * rij_dev = NULL;
 
-  malloc_device_memory_sync(nlist_dev, nlist);
-  malloc_device_memory_sync(virial_dev, virial);
-  malloc_device_memory_sync(atom_virial_dev, atom_virial);
-  malloc_device_memory_sync(net_deriv_dev, net_deriv);  
-  malloc_device_memory_sync(env_deriv_dev, env_deriv);  
-  malloc_device_memory_sync(rij_dev, rij);  
+  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
+  deepmd::malloc_device_memory_sync(virial_dev, virial);
+  deepmd::malloc_device_memory_sync(atom_virial_dev, atom_virial);
+  deepmd::malloc_device_memory_sync(net_deriv_dev, net_deriv);  
+  deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);  
+  deepmd::malloc_device_memory_sync(rij_dev, rij);  
 
   deepmd::prod_virial_r_gpu_cuda<double> (virial_dev, atom_virial_dev, net_deriv_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nall, nnei);
   
-  memcpy_device_to_host(virial_dev, virial);
-  memcpy_device_to_host(atom_virial_dev, atom_virial);
-  delete_device_memory(nlist_dev);
-  delete_device_memory(virial_dev);
-  delete_device_memory(atom_virial_dev);
-  delete_device_memory(net_deriv_dev);
-  delete_device_memory(env_deriv_dev);
-  delete_device_memory(rij_dev);
+  deepmd::memcpy_device_to_host(virial_dev, virial);
+  deepmd::memcpy_device_to_host(atom_virial_dev, atom_virial);
+  deepmd::delete_device_memory(nlist_dev);
+  deepmd::delete_device_memory(virial_dev);
+  deepmd::delete_device_memory(atom_virial_dev);
+  deepmd::delete_device_memory(net_deriv_dev);
+  deepmd::delete_device_memory(env_deriv_dev);
+  deepmd::delete_device_memory(rij_dev);
   // virial are not calculated in gpu currently;
   for (int ii = 0; ii < 9; ii++) {
     virial[ii] = 0;
diff --git a/source/lib/tests/test_tabulate.cc b/source/lib/tests/test_tabulate.cc
index f92629efb4..b22cca03d8 100644
--- a/source/lib/tests/test_tabulate.cc
+++ b/source/lib/tests/test_tabulate.cc
@@ -179,16 +179,16 @@ TEST_F(TestTabulate, tabulate_fusion_gpu_cuda)
   std::vector<double> xyz_scatter(nloc * nnei * last_layer_size, 0.0);
 
   double * xyz_scatter_dev = NULL, * table_dev = NULL, * em_x_dev = NULL, * em_dev = NULL;
-  malloc_device_memory_sync(xyz_scatter_dev, xyz_scatter);
-  malloc_device_memory_sync(table_dev, table);
-  malloc_device_memory_sync(em_x_dev, em_x);
-  malloc_device_memory_sync(em_dev, em);
+  deepmd::malloc_device_memory_sync(xyz_scatter_dev, xyz_scatter);
+  deepmd::malloc_device_memory_sync(table_dev, table);
+  deepmd::malloc_device_memory_sync(em_x_dev, em_x);
+  deepmd::malloc_device_memory_sync(em_dev, em);
   deepmd::tabulate_fusion_gpu_cuda<double>(xyz_scatter_dev, table_dev, &info[0], em_x_dev, em_dev, nloc, nnei, last_layer_size);
-  memcpy_device_to_host(xyz_scatter_dev, xyz_scatter);
-  delete_device_memory(xyz_scatter_dev);
-  delete_device_memory(table_dev);
-  delete_device_memory(em_x_dev);
-  delete_device_memory(em_dev);
+  deepmd::memcpy_device_to_host(xyz_scatter_dev, xyz_scatter);
+  deepmd::delete_device_memory(xyz_scatter_dev);
+  deepmd::delete_device_memory(table_dev);
+  deepmd::delete_device_memory(em_x_dev);
+  deepmd::delete_device_memory(em_dev);
 
   EXPECT_EQ(xyz_scatter.size(), nloc * nnei * last_layer_size);
   EXPECT_EQ(xyz_scatter.size(), expected_xyz_scatter.size());
@@ -204,21 +204,21 @@ TEST_F(TestTabulate, tabulate_fusion_grad_gpu_cuda)
   std::vector<double> dy(nloc * nnei * last_layer_size, 1.0);
 
   double * dy_dem_x_dev = NULL, * dy_dem_dev = NULL, * table_dev = NULL, * em_x_dev = NULL, * em_dev = NULL, * dy_dev = NULL;
-  malloc_device_memory_sync(dy_dem_x_dev, dy_dem_x);
-  malloc_device_memory_sync(dy_dem_dev, dy_dem);
-  malloc_device_memory_sync(table_dev, table);
-  malloc_device_memory_sync(em_x_dev, em_x);
-  malloc_device_memory_sync(em_dev, em);
-  malloc_device_memory_sync(dy_dev, dy);
+  deepmd::malloc_device_memory_sync(dy_dem_x_dev, dy_dem_x);
+  deepmd::malloc_device_memory_sync(dy_dem_dev, dy_dem);
+  deepmd::malloc_device_memory_sync(table_dev, table);
+  deepmd::malloc_device_memory_sync(em_x_dev, em_x);
+  deepmd::malloc_device_memory_sync(em_dev, em);
+  deepmd::malloc_device_memory_sync(dy_dev, dy);
   deepmd::tabulate_fusion_grad_gpu_cuda<double>(dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev, dy_dev, nloc, nnei, last_layer_size);
-  memcpy_device_to_host(dy_dem_x_dev, dy_dem_x);
-  memcpy_device_to_host(dy_dem_dev, dy_dem);
-  delete_device_memory(dy_dem_x_dev);
-  delete_device_memory(dy_dem_dev);
-  delete_device_memory(table_dev);
-  delete_device_memory(em_x_dev);
-  delete_device_memory(em_dev);
-  delete_device_memory(dy_dev);
+  deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x);
+  deepmd::memcpy_device_to_host(dy_dem_dev, dy_dem);
+  deepmd::delete_device_memory(dy_dem_x_dev);
+  deepmd::delete_device_memory(dy_dem_dev);
+  deepmd::delete_device_memory(table_dev);
+  deepmd::delete_device_memory(em_x_dev);
+  deepmd::delete_device_memory(em_dev);
+  deepmd::delete_device_memory(dy_dev);
 
   EXPECT_EQ(dy_dem_x.size(), nloc * nnei);
   EXPECT_EQ(dy_dem.size(), nloc * nnei * 4);