diff --git a/.travis.yml b/.travis.yml index 4db9e48dd4..dd4b0eed9e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -83,7 +83,7 @@ jobs: - CXX=g++-7 - TENSORFLOW_VERSION=2.1 install: - - python -m pip install twine cibuildwheel==1.1.0 scikit-build + - python -m pip install twine cibuildwheel==1.1.0 scikit-build setuptools_scm script: - python -m cibuildwheel --output-dir wheelhouse - python setup.py sdist diff --git a/pyproject.toml b/pyproject.toml index b1251c5890..2e9e3239ab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,3 @@ [build-system] -requires = ["setuptools", "wheel", "scikit-build", "cmake", "ninja", "m2r"] +requires = ["setuptools", "setuptools_scm", "wheel", "scikit-build", "cmake", "ninja", "m2r"] diff --git a/setup.py b/setup.py index 8c6a335ad4..99a86d3da8 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,21 @@ from skbuild import setup from skbuild.exceptions import SKBuildError from skbuild.cmaker import get_cmake_version +from setuptools_scm import get_version from packaging.version import LegacyVersion from os import path, makedirs -import imp +import imp, sys, platform + +def get_dp_install_path() : + site_packages_path = path.join(path.dirname(path.__file__), 'site-packages') + dp_scm_version = get_version(root="./", relative_to=__file__) + python_version = 'py' + str(sys.version_info.major) + '.' + str(sys.version_info.minor) + os_info = sys.platform + machine_info = platform.machine() + dp_pip_install_path = site_packages_path + '/deepmd' + dp_setup_install_path = site_packages_path + '/deepmd_kit-' + dp_scm_version + '-' + python_version + '-' + os_info + '-' + machine_info + '.egg/deepmd' + + return dp_pip_install_path, dp_setup_install_path readme_file = path.join(path.dirname(path.abspath(__file__)), 'README.md') try: @@ -34,6 +46,8 @@ except OSError: pass +dp_pip_install_path, dp_setup_install_path = get_dp_install_path() + setup( name="deepmd-kit", setup_requires=setup_requires, @@ -56,6 +70,8 @@ '-DBUILD_PY_IF:BOOL=TRUE', 
'-DBUILD_CPP_IF:BOOL=FALSE', '-DFLOAT_PREC:STRING=high', + '-DDP_PIP_INSTALL_PATH=%s' % dp_pip_install_path, + '-DDP_SETUP_INSTALL_PATH=%s' % dp_setup_install_path, ], cmake_source_dir='source', cmake_minimum_required_version='3.0', diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 6b18cb95ac..84c3d326da 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -186,7 +186,7 @@ if (BUILD_CPP_IF) set (LIB_DEEPMD_OP "deepmd_op") if (USE_CUDA_TOOLKIT) set (LIB_DEEPMD_OP_CUDA "deepmd_op_cuda") - else() + else () set (LIB_DEEPMD_OP_CUDA "deepmd_op") endif() if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 4.9) diff --git a/source/lib/include/NNPInter.h b/source/lib/include/NNPInter.h index 69a0ecf7d1..a32940a738 100644 --- a/source/lib/include/NNPInter.h +++ b/source/lib/include/NNPInter.h @@ -93,10 +93,8 @@ class NNPInter compute_t *array_double; InternalNeighborList nlist; NNPAtomMap nnpmap; - unsigned long long *array_longlong; - int *ilist, *jrange, *jlist, *array_int; + int *ilist, *jrange, *jlist; int ilist_size, jrange_size, jlist_size; - int arr_int_size, arr_ll_size, arr_dou_size; // function used for neighbor list copy vector get_sel_a() const; @@ -191,13 +189,10 @@ class NNPInterModelDevi vector > sec; InternalNeighborList nlist; NNPAtomMap nnpmap; - unsigned long long *array_longlong; - int max_sec_size = 0, max_sec_back = 0; - int *ilist, *jrange, *jlist, *array_int; - int ilist_size, jrange_size, jlist_size, arr_int_size, arr_ll_size, arr_dou_size; + int *ilist, *jrange, *jlist; + int ilist_size, jrange_size, jlist_size; // function used for nborlist copy - void get_max_sec(); vector > get_sel() const; void cum_sum(const std::vector > n_sel); #ifdef USE_CUDA_TOOLKIT diff --git a/source/lib/include/SimulationRegion_Impl.h b/source/lib/include/SimulationRegion_Impl.h index 2b4ed1a72b..8fa318c561 100644 --- a/source/lib/include/SimulationRegion_Impl.h +++ b/source/lib/include/SimulationRegion_Impl.h @@ -5,6 +5,7 @@ #include #include 
#include +#include using namespace std; @@ -500,6 +501,9 @@ computeVolume() boxt[0*3+1] * (boxt[1*3+0]*boxt[2*3+2] - boxt[2*3+0]*boxt[1*3+2]) + boxt[0*3+2] * (boxt[1*3+0]*boxt[2*3+1] - boxt[2*3+0]*boxt[1*3+1]); volumei = static_cast(1.)/volume; + if (volume < 0) { + throw std::runtime_error("Negative volume detected. Please make sure the simulation cell obeys the right-hand rule."); + } } template diff --git a/source/lib/include/common.h b/source/lib/include/common.h index 8db7d817ce..3912f21f7f 100644 --- a/source/lib/include/common.h +++ b/source/lib/include/common.h @@ -144,9 +144,6 @@ session_input_tensors (vector>& input_tensors, const int * ilist, const int * jrange, const int * jlist, - int * array_int, - unsigned long long * array_longlong, - double * array_double, const vector & fparam_, const vector & aparam_, const NNPAtomMap & nnpmap, diff --git a/source/lib/src/NNPInter.cc b/source/lib/src/NNPInter.cc index f4f39945ff..aea9d48c9b 100644 --- a/source/lib/src/NNPInter.cc +++ b/source/lib/src/NNPInter.cc @@ -3,7 +3,6 @@ #include "SimulationRegion.h" #include -#define MAGIC_NUMBER 1024 #ifdef USE_CUDA_TOOLKIT #include "cuda_runtime.h" @@ -14,7 +13,7 @@ #define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { - if (code != cudaSuccess) + if (code != cudaSuccess) { fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); @@ -273,9 +272,6 @@ NNPInter::~NNPInter() { cudaErrcheck(cudaFree(ilist)); cudaErrcheck(cudaFree(jrange)); cudaErrcheck(cudaFree(jlist)); - cudaErrcheck(cudaFree(array_int)); - cudaErrcheck(cudaFree(array_longlong)); - cudaErrcheck(cudaFree(array_double)); } #endif } @@ -283,23 +279,12 @@ NNPInter::~NNPInter() { #ifdef USE_CUDA_TOOLKIT void NNPInter::update_nbor(const InternalNeighborList & nlist, const int nloc) { if (!init_nbor) { - sec_a = cum_sum(get_sel_a()); 
cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); - cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (sec_a.size() + nloc * sec_a.size() + nloc))); - cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2)); - #ifdef HIGH_PREC - cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3)); - #else - cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3)); - #endif ilist_size = nlist.ilist.size(); jrange_size = nlist.jrange.size(); jlist_size = nlist.jlist.size(); - arr_int_size = sec_a.size() + nloc * sec_a.size() + nloc; - arr_ll_size = nloc * MAGIC_NUMBER * 2; - arr_dou_size = nloc * sec_a.back() * 3; init_nbor = true; } if (ilist_size < nlist.ilist.size()) { @@ -317,25 +302,7 @@ void NNPInter::update_nbor(const InternalNeighborList & nlist, const int nloc) { cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); jlist_size = nlist.jlist.size(); } - if (arr_int_size < sec_a.size() + nloc * sec_a.size() + nloc) { - cudaErrcheck(cudaFree(array_int)); - cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (sec_a.size() + nloc * sec_a.size() + nloc))); - arr_int_size = sec_a.size() + nloc * sec_a.size() + nloc; - } - if (arr_ll_size < nloc * MAGIC_NUMBER * 2) { - cudaErrcheck(cudaFree(array_longlong)); - cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2)); - arr_ll_size = nloc * MAGIC_NUMBER * 2; - } - if (arr_dou_size < nloc * sec_a.back() * 3) { - cudaErrcheck(cudaFree(array_double)); - #ifdef HIGH_PREC - cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3)); - #else - cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * 
sec_a.back() * 3)); - #endif - arr_dou_size = nloc * sec_a.back() * 3; - } + cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice)); cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice)); cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice)); @@ -378,14 +345,10 @@ init (const string & model, const int & gpu_rank) if (dfparam < 0) dfparam = 0; if (daparam < 0) daparam = 0; inited = true; - + init_nbor = false; - array_int = NULL; - array_double = NULL; - array_longlong = NULL; ilist = NULL; jrange = NULL; jlist = NULL; ilist_size = 0; jrange_size = 0; jlist_size = 0; - arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0; } #else void @@ -415,12 +378,8 @@ init (const string & model, const int & gpu_rank) inited = true; init_nbor = false; - array_int = NULL; - array_double = NULL; - array_longlong = NULL; ilist = NULL; jrange = NULL; jlist = NULL; ilist_size = 0; jrange_size = 0; jlist_size = 0; - arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0; } #endif @@ -602,7 +561,7 @@ compute_inner (ENERGYTYPE & dener, } #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); #else int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); #endif @@ -669,7 +628,7 @@ compute (ENERGYTYPE & dener, } #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, 
ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); #else int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); #endif @@ -710,9 +669,6 @@ NNPInterModelDevi::~NNPInterModelDevi() { cudaErrcheck(cudaFree(ilist)); cudaErrcheck(cudaFree(jrange)); cudaErrcheck(cudaFree(jlist)); - cudaErrcheck(cudaFree(array_int)); - cudaErrcheck(cudaFree(array_longlong)); - cudaErrcheck(cudaFree(array_double)); } #endif } @@ -761,14 +717,10 @@ init (const vector & models, const int & gpu_rank) // cell_size = rcut; // ntypes = get_ntypes(); inited = true; - + init_nbor = false; - array_int = NULL; - array_double = NULL; - array_longlong = NULL; ilist = NULL; jrange = NULL; jlist = NULL; ilist_size = 0; jrange_size = 0; jlist_size = 0; - arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0; } #else void @@ -798,14 +750,10 @@ init (const vector & models, const int & gpu_rank) // cell_size = rcut; // ntypes = get_ntypes(); inited = true; - + init_nbor = false; - array_int = NULL; - array_double = NULL; - array_longlong = NULL; ilist = NULL; jrange = NULL; jlist = NULL; ilist_size = 0; jrange_size = 0; jlist_size = 0; - arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0; } #endif @@ -873,40 +821,18 @@ cum_sum (const std::vector > n_sel) } } -void -NNPInterModelDevi:: -get_max_sec() -{ - for (int ii = 0; ii < numb_models; ii++) { - this->max_sec_size = max_sec_size < sec[ii].size() ? sec[ii].size() : max_sec_size; - this->max_sec_back = max_sec_back < sec[ii].back() ? 
sec[ii].back() : max_sec_back; - } -} - #ifdef USE_CUDA_TOOLKIT void NNPInterModelDevi:: update_nbor(const InternalNeighborList & nlist, const int nloc) { if (!init_nbor) { - cum_sum(get_sel()); - get_max_sec(); cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); - cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (max_sec_size + nloc * max_sec_size + nloc))); - cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2)); - #ifdef HIGH_PREC - cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3)); - #else - cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3)); - #endif ilist_size = nlist.ilist.size(); jrange_size = nlist.jrange.size(); jlist_size = nlist.jlist.size(); - arr_int_size = max_sec_size + nloc * max_sec_size + nloc; - arr_ll_size = nloc * MAGIC_NUMBER * 2; - arr_dou_size = nloc * max_sec_back * 3; init_nbor = true; } if (ilist_size < nlist.ilist.size()) { @@ -924,25 +850,7 @@ update_nbor(const InternalNeighborList & nlist, const int nloc) cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); jlist_size = nlist.jlist.size(); } - if (arr_int_size < max_sec_size + nloc * max_sec_size + nloc) { - cudaErrcheck(cudaFree(array_int)); - cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (max_sec_size + nloc * max_sec_size + nloc))); - arr_int_size = max_sec_size + nloc * max_sec_size + nloc; - } - if (arr_ll_size < nloc * MAGIC_NUMBER * 2) { - cudaErrcheck(cudaFree(array_longlong)); - cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2)); - arr_ll_size = nloc * MAGIC_NUMBER * 2; - } - if (arr_dou_size < nloc * max_sec_back * 3) { - cudaErrcheck(cudaFree(array_double)); - #ifdef 
HIGH_PREC - cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3)); - #else - cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3)); - #endif - arr_dou_size = nloc * max_sec_back * 3; - } + cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice)); cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice)); cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice)); @@ -1044,7 +952,7 @@ compute (vector & all_energy, } #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); #else int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); #endif @@ -1094,7 +1002,7 @@ compute (vector & all_energy, } #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); #else int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); #endif diff --git a/source/lib/src/common.cc b/source/lib/src/common.cc index 296800a396..dd4c3f672a 100644 --- a/source/lib/src/common.cc +++ b/source/lib/src/common.cc @@ -479,9 +479,6 @@ session_input_tensors ( const int * ilist, const int * jrange, const int * jlist, - int * array_int, - unsigned long long * array_longlong, - double * array_double, const 
vector & fparam_, const vector & aparam_, const NNPAtomMap & nnpmap, @@ -511,7 +508,7 @@ session_input_tensors ( box_shape.AddDim (nframes); box_shape.AddDim (9); TensorShape mesh_shape; - mesh_shape.AddDim (32); + mesh_shape.AddDim (16); TensorShape natoms_shape; natoms_shape.AddDim (2 + ntypes); TensorShape fparam_shape; @@ -565,7 +562,7 @@ session_input_tensors ( } } - for (int ii = 0; ii < 32; ++ii) mesh(ii) = 0; + for (int ii = 0; ii < 16; ++ii) mesh(ii) = 0; mesh (0) = sizeof(int *) / sizeof(int); assert (mesh(0) * sizeof(int) == sizeof(int *)); @@ -577,9 +574,6 @@ session_input_tensors ( memcpy (&mesh(4), &(ilist), sizeof(int *)); memcpy (&mesh(8), &(jrange), sizeof(int *)); memcpy (&mesh(12), &(jlist), sizeof(int *)); - memcpy (&mesh(16), &(array_int), sizeof(int *)); - memcpy (&mesh(20), &(array_longlong), sizeof(unsigned long long *)); - memcpy (&mesh(24), &(array_double), sizeof(double *)); natoms (0) = nloc; natoms (1) = nall; diff --git a/source/lmp/env.sh.in b/source/lmp/env.sh.in index 00bc3b18b6..2e091432f6 100644 --- a/source/lmp/env.sh.in +++ b/source/lmp/env.sh.in @@ -8,4 +8,4 @@ TF_RPATH=`echo $TENSORFLOW_LIBRARY_PATH | sed "s/;/ -Wl,-rpath=/g"` NNP_INC=" -std=c++11 @PREC_DEF@ @TTM_DEF@ @OLD_LMP_PPPM_DEF@ -I$TF_INCLUDE_DIRS -I$DEEPMD_ROOT/include/deepmd " NNP_PATH=" -L$TF_LIBRARY_PATH -L$DEEPMD_ROOT/lib" -NNP_LIB=" -Wl,--no-as-needed -l@LIB_DEEPMD_OP@ -l@LIB_DEEPMD_OP_CUDA@ -l@LIB_DEEPMD@ -ltensorflow_cc -ltensorflow_framework -Wl,-rpath=$TF_RPATH -Wl,-rpath=$DEEPMD_ROOT/lib" +NNP_LIB=" -Wl,--no-as-needed -l@LIB_DEEPMD_OP_CUDA@ -l@LIB_DEEPMD_OP@ -l@LIB_DEEPMD@ -ltensorflow_cc -ltensorflow_framework -Wl,-rpath=$TF_RPATH -Wl,-rpath=$DEEPMD_ROOT/lib" diff --git a/source/lmp/fix_dplr.cpp b/source/lmp/fix_dplr.cpp index c0f0bb930c..03fef55420 100644 --- a/source/lmp/fix_dplr.cpp +++ b/source/lmp/fix_dplr.cpp @@ -24,6 +24,7 @@ is_key (const string& input) keys.push_back("model"); keys.push_back("type_associate"); keys.push_back("bond_type"); + 
keys.push_back("efield"); for (int ii = 0; ii < keys.size(); ++ii){ if (input == keys[ii]) { return true; } } @@ -34,7 +35,11 @@ is_key (const string& input) FixDPLR::FixDPLR(LAMMPS *lmp, int narg, char **arg) - :Fix(lmp, narg, arg) + :Fix(lmp, narg, arg), + efield(3, 0.0), + efield_fsum(4, 0.0), + efield_fsum_all(4, 0.0), + efield_force_flag(0) { virial_flag = 1; @@ -54,7 +59,14 @@ FixDPLR::FixDPLR(LAMMPS *lmp, int narg, char **arg) model = string(arg[iarg+1]); iarg += 2; } - if (string(arg[iarg]) == string("type_associate")) { + else if (string(arg[iarg]) == string("efield")) { + if (iarg+4 > narg) error->all(FLERR,"Illegal fix dplr command, efield should be provided 3 float numbers"); + efield[0] = atof(arg[iarg+1]); + efield[1] = atof(arg[iarg+2]); + efield[2] = atof(arg[iarg+3]); + iarg += 4; + } + else if (string(arg[iarg]) == string("type_associate")) { int iend = iarg+1; while (iend < narg && (! is_key(arg[iend]) )) { map_vec.push_back(atoi(arg[iend])-1); @@ -62,7 +74,7 @@ FixDPLR::FixDPLR(LAMMPS *lmp, int narg, char **arg) } iarg = iend; } - if (string(arg[iarg]) == string("bond_type")) { + else if (string(arg[iarg]) == string("bond_type")) { int iend = iarg+1; while (iend < narg && (! 
is_key(arg[iend]) )) { bond_type.push_back(atoi(arg[iend])-1); @@ -105,6 +117,7 @@ FixDPLR::FixDPLR(LAMMPS *lmp, int narg, char **arg) int FixDPLR::setmask() { int mask = 0; + mask |= THERMO_ENERGY; mask |= POST_INTEGRATE; mask |= PRE_FORCE; mask |= POST_FORCE; @@ -347,6 +360,7 @@ void FixDPLR::post_force(int vflag) int nall = nlocal + nghost; vector dcoord(nall*3, 0.0), dbox(9, 0.0), dfele(nlocal*3, 0.0); vector dtype(nall, 0); + // set values for dcoord, dbox, dfele { int *type = atom->type; for (int ii = 0; ii < nall; ++ii){ @@ -366,9 +380,40 @@ void FixDPLR::post_force(int vflag) } } assert(dfele_.size() == nlocal * 3); + // revise force according to efield for (int ii = 0; ii < nlocal*3; ++ii){ dfele[ii] = dfele_[ii]; } + // revise force and virial according to efield + double * q = atom->q; + imageint *image = atom->image; + double unwrap[3]; + double v[6]; + efield_fsum[0] = efield_fsum[1] = efield_fsum[2] = efield_fsum[3] = 0.0; + efield_force_flag = 0; + for (int ii = 0; ii < nlocal; ++ii){ + double tmpf[3]; + for (int dd = 0; dd < 3; ++dd){ + tmpf[dd] = q[ii] * efield[dd]; + } + for (int dd = 0; dd < 3; ++dd){ + dfele[ii*3+dd] += tmpf[dd]; + } + domain->unmap(x[ii],image[ii],unwrap); + efield_fsum[0] -= tmpf[0]*unwrap[0]+tmpf[1]*unwrap[1]+tmpf[2]*unwrap[2]; + efield_fsum[1] += tmpf[0]; + efield_fsum[2] += tmpf[1]; + efield_fsum[3] += tmpf[2]; + if (evflag) { + v[0] = tmpf[0] *unwrap[0]; + v[1] = tmpf[1] *unwrap[1]; + v[2] = tmpf[2] *unwrap[2]; + v[3] = tmpf[0] *unwrap[1]; + v[4] = tmpf[0] *unwrap[2]; + v[5] = tmpf[1] *unwrap[2]; + v_tally(ii, v); + } + } } // lmp nlist NeighList * list = pair_nnp->list; @@ -466,5 +511,28 @@ void FixDPLR::unpack_reverse_comm(int n, int *list, double *buf) } } +/* ---------------------------------------------------------------------- + return energy added by fix +------------------------------------------------------------------------- */ +double FixDPLR::compute_scalar(void) +{ + if (efield_force_flag == 0) { + 
MPI_Allreduce(&efield_fsum[0],&efield_fsum_all[0],4,MPI_DOUBLE,MPI_SUM,world); + efield_force_flag = 1; + } + return efield_fsum_all[0]; +} + +/* ---------------------------------------------------------------------- + return total extra force due to fix +------------------------------------------------------------------------- */ +double FixDPLR::compute_vector(int n) +{ + if (efield_force_flag == 0) { + MPI_Allreduce(&efield_fsum[0],&efield_fsum_all[0],4,MPI_DOUBLE,MPI_SUM,world); + efield_force_flag = 1; + } + return efield_fsum_all[n+1]; +} diff --git a/source/lmp/fix_dplr.h b/source/lmp/fix_dplr.h index edc9204874..a0f45185b4 100644 --- a/source/lmp/fix_dplr.h +++ b/source/lmp/fix_dplr.h @@ -32,6 +32,8 @@ namespace LAMMPS_NS { void post_force(int); int pack_reverse_comm(int, int, double *); void unpack_reverse_comm(int, int *, double *); + double compute_scalar(void); + double compute_vector(int); private: PairNNP * pair_nnp; DeepTensor dpt; @@ -45,6 +47,9 @@ namespace LAMMPS_NS { map bk_type_asso; vector dipole_recd; vector dfcorr_buff; + vector efield; + vector efield_fsum, efield_fsum_all; + int efield_force_flag; void get_valid_pairs(vector >& pairs); }; } diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index 89416056e1..993a1b6fd4 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -3,8 +3,9 @@ set(OP_LIB ${PROJECT_SOURCE_DIR}/lib/src/SimulationRegion.cpp ${PROJECT_SOURCE_DIR}/lib/src/NeighborList.cpp) set (OP_CXX_FLAG -D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI} ) -file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc) -file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_gpu.cc descrpt_se_r_gpu.cc tab_inter.cc prod_force_se_a_gpu.cc prod_virial_se_a_gpu.cc prod_force_se_r_gpu.cc prod_virial_se_r_gpu.cc soft_min.cc 
soft_min_force.cc soft_min_virial.cc ) +file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu.cc) +file(GLOB OP_PY_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu_gpu.cc) +file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_gpu.cc descrpt_se_r_gpu.cc tab_inter.cc prod_force_se_a_gpu.cc prod_virial_se_a_gpu.cc prod_force_se_r_gpu.cc prod_virial_se_r_gpu.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_gpu.cc) file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_se_a_grad.cc prod_force_se_r_grad.cc prod_virial_grad.cc prod_virial_se_a_grad.cc prod_virial_se_r_grad.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) file(GLOB OP_PY *.py) @@ -23,8 +24,22 @@ if (BUILD_CPP_IF) endif (BUILD_CPP_IF) if (BUILD_PY_IF) - add_library(op_abi SHARED ${OP_SRC} ${OP_LIB}) - add_library(op_grads SHARED ${OP_GRADS_SRC}) + set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) + set(CMAKE_INSTALL_RPATH DESTINATION ${DP_PIP_INSTALL_PATH} ${DP_SETUP_INSTALL_PATH} ${CMAKE_BINARY_DIR}/op/cuda) + if (USE_CUDA_TOOLKIT) + add_library(op_abi SHARED ${OP_PY_CUDA_SRC} ${OP_LIB}) + add_library(op_grads SHARED ${OP_GRADS_SRC}) + add_subdirectory(cuda) + find_package(CUDA REQUIRED) + include_directories(${CUDA_INCLUDE_DIRS}) + set (EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_cuda) + target_link_libraries (op_abi ${EXTRA_LIBS}) + target_link_libraries (op_grads ${EXTRA_LIBS}) + else (USE_CUDA_TOOLKIT) + add_library(op_abi SHARED ${OP_SRC} ${OP_LIB}) + add_library(op_grads SHARED ${OP_GRADS_SRC}) + endif(USE_CUDA_TOOLKIT) + message(STATUS ${TensorFlowFramework_LIBRARY}) target_link_libraries( op_abi 
${TensorFlowFramework_LIBRARY} ) diff --git a/source/op/_gelu.py b/source/op/_gelu.py new file mode 100644 index 0000000000..ac0585da78 --- /dev/null +++ b/source/op/_gelu.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +""" +First-order derivatives and second-order derivatives for gelu function. +""" + +from tensorflow.python.framework import ops +from deepmd.env import op_module + +@ops.RegisterGradient("Gelu") +def _gelu_cc (op, dy) : + return op_module.gelu_grad(dy, op.inputs[0]) + +@ops.RegisterGradient("GeluGrad") +def _gelu_grad_cc (op, dy) : + return [op_module.gelu_grad(dy, op.inputs[1]), op_module.gelu_grad_grad(dy, op.inputs[0], op.inputs[1])] diff --git a/source/op/cuda/CMakeLists.txt b/source/op/cuda/CMakeLists.txt index 25f796500b..89dd0b5922 100644 --- a/source/op/cuda/CMakeLists.txt +++ b/source/op/cuda/CMakeLists.txt @@ -80,9 +80,14 @@ else () endif() set (SOURCE_FILES - descrpt_se_a.cu descrpt_se_r.cu prod_force_se_a.cu prod_force_se_r.cu prod_virial_se_a.cu prod_virial_se_r.cu + descrpt_se_a.cu descrpt_se_r.cu prod_force_se_a.cu prod_force_se_r.cu prod_virial_se_a.cu prod_virial_se_r.cu gelu.cu ) cuda_add_library(deepmd_op_cuda SHARED ${SOURCE_FILES}) -install(TARGETS deepmd_op_cuda DESTINATION lib/) +if (BUILD_CPP_IF) + install(TARGETS deepmd_op_cuda DESTINATION lib/) +endif (BUILD_CPP_IF) +if (BUILD_PY_IF) + install(TARGETS deepmd_op_cuda DESTINATION deepmd/) +endif (BUILD_PY_IF) diff --git a/source/op/cuda/descrpt_se_a.cu b/source/op/cuda/descrpt_se_a.cu index 4b309522fa..1636df8ff5 100644 --- a/source/op/cuda/descrpt_se_a.cu +++ b/source/op/cuda/descrpt_se_a.cu @@ -18,8 +18,6 @@ limitations under the License. 
#include #include -#define MAGIC_NUMBER 1024 - #ifdef HIGH_PREC typedef double VALUETYPE; #else @@ -118,11 +116,12 @@ __global__ void format_nlist_fill_a_se_a(const VALUETYPE * coord, const int * jlist, const float rcut, int_64 * key, - int * i_idx) + int * i_idx, + const int MAGIC_NUMBER) { // <<>> - const unsigned int idx = blockIdx.x; - const unsigned int idy = threadIdx.x; + const unsigned int idx = blockIdx.y; + const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; const int nsize = jrange[i_idx[idx] + 1] - jrange[i_idx[idx]]; if (idy >= nsize) { @@ -154,7 +153,8 @@ __global__ void format_nlist_fill_b_se_a(int * nlist, int_64 * key, const int * sec_a, const int sec_a_size, - int * nei_iter_dev) + int * nei_iter_dev, + const int MAGIC_NUMBER) { const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; @@ -194,7 +194,6 @@ __global__ void compute_descriptor_se_a (VALUETYPE* descript, const VALUETYPE* coord, const VALUETYPE rmin, const VALUETYPE rmax, - compute_t* sel_a_diff_dev, const int sec_a_size) { // <<>> @@ -208,16 +207,14 @@ __global__ void compute_descriptor_se_a (VALUETYPE* descript, VALUETYPE * row_descript = descript + idx * ndescrpt; VALUETYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; VALUETYPE * row_rij = rij + idx * rij_size; - compute_t * sel_a_diff = sel_a_diff_dev + idx * nlist_size * 3; int * row_nlist = nlist + idx * nlist_size; if (row_nlist[idy] >= 0) { const int & j_idx = row_nlist[idy]; for (int kk = 0; kk < 3; kk++) { - sel_a_diff[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; - row_rij[idy * 3 + kk] = sel_a_diff[idy * 3 + kk]; + row_rij[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; } - const compute_t * rr = &sel_a_diff[idy * 3 + 0]; + const compute_t * rr = &row_rij[idy * 3 + 0]; compute_t nr2 = dev_dot(rr, rr); compute_t inr = 1./sqrt(nr2); compute_t nr = nr2 * inr; @@ -263,6 +260,166 @@ __global__ void compute_descriptor_se_a (VALUETYPE* descript, } } +void 
format_nbor_list_256 ( + const VALUETYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 256; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nblock, nloc); + format_nlist_fill_a_se_a + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 4; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +void format_nbor_list_512 ( + const VALUETYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 512; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nblock, nloc); + format_nlist_fill_a_se_a + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 4; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +void format_nbor_list_1024 ( + const VALUETYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 1024; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nblock, nloc); + format_nlist_fill_a_se_a + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 8; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +void format_nbor_list_2048 ( + const VALUETYPE* coord, + const int* type, + const int* jrange, 
+ const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 2048; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nblock, nloc); + format_nlist_fill_a_se_a + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 8; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +void format_nbor_list_4096 ( + const VALUETYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 4096; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nblock, nloc); + format_nlist_fill_a_se_a + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 16; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + void DescrptSeALauncher(const VALUETYPE* coord, const int* type, const int* ilist, @@ -270,22 +427,20 @@ void DescrptSeALauncher(const VALUETYPE* coord, const int* jlist, int* array_int, unsigned long long* array_longlong, - compute_t* array_double, const VALUETYPE* avg, const VALUETYPE* std, VALUETYPE* descript, VALUETYPE* descript_deriv, VALUETYPE* rij, int* nlist, - const int& ntypes, const int& nloc, - const int& nall, const int& nnei, const float& rcut_r, const float& rcut_r_smth, const int& ndescrpt, const std::vector& sec_a, - const bool& fill_nei_a + const bool& fill_nei_a, + const int MAGIC_NUMBER ) { const int LEN = 256; @@ -294,42 +449,76 @@ void DescrptSeALauncher(const VALUETYPE* coord, int * nei_iter = array_int + sec_a.size(); // = new int[sec_a_size]; int * i_idx = array_int + sec_a.size() + 
nloc * sec_a.size(); int_64 * key = array_longlong; - compute_t * sel_a_diff = array_double; // = new VALUETYPE *[nlist_size]; nnei - // int_64 * key = NULL; - // VALUETYPE * sel_a_diff = NULL; // = new VALUETYPE *[nlist_size]; nnei - + cudaError_t res = cudaSuccess; - // res = cudaMalloc((void**)&sec_a_dev, sizeof(int) * sec_a.size()); cudaErrcheck(res); - // res = cudaMalloc((void**)&nei_iter, sizeof(int) * nloc * sec_a.size()); cudaErrcheck(res); - // res = cudaMalloc((void**)&i_idx, sizeof(int) * nloc); cudaErrcheck(res); - // res = cudaMalloc((void**)&key, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2); cudaErrcheck(res); - // res = cudaMalloc((void**)&sel_a_diff, sizeof(VALUETYPE) * nnei * 3 * nloc); cudaErrcheck(res); res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res); res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res); res = cudaMemset(descript, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt); cudaErrcheck(res); res = cudaMemset(descript_deriv, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt * 3); cudaErrcheck(res); - // res = cudaMemset(rij, 0.0, sizeof(VALUETYPE) * nloc * nnei * 3); cudaErrcheck(res); if (fill_nei_a) { // ~~~ // cudaProfilerStart(); get_i_idx_se_a<<>> (nloc, ilist, i_idx); - format_nlist_fill_a_se_a<<>> ( - coord, - type, - jrange, - jlist, - rcut_r, - key, - i_idx - ); - const int ITEMS_PER_THREAD = 4; - const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; - // BlockSortKernel<<>> ( - BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); - + if (nnei <= 256) { + format_nbor_list_256 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 512) { + format_nbor_list_512 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 1024) { + format_nbor_list_1024 ( + coord, + type, + jrange, + 
jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 2048) { + format_nbor_list_2048 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 4096) { + format_nbor_list_4096 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } + format_nlist_fill_b_se_a<<>> ( nlist, nnei, @@ -339,7 +528,8 @@ void DescrptSeALauncher(const VALUETYPE* coord, key, sec_a_dev, sec_a.size(), - nei_iter + nei_iter, + MAGIC_NUMBER ); } @@ -360,14 +550,6 @@ void DescrptSeALauncher(const VALUETYPE* coord, coord, rcut_r_smth, rcut_r, - sel_a_diff, sec_a.back() ); -//// - // res = cudaFree(sec_a_dev); cudaErrcheck(res); - // res = cudaFree(key); cudaErrcheck(res); - // res = cudaFree(i_idx); cudaErrcheck(res); - // res = cudaFree(nei_iter); cudaErrcheck(res); - // res = cudaFree(sel_a_diff); cudaErrcheck(res); - //output some interesting things... } \ No newline at end of file diff --git a/source/op/cuda/gelu.cu b/source/op/cuda/gelu.cu new file mode 100644 index 0000000000..99b7b1aed4 --- /dev/null +++ b/source/op/cuda/gelu.cu @@ -0,0 +1,77 @@ +#include +#include + +#define SQRT_2_PI 0.7978845608028654 + +template +__global__ void gelu(const T * in, T * out, int const size) { + int const idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) {return;} + + out[idx] = in[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx]))); +} + +template +__global__ void gelu_grad(const T * dy, const T * in, T * out, int const size) { + int const idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) {return;} + + // out[idx] = in[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx]))); + T const var1 = tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx])); + out[idx] = dy[idx] * (0.5 * SQRT_2_PI * in[idx] * (1 - var1 * var1) * (0.134145 * in[idx] * in[idx] + 1) + 0.5 * var1 + 0.5); +} + +template +__global__ void 
gelu_grad_grad(const T * dy, const T * dy_, const T * in, T * out, int const size) { + int const idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) {return;} + + // out[idx] = in[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx]))); + T const var1 = tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx])); + T const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[idx] * in[idx] + 1); + + out[idx] = dy[idx] * dy_[idx] * (0.134145 * SQRT_2_PI * in[idx] * in[idx] * (1 - var1 * var1) - SQRT_2_PI * in[idx] * var2 * (0.134145 * in[idx] * in[idx] + 1) * var1 + var2); +} + + +void GeluLauncher(const float * in, float * out, int const size) { + int const THREAD_ITEMS = 1024; + int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + gelu<<>>(in, out, size); +} + +void GeluLauncher(const double * in, double * out, int const size) { + int const THREAD_ITEMS = 1024; + int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + gelu<<>>(in, out, size); +} + +void GeluGradLauncher(const float * dy, const float * in, float * out, int const size) { + int const THREAD_ITEMS = 1024; + int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + gelu_grad<<>>(dy, in, out, size); +} + +void GeluGradLauncher(const double * dy, const double * in, double * out, int const size) { + int const THREAD_ITEMS = 1024; + int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + gelu_grad<<>>(dy, in, out, size); +} + +void GeluGradGradLauncher(const float * dy, const float * dy_, const float * in, float * out, int const size) { + int const THREAD_ITEMS = 1024; + int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + gelu_grad_grad<<>>(dy, dy_, in, out, size); +} + +void GeluGradGradLauncher(const double * dy, const double * dy_, const double * in, double * out, int const size) { + int const THREAD_ITEMS = 1024; + int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + 
+ gelu_grad_grad<<>>(dy, dy_, in, out, size); +} diff --git a/source/op/descrpt_se_a_gpu.cc b/source/op/descrpt_se_a_gpu.cc index 70dd9c7751..fd5ae632cb 100644 --- a/source/op/descrpt_se_a_gpu.cc +++ b/source/op/descrpt_se_a_gpu.cc @@ -67,6 +67,24 @@ REGISTER_OP("DescrptSeA") .Output("nlist: int32"); #endif +int get_magic_number(int const nnei) { + if (nnei <= 256) { + return 256; + } + else if (nnei <= 512) { + return 512; + } + else if (nnei <= 1024) { + return 1024; + } + else if (nnei <= 2048) { + return 2048; + } + else if (nnei <= 4096) { + return 4096; + } +} + void DescrptSeALauncher(const VALUETYPE* coord, const int* type, const int* ilist, @@ -74,22 +92,20 @@ void DescrptSeALauncher(const VALUETYPE* coord, const int* jlist, int* array_int, unsigned long long* array_longlong, - compute_t* array_double, const VALUETYPE* avg, const VALUETYPE* std, VALUETYPE* descript, VALUETYPE* descript_deriv, VALUETYPE* rij, int* nlist, - const int& ntypes, const int& nloc, - const int& nall, const int& nnei, const float& rcut_r, const float& rcut_r_smth, const int& ndescrpt, const std::vector& sec_a, - const bool& fill_nei_a + const bool& fill_nei_a, + const int MAGIC_NUMBER ); class DescrptSeAOp : public OpKernel { @@ -112,6 +128,7 @@ class DescrptSeAOp : public OpKernel { nnei_r = sec_r.back(); nnei = nnei_a + nnei_r; fill_nei_a = (rcut_a < 0); + magic_number = get_magic_number(nnei); } void Compute(OpKernelContext* context) override { @@ -158,7 +175,7 @@ class DescrptSeAOp : public OpKernel { OP_REQUIRES (context, (ntypes == int(sel_a.size())), errors::InvalidArgument ("number of types should match the length of sel array")); OP_REQUIRES (context, (ntypes == int(sel_r.size())), errors::InvalidArgument ("number of types should match the length of sel array")); - OP_REQUIRES (context, (nnei <= 1024), errors::InvalidArgument ("Assert failed, max neighbor size of atom(nnei) " + std::to_string(nnei) + " is larger than 1024!, which currently is not supported by 
deepmd-kit.")); + OP_REQUIRES (context, (nnei <= 4096), errors::InvalidArgument ("Assert failed, max neighbor size of atom(nnei) " + std::to_string(nnei) + " is larger than 4096, which currently is not supported by deepmd-kit.")); // Create output tensors TensorShape descrpt_shape ; @@ -192,14 +209,23 @@ class DescrptSeAOp : public OpKernel { nlist_shape, &nlist_tensor)); - int * ilist = NULL, *jrange = NULL, *jlist = NULL; - int * array_int = NULL; unsigned long long *array_longlong = NULL; compute_t *array_double = NULL; + // allocate temp memory, temp memory must not be used after this operation! + Tensor int_temp; + TensorShape int_shape; + int_shape.AddDim(sec_a.size() + nloc * sec_a.size() + nloc); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); + + Tensor uint64_temp; + TensorShape uint64_shape; + uint64_shape.AddDim(nloc * magic_number * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp)); + + int * ilist = NULL, * jrange = NULL, * jlist = NULL; + int * array_int = int_temp.flat().data(); + unsigned long long * array_longlong = uint64_temp.flat().data(); cudaErrcheck(cudaMemcpy(&(ilist), 4 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); cudaErrcheck(cudaMemcpy(&(jrange), 8 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); cudaErrcheck(cudaMemcpy(&(jlist), 12 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(array_int), 16 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(array_longlong), 20 + mesh_tensor.flat().data(), sizeof(unsigned long long *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(array_double), 24 + mesh_tensor.flat().data(), sizeof(compute_t *), cudaMemcpyDeviceToHost)); // Launch computation for (int II = 0; II < nsamples; II++) { @@ -210,22 +236,20 @@ class DescrptSeAOp : public OpKernel { jlist, array_int, array_longlong, 
- array_double, avg_tensor.matrix().data(), std_tensor.matrix().data(), descrpt_tensor->matrix().data() + II * (nloc * ndescrpt), descrpt_deriv_tensor->matrix().data() + II * (nloc * ndescrpt * 3), rij_tensor->matrix().data() + II * (nloc * nnei * 3), nlist_tensor->matrix().data() + II * (nloc * nnei), - ntypes, nloc, - nall, nnei, rcut_r, rcut_r_smth, ndescrpt, sec_a, - fill_nei_a + fill_nei_a, + magic_number ); } // std::cout << "done" << std::endl; @@ -242,7 +266,7 @@ class DescrptSeAOp : public OpKernel { std::vector sec_a; std::vector sec_r; int ndescrpt, ndescrpt_a, ndescrpt_r; - int nnei, nnei_a, nnei_r, nloc, nall; + int nnei, nnei_a, nnei_r, nloc, nall, magic_number; bool fill_nei_a; //private func diff --git a/source/op/gelu.cc b/source/op/gelu.cc new file mode 100644 index 0000000000..26c53c8511 --- /dev/null +++ b/source/op/gelu.cc @@ -0,0 +1,164 @@ +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/shape_inference.h" +#define SQRT_2_PI 0.7978845608028654 + +using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +REGISTER_OP("Gelu") + .Attr("T: {float, double}") + .Input("x: T") + .Output("output: T"); + +REGISTER_OP("GeluGrad") + .Attr("T: {float, double}") + .Input("dy: T") + .Input("x: T") + .Output("output: T"); + +REGISTER_OP("GeluGradGrad") + .Attr("T: {float, double}") + .Input("dy: T") + .Input("dy_: T") + .Input("x: T") + .Output("output: T"); + +template +struct GeluFunctor { + void operator()(const Device& d, const T * in, T * out, int const size) { + #pragma omp parallel for + for (int ii = 0; ii < size; ii++) { + out[ii] = in[ii] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] * in[ii]))); + } + } +}; + +template +struct GeluGradFunctor { + void operator()(const Device& d, const T * dy, const T * in, T * out, int const size) { + 
#pragma omp parallel for + for (int ii = 0; ii < size; ii++) { + T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + out[ii] = dy[ii] * (0.5 * SQRT_2_PI * in[ii] * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1) + 0.5 * var1 + 0.5); + } + } +}; + +template +struct GeluGradGradFunctor { + void operator()(const Device& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { + #pragma omp parallel for + for (int ii = 0; ii < size; ii++) { + T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + T const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1); + + out[ii] = dy[ii] * dy_[ii] * (0.134145 * SQRT_2_PI * in[ii] * in[ii] * (1 - var1 * var1) - SQRT_2_PI * in[ii] * var2 * (0.134145 * in[ii] * in[ii] + 1) * var1 + var2); + } + } +}; + +// OpKernel definition. +// template parameter is the datatype of the tensors. +template +class GeluOp : public OpKernel { + public : + explicit GeluOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& x = context->input(0); + + Tensor * output = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + x.shape(), + &output)); + + GeluFunctor()( + context->eigen_device(), + x.flat().data(), + output->flat().data(), + static_cast(output->NumElements()) + ); + // GeluLauncher(x.flat().data(), output->flat().data(), static_cast(output->NumElements())); + } +}; + +// OpKernel definition. +// template parameter is the datatype of the tensors. 
+template +class GeluGradOp : public OpKernel { + public : + explicit GeluGradOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& dy = context->input(0); + const Tensor& x = context->input(1); + + Tensor * output = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + x.shape(), + &output)); + + GeluGradFunctor()( + context->eigen_device(), + dy.flat().data(), + x.flat().data(), + output->flat().data(), + static_cast(output->NumElements()) + ); + // GeluGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); + } +}; + +// OpKernel definition. +// template parameter is the datatype of the tensors. +template +class GeluGradGradOp : public OpKernel { + public : + explicit GeluGradGradOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& dy = context->input(0); + const Tensor& dy_ = context->input(1); + const Tensor& x = context->input(2); + + Tensor * output = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + x.shape(), + &output)); + + GeluGradGradFunctor()( + context->eigen_device(), + dy.flat().data(), + dy_.flat().data(), + x.flat().data(), + output->flat().data(), + static_cast(output->NumElements()) + ); + // GeluGradGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); + } +}; + +#define REGISTER_CPU(T) \ + /* Declare explicit instantiations in kernel_example.cu.cc. */ \ + REGISTER_KERNEL_BUILDER( \ + Name("Gelu").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluOp); \ + /* Declare explicit instantiations in kernel_example.cu.cc. 
*/ \ + REGISTER_KERNEL_BUILDER( \ + Name("GeluGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluGradOp); \ + /* Declare explicit instantiations in kernel_example.cu.cc. */ \ + REGISTER_KERNEL_BUILDER( \ + Name("GeluGradGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluGradGradOp); + REGISTER_CPU(float); + REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/gelu_gpu.cc b/source/op/gelu_gpu.cc new file mode 100644 index 0000000000..34d4183f98 --- /dev/null +++ b/source/op/gelu_gpu.cc @@ -0,0 +1,159 @@ +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/shape_inference.h" + +using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +REGISTER_OP("Gelu") + .Attr("T: {float, double}") + .Input("x: T") + .Output("output: T"); + +REGISTER_OP("GeluGrad") + .Attr("T: {float, double}") + .Input("dy: T") + .Input("x: T") + .Output("output: T"); + +REGISTER_OP("GeluGradGrad") + .Attr("T: {float, double}") + .Input("dy: T") + .Input("dy_: T") + .Input("x: T") + .Output("output: T"); + +// maybe instead use cudnn activation forward +void GeluLauncher(const float * in, float * out, int const size); +void GeluLauncher(const double * in, double * out, int const size); + +void GeluGradLauncher(const float * dy, const float * in, float * out, int const size); +void GeluGradLauncher(const double * dy, const double * in, double * out, int const size); + +void GeluGradGradLauncher(const float * dy, const float * dy_, const float * in, float * out, int const size); +void GeluGradGradLauncher(const double * dy, const double * dy_, const double * in, double * out, int const size); + +template +struct GeluFunctor { + void operator()(const Device& d, const T * in, T * out, int const size) { + GeluLauncher(in, out, size); + } +}; + +template +struct GeluGradFunctor { + void 
operator()(const Device& d, const T * dy, const T * in, T * out, int const size) { + GeluGradLauncher(dy, in, out, size); + } +}; + +template +struct GeluGradGradFunctor { + void operator()(const Device& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { + GeluGradGradLauncher(dy, dy_, in, out, size); + } +}; + +// OpKernel definition. +// template parameter is the datatype of the tensors. +template +class GeluOp : public OpKernel { + public : + explicit GeluOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& x = context->input(0); + Tensor * output = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + x.shape(), + &output)); + + GeluFunctor()( + context->eigen_device(), + x.flat().data(), + output->flat().data(), + static_cast(output->NumElements()) + ); + // GeluLauncher(x.flat().data(), output->flat().data(), static_cast(output->NumElements())); + } +}; + +// OpKernel definition. +// template parameter is the datatype of the tensors. +template +class GeluGradOp : public OpKernel { + public : + explicit GeluGradOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& dy = context->input(0); + const Tensor& x = context->input(1); + + Tensor * output = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + x.shape(), + &output)); + + GeluGradFunctor()( + context->eigen_device(), + dy.flat().data(), + x.flat().data(), + output->flat().data(), + static_cast(output->NumElements()) + ); + // GeluGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); + } +}; + +// OpKernel definition. +// template parameter is the datatype of the tensors. 
+template +class GeluGradGradOp : public OpKernel { + public : + explicit GeluGradGradOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& dy = context->input(0); + const Tensor& dy_ = context->input(1); + const Tensor& x = context->input(2); + + Tensor * output = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + x.shape(), + &output)); + + GeluGradGradFunctor()( + context->eigen_device(), + dy.flat().data(), + dy_.flat().data(), + x.flat().data(), + output->flat().data(), + static_cast(output->NumElements()) + ); + // GeluGradGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); + } +}; + +#define REGISTER_GPU(T) \ + /* Declare explicit instantiations in kernel_example.cu.cc. */ \ + REGISTER_KERNEL_BUILDER( \ + Name("Gelu").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluOp); \ + /* Declare explicit instantiations in kernel_example.cu.cc. */ \ + REGISTER_KERNEL_BUILDER( \ + Name("GeluGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluGradOp); \ + /* Declare explicit instantiations in kernel_example.cu.cc. 
*/ \ + REGISTER_KERNEL_BUILDER( \ + Name("GeluGradGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluGradGradOp); + REGISTER_GPU(float); + REGISTER_GPU(double); diff --git a/source/train/Data.py b/source/train/Data.py index f3ecaeeb62..149c76bbe8 100644 --- a/source/train/Data.py +++ b/source/train/Data.py @@ -97,6 +97,8 @@ def check_batch_size (self, batch_size) : tmpe = np.load(os.path.join(ii, "coord.npy")).astype(global_ener_float_precision) else: tmpe = np.load(os.path.join(ii, "coord.npy")).astype(global_np_float_precision) + if tmpe.ndim == 1: + tmpe = tmpe.reshape([1,-1]) if tmpe.shape[0] < batch_size : return ii, tmpe.shape[0] return None @@ -106,6 +108,8 @@ def check_test_size (self, test_size) : tmpe = np.load(os.path.join(self.test_dir, "coord.npy")).astype(global_ener_float_precision) else: tmpe = np.load(os.path.join(self.test_dir, "coord.npy")).astype(global_np_float_precision) + if tmpe.ndim == 1: + tmpe = tmpe.reshape([1,-1]) if tmpe.shape[0] < test_size : return self.test_dir, tmpe.shape[0] else : @@ -271,6 +275,8 @@ def _load_set(self, set_name) : coord = np.load(path).astype(global_ener_float_precision) else: coord = np.load(path).astype(global_np_float_precision) + if coord.ndim == 1: + coord = coord.reshape([1,-1]) nframes = coord.shape[0] assert(coord.shape[1] == self.data_dict['coord']['ndof'] * self.natoms) # load keys diff --git a/source/train/DescrptSeA.py b/source/train/DescrptSeA.py index d409f7134f..e46265231f 100644 --- a/source/train/DescrptSeA.py +++ b/source/train/DescrptSeA.py @@ -17,6 +17,7 @@ def __init__ (self, jdata): .add('resnet_dt',bool, default = False) \ .add('trainable',bool, default = True) \ .add('seed', int) \ + .add('type_one_side', bool, default = False) \ .add('exclude_types', list, default = []) \ .add('set_davg_zero', bool, default = False) \ .add('activation_function', str, default = 'tanh') \ @@ -39,6 +40,9 @@ def __init__ (self, jdata): self.exclude_types.add((tt[0], tt[1])) self.exclude_types.add((tt[1], 
tt[0])) self.set_davg_zero = class_data['set_davg_zero'] + self.type_one_side = class_data['type_one_side'] + if self.type_one_side and len(exclude_types) != 0: + raise RuntimeError('"type_one_side" is not compatible with "exclude_types"') # descrpt config self.sel_r = [ 0 for ii in range(len(self.sel_a)) ] @@ -244,17 +248,27 @@ def _pass_filter(self, inputs = tf.reshape(inputs, [-1, self.ndescrpt * natoms[0]]) output = [] output_qmat = [] - for type_i in range(self.ntypes): - inputs_i = tf.slice (inputs, - [ 0, start_index* self.ndescrpt], - [-1, natoms[2+type_i]* self.ndescrpt] ) + if not self.type_one_side: + for type_i in range(self.ntypes): + inputs_i = tf.slice (inputs, + [ 0, start_index* self.ndescrpt], + [-1, natoms[2+type_i]* self.ndescrpt] ) + inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt]) + layer, qmat = self._filter(tf.cast(inputs_i, self.filter_precision), type_i, name='filter_type_'+str(type_i)+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable, activation_fn = self.filter_activation_fn) + layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_out()]) + qmat = tf.reshape(qmat, [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_rot_mat_1() * 3]) + output.append(layer) + output_qmat.append(qmat) + start_index += natoms[2+type_i] + else : + inputs_i = inputs inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt]) - layer, qmat = self._filter(tf.cast(inputs_i, self.filter_precision), type_i, name='filter_type_'+str(type_i)+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable, activation_fn = self.filter_activation_fn) - layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_out()]) - qmat = tf.reshape(qmat, [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_rot_mat_1() * 3]) + type_i = -1 + layer, qmat = self._filter(tf.cast(inputs_i, self.filter_precision), type_i, name='filter_type_all'+suffix, natoms=natoms, reuse=reuse, seed = self.seed, 
trainable = trainable, activation_fn = self.filter_activation_fn) + layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[0] * self.get_dim_out()]) + qmat = tf.reshape(qmat, [tf.shape(inputs)[0], natoms[0] * self.get_dim_rot_mat_1() * 3]) output.append(layer) output_qmat.append(qmat) - start_index += natoms[2+type_i] output = tf.concat(output, axis = 1) output_qmat = tf.concat(output_qmat, axis = 1) return output, output_qmat @@ -353,6 +367,7 @@ def _filter(self, self.filter_precision, tf.random_normal_initializer(stddev=stddev, mean = bavg, seed = seed), trainable = trainable) + hidden = tf.reshape(activation_fn(tf.matmul(xyz_scatter, w) + b), [-1, outputs_size[ii]]) if self.filter_resnet_dt : idt = tf.get_variable('idt_'+str(ii)+'_'+str(type_i), [1, outputs_size[ii]], @@ -361,16 +376,16 @@ def _filter(self, trainable = trainable) if outputs_size[ii] == outputs_size[ii-1]: if self.filter_resnet_dt : - xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) * idt + xyz_scatter += hidden * idt else : - xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) + xyz_scatter += hidden elif outputs_size[ii] == outputs_size[ii-1] * 2: if self.filter_resnet_dt : - xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + activation_fn(tf.matmul(xyz_scatter, w) + b) * idt + xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + hidden * idt else : - xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + activation_fn(tf.matmul(xyz_scatter, w) + b) + xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + hidden else: - xyz_scatter = activation_fn(tf.matmul(xyz_scatter, w) + b) + xyz_scatter = hidden else: w = tf.zeros((outputs_size[0], outputs_size[-1]), dtype=global_tf_float_precision) xyz_scatter = tf.matmul(xyz_scatter, w) @@ -440,6 +455,7 @@ def _filter_type_ext(self, self.filter_precision, tf.random_normal_initializer(stddev=stddev, mean = bavg, seed = seed), trainable = trainable) + hidden = tf.reshape(activation_fn(tf.matmul(xyz_scatter, w) + b), [-1, 
outputs_size[ii]]) if self.filter_resnet_dt : idt = tf.get_variable('idt_'+str(ii)+'_'+str(type_i), [1, outputs_size[ii]], @@ -448,16 +464,16 @@ def _filter_type_ext(self, trainable = trainable) if outputs_size[ii] == outputs_size[ii-1]: if self.filter_resnet_dt : - xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) * idt + xyz_scatter += hidden * idt else : - xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) + xyz_scatter += hidden elif outputs_size[ii] == outputs_size[ii-1] * 2: if self.filter_resnet_dt : - xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + activation_fn(tf.matmul(xyz_scatter, w) + b) * idt + xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + hidden * idt else : - xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + activation_fn(tf.matmul(xyz_scatter, w) + b) + xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + hidden else: - xyz_scatter = activation_fn(tf.matmul(xyz_scatter, w) + b) + xyz_scatter = hidden # natom x nei_type_i x out_size xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1]//4, outputs_size[-1])) # natom x nei_type_i x 4 diff --git a/source/train/DescrptSeR.py b/source/train/DescrptSeR.py index ed53a3bcd4..5c4af52d4d 100644 --- a/source/train/DescrptSeR.py +++ b/source/train/DescrptSeR.py @@ -298,6 +298,7 @@ def _filter_r(self, self.filter_precision, tf.random_normal_initializer(stddev=stddev, mean = bavg, seed = seed), trainable = trainable) + hidden = tf.reshape(activation_fn(tf.matmul(xyz_scatter, w) + b), [-1, outputs_size[ii]]) if self.filter_resnet_dt : idt = tf.get_variable('idt_'+str(ii)+'_'+str(type_i), [1, outputs_size[ii]], @@ -306,16 +307,16 @@ def _filter_r(self, trainable = trainable) if outputs_size[ii] == outputs_size[ii-1]: if self.filter_resnet_dt : - xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) * idt + xyz_scatter += hidden * idt else : - xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) + xyz_scatter += hidden elif outputs_size[ii] == 
outputs_size[ii-1] * 2: if self.filter_resnet_dt : - xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + activation_fn(tf.matmul(xyz_scatter, w) + b) * idt + xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + hidden * idt else : - xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + activation_fn(tf.matmul(xyz_scatter, w) + b) + xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + hidden else: - xyz_scatter = activation_fn(tf.matmul(xyz_scatter, w) + b) + xyz_scatter = hidden else: w = tf.zeros((outputs_size[0], outputs_size[-1]), dtype=global_tf_float_precision) xyz_scatter = tf.matmul(xyz_scatter, w) diff --git a/source/train/Fitting.py b/source/train/Fitting.py index 49ca40b2c7..1effa8d7c2 100644 --- a/source/train/Fitting.py +++ b/source/train/Fitting.py @@ -194,12 +194,14 @@ def build (self, if self.numb_fparam > 0 : ext_fparam = tf.tile(fparam, [1, natoms[2+type_i]]) ext_fparam = tf.reshape(ext_fparam, [-1, self.numb_fparam]) + ext_fparam = tf.cast(ext_fparam,self.fitting_precision) layer = tf.concat([layer, ext_fparam], axis = 1) if self.numb_aparam > 0 : ext_aparam = tf.slice(aparam, [ 0, start_index * self.numb_aparam], [-1, natoms[2+type_i] * self.numb_aparam]) ext_aparam = tf.reshape(ext_aparam, [-1, self.numb_aparam]) + ext_aparam = tf.cast(ext_aparam,self.fitting_precision) layer = tf.concat([layer, ext_aparam], axis = 1) start_index += natoms[2+type_i] diff --git a/source/train/Network.py b/source/train/Network.py index ed188c085d..1afa83bd1c 100644 --- a/source/train/Network.py +++ b/source/train/Network.py @@ -41,9 +41,9 @@ def one_layer(inputs, # return activation_fn(hidden_bn) else: if use_timestep : - return activation_fn(hidden) * idt + return tf.reshape(activation_fn(hidden), [-1, outputs_size]) * idt else : - return activation_fn(hidden) + return tf.reshape(activation_fn(hidden), [-1, outputs_size]) else: if useBN: None diff --git a/source/train/Trainer.py b/source/train/Trainer.py index 50db1adfbc..b6428c987e 100644 --- 
a/source/train/Trainer.py +++ b/source/train/Trainer.py @@ -28,6 +28,7 @@ import deepmd._prod_virial_se_r_grad import deepmd._soft_min_force_grad import deepmd._soft_min_virial_grad +import deepmd._gelu from deepmd.common import j_must_have, ClassArg diff --git a/source/train/calculator.py b/source/train/calculator.py index 37fb7ad412..de439c3f78 100644 --- a/source/train/calculator.py +++ b/source/train/calculator.py @@ -42,10 +42,13 @@ def __init__(self, model, label="DP", type_dict=None, **kwargs): def calculate(self, atoms=None, properties=["energy", "forces", "stress"], system_changes=all_changes): coord = atoms.get_positions().reshape([1, -1]) - cell = atoms.get_cell().reshape([1, -1]) + if sum(atoms.get_pbc())>0: + cell = atoms.get_cell().reshape([1, -1]) + else: + cell = None symbols = atoms.get_chemical_symbols() atype = [self.type_dict[k] for k in symbols] - e, f, v = self.dp.eval(coord, cell, atype) + e, f, v = self.dp.eval(coords=coord, cells=cell, atom_types=atype) self.results['energy'] = e[0] self.results['forces'] = f[0] self.results['stress'] = v[0] diff --git a/source/train/common.py b/source/train/common.py index c250e4cdfc..887669a278 100644 --- a/source/train/common.py +++ b/source/train/common.py @@ -2,19 +2,22 @@ import numpy as np import math from deepmd.env import tf +from deepmd.env import op_module from deepmd.RunOptions import global_tf_float_precision -def gelu(x): - """Gaussian Error Linear Unit. - This is a smoother version of the RELU. - Original paper: https://arxiv.org/abs/1606.08415 - Args: - x: float Tensor to perform activation. - Returns: - `x` with the GELU activation applied. - """ - cdf = 0.5 * (1.0 + tf.tanh((math.sqrt(2 / math.pi) * (x + 0.044715 * tf.pow(x, 3))))) - return x * cdf +# def gelu(x): +# """Gaussian Error Linear Unit. +# This is a smoother version of the RELU. +# Original paper: https://arxiv.org/abs/1606.08415 +# Args: +# x: float Tensor to perform activation. 
+# Returns: +# `x` with the GELU activation applied. +# """ +# cdf = 0.5 * (1.0 + tf.tanh((math.sqrt(2 / math.pi) * (x + 0.044715 * tf.pow(x, 3))))) +# return x * cdf +def gelu(x) : + return op_module.gelu(x) data_requirement = {} activation_fn_dict = { @@ -40,7 +43,6 @@ def add_data_requirement(key, 'repeat': repeat, } - def select_idx_map(atom_type, type_sel): sort_type_sel = np.sort(type_sel) diff --git a/source/train/test.py b/source/train/test.py index 2674e9c01e..d8639020ce 100644 --- a/source/train/test.py +++ b/source/train/test.py @@ -32,12 +32,12 @@ def test (args): dp = DeepWFC(args.model) else : raise RuntimeError('unknow model type '+de.model_type) - for ii in all_sys: + for cc,ii in enumerate(all_sys): args.system = ii print ("# ---------------output of dp test--------------- ") print ("# testing system : " + ii) if de.model_type == 'ener': - err, siz = test_ener(dp, args) + err, siz = test_ener(dp, args, append_detail = (cc!=0)) elif de.model_type == 'dipole': err, siz = test_dipole(dp, args) elif de.model_type == 'polar': @@ -89,7 +89,14 @@ def weighted_average(err_coll, siz_coll): return sum_err -def test_ener (dp, args) : +def save_txt_file(fname, data, header = "", append = False): + fp = fname + if append : fp = open(fp, 'ab') + np.savetxt(fp, data, header = header) + if append : fp.close() + + +def test_ener (dp, args, append_detail = False) : if args.rand_seed is not None : np.random.seed(args.rand_seed % (2**32)) @@ -122,10 +129,7 @@ def test_ener (dp, args) : else : aparam = None detail_file = args.detail_file - if detail_file is not None: - atomic = True - else: - atomic = False + atomic = False ret = dp.eval(coord, box, atype, fparam = fparam, aparam = aparam, atomic = atomic) energy = ret[0] @@ -158,18 +162,21 @@ def test_ener (dp, args) : pe = np.concatenate((np.reshape(test_data["energy"][:numb_test], [-1,1]), np.reshape(energy, [-1,1])), axis = 1) - np.savetxt(detail_file+".e.out", pe, - header = 'data_e pred_e') + 
save_txt_file(detail_file+".e.out", pe, + header = '%s: data_e pred_e' % args.system, + append = append_detail) pf = np.concatenate((np.reshape(test_data["force"] [:numb_test], [-1,3]), np.reshape(force, [-1,3])), axis = 1) - np.savetxt(detail_file+".f.out", pf, - header = 'data_fx data_fy data_fz pred_fx pred_fy pred_fz') + save_txt_file(detail_file+".f.out", pf, + header = '%s: data_fx data_fy data_fz pred_fx pred_fy pred_fz' % args.system, + append = append_detail) pv = np.concatenate((np.reshape(test_data["virial"][:numb_test], [-1,9]), np.reshape(virial, [-1,9])), axis = 1) - np.savetxt(detail_file+".v.out", pv, - header = 'data_vxx data_vxy data_vxz data_vyx data_vyy data_vyz data_vzx data_vzy data_vzz pred_vxx pred_vxy pred_vxz pred_vyx pred_vyy pred_vyz pred_vzx pred_vzy pred_vzz') + save_txt_file(detail_file+".v.out", pv, + header = '%s: data_vxx data_vxy data_vxz data_vyx data_vyy data_vyz data_vzx data_vzy data_vzz pred_vxx pred_vxy pred_vxz pred_vyx pred_vyy pred_vyz pred_vzx pred_vzy pred_vzz' % args.system, + append = append_detail) return [l2ea, l2f, l2va], [energy.size, force.size, virial.size] diff --git a/source/train/transform.py b/source/train/transform.py index 6ae7723f7e..1d587ae531 100644 --- a/source/train/transform.py +++ b/source/train/transform.py @@ -1,6 +1,21 @@ from deepmd.env import tf import re import numpy as np + +def convertNumber(number): + binary = bin(number).replace("0b", "").zfill(16) + sign = int(binary[0]) * (-2) + 1 + exp = int(binary[1:6], 2) + frac = (int(binary[6:], 2) + 2 ** 10) * (2 ** -10) + return sign * (2 ** (exp - 15)) * frac + + +def convertMatrix(matrix, shape): + matrix = matrix.flatten() + tmp = np.array([convertNumber(matrix[i]) for i in range(len(matrix))]) + return tmp.reshape(shape) + + def transform(args): raw_graph = load_graph(args.raw_model) old_graph = load_graph(args.old_model) @@ -34,31 +49,59 @@ def transform_graph(raw_graph,old_graph): for node in raw_graph_def.node: if node.name in 
raw_graph_node.keys(): - if precision_dict[old_graph_node[node.name].dtype][1] == "float16" or precision_dict[raw_graph_node[node.name].dtype][1] == "float16": - raise RuntimeError("float16 conversions not currently supported") check_dim(raw_graph_node, old_graph_node, node.name) + tensor_shape = [dim.size for dim in raw_graph_node[node.name].tensor_shape.dim] + old_graph_dtype = precision_dict[old_graph_node[node.name].dtype] + raw_graph_dtype = precision_dict[raw_graph_node[node.name].dtype] + print("%s is passed from old graph(%s) to raw graph(%s)" % (node.name, old_graph_dtype[1],raw_graph_dtype[1])) + + if raw_graph_dtype[1] == "float16": + if old_graph_dtype[1] == "float64" or old_graph_dtype[1] == "float32": + if (len(tensor_shape) != 1) or (tensor_shape[0] != 1): + tensor_value = np.frombuffer(old_graph_node[node.name].tensor_content, dtype=old_graph_dtype[0]) + tensor_value = tensor_value.astype(np.float16) + node.attr["value"].CopyFrom(tf.AttrValue(tensor=tf.make_tensor_proto(tensor_value, tf.float16, tensor_shape))) - if re.fullmatch("final_layer_type_\d+/bias",node.name) == None: - tensor_value = np.frombuffer(old_graph_node[node.name].tensor_content,dtype = precision_dict[old_graph_node[node.name].dtype][0]) - tensor_value = tensor_value.astype(dtype=precision_dict[raw_graph_node[node.name].dtype][0]) - node.attr["value"].tensor.tensor_content = tensor_value.tostring() + else: + if old_graph_dtype[1] == "float64": + tensor_value = (np.array(old_graph_node[node.name].double_val)).astype(np.float16) + node.attr["value"].CopyFrom(tf.AttrValue(tensor=tf.make_tensor_proto(tensor_value,tf.float16, [1]))) - else: - if precision_dict[old_graph_node[node.name].dtype][1] == "float64": - tensor_value = (np.array(old_graph_node[node.name].double_val)).astype(precision_dict[raw_graph_node[node.name].dtype][0]) - node.attr["value"].CopyFrom(tf.AttrValue(tensor=tf.make_tensor_proto(tensor_value,precision_dict[raw_graph_node[node.name].dtype][0], [1]))) - - elif 
precision_dict[old_graph_node[node.name].dtype][1] == "float32": - tensor_value = (np.array(old_graph_node[node.name].float_val)).astype(precision_dict[raw_graph_node[node.name].dtype][0]) - node.attr["value"].CopyFrom(tf.AttrValue(tensor=tf.make_tensor_proto(tensor_value, precision_dict[raw_graph_node[node.name].dtype][0], [1]))) - - elif precision_dict[old_graph_node[node.name].dtype][1] == "float16": - tensor_value = (np.array(old_graph_node[node.name].half_val)).astype(precision_dict[raw_graph_node[node.name].dtype][0]) - node.attr["value"].CopyFrom(tf.AttrValue(tensor=tf.make_tensor_proto(tensor_value, precision_dict[raw_graph_node[node.name].dtype][0], [1]))) + elif old_graph_dtype[1] == "float32": + tensor_value = (np.array(old_graph_node[node.name].float_val)).astype(np.float16) + node.attr["value"].CopyFrom(tf.AttrValue(tensor=tf.make_tensor_proto(tensor_value,tf.float16, [1]))) + + elif old_graph_dtype[1] == "float16": + tensor_value = convertMatrix(np.array(old_graph_node[node.name].half_val), tensor_shape) + node.attr["value"].CopyFrom(tf.AttrValue(tensor=tf.make_tensor_proto(tensor_value, tf.float16, tensor_value.shape))) - print("%s is passed from old graph(%s) to raw graph(%s)" % (node.name,precision_dict[old_graph_node[node.name].dtype][1],precision_dict[raw_graph_node[node.name].dtype][1])) - + elif raw_graph_dtype[1] == "float64" or raw_graph_dtype[1] == "float32": + if old_graph_dtype[1] == "float64" or old_graph_dtype[1] == "float32": + if (len(tensor_shape) != 1) or (tensor_shape[0] != 1): + tensor_value = np.frombuffer(old_graph_node[node.name].tensor_content,dtype = old_graph_dtype[0]) + tensor_value = tensor_value.astype(dtype=raw_graph_dtype[0]) + node.attr["value"].tensor.tensor_content = tensor_value.tostring() + + else: + if old_graph_dtype[1] == "float64": + tensor_value = (np.array(old_graph_node[node.name].double_val)).astype(raw_graph_dtype[0]) + 
node.attr["value"].CopyFrom(tf.AttrValue(tensor=tf.make_tensor_proto(tensor_value,raw_graph_dtype[0], [1]))) + + elif old_graph_dtype[1] == "float32": + tensor_value = (np.array(old_graph_node[node.name].float_val)).astype(raw_graph_dtype[0]) + node.attr["value"].CopyFrom(tf.AttrValue(tensor=tf.make_tensor_proto(tensor_value,raw_graph_dtype[0], [1]))) + + elif old_graph_dtype[1] == "float16": + if (len(tensor_shape) != 1) or (tensor_shape[0] != 1): + tensor_value = convertMatrix(np.array(old_graph_node[node.name].half_val), tensor_shape) + tensor_value = tensor_value.astype(raw_graph_dtype[0]) + node.attr["value"].tensor.tensor_content = tensor_value.tostring() + else: + tensor_value = convertMatrix(np.array(old_graph_node[node.name].half_val), tensor_shape) + tensor_value = tensor_value.astype(raw_graph_dtype[0]) + node.attr["value"].CopyFrom(tf.AttrValue(tensor=tf.make_tensor_proto(tensor_value,raw_graph_dtype[0], tensor_value.shape))) + return raw_graph_def def check_dim(raw_graph_node, old_graph_node, node_name): @@ -77,8 +120,16 @@ def load_transform_node(graph): layer_\d+_type_\d+/matrix|\ layer_\d+_type_\d+/bias|\ layer_\d+_type_\d+/idt|\ +final_layer_type_\d+/matrix|\ +descrpt_attr/t_avg|\ +descrpt_attr/t_std|\ final_layer_type_\d+/bias|\ -final_layer_type_\d+/matrix\ +fitting_attr/t_fparam_avg|\ +fitting_attr/t_fparam_istd|\ +fitting_attr/t_aparam_avg|\ +fitting_attr/t_aparam_istd|\ +model_attr/t_tab_info|\ +model_attr/t_tab_data|\ " for node in graph.node: if re.fullmatch(transform_node_pattern,node.name) != None: