diff --git a/.travis.yml b/.travis.yml index 21b69b37ec..4db9e48dd4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,9 +14,10 @@ addons: - g++-7 - gcc-8 - g++-8 -matrix: +jobs: include: - - python: 3.6 + - stage: unit tests + python: 3.6 env: - CC=gcc-4.8 - CXX=g++-4.8 @@ -65,14 +66,31 @@ matrix: env: - CC=gcc-5 - CXX=g++-5 - - TENSORFLOW_VERSION=2.0 + - TENSORFLOW_VERSION=2.1 - python: 3.7 env: - CC=gcc-8 - CXX=g++-8 - - TENSORFLOW_VERSION=2.0 + - TENSORFLOW_VERSION=2.1 + - stage: build whls + services: docker + env: + - TWINE_USERNAME=__token__ + - CIBW_BUILD="cp36-* cp37-*" + - CIBW_BEFORE_BUILD="pip install tensorflow && sed -i 's/libresolv.so.2\"/libresolv.so.2\", \"libtensorflow_framework.so.2\"/g' \$(find / -name policy.json)" + - CIBW_SKIP="*-win32 *-manylinux_i686" + - CC=gcc-7 + - CXX=g++-7 + - TENSORFLOW_VERSION=2.1 + install: + - python -m pip install twine cibuildwheel==1.1.0 scikit-build + script: + - python -m cibuildwheel --output-dir wheelhouse + - python setup.py sdist + after_success: + - if [[ $TRAVIS_TAG ]]; then python -m twine upload wheelhouse/*; python -m twine upload dist/*.tar.gz; fi before_install: -# - pip install --upgrade pip + #- pip install --upgrade pip - pip install --upgrade setuptools - pip install tensorflow==$TENSORFLOW_VERSION install: diff --git a/README.md b/README.md index a6187acc03..178758f5db 100644 --- a/README.md +++ b/README.md @@ -111,18 +111,17 @@ Both CPU and GPU version offline package are avaiable in [the Releases page](htt ## Install the python interface ### Install the Tensorflow's python interface -First, check the python version and compiler version on your machine +First, check the python version on your machine ```bash -python --version; gcc --version +python --version ``` -If your python version is 3.7.x, it is highly recommended that the GNU C/C++ compiler is higher than or equal to 5.0. We follow the virtual environment approach to install the tensorflow's Python interface. 
The full instruction can be found on [the tensorflow's official website](https://www.tensorflow.org/install/pip). Now we assume that the Python interface will be installed to virtual environment directory `$tensorflow_venv` ```bash virtualenv -p python3 $tensorflow_venv source $tensorflow_venv/bin/activate pip install --upgrade pip -pip install --upgrade tensorflow==1.14.0 +pip install --upgrade tensorflow==2.1.0 ``` It is notice that everytime a new shell is started and one wants to use `DeePMD-kit`, the virtual environment should be activated by ```bash @@ -136,31 +135,21 @@ If one has multiple python interpreters named like python3.x, it can be specifie ```bash virtualenv -p python3.7 $tensorflow_venv ``` -If one needs the GPU support of deepmd-kit, the GPU version of tensorflow should be installed by -```bash -pip install --upgrade tensorflow-gpu==1.14.0 +If one does not need the GPU support of deepmd-kit and is concerned about package size, the CPU-only version of tensorflow should be installed by +```bash +pip install --upgrade tensorflow-cpu==2.1.0 ``` To verify the installation, run ```bash -python -c "import tensorflow as tf; sess=tf.Session(); print(sess.run(tf.reduce_sum(tf.random_normal([1000, 1000]))))" +python -c "import tensorflow as tf;print(tf.reduce_sum(tf.random.normal([1000, 1000])))" ``` One should remember to activate the virtual environment every time he/she uses deepmd-kit. ### Install the DeePMD-kit's python interface -Clone the DeePMD-kit source code -```bash -cd /some/workspace -git clone --recursive https://github.com/deepmodeling/deepmd-kit.git deepmd-kit -b devel -``` -If one downloads the .zip file from the github, then the default folder of source code would be `deepmd-kit-master` rather than `deepmd-kit`. For convenience, you may want to record the location of source to a variable, saying `deepmd_source_dir` by -```bash -cd deepmd-kit -deepmd_source_dir=`pwd` -``` -Then execute +Execute ```bash -pip install . 
+pip install deepmd-kit ``` To test the installation, one may execute ```bash @@ -189,11 +178,30 @@ If one does not need to use DeePMD-kit with Lammps or I-Pi, then the python inte ### Install the Tensorflow's C++ interface -It is highly recommended that one keeps the same C/C++ compiler as the python interface. The C++ interface of DeePMD-kit was tested with compiler gcc >= 4.8. It is noticed that the I-Pi support is only compiled with gcc >= 4.9. +Check the compiler version on your machine + +``` +gcc --version +``` + +The C++ interface of DeePMD-kit was tested with compiler gcc >= 4.8. It is noticed that the I-Pi support is only compiled with gcc >= 4.9. First the C++ interface of Tensorflow should be installed. It is noted that the version of Tensorflow should be in consistent with the python interface. We assume that you have followed our instruction and installed tensorflow python interface 1.14.0 with, then you may follow [the instruction for CPU](doc/install-tf.1.14.md) to install the corresponding C++ interface (CPU only). If one wants GPU supports, he/she should follow [the instruction for GPU](doc/install-tf.1.14-gpu.md) to install the C++ interface. ### Install the DeePMD-kit's C++ interface + +Clone the DeePMD-kit source code +```bash +cd /some/workspace +git clone --recursive https://github.com/deepmodeling/deepmd-kit.git deepmd-kit +``` + +For convenience, you may want to record the location of source to a variable, saying `deepmd_source_dir` by +```bash +cd deepmd-kit +deepmd_source_dir=`pwd` +``` + Now goto the source code directory of DeePMD-kit and make a build place. ```bash cd $deepmd_source_dir/source @@ -437,8 +445,6 @@ positional arguments: optional arguments: -h, --help show this help message and exit - -t INTER_THREADS, --inter-threads INTER_THREADS - With default value 0. 
Setting the "inter_op_parallelism_threads" key for the tensorflow, the "intra_op_parallelism_threads" will be set by the env variable OMP_NUM_THREADS --init-model INIT_MODEL Initialize a model by the provided checkpoint --restart RESTART Restart the training from the provided checkpoint @@ -449,6 +455,15 @@ The keys `intra_op_parallelism_threads` and `inter_op_parallelism_threads` are T **`--restart model.ckpt`**, continues the training from the checkpoint `model.ckpt`. +On some resources limited machines, one may want to control the number of threads used by DeePMD-kit. This is achieved by three environmental variables: `OMP_NUM_THREADS`, `TF_INTRA_OP_PARALLELISM_THREADS` and `TF_INTER_OP_PARALLELISM_THREADS`. `OMP_NUM_THREADS` controls the multithreading of DeePMD-kit implemented operations. `TF_INTRA_OP_PARALLELISM_THREADS` and `TF_INTER_OP_PARALLELISM_THREADS` controls `intra_op_parallelism_threads` and `inter_op_parallelism_threads`, which are Tensorflow configurations for multithreading. An explanation is found [here](https://stackoverflow.com/questions/41233635/meaning-of-inter-op-parallelism-threads-and-intra-op-parallelism-threads). + +For example if you wish to use 3 cores of 2 CPUs on one node, you may set the environmental variables and run DeePMD-kit as follows: +```bash +export OMP_NUM_THREADS=6 +export TF_INTRA_OP_PARALLELISM_THREADS=3 +export TF_INTER_OP_PARALLELISM_THREADS=2 +dp train input.json +``` ## Freeze a model @@ -606,18 +621,6 @@ rm -r * ``` and redo the `cmake` process. -## Training: TensorFlow abi binary cannot be found when doing training -If you confront such kind of error: - -``` -$deepmd_root/lib/deepmd/libop_abi.so: undefined symbol: -_ZN10tensorflow8internal21CheckOpMessageBuilder9NewStringB5cxx11Ev -``` - -This may happen if you are using a gcc >= 5.0, and tensorflow was compiled with gcc < 5.0. You may set `-DOP_CXX_ABI=0` in the process of `cmake`. 
- -Another possible reason might be the large gap between the python version of TensorFlow and the TensorFlow c++ interface. - ## MD: cannot run LAMMPS after installing a new version of DeePMD-kit This typically happens when you install a new version of DeePMD-kit and copy directly the generated `USER-DEEPMD` to a LAMMPS source code folder and re-install LAMMPS. diff --git a/doc/install-tf.1.14.md b/doc/install-tf.1.14.md index bfb5fe6717..b863a50cb6 100644 --- a/doc/install-tf.1.14.md +++ b/doc/install-tf.1.14.md @@ -42,6 +42,7 @@ Now, copy the libraries to the tensorflow's installation directory: mkdir $tensorflow_root/lib cp -d bazel-bin/tensorflow/libtensorflow_cc.so* $tensorflow_root/lib/ cp -d bazel-bin/tensorflow/libtensorflow_framework.so* $tensorflow_root/lib/ +cp -d $tensorflow_root/lib/libtensorflow_framework.so.1 $tensorflow_root/lib/libtensorflow_framework.so ``` Then copy the headers ```bash diff --git a/examples/water/train/polar.json b/examples/water/train/polar.json index 6a1558b124..60e3fa3494 100644 --- a/examples/water/train/polar.json +++ b/examples/water/train/polar.json @@ -31,9 +31,9 @@ "learning_rate" :{ "type": "exp", - "start_lr": 0.001, "decay_steps": 5000, - "decay_rate": 0.95, + "start_lr": 0.001, + "stop_lr": 3.51e-8, "_comment": "that's all" }, diff --git a/examples/water/train/polar_se_a.json b/examples/water/train/polar_se_a.json index 55899e564d..dc90e481ce 100644 --- a/examples/water/train/polar_se_a.json +++ b/examples/water/train/polar_se_a.json @@ -3,7 +3,7 @@ "_comment": " model parameters", "model":{ "type_map": ["O", "H"], - "data_stat_nbatch": 1, + "data_stat_nbatch": 10, "descriptor" :{ "type": "se_a", "sel": [46, 92], @@ -18,7 +18,7 @@ "fitting_net": { "type": "polar", "sel_type": [0], - "fit_diag": true, + "fit_diag": false, "neuron": [100, 100, 100], "resnet_dt": true, "seed": 1, @@ -29,9 +29,9 @@ "learning_rate" :{ "type": "exp", - "start_lr": 0.01, "decay_steps": 5000, - "decay_rate": 0.95, + "start_lr": 0.01, + 
"stop_lr": 3.51e-7, "_comment": "that's all" }, diff --git a/examples/water/train/wannier.json b/examples/water/train/wannier.json index e969675989..f23f5e0d62 100644 --- a/examples/water/train/wannier.json +++ b/examples/water/train/wannier.json @@ -32,9 +32,9 @@ "learning_rate" :{ "type": "exp", - "start_lr": 0.001, "decay_steps": 5000, - "decay_rate": 0.95, + "start_lr": 0.001, + "stop_lr": 3.51e-8, "_comment": "that's all" }, diff --git a/examples/water/train/water.json b/examples/water/train/water.json index 8b1c6619f7..23ba559aed 100644 --- a/examples/water/train/water.json +++ b/examples/water/train/water.json @@ -3,6 +3,7 @@ "_comment": " model parameters", "model":{ "type_map": ["O", "H"], + "data_stat_nbatch": 10, "descriptor": { "type": "loc_frame", "sel_a": [16, 32], @@ -28,9 +29,9 @@ "learning_rate" :{ "type": "exp", - "start_lr": 0.001, "decay_steps": 5000, - "decay_rate": 0.95, + "start_lr": 0.001, + "stop_lr": 3.51e-8, "_comment": "that's all" }, diff --git a/examples/water/train/water_se_a.json b/examples/water/train/water_se_a.json index 4557e64fa5..cb005530c1 100644 --- a/examples/water/train/water_se_a.json +++ b/examples/water/train/water_se_a.json @@ -24,9 +24,9 @@ "learning_rate" :{ "type": "exp", - "start_lr": 0.001, "decay_steps": 5000, - "decay_rate": 0.95, + "start_lr": 0.001, + "stop_lr": 3.51e-8, "_comment": "that's all" }, diff --git a/examples/water/train/water_se_ar.json b/examples/water/train/water_se_ar.json index e3677f6205..2173f2e1d9 100644 --- a/examples/water/train/water_se_ar.json +++ b/examples/water/train/water_se_ar.json @@ -35,9 +35,9 @@ "learning_rate" :{ "type": "exp", - "start_lr": 0.005, "decay_steps": 5000, - "decay_rate": 0.95, + "start_lr": 0.005, + "stop_lr": 1.76e-7, "_comment": "that's all" }, diff --git a/examples/water/train/water_se_r.json b/examples/water/train/water_se_r.json index c577047189..7faf55a3c3 100644 --- a/examples/water/train/water_se_r.json +++ b/examples/water/train/water_se_r.json @@ -23,9 
+23,10 @@ }, "learning_rate" : { - "start_lr": 0.005, + "type": "exp", "decay_steps": 5000, - "decay_rate": 0.95, + "start_lr": 0.005, + "stop_lr": 1.76e-7, "_comment": " that's all" }, diff --git a/examples/water/train/water_srtab_example.json b/examples/water/train/water_srtab_example.json index 846017a24c..f2a0a4a39c 100644 --- a/examples/water/train/water_srtab_example.json +++ b/examples/water/train/water_srtab_example.json @@ -32,9 +32,9 @@ "learning_rate" :{ "type": "exp", - "start_lr": 0.001, "decay_steps": 5000, - "decay_rate": 0.95, + "start_lr": 0.001, + "stop_lr": 3.51e-8, "_comment": "that's all" }, diff --git a/setup.py b/setup.py index 2f5679458a..8c6a335ad4 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from skbuild.cmaker import get_cmake_version from packaging.version import LegacyVersion from os import path, makedirs -import imp,sys +import imp readme_file = path.join(path.dirname(path.abspath(__file__)), 'README.md') try: @@ -20,7 +20,7 @@ tf_install_dir = imp.find_module('tensorflow', [site_packages_path])[1] install_requires=['numpy', 'scipy'] -setup_requires=['setuptools_scm'] +setup_requires=['setuptools_scm', 'scikit-build', 'cmake'] # add cmake as a build requirement if cmake>3.0 is not installed try: diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 4c43fc9227..6b18cb95ac 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -1,5 +1,6 @@ cmake_minimum_required(VERSION 3.7) project(DeePMD) +set(CMAKE_LINK_WHAT_YOU_USE TRUE) # build cpp or python interfaces if (NOT DEFINED BUILD_CPP_IF) @@ -48,6 +49,9 @@ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wno-ignored-attributes") # find tensorflow, I need tf abi info find_package(tensorflow REQUIRED) +# find threads +find_package(Threads) + # auto op_cxx_abi if (NOT DEFINED OP_CXX_ABI) if (BUILD_PY_IF) @@ -133,6 +137,16 @@ if (USE_TTM) set(TTM_DEF "-DUSE_TTM") endif (USE_TTM) +# old pppm interface +if(NOT DEFINED OLD_LMP_PPPM) + set(OLD_LMP_PPPM FALSE) 
+endif(NOT DEFINED OLD_LMP_PPPM) +if (OLD_LMP_PPPM) + set(OLD_LMP_PPPM_DEF "-DOLD_LMP_PPPM") + message(STATUS "Use old lammps pppm interface") +endif() +add_definitions (${OLD_LMP_PPPM_DEF}) + # define build type if ((NOT DEFINED CMAKE_BUILD_TYPE) OR CMAKE_BUILD_TYPE STREQUAL "") set (CMAKE_BUILD_TYPE release) diff --git a/source/cmake/Findtensorflow.cmake b/source/cmake/Findtensorflow.cmake index b9cf247e22..708b8e86d5 100644 --- a/source/cmake/Findtensorflow.cmake +++ b/source/cmake/Findtensorflow.cmake @@ -10,9 +10,13 @@ # TensorFlowFramework_LIBRARY # TensorFlowFramework_LIBRARY_PATH +string(REPLACE "lib64" "lib" TENSORFLOW_ROOT_NO64 ${TENSORFLOW_ROOT}) + # define the search path list(APPEND TensorFlow_search_PATHS ${TENSORFLOW_ROOT}) list(APPEND TensorFlow_search_PATHS "${TENSORFLOW_ROOT}/../tensorflow_core") +list(APPEND TensorFlow_search_PATHS ${TENSORFLOW_ROOT_NO64}) +list(APPEND TensorFlow_search_PATHS "${TENSORFLOW_ROOT_NO64}/../tensorflow_core") list(APPEND TensorFlow_search_PATHS "/usr/") list(APPEND TensorFlow_search_PATHS "/usr/local/") @@ -28,9 +32,18 @@ find_path(TensorFlow_INCLUDE_DIRS PATH_SUFFIXES "/include" NO_DEFAULT_PATH ) +find_path(TensorFlow_INCLUDE_DIRS_GOOGLE + NAMES + google/protobuf/type.pb.h + PATHS ${TensorFlow_search_PATHS} + PATH_SUFFIXES "/include" + NO_DEFAULT_PATH + ) +list(APPEND TensorFlow_INCLUDE_DIRS ${TensorFlow_INCLUDE_DIRS_GOOGLE}) + if (NOT TensorFlow_INCLUDE_DIRS AND tensorflow_FIND_REQUIRED) message(FATAL_ERROR - "Not found 'include/tensorflow/core/public/session.h' directory in path '${TensorFlow_search_PATHS}' " + "Not found 'tensorflow/core/public/session.h' directory in path '${TensorFlow_search_PATHS}' " "You can manually set the tensorflow install path by -DTENSORFLOW_ROOT ") endif () diff --git a/source/lib/include/DataModifier.h b/source/lib/include/DataModifier.h new file mode 100644 index 0000000000..838b1463ec --- /dev/null +++ b/source/lib/include/DataModifier.h @@ -0,0 +1,50 @@ +#pragma once + +#include 
"NNPInter.h" + +class DataModifier +{ +public: + DataModifier(); + DataModifier(const string & model, + const int & gpu_rank = 0, + const string & name_scope = ""); + ~DataModifier () {}; + void init (const string & model, + const int & gpu_rank = 0, + const string & name_scope = ""); + void print_summary(const string &pre) const; +public: + void compute (vector & dfcorr_, + vector & dvcorr_, + const vector & dcoord_, + const vector & datype_, + const vector & dbox, + const vector> & pairs, + const vector & delef_, + const int nghost, + const LammpsNeighborList & lmp_list); + VALUETYPE cutoff () const {assert(inited); return rcut;}; + int numb_types () const {assert(inited); return ntypes;}; + vector sel_types () const {assert(inited); return sel_type;}; +private: + Session* session; + string name_scope, name_prefix; + int num_intra_nthreads, num_inter_nthreads; + GraphDef graph_def; + bool inited; + VALUETYPE rcut; + VALUETYPE cell_size; + int ntypes; + string model_type; + vector sel_type; + template VT get_scalar(const string & name) const; + template void get_vector(vector & vec, const string & name) const; + void run_model (vector & dforce, + vector & dvirial, + Session * session, + const std::vector> & input_tensors, + const NNPAtomMap & nnpmap, + const int nghost); +}; + diff --git a/source/lib/include/DeepTensor.h b/source/lib/include/DeepTensor.h new file mode 100644 index 0000000000..867cff37cc --- /dev/null +++ b/source/lib/include/DeepTensor.h @@ -0,0 +1,63 @@ +#pragma once + +#include "NNPInter.h" + +class DeepTensor +{ +public: + DeepTensor(); + DeepTensor(const string & model, + const int & gpu_rank = 0, + const string &name_scope = ""); + void init (const string & model, + const int & gpu_rank = 0, + const string &name_scope = ""); + void print_summary(const string &pre) const; +public: + void compute (vector & value, + const vector & coord, + const vector & atype, + const vector & box, + const int nghost = 0); + void compute (vector & value, + 
const vector & coord, + const vector & atype, + const vector & box, + const int nghost, + const LammpsNeighborList & lmp_list); + VALUETYPE cutoff () const {assert(inited); return rcut;}; + int numb_types () const {assert(inited); return ntypes;}; + int output_dim () const {assert(inited); return odim;}; + const vector & sel_types () const {assert(inited); return sel_type;}; +private: + Session* session; + string name_scope; + int num_intra_nthreads, num_inter_nthreads; + GraphDef graph_def; + bool inited; + VALUETYPE rcut; + VALUETYPE cell_size; + int ntypes; + string model_type; + int odim; + vector sel_type; + template VT get_scalar(const string & name) const; + template void get_vector (vector & vec, const string & name) const; + void run_model (vector & d_tensor_, + Session * session, + const std::vector> & input_tensors, + const NNPAtomMap & nnpmap, + const int nghost = 0); + void compute_inner (vector & value, + const vector & coord, + const vector & atype, + const vector & box, + const int nghost = 0); + void compute_inner (vector & value, + const vector & coord, + const vector & atype, + const vector & box, + const int nghost, + const InternalNeighborList&lmp_list); +}; + diff --git a/source/lib/include/Ewald.h b/source/lib/include/Ewald.h new file mode 100644 index 0000000000..f6bd016337 --- /dev/null +++ b/source/lib/include/Ewald.h @@ -0,0 +1,288 @@ +#pragma once + +#include +#include +#include + +#include "SimulationRegion.h" + +// 8.988e9 / pc.electron_volt / pc.angstrom * (1.602e-19)**2 +const double ElectrostaticConvertion = 14.39964535475696995031; + +template +struct EwaldParameters +{ + VALUETYPE rcut = 6.0; + VALUETYPE beta = 2; + VALUETYPE spacing = 4; +}; + +template +VALUETYPE +dir_err_esti(const VALUETYPE & test_q, + const VALUETYPE & c2, + const VALUETYPE & nn, + const EwaldParameters & param) +{ + const VALUETYPE & rcut = param.rcut; + const VALUETYPE & beta = param.beta; + const VALUETYPE rho_q2 = c2/nn; + VALUETYPE sum = 2 * test_q + * 
sqrt (rho_q2 / rcut) + * exp (- beta*beta*rcut*rcut) * ElectrostaticConvertion; + return sum; +} + +template +VALUETYPE +rec_err_esti(const VALUETYPE & test_q, + const VALUETYPE & c2, + const VALUETYPE & nn, + const EwaldParameters& param, + const SimulationRegion& region) +{ + const VALUETYPE & beta = param.beta; + vector KK; + cmpt_k(KK, region, param); + const double * rec_box = region.getRecBoxTensor(); + double sum = 0; + int BD[3]; + for (int dd = 0; dd < 3; ++dd){ + BD[dd] = KK[dd]/2 + 10; + } + int mm[3]; + for (mm[0] = -BD[0]; mm[0] <= BD[0]; ++mm[0]){ + for (mm[1] = -BD[1]; mm[1] <= BD[1]; ++mm[1]){ + for (mm[2] = -BD[2]; mm[2] <= BD[2]; ++mm[2]){ + if (mm[0] >= - int(KK[0])/2 && mm[0] <= int(KK[0])/2 && + mm[1] >= - int(KK[1])/2 && mm[1] <= int(KK[1])/2 && + mm[2] >= - int(KK[2])/2 && mm[2] <= int(KK[2])/2) continue; + VALUETYPE rm[3] = {0,0,0}; + for (int dd = 0; dd < 3; ++dd){ + rm[0] += mm[dd] * rec_box[dd*3+0]; + rm[1] += mm[dd] * rec_box[dd*3+1]; + rm[2] += mm[dd] * rec_box[dd*3+2]; + } + VALUETYPE mm2 = rm[0] * rm[0] + rm[1] * rm[1] + rm[2] * rm[2]; + sum += exp (-2 * M_PI * M_PI / beta / beta * mm2) / mm2; + } + } + } + VALUETYPE vol = region.getVolume(); + // cout << "sum: " << sqrt(sum) + // << " KK: " << KK[0] + // << " rbox: " << rec_box[0] + // << " c2: " << c2 + // << " vol: " << vol << endl; + sum = test_q * 2 * sqrt(sum) * sqrt(c2) / vol * ElectrostaticConvertion; + return sum; +} + +template +void +cmpt_k(vector & KK, + const SimulationRegion& region, + const EwaldParameters& param) +{ + const double * boxt_ = region.getBoxTensor(); + VALUETYPE boxt[9]; + for (int dd = 0; dd < 9; ++dd){ + boxt[dd] = static_cast(boxt_[dd]); + } + KK.resize(3); + for (int dd = 0; dd < 3; ++dd){ + VALUETYPE ll = sqrt(MathUtilities::dot(boxt+dd*3, boxt+dd*3)); + KK[dd] = ll / param.spacing; + // KK[dd] should be large enough + if (KK[dd] * param.spacing < ll) KK[dd] += 1; + assert(KK[dd] * param.spacing >= ll); + // KK[dd] should be even + if ((KK[dd] / 2) * 
2 != KK[dd]) KK[dd] += 1; + assert((KK[dd] / 2) * 2 == KK[dd]); + } +} + +// compute the reciprocal part of the Ewald sum. +// outputs: energy force virial +// inputs: coordinates charges region +template +void +EwaldReciprocal(VALUETYPE & ener, + vector & force, + vector & virial, + const vector& coord, + const vector& charge, + const SimulationRegion& region, + const EwaldParameters& param) +{ + // natoms + int natoms = charge.size(); + // init returns + force.resize(natoms * 3); + virial.resize(9); + ener = 0; + fill(force.begin(), force.end(), static_cast(0)); + fill(virial.begin(), virial.end(), static_cast(0)); + + // number of threads + int nthreads = 1; +#pragma omp parallel + { + if (0 == omp_get_thread_num()) { + nthreads = omp_get_num_threads(); + } + } + + // K grid + vector KK(3); + int totK = 1; + cmpt_k(KK, region, param); + for (int dd = 0; dd < 3; ++dd){ + totK *= (KK[dd]+1); + } + int stride[3]; + for (int dd = 0; dd < 3; ++dd) stride[dd] = KK[dd]+1; + + // compute the sq + vector > thread_sqr(nthreads), thread_sqi(nthreads); + for (int ii = 0; ii < nthreads; ++ii){ + thread_sqr[ii].resize(totK, static_cast(0)); + thread_sqi[ii].resize(totK, static_cast(0)); + } + // firstly loop over particles then loop over m +#pragma omp parallel for num_threads(nthreads) + for (int ii = 0; ii < natoms; ++ii){ + int thread_id = omp_get_thread_num(); + double ir[3]; + double tmpcoord[3] = {coord[ii*3], coord[ii*3+1], coord[ii*3+2]}; + region.phys2Inter(ir, tmpcoord); + for (int mm0 = -KK[0]/2; mm0 <= KK[0]/2; ++mm0){ + double mr[3]; + mr[0] = ir[0] * mm0; + int shift0 = (mm0 + KK[0]/2) * stride[1] * stride[2]; + for (int mm1 = -KK[1]/2; mm1 <= KK[1]/2; ++mm1){ + mr[1] = ir[1] * mm1; + int shift1 = (mm1 + KK[1]/2) * stride[2]; + for (int mm2 = -KK[2]/2; mm2 <= KK[2]/2; ++mm2){ + if (mm0 == 0 && mm1 == 0 && mm2 == 0) continue; + int mc = shift0 + shift1 + mm2 + KK[2]/2; + mr[2] = ir[2] * mm2; + double mdotr = 2. 
* M_PI * (mr[0]+mr[1]+mr[2]); + thread_sqr[thread_id][mc] += charge[ii] * cos(mdotr); + thread_sqi[thread_id][mc] += charge[ii] * sin(mdotr); + } + } + } + } + VALUETYPE * sqr = new VALUETYPE[totK]; + VALUETYPE * sqi = new VALUETYPE[totK]; + for (int ii = 0; ii < totK; ++ii){ + sqr[ii] = static_cast(0); + sqi[ii] = static_cast(0); + for (int jj = 0; jj < nthreads; ++jj){ + sqr[ii] += thread_sqr[jj][ii]; + sqi[ii] += thread_sqi[jj][ii]; + } + } + + // get rbox + VALUETYPE rec_box[9]; + const double * rec_box_ = region.getRecBoxTensor(); + for (int ii = 0; ii < 9; ++ii){ + rec_box[ii] = static_cast(rec_box_[ii]); + } + + vector thread_ener(nthreads, 0.); + vector > thread_force(nthreads); + vector > thread_virial(nthreads); + for (int ii = 0; ii < nthreads; ++ii){ + thread_force[ii].resize(natoms * 3, 0.); + thread_virial[ii].resize(9, 0.); + } + // calculate ener, force and virial + // firstly loop over particles then loop over m +#pragma omp parallel for num_threads(nthreads) + for (int mc = 0; mc < totK; ++mc){ + int thread_id = omp_get_thread_num(); + int mm0 = mc / (stride[1] * stride[2]); + int left = mc - mm0 * stride[1] * stride[2]; + int mm1 = left / stride[2]; + int mm2 = left - mm1 * stride[2]; + mm0 -= KK[0]/2; + mm1 -= KK[1]/2; + mm2 -= KK[2]/2; + // for (int mm0 = -KK[0]/2; mm0 <= KK[0]/2; ++mm0){ + // int shift0 = (mm0 + KK[0]/2) * stride[1] * stride[2]; + // for (int mm1 = -KK[1]/2; mm1 <= KK[1]/2; ++mm1){ + // int shift1 = (mm1 + KK[1]/2) * stride[2]; + // for (int mm2 = -KK[2]/2; mm2 <= KK[2]/2; ++mm2){ + // int mc = shift0 + shift1 + mm2 + KK[2]/2; + if (mm0 == 0 && mm1 == 0 && mm2 == 0) continue; + // \bm m and \vert m \vert^2 + VALUETYPE rm[3] = {0,0,0}; + rm[0] += mm0 * rec_box[0*3+0]; + rm[1] += mm0 * rec_box[1*3+0]; + rm[2] += mm0 * rec_box[2*3+0]; + rm[0] += mm1 * rec_box[0*3+1]; + rm[1] += mm1 * rec_box[1*3+1]; + rm[2] += mm1 * rec_box[2*3+1]; + rm[0] += mm2 * rec_box[0*3+2]; + rm[1] += mm2 * rec_box[1*3+2]; + rm[2] += mm2 * rec_box[2*3+2]; 
+ VALUETYPE nmm2 = rm[0] * rm[0] + rm[1] * rm[1] + rm[2] * rm[2]; + // energy + VALUETYPE expnmm2 = exp(- M_PI * M_PI * nmm2 / (param.beta * param.beta)) / nmm2; + VALUETYPE eincr = expnmm2 * (sqr[mc] * sqr[mc] + sqi[mc] * sqi[mc]); + thread_ener[thread_id] += eincr; + // virial + VALUETYPE vpref = -2. * (1. + M_PI * M_PI * nmm2 / (param.beta * param.beta)) / nmm2; + for (int dd0 = 0; dd0 < 3; ++dd0){ + for (int dd1 = 0; dd1 < 3; ++dd1){ + VALUETYPE tmp = vpref * rm[dd0] * rm[dd1]; + if (dd0 == dd1) tmp += 1; + thread_virial[thread_id][dd0*3+dd1] += eincr * tmp; + } + } + // force + for (int ii = 0; ii < natoms; ++ii){ + VALUETYPE mdotr = - 2. * M_PI * (coord[ii*3+0]*rm[0] + coord[ii*3+1]*rm[1] + coord[ii*3+2]*rm[2]); + VALUETYPE tmpr = charge[ii] * cos(mdotr); + VALUETYPE tmpi = charge[ii] * sin(mdotr); + VALUETYPE cc = 4. * M_PI * (tmpr * sqi[mc] + tmpi * sqr[mc]) * expnmm2; + thread_force[thread_id][ii*3+0] -= rm[0] * cc; + thread_force[thread_id][ii*3+1] -= rm[1] * cc; + thread_force[thread_id][ii*3+2] -= rm[2] * cc; + } + // } + // } + } + // reduce thread results + for (int ii = 0; ii < nthreads; ++ii){ + ener += thread_ener[ii]; + } + for (int jj = 0; jj < 9; ++jj){ + for (int ii = 0; ii < nthreads; ++ii){ + virial[jj] += thread_virial[ii][jj]; + } + } + for (int jj = 0; jj < natoms * 3; ++jj){ + for (int ii = 0; ii < nthreads; ++ii){ + force[jj] += thread_force[ii][jj]; + } + } + + VALUETYPE vol = static_cast(region.getVolume()); + ener /= 2 * M_PI * vol; + ener *= ElectrostaticConvertion; + for (int ii = 0; ii < 3*natoms; ++ii){ + force[ii] /= 2 * M_PI * vol; + force[ii] *= ElectrostaticConvertion; + } + for (int ii = 0; ii < 3*3; ++ii){ + virial[ii] /= 2 * M_PI * vol; + virial[ii] *= ElectrostaticConvertion; + } + delete[]sqr; + delete[]sqi; +} + diff --git a/source/lib/include/NNPAtomMap.h b/source/lib/include/NNPAtomMap.h index c7474981e7..559dc69dd7 100644 --- a/source/lib/include/NNPAtomMap.h +++ b/source/lib/include/NNPAtomMap.h @@ -19,6 +19,7 @@ 
class NNPAtomMap const int stride = 1) const ; const vector & get_type () const {return atype;} const vector & get_fwd_map () const {return fwd_idx_map;} + const vector & get_bkw_map () const {return idx_map;} private: vector idx_map; vector fwd_idx_map; diff --git a/source/lib/include/NNPInter.h b/source/lib/include/NNPInter.h index 92d0587eea..69a0ecf7d1 100644 --- a/source/lib/include/NNPInter.h +++ b/source/lib/include/NNPInter.h @@ -1,54 +1,7 @@ #pragma once -#include "tensorflow/core/public/session.h" -#include "tensorflow/core/platform/env.h" -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "NNPAtomMap.h" -#include -#include "version.h" - +#include "common.h" typedef double compute_t; -using namespace tensorflow; -using namespace std; - -#ifdef HIGH_PREC -typedef double VALUETYPE; -typedef double ENERGYTYPE; -#else -typedef float VALUETYPE; -typedef double ENERGYTYPE; -#endif - -struct LammpsNeighborList -{ - int inum; - const int * ilist; - const int * numneigh; - const int *const* firstneigh; - LammpsNeighborList (int inum_, - const int * ilist_, - const int * numneigh_, - const int *const* firstneigh_) - : inum(inum_), ilist(ilist_), numneigh(numneigh_), firstneigh(firstneigh_) - { - } -}; - -struct InternalNeighborList -{ - int * pilist; - int * pjrange; - int * pjlist; - vector ilist; - vector jrange; - vector jlist; - void clear () {ilist.clear(); jrange.clear(); jlist.clear();} - void make_ptrs () { - pilist = &ilist[0]; pjrange = &jrange[0]; pjlist = &jlist[0]; - } -}; class NNPInter { @@ -106,6 +59,7 @@ class NNPInter int numb_types () const {assert(inited); return ntypes;}; int dim_fparam () const {assert(inited); return dfparam;}; int dim_aparam () const {assert(inited); return daparam;}; + void get_type_map (std::string & type_map); private: Session* session; int num_intra_nthreads, num_inter_nthreads; @@ -122,6 +76,16 @@ class NNPInter 
void validate_fparam_aparam(const int & nloc, const vector &fparam, const vector &aparam)const ; + void compute_inner (ENERGYTYPE & ener, + vector & force, + vector & virial, + const vector & coord, + const vector & atype, + const vector & box, + const int nghost, + const int & ago, + const vector & fparam = vector(), + const vector & aparam = vector()); // copy neighbor list info from host bool init_nbor; diff --git a/source/lib/include/NeighborList.h b/source/lib/include/NeighborList.h index b919d15476..8e9f51b8e7 100644 --- a/source/lib/include/NeighborList.h +++ b/source/lib/include/NeighborList.h @@ -7,6 +7,7 @@ #include "MathUtilities.h" #include "SimulationRegion.h" +// build nlist by an extended grid void build_nlist (vector > & nlist0, vector > & nlist1, @@ -20,6 +21,8 @@ build_nlist (vector > & nlist0, const vector & ext_end_, const SimulationRegion & region, const vector & global_grid); + +// build nlist by a grid for a periodic region void build_nlist (vector > & nlist0, vector > & nlist1, @@ -28,6 +31,8 @@ build_nlist (vector > & nlist0, const double & rc1, const vector & grid, const SimulationRegion & region); + +// build nlist by a grid for a periodic region, atoms selected by sel0 and sel1 void build_nlist (vector > & nlist0, vector > & nlist1, @@ -38,6 +43,19 @@ build_nlist (vector > & nlist0, const double & rc1, const vector & grid, const SimulationRegion & region); + +// brute force (all-to-all distance computation) neighbor list building +// if region is NULL, open boundary is assumed, +// otherwise, periodic boundary condition is defined by region +void +build_nlist (vector > & nlist0, + vector > & nlist1, + const vector & coord, + const double & rc0_, + const double & rc1_, + const SimulationRegion * region = NULL); + +// copy periodic images for the system void copy_coord (vector & out_c, vector & out_t, diff --git a/source/lib/include/common.h b/source/lib/include/common.h new file mode 100644 index 0000000000..8db7d817ce --- /dev/null +++ 
b/source/lib/include/common.h @@ -0,0 +1,221 @@ +#pragma once + +#include "tensorflow/core/public/session.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" + +using namespace tensorflow; +using namespace std; + +#include "NNPAtomMap.h" +#include +#include "version.h" + +#ifdef HIGH_PREC +typedef double VALUETYPE; +typedef double ENERGYTYPE; +#else +typedef float VALUETYPE; +typedef double ENERGYTYPE; +#endif + +struct LammpsNeighborList +{ + int inum; + const int * ilist; + const int * numneigh; + const int *const* firstneigh; + LammpsNeighborList (int inum_, + const int * ilist_, + const int * numneigh_, + const int *const* firstneigh_) + : inum(inum_), ilist(ilist_), numneigh(numneigh_), firstneigh(firstneigh_) + { + } +}; + +struct InternalNeighborList +{ + int * pilist; + int * pjrange; + int * pjlist; + vector ilist; + vector jrange; + vector jlist; + void clear () {ilist.clear(); jrange.clear(); jlist.clear();} + void make_ptrs () { + pilist = &ilist[0]; pjrange = &jrange[0]; pjlist = &jlist[0]; + } +}; + +void +convert_nlist_lmp_internal (InternalNeighborList & list, + const LammpsNeighborList & lmp_list); + +void +shuffle_nlist (InternalNeighborList & list, + const vector & fwd_map); + +void +shuffle_nlist (InternalNeighborList & list, + const NNPAtomMap & map); + +void +shuffle_nlist_exclude_empty (InternalNeighborList & list, + const vector & fwd_map); + + +void +select_by_type(vector & fwd_map, + vector & bkw_map, + int & nghost_real, + const vector & dcoord_, + const vector & datype_, + const int & nghost, + const vector & sel_type_); + +void +select_real_atoms(vector & fwd_map, + vector & bkw_map, + int & nghost_real, + const vector & dcoord_, + const vector & datype_, + const int & nghost, + const int & ntypes); + +template +void +select_map(vector & out, + const vector & in, + const vector & fwd_map, + const 
int & stride); + +void +get_env_nthreads(int & num_intra_nthreads, + int & num_inter_nthreads); + +void +checkStatus(const tensorflow::Status& status); + +string name_prefix(const string & name_scope); + +template +VT +session_get_scalar(Session* session, const string name, const string scope = ""); + +template +void +session_get_vector(vector & o_vec, Session* session, const string name_, const string scope = ""); + +int +session_input_tensors (std::vector> & input_tensors, + const vector & dcoord_, + const int & ntypes, + const vector & datype_, + const vector & dbox, + const VALUETYPE & cell_size, + const vector & fparam_, + const vector & aparam_, + const NNPAtomMap& nnpmap, + const int nghost = 0, + const string scope = ""); + +int +session_input_tensors (std::vector> & input_tensors, + const vector & dcoord_, + const int & ntypes, + const vector & datype_, + const vector & dbox, + InternalNeighborList & dlist, + const vector & fparam_, + const vector & aparam_, + const NNPAtomMap& nnpmap, + const int nghost, + const string scope = ""); + +int +session_input_tensors (vector>& input_tensors, + const vector & dcoord_, + const int & ntypes, + const vector & atype_, + const vector & dbox, + const int * ilist, + const int * jrange, + const int * jlist, + int * array_int, + unsigned long long * array_longlong, + double * array_double, + const vector & fparam_, + const vector & aparam_, + const NNPAtomMap & nnpmap, + const int & nghost); + + +template +VT +session_get_scalar(Session* session, const string name_, const string scope) +{ + string name = name_; + if (scope != "") { + name = scope + "/" + name; + } + std::vector output_tensors; + checkStatus (session->Run(std::vector> ({}), + {name.c_str()}, + {}, + &output_tensors)); + Tensor output_rc = output_tensors[0]; + auto orc = output_rc.flat (); + return orc(0); +} + +template +void +session_get_vector(vector & o_vec, Session* session, const string name_, const string scope) +{ + string name = name_; + if (scope 
!= "") { + name = scope + "/" + name; + } + std::vector output_tensors; + checkStatus (session->Run(std::vector> ({}), + {name.c_str()}, + {}, + &output_tensors)); + Tensor output_rc = output_tensors[0]; + assert(1 == output_rc.shape().dims()); + int dof = output_rc.shape().dim_size(0); + o_vec.resize(dof); + auto orc = output_rc.flat (); + for (int ii = 0; ii < dof; ++ii){ + o_vec[ii] = orc(ii); + } +} + + +template +void +select_map(vector & out, + const vector & in, + const vector & idx_map, + const int & stride) +{ +#ifdef DEBUG + assert(in.size() / stride * stride == in.size()), "in size should be multiples of stride" +#endif + for (int ii = 0; ii < in.size() / stride; ++ii){ +#ifdef DEBUG + assert(ii < idx_map.size()), "idx goes over the idx map size"; + assert(idx_map[ii] < out.size()), "mappped idx goes over the out size"; +#endif + if (idx_map[ii] >= 0) { + int to_ii = idx_map[ii]; + for (int dd = 0; dd < stride; ++dd){ + out[to_ii * stride + dd] = in[ii * stride + dd]; + } + } + } +} + diff --git a/source/lib/src/DataModifier.cc b/source/lib/src/DataModifier.cc new file mode 100644 index 0000000000..65b3f85dd4 --- /dev/null +++ b/source/lib/src/DataModifier.cc @@ -0,0 +1,245 @@ +#include "DataModifier.h" + +DataModifier:: +DataModifier() + : inited (false) +{ +} + +DataModifier:: +DataModifier(const string & model, + const int & gpu_rank, + const string &name_scope_) + : inited (false), name_scope(name_scope_) +{ + get_env_nthreads(num_intra_nthreads, num_inter_nthreads); + init(model, gpu_rank); +} + +void +DataModifier:: +init (const string & model, + const int & gpu_rank, + const string &name_scope_) +{ + assert (!inited); + name_scope = name_scope_; + SessionOptions options; + options.config.set_inter_op_parallelism_threads(num_inter_nthreads); + options.config.set_intra_op_parallelism_threads(num_intra_nthreads); + checkStatus(NewSession(options, &session)); + checkStatus(ReadBinaryProto(Env::Default(), model, &graph_def)); + 
checkStatus(session->Create(graph_def)); + // int nnodes = graph_def.node_size(); + // for (int ii = 0; ii < nnodes; ++ii){ + // cout << ii << " \t " << graph_def.node(ii).name() << endl; + // } + rcut = get_scalar("descrpt_attr/rcut"); + cell_size = rcut; + ntypes = get_scalar("descrpt_attr/ntypes"); + model_type = get_scalar("model_attr/model_type"); + get_vector(sel_type, "model_attr/sel_type"); + sort(sel_type.begin(), sel_type.end()); + inited = true; +} + +template +VT +DataModifier:: +get_scalar (const string & name) const +{ + return session_get_scalar(session, name, name_scope); +} + +template +void +DataModifier:: +get_vector (vector & vec, const string & name) const +{ + session_get_vector(vec, session, name, name_scope); +} + +void +DataModifier:: +run_model (vector & dforce, + vector & dvirial, + Session * session, + const std::vector> & input_tensors, + const NNPAtomMap &nnpmap, + const int nghost) +{ + unsigned nloc = nnpmap.get_type().size(); + unsigned nall = nloc + nghost; + if (nloc == 0) { + dforce.clear(); + dvirial.clear(); + return; + } + + std::vector output_tensors; + checkStatus (session->Run(input_tensors, + {"o_dm_force", "o_dm_virial", "o_dm_av"}, + {}, + &output_tensors)); + int cc = 0; + Tensor output_f = output_tensors[cc++]; + Tensor output_v = output_tensors[cc++]; + Tensor output_av = output_tensors[cc++]; + assert (output_f.dims() == 2), "dim of output tensor should be 2"; + assert (output_v.dims() == 2), "dim of output tensor should be 2"; + assert (output_av.dims() == 2), "dim of output tensor should be 2"; + int nframes = output_f.dim_size(0); + int natoms = output_f.dim_size(1) / 3; + assert (output_f.dim_size(0) == 1), "nframes should match"; + assert (natoms == nall), "natoms should be nall"; + assert (output_v.dim_size(0) == nframes), "nframes should match"; + assert (output_v.dim_size(1) == 9), "dof of virial should be 9"; + assert (output_av.dim_size(0) == nframes), "nframes should match"; + assert (output_av.dim_size(1) 
== natoms * 9), "dof of atom virial should be 9 * natoms"; + + auto of = output_f.flat (); + auto ov = output_v.flat (); + + dforce.resize(nall*3); + dvirial.resize(9); + for (int ii = 0; ii < nall * 3; ++ii){ + dforce[ii] = of(ii); + } + for (int ii = 0; ii < 9; ++ii){ + dvirial[ii] = ov(ii); + } +} + + + +void +DataModifier:: +compute (vector & dfcorr_, + vector & dvcorr_, + const vector & dcoord_, + const vector & datype_, + const vector & dbox, + const vector> & pairs, + const vector & delef_, + const int nghost, + const LammpsNeighborList & lmp_list) +{ + // firstly do selection + int nall = datype_.size(); + int nloc = nall - nghost; + int nghost_real; + vector real_fwd_map, real_bkw_map; + select_real_atoms(real_fwd_map, real_bkw_map, nghost_real, dcoord_, datype_, nghost, ntypes); + int nall_real = real_bkw_map.size(); + int nloc_real = nall_real - nghost_real; + if (nloc_real == 0){ + dfcorr_.resize(nall * 3); + dvcorr_.resize(9); + fill(dfcorr_.begin(), dfcorr_.end(), 0.0); + fill(dvcorr_.begin(), dvcorr_.end(), 0.0); + return; + } + // resize to nall_real + vector dcoord_real; + vector delef_real; + vector datype_real; + dcoord_real.resize(nall_real * 3); + delef_real.resize(nall_real * 3); + datype_real.resize(nall_real); + // fwd map + select_map(dcoord_real, dcoord_, real_fwd_map, 3); + select_map(delef_real, delef_, real_fwd_map, 3); + select_map(datype_real, datype_, real_fwd_map, 1); + // internal nlist + InternalNeighborList nlist_; + convert_nlist_lmp_internal(nlist_, lmp_list); + shuffle_nlist_exclude_empty(nlist_, real_fwd_map); + // sort atoms + NNPAtomMap nnpmap (datype_real.begin(), datype_real.begin() + nloc_real); + assert (nloc_real == nnpmap.get_type().size()); + const vector & sort_fwd_map(nnpmap.get_fwd_map()); + const vector & sort_bkw_map(nnpmap.get_bkw_map()); + // shuffle nlist + InternalNeighborList nlist(nlist_); + shuffle_nlist (nlist, nnpmap); + // make input tensors + std::vector> input_tensors; + int ret = 
session_input_tensors (input_tensors, dcoord_real, ntypes, datype_real, dbox, nlist, vector(), vector(), nnpmap, nghost_real, name_scope); + assert (nloc_real == ret); + // make bond idx map + vector bd_idx(nall, -1); + for (int ii = 0; ii < pairs.size(); ++ii){ + bd_idx[pairs[ii].first] = pairs[ii].second; + } + // make extf by bond idx map + vector dtype_sort_loc = nnpmap.get_type(); + vector dextf; + for(int ii = 0; ii < dtype_sort_loc.size(); ++ii){ + if (binary_search(sel_type.begin(), sel_type.end(), dtype_sort_loc[ii])){ + // selected atom + int first_idx = real_bkw_map[sort_bkw_map[ii]]; + int second_idx = bd_idx[first_idx]; + assert(second_idx >= 0); + dextf.push_back(delef_[second_idx*3+0]); + dextf.push_back(delef_[second_idx*3+1]); + dextf.push_back(delef_[second_idx*3+2]); + } + } + // dextf should be loc and virtual + assert(dextf.size() == (nloc - nloc_real)*3); + // make tensor for extf + int nframes = 1; + TensorShape extf_shape ; + extf_shape.AddDim (nframes); + extf_shape.AddDim (dextf.size()); +#ifdef HIGH_PREC + Tensor extf_tensor (DT_DOUBLE, extf_shape); +#else + Tensor extf_tensor (DT_FLOAT, extf_shape); +#endif + auto extf = extf_tensor.matrix (); + for (int ii = 0; ii < nframes; ++ii){ + for (int jj = 0; jj < extf.size(); ++jj){ + extf(ii,jj) = dextf[jj]; + } + } + // append extf to input tensor + input_tensors.push_back({"t_ef", extf_tensor}); + // run model + vector dfcorr, dvcorr; + run_model (dfcorr, dvcorr, session, input_tensors, nnpmap, nghost_real); + assert(dfcorr.size() == nall_real * 3); + // back map force + vector dfcorr_1 = dfcorr; + nnpmap.backward (dfcorr_1.begin(), dfcorr.begin(), 3); + assert(dfcorr_1.size() == nall_real * 3); + // resize to all and clear + vector dfcorr_2(nall*3); + fill(dfcorr_2.begin(), dfcorr_2.end(), 0.0); + // back map to original position + for (int ii = 0; ii < nall_real; ++ii){ + for (int dd = 0; dd < 3; ++dd){ + dfcorr_2[real_bkw_map[ii]*3+dd] += dfcorr_1[ii*3+dd]; + } + } + // self correction of 
bonded force + for (int ii = 0; ii < pairs.size(); ++ii){ + for (int dd = 0; dd < 3; ++dd){ + dfcorr_2[pairs[ii].first*3+dd] += delef_[pairs[ii].second*3+dd]; + } + } + // add ele contrinution + dfcorr_ = dfcorr_2; + // for (int ii = 0; ii < nloc; ++ii){ + // for (int dd = 0; dd < 3; ++dd){ + // dfcorr_[ii*3+dd] += delef_[ii*3+dd]; + // } + // } + for (int ii = 0; ii < nloc_real; ++ii){ + int oii = real_bkw_map[ii]; + for (int dd = 0; dd < 3; ++dd){ + dfcorr_[oii*3+dd] += delef_[oii*3+dd]; + } + } + dvcorr_ = dvcorr; +} diff --git a/source/lib/src/DeepTensor.cc b/source/lib/src/DeepTensor.cc new file mode 100644 index 0000000000..bd3a0ea138 --- /dev/null +++ b/source/lib/src/DeepTensor.cc @@ -0,0 +1,184 @@ +#include "DeepTensor.h" + +DeepTensor:: +DeepTensor() + : inited (false) +{ +} + +DeepTensor:: +DeepTensor(const string & model, + const int & gpu_rank, + const string &name_scope_) + : inited (false), name_scope(name_scope_) +{ + get_env_nthreads(num_intra_nthreads, num_inter_nthreads); + init(model, gpu_rank); +} + +void +DeepTensor:: +init (const string & model, + const int & gpu_rank, + const string &name_scope_) +{ + assert (!inited); + name_scope = name_scope_; + SessionOptions options; + options.config.set_inter_op_parallelism_threads(num_inter_nthreads); + options.config.set_intra_op_parallelism_threads(num_intra_nthreads); + checkStatus (NewSession(options, &session)); + checkStatus (ReadBinaryProto(Env::Default(), model, &graph_def)); + checkStatus (session->Create(graph_def)); + rcut = get_scalar("descrpt_attr/rcut"); + cell_size = rcut; + ntypes = get_scalar("descrpt_attr/ntypes"); + model_type = get_scalar("model_attr/model_type"); + odim = get_scalar("model_attr/output_dim"); + get_vector(sel_type, "model_attr/sel_type"); + inited = true; +} + +template +VT +DeepTensor:: +get_scalar (const string & name) const +{ + return session_get_scalar(session, name, name_scope); +} + +template +void +DeepTensor:: +get_vector (vector & vec, const string & 
name) const +{ + session_get_vector(vec, session, name, name_scope); +} + +void +DeepTensor:: +run_model (vector & d_tensor_, + Session * session, + const std::vector> & input_tensors, + const NNPAtomMap &nnpmap, + const int nghost) +{ + unsigned nloc = nnpmap.get_type().size(); + unsigned nall = nloc + nghost; + if (nloc == 0) { + // return empty + d_tensor_.clear(); + return; + } + + std::vector output_tensors; + checkStatus (session->Run(input_tensors, + {name_prefix(name_scope) + "o_" + model_type}, + {}, + &output_tensors)); + + Tensor output_t = output_tensors[0]; + assert (output_t.dims() == 1), "dim of output tensor should be 1"; + int o_size = output_t.dim_size(0); + + auto ot = output_t.flat (); + + vector d_tensor (o_size); + for (unsigned ii = 0; ii < o_size; ++ii){ + d_tensor[ii] = ot(ii); + } + d_tensor_ = d_tensor; +} + + +void +DeepTensor:: +compute (vector & dtensor_, + const vector & dcoord_, + const vector & datype_, + const vector & dbox, + const int nghost) +{ + vector dcoord; + vector datype, fwd_map, bkw_map; + int nghost_real; + select_real_atoms(fwd_map, bkw_map, nghost_real, dcoord_, datype_, nghost, ntypes); + // resize to nall_real + dcoord.resize(bkw_map.size() * 3); + datype.resize(bkw_map.size()); + // fwd map + select_map(dcoord, dcoord_, fwd_map, 3); + select_map(datype, datype_, fwd_map, 1); + compute_inner(dtensor_, dcoord, datype, dbox, nghost_real); +} + +void +DeepTensor:: +compute (vector & dtensor_, + const vector & dcoord_, + const vector & datype_, + const vector & dbox, + const int nghost, + const LammpsNeighborList & lmp_list) +{ + vector dcoord; + vector datype, fwd_map, bkw_map; + int nghost_real; + select_real_atoms(fwd_map, bkw_map, nghost_real, dcoord_, datype_, nghost, ntypes); + // resize to nall_real + dcoord.resize(bkw_map.size() * 3); + datype.resize(bkw_map.size()); + // fwd map + select_map(dcoord, dcoord_, fwd_map, 3); + select_map(datype, datype_, fwd_map, 1); + // internal nlist + InternalNeighborList 
nlist; + convert_nlist_lmp_internal(nlist, lmp_list); + shuffle_nlist_exclude_empty(nlist, fwd_map); + compute_inner(dtensor_, dcoord, datype, dbox, nghost_real, nlist); +} + + +void +DeepTensor:: +compute_inner (vector & dtensor_, + const vector & dcoord_, + const vector & datype_, + const vector & dbox, + const int nghost) +{ + int nall = dcoord_.size() / 3; + int nloc = nall - nghost; + NNPAtomMap nnpmap (datype_.begin(), datype_.begin() + nloc); + assert (nloc == nnpmap.get_type().size()); + + std::vector> input_tensors; + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, cell_size, vector(), vector(), nnpmap, nghost, name_scope); + assert (ret == nloc); + + run_model (dtensor_, session, input_tensors, nnpmap, nghost); +} + +void +DeepTensor:: +compute_inner (vector & dtensor_, + const vector & dcoord_, + const vector & datype_, + const vector & dbox, + const int nghost, + const InternalNeighborList & nlist_) +{ + int nall = dcoord_.size() / 3; + int nloc = nall - nghost; + NNPAtomMap nnpmap (datype_.begin(), datype_.begin() + nloc); + assert (nloc == nnpmap.get_type().size()); + + InternalNeighborList nlist(nlist_); + shuffle_nlist (nlist, nnpmap); + + std::vector> input_tensors; + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, vector(), vector(), nnpmap, nghost, name_scope); + assert (nloc == ret); + + run_model (dtensor_, session, input_tensors, nnpmap, nghost); +} diff --git a/source/lib/src/NNPInter.cc b/source/lib/src/NNPInter.cc index ae72db8487..f4f39945ff 100644 --- a/source/lib/src/NNPInter.cc +++ b/source/lib/src/NNPInter.cc @@ -3,8 +3,7 @@ #include "SimulationRegion.h" #include -#define MAGIC_NUMBER 256 -typedef double compute_t; +#define MAGIC_NUMBER 1024 #ifdef USE_CUDA_TOOLKIT #include "cuda_runtime.h" @@ -23,14 +22,6 @@ inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort= } #endif -static -void -checkStatus(const tensorflow::Status& status) { - if 
(!status.ok()) { - std::cout << status.ToString() << std::endl; - exit(1); - } -} static std::vector cum_sum (const std::vector & n_sel) { @@ -43,451 +34,6 @@ std::vector cum_sum (const std::vector & n_sel) { return sec; } -static void -convert_nlist_lmp_internal (InternalNeighborList & list, - const LammpsNeighborList & lmp_list) -{ - list.clear(); - int total_num_nei = 0; - int inum = lmp_list.inum; - for (int ii = 0; ii < inum; ++ii){ - total_num_nei += lmp_list.numneigh[ii]; - } - list.ilist.resize(inum); - list.jrange.resize(inum+1); - list.jlist.resize(total_num_nei); - memcpy(&list.ilist[0], lmp_list.ilist, inum*sizeof(int)); - list.jrange[0] = 0; - for (int ii = 0; ii < inum; ++ii){ - int jnum = lmp_list.numneigh[ii]; - list.jrange[ii+1] = list.jrange[ii] + jnum; - const int * jlist = lmp_list.firstneigh[ii]; - memcpy(&(list.jlist[list.jrange[ii]]), jlist, jnum*sizeof(int)); - } -} - -static void -shuffle_nlist (InternalNeighborList & list, - const NNPAtomMap & map) -{ - const vector & fwd_map = map.get_fwd_map(); - int nloc = fwd_map.size(); - for (unsigned ii = 0; ii < list.ilist.size(); ++ii){ - if (list.ilist[ii] < nloc) { - list.ilist[ii] = fwd_map[list.ilist[ii]]; - } - } - for (unsigned ii = 0; ii < list.jlist.size(); ++ii){ - if (list.jlist[ii] < nloc) { - list.jlist[ii] = fwd_map[list.jlist[ii]]; - } - } -} - -static int -make_input_tensors (std::vector> & input_tensors, - const vector & dcoord_, - const int & ntypes, - const vector & datype_, - const vector & dbox, - const VALUETYPE & cell_size, - const vector & fparam_, - const vector & aparam_, - const NNPAtomMap&nnpmap, - const int nghost = 0) -{ - bool b_ghost = (nghost != 0); - - assert (dbox.size() == 9); - - int nframes = 1; - int nall = dcoord_.size() / 3; - int nloc = nall - nghost; - assert (nall == datype_.size()); - - vector datype = nnpmap.get_type(); - vector type_count (ntypes, 0); - for (unsigned ii = 0; ii < datype.size(); ++ii){ - type_count[datype[ii]] ++; - } - datype.insert 
(datype.end(), datype_.begin() + nloc, datype_.end()); - - SimulationRegion region; - vector dbox_(9); - for (int dd = 0; dd < 9; ++dd) dbox_[dd] = dbox[dd]; - region.reinitBox (&dbox_[0]); - double box_l[3]; - region.toFaceDistance (box_l); - - vector ncell (3, 2); - for (int dd = 0; dd < 3; ++dd){ - ncell[dd] = box_l[dd] / cell_size; - if (ncell[dd] < 2) ncell[dd] = 2; - } - vector next(3, 0); - for (int dd = 0; dd < 3; ++dd){ - double cellh = box_l[dd] / ncell[dd]; - next[dd] = cellh / cell_size; - if (next[dd] * cellh < cell_size) next[dd]++; - assert (next[dd] * cellh >= cell_size); - } - - TensorShape coord_shape ; - coord_shape.AddDim (nframes); - coord_shape.AddDim (nall * 3); - TensorShape type_shape ; - type_shape.AddDim (nframes); - type_shape.AddDim (nall); - TensorShape box_shape ; - box_shape.AddDim (nframes); - box_shape.AddDim (9); - TensorShape mesh_shape ; - if (!b_ghost){ - mesh_shape.AddDim (6); - } - else { - mesh_shape.AddDim (12); - } - TensorShape natoms_shape ; - natoms_shape.AddDim (2 + ntypes); - TensorShape fparam_shape ; - fparam_shape.AddDim (nframes); - fparam_shape.AddDim (fparam_.size()); - TensorShape aparam_shape ; - aparam_shape.AddDim (nframes); - aparam_shape.AddDim (aparam_.size()); - -#ifdef HIGH_PREC - Tensor coord_tensor (DT_DOUBLE, coord_shape); - Tensor box_tensor (DT_DOUBLE, box_shape); - Tensor fparam_tensor (DT_DOUBLE, fparam_shape); - Tensor aparam_tensor (DT_DOUBLE, aparam_shape); -#else - Tensor coord_tensor (DT_FLOAT, coord_shape); - Tensor box_tensor (DT_FLOAT, box_shape); - Tensor fparam_tensor (DT_FLOAT, fparam_shape); - Tensor aparam_tensor (DT_FLOAT, aparam_shape); -#endif - Tensor type_tensor (DT_INT32, type_shape); - Tensor mesh_tensor (DT_INT32, mesh_shape); - Tensor natoms_tensor (DT_INT32, natoms_shape); - - auto coord = coord_tensor.matrix (); - auto type = type_tensor.matrix (); - auto box = box_tensor.matrix (); - auto mesh = mesh_tensor.flat (); - auto natoms = natoms_tensor.flat (); - auto fparam = 
fparam_tensor.matrix (); - auto aparam = aparam_tensor.matrix (); - - vector dcoord (dcoord_); - nnpmap.forward (dcoord.begin(), dcoord_.begin(), 3); - - for (int ii = 0; ii < nframes; ++ii){ - for (int jj = 0; jj < nall * 3; ++jj){ - coord(ii, jj) = dcoord[jj]; - } - for (int jj = 0; jj < 9; ++jj){ - box(ii, jj) = dbox[jj]; - } - for (int jj = 0; jj < nall; ++jj){ - type(ii, jj) = datype[jj]; - } - for (int jj = 0; jj < fparam_.size(); ++jj){ - fparam(ii, jj) = fparam_[jj]; - } - for (int jj = 0; jj < aparam_.size(); ++jj){ - aparam(ii, jj) = aparam_[jj]; - } - } - mesh (1-1) = 0; - mesh (2-1) = 0; - mesh (3-1) = 0; - mesh (4-1) = ncell[0]; - mesh (5-1) = ncell[1]; - mesh (6-1) = ncell[2]; - if (b_ghost){ - mesh(7-1) = -next[0]; - mesh(8-1) = -next[1]; - mesh(9-1) = -next[2]; - mesh(10-1) = ncell[0] + next[0]; - mesh(11-1) = ncell[1] + next[1]; - mesh(12-1) = ncell[2] + next[2]; - } - natoms (0) = nloc; - natoms (1) = nall; - for (int ii = 0; ii < ntypes; ++ii) natoms(ii+2) = type_count[ii]; - - input_tensors = { - {"t_coord", coord_tensor}, - {"t_type", type_tensor}, - {"t_box", box_tensor}, - {"t_mesh", mesh_tensor}, - {"t_natoms",natoms_tensor}, - }; - if (fparam_.size() > 0) { - input_tensors.push_back({"t_fparam", fparam_tensor}); - } - if (aparam_.size() > 0) { - input_tensors.push_back({"t_aparam", aparam_tensor}); - } - return nloc; -} - -static int -make_input_tensors (std::vector> & input_tensors, - const vector & dcoord_, - const int & ntypes, - const vector & datype_, - const vector & dbox, - InternalNeighborList & dlist, - const vector & fparam_, - const vector & aparam_, - const NNPAtomMap&nnpmap, - const int nghost) -{ - assert (dbox.size() == 9); - - int nframes = 1; - int nall = dcoord_.size() / 3; - int nloc = nall - nghost; - assert (nall == datype_.size()); - - vector datype = nnpmap.get_type(); - vector type_count (ntypes, 0); - for (unsigned ii = 0; ii < datype.size(); ++ii){ - type_count[datype[ii]] ++; - } - datype.insert (datype.end(), 
datype_.begin() + nloc, datype_.end()); - - TensorShape coord_shape ; - coord_shape.AddDim (nframes); - coord_shape.AddDim (nall * 3); - TensorShape type_shape ; - type_shape.AddDim (nframes); - type_shape.AddDim (nall); - TensorShape box_shape ; - box_shape.AddDim (nframes); - box_shape.AddDim (9); - TensorShape mesh_shape ; - mesh_shape.AddDim (16); - TensorShape natoms_shape ; - natoms_shape.AddDim (2 + ntypes); - TensorShape fparam_shape ; - fparam_shape.AddDim (nframes); - fparam_shape.AddDim (fparam_.size()); - TensorShape aparam_shape ; - aparam_shape.AddDim (nframes); - aparam_shape.AddDim (aparam_.size()); - -#ifdef HIGH_PREC - Tensor coord_tensor (DT_DOUBLE, coord_shape); - Tensor box_tensor (DT_DOUBLE, box_shape); - Tensor fparam_tensor (DT_DOUBLE, fparam_shape); - Tensor aparam_tensor (DT_DOUBLE, aparam_shape); -#else - Tensor coord_tensor (DT_FLOAT, coord_shape); - Tensor box_tensor (DT_FLOAT, box_shape); - Tensor fparam_tensor (DT_FLOAT, fparam_shape); - Tensor aparam_tensor (DT_FLOAT, aparam_shape); -#endif - Tensor type_tensor (DT_INT32, type_shape); - Tensor mesh_tensor (DT_INT32, mesh_shape); - Tensor natoms_tensor (DT_INT32, natoms_shape); - - auto coord = coord_tensor.matrix (); - auto type = type_tensor.matrix (); - auto box = box_tensor.matrix (); - auto mesh = mesh_tensor.flat (); - auto natoms = natoms_tensor.flat (); - auto fparam = fparam_tensor.matrix (); - auto aparam = aparam_tensor.matrix (); - - vector dcoord (dcoord_); - nnpmap.forward (dcoord.begin(), dcoord_.begin(), 3); - - for (int ii = 0; ii < nframes; ++ii){ - for (int jj = 0; jj < nall * 3; ++jj){ - coord(ii, jj) = dcoord[jj]; - } - for (int jj = 0; jj < 9; ++jj){ - box(ii, jj) = dbox[jj]; - } - for (int jj = 0; jj < nall; ++jj){ - type(ii, jj) = datype[jj]; - } - for (int jj = 0; jj < fparam_.size(); ++jj){ - fparam(ii, jj) = fparam_[jj]; - } - for (int jj = 0; jj < aparam_.size(); ++jj){ - aparam(ii, jj) = aparam_[jj]; - } - } - - for (int ii = 0; ii < 16; ++ii) mesh(ii) = 
0; - - mesh (0) = sizeof(int *) / sizeof(int); - assert (mesh(0) * sizeof(int) == sizeof(int *)); - const int & stride = mesh(0); - mesh (1) = dlist.ilist.size(); - assert (mesh(1) == nloc); - assert (stride <= 4); - dlist.make_ptrs(); - memcpy (&mesh(4), &(dlist.pilist), sizeof(int *)); - memcpy (&mesh(8), &(dlist.pjrange), sizeof(int *)); - memcpy (&mesh(12), &(dlist.pjlist), sizeof(int *)); - - natoms (0) = nloc; - natoms (1) = nall; - for (int ii = 0; ii < ntypes; ++ii) natoms(ii+2) = type_count[ii]; - - input_tensors = { - {"t_coord", coord_tensor}, - {"t_type", type_tensor}, - {"t_box", box_tensor}, - {"t_mesh", mesh_tensor}, - {"t_natoms", natoms_tensor}, - }; - if (fparam_.size() > 0) { - input_tensors.push_back({"t_fparam", fparam_tensor}); - } - if (aparam_.size() > 0) { - input_tensors.push_back({"t_aparam", aparam_tensor}); - } - - return nloc; -} - -static int make_input_tensors ( - vector> & input_tensors, - const vector & dcoord_, - const int & ntypes, - const vector & datype_, - const vector & dbox, - const int * ilist, - const int * jrange, - const int * jlist, - int * array_int, - unsigned long long * array_longlong, - compute_t * array_double, - const vector & fparam_, - const vector & aparam_, - const NNPAtomMap & nnpmap, - const int & nghost) -{ - assert (dbox.size() == 9); - - int nframes = 1; - int nall = dcoord_.size() / 3; - int nloc = nall - nghost; - assert (nall == datype_.size()); - - vector datype = nnpmap.get_type(); - vector type_count (ntypes, 0); - for (unsigned ii = 0; ii < datype.size(); ++ii) { - type_count[datype[ii]] ++; - } - datype.insert (datype.end(), datype_.begin() + nloc, datype_.end()); - - TensorShape coord_shape ; - coord_shape.AddDim (nframes); - coord_shape.AddDim (nall * 3); - TensorShape type_shape ; - type_shape.AddDim (nframes); - type_shape.AddDim (nall); - TensorShape box_shape ; - box_shape.AddDim (nframes); - box_shape.AddDim (9); - TensorShape mesh_shape; - mesh_shape.AddDim (32); - TensorShape 
natoms_shape; - natoms_shape.AddDim (2 + ntypes); - TensorShape fparam_shape; - fparam_shape.AddDim (nframes); - fparam_shape.AddDim (fparam_.size()); - TensorShape aparam_shape ; - aparam_shape.AddDim (nframes); - aparam_shape.AddDim (aparam_.size()); - - #ifdef HIGH_PREC - Tensor coord_tensor (DT_DOUBLE, coord_shape); - Tensor box_tensor (DT_DOUBLE, box_shape); - Tensor fparam_tensor(DT_DOUBLE, fparam_shape); - Tensor aparam_tensor(DT_DOUBLE, fparam_shape); - #else - Tensor coord_tensor (DT_FLOAT, coord_shape); - Tensor box_tensor (DT_FLOAT, box_shape); - Tensor fparam_tensor(DT_FLOAT, fparam_shape); - Tensor aparam_tensor(DT_FLOAT, fparam_shape); - #endif - Tensor type_tensor (DT_INT32, type_shape); - Tensor mesh_tensor (DT_INT32, mesh_shape); - Tensor natoms_tensor(DT_INT32, natoms_shape); - - auto coord = coord_tensor.matrix (); - auto type = type_tensor.matrix (); - auto box = box_tensor.matrix (); - auto mesh = mesh_tensor.flat (); - auto natoms = natoms_tensor.flat (); - auto fparam = fparam_tensor.matrix (); - auto aparam = aparam_tensor.matrix (); - - vector dcoord (dcoord_); - nnpmap.forward (dcoord.begin(), dcoord_.begin(), 3); - - for (int ii = 0; ii < nframes; ++ii) { - for (int jj = 0; jj < nall * 3; ++jj) { - coord(ii, jj) = dcoord[jj]; - } - for (int jj = 0; jj < 9; ++jj) { - box(ii, jj) = dbox[jj]; - } - for (int jj = 0; jj < nall; ++jj) { - type(ii, jj) = datype[jj]; - } - for (int jj = 0; jj < fparam_.size(); ++jj) { - fparam(ii, jj) = fparam_[jj]; - } - for (int jj = 0; jj < aparam_.size(); ++jj) { - aparam(ii, jj) = aparam_[jj]; - } - } - - for (int ii = 0; ii < 32; ++ii) mesh(ii) = 0; - - mesh (0) = sizeof(int *) / sizeof(int); - assert (mesh(0) * sizeof(int) == sizeof(int *)); - const int & stride = mesh(0); - // mesh (1) = dlist.ilist.size(); - mesh (1) = nloc; - assert (mesh(1) == nloc); - assert (stride <= 4); - memcpy (&mesh(4), &(ilist), sizeof(int *)); - memcpy (&mesh(8), &(jrange), sizeof(int *)); - memcpy (&mesh(12), &(jlist), 
sizeof(int *)); - memcpy (&mesh(16), &(array_int), sizeof(int *)); - memcpy (&mesh(20), &(array_longlong), sizeof(unsigned long long *)); - memcpy (&mesh(24), &(array_double), sizeof(compute_t *)); - - natoms (0) = nloc; - natoms (1) = nall; - for (int ii = 0; ii < ntypes; ++ii) natoms(ii+2) = type_count[ii]; - - input_tensors = { - {"t_coord", coord_tensor}, - {"t_type", type_tensor}, - {"t_box", box_tensor}, - {"t_mesh", mesh_tensor}, - {"t_natoms", natoms_tensor}, - }; - if (fparam_.size() > 0) { - input_tensors.push_back({"t_fparam", fparam_tensor}); - } - if (aparam_.size() > 0) { - input_tensors.push_back({"t_aparam", aparam_tensor}); - } - return nloc; -} static void run_model (ENERGYTYPE & dener, @@ -705,28 +251,6 @@ static void run_model (ENERGYTYPE & dener, #endif } -static void -get_env_nthreads(int & num_intra_nthreads, - int & num_inter_nthreads) -{ - num_intra_nthreads = 0; - num_inter_nthreads = 0; - const char* env_intra_nthreads = std::getenv("OMP_NUM_THREADS"); - const char* env_inter_nthreads = std::getenv("TF_INTER_OP_PARALLELISM_THREADS"); - if (env_intra_nthreads && - string(env_intra_nthreads) != string("") && - atoi(env_intra_nthreads) >= 0 - ) { - num_intra_nthreads = atoi(env_intra_nthreads); - } - if (env_inter_nthreads && - string(env_inter_nthreads) != string("") && - atoi(env_inter_nthreads) >= 0 - ) { - num_inter_nthreads = atoi(env_inter_nthreads); - } -} - NNPInter:: NNPInter () @@ -921,14 +445,7 @@ VT NNPInter:: get_scalar (const string & name) const { - std::vector output_tensors; - checkStatus (session->Run(std::vector> ({}), - {name.c_str()}, - {}, - &output_tensors)); - Tensor output_rc = output_tensors[0]; - auto orc = output_rc.flat (); - return orc(0); + return session_get_scalar(session, name); } std::string graph_info(const GraphDef & graph_def) { @@ -949,6 +466,7 @@ std::string graph_info(const GraphDef & graph_def) { // std::cout << str << std::endl; } } + return str; } // init the tmp array data @@ -1008,7 +526,7 @@ 
compute (ENERGYTYPE & dener, validate_fparam_aparam(nloc, fparam, aparam); std::vector> input_tensors; - int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, cell_size, fparam, aparam, nnpmap, nghost); + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, cell_size, fparam, aparam, nnpmap, nghost); assert (ret == nloc); run_model (dener, dforce_, dvirial, session, input_tensors, nnpmap, nghost); @@ -1026,10 +544,49 @@ compute (ENERGYTYPE & dener, const LammpsNeighborList & lmp_list, const int & ago, const vector & fparam, - const vector & aparam) + const vector & aparam_) +{ + vector dcoord, dforce, aparam; + vector datype, fwd_map, bkw_map; + int nghost_real; + select_real_atoms(fwd_map, bkw_map, nghost_real, dcoord_, datype_, nghost, ntypes); + // resize to nall_real + dcoord.resize(bkw_map.size() * 3); + datype.resize(bkw_map.size()); + // fwd map + select_map(dcoord, dcoord_, fwd_map, 3); + select_map(datype, datype_, fwd_map, 1); + // aparam + if (daparam > 0){ + aparam.resize(bkw_map.size()); + select_map(aparam, aparam_, fwd_map, daparam); + } + // internal nlist + if (ago == 0){ + convert_nlist_lmp_internal(nlist, lmp_list); + shuffle_nlist_exclude_empty(nlist, fwd_map); + } + compute_inner(dener, dforce, dvirial, dcoord, datype, dbox, nghost_real, ago, fparam, aparam); + // bkw map + select_map(dforce_, dforce, bkw_map, 3); +} + +void +NNPInter:: +compute_inner (ENERGYTYPE & dener, + vector & dforce_, + vector & dvirial, + const vector & dcoord_, + const vector & datype_, + const vector & dbox, + const int nghost, + const int & ago, + const vector & fparam, + const vector & aparam) { int nall = dcoord_.size() / 3; int nloc = nall - nghost; + validate_fparam_aparam(nloc, fparam, aparam); std::vector> input_tensors; @@ -1038,17 +595,16 @@ compute (ENERGYTYPE & dener, nnpmap = NNPAtomMap (datype_.begin(), datype_.begin() + nloc); assert (nloc == nnpmap.get_type().size()); - // InternalNeighborList nlist; - 
convert_nlist_lmp_internal (nlist, lmp_list); - shuffle_nlist (nlist, nnpmap); + shuffle_nlist (nlist, nnpmap); #ifdef USE_CUDA_TOOLKIT update_nbor(nlist, nloc); #endif } + #ifdef USE_CUDA_TOOLKIT - int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); #else - int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); #endif assert (nloc == ret); run_model (dener, dforce_, dvirial, session, input_tensors, nnpmap, nghost); @@ -1072,7 +628,7 @@ compute (ENERGYTYPE & dener, validate_fparam_aparam(nnpmap.get_type().size(), fparam, aparam); std::vector> input_tensors; - int nloc = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, cell_size, fparam, aparam, nnpmap); + int nloc = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, cell_size, fparam, aparam, nnpmap); run_model (dener, dforce_, dvirial, datom_energy_, datom_virial_, session, input_tensors, nnpmap); } @@ -1110,17 +666,22 @@ compute (ENERGYTYPE & dener, #ifdef USE_CUDA_TOOLKIT update_nbor(nlist, nloc); #endif - } + #ifdef USE_CUDA_TOOLKIT - int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); #else - int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); 
+ int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); #endif assert (nloc == ret); run_model (dener, dforce_, dvirial, datom_energy_, datom_virial_, session, input_tensors, nnpmap, nghost); } +void +NNPInter:: +get_type_map(std::string & type_map){ + type_map = get_scalar("model_attr/tmap"); +} @@ -1255,18 +816,12 @@ get_scalar(const string name) const { VT myrcut = 0; for (unsigned ii = 0; ii < numb_models; ++ii){ - std::vector output_tensors; - checkStatus (sessions[ii]->Run(std::vector> ({}), - {name.c_str()}, - {}, - &output_tensors)); - Tensor output_rc = output_tensors[0]; - auto orc = output_rc.flat (); + VT ret = session_get_scalar(sessions[ii], name); if (ii == 0){ - myrcut = orc(0); + myrcut = ret; } else { - assert (myrcut == orc(0)); + assert (myrcut == ret); } } return myrcut; @@ -1426,7 +981,7 @@ compute (ENERGYTYPE & dener, validate_fparam_aparam(nnpmap.get_type().size(), fparam, aparam); std::vector> input_tensors; - int nloc = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, cell_size, fparam, aparam, nnpmap); + int nloc = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, cell_size, fparam, aparam, nnpmap); vector all_energy (numb_models); vector > all_force (numb_models); @@ -1489,9 +1044,9 @@ compute (vector & all_energy, } #ifdef USE_CUDA_TOOLKIT - int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); #else - int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); #endif 
all_energy.resize (numb_models); @@ -1539,9 +1094,9 @@ compute (vector & all_energy, } #ifdef USE_CUDA_TOOLKIT - int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); #else - int ret = make_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); #endif all_energy.resize (numb_models); diff --git a/source/lib/src/NeighborList.cpp b/source/lib/src/NeighborList.cpp index 282e0877b5..e7ced4834b 100644 --- a/source/lib/src/NeighborList.cpp +++ b/source/lib/src/NeighborList.cpp @@ -572,6 +572,58 @@ build_nlist (vector > & nlist0, } } + +void +build_nlist (vector > & nlist0, + vector > & nlist1, + const vector & posi3, + const double & rc0_, + const double & rc1_, + const SimulationRegion * region) +{ + double rc0 (rc0_); + double rc1 (rc1_); + assert (rc0 <= rc1); + double rc02 = rc0 * rc0; + // negative rc0 means not applying rc0 + if (rc0 < 0) rc02 = 0; + double rc12 = rc1 * rc1; + + unsigned natoms = posi3.size()/3; + nlist0.clear(); + nlist1.clear(); + nlist0.resize(natoms); + nlist1.resize(natoms); + for (unsigned ii = 0; ii < natoms; ++ii){ + nlist0[ii].reserve (60); + nlist1[ii].reserve (60); + } + for (unsigned ii = 0; ii < natoms; ++ii){ + for (unsigned jj = ii+1; jj < natoms; ++jj){ + double diff[3]; + if (region != NULL) { + region->diffNearestNeighbor (posi3[jj*3+0], posi3[jj*3+1], posi3[jj*3+2], + posi3[ii*3+0], posi3[ii*3+1], posi3[ii*3+2], + diff[0], diff[1], diff[2]); + } + else { + diff[0] = posi3[jj*3+0] - posi3[ii*3+0]; + diff[1] = posi3[jj*3+1] - posi3[ii*3+1]; + diff[2] = posi3[jj*3+2] - 
posi3[ii*3+2]; + } + double r2 = MathUtilities::dot (diff, diff); + if (r2 < rc02) { + nlist0[ii].push_back (jj); + nlist0[jj].push_back (ii); + } + else if (r2 < rc12) { + nlist1[ii].push_back (jj); + nlist1[jj].push_back (ii); + } + } + } +} + static int compute_pbc_shift (int idx, int ncell) { diff --git a/source/lib/src/common.cc b/source/lib/src/common.cc new file mode 100644 index 0000000000..296800a396 --- /dev/null +++ b/source/lib/src/common.cc @@ -0,0 +1,602 @@ +#include "common.h" +#include "NNPAtomMap.h" +#include "SimulationRegion.h" + +void +select_by_type(vector & fwd_map, + vector & bkw_map, + int & nghost_real, + const vector & dcoord_, + const vector & datype_, + const int & nghost, + const vector & sel_type_) +{ + vector sel_type (sel_type_); + sort(sel_type.begin(), sel_type.end()); + int nall = dcoord_.size() / 3; + int nloc = nall - nghost; + int nloc_real = 0; + nghost_real = 0; + fwd_map.resize(nall); + bkw_map.clear(); + bkw_map.reserve(nall); + int cc = 0; + for (int ii = 0; ii < nall; ++ii){ + // exclude virtual sites + // select the type with id < ntypes + if (binary_search(sel_type.begin(), sel_type.end(), datype_[ii])){ + bkw_map.push_back(ii); + if (ii < nloc) { + nloc_real += 1; + } + else{ + nghost_real += 1; + } + fwd_map[ii] = cc; + cc ++; + } + else{ + fwd_map[ii] = -1; + } + } + assert((nloc_real+nghost_real) == bkw_map.size()); +} + + +void +select_real_atoms(vector & fwd_map, + vector & bkw_map, + int & nghost_real, + const vector & dcoord_, + const vector & datype_, + const int & nghost, + const int & ntypes) +{ + vector sel_type; + for (int ii = 0; ii < ntypes; ++ii){ + sel_type.push_back(ii); + } + select_by_type(fwd_map, bkw_map, nghost_real, dcoord_, datype_, nghost, sel_type); +} + +void +convert_nlist_lmp_internal (InternalNeighborList & list, + const LammpsNeighborList & lmp_list) +{ + list.clear(); + int total_num_nei = 0; + int inum = lmp_list.inum; + for (int ii = 0; ii < inum; ++ii){ + total_num_nei += 
lmp_list.numneigh[ii]; + } + list.ilist.resize(inum); + list.jrange.resize(inum+1); + list.jlist.resize(total_num_nei); + memcpy(&list.ilist[0], lmp_list.ilist, inum*sizeof(int)); + list.jrange[0] = 0; + for (int ii = 0; ii < inum; ++ii){ + int jnum = lmp_list.numneigh[ii]; + list.jrange[ii+1] = list.jrange[ii] + jnum; + const int * jlist = lmp_list.firstneigh[ii]; + memcpy(&(list.jlist[list.jrange[ii]]), jlist, jnum*sizeof(int)); + } +} + +void +shuffle_nlist (InternalNeighborList & list, + const NNPAtomMap & map) +{ + const vector & fwd_map = map.get_fwd_map(); + shuffle_nlist(list, fwd_map); +} + +void +shuffle_nlist (InternalNeighborList & list, + const vector & fwd_map) +{ + int nloc = fwd_map.size(); + for (unsigned ii = 0; ii < list.ilist.size(); ++ii){ + if (list.ilist[ii] < nloc) { + list.ilist[ii] = fwd_map[list.ilist[ii]]; + } + } + for (unsigned ii = 0; ii < list.jlist.size(); ++ii){ + if (list.jlist[ii] < nloc) { + list.jlist[ii] = fwd_map[list.jlist[ii]]; + } + } +} + +void +shuffle_nlist_exclude_empty (InternalNeighborList & list, + const vector & fwd_map) +{ + int old_nloc = fwd_map.size(); + shuffle_nlist(list, fwd_map); + vector new_ilist, new_jrange, new_jlist, new_icount; + new_ilist.reserve(list.ilist.size()); + new_icount.reserve(list.ilist.size()); + new_jrange.reserve(list.jrange.size()); + new_jlist.reserve(list.jlist.size()); + for(int ii = 0; ii < list.ilist.size(); ++ii){ + if(list.ilist[ii] >= 0){ + new_ilist.push_back(list.ilist[ii]); + } + } + new_jrange.resize(new_ilist.size()+1); + new_jrange[0] = 0; + int ci = 0; + for(int ii = 0; ii < list.ilist.size(); ++ii){ + if (list.ilist[ii] < 0) continue; + int js = list.jrange[ii]; + int je = list.jrange[ii+1]; + int cc = 0; + for (int jj = js; jj < je; ++jj){ + if (list.jlist[jj] >= 0) { + new_jlist.push_back(list.jlist[jj]); + cc++; + } + } + new_jrange[ci+1] = new_jrange[ci] + cc; + ci ++; + } + list.ilist = new_ilist; + list.jrange = new_jrange; + list.jlist = new_jlist; +} + +void 
+checkStatus(const tensorflow::Status& status) { + if (!status.ok()) { + std::cout << status.ToString() << std::endl; + exit(1); + } +} + +void +get_env_nthreads(int & num_intra_nthreads, + int & num_inter_nthreads) +{ + num_intra_nthreads = 0; + num_inter_nthreads = 0; + const char* env_intra_nthreads = std::getenv("TF_INTRA_OP_PARALLELISM_THREADS"); + const char* env_inter_nthreads = std::getenv("TF_INTER_OP_PARALLELISM_THREADS"); + if (env_intra_nthreads && + string(env_intra_nthreads) != string("") && + atoi(env_intra_nthreads) >= 0 + ) { + num_intra_nthreads = atoi(env_intra_nthreads); + } + if (env_inter_nthreads && + string(env_inter_nthreads) != string("") && + atoi(env_inter_nthreads) >= 0 + ) { + num_inter_nthreads = atoi(env_inter_nthreads); + } +} + +string +name_prefix(const string & scope) +{ + string prefix = ""; + if (scope != ""){ + prefix = scope + "/"; + } + return prefix; +} + +int +session_input_tensors (std::vector> & input_tensors, + const vector & dcoord_, + const int & ntypes, + const vector & datype_, + const vector & dbox, + const VALUETYPE & cell_size, + const vector & fparam_, + const vector & aparam_, + const NNPAtomMap& nnpmap, + const int nghost, + const string scope) +{ + bool b_ghost = (nghost != 0); + + assert (dbox.size() == 9); + + int nframes = 1; + int nall = dcoord_.size() / 3; + int nloc = nall - nghost; + assert (nall == datype_.size()); + + vector datype = nnpmap.get_type(); + vector type_count (ntypes, 0); + for (unsigned ii = 0; ii < datype.size(); ++ii){ + type_count[datype[ii]] ++; + } + datype.insert (datype.end(), datype_.begin() + nloc, datype_.end()); + + SimulationRegion region; + vector dbox_(9); + for (int dd = 0; dd < 9; ++dd) dbox_[dd] = dbox[dd]; + region.reinitBox (&dbox_[0]); + double box_l[3]; + region.toFaceDistance (box_l); + + vector ncell (3, 2); + for (int dd = 0; dd < 3; ++dd){ + ncell[dd] = box_l[dd] / cell_size; + if (ncell[dd] < 2) ncell[dd] = 2; + } + vector next(3, 0); + for (int dd = 0; dd < 3; 
++dd){ + double cellh = box_l[dd] / ncell[dd]; + next[dd] = cellh / cell_size; + if (next[dd] * cellh < cell_size) next[dd]++; + assert (next[dd] * cellh >= cell_size); + } + + TensorShape coord_shape ; + coord_shape.AddDim (nframes); + coord_shape.AddDim (nall * 3); + TensorShape type_shape ; + type_shape.AddDim (nframes); + type_shape.AddDim (nall); + TensorShape box_shape ; + box_shape.AddDim (nframes); + box_shape.AddDim (9); + TensorShape mesh_shape ; + if (!b_ghost){ + mesh_shape.AddDim (6); + } + else { + mesh_shape.AddDim (12); + } + TensorShape natoms_shape ; + natoms_shape.AddDim (2 + ntypes); + TensorShape fparam_shape ; + fparam_shape.AddDim (nframes); + fparam_shape.AddDim (fparam_.size()); + TensorShape aparam_shape ; + aparam_shape.AddDim (nframes); + aparam_shape.AddDim (aparam_.size()); + +#ifdef HIGH_PREC + Tensor coord_tensor (DT_DOUBLE, coord_shape); + Tensor box_tensor (DT_DOUBLE, box_shape); + Tensor fparam_tensor (DT_DOUBLE, fparam_shape); + Tensor aparam_tensor (DT_DOUBLE, aparam_shape); +#else + Tensor coord_tensor (DT_FLOAT, coord_shape); + Tensor box_tensor (DT_FLOAT, box_shape); + Tensor fparam_tensor (DT_FLOAT, fparam_shape); + Tensor aparam_tensor (DT_FLOAT, aparam_shape); +#endif + Tensor type_tensor (DT_INT32, type_shape); + Tensor mesh_tensor (DT_INT32, mesh_shape); + Tensor natoms_tensor (DT_INT32, natoms_shape); + + auto coord = coord_tensor.matrix (); + auto type = type_tensor.matrix (); + auto box = box_tensor.matrix (); + auto mesh = mesh_tensor.flat (); + auto natoms = natoms_tensor.flat (); + auto fparam = fparam_tensor.matrix (); + auto aparam = aparam_tensor.matrix (); + + vector dcoord (dcoord_); + nnpmap.forward (dcoord.begin(), dcoord_.begin(), 3); + + for (int ii = 0; ii < nframes; ++ii){ + for (int jj = 0; jj < nall * 3; ++jj){ + coord(ii, jj) = dcoord[jj]; + } + for (int jj = 0; jj < 9; ++jj){ + box(ii, jj) = dbox[jj]; + } + for (int jj = 0; jj < nall; ++jj){ + type(ii, jj) = datype[jj]; + } + for (int jj = 0; jj < 
fparam_.size(); ++jj){ + fparam(ii, jj) = fparam_[jj]; + } + for (int jj = 0; jj < aparam_.size(); ++jj){ + aparam(ii, jj) = aparam_[jj]; + } + } + mesh (1-1) = 0; + mesh (2-1) = 0; + mesh (3-1) = 0; + mesh (4-1) = ncell[0]; + mesh (5-1) = ncell[1]; + mesh (6-1) = ncell[2]; + if (b_ghost){ + mesh(7-1) = -next[0]; + mesh(8-1) = -next[1]; + mesh(9-1) = -next[2]; + mesh(10-1) = ncell[0] + next[0]; + mesh(11-1) = ncell[1] + next[1]; + mesh(12-1) = ncell[2] + next[2]; + } + natoms (0) = nloc; + natoms (1) = nall; + for (int ii = 0; ii < ntypes; ++ii) natoms(ii+2) = type_count[ii]; + + string prefix = ""; + if (scope != ""){ + prefix = scope + "/"; + } + input_tensors = { + {prefix+"t_coord", coord_tensor}, + {prefix+"t_type", type_tensor}, + {prefix+"t_box", box_tensor}, + {prefix+"t_mesh", mesh_tensor}, + {prefix+"t_natoms", natoms_tensor}, + }; + if (fparam_.size() > 0) { + input_tensors.push_back({prefix+"t_fparam", fparam_tensor}); + } + if (aparam_.size() > 0) { + input_tensors.push_back({prefix+"t_aparam", aparam_tensor}); + } + return nloc; +} + +int +session_input_tensors (std::vector> & input_tensors, + const vector & dcoord_, + const int & ntypes, + const vector & datype_, + const vector & dbox, + InternalNeighborList & dlist, + const vector & fparam_, + const vector & aparam_, + const NNPAtomMap& nnpmap, + const int nghost, + const string scope) +{ + assert (dbox.size() == 9); + + int nframes = 1; + int nall = dcoord_.size() / 3; + int nloc = nall - nghost; + assert (nall == datype_.size()); + + vector datype = nnpmap.get_type(); + vector type_count (ntypes, 0); + for (unsigned ii = 0; ii < datype.size(); ++ii){ + type_count[datype[ii]] ++; + } + datype.insert (datype.end(), datype_.begin() + nloc, datype_.end()); + + TensorShape coord_shape ; + coord_shape.AddDim (nframes); + coord_shape.AddDim (nall * 3); + TensorShape type_shape ; + type_shape.AddDim (nframes); + type_shape.AddDim (nall); + TensorShape box_shape ; + box_shape.AddDim (nframes); + 
box_shape.AddDim (9); + TensorShape mesh_shape ; + mesh_shape.AddDim (16); + TensorShape natoms_shape ; + natoms_shape.AddDim (2 + ntypes); + TensorShape fparam_shape ; + fparam_shape.AddDim (nframes); + fparam_shape.AddDim (fparam_.size()); + TensorShape aparam_shape ; + aparam_shape.AddDim (nframes); + aparam_shape.AddDim (aparam_.size()); + +#ifdef HIGH_PREC + Tensor coord_tensor (DT_DOUBLE, coord_shape); + Tensor box_tensor (DT_DOUBLE, box_shape); + Tensor fparam_tensor (DT_DOUBLE, fparam_shape); + Tensor aparam_tensor (DT_DOUBLE, aparam_shape); +#else + Tensor coord_tensor (DT_FLOAT, coord_shape); + Tensor box_tensor (DT_FLOAT, box_shape); + Tensor fparam_tensor (DT_FLOAT, fparam_shape); + Tensor aparam_tensor (DT_FLOAT, aparam_shape); +#endif + Tensor type_tensor (DT_INT32, type_shape); + Tensor mesh_tensor (DT_INT32, mesh_shape); + Tensor natoms_tensor (DT_INT32, natoms_shape); + + auto coord = coord_tensor.matrix (); + auto type = type_tensor.matrix (); + auto box = box_tensor.matrix (); + auto mesh = mesh_tensor.flat (); + auto natoms = natoms_tensor.flat (); + auto fparam = fparam_tensor.matrix (); + auto aparam = aparam_tensor.matrix (); + + vector dcoord (dcoord_); + nnpmap.forward (dcoord.begin(), dcoord_.begin(), 3); + + for (int ii = 0; ii < nframes; ++ii){ + for (int jj = 0; jj < nall * 3; ++jj){ + coord(ii, jj) = dcoord[jj]; + } + for (int jj = 0; jj < 9; ++jj){ + box(ii, jj) = dbox[jj]; + } + for (int jj = 0; jj < nall; ++jj){ + type(ii, jj) = datype[jj]; + } + for (int jj = 0; jj < fparam_.size(); ++jj){ + fparam(ii, jj) = fparam_[jj]; + } + for (int jj = 0; jj < aparam_.size(); ++jj){ + aparam(ii, jj) = aparam_[jj]; + } + } + + for (int ii = 0; ii < 16; ++ii) mesh(ii) = 0; + + mesh (0) = sizeof(int *) / sizeof(int); + assert (mesh(0) * sizeof(int) == sizeof(int *)); + const int & stride = mesh(0); + mesh (1) = dlist.ilist.size(); + assert (mesh(1) == nloc); + assert (stride <= 4); + dlist.make_ptrs(); + memcpy (&mesh(4), &(dlist.pilist), 
sizeof(int *)); + memcpy (&mesh(8), &(dlist.pjrange), sizeof(int *)); + memcpy (&mesh(12), &(dlist.pjlist), sizeof(int *)); + + natoms (0) = nloc; + natoms (1) = nall; + for (int ii = 0; ii < ntypes; ++ii) natoms(ii+2) = type_count[ii]; + + string prefix = ""; + if (scope != ""){ + prefix = scope + "/"; + } + input_tensors = { + {prefix+"t_coord", coord_tensor}, + {prefix+"t_type", type_tensor}, + {prefix+"t_box", box_tensor}, + {prefix+"t_mesh", mesh_tensor}, + {prefix+"t_natoms",natoms_tensor}, + }; + if (fparam_.size() > 0) { + input_tensors.push_back({prefix+"t_fparam", fparam_tensor}); + } + if (aparam_.size() > 0) { + input_tensors.push_back({prefix+"t_aparam", aparam_tensor}); + } + + return nloc; +} + +int +session_input_tensors ( + vector> & input_tensors, + const vector & dcoord_, + const int & ntypes, + const vector & datype_, + const vector & dbox, + const int * ilist, + const int * jrange, + const int * jlist, + int * array_int, + unsigned long long * array_longlong, + double * array_double, + const vector & fparam_, + const vector & aparam_, + const NNPAtomMap & nnpmap, + const int & nghost) +{ + assert (dbox.size() == 9); + + int nframes = 1; + int nall = dcoord_.size() / 3; + int nloc = nall - nghost; + assert (nall == datype_.size()); + + vector datype = nnpmap.get_type(); + vector type_count (ntypes, 0); + for (unsigned ii = 0; ii < datype.size(); ++ii) { + type_count[datype[ii]] ++; + } + datype.insert (datype.end(), datype_.begin() + nloc, datype_.end()); + + TensorShape coord_shape ; + coord_shape.AddDim (nframes); + coord_shape.AddDim (nall * 3); + TensorShape type_shape ; + type_shape.AddDim (nframes); + type_shape.AddDim (nall); + TensorShape box_shape ; + box_shape.AddDim (nframes); + box_shape.AddDim (9); + TensorShape mesh_shape; + mesh_shape.AddDim (32); + TensorShape natoms_shape; + natoms_shape.AddDim (2 + ntypes); + TensorShape fparam_shape; + fparam_shape.AddDim (nframes); + fparam_shape.AddDim (fparam_.size()); + TensorShape 
aparam_shape ; + aparam_shape.AddDim (nframes); + aparam_shape.AddDim (aparam_.size()); + + #ifdef HIGH_PREC + Tensor coord_tensor (DT_DOUBLE, coord_shape); + Tensor box_tensor (DT_DOUBLE, box_shape); + Tensor fparam_tensor(DT_DOUBLE, fparam_shape); + Tensor aparam_tensor(DT_DOUBLE, fparam_shape); + #else + Tensor coord_tensor (DT_FLOAT, coord_shape); + Tensor box_tensor (DT_FLOAT, box_shape); + Tensor fparam_tensor(DT_FLOAT, fparam_shape); + Tensor aparam_tensor(DT_FLOAT, fparam_shape); + #endif + Tensor type_tensor (DT_INT32, type_shape); + Tensor mesh_tensor (DT_INT32, mesh_shape); + Tensor natoms_tensor(DT_INT32, natoms_shape); + + auto coord = coord_tensor.matrix (); + auto type = type_tensor.matrix (); + auto box = box_tensor.matrix (); + auto mesh = mesh_tensor.flat (); + auto natoms = natoms_tensor.flat (); + auto fparam = fparam_tensor.matrix (); + auto aparam = aparam_tensor.matrix (); + + vector dcoord (dcoord_); + nnpmap.forward (dcoord.begin(), dcoord_.begin(), 3); + + for (int ii = 0; ii < nframes; ++ii) { + for (int jj = 0; jj < nall * 3; ++jj) { + coord(ii, jj) = dcoord[jj]; + } + for (int jj = 0; jj < 9; ++jj) { + box(ii, jj) = dbox[jj]; + } + for (int jj = 0; jj < nall; ++jj) { + type(ii, jj) = datype[jj]; + } + for (int jj = 0; jj < fparam_.size(); ++jj) { + fparam(ii, jj) = fparam_[jj]; + } + for (int jj = 0; jj < aparam_.size(); ++jj) { + aparam(ii, jj) = aparam_[jj]; + } + } + + for (int ii = 0; ii < 32; ++ii) mesh(ii) = 0; + + mesh (0) = sizeof(int *) / sizeof(int); + assert (mesh(0) * sizeof(int) == sizeof(int *)); + const int & stride = mesh(0); + // mesh (1) = dlist.ilist.size(); + mesh (1) = nloc; + assert (mesh(1) == nloc); + assert (stride <= 4); + memcpy (&mesh(4), &(ilist), sizeof(int *)); + memcpy (&mesh(8), &(jrange), sizeof(int *)); + memcpy (&mesh(12), &(jlist), sizeof(int *)); + memcpy (&mesh(16), &(array_int), sizeof(int *)); + memcpy (&mesh(20), &(array_longlong), sizeof(unsigned long long *)); + memcpy (&mesh(24), 
&(array_double), sizeof(double *)); + + natoms (0) = nloc; + natoms (1) = nall; + for (int ii = 0; ii < ntypes; ++ii) natoms(ii+2) = type_count[ii]; + + input_tensors = { + {"t_coord", coord_tensor}, + {"t_type", type_tensor}, + {"t_box", box_tensor}, + {"t_mesh", mesh_tensor}, + {"t_natoms", natoms_tensor}, + }; + if (fparam_.size() > 0) { + input_tensors.push_back({"t_fparam", fparam_tensor}); + } + if (aparam_.size() > 0) { + input_tensors.push_back({"t_aparam", aparam_tensor}); + } + return nloc; +} diff --git a/source/lmp/env.sh.in b/source/lmp/env.sh.in index 2a9253ba70..00bc3b18b6 100644 --- a/source/lmp/env.sh.in +++ b/source/lmp/env.sh.in @@ -6,6 +6,6 @@ TF_INCLUDE_DIRS=`echo $TENSORFLOW_INCLUDE_DIRS | sed "s/;/ -I/g"` TF_LIBRARY_PATH=`echo $TENSORFLOW_LIBRARY_PATH | sed "s/;/ -L/g"` TF_RPATH=`echo $TENSORFLOW_LIBRARY_PATH | sed "s/;/ -Wl,-rpath=/g"` -NNP_INC=" -std=c++11 @PREC_DEF@ @TTM_DEF@ -I$TF_INCLUDE_DIRS -I$DEEPMD_ROOT/include/deepmd " +NNP_INC=" -std=c++11 @PREC_DEF@ @TTM_DEF@ @OLD_LMP_PPPM_DEF@ -I$TF_INCLUDE_DIRS -I$DEEPMD_ROOT/include/deepmd " NNP_PATH=" -L$TF_LIBRARY_PATH -L$DEEPMD_ROOT/lib" NNP_LIB=" -Wl,--no-as-needed -l@LIB_DEEPMD_OP@ -l@LIB_DEEPMD_OP_CUDA@ -l@LIB_DEEPMD@ -ltensorflow_cc -ltensorflow_framework -Wl,-rpath=$TF_RPATH -Wl,-rpath=$DEEPMD_ROOT/lib" diff --git a/source/lmp/fix_dplr.cpp b/source/lmp/fix_dplr.cpp new file mode 100644 index 0000000000..c0f0bb930c --- /dev/null +++ b/source/lmp/fix_dplr.cpp @@ -0,0 +1,470 @@ +#include +#include +#include +#include "atom.h" +#include "domain.h" +#include "comm.h" +#include "force.h" +#include "update.h" +#include "error.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "fix.h" +#include "fix_dplr.h" +#include "pppm_dplr.h" + +using namespace LAMMPS_NS; +using namespace FixConst; +using namespace std; + +static bool +is_key (const string& input) +{ + vector keys ; + keys.push_back("model"); + keys.push_back("type_associate"); + keys.push_back("bond_type"); + for (int ii = 0; ii 
< keys.size(); ++ii){ + if (input == keys[ii]) { + return true; + } + } + return false; +} + + +FixDPLR::FixDPLR(LAMMPS *lmp, int narg, char **arg) + :Fix(lmp, narg, arg) +{ + virial_flag = 1; + + if (strcmp(update->unit_style,"metal") != 0) { + error->all(FLERR,"Pair deepmd requires metal unit, please set it by \"units metal\""); + } + + int iarg = 3; + vector map_vec; + bond_type.clear(); + while (iarg < narg) { + if (! is_key(arg[iarg])) { + error->all(FLERR,"Illegal pair_style command\nwrong number of parameters\n"); + } + if (string(arg[iarg]) == string("model")) { + if (iarg+1 > narg) error->all(FLERR,"Illegal fix adapt command"); + model = string(arg[iarg+1]); + iarg += 2; + } + if (string(arg[iarg]) == string("type_associate")) { + int iend = iarg+1; + while (iend < narg && (! is_key(arg[iend]) )) { + map_vec.push_back(atoi(arg[iend])-1); + iend ++; + } + iarg = iend; + } + if (string(arg[iarg]) == string("bond_type")) { + int iend = iarg+1; + while (iend < narg && (! is_key(arg[iend]) )) { + bond_type.push_back(atoi(arg[iend])-1); + iend ++; + } + sort(bond_type.begin(), bond_type.end()); + iarg = iend; + } + else { + break; + } + } + assert(map_vec.size() % 2 == 0), "number of ints provided by type_associate should be even"; + for (int ii = 0; ii < map_vec.size()/2; ++ii){ + type_asso[map_vec[ii*2+0]] = map_vec[ii*2+1]; + bk_type_asso[map_vec[ii*2+1]] = map_vec[ii*2+0]; + } + + // dpt.init(model); + // dtm.init("frozen_model.pb"); + dpt.init(model, 0, "dipole_charge"); + dtm.init(model, 0, "dipole_charge"); + + sel_type = dpt.sel_types(); + sort(sel_type.begin(), sel_type.end()); + dpl_type.clear(); + for (int ii = 0; ii < sel_type.size(); ++ii){ + dpl_type.push_back(type_asso[sel_type[ii]]); + } + + pair_nnp = (PairNNP *) force->pair_match("deepmd",1); + if (!pair_nnp) { + error->all(FLERR,"pair_style deepmd should be set before this fix\n"); + } + + // set comm size needed by this fix + comm_reverse = 3; +} + +int FixDPLR::setmask() +{ + int mask = 0; + 
mask |= POST_INTEGRATE; + mask |= PRE_FORCE; + mask |= POST_FORCE; + return mask; +} + +void FixDPLR::init() +{ + // double **xx = atom->x; + // double **vv = atom->v; + // int nlocal = atom->nlocal; + // for (int ii = 0; ii < nlocal; ++ii){ + // cout << xx[ii][0] << " " + // << xx[ii][1] << " " + // << xx[ii][2] << " " + // << vv[ii][0] << " " + // << vv[ii][1] << " " + // << vv[ii][2] << " " + // << endl; + // } +} + +void FixDPLR::setup(int vflag) +{ + // if (strstr(update->integrate_style,"verlet")) + // post_force(vflag); + // else { + // error->all(FLERR, "respa is not supported by this fix"); + // } + if (vflag) { + v_setup(vflag); + } + else { + evflag = 0; + } +} + + +void +FixDPLR::get_valid_pairs(vector >& pairs) +{ + pairs.clear(); + + int nlocal = atom->nlocal; + int nghost = atom->nghost; + int nall = nlocal + nghost; + vector dtype (nall); + // get type + { + int *type = atom->type; + for (int ii = 0; ii < nall; ++ii){ + dtype[ii] = type[ii] - 1; + } + } + + int **bondlist = neighbor->bondlist; + int nbondlist = neighbor->nbondlist; + for (int ii = 0; ii < nbondlist; ++ii){ + int idx0=-1, idx1=-1; + int bd_type = bondlist[ii][2] - 1; + if ( ! binary_search(bond_type.begin(), bond_type.end(), bd_type) ){ + continue; + } + if (binary_search(sel_type.begin(), sel_type.end(), dtype[bondlist[ii][0]]) + && + binary_search(dpl_type.begin(), dpl_type.end(), dtype[bondlist[ii][1]]) + ){ + idx0 = bondlist[ii][0]; + idx1 = bondlist[ii][1]; + } + else if (binary_search(sel_type.begin(), sel_type.end(), dtype[bondlist[ii][1]]) + && + binary_search(dpl_type.begin(), dpl_type.end(), dtype[bondlist[ii][0]]) + ){ + idx0 = bondlist[ii][1]; + idx1 = bondlist[ii][0]; + } + else { + error->all(FLERR, "find a bonded pair the types of which are not associated"); + } + if ( ! 
(idx0 < nlocal && idx1 < nlocal) ){ + error->all(FLERR, "find a bonded pair that is not on the same processor, something should not happen"); + } + pairs.push_back(pair(idx0, idx1)); + } +} + +void FixDPLR::post_integrate() +{ + double **x = atom->x; + double **v = atom->v; + int *type = atom->type; + int nlocal = atom->nlocal; + int nghost = atom->nghost; + int nall = nlocal + nghost; + + vector > valid_pairs; + get_valid_pairs(valid_pairs); + + for (int ii = 0; ii < valid_pairs.size(); ++ii){ + int idx0 = valid_pairs[ii].first; + int idx1 = valid_pairs[ii].second; + for (int dd = 0; dd < 3; ++dd){ + x[idx1][dd] = x[idx0][dd] ; + v[idx1][dd] = v[idx0][dd] ; + // v[idx1][dd] = 0.0; + } + } +} + +void FixDPLR::pre_force(int vflag) +{ + double **x = atom->x; + int *type = atom->type; + int nlocal = atom->nlocal; + int nghost = atom->nghost; + int nall = nlocal + nghost; + + // if (eflag_atom) { + // error->all(FLERR,"atomic energy calculation is not supported by this fix\n"); + // } + + // declear inputs + vector dtype (nall); + vector dbox (9, 0) ; + vector dcoord (nall * 3, 0.); + // get type + for (int ii = 0; ii < nall; ++ii){ + dtype[ii] = type[ii] - 1; + } + // get box + dbox[0] = domain->h[0]; // xx + dbox[4] = domain->h[1]; // yy + dbox[8] = domain->h[2]; // zz + dbox[7] = domain->h[3]; // zy + dbox[6] = domain->h[4]; // zx + dbox[3] = domain->h[5]; // yx + // get coord + for (int ii = 0; ii < nall; ++ii){ + for (int dd = 0; dd < 3; ++dd){ + dcoord[ii*3+dd] = x[ii][dd] - domain->boxlo[dd]; + } + } + // get lammps nlist + NeighList * list = pair_nnp->list; + LammpsNeighborList lmp_list (list->inum, list->ilist, list->numneigh, list->firstneigh); + // declear output + vector tensor; + // compute + dpt.compute(tensor, dcoord, dtype, dbox, nghost, lmp_list); + // cout << "tensor of size " << tensor.size() << endl; + // cout << "nghost " << nghost << endl; + // cout << "nall " << dtype.size() << endl; + // cout << "nloc " << nlocal << endl; + // for (int ii = 0; 
ii < tensor.size(); ++ii){ + // if (ii%3 == 0){ + // cout << endl; + // } + // cout << tensor[ii] << "\t"; + // } + // cout << endl; + // for (int ii = 0; ii < nlocal * 3; ++ii){ + // if (ii%3 == 0){ + // cout << endl; + // } + // cout << dcoord[ii] << "\t"; + // } + // int max_type = 0; + // for (int ii = 0; ii < dtype.size(); ++ii){ + // if (dtype[ii] > max_type) { + // max_type = dtype[ii]; + // } + // } + + // selected type + vector dpl_type; + for (int ii = 0; ii < sel_type.size(); ++ii){ + dpl_type.push_back(type_asso[sel_type[ii]]); + } + vector sel_fwd, sel_bwd; + int sel_nghost; + select_by_type(sel_fwd, sel_bwd, sel_nghost, dcoord, dtype, nghost, sel_type); + int sel_nall = sel_bwd.size(); + int sel_nloc = sel_nall - sel_nghost; + vector sel_type(sel_bwd.size()); + select_map(sel_type, dtype, sel_fwd, 1); + + NNPAtomMap nnp_map(sel_type.begin(), sel_type.begin() + sel_nloc); + const vector & sort_fwd_map(nnp_map.get_fwd_map()); + + vector > valid_pairs; + get_valid_pairs(valid_pairs); + + int odim = dpt.output_dim(); + assert(odim == 3); + dipole_recd.resize(nall * 3); + fill(dipole_recd.begin(), dipole_recd.end(), 0.0); + for (int ii = 0; ii < valid_pairs.size(); ++ii){ + int idx0 = valid_pairs[ii].first; + int idx1 = valid_pairs[ii].second; + assert(idx0 < sel_fwd.size() && sel_fwd[idx0] < sort_fwd_map.size()); + int res_idx = sort_fwd_map[sel_fwd[idx0]]; + // int ret_idx = dpl_bwd[res_idx]; + for (int dd = 0; dd < 3; ++dd){ + x[idx1][dd] = x[idx0][dd] + tensor[res_idx * 3 + dd]; + // res_buff[idx1 * odim + dd] = tensor[res_idx * odim + dd]; + dipole_recd[idx0*3+dd] = tensor[res_idx * 3 + dd]; + } + } + // cout << "-------------------- fix/dplr: pre force " << endl; + // for (int ii = 0; ii < nlocal; ++ii){ + // cout << ii << " "; + // for (int dd = 0; dd < 3; ++dd){ + // cout << x[ii][dd] << " " ; + // } + // cout << endl; + // } +} + + +void FixDPLR::post_force(int vflag) +{ + if (vflag) { + v_setup(vflag); + } + else { + evflag = 0; + } + if 
(vflag_atom) { + error->all(FLERR,"atomic virial calculation is not supported by this fix\n"); + } + + PPPMDPLR * pppm_dplr = (PPPMDPLR*) force->kspace_match("pppm/dplr", 1); + if (!pppm_dplr) { + error->all(FLERR,"kspace_style pppm/dplr should be set before this fix\n"); + } + const vector & dfele_(pppm_dplr->get_fele()); + int nlocal = atom->nlocal; + int nghost = atom->nghost; + int nall = nlocal + nghost; + vector dcoord(nall*3, 0.0), dbox(9, 0.0), dfele(nlocal*3, 0.0); + vector dtype(nall, 0); + { + int *type = atom->type; + for (int ii = 0; ii < nall; ++ii){ + dtype[ii] = type[ii] - 1; + } + dbox[0] = domain->h[0]; // xx + dbox[4] = domain->h[1]; // yy + dbox[8] = domain->h[2]; // zz + dbox[7] = domain->h[3]; // zy + dbox[6] = domain->h[4]; // zx + dbox[3] = domain->h[5]; // yx + // get coord + double ** x = atom->x; + for (int ii = 0; ii < nall; ++ii){ + for (int dd = 0; dd < 3; ++dd){ + dcoord[ii*3+dd] = x[ii][dd] - domain->boxlo[dd]; + } + } + assert(dfele_.size() == nlocal * 3); + for (int ii = 0; ii < nlocal*3; ++ii){ + dfele[ii] = dfele_[ii]; + } + } + // lmp nlist + NeighList * list = pair_nnp->list; + LammpsNeighborList lmp_list (list->inum, list->ilist, list->numneigh, list->firstneigh); + // bonded pairs + vector > valid_pairs; + get_valid_pairs(valid_pairs); + // output vects + vector dfcorr, dvcorr; + // compute + dtm.compute(dfcorr, dvcorr, dcoord, dtype, dbox, valid_pairs, dfele, nghost, lmp_list); + assert(dfcorr.size() == dcoord.size()); + assert(dfcorr.size() == nall * 3); + // backward communication of fcorr + dfcorr_buff.resize(dfcorr.size()); + copy(dfcorr.begin(), dfcorr.end(), dfcorr_buff.begin()); + comm->reverse_comm_fix(this,3); + copy(dfcorr_buff.begin(), dfcorr_buff.end(), dfcorr.begin()); + // // check and print + // cout << "-------------------- fix/dplr: post force " << endl; + // cout << "dfcorr.size() " << dfcorr.size() << endl; + // cout << "dcoord.size() " << dcoord.size() << endl; + // for (int ii = 0; ii < nlocal; ++ii){ + 
// cout << ii << "\t x: "; + // for (int dd = 0; dd < 3; ++dd){ + // cout << dcoord[ii*3+dd] << " \t " ; + // } + // cout << ii << "\t f: "; + // for (int dd = 0; dd < 3; ++dd){ + // cout << dfcorr[ii*3+dd] << " \t " ; + // } + // cout << endl; + // } + // apply the force correction + double ** f = atom->f; + for (int ii = 0; ii < nlocal; ++ii){ + for(int dd = 0; dd < 3; ++dd){ + f[ii][dd] += dfcorr[ii*3+dd]; + } + } + // cout << "virial corr1 "; + // for (int ii = 0; ii < 9; ++ii){ + // cout << dvcorr[ii] << " " ; + // } + // cout << endl; + for (int ii = 0; ii < valid_pairs.size(); ++ii){ + int idx0 = valid_pairs[ii].first; + int idx1 = valid_pairs[ii].second; + for (int dd0 = 0; dd0 < 3; ++dd0){ + for (int dd1 = 0; dd1 < 3; ++dd1){ + dvcorr[dd0*3+dd1] -= dfele[idx1*3+dd0] * dipole_recd[idx0*3+dd1]; + } + } + } + // cout << "virial corr2 "; + // for (int ii = 0; ii < 9; ++ii){ + // cout << dvcorr[ii] << " " ; + // } + // cout << endl; + if (evflag){ + double vv[6] = {0.0}; + vv[0] += dvcorr[0]; + vv[1] += dvcorr[4]; + vv[2] += dvcorr[8]; + vv[3] += dvcorr[3]; + vv[4] += dvcorr[6]; + vv[5] += dvcorr[7]; + v_tally(0, vv); + } +} + + +int FixDPLR::pack_reverse_comm(int n, int first, double *buf) +{ + int m = 0; + int last = first + n; + for (int i = first; i < last; i++) { + buf[m++] = dfcorr_buff[3*i+0]; + buf[m++] = dfcorr_buff[3*i+1]; + buf[m++] = dfcorr_buff[3*i+2]; + } + return m; +} + +/* ---------------------------------------------------------------------- */ + +void FixDPLR::unpack_reverse_comm(int n, int *list, double *buf) +{ + int m = 0; + for (int i = 0; i < n; i++) { + int j = list[i]; + dfcorr_buff[3*j+0] += buf[m++]; + dfcorr_buff[3*j+1] += buf[m++]; + dfcorr_buff[3*j+2] += buf[m++]; + } +} + + + diff --git a/source/lmp/fix_dplr.h b/source/lmp/fix_dplr.h new file mode 100644 index 0000000000..edc9204874 --- /dev/null +++ b/source/lmp/fix_dplr.h @@ -0,0 +1,53 @@ +#ifdef FIX_CLASS + +FixStyle(dplr,FixDPLR) + +#else + +#ifndef LMP_FIX_DPLR_H +#define 
LMP_FIX_DPLR_H + +#include +#include "fix.h" +#include "pair_nnp.h" +#include "DeepTensor.h" +#include "DataModifier.h" + +#ifdef HIGH_PREC +#define FLOAT_PREC double +#else +#define FLOAT_PREC float +#endif + +namespace LAMMPS_NS { + class FixDPLR : public Fix { +public: + FixDPLR(class LAMMPS *, int, char **); + virtual ~FixDPLR() {}; + int setmask(); + void init(); + void setup(int); + void post_integrate(); + void pre_force(int); + void post_force(int); + int pack_reverse_comm(int, int, double *); + void unpack_reverse_comm(int, int *, double *); +private: + PairNNP * pair_nnp; + DeepTensor dpt; + DataModifier dtm; + string model; + int ntypes; + vector sel_type; + vector dpl_type; + vector bond_type; + map type_asso; + map bk_type_asso; + vector dipole_recd; + vector dfcorr_buff; + void get_valid_pairs(vector >& pairs); + }; +} + +#endif // LMP_FIX_DPLR_H +#endif // FIX_CLASS diff --git a/source/lmp/pair_nnp.cpp b/source/lmp/pair_nnp.cpp index 0b74dd38eb..ba3dcd3286 100644 --- a/source/lmp/pair_nnp.cpp +++ b/source/lmp/pair_nnp.cpp @@ -15,6 +15,7 @@ #include "neigh_request.h" #include "modify.h" #include "fix.h" +#include "citeme.h" #ifdef USE_TTM #include "fix_ttm_mod.h" #endif @@ -24,6 +25,22 @@ using namespace LAMMPS_NS; using namespace std; +static const char cite_user_deepmd_package[] = + "USER-DEEPMD package:\n\n" + "@article{Wang_ComputPhysCommun_2018_v228_p178,\n" + " author = {Wang, Han and Zhang, Linfeng and Han, Jiequn and E, Weinan},\n" + " doi = {10.1016/j.cpc.2018.03.016},\n" + " url = {https://doi.org/10.1016/j.cpc.2018.03.016},\n" + " year = 2018,\n" + " month = {jul},\n" + " publisher = {Elsevier {BV}},\n" + " volume = 228,\n" + " journal = {Comput. Phys. 
Commun.},\n" + " title = {{DeePMD-kit: A deep learning package for many-body potential energy representation and molecular dynamics}},\n" + " pages = {178--184}\n" + "}\n\n"; + + static int stringCmp(const void *a, const void* b) { char* m = (char*)a; @@ -185,6 +202,7 @@ PairNNP::PairNNP(LAMMPS *lmp) : Pair(lmp) { + if (lmp->citeme) lmp->citeme->add(cite_user_deepmd_package); if (strcmp(update->unit_style,"metal") != 0) { error->all(FLERR,"Pair deepmd requires metal unit, please set it by \"units metal\""); } diff --git a/source/lmp/pppm_dplr.cpp b/source/lmp/pppm_dplr.cpp new file mode 100644 index 0000000000..e5643e114f --- /dev/null +++ b/source/lmp/pppm_dplr.cpp @@ -0,0 +1,406 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include +#include "pppm_dplr.h" +#include "atom.h" +#include "domain.h" +#include "force.h" +#include "memory.h" +#include "error.h" +#include "math_const.h" +#include "pppm.h" +#include "gridcomm.h" + +using namespace LAMMPS_NS; +using namespace MathConst; + +enum{REVERSE_RHO}; +enum{FORWARD_IK,FORWARD_AD,FORWARD_IK_PERATOM,FORWARD_AD_PERATOM}; + +#define OFFSET 16384 + +#ifdef FFT_SINGLE +#define ZEROF 0.0f +#define ONEF 1.0f +#else +#define ZEROF 0.0 +#define ONEF 1.0 +#endif + +/* ---------------------------------------------------------------------- */ + +#ifdef OLD_LMP_PPPM +PPPMDPLR::PPPMDPLR(LAMMPS *lmp, int narg, char **arg) : + PPPM(lmp, narg, arg) +#else +PPPMDPLR::PPPMDPLR(LAMMPS *lmp) : + PPPM(lmp) +#endif +{ + triclinic_support = 1; +} + +/* ---------------------------------------------------------------------- */ + +void PPPMDPLR::init() +{ + // DPLR PPPM requires newton on, b/c it computes forces on ghost atoms + + if (force->newton == 0) + error->all(FLERR,"Kspace style pppm/dplr requires newton on"); + + PPPM::init(); + + int nlocal = atom->nlocal; + // cout << " ninit pppm/dplr ---------------------- " << nlocal << endl; + fele.resize(nlocal*3); + fill(fele.begin(), fele.end(), 0.0); +} + + +/* ---------------------------------------------------------------------- + compute the PPPM long-range force, energy, virial +------------------------------------------------------------------------- */ + +void PPPMDPLR::compute(int eflag, int vflag) +{ + int i,j; + + // set energy/virial flags + // invoke allocate_peratom() if needed for first time + + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = evflag_atom = eflag_global = vflag_global = + eflag_atom = vflag_atom = 0; + + if (evflag_atom && !peratom_allocate_flag) { + allocate_peratom(); + cg_peratom->ghost_notify(); + cg_peratom->setup(); + } + + // if atom count has changed, update qsum and qsqsum + + if 
(atom->natoms != natoms_original) { + qsum_qsq(); + natoms_original = atom->natoms; + } + + // return if there are no charges + + if (qsqsum == 0.0) return; + + // convert atoms from box to lamda coords + + if (triclinic == 0) boxlo = domain->boxlo; + else { + boxlo = domain->boxlo_lamda; + domain->x2lamda(atom->nlocal); + } + + // extend size of per-atom arrays if necessary + + if (atom->nmax > nmax) { + memory->destroy(part2grid); + nmax = atom->nmax; + memory->create(part2grid,nmax,3,"pppm:part2grid"); + } + + // find grid points for all my particles + // map my particle charge onto my local 3d density grid + + particle_map(); + make_rho(); + + // all procs communicate density values from their ghost cells + // to fully sum contribution in their 3d bricks + // remap from 3d decomposition to FFT decomposition + + cg->reverse_comm(this,REVERSE_RHO); + brick2fft(); + + // compute potential gradient on my FFT grid and + // portion of e_long on this proc's FFT grid + // return gradients (electric fields) in 3d brick decomposition + // also performs per-atom calculations via poisson_peratom() + + poisson(); + + // all procs communicate E-field values + // to fill ghost cells surrounding their 3d bricks + + if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD); + else cg->forward_comm(this,FORWARD_IK); + + // extra per-atom energy/virial communication + + if (evflag_atom) { + if (differentiation_flag == 1 && vflag_atom) + cg_peratom->forward_comm(this,FORWARD_AD_PERATOM); + else if (differentiation_flag == 0) + cg_peratom->forward_comm(this,FORWARD_IK_PERATOM); + } + + // calculate the force on my particles + + fieldforce(); + + // extra per-atom energy/virial communication + + if (evflag_atom) fieldforce_peratom(); + + // sum global energy across procs and add in volume-dependent term + + const double qscale = qqrd2e * scale; + + if (eflag_global) { + double energy_all; + MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world); + energy = energy_all; 
+ + energy *= 0.5*volume; + // do not add self-term, for neutral systems qsum == 0 + // energy -= g_ewald*qsqsum/MY_PIS + + // MY_PI2*qsum*qsum / (g_ewald*g_ewald*volume); + energy *= qscale; + } + + // sum global virial across procs + + if (vflag_global) { + double virial_all[6]; + MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world); + for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i]; + } + // std::cout<< "energy in pppm -------------------" << std::endl; + // std::cout << energy << " " + // << std::endl; + // std::cout<< "virial in pppm -------------------" << std::endl; + // for (int ii = 0; ii < 6; ++ii){ + // std::cout << virial[ii] << " " ; + // } + // std::cout << std::endl; + + // per-atom energy/virial + // energy includes self-energy correction + // ntotal accounts for TIP4P tallying eatom/vatom for ghost atoms + + if (evflag_atom) { + double *q = atom->q; + int nlocal = atom->nlocal; + int ntotal = nlocal; + if (tip4pflag) ntotal += atom->nghost; + + if (eflag_atom) { + for (i = 0; i < nlocal; i++) { + eatom[i] *= 0.5; + eatom[i] -= g_ewald*q[i]*q[i]/MY_PIS + MY_PI2*q[i]*qsum / + (g_ewald*g_ewald*volume); + eatom[i] *= qscale; + } + for (i = nlocal; i < ntotal; i++) eatom[i] *= 0.5*qscale; + } + + if (vflag_atom) { + for (i = 0; i < ntotal; i++) + for (j = 0; j < 6; j++) vatom[i][j] *= 0.5*qscale; + } + } + + // 2d slab correction + + if (slabflag == 1) slabcorr(); + + // convert atoms back from lamda to box coords + + if (triclinic) domain->lamda2x(atom->nlocal); +} + + +/* ---------------------------------------------------------------------- + interpolate from grid to get electric field & force on my particles for ik +------------------------------------------------------------------------- */ +void PPPMDPLR::fieldforce_ik() +{ + int i,l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + FFT_SCALAR ekx,eky,ekz; + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global 
coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + + double *q = atom->q; + double **x = atom->x; + // double **f = atom->f; + + int nlocal = atom->nlocal; + int nghost = atom->nghost; + int nall = nlocal + nghost; + + fele.resize(nlocal*3); + fill(fele.begin(), fele.end(), 0.0); + + for (i = 0; i < nlocal; i++) { + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + + compute_rho1d(dx,dy,dz); + + ekx = eky = ekz = ZEROF; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + z0 = rho1d[2][n]; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + y0 = z0*rho1d[1][m]; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + x0 = y0*rho1d[0][l]; + ekx -= x0*vdx_brick[mz][my][mx]; + eky -= x0*vdy_brick[mz][my][mx]; + ekz -= x0*vdz_brick[mz][my][mx]; + } + } + } + + // convert E-field to force + + const double qfactor = qqrd2e * scale * q[i]; + fele[i*3+0] += qfactor*ekx; + fele[i*3+1] += qfactor*eky; + if (slabflag != 2) fele[i*3+2] += qfactor*ekz; + } + + // vector dcoord(nall*3), dbox(9); + // vector dtype(nall); + // { + // double ** xx = atom->x; + // for(int ii = 0; ii < nall; ++ii){ + // for (int dd = 0; dd < 3; +=dd){ + // dcoord[ii*3+dd] = xx[ii][dd]; + // } + // } + // int *type = atom->type; + // for (int ii = 0; ii < nall; ++ii){ + // dtype[ii] = type[ii] - 1; + // } + // } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get electric field & force on my particles for ad +------------------------------------------------------------------------- */ + +void PPPMDPLR::fieldforce_ad() +{ + int i,l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz; + FFT_SCALAR ekx,eky,ekz; + double s1,s2,s3; + double sf = 
0.0; + double *prd; + + prd = domain->prd; + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + + double hx_inv = nx_pppm/xprd; + double hy_inv = ny_pppm/yprd; + double hz_inv = nz_pppm/zprd; + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + + double *q = atom->q; + double **x = atom->x; + double **f = atom->f; + + int nlocal = atom->nlocal; + + fele.resize(nlocal*3); fill(fele.begin(), fele.end(), 0.0); + + for (i = 0; i < nlocal; i++) { + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + + compute_rho1d(dx,dy,dz); + compute_drho1d(dx,dy,dz); + + ekx = eky = ekz = ZEROF; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + ekx += drho1d[0][l]*rho1d[1][m]*rho1d[2][n]*u_brick[mz][my][mx]; + eky += rho1d[0][l]*drho1d[1][m]*rho1d[2][n]*u_brick[mz][my][mx]; + ekz += rho1d[0][l]*rho1d[1][m]*drho1d[2][n]*u_brick[mz][my][mx]; + } + } + } + ekx *= hx_inv; + eky *= hy_inv; + ekz *= hz_inv; + + // convert E-field to force and subtract self forces + + const double qfactor = qqrd2e * scale; + + s1 = x[i][0]*hx_inv; + s2 = x[i][1]*hy_inv; + s3 = x[i][2]*hz_inv; + sf = sf_coeff[0]*sin(2*MY_PI*s1); + sf += sf_coeff[1]*sin(4*MY_PI*s1); + sf *= 2*q[i]*q[i]; + fele[i*3+0] += qfactor*(ekx*q[i] - sf); + + sf = sf_coeff[2]*sin(2*MY_PI*s2); + sf += sf_coeff[3]*sin(4*MY_PI*s2); + sf *= 2*q[i]*q[i]; + fele[i*3+1] += qfactor*(eky*q[i] - sf); + + + sf = sf_coeff[4]*sin(2*MY_PI*s3); + sf += sf_coeff[5]*sin(4*MY_PI*s3); + sf *= 2*q[i]*q[i]; + if (slabflag != 2) fele[i*3+2] += 
qfactor*(ekz*q[i] - sf); + } + + // for (int ii = 0; ii < nlocal; ++ii){ + // cout << ii << "\t "; + // for (int dd = 0; dd < 3; ++dd){ + // cout << fele[ii*3+dd] << " " ; + // } + // cout << endl; + // } +} + + + diff --git a/source/lmp/pppm_dplr.h b/source/lmp/pppm_dplr.h new file mode 100644 index 0000000000..17680c01cd --- /dev/null +++ b/source/lmp/pppm_dplr.h @@ -0,0 +1,58 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef KSPACE_CLASS + +KSpaceStyle(pppm/dplr,PPPMDPLR) + +#else + +#ifndef LMP_PPPM_DPLR_H +#define LMP_PPPM_DPLR_H + +#ifdef HIGH_PREC +#define FLOAT_PREC double +#else +#define FLOAT_PREC float +#endif + +#include "pppm.h" +#include +#include +using namespace std; + +namespace LAMMPS_NS { + + class PPPMDPLR : public PPPM { +public: +#ifdef OLD_LMP_PPPM + PPPMDPLR(class LAMMPS *, int, char **); +#else + PPPMDPLR(class LAMMPS *); +#endif + virtual ~PPPMDPLR () {}; + void init(); + const vector & get_fele() const {return fele;}; +protected: + virtual void compute(int, int); + virtual void fieldforce_ik(); + virtual void fieldforce_ad(); +private: + vector fele; + }; + +} + +#endif +#endif + diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index 6fdc02cfd6..89416056e1 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -3,7 +3,7 @@ set(OP_LIB ${PROJECT_SOURCE_DIR}/lib/src/SimulationRegion.cpp ${PROJECT_SOURCE_DIR}/lib/src/NeighborList.cpp) 
set (OP_CXX_FLAG -D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI} ) -file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ) +file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc) file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_gpu.cc descrpt_se_r_gpu.cc tab_inter.cc prod_force_se_a_gpu.cc prod_virial_se_a_gpu.cc prod_force_se_r_gpu.cc prod_virial_se_r_gpu.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ) file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_se_a_grad.cc prod_force_se_r_grad.cc prod_virial_grad.cc prod_virial_se_a_grad.cc prod_virial_se_r_grad.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) file(GLOB OP_PY *.py) diff --git a/source/op/_prod_force_grad.py b/source/op/_prod_force_grad.py index 9d16682b53..ddd20d9a5b 100644 --- a/source/op/_prod_force_grad.py +++ b/source/op/_prod_force_grad.py @@ -3,23 +3,8 @@ Gradients for prod force. 
""" -import os -import platform -import tensorflow as tf from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import sparse_ops - -if platform.system() == "Windows": - ext = "dll" -elif platform.system() == "Darwin": - ext = "dylib" -else: - ext = "so" -module_path = os.path.dirname(os.path.realpath(__file__)) -module_file = os.path.join(module_path, 'libop_grads.{}'.format(ext)) -assert (os.path.isfile(module_file)), 'module op_grads does not exist' -op_grads_module = tf.load_op_library(module_file) +from deepmd.env import op_grads_module @ops.RegisterGradient("ProdForce") def _prod_force_grad_cc (op, grad): diff --git a/source/op/_prod_force_se_a_grad.py b/source/op/_prod_force_se_a_grad.py index 566f304483..8f69ef5139 100644 --- a/source/op/_prod_force_se_a_grad.py +++ b/source/op/_prod_force_se_a_grad.py @@ -3,23 +3,8 @@ Gradients for prod force. """ -import os -import platform -import tensorflow as tf from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import sparse_ops - -if platform.system() == "Windows": - ext = "dll" -elif platform.system() == "Darwin": - ext = "dylib" -else: - ext = "so" -module_path = os.path.dirname(os.path.realpath(__file__)) -module_file = os.path.join(module_path, 'libop_grads.{}'.format(ext)) -assert (os.path.isfile(module_file)), 'module op_grads does not exist' -op_grads_module = tf.load_op_library(module_file) +from deepmd.env import op_grads_module @ops.RegisterGradient("ProdForceSeA") def _prod_force_se_a_grad_cc (op, grad): diff --git a/source/op/_prod_force_se_r_grad.py b/source/op/_prod_force_se_r_grad.py index 71b2c05306..721ab927da 100644 --- a/source/op/_prod_force_se_r_grad.py +++ b/source/op/_prod_force_se_r_grad.py @@ -3,23 +3,8 @@ Gradients for prod force. 
""" -import os -import platform -import tensorflow as tf from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import sparse_ops - -if platform.system() == "Windows": - ext = "dll" -elif platform.system() == "Darwin": - ext = "dylib" -else: - ext = "so" -module_path = os.path.dirname(os.path.realpath(__file__)) -module_file = os.path.join(module_path, 'libop_grads.{}'.format(ext)) -assert (os.path.isfile(module_file)), 'module op_grads does not exist' -op_grads_module = tf.load_op_library(module_file) +from deepmd.env import op_grads_module @ops.RegisterGradient("ProdForceSeR") def _prod_force_se_a_grad_cc (op, grad): diff --git a/source/op/_prod_virial_grad.py b/source/op/_prod_virial_grad.py index 5d6ae58aea..8ed49200ed 100644 --- a/source/op/_prod_virial_grad.py +++ b/source/op/_prod_virial_grad.py @@ -3,23 +3,8 @@ Gradients for prod virial. """ -import os -import platform -import tensorflow as tf from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import sparse_ops - -if platform.system() == "Windows": - ext = "dll" -elif platform.system() == "Darwin": - ext = "dylib" -else: - ext = "so" -module_path = os.path.dirname(os.path.realpath(__file__)) -module_file = os.path.join(module_path, 'libop_grads.{}'.format(ext)) -assert (os.path.isfile(module_file)), 'module op_grads does not exist' -op_grads_module = tf.load_op_library(module_file) +from deepmd.env import op_grads_module @ops.RegisterGradient("ProdVirial") def _prod_virial_grad_cc (op, grad, grad_atom): diff --git a/source/op/_prod_virial_se_a_grad.py b/source/op/_prod_virial_se_a_grad.py index 5c28a06c2d..ea19a3ef14 100644 --- a/source/op/_prod_virial_se_a_grad.py +++ b/source/op/_prod_virial_se_a_grad.py @@ -3,23 +3,8 @@ Gradients for prod virial. 
""" -import os -import platform -import tensorflow as tf from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import sparse_ops - -if platform.system() == "Windows": - ext = "dll" -elif platform.system() == "Darwin": - ext = "dylib" -else: - ext = "so" -module_path = os.path.dirname(os.path.realpath(__file__)) -module_file = os.path.join(module_path, 'libop_grads.{}'.format(ext)) -assert (os.path.isfile(module_file)), 'module op_grads does not exist' -op_grads_module = tf.load_op_library(module_file) +from deepmd.env import op_grads_module @ops.RegisterGradient("ProdVirialSeA") def _prod_virial_se_a_grad_cc (op, grad, grad_atom): diff --git a/source/op/_prod_virial_se_r_grad.py b/source/op/_prod_virial_se_r_grad.py index 7ac054a750..367f2c90c3 100644 --- a/source/op/_prod_virial_se_r_grad.py +++ b/source/op/_prod_virial_se_r_grad.py @@ -3,23 +3,8 @@ Gradients for prod virial. """ -import os -import platform -import tensorflow as tf from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import sparse_ops - -if platform.system() == "Windows": - ext = "dll" -elif platform.system() == "Darwin": - ext = "dylib" -else: - ext = "so" -module_path = os.path.dirname(os.path.realpath(__file__)) -module_file = os.path.join(module_path, 'libop_grads.{}'.format(ext)) -assert (os.path.isfile(module_file)), 'module op_grads does not exist' -op_grads_module = tf.load_op_library(module_file) +from deepmd.env import op_grads_module @ops.RegisterGradient("ProdVirialSeR") def _prod_virial_se_a_grad_cc (op, grad, grad_atom): diff --git a/source/op/_soft_min_force_grad.py b/source/op/_soft_min_force_grad.py index 13d56472ad..be3d2c29d5 100644 --- a/source/op/_soft_min_force_grad.py +++ b/source/op/_soft_min_force_grad.py @@ -3,23 +3,9 @@ Gradients for soft min force """ -import os -import platform -import tensorflow as tf from tensorflow.python.framework import 
ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import sparse_ops +from deepmd.env import op_grads_module -if platform.system() == "Windows": - ext = "dll" -elif platform.system() == "Darwin": - ext = "dylib" -else: - ext = "so" -module_path = os.path.dirname(os.path.realpath(__file__)) -module_file = os.path.join(module_path, 'libop_grads.{}'.format(ext)) -assert (os.path.isfile(module_file)), 'module op_grads does not exist' -op_grads_module = tf.load_op_library(module_file) @ops.RegisterGradient("SoftMinForce") def _soft_min_force_grad_cc (op, grad): diff --git a/source/op/_soft_min_virial_grad.py b/source/op/_soft_min_virial_grad.py index 91149e6f9e..6c6d980aa2 100644 --- a/source/op/_soft_min_virial_grad.py +++ b/source/op/_soft_min_virial_grad.py @@ -3,23 +3,9 @@ Gradients for soft min virial. """ -import os -import platform -import tensorflow as tf from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import sparse_ops +from deepmd.env import op_grads_module -if platform.system() == "Windows": - ext = "dll" -elif platform.system() == "Darwin": - ext = "dylib" -else: - ext = "so" -module_path = os.path.dirname(os.path.realpath(__file__)) -module_file = os.path.join(module_path, 'libop_grads.{}'.format(ext)) -assert (os.path.isfile(module_file)), 'module op_grads does not exist' -op_grads_module = tf.load_op_library(module_file) @ops.RegisterGradient("SoftMinVirial") def _soft_min_virial_grad_cc (op, grad, grad_atom): diff --git a/source/op/cuda/CMakeLists.txt b/source/op/cuda/CMakeLists.txt index a9847abd3e..25f796500b 100644 --- a/source/op/cuda/CMakeLists.txt +++ b/source/op/cuda/CMakeLists.txt @@ -14,14 +14,70 @@ SET(CMAKE_CXX_STANDARD 11) SET(CMAKE_CUDA_STANDARD 11) # nvcc -o libdeepmd_op_cuda.so -I/usr/local/cub-1.8.0 -rdc=true -DHIGH_PREC=true -gencode arch=compute_61,code=sm_61 -shared -Xcompiler -fPIC deepmd_op.cu -L/usr/local/cuda/lib64 -lcudadevrt # very 
important here! Include path to cub. -include_directories(cub) -# nvcc flags -set(CUDA_NVCC_FLAGS -gencode arch=compute_60,code=sm_60; # Pascal – GP100/Tesla P100 – DGX-1 (Generic Pascal) - -gencode arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX 1050, GTX 1030, Titan Xp, Tesla P40, Tesla P4, Discrete GPU on the NVIDIA Drive PX2 - -gencode arch=compute_70,code=sm_70; # Volta - GV100/Tesla V100, GTX 1180 (GV104) - -gencode arch=compute_75,code=sm_75; # Turing - RTX 2080, Titan RTX, Quadro R8000 - -O3; -Xcompiler -fPIC; - ) +# for searching device compute capability, https://developer.nvidia.com/cuda-gpus +include_directories(cub) + +message(STATUS "CUDA major version is " ${CUDA_VERSION_MAJOR}) + +if (${CUDA_VERSION_MAJOR} GREATER "10") + # nvcc flags + set(CUDA_NVCC_FLAGS -gencode arch=compute_50,code=sm_50; + -gencode arch=compute_52,code=sm_52; # Tesla M40, Tesla M40, Quadro M6000... + -gencode arch=compute_53,code=sm_53; + -gencode arch=compute_60,code=sm_60; # Pascal – GP100/Tesla P100 – DGX-1 (Generic Pascal) + -gencode arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX 1050, GTX 1030, Titan Xp, Tesla P40, Tesla P4, Discrete GPU on the NVIDIA Drive PX2 + -gencode arch=compute_70,code=sm_70; # Volta - GV100/Tesla V100, GTX 1180 (GV104) + -gencode arch=compute_75,code=sm_75; # Turing - RTX 2080, Titan RTX, Quadro R8000 + -O3; -Xcompiler -fPIC; + ) +elseif (${CUDA_VERSION_MAJOR} STREQUAL "10") + set(CUDA_NVCC_FLAGS -gencode arch=compute_30,code=sm_30; # Tesla K10, Quadro K600 K420 K410, + -gencode arch=compute_35,code=sm_35; # Tesla K20 K40, TITAN Z Black, GTX 780Ti 780 + -gencode arch=compute_37,code=sm_37; # Tesla K80 + -gencode arch=compute_50,code=sm_50; # Quadro 620 1200 + -gencode arch=compute_52,code=sm_52; # Tesla M40 M40, Quadro M6000 M5000 M4000 M2000, TITAN X, GTX 980Ti 980 970 960 950 + -gencode arch=compute_53,code=sm_53; # Jetson TX1, Tegra X1 + -gencode arch=compute_60,code=sm_60; # Pascal – 
GP100/Tesla P100 – DGX-1 (Generic Pascal) + -gencode arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX 1050, GTX 1030, Titan Xp, Tesla P40, Tesla P4, Discrete GPU on the NVIDIA Drive PX2 + -gencode arch=compute_70,code=sm_70; # Volta - GV100/Tesla V100, GTX 1180 (GV104) + -gencode arch=compute_75,code=sm_75; # Turing - RTX 2080, Titan RTX, Quadro R8000 + -O3; -Xcompiler -fPIC; + ) +elseif (${CUDA_VERSION_MAJOR} STREQUAL "9") + set(CUDA_NVCC_FLAGS -gencode arch=compute_30,code=sm_30; + -gencode arch=compute_35,code=sm_35; + -gencode arch=compute_37,code=sm_37; + -gencode arch=compute_50,code=sm_50; + -gencode arch=compute_52,code=sm_52; # Tesla M40, Tesla M40, Quadro M6000... + -gencode arch=compute_53,code=sm_53; + -gencode arch=compute_60,code=sm_60; # Pascal – GP100/Tesla P100 – DGX-1 (Generic Pascal) + -gencode arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX 1050, GTX 1030, Titan Xp, Tesla P40, Tesla P4, Discrete GPU on the NVIDIA Drive PX2 + -gencode arch=compute_70,code=sm_70; # Volta - GV100/Tesla V100, GTX 1180 (GV104) + -O3; -Xcompiler -fPIC; + ) +elseif (${CUDA_VERSION_MAJOR} STREQUAL "8") + set(CUDA_NVCC_FLAGS -gencode arch=compute_30,code=sm_30; + -gencode arch=compute_35,code=sm_35; + -gencode arch=compute_37,code=sm_37; + -gencode arch=compute_50,code=sm_50; + -gencode arch=compute_52,code=sm_52; # Tesla M40, Tesla M40, Quadro M6000... 
+ -gencode arch=compute_53,code=sm_53; + -gencode arch=compute_60,code=sm_60; # Pascal – GP100/Tesla P100 – DGX-1 (Generic Pascal) + -gencode arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX 1050, GTX 1030, Titan Xp, Tesla P40, Tesla P4, Discrete GPU on the NVIDIA Drive PX2 + -O3; -Xcompiler -fPIC; + ) +elseif (${CUDA_VERSION_MAJOR} STREQUAL "7") + set(CUDA_NVCC_FLAGS -gencode arch=compute_30,code=sm_30; + -gencode arch=compute_35,code=sm_35; + -gencode arch=compute_37,code=sm_37; + -gencode arch=compute_50,code=sm_50; + -gencode arch=compute_52,code=sm_52; # Tesla M40, Tesla M40, Quadro M6000... + -gencode arch=compute_53,code=sm_53; + -O3; -Xcompiler -fPIC; + ) +else () + message(FATAL_ERROR "unsupported CUDA_VERSION " ${CUDA_VERSION} ", please use a newer version (>=7.0) of CUDA toolkit!") +endif() set (SOURCE_FILES descrpt_se_a.cu descrpt_se_r.cu prod_force_se_a.cu prod_force_se_r.cu prod_virial_se_a.cu prod_virial_se_r.cu diff --git a/source/op/cuda/descrpt_se_a.cu b/source/op/cuda/descrpt_se_a.cu index 8b6b3ee575..4b309522fa 100644 --- a/source/op/cuda/descrpt_se_a.cu +++ b/source/op/cuda/descrpt_se_a.cu @@ -18,7 +18,7 @@ limitations under the License. 
#include #include -#define MAGIC_NUMBER 256 +#define MAGIC_NUMBER 1024 #ifdef HIGH_PREC typedef double VALUETYPE; @@ -40,20 +40,6 @@ inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort= } } -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 -static __inline__ __device__ double atomicAdd(double* address, double val) { - unsigned long long int* address_as_ull = (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + __longlong_as_double(assumed))); - // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); - } while (assumed != old); - return __longlong_as_double(old); -} -#endif - template < typename Key, int BLOCK_THREADS, @@ -340,7 +326,7 @@ void DescrptSeALauncher(const VALUETYPE* coord, i_idx ); const int ITEMS_PER_THREAD = 4; - const int BLOCK_THREADS = 64; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; // BlockSortKernel<<>> ( BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); diff --git a/source/op/cuda/descrpt_se_r.cu b/source/op/cuda/descrpt_se_r.cu index 2a4a126166..fa9678be34 100644 --- a/source/op/cuda/descrpt_se_r.cu +++ b/source/op/cuda/descrpt_se_r.cu @@ -41,20 +41,6 @@ inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort= } } -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 -static __inline__ __device__ double atomicAdd(double* address, double val) { - unsigned long long int* address_as_ull = (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + __longlong_as_double(assumed))); - // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); - } while (assumed != old); - return __longlong_as_double(old); -} -#endif - template 
< typename Key, int BLOCK_THREADS, diff --git a/source/op/cuda/prod_virial_se_a.cu b/source/op/cuda/prod_virial_se_a.cu index f93e14bcec..241e2b7e06 100644 --- a/source/op/cuda/prod_virial_se_a.cu +++ b/source/op/cuda/prod_virial_se_a.cu @@ -45,6 +45,9 @@ __global__ void deriv_wrt_neighbors_se_a(VALUETYPE * virial, { // idx -> nloc // idy -> nnei + // idz = dd0 * 3 + dd1 + // dd0 = idz / 3 + // dd1 = idz % 3 const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int idy = blockIdx.y; const unsigned int idz = threadIdx.y; @@ -58,7 +61,7 @@ __global__ void deriv_wrt_neighbors_se_a(VALUETYPE * virial, return; } // atomicAdd(virial + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % 3]); - atomicAdd(atom_virial + j_idx * 9 + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % 3]); + atomicAdd(atom_virial + j_idx * 9 + idz, net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3]); } void ProdVirialSeALauncher(VALUETYPE * virial, diff --git a/source/op/cuda/prod_virial_se_r.cu b/source/op/cuda/prod_virial_se_r.cu index d5b6aad3cc..a2c02007fc 100644 --- a/source/op/cuda/prod_virial_se_r.cu +++ b/source/op/cuda/prod_virial_se_r.cu @@ -47,6 +47,9 @@ __global__ void deriv_wrt_neighbors_se_r(VALUETYPE * virial, { // idx -> nloc // idy -> nnei + // idz = dd0 * 3 + dd1 + // dd0 = idz / 3 + // dd1 = idz % 3 const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int idy = blockIdx.y; const unsigned int idz = threadIdx.y; @@ -58,7 +61,7 @@ __global__ void deriv_wrt_neighbors_se_r(VALUETYPE * virial, if (j_idx < 0) { return; } - atomicAdd(atom_virial + j_idx * 9 + idz, net_deriv[idx * ndescrpt + idy] * rij[idx * nnei * 3 + idy * 3 + idz / 3] * 
in_deriv[idx * ndescrpt * 3 + idy * 3 + idz % 3]); + atomicAdd(atom_virial + j_idx * 9 + idz, net_deriv[idx * ndescrpt + idy] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz / 3]); } void ProdVirialSeRLauncher(VALUETYPE * virial, diff --git a/source/op/descrpt.cc b/source/op/descrpt.cc index 4e55049400..75c7640b2b 100644 --- a/source/op/descrpt.cc +++ b/source/op/descrpt.cc @@ -120,15 +120,34 @@ class DescrptOp : public OpKernel { int nei_mode = 0; if (mesh_tensor.shape().dim_size(0) == 16) { + // lammps neighbor list nei_mode = 3; } else if (mesh_tensor.shape().dim_size(0) == 12) { + // user provided extended mesh nei_mode = 2; } else if (mesh_tensor.shape().dim_size(0) == 6) { + // manual copied pbc assert (nloc == nall); nei_mode = 1; } + else if (mesh_tensor.shape().dim_size(0) == 0) { + // no pbc + nei_mode = -1; + } + else { + throw runtime_error("invalid mesh tensor"); + } + bool b_pbc = true; + // if region is given extended, do not use pbc + if (nei_mode >= 1 || nei_mode == -1) { + b_pbc = false; + } + bool b_norm_atom = false; + if (nei_mode == 1){ + b_norm_atom = true; + } // Create an output tensor TensorShape descrpt_shape ; @@ -200,7 +219,7 @@ class DescrptOp : public OpKernel { for (int dd = 0; dd < 3; ++dd){ d_coord3[ii*3+dd] = coord(kk, ii*3+dd); } - if (nei_mode <= 1){ + if (b_norm_atom){ compute_t inter[3]; region.phys2Inter (inter, &d_coord3[3*ii]); for (int dd = 0; dd < 3; ++dd){ @@ -263,14 +282,11 @@ class DescrptOp : public OpKernel { } ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, nloc, rcut_a, rcut_r, nat_stt, ncell, ext_stt, ext_end, region, ncell); } - else { - build_nlist (d_nlist_a, d_nlist_r, rcut_a, rcut_r, d_coord3, region); + else if (nei_mode == -1){ + ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, rcut_a, rcut_r, NULL); } - - bool b_pbc = true; - // if region is given extended, do not use pbc - if (nei_mode >= 1) { - b_pbc = false; + else { + throw runtime_error("unknow neighbor mode"); } 
// loop over atoms, compute descriptors for each atom @@ -399,48 +415,6 @@ class DescrptOp : public OpKernel { } } void - build_nlist (vector > & nlist0, - vector > & nlist1, - const compute_t & rc0_, - const compute_t & rc1_, - const vector & posi3, - const SimulationRegion & region) const { - compute_t rc0 (rc0_); - compute_t rc1 (rc1_); - assert (rc0 <= rc1); - compute_t rc02 = rc0 * rc0; - // negative rc0 means not applying rc0 - if (rc0 < 0) rc02 = 0; - compute_t rc12 = rc1 * rc1; - - unsigned natoms = posi3.size()/3; - nlist0.clear(); - nlist1.clear(); - nlist0.resize(natoms); - nlist1.resize(natoms); - for (unsigned ii = 0; ii < natoms; ++ii){ - nlist0[ii].reserve (60); - nlist1[ii].reserve (60); - } - for (unsigned ii = 0; ii < natoms; ++ii){ - for (unsigned jj = ii+1; jj < natoms; ++jj){ - compute_t diff[3]; - region.diffNearestNeighbor (posi3[jj*3+0], posi3[jj*3+1], posi3[jj*3+2], - posi3[ii*3+0], posi3[ii*3+1], posi3[ii*3+2], - diff[0], diff[1], diff[2]); - compute_t r2 = MathUtilities::dot (diff, diff); - if (r2 < rc02) { - nlist0[ii].push_back (jj); - nlist0[jj].push_back (ii); - } - else if (r2 < rc12) { - nlist1[ii].push_back (jj); - nlist1[jj].push_back (ii); - } - } - } - } - void make_axis (vector & axis_type, vector & axis_idx, const int & type, diff --git a/source/op/descrpt_se_a.cc b/source/op/descrpt_se_a.cc index 1764191b5a..b970f6dbf0 100644 --- a/source/op/descrpt_se_a.cc +++ b/source/op/descrpt_se_a.cc @@ -119,15 +119,34 @@ class DescrptSeAOp : public OpKernel { int nei_mode = 0; if (mesh_tensor.shape().dim_size(0) == 16) { + // lammps neighbor list nei_mode = 3; } else if (mesh_tensor.shape().dim_size(0) == 12) { + // user provided extended mesh nei_mode = 2; } else if (mesh_tensor.shape().dim_size(0) == 6) { + // manual copied pbc assert (nloc == nall); nei_mode = 1; } + else if (mesh_tensor.shape().dim_size(0) == 0) { + // no pbc + nei_mode = -1; + } + else { + throw runtime_error("invalid mesh tensor"); + } + bool b_pbc = true; + // if 
region is given extended, do not use pbc + if (nei_mode >= 1 || nei_mode == -1) { + b_pbc = false; + } + bool b_norm_atom = false; + if (nei_mode == 1){ + b_norm_atom = true; + } // Create an output tensor TensorShape descrpt_shape ; @@ -196,7 +215,7 @@ class DescrptSeAOp : public OpKernel { for (int dd = 0; dd < 3; ++dd){ d_coord3[ii*3+dd] = coord(kk, ii*3+dd); } - if (nei_mode <= 1){ + if (b_norm_atom){ compute_t inter[3]; region.phys2Inter (inter, &d_coord3[3*ii]); for (int dd = 0; dd < 3; ++dd){ @@ -259,14 +278,11 @@ class DescrptSeAOp : public OpKernel { } ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, nloc, rcut_a, rcut_r, nat_stt, ncell, ext_stt, ext_end, region, ncell); } - else { - build_nlist (d_nlist_a, d_nlist_r, rcut_a, rcut_r, d_coord3, region); + else if (nei_mode == -1){ + ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, rcut_a, rcut_r, NULL); } - - bool b_pbc = true; - // if region is given extended, do not use pbc - if (nei_mode >= 1) { - b_pbc = false; + else { + throw runtime_error("unknow neighbor mode"); } // loop over atoms, compute descriptors for each atom @@ -351,48 +367,6 @@ class DescrptSeAOp : public OpKernel { sec[ii] = sec[ii-1] + n_sel[ii-1]; } } - void - build_nlist (vector > & nlist0, - vector > & nlist1, - const compute_t & rc0_, - const compute_t & rc1_, - const vector & posi3, - const SimulationRegion & region) const { - compute_t rc0 (rc0_); - compute_t rc1 (rc1_); - assert (rc0 <= rc1); - compute_t rc02 = rc0 * rc0; - // negative rc0 means not applying rc0 - if (rc0 < 0) rc02 = 0; - compute_t rc12 = rc1 * rc1; - - unsigned natoms = posi3.size()/3; - nlist0.clear(); - nlist1.clear(); - nlist0.resize(natoms); - nlist1.resize(natoms); - for (unsigned ii = 0; ii < natoms; ++ii){ - nlist0[ii].reserve (60); - nlist1[ii].reserve (60); - } - for (unsigned ii = 0; ii < natoms; ++ii){ - for (unsigned jj = ii+1; jj < natoms; ++jj){ - compute_t diff[3]; - region.diffNearestNeighbor (posi3[jj*3+0], posi3[jj*3+1], posi3[jj*3+2], - 
posi3[ii*3+0], posi3[ii*3+1], posi3[ii*3+2], - diff[0], diff[1], diff[2]); - compute_t r2 = MathUtilities::dot (diff, diff); - if (r2 < rc02) { - nlist0[ii].push_back (jj); - nlist0[jj].push_back (ii); - } - else if (r2 < rc12) { - nlist1[ii].push_back (jj); - nlist1[jj].push_back (ii); - } - } - } - } }; REGISTER_KERNEL_BUILDER(Name("DescrptSeA").Device(DEVICE_CPU), DescrptSeAOp); diff --git a/source/op/descrpt_se_a_gpu.cc b/source/op/descrpt_se_a_gpu.cc index 93c83016fb..70dd9c7751 100644 --- a/source/op/descrpt_se_a_gpu.cc +++ b/source/op/descrpt_se_a_gpu.cc @@ -7,7 +7,6 @@ #include "tensorflow/core/framework/shape_inference.h" using namespace tensorflow; // NOLINT(build/namespaces) -#define MAGIC_NUMBER 256 #ifdef HIGH_PREC typedef double VALUETYPE ; @@ -159,7 +158,8 @@ class DescrptSeAOp : public OpKernel { OP_REQUIRES (context, (ntypes == int(sel_a.size())), errors::InvalidArgument ("number of types should match the length of sel array")); OP_REQUIRES (context, (ntypes == int(sel_r.size())), errors::InvalidArgument ("number of types should match the length of sel array")); - + OP_REQUIRES (context, (nnei <= 1024), errors::InvalidArgument ("Assert failed, max neighbor size of atom(nnei) " + std::to_string(nnei) + " is larger than 1024!, which currently is not supported by deepmd-kit.")); + // Create output tensors TensorShape descrpt_shape ; descrpt_shape.AddDim (nsamples); @@ -201,7 +201,6 @@ class DescrptSeAOp : public OpKernel { cudaErrcheck(cudaMemcpy(&(array_longlong), 20 + mesh_tensor.flat().data(), sizeof(unsigned long long *), cudaMemcpyDeviceToHost)); cudaErrcheck(cudaMemcpy(&(array_double), 24 + mesh_tensor.flat().data(), sizeof(compute_t *), cudaMemcpyDeviceToHost)); - // cudaErrcheck(cudaMemcpy(jlist, host_jlist, sizeof(int) * nloc * MAGIC_NUMBER, cudaMemcpyHostToDevice)); // Launch computation for (int II = 0; II < nsamples; II++) { DescrptSeALauncher(coord_tensor.matrix().data() + II * (nall * 3), // related to the kk argument diff --git 
a/source/op/descrpt_se_r.cc b/source/op/descrpt_se_r.cc index 04624d1e83..6798df503c 100644 --- a/source/op/descrpt_se_r.cc +++ b/source/op/descrpt_se_r.cc @@ -108,15 +108,34 @@ class DescrptSeROp : public OpKernel { int nei_mode = 0; if (mesh_tensor.shape().dim_size(0) == 16) { + // lammps neighbor list nei_mode = 3; } else if (mesh_tensor.shape().dim_size(0) == 12) { + // user provided extended mesh nei_mode = 2; } else if (mesh_tensor.shape().dim_size(0) == 6) { + // manual copied pbc assert (nloc == nall); nei_mode = 1; } + else if (mesh_tensor.shape().dim_size(0) == 0) { + // no pbc + nei_mode = -1; + } + else { + throw runtime_error("invalid mesh tensor"); + } + bool b_pbc = true; + // if region is given extended, do not use pbc + if (nei_mode >= 1 || nei_mode == -1) { + b_pbc = false; + } + bool b_norm_atom = false; + if (nei_mode == 1){ + b_norm_atom = true; + } // Create an output tensor TensorShape descrpt_shape ; @@ -178,7 +197,7 @@ class DescrptSeROp : public OpKernel { for (int dd = 0; dd < 3; ++dd){ d_coord3[ii*3+dd] = coord(kk, ii*3+dd); } - if (nei_mode <= 1){ + if (b_norm_atom){ compute_t inter[3]; region.phys2Inter (inter, &d_coord3[3*ii]); for (int dd = 0; dd < 3; ++dd){ @@ -241,14 +260,11 @@ class DescrptSeROp : public OpKernel { } ::build_nlist (d_nlist_null, d_nlist, d_coord3, nloc, -1, rcut, nat_stt, ncell, ext_stt, ext_end, region, ncell); } - else { - build_nlist (d_nlist_null, d_nlist, -1, rcut, d_coord3, region); + else if (nei_mode == -1){ + ::build_nlist (d_nlist_null, d_nlist, d_coord3, -1, rcut, NULL); } - - bool b_pbc = true; - // if region is given extended, do not use pbc - if (nei_mode >= 1) { - b_pbc = false; + else { + throw runtime_error("unknow neighbor mode"); } // loop over atoms, compute descriptors for each atom @@ -333,48 +349,6 @@ class DescrptSeROp : public OpKernel { sec[ii] = sec[ii-1] + n_sel[ii-1]; } } - void - build_nlist (vector > & nlist0, - vector > & nlist1, - const compute_t & rc0_, - const compute_t & rc1_, - 
const vector & posi3, - const SimulationRegion & region) const { - compute_t rc0 (rc0_); - compute_t rc1 (rc1_); - assert (rc0 <= rc1); - compute_t rc02 = rc0 * rc0; - // negative rc0 means not applying rc0 - if (rc0 < 0) rc02 = 0; - compute_t rc12 = rc1 * rc1; - - unsigned natoms = posi3.size()/3; - nlist0.clear(); - nlist1.clear(); - nlist0.resize(natoms); - nlist1.resize(natoms); - for (unsigned ii = 0; ii < natoms; ++ii){ - nlist0[ii].reserve (60); - nlist1[ii].reserve (60); - } - for (unsigned ii = 0; ii < natoms; ++ii){ - for (unsigned jj = ii+1; jj < natoms; ++jj){ - compute_t diff[3]; - region.diffNearestNeighbor (posi3[jj*3+0], posi3[jj*3+1], posi3[jj*3+2], - posi3[ii*3+0], posi3[ii*3+1], posi3[ii*3+2], - diff[0], diff[1], diff[2]); - compute_t r2 = MathUtilities::dot (diff, diff); - if (r2 < rc02) { - nlist0[ii].push_back (jj); - nlist0[jj].push_back (ii); - } - else if (r2 < rc12) { - nlist1[ii].push_back (jj); - nlist1[jj].push_back (ii); - } - } - } - } }; REGISTER_KERNEL_BUILDER(Name("DescrptSeR").Device(DEVICE_CPU), DescrptSeROp); diff --git a/source/op/ewald_recp.cc b/source/op/ewald_recp.cc new file mode 100644 index 0000000000..29daaa53c8 --- /dev/null +++ b/source/op/ewald_recp.cc @@ -0,0 +1,157 @@ +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include + +#include "Ewald.h" + +typedef double boxtensor_t ; + +using namespace tensorflow; +using namespace std; + +#ifdef HIGH_PREC +typedef double VALUETYPE ; +#else +typedef float VALUETYPE ; +#endif + +#ifdef HIGH_PREC +REGISTER_OP("EwaldRecp") +.Input("coord: double") +.Input("charge: double") +.Input("natoms: int32") +.Input("box: double") +.Attr("ewald_beta: float") +.Attr("ewald_h: float") +.Output("energy: double") +.Output("force: double") +.Output("virial: double"); +#else +REGISTER_OP("EwaldRecp") +.Input("coord: float") +.Input("charge: float") +.Input("natoms: int32") +.Input("box: 
float") +.Attr("ewald_beta: float") +.Attr("ewald_h: float") +.Output("energy: float") +.Output("force: float") +.Output("virial: float"); +#endif + +class EwaldRecpOp : public OpKernel { +public: + explicit EwaldRecpOp(OpKernelConstruction* context) : OpKernel(context) { + float beta, spacing; + OP_REQUIRES_OK(context, context->GetAttr("ewald_beta", &(beta))); + OP_REQUIRES_OK(context, context->GetAttr("ewald_h", &(spacing))); + ep.beta = beta; + ep.spacing = spacing; + } + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int cc = 0; + const Tensor& coord_tensor = context->input(cc++); + const Tensor& charge_tensor = context->input(cc++); + const Tensor& natoms_tensor = context->input(cc++); + const Tensor& box_tensor = context->input(cc++); + + // set size of the sample + OP_REQUIRES (context, (coord_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of coord should be 1")); + OP_REQUIRES (context, (charge_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of charge should be 1")); + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) == 1), errors::InvalidArgument ("size of natoms should be 1")); + OP_REQUIRES (context, (box_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of box should be 1")); + auto natoms = natoms_tensor.flat(); + int nloc = natoms(0); + int nsamples = coord_tensor.shape().dim_size(0) / (nloc * 3); + + // check the sizes + OP_REQUIRES (context, (nsamples * nloc * 3 == coord_tensor.shape().dim_size(0)), errors::InvalidArgument ("coord number of samples should match")); + OP_REQUIRES (context, (nsamples * nloc * 1 == charge_tensor.shape().dim_size(0)), errors::InvalidArgument ("charge number of samples should match")); + OP_REQUIRES (context, (nsamples * 9 == box_tensor.shape().dim_size(0)), errors::InvalidArgument ("box number of samples should match")); + + // Create an output tensor + TensorShape energy_shape ; + energy_shape.AddDim (nsamples); + TensorShape force_shape ; + 
force_shape.AddDim (nsamples); + force_shape.AddDim (nloc * 3); + TensorShape virial_shape ; + virial_shape.AddDim (nsamples); + virial_shape.AddDim (9); + + cc = 0; + Tensor* energy_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(cc++, energy_shape, &energy_tensor)); + Tensor* force_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(cc++, force_shape, &force_tensor)); + Tensor* virial_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(cc++, virial_shape, &virial_tensor)); + + auto coord = coord_tensor .flat(); + auto charge = charge_tensor .flat(); + auto box = box_tensor .flat(); + auto energy = energy_tensor ->flat(); + auto force = force_tensor ->matrix(); + auto virial = virial_tensor ->matrix(); + + for (int kk = 0; kk < nsamples; ++kk){ + int box_iter = kk * 9; + int coord_iter = kk * nloc * 3; + int charge_iter = kk * nloc; + // set region + boxtensor_t boxt [9] = {0}; + for (int dd = 0; dd < 9; ++dd) { + boxt[dd] = box(box_iter + dd); + } + SimulationRegion region; + region.reinitBox (boxt); + + // set & normalize coord + vector d_coord3_ (nloc*3); + for (int ii = 0; ii < nloc; ++ii){ + for (int dd = 0; dd < 3; ++dd){ + d_coord3_[ii*3+dd] = coord(coord_iter + ii*3+dd); + } + double inter[3]; + region.phys2Inter (inter, &d_coord3_[3*ii]); + for (int dd = 0; dd < 3; ++dd){ + if (inter[dd] < 0 ) inter[dd] += 1.; + else if (inter[dd] >= 1) inter[dd] -= 1.; + } + } + vector d_coord3 (nloc*3); + for (int ii = 0; ii < nloc * 3; ++ii) { + d_coord3[ii] = d_coord3_[ii]; + } + + // set charge + vector d_charge (nloc); + for (int ii = 0; ii < nloc; ++ii) d_charge[ii] = charge(charge_iter + ii); + + // prepare outputs vectors + VALUETYPE d_ener; + vector d_force(nloc*3); + vector d_virial(9); + + // compute + EwaldReciprocal(d_ener, d_force, d_virial, d_coord3, d_charge, region, ep); + + // copy output + energy(kk) = d_ener; + for (int ii = 0; ii < nloc * 3; ++ii){ + force(kk, ii) = d_force[ii]; + } + for (int ii = 0; ii 
< 9; ++ii){ + virial(kk, ii) = d_virial[ii]; + } + } + } +private: + EwaldParameters ep; +}; + +REGISTER_KERNEL_BUILDER(Name("EwaldRecp").Device(DEVICE_CPU), EwaldRecpOp); + diff --git a/source/op/prod_virial.cc b/source/op/prod_virial.cc index df061aa12f..55b0b4b60d 100644 --- a/source/op/prod_virial.cc +++ b/source/op/prod_virial.cc @@ -147,7 +147,7 @@ class ProdVirialOp : public OpKernel { VALUETYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd0) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 3 + dd1); + VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 3 + dd0); virial (virial_iter + dd0 * 3 + dd1) += tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += tmp_v; } @@ -159,7 +159,7 @@ class ProdVirialOp : public OpKernel { VALUETYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd0) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 6 + dd1); + VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 6 + dd0); virial (virial_iter + dd0 * 3 + dd1) += tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += tmp_v; } @@ -173,7 +173,7 @@ class ProdVirialOp : public OpKernel { VALUETYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd0) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 9 + dd1); + VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 
+ aa * 12 + 9 + dd0); virial (virial_iter + dd0 * 3 + dd1) += tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += tmp_v; } diff --git a/source/op/prod_virial_grad.cc b/source/op/prod_virial_grad.cc index aaa0e75b14..5257467029 100644 --- a/source/op/prod_virial_grad.cc +++ b/source/op/prod_virial_grad.cc @@ -152,7 +152,7 @@ class ProdVirialGradOp : public OpKernel for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ grad_net (grad_net_iter + i_idx * ndescrpt + aa) += - -1.0 * grad (grad_iter + dd0 * 3 + dd1) * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd0) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 3 + dd1); + -1.0 * grad (grad_iter + dd0 * 3 + dd1) * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 3 + dd0); } } } @@ -162,7 +162,7 @@ class ProdVirialGradOp : public OpKernel for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ grad_net (grad_net_iter + i_idx * ndescrpt + aa) += - -1.0 * grad (grad_iter + dd0 * 3 + dd1) * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd0) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 6 + dd1); + -1.0 * grad (grad_iter + dd0 * 3 + dd1) * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 6 + dd0); } } } @@ -174,7 +174,7 @@ class ProdVirialGradOp : public OpKernel for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ grad_net (grad_net_iter + i_idx * ndescrpt + aa) += - -1.0 * grad (grad_iter + dd0 * 3 + dd1) * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd0) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 9 + dd1); + -1.0 * grad (grad_iter + dd0 * 3 + dd1) * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 12 + aa * 12 + 9 + dd0); } } } diff --git a/source/op/prod_virial_se_a.cc b/source/op/prod_virial_se_a.cc index 89077750af..2f71d37505 100644 --- a/source/op/prod_virial_se_a.cc +++ 
b/source/op/prod_virial_se_a.cc @@ -134,7 +134,7 @@ class ProdVirialSeAOp : public OpKernel { VALUETYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + aa); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd0) * in_deriv (in_iter + i_idx * ndescrpt * 3 + aa * 3 + dd1); + VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + aa * 3 + dd0); virial (virial_iter + dd0 * 3 + dd1) -= tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) -= tmp_v; } diff --git a/source/op/prod_virial_se_a_grad.cc b/source/op/prod_virial_se_a_grad.cc index 0d19a1c19a..660f652566 100644 --- a/source/op/prod_virial_se_a_grad.cc +++ b/source/op/prod_virial_se_a_grad.cc @@ -137,7 +137,7 @@ class ProdVirialSeAGradOp : public OpKernel for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ grad_net (grad_net_iter + i_idx * ndescrpt + aa) -= - -1.0 * grad (grad_iter + dd0 * 3 + dd1) * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd0) * in_deriv (in_iter + i_idx * ndescrpt * 3 + aa * 3 + dd1); + -1.0 * grad (grad_iter + dd0 * 3 + dd1) * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + aa * 3 + dd0); } } } diff --git a/source/op/prod_virial_se_r.cc b/source/op/prod_virial_se_r.cc index f9b5a71d84..1d21234421 100644 --- a/source/op/prod_virial_se_r.cc +++ b/source/op/prod_virial_se_r.cc @@ -122,7 +122,7 @@ class ProdVirialSeROp : public OpKernel { VALUETYPE pref = -1.0 * net_deriv (net_iter + i_idx * ndescrpt + jj); for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ - VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd0) * in_deriv (in_iter + i_idx * ndescrpt * 3 + jj * 3 + dd1); + VALUETYPE tmp_v = pref * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + jj * 3 + dd0); virial 
(virial_iter + dd0 * 3 + dd1) -= tmp_v; atom_virial (atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) -= tmp_v; } diff --git a/source/op/prod_virial_se_r_grad.cc b/source/op/prod_virial_se_r_grad.cc index 002aa1b907..20b53cf3c9 100644 --- a/source/op/prod_virial_se_r_grad.cc +++ b/source/op/prod_virial_se_r_grad.cc @@ -126,7 +126,7 @@ class ProdVirialSeRGradOp : public OpKernel for (int dd0 = 0; dd0 < 3; ++dd0){ for (int dd1 = 0; dd1 < 3; ++dd1){ grad_net (grad_net_iter + i_idx * ndescrpt + jj) -= - -1.0 * grad (grad_iter + dd0 * 3 + dd1) * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd0) * in_deriv (in_iter + i_idx * ndescrpt * 3 + jj * 3 + dd1); + -1.0 * grad (grad_iter + dd0 * 3 + dd1) * rij (rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * in_deriv (in_iter + i_idx * ndescrpt * 3 + jj * 3 + dd0); } } } diff --git a/source/scripts/config.py b/source/scripts/config.py index 6e87cf8329..5d3d804b26 100644 --- a/source/scripts/config.py +++ b/source/scripts/config.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -import glob,os,sys,json,argparse +import glob,os,json import numpy as np diff --git a/source/scripts/freeze.py b/source/scripts/freeze.py index 2420251744..2e6b211bc8 100755 --- a/source/scripts/freeze.py +++ b/source/scripts/freeze.py @@ -3,29 +3,10 @@ # freeze.py : # see https://blog.metaflow.fr/tensorflow-how-to-freeze-a-model-and-serve-it-with-a-python-api-d4f3596b3adc -import platform -import os, argparse, json -import sys - from deepmd.env import tf - -dir = os.path.dirname(os.path.realpath(__file__)) - -from tensorflow.python.framework import ops - -# load force module -if platform.system() == "Windows": - ext = "dll" -elif platform.system() == "Darwin": - ext = "dylib" -else: - ext = "so" -module_path = os.path.dirname(os.path.realpath(__file__)) + "/../" -assert (os.path.isfile (module_path + "deepmd/libop_abi.{}".format(ext) )), "force module does not exist" -op_module = tf.load_op_library(module_path + "deepmd/libop_abi.{}".format(ext)) +from deepmd.env 
import op_module # load grad of force module -sys.path.append (module_path ) import deepmd._prod_force_grad import deepmd._prod_virial_grad import deepmd._prod_force_se_a_grad @@ -35,19 +16,21 @@ import deepmd._soft_min_force_grad import deepmd._soft_min_virial_grad -def _make_node_names(model_type = None) : +def _make_node_names(model_type, modifier_type = None) : if model_type == 'ener': nodes = "o_energy,o_force,o_virial,o_atom_energy,o_atom_virial,descrpt_attr/rcut,descrpt_attr/ntypes,fitting_attr/dfparam,fitting_attr/daparam,model_attr/tmap,model_attr/model_type" elif model_type == 'wfc': nodes = "o_wfc,descrpt_attr/rcut,descrpt_attr/ntypes,model_attr/tmap,model_attr/sel_type,model_attr/model_type" elif model_type == 'dipole': - nodes = "o_dipole,descrpt_attr/rcut,descrpt_attr/ntypes,model_attr/tmap,model_attr/sel_type,model_attr/model_type" + nodes = "o_dipole,o_rmat,o_rmat_deriv,o_nlist,o_rij,descrpt_attr/rcut,descrpt_attr/ntypes,descrpt_attr/sel,descrpt_attr/ndescrpt,model_attr/tmap,model_attr/sel_type,model_attr/model_type,model_attr/output_dim" elif model_type == 'polar': nodes = "o_polar,descrpt_attr/rcut,descrpt_attr/ntypes,model_attr/tmap,model_attr/sel_type,model_attr/model_type" elif model_type == 'global_polar': nodes = "o_global_polar,descrpt_attr/rcut,descrpt_attr/ntypes,model_attr/tmap,model_attr/sel_type,model_attr/model_type" else: raise RuntimeError('unknow model type ' + model_type) + if modifier_type == 'dipole_charge': + nodes += ",modifier_attr/type,modifier_attr/mdl_name,modifier_attr/mdl_charge_map,modifier_attr/sys_charge_map,modifier_attr/ewald_h,modifier_attr/ewald_beta,dipole_charge/descrpt_attr/rcut,dipole_charge/descrpt_attr/ntypes,dipole_charge/model_attr/tmap,dipole_charge/model_attr/model_type,o_dm_force,dipole_charge/model_attr/sel_type,dipole_charge/o_dipole,dipole_charge/model_attr/output_dim,o_dm_virial,o_dm_av" return nodes def freeze_graph(model_folder, @@ -75,13 +58,18 @@ def freeze_graph(model_folder, # We retrieve the 
protobuf graph definition graph = tf.get_default_graph() input_graph_def = graph.as_graph_def() + nodes = [n.name for n in input_graph_def.node] # We start a session and restore the graph weights with tf.Session() as sess: saver.restore(sess, input_checkpoint) model_type = sess.run('model_attr/model_type:0', feed_dict = {}).decode('utf-8') + if 'modifier_attr/type' in nodes: + modifier_type = sess.run('modifier_attr/type:0', feed_dict = {}).decode('utf-8') + else: + modifier_type = None if output_node_names is None : - output_node_names = _make_node_names(model_type) + output_node_names = _make_node_names(model_type, modifier_type) print('The following nodes will be frozen: %s' % output_node_names) # We use a built-in TF helper to export variables to constants diff --git a/source/tests/common.py b/source/tests/common.py index 812a238c4a..f92fe397cc 100644 --- a/source/tests/common.py +++ b/source/tests/common.py @@ -11,7 +11,7 @@ global_default_dw_hh = 1e-2 global_default_places = 3 else : - global_default_fv_hh = 1e-6 + global_default_fv_hh = 1e-5 global_default_dw_hh = 1e-4 global_default_places = 5 @@ -35,26 +35,34 @@ def gen_data() : class Data(): def __init__ (self, rand_pert = 0.1, - seed = 1) : + seed = 1, + box_scale = 20) : coord = [[0.0, 0.0, 0.1], [1.1, 0.0, 0.1], [0.0, 1.1, 0.1], [4.0, 0.0, 0.0], [5.1, 0.0, 0.0], [4.0, 1.1, 0.0]] + self.nframes = 1 self.coord = np.array(coord) + self.coord = self._copy_nframes(self.coord) np.random.seed(seed) self.coord += rand_pert * np.random.random(self.coord.shape) self.fparam = np.array([[0.1, 0.2]]) self.aparam = np.tile(self.fparam, [1, 6]) + self.fparam = self._copy_nframes(self.fparam) + self.aparam = self._copy_nframes(self.aparam) self.atype = np.array([0, 1, 1, 0, 1, 1], dtype = int) - self.cell = 20 * np.eye(3) - self.nframes = 1 + self.cell = box_scale * np.eye(3) + self.cell = self._copy_nframes(self.cell) self.coord = self.coord.reshape([self.nframes, -1]) self.cell = self.cell.reshape([self.nframes, 
-1]) self.natoms = len(self.atype) self.idx_map = np.lexsort ((np.arange(self.natoms), self.atype)) - self.coord = self.coord.reshape([1, -1, 3]) + self.coord = self.coord.reshape([self.nframes, -1, 3]) self.coord = self.coord[:,self.idx_map,:] - self.coord = self.coord.reshape([1, -1]) + self.coord = self.coord.reshape([self.nframes, -1]) self.atype = self.atype[self.idx_map] - self.datype = np.tile(self.atype, [self.nframes,1]) + self.datype = self._copy_nframes(self.atype) + + def _copy_nframes(self, xx): + return np.tile(xx, [self.nframes, 1]) def get_data(self) : return self.coord, self.cell, self.datype @@ -68,39 +76,80 @@ def get_natoms (self) : def get_ntypes(self) : return max(self.atype) + 1 + # def get_test_box_data (self, + # hh) : + # coord0_, box0_, type0_ = self.get_data() + # coord0 = coord0_[0] + # box0 = box0_[0] + # type0 = type0_[0] + # nc = np.array( [coord0, coord0*(1+hh), coord0*(1-hh)] ) + # nb = np.array( [box0, box0*(1+hh), box0*(1-hh)] ) + # nt = np.array( [type0, type0, type0] ) + # for dd in range(3) : + # tmpc = np.copy (coord0) + # tmpb = np.copy (box0) + # tmpc = np.reshape(tmpc, [-1, 3]) + # tmpc [:,dd] *= (1+hh) + # tmpc = np.reshape(tmpc, [-1]) + # tmpb = np.reshape(tmpb, [-1, 3]) + # tmpb [dd,:] *= (1+hh) + # tmpb = np.reshape(tmpb, [-1]) + # nc = np.append (nc, [tmpc], axis = 0) + # nb = np.append (nb, [tmpb], axis = 0) + # nt = np.append (nt, [type0], axis = 0) + # tmpc = np.copy (coord0) + # tmpb = np.copy (box0) + # tmpc = np.reshape(tmpc, [-1, 3]) + # tmpc [:,dd] *= (1-hh) + # tmpc = np.reshape(tmpc, [-1]) + # tmpb = np.reshape(tmpb, [-1, 3]) + # tmpb [dd,:] *= (1-hh) + # tmpb = np.reshape(tmpb, [-1]) + # nc = np.append (nc, [tmpc], axis = 0) + # nb = np.append (nb, [tmpb], axis = 0) + # nt = np.append (nt, [type0], axis = 0) + # return nc, nb, nt + def get_test_box_data (self, - hh) : + hh, + rand_pert = 0.1) : coord0_, box0_, type0_ = self.get_data() - coord0 = coord0_[0] - box0 = box0_[0] - type0 = type0_[0] - nc = 
np.array( [coord0, coord0*(1+hh), coord0*(1-hh)] ) - nb = np.array( [box0, box0*(1+hh), box0*(1-hh)] ) - nt = np.array( [type0, type0, type0] ) - for dd in range(3) : - tmpc = np.copy (coord0) - tmpb = np.copy (box0) - tmpc = np.reshape(tmpc, [-1, 3]) - tmpc [:,dd] *= (1+hh) - tmpc = np.reshape(tmpc, [-1]) - tmpb = np.reshape(tmpb, [-1, 3]) - tmpb [dd,:] *= (1+hh) - tmpb = np.reshape(tmpb, [-1]) - nc = np.append (nc, [tmpc], axis = 0) - nb = np.append (nb, [tmpb], axis = 0) - nt = np.append (nt, [type0], axis = 0) - tmpc = np.copy (coord0) - tmpb = np.copy (box0) - tmpc = np.reshape(tmpc, [-1, 3]) - tmpc [:,dd] *= (1-hh) - tmpc = np.reshape(tmpc, [-1]) - tmpb = np.reshape(tmpb, [-1, 3]) - tmpb [dd,:] *= (1-hh) - tmpb = np.reshape(tmpb, [-1]) - nc = np.append (nc, [tmpc], axis = 0) - nb = np.append (nb, [tmpb], axis = 0) - nt = np.append (nt, [type0], axis = 0) - return nc, nb, nt + coord = coord0_[0] + box = box0_[0] + box += rand_pert * np.random.random(box.shape) + atype = type0_[0] + nframes = 1 + natoms = coord.size // 3 + box3 = np.reshape(box, [nframes, 3,3]) + rbox3 = np.linalg.inv(box3) + coord3 = np.reshape(coord, [nframes, natoms, 3]) + rcoord3 = np.matmul(coord3, rbox3) + + all_coord = [coord.reshape([nframes, natoms*3])] + all_box = [box.reshape([nframes,9])] + all_atype = [atype] + for ii in range(3): + for jj in range(3): + box3p = np.copy(box3) + box3m = np.copy(box3) + box3p[:,ii,jj] = box3[:,ii,jj] + hh + box3m[:,ii,jj] = box3[:,ii,jj] - hh + boxp = np.reshape(box3p, [-1,9]) + boxm = np.reshape(box3m, [-1,9]) + coord3p = np.matmul(rcoord3, box3p) + coord3m = np.matmul(rcoord3, box3m) + coordp = np.reshape(coord3p, [nframes,-1]) + coordm = np.reshape(coord3m, [nframes,-1]) + all_coord.append(coordp) + all_coord.append(coordm) + all_box.append(boxp) + all_box.append(boxm) + all_atype.append(atype) + all_atype.append(atype) + all_coord = np.reshape(all_coord, [-1, natoms * 3]) + all_box = np.reshape(all_box, [-1, 9]) + all_atype = 
np.reshape(all_atype, [-1, natoms]) + return all_coord, all_box, all_atype def force_test (inter, @@ -178,16 +227,22 @@ def virial_test (inter, inter.type: dtype, inter.tnatoms: inter.natoms} ) - # check - ana_vir3 = (virial[0][0] + virial[0][4] + virial[0][8])/3. / comp_vol(dbox[0]) - num_vir3 = -(energy[1] - energy[2]) / (comp_vol(dbox[1]) - comp_vol(dbox[2])) - testCase.assertAlmostEqual(ana_vir3, num_vir3, places=places) - vir_idx = [0, 4, 8] - for dd in range (3) : - ana_v = (virial[0][vir_idx[dd]] / comp_vol(dbox[0])) - idx = 2 * (dd+1) + 1 - num_v = ( -(energy[idx] - energy[idx+1]) / (comp_vol(dbox[idx]) - comp_vol(dbox[idx+1])) ) - testCase.assertAlmostEqual(ana_v, num_v, places=places) + ana_vir = virial[0].reshape([3,3]) + num_vir = np.zeros([3,3]) + for ii in range(3): + for jj in range(3): + ep = energy[1+(ii*3+jj)*2+0] + em = energy[1+(ii*3+jj)*2+1] + num_vir[ii][jj] = -(ep - em) / (2.*hh) + num_vir = np.transpose(num_vir, [1,0]) + box3 = dbox[0].reshape([3,3]) + num_vir = np.matmul(num_vir, box3) + for ii in range(3): + for jj in range(3): + testCase.assertAlmostEqual(ana_vir[ii][jj], num_vir[ii][jj], + places=places, + msg = 'virial component %d %d ' % (ii,jj)) + def force_dw_test (inter, diff --git a/source/tests/data_modifier/dipole.json b/source/tests/data_modifier/dipole.json new file mode 100644 index 0000000000..283ebe161b --- /dev/null +++ b/source/tests/data_modifier/dipole.json @@ -0,0 +1,60 @@ +{ + "with_distrib": false, + "_comment": " model parameters", + "model":{ + "type_map": ["O", "H"], + "descriptor" :{ + "type": "se_a", + "sel": [46, 92], + "rcut_smth": 3.80, + "rcut": 4.00, + "neuron": [25, 50, 100], + "resnet_dt": false, + "axis_neuron": 6, + "seed": 1, + "_comment": " that's all" + }, + "fitting_net": { + "type": "dipole", + "dipole_type": [0], + "neuron": [100, 100, 100], + "resnet_dt": true, + "seed": 1, + "_comment": " that's all" + }, + "_comment": " that's all" + }, + + "learning_rate" :{ + "type": "exp", + "start_lr": 0.01, 
+ "decay_steps": 5000, + "decay_rate": 0.95, + "_comment": "that's all" + }, + + "_comment": " traing controls", + "training": { + "systems": ["data_modifier/sys_10"], + "set_prefix": "set", + "stop_batch": 1000000, + "batch_size": 4, + + "seed": 1, + + "_comment": " display and restart", + "_comment": " frequencies counted in batch", + "disp_file": "lcurve.out", + "disp_freq": 100, + "numb_test": 5, + "save_freq": 500, + "save_ckpt": "model.ckpt", + "load_ckpt": "model.ckpt", + "disp_training":true, + "time_training":true, + "_comment": "that's all" + }, + + "_comment": "that's all" +} + diff --git a/source/tests/data_modifier/sys_10/set.000/box.npy b/source/tests/data_modifier/sys_10/set.000/box.npy new file mode 100644 index 0000000000..4935ebe88c Binary files /dev/null and b/source/tests/data_modifier/sys_10/set.000/box.npy differ diff --git a/source/tests/data_modifier/sys_10/set.000/coord.npy b/source/tests/data_modifier/sys_10/set.000/coord.npy new file mode 100644 index 0000000000..7bd26fdc05 Binary files /dev/null and b/source/tests/data_modifier/sys_10/set.000/coord.npy differ diff --git a/source/tests/data_modifier/sys_10/set.000/dipole.npy b/source/tests/data_modifier/sys_10/set.000/dipole.npy new file mode 100644 index 0000000000..960a39f74a Binary files /dev/null and b/source/tests/data_modifier/sys_10/set.000/dipole.npy differ diff --git a/source/tests/data_modifier/sys_10/set.000/energy.npy b/source/tests/data_modifier/sys_10/set.000/energy.npy new file mode 100644 index 0000000000..0aa3b12a38 Binary files /dev/null and b/source/tests/data_modifier/sys_10/set.000/energy.npy differ diff --git a/source/tests/data_modifier/sys_10/set.000/force.npy b/source/tests/data_modifier/sys_10/set.000/force.npy new file mode 100644 index 0000000000..9fe56bda58 Binary files /dev/null and b/source/tests/data_modifier/sys_10/set.000/force.npy differ diff --git a/source/tests/data_modifier/sys_10/type.raw b/source/tests/data_modifier/sys_10/type.raw new file mode 
100644 index 0000000000..59f789c42f --- /dev/null +++ b/source/tests/data_modifier/sys_10/type.raw @@ -0,0 +1,48 @@ +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 diff --git a/source/tests/data_modifier/sys_10/type_map.raw b/source/tests/data_modifier/sys_10/type_map.raw new file mode 100644 index 0000000000..2583239060 --- /dev/null +++ b/source/tests/data_modifier/sys_10/type_map.raw @@ -0,0 +1,2 @@ +Type_0 +Type_1 diff --git a/source/tests/test_data_modifier.py b/source/tests/test_data_modifier.py new file mode 100644 index 0000000000..4e7b43663e --- /dev/null +++ b/source/tests/test_data_modifier.py @@ -0,0 +1,157 @@ +import os,sys,platform,json +import numpy as np +import unittest +from deepmd.env import tf + +from deepmd.common import j_must_have, data_requirement +from deepmd.RunOptions import RunOptions +from deepmd.Trainer import NNPTrainer +from deepmd.DataSystem import DeepmdDataSystem +from deepmd.RunOptions import global_tf_float_precision +from deepmd.RunOptions import global_np_float_precision +from deepmd.RunOptions import global_ener_float_precision +from deepmd.EwaldRecp import EwaldRecp +from deepmd.DataModifier import DipoleChargeModifier + +from common import Data + +if global_np_float_precision == np.float32 : + global_default_fv_hh = 1e-2 + global_default_dw_hh = 1e-2 + global_default_places = 3 +else : + global_default_fv_hh = 1e-6 + global_default_dw_hh = 1e-4 + global_default_places = 5 + +modifier_datapath = 'data_modifier' + +class Args() : + INPUT = os.path.join(modifier_datapath, 'dipole.json') + restart = None + init_model = None + inter_threads = 0 + +class TestDataModifier (unittest.TestCase) : + + def setUp(self): + # with tf.variable_scope('load', reuse = False) : + tf.reset_default_graph() + self._setUp() + + def tearDown(self): + tf.reset_default_graph() + + def _setUp(self): + args = Args() + run_opt = RunOptions(args, False) + with 
open (args.INPUT, 'r') as fp: + jdata = json.load (fp) + + # init model + model = NNPTrainer (jdata, run_opt = run_opt) + rcut = model.model.get_rcut() + + # init data system + systems = j_must_have(jdata['training'], 'systems') + set_pfx = j_must_have(jdata['training'], 'set_prefix') + batch_size = j_must_have(jdata['training'], 'batch_size') + test_size = j_must_have(jdata['training'], 'numb_test') + data = DeepmdDataSystem(systems, + batch_size, + test_size, + rcut, + set_prefix=set_pfx) + data.add_dict(data_requirement) + + # clear the default graph + tf.reset_default_graph() + + # build the model with stats from the first system + model.build (data) + + # freeze the graph + with tf.Session() as sess: + init_op = tf.global_variables_initializer() + sess.run(init_op) + graph = tf.get_default_graph() + input_graph_def = graph.as_graph_def() + nodes = "o_dipole,o_rmat,o_rmat_deriv,o_nlist,o_rij,descrpt_attr/rcut,descrpt_attr/ntypes,descrpt_attr/sel,descrpt_attr/ndescrpt,model_attr/tmap,model_attr/sel_type,model_attr/model_type" + output_graph_def = tf.graph_util.convert_variables_to_constants( + sess, + input_graph_def, + nodes.split(",") + ) + output_graph = os.path.join(modifier_datapath, 'dipole.pb') + with tf.gfile.GFile(output_graph, "wb") as f: + f.write(output_graph_def.SerializeToString()) + + def test_fv(self): + # with tf.variable_scope('load', reuse = False) : + self._test_fv() + + def _test_fv (self): + dcm = DipoleChargeModifier(os.path.join(modifier_datapath, "dipole.pb"), + [-8], + [6, 1], + 1, + 0.25) + data = Data() + coord, box, atype = data.get_data() + atype = atype[0] + ve, vf, vv = dcm.eval(coord, box, atype) + + hh = global_default_fv_hh + hh=1e-4 + places = global_default_places + places=1 + nframes = coord.shape[0] + ndof = coord.shape[1] + natoms = ndof // 3 + vf = np.reshape(vf, [nframes, -1]) + for ii in range(ndof): + coordp = np.copy(coord) + coordm = np.copy(coord) + coordp[:,ii] += hh + coordm[:,ii] -= hh + ep, _, __ = 
dcm.eval(coordp, box, atype, eval_fv = False) + em, _, __ = dcm.eval(coordm, box, atype, eval_fv = False) + num_f = -(ep - em) / (2.*hh) + for ff in range(nframes): + self.assertAlmostEqual(vf[ff,ii], num_f[ff], + places = places, + msg = 'frame %d dof %d does not match' % (ff, ii)) + + box3 = np.reshape(box, [nframes, 3,3]) + rbox3 = np.linalg.inv(box3) + coord3 = np.reshape(coord, [nframes, natoms, 3]) + rcoord3 = np.matmul(coord3, rbox3) + num_deriv = np.zeros([nframes,3,3]) + for ii in range(3): + for jj in range(3): + box3p = np.copy(box3) + box3m = np.copy(box3) + box3p[:,ii,jj] = box3[:,ii,jj] + hh + box3m[:,ii,jj] = box3[:,ii,jj] - hh + boxp = np.reshape(box3p, [-1,9]) + boxm = np.reshape(box3m, [-1,9]) + coord3p = np.matmul(rcoord3, box3p) + coord3m = np.matmul(rcoord3, box3m) + coordp = np.reshape(coord3p, [nframes,-1]) + coordm = np.reshape(coord3m, [nframes,-1]) + ep, _, __ = dcm.eval(coordp, boxp, atype, eval_fv = False) + em, _, __ = dcm.eval(coordm, boxm, atype, eval_fv = False) + num_deriv[:,ii,jj] = -(ep - em) / (2.*hh) + # box3t = np.transpose(box3, [0,2,1]) + # t_esti = np.matmul(num_deriv, box3t) + num_deriv = np.transpose(num_deriv, [0,2,1]) + t_esti = np.matmul(num_deriv, box3) + + print(t_esti, '\n', vv.reshape([-1, 3, 3])) + for ff in range(nframes): + for ii in range(3): + for jj in range(3): + self.assertAlmostEqual(t_esti[ff][ii][jj], vv[ff,ii*3+jj], + places = places, + msg = "frame %d virial component [%d,%d] failed" % (ff, ii, jj)) + diff --git a/source/tests/test_data_modifier_shuffle.py b/source/tests/test_data_modifier_shuffle.py new file mode 100644 index 0000000000..91b69fd936 --- /dev/null +++ b/source/tests/test_data_modifier_shuffle.py @@ -0,0 +1,212 @@ +import os,sys,platform,json,shutil +import numpy as np +import unittest +import dpdata +from deepmd.env import tf + +from deepmd.common import j_must_have, data_requirement +from deepmd.RunOptions import RunOptions +from deepmd.Trainer import NNPTrainer +from deepmd.DataSystem 
import DeepmdDataSystem +from deepmd.RunOptions import global_tf_float_precision +from deepmd.RunOptions import global_np_float_precision +from deepmd.RunOptions import global_ener_float_precision +from deepmd.EwaldRecp import EwaldRecp +from deepmd.DataModifier import DipoleChargeModifier +from deepmd.DeepDipole import DeepDipole + +from common import Data + +if global_np_float_precision == np.float32 : + global_default_fv_hh = 1e-2 + global_default_dw_hh = 1e-2 + global_default_places = 3 +else : + global_default_fv_hh = 1e-6 + global_default_dw_hh = 1e-4 + global_default_places = 5 + +modifier_datapath = 'data_modifier' + +class Args() : + # INPUT = os.path.join(modifier_datapath, 'dipole.json') + restart = None + init_model = None + inter_threads = 0 + +class TestDataModifier (unittest.TestCase) : + + def setUp(self): + # with tf.variable_scope('load', reuse = False) : + tf.reset_default_graph() + self._setUp() + + def tearDown(self): + tf.reset_default_graph() + if os.path.isdir(os.path.join(modifier_datapath, 'sys_test_0')): + shutil.rmtree(os.path.join(modifier_datapath, 'sys_test_0')) + if os.path.isfile(os.path.join(modifier_datapath, 'dipole.pb')): + os.remove(os.path.join(modifier_datapath, 'dipole.pb')) + + def _setUp(self): + args = Args() + run_opt = RunOptions(args, False) + jdata = self._setUp_jdata() + self._setUp_data() + + # init model + model = NNPTrainer (jdata, run_opt = run_opt) + rcut = model.model.get_rcut() + + # init data system + systems = j_must_have(jdata['training'], 'systems') + set_pfx = j_must_have(jdata['training'], 'set_prefix') + batch_size = j_must_have(jdata['training'], 'batch_size') + test_size = j_must_have(jdata['training'], 'numb_test') + data = DeepmdDataSystem(systems, + batch_size, + test_size, + rcut, + set_prefix=set_pfx) + data.add_dict(data_requirement) + + # clear the default graph + tf.reset_default_graph() + + # build the model with stats from the first system + model.build (data) + + # freeze the graph + with 
tf.Session() as sess: + init_op = tf.global_variables_initializer() + sess.run(init_op) + graph = tf.get_default_graph() + input_graph_def = graph.as_graph_def() + nodes = "o_dipole,o_rmat,o_rmat_deriv,o_nlist,o_rij,descrpt_attr/rcut,descrpt_attr/ntypes,descrpt_attr/sel,descrpt_attr/ndescrpt,model_attr/tmap,model_attr/sel_type,model_attr/model_type" + output_graph_def = tf.graph_util.convert_variables_to_constants( + sess, + input_graph_def, + nodes.split(",") + ) + output_graph = os.path.join(modifier_datapath, 'dipole.pb') + with tf.gfile.GFile(output_graph, "wb") as f: + f.write(output_graph_def.SerializeToString()) + + def _setUp_data(self): + jdata = self._setUp_jdata() + # sys0 + self.atom_types0 = np.array([0, 3, 2, 1, 3, 4, 1, 4], dtype = int) + self.natoms = len(self.atom_types0) + self.nframes = 1 + scale = 10.0 + self.sel_type = jdata['model']['fitting_net']['dipole_type'] + self.nsel = 0 + for ii in self.sel_type: + self.nsel += np.sum(self.atom_types0 == ii) + self.coords0 = np.random.random([self.nframes, self.natoms * 3]) * scale + self.dipoles0 = np.random.random([self.nframes, self.nsel * 3]) + self.box0 = np.reshape(np.eye(3) * scale, [-1, 9]) + self.box0 = np.tile(self.box0, [self.nframes, 1]) + self._write_sys_data(os.path.join(modifier_datapath, 'sys_test_0'), + self.atom_types0, self.coords0, self.dipoles0, self.box0) + # sys1 + self.idx_map = np.array([6, 7, 1, 0, 5, 2, 4, 3], dtype = int) + self.sel_idx_map = np.array([3, 0, 2, 1], dtype = int) + self.atom_types1 = self.atom_types0[self.idx_map] + self.coords1 = np.reshape(self.coords0, [self.nframes, -1, 3]) + self.coords1 = self.coords1[:,self.idx_map,:] + self.coords1 = np.reshape(self.coords1, [self.nframes, self.natoms*3]) + self.dipoles1 = self.dipoles0[:,self.sel_idx_map] + self.box1 = self.box0 + + def _write_sys_data(self, dirname, atom_types, coords, dipoles, box): + os.makedirs(dirname, exist_ok = True) + os.makedirs(dirname+'/set.0', exist_ok = True) + 
np.savetxt(os.path.join(dirname, 'type.raw'), atom_types, fmt = '%d') + np.save(os.path.join(dirname, 'set.0', 'coord.npy'), coords) + np.save(os.path.join(dirname, 'set.0', 'dipole.npy'), dipoles) + np.save(os.path.join(dirname, 'set.0', 'box.npy'), box) + + def _setUp_jdata(self): + aa = {"a":[1,2,3]} + jdata = { + "model":{ + "type_map": ["A", "B", "C", "D", "E"], + "descriptor" :{ + "type": "se_a", + "sel": [50, 50, 50, 50, 50], + "rcut_smth": 3.80, + "rcut": 4.00, + "neuron": [2, 4], + "resnet_dt": False, + "axis_neuron": 4, + "seed": 1, + }, + "fitting_net": { + "type": "dipole", + "dipole_type": [1, 3], + "neuron": [10], + "resnet_dt": True, + "seed": 1, + }, + }, + "learning_rate" :{ + "type": "exp", + "start_lr": 0.01, + "decay_steps": 5000, + "decay_rate": 0.95, + }, + "training": { + "systems": ["data_modifier/sys_test_0"], + "set_prefix": "set", + "stop_batch": 1000000, + "batch_size": 1, + "numb_test": 2, + }, + } + return jdata + + + def test_z_dipole(self): + dd = DeepDipole(os.path.join(modifier_datapath, "dipole.pb")) + + dv0 = dd.eval(self.coords0, self.box0, self.atom_types0) + dv1 = dd.eval(self.coords1, self.box1, self.atom_types1) + + dv01 = dv0.reshape([self.nframes, -1, 3]) + dv01 = dv01[:,self.sel_idx_map, :] + dv01 = dv01.reshape([self.nframes, -1]) + dv1 = dv1.reshape([self.nframes, -1]) + + for ii in range(self.nframes): + for jj in range(self.nsel): + self.assertAlmostEqual( + dv01[ii][jj], dv1[ii][jj], + msg = "dipole [%d,%d] does not match" % (ii, jj)) + + + def test_modify(self): + dcm = DipoleChargeModifier(os.path.join(modifier_datapath, "dipole.pb"), + [-1, -3], + [1, 1, 1, 1, 1], + 1, + 0.25) + ve0, vf0, vv0 = dcm.eval(self.coords0, self.box0, self.atom_types0) + ve1, vf1, vv1 = dcm.eval(self.coords1, self.box1, self.atom_types1) + vf01 = vf0[:,self.idx_map, :] + + for ii in range(self.nframes): + self.assertAlmostEqual(ve0[ii], ve1[ii], + msg = 'energy %d should match' % ii) + for ii in range(self.nframes): + for jj in range(9): 
+ self.assertAlmostEqual(vv0[ii][jj], vv1[ii][jj], + msg = 'virial [%d,%d] should match' % (ii,jj)) + for ii in range(self.nframes): + for jj in range(self.natoms): + for dd in range(3): + self.assertAlmostEqual( + vf01[ii][jj][dd], vf1[ii][jj][dd], + msg = "force [%d,%d,%d] does not match" % (ii,jj,dd)) + + diff --git a/source/tests/test_deepmd_data.py b/source/tests/test_deepmd_data.py index 6db3017d00..005ab26893 100644 --- a/source/tests/test_deepmd_data.py +++ b/source/tests/test_deepmd_data.py @@ -10,6 +10,62 @@ else: places = 12 +class TestDataTypeSel(unittest.TestCase): + def setUp(self): + self.data_name = 'test_data' + os.makedirs(self.data_name, exist_ok = True) + os.makedirs(os.path.join(self.data_name,'set.foo'), exist_ok = True) + np.savetxt(os.path.join(self.data_name, 'type.raw'), + np.array([0, 1, 1, 0, 1, 1]), + fmt = '%d') + self.nframes = 3 + self.natoms = 6 + # coord + path = os.path.join(self.data_name, 'set.foo', 'coord.npy') + self.coord = np.random.random([self.nframes, self.natoms, 3]) + np.save(path, np.reshape(self.coord, [self.nframes, -1])) + self.coord = self.coord[:,[0,3,1,2,4,5],:] + self.coord = self.coord.reshape([self.nframes, -1]) + # box + path = os.path.join(self.data_name, 'set.foo', 'box.npy') + self.box = np.random.random([self.nframes, 9]) + np.save(path, self.box) + # value + path = os.path.join(self.data_name, 'set.foo', 'value_1.npy') + self.value_1 = np.arange(self.nframes * 2) + self.value_1 = np.reshape(self.value_1, [self.nframes, 2]) + np.save(path, self.value_1) + # value + path = os.path.join(self.data_name, 'set.foo', 'value_2.npy') + self.value_2 = np.arange(self.nframes * 4) + self.value_2 = np.reshape(self.value_2, [self.nframes, 4]) + np.save(path, self.value_2) + + def tearDown(self) : + shutil.rmtree(self.data_name) + + def test_load_set_1(self) : + dd = DeepmdData(self.data_name)\ + .add('value_1', 1, atomic=True, must=True, type_sel = [0]) + data = dd._load_set(os.path.join(self.data_name, 'set.foo')) + 
self.assertEqual(data['value_1'].shape, (self.nframes, 2)) + for ii in range(self.nframes): + for jj in range(2): + self.assertAlmostEqual(data['value_1'][ii][jj], + self.value_1[ii][jj]) + + + def test_load_set_2(self) : + dd = DeepmdData(self.data_name)\ + .add('value_2', 1, atomic=True, must=True, type_sel = [1]) + data = dd._load_set(os.path.join(self.data_name, 'set.foo')) + self.assertEqual(data['value_2'].shape, (self.nframes, 4)) + for ii in range(self.nframes): + for jj in range(4): + self.assertAlmostEqual(data['value_2'][ii][jj], + self.value_2[ii][jj]) + + class TestData (unittest.TestCase) : def setUp (self) : self.data_name = 'test_data' diff --git a/source/tests/test_deepmd_data_sys.py b/source/tests/test_deepmd_data_sys.py index 39c5a45a60..d08b148f3a 100644 --- a/source/tests/test_deepmd_data_sys.py +++ b/source/tests/test_deepmd_data_sys.py @@ -231,6 +231,38 @@ def test_get_batch(self): ), 0.0) + + def test_prob_sys_size_1(self) : + batch_size = 1 + test_size = 1 + ds = DeepmdDataSystem(self.sys_name, batch_size, test_size, 2.0) + prob = ds._prob_sys_size_ext("prob_sys_size; 0:2:2; 2:4:8") + self.assertAlmostEqual(np.sum(prob), 1) + self.assertAlmostEqual(np.sum(prob[0:2]), 0.2) + self.assertAlmostEqual(np.sum(prob[2:4]), 0.8) + # number of training set is self.nset-1 + # shift is the total number of set size shift... 
+ shift = np.sum(np.arange(self.nset-1)) + self.assertAlmostEqual(prob[1]/prob[0], float(self.nframes[1]*(self.nset-1)+shift)/float(self.nframes[0]*(self.nset-1)+shift)) + self.assertAlmostEqual(prob[3]/prob[2], float(self.nframes[3]*(self.nset-1)+shift)/float(self.nframes[2]*(self.nset-1)+shift)) + + + def test_prob_sys_size_2(self) : + batch_size = 1 + test_size = 1 + ds = DeepmdDataSystem(self.sys_name, batch_size, test_size, 2.0) + prob = ds._prob_sys_size_ext("prob_sys_size; 1:2:0.4; 2:4:1.6") + self.assertAlmostEqual(np.sum(prob), 1) + self.assertAlmostEqual(np.sum(prob[1:2]), 0.2) + self.assertAlmostEqual(np.sum(prob[2:4]), 0.8) + # number of training set is self.nset-1 + # shift is the total number of set size shift... + shift = np.sum(np.arange(self.nset-1)) + self.assertAlmostEqual(prob[0], 0.0) + self.assertAlmostEqual(prob[1], 0.2) + self.assertAlmostEqual(prob[3]/prob[2], float(self.nframes[3]*(self.nset-1)+shift)/float(self.nframes[2]*(self.nset-1)+shift)) + + def _idx_map(self, target, idx_map, ndof): natoms = len(idx_map) target = target.reshape([-1, natoms, ndof]) diff --git a/source/tests/test_descrpt_nonsmth.py b/source/tests/test_descrpt_nonsmth.py index 0414607b58..7d12e01e74 100644 --- a/source/tests/test_descrpt_nonsmth.py +++ b/source/tests/test_descrpt_nonsmth.py @@ -24,9 +24,10 @@ from deepmd.RunOptions import global_ener_float_precision class Inter(): - def __init__ (self, - data, - comp = 0) : + def setUp (self, + data, + comp = 0, + pbc = True) : self.sess = tf.Session() self.data = data self.natoms = self.data.get_natoms() @@ -46,10 +47,13 @@ def __init__ (self, dstd = np.ones ([self.ntypes, self.ndescrpt]) self.t_avg = tf.constant(davg.astype(global_np_float_precision)) self.t_std = tf.constant(dstd.astype(global_np_float_precision)) - self.default_mesh = np.zeros (6, dtype = np.int32) - self.default_mesh[3] = 2 - self.default_mesh[4] = 2 - self.default_mesh[5] = 2 + if pbc: + self.default_mesh = np.zeros (6, dtype = np.int32) + 
self.default_mesh[3] = 2 + self.default_mesh[4] = 2 + self.default_mesh[5] = 2 + else : + self.default_mesh = np.array([], dtype = np.int32) # make place holder self.coord = tf.placeholder(global_tf_float_precision, [None, self.natoms[0] * 3], name='t_coord') self.box = tf.placeholder(global_tf_float_precision, [None, 9], name='t_box') @@ -77,12 +81,13 @@ def comp_ef (self, tnatoms, name, reuse = None) : + t_default_mesh = tf.constant(self.default_mesh) descrpt, descrpt_deriv, rij, nlist, axis, rot_mat \ = op_module.descrpt (dcoord, dtype, tnatoms, dbox, - tf.constant(self.default_mesh), + t_default_mesh, self.t_avg, self.t_std, rcut_a = self.rcut_a, @@ -153,49 +158,112 @@ def comp_v_dw (self, class TestNonSmooth(Inter, unittest.TestCase): - def __init__ (self, *args, **kwargs): + # def __init__ (self, *args, **kwargs): + # self.places = 5 + # data = Data() + # Inter.__init__(self, data) + # unittest.TestCase.__init__(self, *args, **kwargs) + # self.controller = object() + + def setUp(self): self.places = 5 data = Data() - Inter.__init__(self, data) - unittest.TestCase.__init__(self, *args, **kwargs) - self.controller = object() + Inter.setUp(self, data) def test_force (self) : - force_test(self, self) - # t_energy, t_force, t_virial \ - # = self.comp_ef (self.coord, self.box, self.type, self.tnatoms, name = "test") - # self.sess.run (tf.global_variables_initializer()) - # dcoord, dbox, dtype = self.data.get_data () - # hh = 1e-6 - # dcoordp = np.copy(dcoord) - # dcoordm = np.copy(dcoord) - # dcoordp[0,0] = dcoord[0,0] + hh - # dcoordm[0,0] = dcoord[0,0] - hh - # [axis0, nlist0, d0] = self.sess.run ([self.axis, self.nlist, self.descrpt], - # feed_dict = { - # self.coord: dcoordp, - # self.box: dbox, - # self.type: dtype, - # self.tnatoms: self.natoms} - # ) - # [axis1, nlist1, d1] = self.sess.run ([self.axis, self.nlist, self.descrpt], - # feed_dict = { - # self.coord: dcoordm, - # self.box: dbox, - # self.type: dtype, - # self.tnatoms: self.natoms} - # ) - # 
print((nlist0 - nlist1)) - # print((axis0 - axis1)) + force_test(self, self, suffix = '_se') def test_virial (self) : - virial_test(self, self) + virial_test(self, self, suffix = '_se') def test_force_dw (self) : - force_dw_test(self, self) + force_dw_test(self, self, suffix = '_se') def test_virial_dw (self) : - virial_dw_test(self, self) + virial_dw_test(self, self, suffix = '_se') + + +class TestLFPbc(unittest.TestCase): + def test_pbc(self): + data = Data() + inter0 = Inter() + inter1 = Inter() + inter0.setUp(data, pbc = True) + inter1.setUp(data, pbc = False) + inter0.net_w_i = np.copy(np.ones(inter0.ndescrpt)) + inter1.net_w_i = np.copy(np.ones(inter1.ndescrpt)) + + t_energy0, t_force0, t_virial0 \ + = inter0.comp_ef (inter0.coord, inter0.box, inter0.type, inter0.tnatoms, name = "test_lf_pbc_true") + t_energy1, t_force1, t_virial1 \ + = inter1.comp_ef (inter1.coord, inter1.box, inter1.type, inter1.tnatoms, name = "test_lf_pbc_false") + + inter0.sess.run (tf.global_variables_initializer()) + inter1.sess.run (tf.global_variables_initializer()) + + dcoord, dbox, dtype = data.get_data () + + [e0, f0, v0] = inter0.sess.run ([t_energy0, t_force0, t_virial0], + feed_dict = { + inter0.coord: dcoord, + inter0.box: dbox, + inter0.type: dtype, + inter0.tnatoms: inter0.natoms}) + [e1, f1, v1] = inter1.sess.run ([t_energy1, t_force1, t_virial1], + feed_dict = { + inter1.coord: dcoord, + inter1.box: dbox, + inter1.type: dtype, + inter1.tnatoms: inter1.natoms}) + + self.assertAlmostEqual(e0[0], e1[0]) + for ii in range(f0[0].size): + # print(ii) + self.assertAlmostEqual(f0[0][ii], f1[0][ii]) + for ii in range(v0[0].size): + # print(ii) + self.assertAlmostEqual(v0[0][ii], v1[0][ii]) + + def test_pbc_small_box(self): + data0 = Data() + data1 = Data(box_scale = 2) + inter0 = Inter() + inter1 = Inter() + inter0.setUp(data0, pbc = True) + inter1.setUp(data1, pbc = False) + inter0.net_w_i = np.copy(np.ones(inter0.ndescrpt)) + inter1.net_w_i = np.copy(np.ones(inter1.ndescrpt)) + + 
t_energy0, t_force0, t_virial0 \ + = inter0.comp_ef (inter0.coord, inter0.box, inter0.type, inter0.tnatoms, name = "test_lf_pbc_sbox_true") + t_energy1, t_force1, t_virial1 \ + = inter1.comp_ef (inter1.coord, inter1.box, inter1.type, inter1.tnatoms, name = "test_lf_pbc_sbox_false") + + inter0.sess.run (tf.global_variables_initializer()) + inter1.sess.run (tf.global_variables_initializer()) + + dcoord, dbox, dtype = data0.get_data () + [e0, f0, v0] = inter0.sess.run ([t_energy0, t_force0, t_virial0], + feed_dict = { + inter0.coord: dcoord, + inter0.box: dbox, + inter0.type: dtype, + inter0.tnatoms: inter0.natoms}) + dcoord, dbox, dtype = data1.get_data () + [e1, f1, v1] = inter1.sess.run ([t_energy1, t_force1, t_virial1], + feed_dict = { + inter1.coord: dcoord, + inter1.box: dbox, + inter1.type: dtype, + inter1.tnatoms: inter1.natoms}) + + self.assertAlmostEqual(e0[0], e1[0]) + for ii in range(f0[0].size): + # print(ii) + self.assertAlmostEqual(f0[0][ii], f1[0][ii]) + for ii in range(v0[0].size): + # print(ii) + self.assertAlmostEqual(v0[0][ii], v1[0][ii]) if __name__ == '__main__': diff --git a/source/tests/test_descrpt_se_ar.py b/source/tests/test_descrpt_se_ar.py index 823bbb67ec..01c9e99496 100644 --- a/source/tests/test_descrpt_se_ar.py +++ b/source/tests/test_descrpt_se_ar.py @@ -26,8 +26,8 @@ from deepmd.RunOptions import global_ener_float_precision class Inter(): - def __init__ (self, - data) : + def setUp (self, + data) : self.sess = tf.Session() self.data = data self.natoms = self.data.get_natoms() @@ -89,7 +89,7 @@ def comp_ef (self, tnatoms, name, reuse = None) : - dout = self.descrpt.build(dcoord, dtype, tnatoms, dbox, self.default_mesh, self.avg, self.std, suffix=name, reuse=reuse) + dout = self.descrpt.build(dcoord, dtype, tnatoms, dbox, self.default_mesh, suffix=name, reuse=reuse) inputs_reshape = tf.reshape (dout, [-1, self.descrpt.get_dim_out()]) atom_ener = self._net (inputs_reshape, name, reuse = reuse) atom_ener_reshape = tf.reshape(atom_ener, 
[-1, self.natoms[0]]) @@ -99,11 +99,16 @@ def comp_ef (self, class TestDescrptAR(Inter, unittest.TestCase): - def __init__ (self, *args, **kwargs): + # def __init__ (self, *args, **kwargs): + # data = Data() + # Inter.__init__(self, data) + # unittest.TestCase.__init__(self, *args, **kwargs) + # self.controller = object() + + def setUp(self): + self.places = 5 data = Data() - Inter.__init__(self, data) - unittest.TestCase.__init__(self, *args, **kwargs) - self.controller = object() + Inter.setUp(self, data) def test_force (self) : force_test(self, self, suffix = '_se_ar') diff --git a/source/tests/test_descrpt_se_r.py b/source/tests/test_descrpt_se_r.py index 83d6dd1f47..fa21712f90 100644 --- a/source/tests/test_descrpt_se_r.py +++ b/source/tests/test_descrpt_se_r.py @@ -26,8 +26,9 @@ from deepmd.RunOptions import global_ener_float_precision class Inter(): - def __init__ (self, - data) : + def setUp (self, + data, + pbc = True) : self.sess = tf.Session() self.data = data self.natoms = self.data.get_natoms() @@ -42,10 +43,13 @@ def __init__ (self, dstd = np.ones ([self.ntypes, self.ndescrpt]) self.t_avg = tf.constant(davg.astype(global_np_float_precision)) self.t_std = tf.constant(dstd.astype(global_np_float_precision)) - self.default_mesh = np.zeros (6, dtype = np.int32) - self.default_mesh[3] = 2 - self.default_mesh[4] = 2 - self.default_mesh[5] = 2 + if pbc: + self.default_mesh = np.zeros (6, dtype = np.int32) + self.default_mesh[3] = 2 + self.default_mesh[4] = 2 + self.default_mesh[5] = 2 + else: + self.default_mesh = np.array([], dtype = np.int32) # make place holder self.coord = tf.placeholder(global_tf_float_precision, [None, self.natoms[0] * 3], name='t_coord') self.box = tf.placeholder(global_tf_float_precision, [None, 9], name='t_box') @@ -137,11 +141,16 @@ def comp_v_dw (self, class TestSmooth(Inter, unittest.TestCase): - def __init__ (self, *args, **kwargs): + # def __init__ (self, *args, **kwargs): + # data = Data() + # Inter.__init__(self, data) + # 
unittest.TestCase.__init__(self, *args, **kwargs) + # self.controller = object() + + def setUp(self): + self.places = 5 data = Data() - Inter.__init__(self, data) - unittest.TestCase.__init__(self, *args, **kwargs) - self.controller = object() + Inter.setUp(self, data) def test_force (self) : force_test(self, self, suffix = '_se_r') @@ -156,5 +165,89 @@ def test_virial_dw (self) : virial_dw_test(self, self, suffix = '_se_r') +class TestSeRPbc(unittest.TestCase): + def test_pbc(self): + data = Data() + inter0 = Inter() + inter1 = Inter() + inter0.setUp(data, pbc = True) + inter1.setUp(data, pbc = False) + inter0.net_w_i = np.copy(np.ones(inter0.ndescrpt)) + inter1.net_w_i = np.copy(np.ones(inter1.ndescrpt)) + + t_energy0, t_force0, t_virial0 \ + = inter0.comp_ef (inter0.coord, inter0.box, inter0.type, inter0.tnatoms, name = "test_ser_pbc_true") + t_energy1, t_force1, t_virial1 \ + = inter1.comp_ef (inter1.coord, inter1.box, inter1.type, inter1.tnatoms, name = "test_ser_pbc_false") + + inter0.sess.run (tf.global_variables_initializer()) + inter1.sess.run (tf.global_variables_initializer()) + + dcoord, dbox, dtype = data.get_data () + + [e0, f0, v0] = inter0.sess.run ([t_energy0, t_force0, t_virial0], + feed_dict = { + inter0.coord: dcoord, + inter0.box: dbox, + inter0.type: dtype, + inter0.tnatoms: inter0.natoms}) + [e1, f1, v1] = inter1.sess.run ([t_energy1, t_force1, t_virial1], + feed_dict = { + inter1.coord: dcoord, + inter1.box: dbox, + inter1.type: dtype, + inter1.tnatoms: inter1.natoms}) + + self.assertAlmostEqual(e0[0], e1[0]) + for ii in range(f0[0].size): + # print(ii) + self.assertAlmostEqual(f0[0][ii], f1[0][ii]) + for ii in range(v0[0].size): + # print(ii) + self.assertAlmostEqual(v0[0][ii], v1[0][ii]) + + + def test_pbc_small_box(self): + data0 = Data() + data1 = Data(box_scale = 2) + inter0 = Inter() + inter1 = Inter() + inter0.setUp(data0, pbc = True) + inter1.setUp(data1, pbc = False) + inter0.net_w_i = np.copy(np.ones(inter0.ndescrpt)) + 
inter1.net_w_i = np.copy(np.ones(inter1.ndescrpt)) + + t_energy0, t_force0, t_virial0 \ + = inter0.comp_ef (inter0.coord, inter0.box, inter0.type, inter0.tnatoms, name = "test_ser_pbc_sbox_true") + t_energy1, t_force1, t_virial1 \ + = inter1.comp_ef (inter1.coord, inter1.box, inter1.type, inter1.tnatoms, name = "test_ser_pbc_sbox_false") + + inter0.sess.run (tf.global_variables_initializer()) + inter1.sess.run (tf.global_variables_initializer()) + + dcoord, dbox, dtype = data0.get_data () + [e0, f0, v0] = inter0.sess.run ([t_energy0, t_force0, t_virial0], + feed_dict = { + inter0.coord: dcoord, + inter0.box: dbox, + inter0.type: dtype, + inter0.tnatoms: inter0.natoms}) + dcoord, dbox, dtype = data1.get_data () + [e1, f1, v1] = inter1.sess.run ([t_energy1, t_force1, t_virial1], + feed_dict = { + inter1.coord: dcoord, + inter1.box: dbox, + inter1.type: dtype, + inter1.tnatoms: inter1.natoms}) + + self.assertAlmostEqual(e0[0], e1[0]) + for ii in range(f0[0].size): + # print(ii) + self.assertAlmostEqual(f0[0][ii], f1[0][ii]) + for ii in range(v0[0].size): + # print(ii) + self.assertAlmostEqual(v0[0][ii], v1[0][ii]) + + if __name__ == '__main__': unittest.main() diff --git a/source/tests/test_descrpt_smooth.py b/source/tests/test_descrpt_smooth.py index f718a5925d..7876fbfc3b 100644 --- a/source/tests/test_descrpt_smooth.py +++ b/source/tests/test_descrpt_smooth.py @@ -26,8 +26,9 @@ from deepmd.RunOptions import global_ener_float_precision class Inter(): - def __init__ (self, - data) : + def setUp (self, + data, + pbc = True) : self.sess = tf.Session() self.data = data self.natoms = self.data.get_natoms() @@ -47,10 +48,13 @@ def __init__ (self, dstd = np.ones ([self.ntypes, self.ndescrpt]) self.t_avg = tf.constant(davg.astype(global_np_float_precision)) self.t_std = tf.constant(dstd.astype(global_np_float_precision)) - self.default_mesh = np.zeros (6, dtype = np.int32) - self.default_mesh[3] = 2 - self.default_mesh[4] = 2 - self.default_mesh[5] = 2 + if pbc: + 
self.default_mesh = np.zeros (6, dtype = np.int32) + self.default_mesh[3] = 2 + self.default_mesh[4] = 2 + self.default_mesh[5] = 2 + else: + self.default_mesh = np.array([], dtype = np.int32) # make place holder self.coord = tf.placeholder(global_tf_float_precision, [None, self.natoms[0] * 3], name='t_coord') self.box = tf.placeholder(global_tf_float_precision, [None, 9], name='t_box') @@ -148,11 +152,16 @@ def comp_v_dw (self, class TestSmooth(Inter, unittest.TestCase): - def __init__ (self, *args, **kwargs): + # def __init__ (self, *args, **kwargs): + # data = Data() + # Inter.__init__(self, data) + # unittest.TestCase.__init__(self, *args, **kwargs) + # self.controller = object() + + def setUp(self): + self.places = 5 data = Data() - Inter.__init__(self, data) - unittest.TestCase.__init__(self, *args, **kwargs) - self.controller = object() + Inter.setUp(self, data) def test_force (self) : force_test(self, self, suffix = '_smth') @@ -167,5 +176,88 @@ def test_virial_dw (self) : virial_dw_test(self, self, suffix = '_smth') +class TestSeAPbc(unittest.TestCase): + def test_pbc(self): + data = Data() + inter0 = Inter() + inter1 = Inter() + inter0.setUp(data, pbc = True) + inter1.setUp(data, pbc = False) + inter0.net_w_i = np.copy(np.ones(inter0.ndescrpt)) + inter1.net_w_i = np.copy(np.ones(inter1.ndescrpt)) + + t_energy0, t_force0, t_virial0 \ + = inter0.comp_ef (inter0.coord, inter0.box, inter0.type, inter0.tnatoms, name = "test_sea_pbc_true") + t_energy1, t_force1, t_virial1 \ + = inter1.comp_ef (inter1.coord, inter1.box, inter1.type, inter1.tnatoms, name = "test_sea_pbc_false") + + inter0.sess.run (tf.global_variables_initializer()) + inter1.sess.run (tf.global_variables_initializer()) + + dcoord, dbox, dtype = data.get_data () + + [e0, f0, v0] = inter0.sess.run ([t_energy0, t_force0, t_virial0], + feed_dict = { + inter0.coord: dcoord, + inter0.box: dbox, + inter0.type: dtype, + inter0.tnatoms: inter0.natoms}) + [e1, f1, v1] = inter1.sess.run ([t_energy1, 
t_force1, t_virial1], + feed_dict = { + inter1.coord: dcoord, + inter1.box: dbox, + inter1.type: dtype, + inter1.tnatoms: inter1.natoms}) + + self.assertAlmostEqual(e0[0], e1[0]) + for ii in range(f0[0].size): + # print(ii) + self.assertAlmostEqual(f0[0][ii], f1[0][ii]) + for ii in range(v0[0].size): + # print(ii) + self.assertAlmostEqual(v0[0][ii], v1[0][ii]) + + def test_pbc_small_box(self): + data0 = Data() + data1 = Data(box_scale = 2) + inter0 = Inter() + inter1 = Inter() + inter0.setUp(data0, pbc = True) + inter1.setUp(data1, pbc = False) + inter0.net_w_i = np.copy(np.ones(inter0.ndescrpt)) + inter1.net_w_i = np.copy(np.ones(inter1.ndescrpt)) + + t_energy0, t_force0, t_virial0 \ + = inter0.comp_ef (inter0.coord, inter0.box, inter0.type, inter0.tnatoms, name = "test_sea_pbc_sbox_true") + t_energy1, t_force1, t_virial1 \ + = inter1.comp_ef (inter1.coord, inter1.box, inter1.type, inter1.tnatoms, name = "test_sea_pbc_sbox_false") + + inter0.sess.run (tf.global_variables_initializer()) + inter1.sess.run (tf.global_variables_initializer()) + + dcoord, dbox, dtype = data0.get_data () + [e0, f0, v0] = inter0.sess.run ([t_energy0, t_force0, t_virial0], + feed_dict = { + inter0.coord: dcoord, + inter0.box: dbox, + inter0.type: dtype, + inter0.tnatoms: inter0.natoms}) + dcoord, dbox, dtype = data1.get_data () + [e1, f1, v1] = inter1.sess.run ([t_energy1, t_force1, t_virial1], + feed_dict = { + inter1.coord: dcoord, + inter1.box: dbox, + inter1.type: dtype, + inter1.tnatoms: inter1.natoms}) + + self.assertAlmostEqual(e0[0], e1[0]) + for ii in range(f0[0].size): + # print(ii) + self.assertAlmostEqual(f0[0][ii], f1[0][ii]) + for ii in range(v0[0].size): + # print(ii) + self.assertAlmostEqual(v0[0][ii], v1[0][ii]) + + if __name__ == '__main__': unittest.main() diff --git a/source/tests/test_ewald.py b/source/tests/test_ewald.py new file mode 100644 index 0000000000..4131cfa4df --- /dev/null +++ b/source/tests/test_ewald.py @@ -0,0 +1,217 @@ +import os,sys,platform +import 
numpy as np +import unittest +from deepmd.env import tf + +from deepmd.RunOptions import global_tf_float_precision +from deepmd.RunOptions import global_np_float_precision +from deepmd.RunOptions import global_ener_float_precision +from deepmd.EwaldRecp import op_module +from deepmd.EwaldRecp import EwaldRecp + +if global_np_float_precision == np.float32 : + global_default_fv_hh = 1e-2 + global_default_dw_hh = 1e-2 + global_default_places = 3 +else : + global_default_fv_hh = 1e-6 + global_default_dw_hh = 1e-4 + global_default_places = 5 + + +class TestEwaldRecp (unittest.TestCase) : + def setUp(self): + boxl = 4.5 # NOTICE grid should not change before and after box pert... + box_pert = 0.2 + self.natoms = 16 + self.nframes = 2 + self.ewald_h = 1 + self.ewald_beta = 1 + self.dbox = [] + self.dcoord = [] + self.rcoord = [] + self.dcharge = [] + for ii in range(self.nframes): + # box + box = np.eye(3) * boxl + box[1][1] += 1 + box[2][2] += 2 + box += np.random.random([3,3]) * box_pert + box = 0.5 * (box + box.T) + self.dbox.append(box) + # scaled + coord = np.random.random([self.natoms, 3]) + self.rcoord.append(coord) + # real coords + self.dcoord.append(np.matmul(coord, box)) + # charge + dcharge = np.random.random([self.natoms]) + dcharge -= np.average(dcharge) + assert(np.abs(np.sum(self.dcharge) - 0) < 1e-12) + self.dcharge.append(dcharge) + self.dbox = np.array(self.dbox).reshape([self.nframes, 9]) + self.rcoord = np.array(self.rcoord).reshape([self.nframes, 3*self.natoms]) + self.dcoord = np.array(self.dcoord).reshape([self.nframes, 3*self.natoms]) + self.dcharge = np.array(self.dcharge).reshape([self.nframes, self.natoms]) + # place holders + self.coord = tf.placeholder(global_tf_float_precision, [None], name='t_coord') + self.charge = tf.placeholder(global_tf_float_precision, [None], name='t_charge') + self.box = tf.placeholder(global_tf_float_precision, [None], name='t_box') + self.nloc = tf.placeholder(tf.int32, [1], name = "t_nloc") + + def 
test_py_interface(self): + hh = 1e-4 + places = 4 + sess = tf.Session() + t_energy, t_force, t_virial \ + = op_module.ewald_recp(self.coord, self.charge, self.nloc, self.box, + ewald_h = self.ewald_h, + ewald_beta = self.ewald_beta) + [e, f, v] = sess.run([t_energy, t_force, t_virial], + feed_dict = { + self.coord: self.dcoord.reshape([-1]), + self.charge: self.dcharge.reshape([-1]), + self.box: self.dbox.reshape([-1]), + self.nloc: [self.natoms], + }) + er = EwaldRecp(self.ewald_h, self.ewald_beta) + e1, f1, v1 = er.eval(self.dcoord, self.dcharge, self.dbox) + for ff in range(self.nframes): + self.assertAlmostEqual(e[ff], e1[ff], + places = places, + msg = "frame %d energy failed" % (ff)) + for idx in range(self.natoms): + for dd in range(3): + self.assertAlmostEqual(f[ff, idx*3+dd], f1[ff,idx*3+dd], + places = places, + msg = "frame %d force component [%d,%d] failed" % (ff, idx, dd)) + for d0 in range(3): + for d1 in range(3): + self.assertAlmostEqual(v[ff, d0*3+d1], v[ff,d0*3+d1], + places = places, + msg = "frame %d virial component [%d,%d] failed" % (ff, d0, d1)) + + + + def test_force(self): + hh = 1e-4 + places = 4 + sess = tf.Session() + t_energy, t_force, t_virial \ + = op_module.ewald_recp(self.coord, self.charge, self.nloc, self.box, + ewald_h = self.ewald_h, + ewald_beta = self.ewald_beta) + [force] = sess.run([t_force], + feed_dict = { + self.coord: self.dcoord.reshape([-1]), + self.charge: self.dcharge.reshape([-1]), + self.box: self.dbox.reshape([-1]), + self.nloc: [self.natoms], + }) + for idx in range(self.natoms): + for dd in range(3): + dcoordp = np.copy(self.dcoord) + dcoordm = np.copy(self.dcoord) + dcoordp[:,idx*3+dd] = self.dcoord[:,idx*3+dd] + hh + dcoordm[:,idx*3+dd] = self.dcoord[:,idx*3+dd] - hh + energyp = sess.run([t_energy], + feed_dict = { + self.coord: dcoordp.reshape([-1]), + self.charge: self.dcharge.reshape([-1]), + self.box: self.dbox.reshape([-1]), + self.nloc: [self.natoms], + }) + energym = sess.run([t_energy], + feed_dict = { 
+ self.coord: dcoordm.reshape([-1]), + self.charge: self.dcharge.reshape([-1]), + self.box: self.dbox.reshape([-1]), + self.nloc: [self.natoms], + }) + c_force = -(energyp[0] - energym[0]) / (2*hh) + for ff in range(self.nframes): + self.assertAlmostEqual(c_force[ff], force[ff,idx*3+dd], + places = places, + msg = "frame %d force component [%d,%d] failed" % (ff, idx, dd)) + + + def test_virial(self): + hh = 1e-4 + places = 5 + sess = tf.Session() + t_energy, t_force, t_virial \ + = op_module.ewald_recp(self.coord, self.charge, self.nloc, self.box, + ewald_h = self.ewald_h, + ewald_beta = self.ewald_beta) + [virial] = sess.run([t_virial], + feed_dict = { + self.coord: self.dcoord.reshape([-1]), + self.charge: self.dcharge.reshape([-1]), + self.box: self.dbox.reshape([-1]), + self.nloc: [self.natoms], + }) + + from scipy.stats import ortho_group + + + + self.dbox3 = np.reshape(self.dbox, [self.nframes, 3,3]) + self.drbox3 = np.linalg.inv(self.dbox3) + # print(np.matmul(self.dbox3, self.drbox3)) + # print(np.matmul(self.drbox3, self.dbox3)) + self.dcoord3 = np.reshape(self.dcoord, [self.nframes, self.natoms, 3]) + self.rcoord3 = np.matmul(self.dcoord3, self.drbox3) + # print(np.linalg.norm(self.dcoord - np.matmul(self.rcoord3, self.dbox3).reshape([self.nframes,-1]))) + # print(np.matmul(self.dcoord3, self.drbox3)) + # print('check rcoord ', np.linalg.norm(self.rcoord3 - self.rcoord.reshape([self.nframes, self.natoms, 3]))) + + num_deriv = np.zeros([self.nframes,3,3]) + for ii in range(3): + for jj in range(3): + dbox3p = np.copy(self.dbox3) + dbox3m = np.copy(self.dbox3) + dbox3p[:,ii,jj] = self.dbox3[:,ii,jj] + hh + dbox3m[:,ii,jj] = self.dbox3[:,ii,jj] - hh + dboxp = np.reshape(dbox3p, [-1,9]) + dboxm = np.reshape(dbox3m, [-1,9]) + dcoord = self.dcoord + dcoord3p = np.matmul(self.rcoord3, dbox3p) + dcoord3m = np.matmul(self.rcoord3, dbox3m) + dcoordp = np.reshape(dcoord3p, [self.nframes,-1]) + dcoordm = np.reshape(dcoord3m, [self.nframes,-1]) + energyp = 
sess.run([t_energy], + feed_dict = { + self.coord: dcoordp.reshape([-1]), + self.charge: self.dcharge.reshape([-1]), + self.box: dboxp.reshape([-1]), + self.nloc: [self.natoms], + }) + energym = sess.run([t_energy], + feed_dict = { + self.coord: dcoordm.reshape([-1]), + self.charge: self.dcharge.reshape([-1]), + self.box: dboxm.reshape([-1]), + self.nloc: [self.natoms], + }) + num_deriv[:,ii,jj] = -(energyp[0] - energym[0]) / (2.*hh) + dbox3t = np.transpose(self.dbox3, [0,2,1]) + t_esti = np.matmul(num_deriv, dbox3t) + # # t_esti = np.matmul(num_deriv, self.dbox3) + # print(num_deriv[0]) + # print(t_esti[0]) + # # print(0.5 * (t_esti[0] + t_esti[0].T)) + # print(virial[0].reshape([3,3])) + # # print(0.5 * (t_esti[0] + t_esti[0].T) - virial[0].reshape([3,3])) + # print(0.5 * (t_esti[0] + t_esti[0]) - virial[0].reshape([3,3])) + # print(0.5 * (t_esti[0] + t_esti[0].T) - virial[0].reshape([3,3])) + for ff in range(self.nframes): + for ii in range(3): + for jj in range(3): + self.assertAlmostEqual(t_esti[ff][ii][jj], virial[ff,ii*3+jj], + places = places, + msg = "frame %d virial component [%d,%d] failed" % (ff, ii, jj)) + + + + + diff --git a/source/tests/test_fitting_stat.py b/source/tests/test_fitting_stat.py index aed3a6992a..0cbd693ae1 100644 --- a/source/tests/test_fitting_stat.py +++ b/source/tests/test_fitting_stat.py @@ -69,7 +69,7 @@ def test (self) : all_data = _make_fake_data(sys_natoms, sys_nframes, avgs, stds) frefa, frefs = _brute_fparam(all_data, len(avgs)) arefa, arefs = _brute_aparam(all_data, len(avgs)) - fitting.compute_dstats(all_data, protection = 1e-2) + fitting.compute_input_stats(all_data, protection = 1e-2) # print(frefa, frefs) for ii in range(len(avgs)): self.assertAlmostEqual(frefa[ii], fitting.fparam_avg[ii]) diff --git a/source/tests/test_gen_stat_data.py b/source/tests/test_gen_stat_data.py new file mode 100644 index 0000000000..852915bd12 --- /dev/null +++ b/source/tests/test_gen_stat_data.py @@ -0,0 +1,122 @@ +import 
os,sys,platform,json,shutil +import numpy as np +import unittest +import dpdata + +from deepmd.DataSystem import DeepmdDataSystem +from deepmd.Fitting import EnerFitting +from deepmd.Model import make_all_stat, merge_sys_stat, _make_all_stat_ref + +def gen_sys(nframes, atom_types): + natoms = len(atom_types) + data = {} + data['coords'] = np.random.random([nframes, natoms, 3]) + data['forces'] = np.random.random([nframes, natoms, 3]) + data['cells'] = np.random.random([nframes, 9]) + data['energies'] = np.random.random([nframes, 1]) + types = list(set(list(atom_types))) + types.sort() + data['atom_names'] = [] + data['atom_numbs'] = [] + for ii in range(len(types)): + data['atom_names'] .append( 'TYPE_%d' % ii ) + data['atom_numbs'] .append(np.sum(atom_types == ii)) + data['atom_types'] = np.array(atom_types, dtype = int) + return data + +class TestGenStatData(unittest.TestCase) : + def setUp(self): + data0 = gen_sys(20, [0, 1, 0, 2, 1]) + data1 = gen_sys(30, [0, 1, 0, 0]) + sys0 = dpdata.LabeledSystem() + sys1 = dpdata.LabeledSystem() + sys0.data = data0 + sys1.data = data1 + sys0.to_deepmd_npy('system_0', set_size = 10) + sys1.to_deepmd_npy('system_1', set_size = 10) + + def tearDown(self): + shutil.rmtree('system_0') + shutil.rmtree('system_1') + + def _comp_data(self, d0, d1) : + for ii in range(d0.shape[0]): + for jj in range(d0.shape[1]): + for kk in range(d0.shape[2]): + self.assertAlmostEqual(d0[ii][jj][kk], d1[ii][jj][kk]) + + def test_merge_all_stat(self): + np.random.seed(0) + data0 = DeepmdDataSystem(['system_0', 'system_1'], + 5, + 10, + 1.0) + data0.add('energy', 1, must = True) + np.random.seed(0) + data1 = DeepmdDataSystem(['system_0', 'system_1'], + 5, + 10, + 1.0) + data1.add('force', 3, atomic = True, must = True) + np.random.seed(0) + data2 = DeepmdDataSystem(['system_0', 'system_1'], + 5, + 10, + 1.0) + data2.add('force', 3, atomic = True, must = True) + + np.random.seed(0) + all_stat_0 = make_all_stat(data0, 10, merge_sys = False) + 
np.random.seed(0) + all_stat_1 = make_all_stat(data1, 10, merge_sys = True) + all_stat_2 = merge_sys_stat(all_stat_0) + np.random.seed(0) + all_stat_3 = _make_all_stat_ref(data2, 10) + + #################################### + # only check if the energy is concatenated correctly + #################################### + dd = 'energy' + # if 'find_' in dd: continue + # if 'natoms_vec' in dd: continue + # if 'default_mesh' in dd: continue + # print(all_stat_2[dd]) + # print(dd, all_stat_1[dd]) + d1 = np.array(all_stat_1[dd]) + d2 = np.array(all_stat_2[dd]) + d3 = np.array(all_stat_3[dd]) + # print(dd) + # print(d1.shape) + # print(d2.shape) + # self.assertEqual(all_stat_2[dd], all_stat_1[dd]) + self._comp_data(d1, d2) + self._comp_data(d1, d3) + + +class TestEnerShift(unittest.TestCase): + def setUp(self): + data0 = gen_sys(30, [0, 1, 0, 2, 1]) + data1 = gen_sys(30, [0, 1, 0, 0]) + sys0 = dpdata.LabeledSystem() + sys1 = dpdata.LabeledSystem() + sys0.data = data0 + sys1.data = data1 + sys0.to_deepmd_npy('system_0', set_size = 10) + sys1.to_deepmd_npy('system_1', set_size = 10) + + def tearDown(self): + shutil.rmtree('system_0') + shutil.rmtree('system_1') + + def test_ener_shift(self): + np.random.seed(0) + data = DeepmdDataSystem(['system_0', 'system_1'], + 5, + 10, + 1.0) + data.add('energy', 1, must = True) + ener_shift0 = data.compute_energy_shift(rcond = 1) + all_stat = make_all_stat(data, 4, merge_sys = False) + ener_shift1 = EnerFitting._compute_output_stats(all_stat, rcond = 1) + for ii in range(len(ener_shift0)): + self.assertAlmostEqual(ener_shift0[ii], ener_shift1[ii]) diff --git a/source/tests/test_model_loc_frame.py b/source/tests/test_model_loc_frame.py index 5981ee6497..b651862885 100644 --- a/source/tests/test_model_loc_frame.py +++ b/source/tests/test_model_loc_frame.py @@ -49,8 +49,8 @@ def test_model(self): 'natoms_vec' : [test_data['natoms_vec']], 'default_mesh' : [test_data['default_mesh']] } - model._compute_dstats(input_data) - model.bias_atom_e = 
data.compute_energy_shift() + model._compute_input_stat(input_data) + model.fitting.bias_atom_e = data.compute_energy_shift() t_prop_c = tf.placeholder(tf.float32, [5], name='t_prop_c') t_energy = tf.placeholder(global_ener_float_precision, [None], name='t_energy') diff --git a/source/tests/test_model_se_a.py b/source/tests/test_model_se_a.py index d5be148358..0d54f14c5f 100644 --- a/source/tests/test_model_se_a.py +++ b/source/tests/test_model_se_a.py @@ -49,8 +49,8 @@ def test_model(self): 'natoms_vec' : [test_data['natoms_vec']], 'default_mesh' : [test_data['default_mesh']] } - model._compute_dstats(input_data) - model.bias_atom_e = data.compute_energy_shift() + model._compute_input_stat(input_data) + model.descrpt.bias_atom_e = data.compute_energy_shift() t_prop_c = tf.placeholder(tf.float32, [5], name='t_prop_c') t_energy = tf.placeholder(global_ener_float_precision, [None], name='t_energy') diff --git a/source/tests/test_model_se_a_aparam.py b/source/tests/test_model_se_a_aparam.py index 293145e377..58b060225c 100644 --- a/source/tests/test_model_se_a_aparam.py +++ b/source/tests/test_model_se_a_aparam.py @@ -51,8 +51,8 @@ def test_model(self): 'default_mesh' : [test_data['default_mesh']], 'aparam': [test_data['aparam']], } - model._compute_dstats(input_data) - model.bias_atom_e = data.compute_energy_shift() + model._compute_input_stat(input_data) + model.descrpt.bias_atom_e = data.compute_energy_shift() t_prop_c = tf.placeholder(tf.float32, [5], name='t_prop_c') t_energy = tf.placeholder(global_ener_float_precision, [None], name='t_energy') diff --git a/source/tests/test_model_se_a_fparam.py b/source/tests/test_model_se_a_fparam.py index 4f85af0579..ec4a46c7d4 100644 --- a/source/tests/test_model_se_a_fparam.py +++ b/source/tests/test_model_se_a_fparam.py @@ -49,8 +49,8 @@ def test_model(self): 'default_mesh' : [test_data['default_mesh']], 'fparam': [test_data['fparam']], } - model._compute_dstats(input_data) - model.bias_atom_e = data.compute_energy_shift() 
+ model._compute_input_stat(input_data) + model.descrpt.bias_atom_e = data.compute_energy_shift() t_prop_c = tf.placeholder(tf.float32, [5], name='t_prop_c') t_energy = tf.placeholder(global_ener_float_precision, [None], name='t_energy') diff --git a/source/tests/test_model_se_a_srtab.py b/source/tests/test_model_se_a_srtab.py index 82d1492067..c2950fe788 100644 --- a/source/tests/test_model_se_a_srtab.py +++ b/source/tests/test_model_se_a_srtab.py @@ -58,8 +58,8 @@ def test_model(self): 'natoms_vec' : [test_data['natoms_vec']], 'default_mesh' : [test_data['default_mesh']] } - model._compute_dstats(input_data) - model.bias_atom_e = data.compute_energy_shift() + model._compute_input_stat(input_data) + model.descrpt.bias_atom_e = data.compute_energy_shift() t_prop_c = tf.placeholder(tf.float32, [5], name='t_prop_c') t_energy = tf.placeholder(global_ener_float_precision, [None], name='t_energy') diff --git a/source/tests/test_model_se_r.py b/source/tests/test_model_se_r.py index 965e89a1bd..d3607a9164 100644 --- a/source/tests/test_model_se_r.py +++ b/source/tests/test_model_se_r.py @@ -48,8 +48,8 @@ def test_model(self): 'natoms_vec' : [test_data['natoms_vec']], 'default_mesh' : [test_data['default_mesh']] } - model._compute_dstats(input_data) - model.bias_atom_e = data.compute_energy_shift() + model._compute_input_stat(input_data) + model.descrpt.bias_atom_e = data.compute_energy_shift() t_prop_c = tf.placeholder(tf.float32, [5], name='t_prop_c') t_energy = tf.placeholder(global_ener_float_precision, [None], name='t_energy') diff --git a/source/tests/test_polar_se_a.py b/source/tests/test_polar_se_a.py index 0506c84ff2..275b4fa707 100644 --- a/source/tests/test_polar_se_a.py +++ b/source/tests/test_polar_se_a.py @@ -49,7 +49,7 @@ def test_model(self): 'default_mesh' : [test_data['default_mesh']], 'fparam': [test_data['fparam']], } - model._compute_dstats(input_data) + model._compute_input_stat(input_data) t_prop_c = tf.placeholder(tf.float32, [5], name='t_prop_c') 
t_energy = tf.placeholder(global_ener_float_precision, [None], name='t_energy') diff --git a/source/tests/test_sel_idx.py b/source/tests/test_sel_idx.py new file mode 100644 index 0000000000..47ef9c8496 --- /dev/null +++ b/source/tests/test_sel_idx.py @@ -0,0 +1,20 @@ +import os,sys +import numpy as np +import unittest + +from deepmd.common import select_idx_map + +def test(): + raise RuntimeError + +class TestSelIdx (unittest.TestCase) : + def test_add (self) : + atom_type = np.array([0,1,2,2,1,0], dtype = int) + type_sel = np.array([1,0], dtype = int) + idx_map = select_idx_map(atom_type, type_sel) + new_atom_type = atom_type[idx_map] + self.assertEqual(list(idx_map), [0, 5, 1, 4]) + self.assertEqual(list(new_atom_type), [0, 0, 1, 1]) + +if __name__ == '__main__': + unittest.main() diff --git a/source/tests/test_tab_nonsmth.py b/source/tests/test_tab_nonsmth.py index ef77792071..007d41cd63 100644 --- a/source/tests/test_tab_nonsmth.py +++ b/source/tests/test_tab_nonsmth.py @@ -35,10 +35,10 @@ def _make_tab(ntype) : class IntplInter(Inter): - def __init__ (self, - data) : + def setUp (self, + data) : # tabulated - Inter.__init__(self, data) + Inter.setUp(self, data) _make_tab(data.get_ntypes()) self.srtab = TabInter('tab.xvg') self.smin_alpha = 0.3 @@ -153,12 +153,17 @@ def comp_interpl_ef (self, class TestTabNonSmooth(IntplInter, unittest.TestCase): - def __init__ (self, *args, **kwargs): + # def __init__ (self, *args, **kwargs): + # self.places = 5 + # data = Data() + # IntplInter.__init__(self, data) + # unittest.TestCase.__init__(self, *args, **kwargs) + # self.controller = object() + + def setUp(self): self.places = 5 data = Data() - IntplInter.__init__(self, data) - unittest.TestCase.__init__(self, *args, **kwargs) - self.controller = object() + IntplInter.setUp(self, data) def test_force (self) : force_test(self, self, places=5, suffix = '_tab') diff --git a/source/tests/test_tab_smooth.py b/source/tests/test_tab_smooth.py index 28219cd504..6d34aa06d1 
100644 --- a/source/tests/test_tab_smooth.py +++ b/source/tests/test_tab_smooth.py @@ -35,10 +35,10 @@ def _make_tab(ntype) : class IntplInter(Inter): - def __init__ (self, - data) : + def setUp (self, + data) : # tabulated - Inter.__init__(self, data) + Inter.setUp(self, data) _make_tab(data.get_ntypes()) self.srtab = TabInter('tab.xvg') self.smin_alpha = 0.3 @@ -151,12 +151,17 @@ def comp_ef (self, class TestTabSmooth(IntplInter, unittest.TestCase): - def __init__ (self, *args, **kwargs): + # def __init__ (self, *args, **kwargs): + # self.places = 5 + # data = Data() + # IntplInter.__init__(self, data) + # unittest.TestCase.__init__(self, *args, **kwargs) + # self.controller = object() + + def setUp(self): self.places = 5 data = Data() - IntplInter.__init__(self, data) - unittest.TestCase.__init__(self, *args, **kwargs) - self.controller = object() + IntplInter.setUp(self, data) def test_force (self) : force_test(self, self, places=5, suffix = '_tab_smth') diff --git a/source/tests/test_wfc.py b/source/tests/test_wfc.py index fc0d850b01..d4b408cd60 100644 --- a/source/tests/test_wfc.py +++ b/source/tests/test_wfc.py @@ -48,7 +48,7 @@ def test_model(self): 'default_mesh' : [test_data['default_mesh']], 'fparam': [test_data['fparam']], } - model._compute_dstats(input_data) + model._compute_input_stat(input_data) t_prop_c = tf.placeholder(tf.float32, [5], name='t_prop_c') t_energy = tf.placeholder(global_ener_float_precision, [None], name='t_energy') diff --git a/source/train/CMakeLists.txt b/source/train/CMakeLists.txt index 48164f8a9c..1875d2097c 100644 --- a/source/train/CMakeLists.txt +++ b/source/train/CMakeLists.txt @@ -2,7 +2,7 @@ configure_file("RunOptions.py.in" "${CMAKE_CURRENT_BINARY_DIR}/RunOptions.py" @ONLY) -file(GLOB LIB_PY main.py common.py env.py compat.py calculator.py Network.py Deep*.py Data.py DataSystem.py Model*.py Descrpt*.py Fitting.py Loss.py LearningRate.py Trainer.py TabInter.py ${CMAKE_CURRENT_BINARY_DIR}/RunOptions.py) +file(GLOB LIB_PY 
main.py common.py env.py compat.py calculator.py Network.py Deep*.py Data.py DataSystem.py Model*.py Descrpt*.py Fitting.py Loss.py LearningRate.py Trainer.py TabInter.py EwaldRecp.py DataModifier.py ${CMAKE_CURRENT_BINARY_DIR}/RunOptions.py transform.py) file(GLOB CLS_PY Local.py Slurm.py) diff --git a/source/train/Data.py b/source/train/Data.py index 399a13712b..f3ecaeeb62 100644 --- a/source/train/Data.py +++ b/source/train/Data.py @@ -2,10 +2,8 @@ import time import glob -import random import numpy as np import os.path -from deepmd.RunOptions import global_tf_float_precision from deepmd.RunOptions import global_np_float_precision from deepmd.RunOptions import global_ener_float_precision @@ -14,7 +12,8 @@ def __init__ (self, sys_path, set_prefix = 'set', shuffle_test = True, - type_map = None) : + type_map = None, + modifier = None) : self.dirs = glob.glob (os.path.join(sys_path, set_prefix + ".*")) self.dirs.sort() # load atom type @@ -24,6 +23,8 @@ def __init__ (self, self.type_map = self._load_type_map(sys_path) if self.type_map is not None: assert(len(self.type_map) >= max(self.atom_type)+1) + # check pbc + self.pbc = self._check_pbc(sys_path) # enforce type_map if necessary if type_map is not None and self.type_map is not None: atom_type_ = [type_map.index(self.type_map[ii]) for ii in self.atom_type] @@ -46,6 +47,8 @@ def __init__ (self, self.set_count = 0 self.iterator = 0 self.shuffle_test = shuffle_test + # set modifier + self.modifier = modifier def add(self, @@ -117,17 +120,35 @@ def get_batch(self, batch_size) : self._load_batch_set (self.train_dirs[self.set_count % self.get_numb_set()]) self.set_count += 1 set_size = self.batch_set["coord"].shape[0] + if self.modifier is not None: + self.modifier.modify_data(self.batch_set) iterator_1 = self.iterator + batch_size if iterator_1 >= set_size : iterator_1 = set_size idx = np.arange (self.iterator, iterator_1) self.iterator += batch_size - return self._get_subdata(self.batch_set, idx) + ret = 
self._get_subdata(self.batch_set, idx) + return ret - def get_test (self) : + def get_test (self, ntests = -1) : if not hasattr(self, 'test_set') : self._load_test_set(self.test_dir, self.shuffle_test) - return self._get_subdata(self.test_set) + if ntests == -1: + idx = None + else : + ntests_ = ntests if ntests < self.test_set['type'].shape[0] else self.test_set['type'].shape[0] + # print('ntest', self.test_set['type'].shape[0], ntests, ntests_) + idx = np.arange(ntests_) + ret = self._get_subdata(self.test_set, idx = idx) + if self.modifier is not None: + self.modifier.modify_data(ret) + return ret + + def get_ntypes(self) : + if self.type_map is not None: + return len(self.type_map) + else: + return max(self.get_atom_type()) + 1 def get_type_map(self) : return self.type_map @@ -140,7 +161,10 @@ def get_numb_set (self) : def get_numb_batch (self, batch_size, set_idx) : data = self._load_set(self.train_dirs[set_idx]) - return data["coord"].shape[0] // batch_size + ret = data["coord"].shape[0] // batch_size + if ret == 0: + ret = 1 + return ret def get_sys_numb_batch (self, batch_size) : ret = 0 @@ -210,7 +234,10 @@ def _get_subdata(self, data, idx = None) : def _load_batch_set (self, set_name) : self.batch_set = self._load_set(set_name) - self.batch_set, sf_idx = self._shuffle_data(self.batch_set) + self.batch_set, _ = self._shuffle_data(self.batch_set) + self.reset_get_batch() + + def reset_get_batch(self): self.iterator = 0 def _load_test_set (self, @@ -218,7 +245,7 @@ def _load_test_set (self, shuffle_test) : self.test_set = self._load_set(set_name) if shuffle_test : - self.test_set, sf_idx = self._shuffle_data(self.test_set) + self.test_set, _ = self._shuffle_data(self.test_set) def _shuffle_data (self, data) : @@ -238,7 +265,6 @@ def _shuffle_data (self, return ret, idx def _load_set(self, set_name) : - ret = {} # get nframes path = os.path.join(set_name, "coord.npy") if self.data_dict['coord']['high_prec'] : @@ -269,7 +295,7 @@ def _load_set(self, set_name) 
: data['find_'+kk] = data['find_'+k_in] tmp_in = data[k_in].astype(global_ener_float_precision) data[kk] = np.sum(np.reshape(tmp_in, [nframes, self.natoms, ndof]), axis = 1) - + return data @@ -330,6 +356,12 @@ def _load_type_map(self, sys_path) : else : return None + def _check_pbc(self, sys_path): + pbc = True + if os.path.isfile(os.path.join(sys_path, 'nopbc')) : + pbc = False + return pbc + class DataSets (object): def __init__ (self, @@ -459,7 +491,6 @@ def load_data(self, set_name, data_name, shape, is_necessary = True): return 0, data def load_set(self, set_name, shuffle = True): - start_time = time.time() data = {} data["box"] = self.load_data(set_name, "box", [-1, 9]) nframe = data["box"].shape[0] @@ -497,7 +528,6 @@ def load_set(self, set_name, shuffle = True): data[ii] = data[ii][:, self.idx_map] for ii in ["coord", "force", "atom_pref"]: data[ii] = data[ii][:, self.idx3_map] - end_time = time.time() return data def load_batch_set (self, diff --git a/source/train/DataModifier.py b/source/train/DataModifier.py new file mode 100644 index 0000000000..0afa5f1c4d --- /dev/null +++ b/source/train/DataModifier.py @@ -0,0 +1,347 @@ +import os +import numpy as np +from deepmd.DeepDipole import DeepDipole +from deepmd.env import tf +from deepmd.common import select_idx_map, make_default_mesh +from deepmd.EwaldRecp import EwaldRecp +from deepmd.RunOptions import global_tf_float_precision +from deepmd.RunOptions import global_np_float_precision +from deepmd.RunOptions import global_ener_float_precision +from deepmd.RunOptions import global_cvt_2_tf_float +from deepmd.RunOptions import global_cvt_2_ener_float +from deepmd.env import op_module + + +class DipoleChargeModifier(DeepDipole): + def __init__(self, + model_name, + model_charge_map, + sys_charge_map, + ewald_h = 1, + ewald_beta = 1): + # the dipole model is loaded with prefix 'dipole_charge' + self.modifier_prefix = 'dipole_charge' + # init dipole model + DeepDipole.__init__(self, + model_name, + load_prefix 
= self.modifier_prefix, + default_tf_graph = True) + self.model_name = model_name + self.model_charge_map = model_charge_map + self.sys_charge_map = sys_charge_map + self.sel_type = list(self.get_sel_type()) + # init ewald recp + self.ewald_h = ewald_h + self.ewald_beta = ewald_beta + self.er = EwaldRecp(self.ewald_h, self.ewald_beta) + # dimension of dipole + self.ext_dim = 3 + self.t_ndesc = self.graph.get_tensor_by_name(os.path.join(self.modifier_prefix, 'descrpt_attr/ndescrpt:0')) + self.t_sela = self.graph.get_tensor_by_name(os.path.join(self.modifier_prefix, 'descrpt_attr/sel:0')) + [self.ndescrpt, self.sel_a] = self.sess.run([self.t_ndesc, self.t_sela]) + self.sel_r = [ 0 for ii in range(len(self.sel_a)) ] + self.nnei_a = np.cumsum(self.sel_a)[-1] + self.nnei_r = np.cumsum(self.sel_r)[-1] + self.nnei = self.nnei_a + self.nnei_r + self.ndescrpt_a = self.nnei_a * 4 + self.ndescrpt_r = self.nnei_r * 1 + assert(self.ndescrpt == self.ndescrpt_a + self.ndescrpt_r) + self.force = None + self.ntypes = len(self.sel_a) + + def build_fv_graph(self): + with tf.variable_scope('modifier_attr') : + t_mdl_name = tf.constant(self.model_name, + name = 'mdl_name', + dtype = tf.string) + t_modi_type = tf.constant(self.modifier_prefix, + name = 'type', + dtype = tf.string) + t_mdl_charge_map = tf.constant(' '.join([str(ii) for ii in self.model_charge_map]), + name = 'mdl_charge_map', + dtype = tf.string) + t_sys_charge_map = tf.constant(' '.join([str(ii) for ii in self.sys_charge_map]), + name = 'sys_charge_map', + dtype = tf.string) + t_ewald_h = tf.constant(self.ewald_h, + name = 'ewald_h', + dtype = tf.float64) + t_ewald_b = tf.constant(self.ewald_beta, + name = 'ewald_beta', + dtype = tf.float64) + with self.graph.as_default(): + return self._build_fv_graph_inner() + + def _build_fv_graph_inner(self): + self.t_ef = tf.placeholder(global_tf_float_precision, [None], name = 't_ef') + nf = 10 + nfxnas = 64*nf + nfxna = 192*nf + nf = -1 + nfxnas = -1 + nfxna = -1 + 
self.t_box_reshape = tf.reshape(self.t_box, [-1, 9]) + t_nframes = tf.shape(self.t_box_reshape)[0] + # (nframes x natoms_sel) x 1 x 3 + self.t_ef_reshape = tf.reshape(self.t_ef, [nfxnas, 1, 3]) + # (nframes x natoms) x ndescrpt + self.descrpt = self.graph.get_tensor_by_name(os.path.join(self.modifier_prefix, 'o_rmat:0')) + self.descrpt_deriv = self.graph.get_tensor_by_name(os.path.join(self.modifier_prefix, 'o_rmat_deriv:0')) + self.nlist = self.graph.get_tensor_by_name(os.path.join(self.modifier_prefix, 'o_nlist:0')) + self.rij = self.graph.get_tensor_by_name(os.path.join(self.modifier_prefix, 'o_rij:0')) + # self.descrpt_reshape = tf.reshape(self.descrpt, [nf, 192 * self.ndescrpt]) + # self.descrpt_deriv = tf.reshape(self.descrpt_deriv, [nf, 192 * self.ndescrpt * 3]) + + # nframes x (natoms_sel x 3) + self.t_tensor_reshpe = tf.reshape(self.t_tensor, [t_nframes, -1]) + # nframes x (natoms x 3) + self.t_tensor_reshpe = self._enrich(self.t_tensor_reshpe, dof = 3) + # (nframes x natoms) x 3 + self.t_tensor_reshpe = tf.reshape(self.t_tensor_reshpe, [nfxna, 3]) + # (nframes x natoms) x 1 + self.t_dipole_x = tf.slice(self.t_tensor_reshpe, [0, 0], [nfxna, 1]) + self.t_dipole_y = tf.slice(self.t_tensor_reshpe, [0, 1], [nfxna, 1]) + self.t_dipole_z = tf.slice(self.t_tensor_reshpe, [0, 2], [nfxna, 1]) + self.t_dipole_z = tf.reshape(self.t_dipole_z, [nfxna, 1]) + # (nframes x natoms) x ndescrpt + [self.t_dipole_x_d] = tf.gradients(self.t_dipole_x, self.descrpt) + [self.t_dipole_y_d] = tf.gradients(self.t_dipole_y, self.descrpt) + [self.t_dipole_z_d] = tf.gradients(self.t_dipole_z, self.descrpt) + # nframes x (natoms x ndescrpt) + self.t_dipole_x_d = tf.reshape(self.t_dipole_x_d, [-1, self.t_natoms[0] * self.ndescrpt]) + self.t_dipole_y_d = tf.reshape(self.t_dipole_y_d, [-1, self.t_natoms[0] * self.ndescrpt]) + self.t_dipole_z_d = tf.reshape(self.t_dipole_z_d, [-1, self.t_natoms[0] * self.ndescrpt]) + # nframes x (natoms_sel x ndescrpt) + self.t_dipole_x_d = 
self._slice_descrpt_deriv(self.t_dipole_x_d) + self.t_dipole_y_d = self._slice_descrpt_deriv(self.t_dipole_y_d) + self.t_dipole_z_d = self._slice_descrpt_deriv(self.t_dipole_z_d) + # (nframes x natoms_sel) x ndescrpt + self.t_dipole_x_d = tf.reshape(self.t_dipole_x_d, [nfxnas, self.ndescrpt]) + self.t_dipole_y_d = tf.reshape(self.t_dipole_y_d, [nfxnas, self.ndescrpt]) + self.t_dipole_z_d = tf.reshape(self.t_dipole_z_d, [nfxnas, self.ndescrpt]) + # (nframes x natoms_sel) x 3 x ndescrpt + self.t_dipole_d = tf.concat([self.t_dipole_x_d, self.t_dipole_y_d, self.t_dipole_z_d], axis = 1) + self.t_dipole_d = tf.reshape(self.t_dipole_d, [nfxnas, 3*self.ndescrpt]) + # (nframes x natoms_sel) x 3 x ndescrpt + self.t_dipole_d = tf.reshape(self.t_dipole_d, [-1, 3, self.ndescrpt]) + # (nframes x natoms_sel) x 1 x ndescrpt + self.t_ef_d = tf.matmul(self.t_ef_reshape, self.t_dipole_d) + # nframes x (natoms_sel x ndescrpt) + self.t_ef_d = tf.reshape(self.t_ef_d, [t_nframes, -1]) + # nframes x (natoms x ndescrpt) + self.t_ef_d = self._enrich(self.t_ef_d, dof = self.ndescrpt) + self.t_ef_d = tf.reshape(self.t_ef_d, [nf, self.t_natoms[0] * self.ndescrpt]) + # t_ef_d is force (with -1), prod_forc takes deriv, so we need the opposite + self.t_ef_d_oppo = -self.t_ef_d + + force = op_module.prod_force_se_a(self.t_ef_d_oppo, + self.descrpt_deriv, + self.nlist, + self.t_natoms, + n_a_sel = self.nnei_a, + n_r_sel = self.nnei_r) + virial, atom_virial \ + = op_module.prod_virial_se_a (self.t_ef_d_oppo, + self.descrpt_deriv, + self.rij, + self.nlist, + self.t_natoms, + n_a_sel = self.nnei_a, + n_r_sel = self.nnei_r) + force = tf.identity(force, name='o_dm_force') + virial = tf.identity(virial, name='o_dm_virial') + atom_virial = tf.identity(atom_virial, name='o_dm_av') + return force, virial, atom_virial + + + def _enrich(self, dipole, dof = 3): + coll = [] + sel_start_idx = 0 + for type_i in range(self.ntypes): + if type_i in self.sel_type: + di = tf.slice(dipole, + [ 0, sel_start_idx * dof], 
+ [-1, self.t_natoms[2+type_i] * dof]) + sel_start_idx += self.t_natoms[2+type_i] + else: + di = tf.zeros([tf.shape(dipole)[0], self.t_natoms[2+type_i] * dof], + dtype = global_tf_float_precision) + coll.append(di) + return tf.concat(coll, axis = 1) + + def _slice_descrpt_deriv(self, deriv): + coll = [] + start_idx = 0 + for type_i in range(self.ntypes): + if type_i in self.sel_type: + di = tf.slice(deriv, + [ 0, start_idx * self.ndescrpt], + [-1, self.t_natoms[2+type_i] * self.ndescrpt]) + coll.append(di) + start_idx += self.t_natoms[2+type_i] + return tf.concat(coll, axis = 1) + + + def eval(self, coord, box, atype, eval_fv = True): + coord, atype, imap = self.sort_input(coord, atype) + natoms = coord.shape[1] // 3 + nframes = coord.shape[0] + box = np.reshape(box, [nframes, 9]) + atype = np.reshape(atype, [natoms]) + sel_idx_map = select_idx_map(atype, self.sel_type) + nsel = len(sel_idx_map) + # setup charge + charge = np.zeros([natoms]) + for ii in range(natoms): + charge[ii] = self.sys_charge_map[atype[ii]] + charge = np.tile(charge, [nframes, 1]) + + # add wfcc + all_coord, all_charge, dipole = self._extend_system(coord, box, atype, charge) + + # print('compute er') + batch_size = 5 + tot_e = [] + all_f = [] + all_v = [] + for ii in range(0,nframes,batch_size): + e,f,v = self.er.eval(all_coord[ii:ii+batch_size], all_charge[ii:ii+batch_size], box[ii:ii+batch_size]) + tot_e.append(e) + all_f.append(f) + all_v.append(v) + tot_e = np.concatenate(tot_e, axis = 0) + all_f = np.concatenate(all_f, axis = 0) + all_v = np.concatenate(all_v, axis = 0) + # print('finish er') + # reshape + tot_e.reshape([nframes,1]) + + tot_f = None + tot_v = None + if self.force is None: + self.force, self.virial, self.av = self.build_fv_graph() + if eval_fv: + # compute f + ext_f = all_f[:,natoms*3:] + corr_f = [] + corr_v = [] + corr_av = [] + for ii in range(0,nframes,batch_size): + f, v, av = self.eval_fv(coord[ii:ii+batch_size], box[ii:ii+batch_size], atype, 
ext_f[ii:ii+batch_size]) + corr_f.append(f) + corr_v.append(v) + corr_av.append(av) + corr_f = np.concatenate(corr_f, axis = 0) + corr_v = np.concatenate(corr_v, axis = 0) + corr_av = np.concatenate(corr_av, axis = 0) + tot_f = all_f[:,:natoms*3] + corr_f + for ii in range(nsel): + orig_idx = sel_idx_map[ii] + tot_f[:,orig_idx*3:orig_idx*3+3] += ext_f[:,ii*3:ii*3+3] + tot_f = self.reverse_map(np.reshape(tot_f, [nframes,-1,3]), imap) + # reshape + tot_f = tot_f.reshape([nframes,natoms,3]) + # compute v + dipole3 = np.reshape(dipole, [nframes, nsel, 3]) + ext_f3 = np.reshape(ext_f, [nframes, nsel, 3]) + ext_f3 = np.transpose(ext_f3, [0, 2, 1]) + # fd_corr_v = -np.matmul(ext_f3, dipole3).T.reshape([nframes, 9]) + # fd_corr_v = -np.matmul(ext_f3, dipole3) + # fd_corr_v = np.transpose(fd_corr_v, [0, 2, 1]).reshape([nframes, 9]) + fd_corr_v = -np.matmul(ext_f3, dipole3).reshape([nframes, 9]) + # print(all_v, '\n', corr_v, '\n', fd_corr_v) + tot_v = all_v + corr_v + fd_corr_v + # reshape + tot_v = tot_v.reshape([nframes,9]) + + return tot_e, tot_f, tot_v + + + def eval_fv(self, coords, cells, atom_types, ext_f) : + # reshape the inputs + cells = np.reshape(cells, [-1, 9]) + nframes = cells.shape[0] + coords = np.reshape(coords, [nframes, -1]) + natoms = coords.shape[1] // 3 + + # sort inputs + coords, atom_types, imap, sel_at, sel_imap = self.sort_input(coords, atom_types, sel_atoms = self.get_sel_type()) + + # make natoms_vec and default_mesh + natoms_vec = self.make_natoms_vec(atom_types) + assert(natoms_vec[0] == natoms) + default_mesh = make_default_mesh(cells) + + # evaluate + tensor = [] + feed_dict_test = {} + feed_dict_test[self.t_natoms] = natoms_vec + feed_dict_test[self.t_type ] = np.tile(atom_types, [nframes, 1]).reshape([-1]) + feed_dict_test[self.t_coord ] = coords.reshape([-1]) + feed_dict_test[self.t_box ] = cells.reshape([-1]) + feed_dict_test[self.t_mesh ] = default_mesh.reshape([-1]) + feed_dict_test[self.t_ef ] = ext_f.reshape([-1]) + # 
print(self.sess.run(tf.shape(self.t_tensor), feed_dict = feed_dict_test)) + fout, vout, avout \ + = self.sess.run([self.force, self.virial, self.av], + feed_dict = feed_dict_test) + # print('fout: ', fout.shape, fout) + fout = self.reverse_map(np.reshape(fout, [nframes,-1,3]), imap) + fout = np.reshape(fout, [nframes, -1]) + return fout, vout, avout + + + def _extend_system(self, coord, box, atype, charge): + natoms = coord.shape[1] // 3 + nframes = coord.shape[0] + # sel atoms and setup ref coord + sel_idx_map = select_idx_map(atype, self.sel_type) + nsel = len(sel_idx_map) + coord3 = coord.reshape([nframes, natoms, 3]) + ref_coord = coord3[:,sel_idx_map,:] + ref_coord = np.reshape(ref_coord, [nframes, nsel * 3]) + + dipole = DeepDipole.eval(self, coord, box, atype) + dipole = np.reshape(dipole, [nframes, nsel * 3]) + + wfcc_coord = ref_coord + dipole + # wfcc_coord = dipole + wfcc_charge = np.zeros([nsel]) + for ii in range(nsel): + orig_idx = self.sel_type.index(atype[sel_idx_map[ii]]) + wfcc_charge[ii] = self.model_charge_map[orig_idx] + wfcc_charge = np.tile(wfcc_charge, [nframes, 1]) + + wfcc_coord = np.reshape(wfcc_coord, [nframes, nsel * 3]) + wfcc_charge = np.reshape(wfcc_charge, [nframes, nsel]) + + all_coord = np.concatenate((coord, wfcc_coord), axis = 1) + all_charge = np.concatenate((charge, wfcc_charge), axis = 1) + + return all_coord, all_charge, dipole + + + def modify_data(self, data): + if 'find_energy' not in data and 'find_force' not in data and 'find_virial' not in data: + return + + get_nframes=None + coord = data['coord'][:get_nframes,:] + box = data['box'][:get_nframes,:] + atype = data['type'][:get_nframes,:] + atype = atype[0] + nframes = coord.shape[0] + + tot_e, tot_f, tot_v = self.eval(coord, box, atype) + + # print(tot_f[:,0]) + + if 'find_energy' in data and data['find_energy'] == 1.0 : + data['energy'] -= tot_e.reshape(data['energy'].shape) + if 'find_force' in data and data['find_force'] == 1.0 : + data['force'] -= 
tot_f.reshape(data['force'].shape) + if 'find_virial' in data and data['find_virial'] == 1.0 : + data['virial'] -= tot_v.reshape(data['virial'].shape) + + + diff --git a/source/train/DataSystem.py b/source/train/DataSystem.py index 7bb61b477c..61d59f5cea 100644 --- a/source/train/DataSystem.py +++ b/source/train/DataSystem.py @@ -1,13 +1,11 @@ #!/usr/bin/env python3 -import os, sys +import os import collections +import warnings import numpy as np - -module_path = os.path.dirname(os.path.realpath(__file__)) + "/" -sys.path.append (module_path) -from Data import DataSets -from Data import DeepmdData +from deepmd.Data import DataSets +from deepmd.Data import DeepmdData class DeepmdDataSystem() : @@ -18,8 +16,8 @@ def __init__ (self, rcut, set_prefix = 'set', shuffle_test = True, - run_opt = None, - type_map = None) : + type_map = None, + modifier = None) : # init data self.rcut = rcut self.system_dirs = systems @@ -29,8 +27,8 @@ def __init__ (self, self.data_systems.append(DeepmdData(ii, set_prefix=set_prefix, shuffle_test=shuffle_test, - type_map = type_map)) - + type_map = type_map, + modifier = modifier)) # batch size self.batch_size = batch_size if isinstance(self.batch_size, int) : @@ -54,7 +52,7 @@ def __init__ (self, # natoms, nbatches ntypes = [] for ii in self.data_systems : - ntypes.append(np.max(ii.get_atom_type()) + 1) + ntypes.append(ii.get_ntypes()) self.sys_ntypes = max(ntypes) self.natoms = [] self.natoms_vec = [] @@ -75,33 +73,39 @@ def __init__ (self, for ii in range(self.nsystems) : chk_ret = self.data_systems[ii].check_batch_size(self.batch_size[ii]) if chk_ret is not None : - raise RuntimeError ("system %s required batch size %d is larger than the size %d of the dataset %s" % \ - (self.system_dirs[ii], self.batch_size[ii], chk_ret[1], chk_ret[0])) + warnings.warn("system %s required batch size is larger than the size of the dataset %s (%d > %d)" % \ + (self.system_dirs[ii], chk_ret[0], self.batch_size[ii], chk_ret[1])) chk_ret = 
self.data_systems[ii].check_test_size(test_size) if chk_ret is not None : - print("WARNNING: system %s required test size %d is larger than the size %d of the dataset %s" % \ - (self.system_dirs[ii], test_size, chk_ret[1], chk_ret[0])) - - # print summary - if run_opt is not None: - self.print_summary(run_opt) + warnings.warn("system %s required test size is larger than the size of the dataset %s (%d > %d)" % \ + (self.system_dirs[ii], chk_ret[0], test_size, chk_ret[1])) - def _load_test(self): + def _load_test(self, ntests = -1): self.test_data = collections.defaultdict(list) - self.default_mesh = [] for ii in range(self.nsystems) : - test_system_data = self.data_systems[ii].get_test () + test_system_data = self.data_systems[ii].get_test(ntests = ntests) for nn in test_system_data: self.test_data[nn].append(test_system_data[nn]) - cell_size = np.max (self.rcut) - avg_box = np.average (test_system_data["box"], axis = 0) - avg_box = np.reshape (avg_box, [3,3]) - ncell = (np.linalg.norm(avg_box, axis=1)/ cell_size).astype(np.int32) - ncell[ncell < 2] = 2 - default_mesh = np.zeros (6, dtype = np.int32) - default_mesh[3:6] = ncell - self.default_mesh.append(default_mesh) + + + def _make_default_mesh(self): + self.default_mesh = [] + cell_size = np.max (self.rcut) + for ii in range(self.nsystems) : + if self.data_systems[ii].pbc : + test_system_data = self.data_systems[ii].get_batch(self.batch_size[ii]) + self.data_systems[ii].reset_get_batch() + # test_system_data = self.data_systems[ii].get_test() + avg_box = np.average (test_system_data["box"], axis = 0) + avg_box = np.reshape (avg_box, [3,3]) + ncell = (np.linalg.norm(avg_box, axis=1)/ cell_size).astype(np.int32) + ncell[ncell < 2] = 2 + default_mesh = np.zeros (6, dtype = np.int32) + default_mesh[3:6] = ncell + self.default_mesh.append(default_mesh) + else: + self.default_mesh.append(np.array([], dtype = np.int32)) def compute_energy_shift(self, rcond = 1e-3, key = 'energy') : @@ -146,24 +150,57 @@ def reduce(self, 
def get_data_dict(self) : return self.data_systems[0].get_data_dict() + + def _get_sys_probs(self, + sys_probs, + auto_prob_style) : + if sys_probs is None : + if auto_prob_style == "prob_uniform" : + prob = None + elif auto_prob_style == "prob_sys_size" : + prob = self.prob_nbatches + elif auto_prob_style[:14] == "prob_sys_size;" : + prob = self._prob_sys_size_ext(auto_prob_style) + else : + raise RuntimeError("unkown style " + auto_prob_style ) + else : + prob = self._process_sys_probs(sys_probs) + return prob + + def get_batch (self, sys_idx = None, - sys_weights = None, - style = "prob_sys_size") : + sys_probs = None, + auto_prob_style = "prob_sys_size") : + """ + Get a batch of data from the data system + + Parameters + ---------- + sys_idx: int + The index of system from which the batch is get. + If sys_idx is not None, `sys_probs` and `auto_prob_style` are ignored + If sys_idx is None, automatically determine the system according to `sys_probs` or `auto_prob_style`, see the following. + sys_probs: list of float + The probabilitis of systems to get the batch. + Summation of positive elements of this list should be no greater than 1. + Element of this list can be negative, the probability of the corresponding system is determined automatically by the number of batches in the system. + auto_prob_style: float + Determine the probability of systems automatically. The method is assigned by this key and can be + - "prob_uniform" : the probability all the systems are equal, namely 1.0/self.get_nsystems() + - "prob_sys_size" : the probability of a system is proportional to the number of batches in the system + - "prob_sys_size;stt_idx:end_idx:weight;stt_idx:end_idx:weight;..." : + the list of systems is devided into blocks. 
A block is specified by `stt_idx:end_idx:weight`, + where `stt_idx` is the starting index of the system, `end_idx` is then ending (not including) index of the system, + the probabilities of the systems in this block sums up to `weight`, and the relatively probabilities within this block is proportional + to the number of batches in the system. + """ if not hasattr(self, 'default_mesh') : - self._load_test() + self._make_default_mesh() if sys_idx is not None : self.pick_idx = sys_idx else : - if sys_weights is None : - if style == "prob_sys_size" : - prob = self.prob_nbatches - elif style == "prob_uniform" : - prob = None - else : - raise RuntimeError("unkown get_batch style") - else : - prob = self.process_sys_weights(sys_weights) + prob = self._get_sys_probs(sys_probs, auto_prob_style) self.pick_idx = np.random.choice(np.arange(self.nsystems), p = prob) b_data = self.data_systems[self.pick_idx].get_batch(self.batch_size[self.pick_idx]) b_data["natoms_vec"] = self.natoms_vec[self.pick_idx] @@ -171,9 +208,12 @@ def get_batch (self, return b_data def get_test (self, - sys_idx = None) : + sys_idx = None, + ntests = -1) : if not hasattr(self, 'default_mesh') : - self._load_test() + self._make_default_mesh() + if not hasattr(self, 'test_data') : + self._load_test(ntests = ntests) if sys_idx is not None : idx = sys_idx else : @@ -212,21 +252,26 @@ def _format_name_length(self, name, width) : name = '-- ' + name return name - def print_summary(self, run_opt) : + def print_summary(self, + run_opt, + sys_probs = None, + auto_prob_style = "prob_sys_size") : + prob = self._get_sys_probs(sys_probs, auto_prob_style) tmp_msg = "" # width 65 sys_width = 42 - tmp_msg += "---Summary of DataSystem-----------------------------------------\n" + tmp_msg += "---Summary of DataSystem------------------------------------------------\n" tmp_msg += "find %d system(s):\n" % self.nsystems tmp_msg += "%s " % self._format_name_length('system', sys_width) - tmp_msg += "%s %s %s\n" % ('natoms', 
'bch_sz', 'n_bch') + tmp_msg += "%s %s %s %5s\n" % ('natoms', 'bch_sz', 'n_bch', 'prob') for ii in range(self.nsystems) : - tmp_msg += ("%s %6d %6d %5d\n" % + tmp_msg += ("%s %6d %6d %5d %5.3f\n" % (self._format_name_length(self.system_dirs[ii], sys_width), self.natoms[ii], self.batch_size[ii], - self.nbatches[ii]) ) - tmp_msg += "-----------------------------------------------------------------\n" + self.nbatches[ii], + prob[ii]) ) + tmp_msg += "------------------------------------------------------------------------\n" run_opt.message(tmp_msg) @@ -252,18 +297,39 @@ def _check_type_map_consistency(self, type_map_list): ret = ii return ret - def _process_sys_weights(self, sys_weights) : - sys_weights = np.array(sys_weights) - type_filter = sys_weights >= 0 - assigned_sum_prob = np.sum(type_filter * sys_weights) + def _process_sys_probs(self, sys_probs) : + sys_probs = np.array(sys_probs) + type_filter = sys_probs >= 0 + assigned_sum_prob = np.sum(type_filter * sys_probs) assert assigned_sum_prob <= 1, "the sum of assigned probability should be less than 1" rest_sum_prob = 1. 
- assigned_sum_prob rest_nbatch = (1 - type_filter) * self.nbatches rest_prob = rest_sum_prob * rest_nbatch / np.sum(rest_nbatch) - ret_prob = rest_prob + type_filter * sys_weights + ret_prob = rest_prob + type_filter * sys_probs assert np.sum(ret_prob) == 1, "sum of probs should be 1" return ret_prob - + + def _prob_sys_size_ext(self, keywords): + block_str = keywords.split(';')[1:] + block_stt = [] + block_end = [] + block_weights = [] + for ii in block_str: + stt = int(ii.split(':')[0]) + end = int(ii.split(':')[1]) + weight = float(ii.split(':')[2]) + assert(weight >= 0), "the weight of a block should be no less than 0" + block_stt.append(stt) + block_end.append(end) + block_weights.append(weight) + nblocks = len(block_str) + block_probs = np.array(block_weights) / np.sum(block_weights) + sys_probs = np.zeros([self.get_nsystems()]) + for ii in range(nblocks): + nbatch_block = self.nbatches[block_stt[ii]:block_end[ii]] + tmp_prob = [float(i) for i in nbatch_block] / np.sum(nbatch_block) + sys_probs[block_stt[ii]:block_end[ii]] = tmp_prob * block_probs[ii] + return sys_probs diff --git a/source/train/DeepDipole.py b/source/train/DeepDipole.py index 3ef107b4a2..1183486624 100644 --- a/source/train/DeepDipole.py +++ b/source/train/DeepDipole.py @@ -1,11 +1,11 @@ #!/usr/bin/env python3 -import os,sys -import numpy as np from deepmd.DeepEval import DeepTensor class DeepDipole (DeepTensor) : def __init__(self, - model_file) : - DeepTensor.__init__(self, model_file, 'dipole', 3) + model_file, + load_prefix = 'load', + default_tf_graph = False) : + DeepTensor.__init__(self, model_file, 'dipole', 3, load_prefix = load_prefix, default_tf_graph = default_tf_graph) diff --git a/source/train/DeepEval.py b/source/train/DeepEval.py index 3868f6e258..3a6e5624bc 100644 --- a/source/train/DeepEval.py +++ b/source/train/DeepEval.py @@ -1,47 +1,37 @@ #!/usr/bin/env python3 import platform -import os,sys +import os import numpy as np from deepmd.env import tf - -from 
tensorflow.python.framework import ops - -if platform.system() == "Windows": - ext = "dll" -elif platform.system() == "Darwin": - ext = "dylib" -else: - ext = "so" -module_path = os.path.dirname(os.path.realpath(__file__)) -assert (os.path.isfile (os.path.join(module_path, "libop_abi.{}".format(ext)))), "op module does not exist" -op_module = tf.load_op_library(os.path.join(module_path, "libop_abi.{}".format(ext))) +from deepmd.env import default_tf_session_config +from deepmd.common import make_default_mesh class DeepEval(): """ common methods for DeepPot, DeepWFC, DeepPolar, ... """ def __init__(self, - model_file) : - model_file = model_file - graph = self.load_graph (model_file) - t_mt = graph.get_tensor_by_name('load/model_attr/model_type:0') - sess = tf.Session (graph = graph) + model_file, + load_prefix = 'load', + default_tf_graph = False) : + self.graph = self._load_graph (model_file, prefix = load_prefix, default_tf_graph = default_tf_graph) + t_mt = self.graph.get_tensor_by_name(os.path.join(load_prefix, 'model_attr/model_type:0')) + sess = tf.Session (graph = self.graph, config=default_tf_session_config) [mt] = sess.run([t_mt], feed_dict = {}) self.model_type = mt.decode('utf-8') - def load_graph(self, - frozen_graph_filename, - prefix = 'load'): + def _load_graph(self, + frozen_graph_filename, + prefix = 'load', + default_tf_graph = False): # We load the protobuf file from the disk and parse it to retrieve the # unserialized graph_def with tf.gfile.GFile(frozen_graph_filename, "rb") as f: graph_def = tf.GraphDef() graph_def.ParseFromString(f.read()) - # Then, we can use again a convenient built-in function to import a graph_def into the - # current default Graph - with tf.Graph().as_default() as graph: + if default_tf_graph: tf.import_graph_def( graph_def, input_map=None, @@ -49,21 +39,23 @@ def load_graph(self, name=prefix, producer_op_list=None ) + graph = tf.get_default_graph() + else : + # Then, we can use again a convenient built-in function to 
import a graph_def into the + # current default Graph + with tf.Graph().as_default() as graph: + tf.import_graph_def( + graph_def, + input_map=None, + return_elements=None, + name=prefix, + producer_op_list=None + ) + # for ii in graph.as_graph_def().node: + # print(ii.name) + return graph - def make_default_mesh(self, test_box) : - ncell = np.ones (3, dtype=np.int32) - avg_box = np.average (test_box, axis = 0) - cell_size = 3 - avg_box = np.reshape (avg_box, [3,3]) - for ii in range (3) : - ncell[ii] = int ( np.linalg.norm(avg_box[ii]) / cell_size ) - if (ncell[ii] < 2) : ncell[ii] = 2 - default_mesh = np.zeros (6, dtype = np.int32) - default_mesh[3] = ncell[0] - default_mesh[4] = ncell[1] - default_mesh[5] = ncell[2] - return default_mesh def sort_input(self, coord, atom_type, sel_atoms = None) : if sel_atoms is not None: @@ -110,27 +102,29 @@ class DeepTensor(DeepEval) : def __init__(self, model_file, variable_name, - variable_dof) : - DeepEval.__init__(self, model_file) - self.model_file = model_file - self.graph = self.load_graph (self.model_file) + variable_dof, + load_prefix = 'load', + default_tf_graph = False) : + DeepEval.__init__(self, model_file, load_prefix = load_prefix, default_tf_graph = default_tf_graph) + # self.model_file = model_file + # self.graph = self.load_graph (self.model_file) self.variable_name = variable_name self.variable_dof = variable_dof # checkout input/output tensors from graph - self.t_ntypes = self.graph.get_tensor_by_name ('load/descrpt_attr/ntypes:0') - self.t_rcut = self.graph.get_tensor_by_name ('load/descrpt_attr/rcut:0') - self.t_tmap = self.graph.get_tensor_by_name ('load/model_attr/tmap:0') - self.t_sel_type= self.graph.get_tensor_by_name ('load/model_attr/sel_type:0') + self.t_ntypes = self.graph.get_tensor_by_name (os.path.join(load_prefix, 'descrpt_attr/ntypes:0')) + self.t_rcut = self.graph.get_tensor_by_name (os.path.join(load_prefix, 'descrpt_attr/rcut:0')) + self.t_tmap = self.graph.get_tensor_by_name 
(os.path.join(load_prefix, 'model_attr/tmap:0')) + self.t_sel_type= self.graph.get_tensor_by_name (os.path.join(load_prefix, 'model_attr/sel_type:0')) # inputs - self.t_coord = self.graph.get_tensor_by_name ('load/t_coord:0') - self.t_type = self.graph.get_tensor_by_name ('load/t_type:0') - self.t_natoms = self.graph.get_tensor_by_name ('load/t_natoms:0') - self.t_box = self.graph.get_tensor_by_name ('load/t_box:0') - self.t_mesh = self.graph.get_tensor_by_name ('load/t_mesh:0') + self.t_coord = self.graph.get_tensor_by_name (os.path.join(load_prefix, 't_coord:0')) + self.t_type = self.graph.get_tensor_by_name (os.path.join(load_prefix, 't_type:0')) + self.t_natoms = self.graph.get_tensor_by_name (os.path.join(load_prefix, 't_natoms:0')) + self.t_box = self.graph.get_tensor_by_name (os.path.join(load_prefix, 't_box:0')) + self.t_mesh = self.graph.get_tensor_by_name (os.path.join(load_prefix, 't_mesh:0')) # outputs - self.t_tensor = self.graph.get_tensor_by_name ('load/o_%s:0' % self.variable_name) + self.t_tensor = self.graph.get_tensor_by_name (os.path.join(load_prefix, 'o_%s:0' % self.variable_name)) # start a tf session associated to the graph - self.sess = tf.Session (graph = self.graph) + self.sess = tf.Session (graph = self.graph, config=default_tf_session_config) [self.ntypes, self.rcut, self.tmap, self.tselt] = self.sess.run([self.t_ntypes, self.t_rcut, self.t_tmap, self.t_sel_type]) self.tmap = self.tmap.decode('UTF-8').split() @@ -168,18 +162,17 @@ def eval(self, # make natoms_vec and default_mesh natoms_vec = self.make_natoms_vec(atom_types) assert(natoms_vec[0] == natoms) - default_mesh = self.make_default_mesh(cells) # evaluate tensor = [] feed_dict_test = {} feed_dict_test[self.t_natoms] = natoms_vec - feed_dict_test[self.t_mesh ] = default_mesh feed_dict_test[self.t_type ] = atom_types t_out = [self.t_tensor] for ii in range(nframes) : feed_dict_test[self.t_coord] = np.reshape(coords[ii:ii+1, :], [-1]) feed_dict_test[self.t_box ] = np.reshape(cells 
[ii:ii+1, :], [-1]) + feed_dict_test[self.t_mesh ] = make_default_mesh(cells[ii:ii+1, :]) v_out = self.sess.run (t_out, feed_dict = feed_dict_test) tensor.append(v_out[0]) diff --git a/source/train/DeepPolar.py b/source/train/DeepPolar.py index 3af499dd07..705eacfe62 100644 --- a/source/train/DeepPolar.py +++ b/source/train/DeepPolar.py @@ -1,19 +1,19 @@ #!/usr/bin/env python3 -import os,sys -import numpy as np from deepmd.DeepEval import DeepTensor class DeepPolar (DeepTensor) : def __init__(self, - model_file) : - DeepTensor.__init__(self, model_file, 'polar', 9) + model_file, + default_tf_graph = False) : + DeepTensor.__init__(self, model_file, 'polar', 9, default_tf_graph = default_tf_graph) class DeepGlobalPolar (DeepTensor) : def __init__(self, - model_file) : - DeepTensor.__init__(self, model_file, 'global_polar', 9) + model_file, + default_tf_graph = False) : + DeepTensor.__init__(self, model_file, 'global_polar', 9, default_tf_graph = default_tf_graph) def eval(self, coords, diff --git a/source/train/DeepPot.py b/source/train/DeepPot.py index dc62e56d8e..bd46ee5578 100644 --- a/source/train/DeepPot.py +++ b/source/train/DeepPot.py @@ -1,15 +1,19 @@ #!/usr/bin/env python3 -import os,sys import numpy as np from deepmd.env import tf +from deepmd.env import default_tf_session_config +from deepmd.common import make_default_mesh from deepmd.DeepEval import DeepEval +from deepmd.DataModifier import DipoleChargeModifier class DeepPot (DeepEval) : def __init__(self, - model_file) : - self.model_file = model_file - self.graph = self.load_graph (self.model_file) + model_file, + default_tf_graph = False) : + DeepEval.__init__(self, model_file, default_tf_graph = default_tf_graph) + # self.model_file = model_file + # self.graph = self.load_graph (self.model_file) # checkout input/output tensors from graph self.t_ntypes = self.graph.get_tensor_by_name ('load/descrpt_attr/ntypes:0') self.t_rcut = self.graph.get_tensor_by_name ('load/descrpt_attr/rcut:0') @@ -41,9 +45,27 
@@ def __init__(self, self.t_aparam = self.graph.get_tensor_by_name ('load/t_aparam:0') self.has_aparam = self.t_aparam is not None # start a tf session associated to the graph - self.sess = tf.Session (graph = self.graph) + self.sess = tf.Session (graph = self.graph, config=default_tf_session_config) [self.ntypes, self.rcut, self.dfparam, self.daparam, self.tmap] = self.sess.run([self.t_ntypes, self.t_rcut, self.t_dfparam, self.t_daparam, self.t_tmap]) self.tmap = self.tmap.decode('UTF-8').split() + # setup modifier + try: + t_modifier_type = self.graph.get_tensor_by_name('load/modifier_attr/type:0') + self.modifier_type = self.sess.run(t_modifier_type).decode('UTF-8') + except ValueError: + self.modifier_type = None + except KeyError: + self.modifier_type = None + if self.modifier_type == 'dipole_charge': + t_mdl_name = self.graph.get_tensor_by_name('load/modifier_attr/mdl_name:0') + t_mdl_charge_map = self.graph.get_tensor_by_name('load/modifier_attr/mdl_charge_map:0') + t_sys_charge_map = self.graph.get_tensor_by_name('load/modifier_attr/sys_charge_map:0') + t_ewald_h = self.graph.get_tensor_by_name('load/modifier_attr/ewald_h:0') + t_ewald_beta = self.graph.get_tensor_by_name('load/modifier_attr/ewald_beta:0') + [mdl_name, mdl_charge_map, sys_charge_map, ewald_h, ewald_beta] = self.sess.run([t_mdl_name, t_mdl_charge_map, t_sys_charge_map, t_ewald_h, t_ewald_beta]) + mdl_charge_map = [int(ii) for ii in mdl_charge_map.decode('UTF-8').split()] + sys_charge_map = [int(ii) for ii in sys_charge_map.decode('UTF-8').split()] + self.dm = DipoleChargeModifier(mdl_name, mdl_charge_map, sys_charge_map, ewald_h = ewald_h, ewald_beta = ewald_beta) def get_ntypes(self) : @@ -61,8 +83,27 @@ def get_dim_aparam(self) : def get_type_map(self): return self.tmap - def eval(self, + coords, + cells, + atom_types, + fparam = None, + aparam = None, + atomic = False) : + if atomic : + if self.modifier_type is not None: + raise RuntimeError('modifier does not support atomic 
modification') + return self.eval_inner(coords, cells, atom_types, fparam = fparam, aparam = aparam, atomic = atomic) + else : + e, f, v = self.eval_inner(coords, cells, atom_types, fparam = fparam, aparam = aparam, atomic = atomic) + if self.modifier_type is not None: + me, mf, mv = self.dm.eval(coords, cells, atom_types) + e += me.reshape(e.shape) + f += mf.reshape(f.shape) + v += mv.reshape(v.shape) + return e, f, v + + def eval_inner(self, coords, cells, atom_types, @@ -70,9 +111,18 @@ def eval(self, aparam = None, atomic = False) : # standarize the shape of inputs - coords = np.array(coords) - cells = np.array(cells) - atom_types = np.array(atom_types, dtype = int) + atom_types = np.array(atom_types, dtype = int).reshape([-1]) + natoms = atom_types.size + coords = np.reshape(np.array(coords), [-1, natoms * 3]) + nframes = coords.shape[0] + if cells is None: + pbc = False + # make cells to work around the requirement of pbc + cells = np.tile(np.eye(3), [nframes, 1]).reshape([nframes, 9]) + else: + pbc = True + cells = np.array(cells).reshape([nframes, 9]) + if self.has_fparam : assert(fparam is not None) fparam = np.array(fparam) @@ -81,10 +131,6 @@ def eval(self, aparam = np.array(aparam) # reshape the inputs - cells = np.reshape(cells, [-1, 9]) - nframes = cells.shape[0] - coords = np.reshape(coords, [nframes, -1]) - natoms = coords.shape[1] // 3 if self.has_fparam : fdim = self.get_dim_fparam() if fparam.size == nframes * fdim : @@ -110,7 +156,6 @@ def eval(self, # make natoms_vec and default_mesh natoms_vec = self.make_natoms_vec(atom_types) assert(natoms_vec[0] == natoms) - default_mesh = self.make_default_mesh(cells) # evaluate energy = [] @@ -120,7 +165,6 @@ def eval(self, av = [] feed_dict_test = {} feed_dict_test[self.t_natoms] = natoms_vec - feed_dict_test[self.t_mesh ] = default_mesh feed_dict_test[self.t_type ] = atom_types t_out = [self.t_energy, self.t_force, @@ -131,6 +175,10 @@ def eval(self, for ii in range(nframes) : 
feed_dict_test[self.t_coord] = np.reshape(coords[ii:ii+1, :], [-1]) feed_dict_test[self.t_box ] = np.reshape(cells [ii:ii+1, :], [-1]) + if pbc: + feed_dict_test[self.t_mesh ] = make_default_mesh(cells[ii:ii+1, :]) + else: + feed_dict_test[self.t_mesh ] = np.array([], dtype = np.int32) if self.has_fparam: feed_dict_test[self.t_fparam] = np.reshape(fparam[ii:ii+1, :], [-1]) if self.has_aparam: @@ -159,3 +207,4 @@ def eval(self, else : return energy, force, virial + diff --git a/source/train/DeepWFC.py b/source/train/DeepWFC.py index 98bf2578a0..0fff8947da 100644 --- a/source/train/DeepWFC.py +++ b/source/train/DeepWFC.py @@ -1,11 +1,10 @@ #!/usr/bin/env python3 -import os,sys -import numpy as np from deepmd.DeepEval import DeepTensor class DeepWFC (DeepTensor) : def __init__(self, - model_file) : - DeepTensor.__init__(self, model_file, 'wfc', 12) + model_file, + default_tf_graph = False) : + DeepTensor.__init__(self, model_file, 'wfc', 12, default_tf_graph = default_tf_graph) diff --git a/source/train/DescrptLocFrame.py b/source/train/DescrptLocFrame.py index ab00ac31e2..69c1473db0 100644 --- a/source/train/DescrptLocFrame.py +++ b/source/train/DescrptLocFrame.py @@ -1,23 +1,10 @@ -import platform -import os import numpy as np from deepmd.env import tf from deepmd.common import ClassArg from deepmd.RunOptions import global_tf_float_precision from deepmd.RunOptions import global_np_float_precision -from deepmd.RunOptions import global_ener_float_precision -from deepmd.RunOptions import global_cvt_2_tf_float -from deepmd.RunOptions import global_cvt_2_ener_float - -if platform.system() == "Windows": - ext = "dll" -elif platform.system() == "Darwin": - ext = "dylib" -else: - ext = "so" -module_path = os.path.dirname(os.path.realpath(__file__)) + "/" -assert (os.path.isfile (module_path + "libop_abi.{}".format(ext) )), "op module does not exist" -op_module = tf.load_op_library(module_path + "libop_abi.{}".format(ext)) +from deepmd.env import op_module +from deepmd.env 
import default_tf_session_config class DescrptLocFrame () : def __init__(self, jdata): @@ -42,6 +29,34 @@ def __init__(self, jdata): self.ndescrpt_a = self.nnei_a * 4 self.ndescrpt_r = self.nnei_r * 1 self.ndescrpt = self.ndescrpt_a + self.ndescrpt_r + self.davg = None + self.dstd = None + + self.place_holders = {} + avg_zero = np.zeros([self.ntypes,self.ndescrpt]).astype(global_np_float_precision) + std_ones = np.ones ([self.ntypes,self.ndescrpt]).astype(global_np_float_precision) + sub_graph = tf.Graph() + with sub_graph.as_default(): + name_pfx = 'd_lf_' + for ii in ['coord', 'box']: + self.place_holders[ii] = tf.placeholder(global_np_float_precision, [None, None], name = name_pfx+'t_'+ii) + self.place_holders['type'] = tf.placeholder(tf.int32, [None, None], name=name_pfx+'t_type') + self.place_holders['natoms_vec'] = tf.placeholder(tf.int32, [self.ntypes+2], name=name_pfx+'t_natoms') + self.place_holders['default_mesh'] = tf.placeholder(tf.int32, [None], name=name_pfx+'t_mesh') + self.stat_descrpt, descrpt_deriv, rij, nlist, axis, rot_mat \ + = op_module.descrpt (self.place_holders['coord'], + self.place_holders['type'], + self.place_holders['natoms_vec'], + self.place_holders['box'], + self.place_holders['default_mesh'], + tf.constant(avg_zero), + tf.constant(std_ones), + rcut_a = self.rcut_a, + rcut_r = self.rcut_r, + sel_a = self.sel_a, + sel_r = self.sel_r, + axis_rule = self.axis_rule) + self.sub_sess = tf.Session(graph = sub_graph, config=default_tf_session_config) def get_rcut (self) : @@ -56,7 +71,7 @@ def get_dim_out (self) : def get_nlist (self) : return self.nlist, self.rij, self.sel_a, self.sel_r - def compute_dstats (self, + def compute_input_stats (self, data_coord, data_box, data_atype, @@ -85,10 +100,9 @@ def compute_dstats (self, dstd[ii] = 1e-2 all_davg.append(davg) all_dstd.append(dstd) - davg = np.array(all_davg) - dstd = np.array(all_dstd) - return davg, dstd - + self.davg = np.array(all_davg) + self.dstd = np.array(all_dstd) + def build 
(self, coord_, @@ -96,10 +110,10 @@ def build (self, natoms, box_, mesh, - davg = None, - dstd = None, suffix = '', reuse = None): + davg = self.davg + dstd = self.dstd with tf.variable_scope('descrpt_attr' + suffix, reuse = reuse) : if davg is None: davg = np.zeros([self.ntypes, self.ndescrpt]) @@ -174,31 +188,15 @@ def _compute_dstats_sys_nonsmth (self, data_atype, natoms_vec, mesh) : - avg_zero = np.zeros([self.ntypes,self.ndescrpt]).astype(global_np_float_precision) - std_ones = np.ones ([self.ntypes,self.ndescrpt]).astype(global_np_float_precision) - sub_graph = tf.Graph() - with sub_graph.as_default(): - descrpt, descrpt_deriv, rij, nlist, axis, rot_mat \ - = op_module.descrpt (tf.constant(data_coord), - tf.constant(data_atype), - tf.constant(natoms_vec, dtype = tf.int32), - tf.constant(data_box), - tf.constant(mesh), - tf.constant(avg_zero), - tf.constant(std_ones), - rcut_a = self.rcut_a, - rcut_r = self.rcut_r, - sel_a = self.sel_a, - sel_r = self.sel_r, - axis_rule = self.axis_rule) - # self.sess.run(tf.global_variables_initializer()) - # sub_sess = tf.Session(graph = sub_graph, - # config=tf.ConfigProto(intra_op_parallelism_threads=self.run_opt.num_intra_threads, - # inter_op_parallelism_threads=self.run_opt.num_inter_threads - # )) - sub_sess = tf.Session(graph = sub_graph) - dd_all = sub_sess.run(descrpt) - sub_sess.close() + dd_all \ + = self.sub_sess.run(self.stat_descrpt, + feed_dict = { + self.place_holders['coord']: data_coord, + self.place_holders['type']: data_atype, + self.place_holders['natoms_vec']: natoms_vec, + self.place_holders['box']: data_box, + self.place_holders['default_mesh']: mesh, + }) natoms = natoms_vec dd_all = np.reshape(dd_all, [-1, self.ndescrpt * natoms[0]]) start_index = 0 diff --git a/source/train/DescrptSeA.py b/source/train/DescrptSeA.py index dfe7329445..d409f7134f 100644 --- a/source/train/DescrptSeA.py +++ b/source/train/DescrptSeA.py @@ -1,23 +1,10 @@ -import platform -import os,sys,warnings import numpy as np from 
deepmd.env import tf -from deepmd.common import ClassArg +from deepmd.common import ClassArg, get_activation_func, get_precision from deepmd.RunOptions import global_tf_float_precision from deepmd.RunOptions import global_np_float_precision -from deepmd.RunOptions import global_ener_float_precision -from deepmd.RunOptions import global_cvt_2_tf_float -from deepmd.RunOptions import global_cvt_2_ener_float - -if platform.system() == "Windows": - ext = "dll" -elif platform.system() == "Darwin": - ext = "dylib" -else: - ext = "so" -module_path = os.path.dirname(os.path.realpath(__file__)) + "/" -assert (os.path.isfile (module_path + "libop_abi.{}".format(ext) )), "op module does not exist" -op_module = tf.load_op_library(module_path + "libop_abi.{}".format(ext)) +from deepmd.env import op_module +from deepmd.env import default_tf_session_config class DescrptSeA (): def __init__ (self, jdata): @@ -29,7 +16,11 @@ def __init__ (self, jdata): .add('axis_neuron', int, default = 4, alias = 'n_axis_neuron') \ .add('resnet_dt',bool, default = False) \ .add('trainable',bool, default = True) \ - .add('seed', int) + .add('seed', int) \ + .add('exclude_types', list, default = []) \ + .add('set_davg_zero', bool, default = False) \ + .add('activation_function', str, default = 'tanh') \ + .add('precision', str, default = "default") class_data = args.parse(jdata) self.sel_a = class_data['sel'] self.rcut_r = class_data['rcut'] @@ -39,6 +30,15 @@ def __init__ (self, jdata): self.filter_resnet_dt = class_data['resnet_dt'] self.seed = class_data['seed'] self.trainable = class_data['trainable'] + self.filter_activation_fn = get_activation_func(class_data['activation_function']) + self.filter_precision = get_precision(class_data['precision']) + exclude_types = class_data['exclude_types'] + self.exclude_types = set() + for tt in exclude_types: + assert(len(tt) == 2) + self.exclude_types.add((tt[0], tt[1])) + self.exclude_types.add((tt[1], tt[0])) + self.set_davg_zero = 
class_data['set_davg_zero'] # descrpt config self.sel_r = [ 0 for ii in range(len(self.sel_a)) ] @@ -52,8 +52,35 @@ def __init__ (self, jdata): self.ndescrpt_a = self.nnei_a * 4 self.ndescrpt_r = self.nnei_r * 1 self.ndescrpt = self.ndescrpt_a + self.ndescrpt_r - self.useBN = False + self.dstd = None + self.davg = None + + self.place_holders = {} + avg_zero = np.zeros([self.ntypes,self.ndescrpt]).astype(global_np_float_precision) + std_ones = np.ones ([self.ntypes,self.ndescrpt]).astype(global_np_float_precision) + sub_graph = tf.Graph() + with sub_graph.as_default(): + name_pfx = 'd_sea_' + for ii in ['coord', 'box']: + self.place_holders[ii] = tf.placeholder(global_np_float_precision, [None, None], name = name_pfx+'t_'+ii) + self.place_holders['type'] = tf.placeholder(tf.int32, [None, None], name=name_pfx+'t_type') + self.place_holders['natoms_vec'] = tf.placeholder(tf.int32, [self.ntypes+2], name=name_pfx+'t_natoms') + self.place_holders['default_mesh'] = tf.placeholder(tf.int32, [None], name=name_pfx+'t_mesh') + self.stat_descrpt, descrpt_deriv, rij, nlist \ + = op_module.descrpt_se_a(self.place_holders['coord'], + self.place_holders['type'], + self.place_holders['natoms_vec'], + self.place_holders['box'], + self.place_holders['default_mesh'], + tf.constant(avg_zero), + tf.constant(std_ones), + rcut_a = self.rcut_a, + rcut_r = self.rcut_r, + rcut_r_smth = self.rcut_r_smth, + sel_a = self.sel_a, + sel_r = self.sel_r) + self.sub_sess = tf.Session(graph = sub_graph, config=default_tf_session_config) def get_rcut (self) : @@ -71,7 +98,7 @@ def get_dim_rot_mat_1 (self) : def get_nlist (self) : return self.nlist, self.rij, self.sel_a, self.sel_r - def compute_dstats (self, + def compute_input_stats (self, data_coord, data_box, data_atype, @@ -110,10 +137,9 @@ def compute_dstats (self, all_davg.append(davg) all_dstd.append(dstd) - davg = np.array(all_davg) - dstd = np.array(all_dstd) - - return davg, dstd + if not self.set_davg_zero: + self.davg = np.array(all_davg) + 
self.dstd = np.array(all_dstd) def build (self, @@ -122,11 +148,10 @@ def build (self, natoms, box_, mesh, - davg = None, - dstd = None, suffix = '', reuse = None): - + davg = self.davg + dstd = self.dstd with tf.variable_scope('descrpt_attr' + suffix, reuse = reuse) : if davg is None: davg = np.zeros([self.ntypes, self.ndescrpt]) @@ -138,6 +163,12 @@ def build (self, t_ntypes = tf.constant(self.ntypes, name = 'ntypes', dtype = tf.int32) + t_ndescrpt = tf.constant(self.ndescrpt, + name = 'ndescrpt', + dtype = tf.int32) + t_sel = tf.constant(self.sel_a, + name = 'sel', + dtype = tf.int32) self.t_avg = tf.get_variable('t_avg', davg.shape, dtype = global_tf_float_precision, @@ -168,6 +199,10 @@ def build (self, sel_r = self.sel_r) self.descrpt_reshape = tf.reshape(self.descrpt, [-1, self.ndescrpt]) + self.descrpt_reshape = tf.identity(self.descrpt_reshape, name = 'o_rmat') + self.descrpt_deriv = tf.identity(self.descrpt_deriv, name = 'o_rmat_deriv') + self.rij = tf.identity(self.rij, name = 'o_rij') + self.nlist = tf.identity(self.nlist, name = 'o_nlist') self.dout, self.qmat = self._pass_filter(self.descrpt_reshape, natoms, suffix = suffix, reuse = reuse, trainable = self.trainable) @@ -207,7 +242,6 @@ def _pass_filter(self, trainable = True) : start_index = 0 inputs = tf.reshape(inputs, [-1, self.ndescrpt * natoms[0]]) - shape = inputs.get_shape().as_list() output = [] output_qmat = [] for type_i in range(self.ntypes): @@ -215,7 +249,7 @@ def _pass_filter(self, [ 0, start_index* self.ndescrpt], [-1, natoms[2+type_i]* self.ndescrpt] ) inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt]) - layer, qmat = self._filter(inputs_i, name='filter_type_'+str(type_i)+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable) + layer, qmat = self._filter(tf.cast(inputs_i, self.filter_precision), type_i, name='filter_type_'+str(type_i)+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable, activation_fn = self.filter_activation_fn) layer = 
tf.reshape(layer, [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_out()]) qmat = tf.reshape(qmat, [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_rot_mat_1() * 3]) output.append(layer) @@ -232,32 +266,15 @@ def _compute_dstats_sys_smth (self, data_atype, natoms_vec, mesh) : - avg_zero = np.zeros([self.ntypes,self.ndescrpt]).astype(global_np_float_precision) - std_ones = np.ones ([self.ntypes,self.ndescrpt]).astype(global_np_float_precision) - sub_graph = tf.Graph() - with sub_graph.as_default(): - descrpt, descrpt_deriv, rij, nlist \ - = op_module.descrpt_se_a (tf.constant(data_coord), - tf.constant(data_atype), - tf.constant(natoms_vec, dtype = tf.int32), - tf.constant(data_box), - tf.constant(mesh), - tf.constant(avg_zero), - tf.constant(std_ones), - rcut_a = self.rcut_a, - rcut_r = self.rcut_r, - rcut_r_smth = self.rcut_r_smth, - sel_a = self.sel_a, - sel_r = self.sel_r) - # self.sess.run(tf.global_variables_initializer()) - # sub_sess = tf.Session(graph = sub_graph, - # config=tf.ConfigProto(intra_op_parallelism_threads=self.run_opt.num_intra_threads, - # inter_op_parallelism_threads=self.run_opt.num_inter_threads - - # )) - sub_sess = tf.Session(graph = sub_graph) - dd_all = sub_sess.run(descrpt) - sub_sess.close() + dd_all \ + = self.sub_sess.run(self.stat_descrpt, + feed_dict = { + self.place_holders['coord']: data_coord, + self.place_holders['type']: data_atype, + self.place_holders['natoms_vec']: natoms_vec, + self.place_holders['box']: data_box, + self.place_holders['default_mesh']: mesh, + }) natoms = natoms_vec dd_all = np.reshape(dd_all, [-1, self.ndescrpt * natoms[0]]) start_index = 0 @@ -297,6 +314,7 @@ def _compute_std (self,sumv2, sumv, sumn) : def _filter(self, inputs, + type_input, natoms, activation_fn=tf.nn.tanh, stddev=1.0, @@ -323,35 +341,39 @@ def _filter(self, # with (natom x nei_type_i) x 4 inputs_reshape = tf.reshape(inputs_i, [-1, 4]) xyz_scatter = tf.reshape(tf.slice(inputs_reshape, [0,0],[-1,1]),[-1,1]) - for ii in range(1, 
len(outputs_size)): - w = tf.get_variable('matrix_'+str(ii)+'_'+str(type_i), - [outputs_size[ii - 1], outputs_size[ii]], - global_tf_float_precision, - tf.random_normal_initializer(stddev=stddev/np.sqrt(outputs_size[ii]+outputs_size[ii-1]), seed = seed), - trainable = trainable) - b = tf.get_variable('bias_'+str(ii)+'_'+str(type_i), - [1, outputs_size[ii]], - global_tf_float_precision, - tf.random_normal_initializer(stddev=stddev, mean = bavg, seed = seed), - trainable = trainable) - if self.filter_resnet_dt : - idt = tf.get_variable('idt_'+str(ii)+'_'+str(type_i), - [1, outputs_size[ii]], - global_tf_float_precision, - tf.random_normal_initializer(stddev=0.001, mean = 1.0, seed = seed), - trainable = trainable) - if outputs_size[ii] == outputs_size[ii-1]: - if self.filter_resnet_dt : - xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) * idt - else : - xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) - elif outputs_size[ii] == outputs_size[ii-1] * 2: - if self.filter_resnet_dt : - xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + activation_fn(tf.matmul(xyz_scatter, w) + b) * idt - else : - xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + activation_fn(tf.matmul(xyz_scatter, w) + b) - else: - xyz_scatter = activation_fn(tf.matmul(xyz_scatter, w) + b) + if (type_input, type_i) not in self.exclude_types: + for ii in range(1, len(outputs_size)): + w = tf.get_variable('matrix_'+str(ii)+'_'+str(type_i), + [outputs_size[ii - 1], outputs_size[ii]], + self.filter_precision, + tf.random_normal_initializer(stddev=stddev/np.sqrt(outputs_size[ii]+outputs_size[ii-1]), seed = seed), + trainable = trainable) + b = tf.get_variable('bias_'+str(ii)+'_'+str(type_i), + [1, outputs_size[ii]], + self.filter_precision, + tf.random_normal_initializer(stddev=stddev, mean = bavg, seed = seed), + trainable = trainable) + if self.filter_resnet_dt : + idt = tf.get_variable('idt_'+str(ii)+'_'+str(type_i), + [1, outputs_size[ii]], + self.filter_precision, + 
tf.random_normal_initializer(stddev=0.001, mean = 1.0, seed = seed), + trainable = trainable) + if outputs_size[ii] == outputs_size[ii-1]: + if self.filter_resnet_dt : + xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) * idt + else : + xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) + elif outputs_size[ii] == outputs_size[ii-1] * 2: + if self.filter_resnet_dt : + xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + activation_fn(tf.matmul(xyz_scatter, w) + b) * idt + else : + xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + activation_fn(tf.matmul(xyz_scatter, w) + b) + else: + xyz_scatter = activation_fn(tf.matmul(xyz_scatter, w) + b) + else: + w = tf.zeros((outputs_size[0], outputs_size[-1]), dtype=global_tf_float_precision) + xyz_scatter = tf.matmul(xyz_scatter, w) # natom x nei_type_i x out_size xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1]//4, outputs_size[-1])) xyz_scatter_total.append(xyz_scatter) @@ -389,7 +411,6 @@ def _filter_type_ext(self, seed=None, trainable = True): # natom x (nei x 4) - shape = inputs.get_shape().as_list() outputs_size = [1] + self.filter_neuron outputs_size_2 = self.n_axis_neuron with tf.variable_scope(name, reuse=reuse): @@ -411,18 +432,18 @@ def _filter_type_ext(self, for ii in range(1, len(outputs_size)): w = tf.get_variable('matrix_'+str(ii)+'_'+str(type_i), [outputs_size[ii - 1], outputs_size[ii]], - global_tf_float_precision, + self.filter_precision, tf.random_normal_initializer(stddev=stddev/np.sqrt(outputs_size[ii]+outputs_size[ii-1]), seed = seed), trainable = trainable) b = tf.get_variable('bias_'+str(ii)+'_'+str(type_i), [1, outputs_size[ii]], - global_tf_float_precision, + self.filter_precision, tf.random_normal_initializer(stddev=stddev, mean = bavg, seed = seed), trainable = trainable) if self.filter_resnet_dt : idt = tf.get_variable('idt_'+str(ii)+'_'+str(type_i), [1, outputs_size[ii]], - global_tf_float_precision, + self.filter_precision, 
tf.random_normal_initializer(stddev=0.001, mean = 1.0, seed = seed), trainable = trainable) if outputs_size[ii] == outputs_size[ii-1]: diff --git a/source/train/DescrptSeAR.py b/source/train/DescrptSeAR.py index 138e9222f6..dadc2f3d95 100644 --- a/source/train/DescrptSeAR.py +++ b/source/train/DescrptSeAR.py @@ -1,26 +1,10 @@ -import platform -import os,sys,warnings import numpy as np from deepmd.env import tf from deepmd.common import ClassArg -from deepmd.RunOptions import global_tf_float_precision -from deepmd.RunOptions import global_np_float_precision -from deepmd.RunOptions import global_ener_float_precision -from deepmd.RunOptions import global_cvt_2_tf_float -from deepmd.RunOptions import global_cvt_2_ener_float from deepmd.DescrptSeA import DescrptSeA from deepmd.DescrptSeR import DescrptSeR - -if platform.system() == "Windows": - ext = "dll" -elif platform.system() == "Darwin": - ext = "dylib" -else: - ext = "so" -module_path = os.path.dirname(os.path.realpath(__file__)) + "/" -assert (os.path.isfile (module_path + "libop_abi.{}".format(ext) )), "op module does not exist" -op_module = tf.load_op_library(module_path + "libop_abi.{}".format(ext)) +from deepmd.env import op_module class DescrptSeAR (): def __init__ (self, jdata): @@ -33,6 +17,8 @@ def __init__ (self, jdata): self.descrpt_a = DescrptSeA(self.param_a) self.descrpt_r = DescrptSeR(self.param_r) assert(self.descrpt_a.get_ntypes() == self.descrpt_r.get_ntypes()) + self.davg = None + self.dstd = None def get_rcut (self) : return np.max([self.descrpt_a.get_rcut(), self.descrpt_r.get_rcut()]) @@ -49,15 +35,16 @@ def get_nlist_a (self) : def get_nlist_r (self) : return self.descrpt_r.nlist, self.descrpt_r.rij, self.descrpt_r.sel_a, self.descrpt_r.sel_r - def compute_dstats (self, + def compute_input_stats (self, data_coord, data_box, data_atype, natoms_vec, mesh) : - davg_a, dstd_a = self.descrpt_a.compute_dstats(data_coord, data_box, data_atype, natoms_vec, mesh) - davg_r, dstd_r = 
self.descrpt_r.compute_dstats(data_coord, data_box, data_atype, natoms_vec, mesh) - return [davg_a, davg_r], [dstd_a, dstd_r] + self.descrpt_a.compute_input_stats(data_coord, data_box, data_atype, natoms_vec, mesh) + self.descrpt_r.compute_input_stats(data_coord, data_box, data_atype, natoms_vec, mesh) + self.davg = [self.descrpt_a.davg, self.descrpt_r.davg] + self.dstd = [self.descrpt_a.dstd, self.descrpt_r.dstd] def build (self, @@ -66,13 +53,19 @@ def build (self, natoms, box, mesh, - davg, - dstd, suffix = '', reuse = None): + davg = self.davg + dstd = self.dstd + if davg is None: + davg = [np.zeros([self.descrpt_a.ntypes, self.descrpt_a.ndescrpt]), + np.zeros([self.descrpt_r.ntypes, self.descrpt_r.ndescrpt])] + if dstd is None: + dstd = [np.ones ([self.descrpt_a.ntypes, self.descrpt_a.ndescrpt]), + np.ones ([self.descrpt_r.ntypes, self.descrpt_r.ndescrpt])] # dout - self.dout_a = self.descrpt_a.build(coord_, atype_, natoms, box, mesh, davg[0], dstd[0], suffix=suffix+'_a', reuse=reuse) - self.dout_r = self.descrpt_r.build(coord_, atype_, natoms, box, mesh, davg[1], dstd[1], suffix=suffix, reuse=reuse) + self.dout_a = self.descrpt_a.build(coord_, atype_, natoms, box, mesh, suffix=suffix+'_a', reuse=reuse) + self.dout_r = self.descrpt_r.build(coord_, atype_, natoms, box, mesh, suffix=suffix , reuse=reuse) self.dout_a = tf.reshape(self.dout_a, [-1, self.descrpt_a.get_dim_out()]) self.dout_r = tf.reshape(self.dout_r, [-1, self.descrpt_r.get_dim_out()]) self.dout = tf.concat([self.dout_a, self.dout_r], axis = 1) diff --git a/source/train/DescrptSeR.py b/source/train/DescrptSeR.py index f7f8147a6c..ed53a3bcd4 100644 --- a/source/train/DescrptSeR.py +++ b/source/train/DescrptSeR.py @@ -1,23 +1,10 @@ -import os,warnings -import platform import numpy as np from deepmd.env import tf -from deepmd.common import ClassArg +from deepmd.common import ClassArg, get_activation_func, get_precision from deepmd.RunOptions import global_tf_float_precision from deepmd.RunOptions 
import global_np_float_precision -from deepmd.RunOptions import global_ener_float_precision -from deepmd.RunOptions import global_cvt_2_tf_float -from deepmd.RunOptions import global_cvt_2_ener_float - -if platform.system() == "Windows": - ext = "dll" -elif platform.system() == "Darwin": - ext = "dylib" -else: - ext = "so" -module_path = os.path.dirname(os.path.realpath(__file__)) + "/" -assert (os.path.isfile (module_path + "libop_abi.{}".format(ext) )), "op module does not exist" -op_module = tf.load_op_library(module_path + "libop_abi.{}".format(ext)) +from deepmd.env import op_module +from deepmd.env import default_tf_session_config class DescrptSeR (): def __init__ (self, jdata): @@ -28,7 +15,11 @@ def __init__ (self, jdata): .add('neuron', list, default = [10, 20, 40]) \ .add('resnet_dt',bool, default = False) \ .add('trainable',bool, default = True) \ - .add('seed', int) + .add('seed', int) \ + .add('exclude_types', list, default = []) \ + .add('set_davg_zero', bool, default = False) \ + .add("activation_function", str, default = "tanh") \ + .add("precision", str, default = "default") class_data = args.parse(jdata) self.sel_r = class_data['sel'] self.rcut = class_data['rcut'] @@ -37,6 +28,15 @@ def __init__ (self, jdata): self.filter_resnet_dt = class_data['resnet_dt'] self.seed = class_data['seed'] self.trainable = class_data['trainable'] + self.filter_activation_fn = get_activation_func(class_data["activation_function"]) + self.filter_precision = get_precision(class_data['precision']) + exclude_types = class_data['exclude_types'] + self.exclude_types = set() + for tt in exclude_types: + assert(len(tt) == 2) + self.exclude_types.add((tt[0], tt[1])) + self.exclude_types.add((tt[1], tt[0])) + self.set_davg_zero = class_data['set_davg_zero'] # descrpt config self.sel_a = [ 0 for ii in range(len(self.sel_r)) ] @@ -48,8 +48,33 @@ def __init__ (self, jdata): self.ndescrpt_a = self.nnei_a * 4 self.ndescrpt_r = self.nnei_r * 1 self.ndescrpt = self.nnei_r - 
self.useBN = False + self.davg = None + self.dstd = None + + self.place_holders = {} + avg_zero = np.zeros([self.ntypes,self.ndescrpt]).astype(global_np_float_precision) + std_ones = np.ones ([self.ntypes,self.ndescrpt]).astype(global_np_float_precision) + sub_graph = tf.Graph() + with sub_graph.as_default(): + name_pfx = 'd_ser_' + for ii in ['coord', 'box']: + self.place_holders[ii] = tf.placeholder(global_np_float_precision, [None, None], name = name_pfx+'t_'+ii) + self.place_holders['type'] = tf.placeholder(tf.int32, [None, None], name=name_pfx+'t_type') + self.place_holders['natoms_vec'] = tf.placeholder(tf.int32, [self.ntypes+2], name=name_pfx+'t_natoms') + self.place_holders['default_mesh'] = tf.placeholder(tf.int32, [None], name=name_pfx+'t_mesh') + self.stat_descrpt, descrpt_deriv, rij, nlist \ + = op_module.descrpt_se_r(self.place_holders['coord'], + self.place_holders['type'], + self.place_holders['natoms_vec'], + self.place_holders['box'], + self.place_holders['default_mesh'], + tf.constant(avg_zero), + tf.constant(std_ones), + rcut = self.rcut, + rcut_smth = self.rcut_smth, + sel = self.sel_r) + self.sub_sess = tf.Session(graph = sub_graph, config=default_tf_session_config) def get_rcut (self) : @@ -64,7 +89,7 @@ def get_dim_out (self) : def get_nlist (self) : return self.nlist, self.rij, self.sel_a, self.sel_r - def compute_dstats (self, + def compute_input_stats (self, data_coord, data_box, data_atype, @@ -92,10 +117,10 @@ def compute_dstats (self, all_davg.append(davg) all_dstd.append(dstd) - davg = np.array(all_davg) - dstd = np.array(all_dstd) + if not self.set_davg_zero: + self.davg = np.array(all_davg) + self.dstd = np.array(all_dstd) - return davg, dstd def build (self, coord_, @@ -103,10 +128,10 @@ def build (self, natoms, box_, mesh, - davg = None, - dstd = None, suffix = '', reuse = None): + davg = self.davg + dstd = self.dstd with tf.variable_scope('descrpt_attr' + suffix, reuse = reuse) : if davg is None: davg = np.zeros([self.ntypes, 
self.ndescrpt]) @@ -177,14 +202,13 @@ def _pass_filter(self, trainable = True) : start_index = 0 inputs = tf.reshape(inputs, [-1, self.ndescrpt * natoms[0]]) - shape = inputs.get_shape().as_list() output = [] for type_i in range(self.ntypes): inputs_i = tf.slice (inputs, [ 0, start_index* self.ndescrpt], [-1, natoms[2+type_i]* self.ndescrpt] ) inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt]) - layer = self._filter_r(inputs_i, name='filter_type_'+str(type_i)+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable) + layer = self._filter_r(tf.cast(inputs_i, self.filter_precision), type_i, name='filter_type_'+str(type_i)+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable, activation_fn = self.filter_activation_fn) layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_out()]) output.append(layer) start_index += natoms[2+type_i] @@ -197,37 +221,21 @@ def _compute_dstats_sys_se_r (self, data_atype, natoms_vec, mesh) : - avg_zero = np.zeros([self.ntypes,self.ndescrpt]).astype(global_np_float_precision) - std_ones = np.ones ([self.ntypes,self.ndescrpt]).astype(global_np_float_precision) - sub_graph = tf.Graph() - with sub_graph.as_default(): - descrpt, descrpt_deriv, rij, nlist \ - = op_module.descrpt_se_r (tf.constant(data_coord), - tf.constant(data_atype), - tf.constant(natoms_vec, dtype = tf.int32), - tf.constant(data_box), - tf.constant(mesh), - tf.constant(avg_zero), - tf.constant(std_ones), - rcut = self.rcut, - rcut_smth = self.rcut_smth, - sel = self.sel_r) - # sub_sess = tf.Session(graph = sub_graph, - # config=tf.ConfigProto(intra_op_parallelism_threads=self.run_opt.num_intra_threads, - # inter_op_parallelism_threads=self.run_opt.num_inter_threads - - # )) - sub_sess = tf.Session(graph = sub_graph) - dd_all = sub_sess.run(descrpt) - sub_sess.close() + dd_all \ + = self.sub_sess.run(self.stat_descrpt, + feed_dict = { + self.place_holders['coord']: data_coord, + self.place_holders['type']: 
data_atype, + self.place_holders['natoms_vec']: natoms_vec, + self.place_holders['box']: data_box, + self.place_holders['default_mesh']: mesh, + }) natoms = natoms_vec dd_all = np.reshape(dd_all, [-1, self.ndescrpt * natoms[0]]) start_index = 0 sysr = [] - sysa = [] sysn = [] sysr2 = [] - sysa2 = [] for type_i in range(self.ntypes): end_index = start_index + self.ndescrpt * natoms[2+type_i] dd = dd_all[:, start_index:end_index] @@ -254,6 +262,7 @@ def _compute_std (self,sumv2, sumv, sumn) : def _filter_r(self, inputs, + type_input, natoms, activation_fn=tf.nn.tanh, stddev=1.0, @@ -263,7 +272,6 @@ def _filter_r(self, seed=None, trainable = True): # natom x nei - shape = inputs.get_shape().as_list() outputs_size = [1] + self.filter_neuron with tf.variable_scope(name, reuse=reuse): start_index = 0 @@ -278,35 +286,39 @@ def _filter_r(self, shape_i = inputs_i.get_shape().as_list() # with (natom x nei_type_i) x 1 xyz_scatter = tf.reshape(inputs_i, [-1, 1]) - for ii in range(1, len(outputs_size)): - w = tf.get_variable('matrix_'+str(ii)+'_'+str(type_i), - [outputs_size[ii - 1], outputs_size[ii]], - global_tf_float_precision, - tf.random_normal_initializer(stddev=stddev/np.sqrt(outputs_size[ii]+outputs_size[ii-1]), seed = seed), - trainable = trainable) - b = tf.get_variable('bias_'+str(ii)+'_'+str(type_i), - [1, outputs_size[ii]], - global_tf_float_precision, - tf.random_normal_initializer(stddev=stddev, mean = bavg, seed = seed), - trainable = trainable) - if self.filter_resnet_dt : - idt = tf.get_variable('idt_'+str(ii)+'_'+str(type_i), - [1, outputs_size[ii]], - global_tf_float_precision, - tf.random_normal_initializer(stddev=0.001, mean = 1.0, seed = seed), - trainable = trainable) - if outputs_size[ii] == outputs_size[ii-1]: - if self.filter_resnet_dt : - xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) * idt - else : - xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) - elif outputs_size[ii] == outputs_size[ii-1] * 2: + if (type_input, type_i) 
not in self.exclude_types: + for ii in range(1, len(outputs_size)): + w = tf.get_variable('matrix_'+str(ii)+'_'+str(type_i), + [outputs_size[ii - 1], outputs_size[ii]], + self.filter_precision, + tf.random_normal_initializer(stddev=stddev/np.sqrt(outputs_size[ii]+outputs_size[ii-1]), seed = seed), + trainable = trainable) + b = tf.get_variable('bias_'+str(ii)+'_'+str(type_i), + [1, outputs_size[ii]], + self.filter_precision, + tf.random_normal_initializer(stddev=stddev, mean = bavg, seed = seed), + trainable = trainable) if self.filter_resnet_dt : - xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + activation_fn(tf.matmul(xyz_scatter, w) + b) * idt - else : - xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + activation_fn(tf.matmul(xyz_scatter, w) + b) - else: - xyz_scatter = activation_fn(tf.matmul(xyz_scatter, w) + b) + idt = tf.get_variable('idt_'+str(ii)+'_'+str(type_i), + [1, outputs_size[ii]], + self.filter_precision, + tf.random_normal_initializer(stddev=0.001, mean = 1.0, seed = seed), + trainable = trainable) + if outputs_size[ii] == outputs_size[ii-1]: + if self.filter_resnet_dt : + xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) * idt + else : + xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) + elif outputs_size[ii] == outputs_size[ii-1] * 2: + if self.filter_resnet_dt : + xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + activation_fn(tf.matmul(xyz_scatter, w) + b) * idt + else : + xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + activation_fn(tf.matmul(xyz_scatter, w) + b) + else: + xyz_scatter = activation_fn(tf.matmul(xyz_scatter, w) + b) + else: + w = tf.zeros((outputs_size[0], outputs_size[-1]), dtype=global_tf_float_precision) + xyz_scatter = tf.matmul(xyz_scatter, w) # natom x nei_type_i x out_size xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1], outputs_size[-1])) xyz_scatter_total.append(xyz_scatter) diff --git a/source/train/EwaldRecp.py b/source/train/EwaldRecp.py new file mode 100644 
index 0000000000..2517669454 --- /dev/null +++ b/source/train/EwaldRecp.py @@ -0,0 +1,55 @@ +import numpy as np +from deepmd.env import tf +from deepmd.common import ClassArg +from deepmd.RunOptions import global_tf_float_precision +from deepmd.RunOptions import global_np_float_precision +from deepmd.RunOptions import global_ener_float_precision +from deepmd.RunOptions import global_cvt_2_tf_float +from deepmd.RunOptions import global_cvt_2_ener_float +from deepmd.env import op_module +from deepmd.env import default_tf_session_config + +class EwaldRecp () : + def __init__(self, + hh, + beta): + self.hh = hh + self.beta = beta + with tf.Graph().as_default() as graph: + # place holders + self.t_nloc = tf.placeholder(tf.int32, [1], name = "t_nloc") + self.t_coord = tf.placeholder(global_tf_float_precision, [None], name='t_coord') + self.t_charge = tf.placeholder(global_tf_float_precision, [None], name='t_charge') + self.t_box = tf.placeholder(global_tf_float_precision, [None], name='t_box') + # output + self.t_energy, self.t_force, self.t_virial \ + = op_module.ewald_recp(self.t_coord, self.t_charge, self.t_nloc, self.t_box, + ewald_h = self.hh, + ewald_beta = self.beta) + self.sess = tf.Session(graph=graph, config=default_tf_session_config) + + def eval(self, + coord, + charge, + box) : + coord = np.array(coord) + charge = np.array(charge) + box = np.array(box) + nframes = charge.shape[0] + natoms = charge.shape[1] + coord = np.reshape(coord, [nframes * 3 * natoms]) + charge = np.reshape(charge, [nframes * natoms]) + box = np.reshape(box, [nframes * 9]) + + [energy, force, virial] \ + = self.sess.run([self.t_energy, self.t_force, self.t_virial], + feed_dict = { + self.t_coord: coord, + self.t_charge: charge, + self.t_box: box, + self.t_nloc: [natoms], + }) + + return energy, force, virial + + diff --git a/source/train/Fitting.py b/source/train/Fitting.py index 59ea7a5ed8..49ca40b2c7 100644 --- a/source/train/Fitting.py +++ b/source/train/Fitting.py @@ -1,17 +1,13 @@ 
-import os,warnings +import warnings import numpy as np from deepmd.env import tf -from deepmd.common import ClassArg, add_data_requirement +from deepmd.common import ClassArg, add_data_requirement, get_activation_func, get_precision from deepmd.Network import one_layer from deepmd.DescrptLocFrame import DescrptLocFrame from deepmd.DescrptSeA import DescrptSeA from deepmd.RunOptions import global_tf_float_precision -from deepmd.RunOptions import global_np_float_precision -from deepmd.RunOptions import global_ener_float_precision -from deepmd.RunOptions import global_cvt_2_tf_float -from deepmd.RunOptions import global_cvt_2_ener_float class EnerFitting (): def __init__ (self, jdata, descrpt): @@ -23,14 +19,33 @@ def __init__ (self, jdata, descrpt): .add('numb_aparam', int, default = 0)\ .add('neuron', list, default = [120,120,120], alias = 'n_neuron')\ .add('resnet_dt', bool, default = True)\ - .add('seed', int) + .add('rcond', float, default = 1e-3) \ + .add('seed', int) \ + .add('atom_ener', list, default = [])\ + .add("activation_function", str, default = "tanh")\ + .add("precision", str, default = "default")\ + .add("trainable", [list, bool], default = True) class_data = args.parse(jdata) self.numb_fparam = class_data['numb_fparam'] self.numb_aparam = class_data['numb_aparam'] self.n_neuron = class_data['neuron'] self.resnet_dt = class_data['resnet_dt'] + self.rcond = class_data['rcond'] self.seed = class_data['seed'] + self.fitting_activation_fn = get_activation_func(class_data["activation_function"]) + self.fitting_precision = get_precision(class_data['precision']) + self.trainable = class_data['trainable'] + if type(self.trainable) is bool: + self.trainable = [self.trainable] * (len(self.n_neuron)+1) + assert(len(self.trainable) == len(self.n_neuron) + 1), 'length of trainable should be that of n_neuron + 1' + self.atom_ener = [] + for at, ae in enumerate(class_data['atom_ener']): + if ae is not None: + self.atom_ener.append(tf.constant(ae, 
global_tf_float_precision, name = "atom_%d_ener" % at)) + else: + self.atom_ener.append(None) self.useBN = False + self.bias_atom_e = None # data requirement if self.numb_fparam > 0 : add_data_requirement('fparam', self.numb_fparam, atomic=False, must=True, high_prec=False) @@ -49,7 +64,33 @@ def get_numb_fparam(self) : def get_numb_aparam(self) : return self.numb_fparam - def compute_dstats(self, all_stat, protection): + def compute_output_stats(self, all_stat): + self.bias_atom_e = self._compute_output_stats(all_stat, rcond = self.rcond) + + @classmethod + def _compute_output_stats(self, all_stat, rcond = 1e-3): + data = all_stat['energy'] + # data[sys_idx][batch_idx][frame_idx] + sys_ener = np.array([]) + for ss in range(len(data)): + sys_data = [] + for ii in range(len(data[ss])): + for jj in range(len(data[ss][ii])): + sys_data.append(data[ss][ii][jj]) + sys_data = np.concatenate(sys_data) + sys_ener = np.append(sys_ener, np.average(sys_data)) + data = all_stat['natoms_vec'] + sys_tynatom = np.array([]) + nsys = len(data) + for ss in range(len(data)): + sys_tynatom = np.append(sys_tynatom, data[ss][0].astype(np.float64)) + sys_tynatom = np.reshape(sys_tynatom, [nsys,-1]) + sys_tynatom = sys_tynatom[:,2:] + energy_shift,resd,rank,s_value \ + = np.linalg.lstsq(sys_tynatom, sys_ener, rcond = rcond) + return energy_shift + + def compute_input_stats(self, all_stat, protection): # stat fparam if self.numb_fparam > 0: cat_data = np.concatenate(all_stat['fparam'], axis = 0) @@ -78,7 +119,8 @@ def compute_dstats(self, all_stat, protection): for ii in range(self.aparam_std.size): if self.aparam_std[ii] < protection: self.aparam_std[ii] = protection - self.aparam_inv_std = 1./self.aparam_std + self.aparam_inv_std = 1./self.aparam_std + def _compute_std (self, sumv2, sumv, sumn) : return np.sqrt(sumv2/sumn - np.multiply(sumv/sumn, sumv/sumn)) @@ -88,9 +130,9 @@ def build (self, inputs, input_dict, natoms, - bias_atom_e = None, reuse = None, suffix = '') : + bias_atom_e = 
self.bias_atom_e if self.numb_fparam > 0 and ( self.fparam_avg is None or self.fparam_inv_std is None ): raise RuntimeError('No data stat result. one should do data statisitic, before build') if self.numb_aparam > 0 and ( self.aparam_avg is None or self.aparam_inv_std is None ): @@ -127,8 +169,7 @@ def build (self, initializer = tf.constant_initializer(self.aparam_inv_std)) start_index = 0 - inputs = tf.reshape(inputs, [-1, self.dim_descrpt * natoms[0]]) - shape = inputs.get_shape().as_list() + inputs = tf.cast(tf.reshape(inputs, [-1, self.dim_descrpt * natoms[0]]), self.fitting_precision) if bias_atom_e is not None : assert(len(bias_atom_e) == self.ntypes) @@ -169,10 +210,26 @@ def build (self, for ii in range(0,len(self.n_neuron)) : if ii >= 1 and self.n_neuron[ii] == self.n_neuron[ii-1] : - layer+= one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, use_timestep = self.resnet_dt) + layer+= one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, use_timestep = self.resnet_dt, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision, trainable = self.trainable[ii]) else : - layer = one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed) - final_layer = one_layer(layer, 1, activation_fn = None, bavg = type_bias_ae, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed) + layer = one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision, trainable = self.trainable[ii]) + final_layer = one_layer(layer, 1, activation_fn = None, bavg = type_bias_ae, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, precision = self.fitting_precision, trainable = self.trainable[-1]) + + if type_i 
< len(self.atom_ener) and self.atom_ener[type_i] is not None: + inputs_zero = tf.zeros_like(inputs_i, dtype=global_tf_float_precision) + layer = inputs_zero + if self.numb_fparam > 0 : + layer = tf.concat([layer, ext_fparam], axis = 1) + if self.numb_aparam > 0 : + layer = tf.concat([layer, ext_aparam], axis = 1) + for ii in range(0,len(self.n_neuron)) : + if ii >= 1 and self.n_neuron[ii] == self.n_neuron[ii-1] : + layer+= one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=True, seed = self.seed, use_timestep = self.resnet_dt, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision, trainable = self.trainable[ii]) + else : + layer = one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=True, seed = self.seed, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision, trainable = self.trainable[ii]) + zero_layer = one_layer(layer, 1, activation_fn = None, bavg = type_bias_ae, name='final_layer_type_'+str(type_i)+suffix, reuse=True, seed = self.seed, precision = self.fitting_precision, trainable = self.trainable[-1]) + final_layer += self.atom_ener[type_i] - zero_layer + final_layer = tf.reshape(final_layer, [tf.shape(inputs)[0], natoms[2+type_i]]) # concat the results @@ -181,7 +238,7 @@ def build (self, else: outs = tf.concat([outs, final_layer], axis = 1) - return tf.reshape(outs, [-1]) + return tf.cast(tf.reshape(outs, [-1]), global_tf_float_precision) class WFCFitting () : @@ -195,13 +252,17 @@ def __init__ (self, jdata, descrpt) : .add('resnet_dt', bool, default = True)\ .add('wfc_numb', int, must = True)\ .add('sel_type', [list,int], default = [ii for ii in range(self.ntypes)], alias = 'wfc_type')\ - .add('seed', int) + .add('seed', int)\ + .add("activation_function", str, default = "tanh")\ + .add('precision', str, default = "default") class_data = args.parse(jdata) self.n_neuron = class_data['neuron'] self.resnet_dt = 
class_data['resnet_dt'] self.wfc_numb = class_data['wfc_numb'] self.sel_type = class_data['sel_type'] self.seed = class_data['seed'] + self.fitting_activation_fn = get_activation_func(class_data["activation_function"]) + self.fitting_precision = get_precision(class_data['precision']) self.useBN = False @@ -221,9 +282,8 @@ def build (self, reuse = None, suffix = '') : start_index = 0 - inputs = tf.reshape(input_d, [-1, self.dim_descrpt * natoms[0]]) + inputs = tf.cast(tf.reshape(input_d, [-1, self.dim_descrpt * natoms[0]]), self.fitting_precision) rot_mat = tf.reshape(rot_mat, [-1, 9 * natoms[0]]) - shape = inputs.get_shape().as_list() count = 0 for type_i in range(self.ntypes): @@ -242,11 +302,11 @@ def build (self, layer = inputs_i for ii in range(0,len(self.n_neuron)) : if ii >= 1 and self.n_neuron[ii] == self.n_neuron[ii-1] : - layer+= one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, use_timestep = self.resnet_dt) + layer+= one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, use_timestep = self.resnet_dt, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision) else : - layer = one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed) + layer = one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision) # (nframes x natoms) x (nwfc x 3) - final_layer = one_layer(layer, self.wfc_numb * 3, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed) + final_layer = one_layer(layer, self.wfc_numb * 3, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, precision = self.fitting_precision) # (nframes x natoms) x nwfc(wc) x 
3(coord_local) final_layer = tf.reshape(final_layer, [tf.shape(inputs)[0] * natoms[2+type_i], self.wfc_numb, 3]) # (nframes x natoms) x nwfc(wc) x 3(coord) @@ -261,7 +321,7 @@ def build (self, outs = tf.concat([outs, final_layer], axis = 1) count += 1 - return tf.reshape(outs, [-1]) + return tf.cast(tf.reshape(outs, [-1]), global_tf_float_precision) @@ -275,12 +335,16 @@ def __init__ (self, jdata, descrpt) : .add('neuron', list, default = [120,120,120], alias = 'n_neuron')\ .add('resnet_dt', bool, default = True)\ .add('sel_type', [list,int], default = [ii for ii in range(self.ntypes)], alias = 'pol_type')\ - .add('seed', int) + .add('seed', int)\ + .add("activation_function", str, default = "tanh")\ + .add('precision', str, default = "default") class_data = args.parse(jdata) self.n_neuron = class_data['neuron'] self.resnet_dt = class_data['resnet_dt'] self.sel_type = class_data['sel_type'] self.seed = class_data['seed'] + self.fitting_activation_fn = get_activation_func(class_data["activation_function"]) + self.fitting_precision = get_precision(class_data['precision']) self.useBN = False def get_sel_type(self): @@ -296,9 +360,8 @@ def build (self, reuse = None, suffix = '') : start_index = 0 - inputs = tf.reshape(input_d, [-1, self.dim_descrpt * natoms[0]]) + inputs = tf.cast(tf.reshape(input_d, [-1, self.dim_descrpt * natoms[0]]), self.fitting_precision) rot_mat = tf.reshape(rot_mat, [-1, 9 * natoms[0]]) - shape = inputs.get_shape().as_list() count = 0 for type_i in range(self.ntypes): @@ -317,11 +380,11 @@ def build (self, layer = inputs_i for ii in range(0,len(self.n_neuron)) : if ii >= 1 and self.n_neuron[ii] == self.n_neuron[ii-1] : - layer+= one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, use_timestep = self.resnet_dt) + layer+= one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, use_timestep = self.resnet_dt, activation_fn 
= self.fitting_activation_fn, precision = self.fitting_precision) else : - layer = one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed) + layer = one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision) # (nframes x natoms) x 9 - final_layer = one_layer(layer, 9, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed) + final_layer = one_layer(layer, 9, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, precision = self.fitting_precision) # (nframes x natoms) x 3 x 3 final_layer = tf.reshape(final_layer, [tf.shape(inputs)[0] * natoms[2+type_i], 3, 3]) # (nframes x natoms) x 3 x 3 @@ -340,7 +403,7 @@ def build (self, outs = tf.concat([outs, final_layer], axis = 1) count += 1 - return tf.reshape(outs, [-1]) + return tf.cast(tf.reshape(outs, [-1]), global_tf_float_precision) class PolarFittingSeA () : @@ -353,14 +416,28 @@ def __init__ (self, jdata, descrpt) : .add('neuron', list, default = [120,120,120], alias = 'n_neuron')\ .add('resnet_dt', bool, default = True)\ .add('fit_diag', bool, default = True)\ + .add('diag_shift', [list,float], default = [0.0 for ii in range(self.ntypes)])\ + .add('scale', [list,float], default = [1.0 for ii in range(self.ntypes)])\ .add('sel_type', [list,int], default = [ii for ii in range(self.ntypes)], alias = 'pol_type')\ - .add('seed', int) + .add('seed', int)\ + .add("activation_function", str , default = "tanh")\ + .add('precision', str, default = "default") class_data = args.parse(jdata) self.n_neuron = class_data['neuron'] self.resnet_dt = class_data['resnet_dt'] self.sel_type = class_data['sel_type'] self.fit_diag = class_data['fit_diag'] self.seed = class_data['seed'] + self.diag_shift = class_data['diag_shift'] + self.scale = 
class_data['scale'] + self.fitting_activation_fn = get_activation_func(class_data["activation_function"]) + self.fitting_precision = get_precision(class_data['precision']) + if type(self.sel_type) is not list: + self.sel_type = [self.sel_type] + if type(self.diag_shift) is not list: + self.diag_shift = [self.diag_shift] + if type(self.scale) is not list: + self.scale = [self.scale] self.dim_rot_mat_1 = descrpt.get_dim_rot_mat_1() self.dim_rot_mat = self.dim_rot_mat_1 * 3 self.useBN = False @@ -371,6 +448,23 @@ def get_sel_type(self): def get_out_size(self): return 9 + def compute_input_stats(self, all_stat, protection = 1e-2): + if not ('polarizability' in all_stat.keys()): + self.avgeig = np.zeros([9]) + warnings.warn('no polarizability data, cannot do data stat. use zeros as guess') + return + data = all_stat['polarizability'] + all_tmp = [] + for ss in range(len(data)): + tmp = np.concatenate(data[ss], axis = 0) + tmp = np.reshape(tmp, [-1, 3, 3]) + tmp,_ = np.linalg.eig(tmp) + tmp = np.absolute(tmp) + tmp = np.sort(tmp, axis = 1) + all_tmp.append(tmp) + all_tmp = np.concatenate(all_tmp, axis = 1) + self.avgeig = np.average(all_tmp, axis = 0) + def build (self, input_d, rot_mat, @@ -378,9 +472,8 @@ def build (self, reuse = None, suffix = '') : start_index = 0 - inputs = tf.reshape(input_d, [-1, self.dim_descrpt * natoms[0]]) + inputs = tf.cast(tf.reshape(input_d, [-1, self.dim_descrpt * natoms[0]]), self.fitting_precision) rot_mat = tf.reshape(rot_mat, [-1, self.dim_rot_mat * natoms[0]]) - shape = inputs.get_shape().as_list() count = 0 for type_i in range(self.ntypes): @@ -399,19 +492,27 @@ def build (self, layer = inputs_i for ii in range(0,len(self.n_neuron)) : if ii >= 1 and self.n_neuron[ii] == self.n_neuron[ii-1] : - layer+= one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, use_timestep = self.resnet_dt) + layer+= one_layer(layer, self.n_neuron[ii], 
name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, use_timestep = self.resnet_dt, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision) else : - layer = one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed) + layer = one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision) if self.fit_diag : + bavg = np.zeros(self.dim_rot_mat_1) + # bavg[0] = self.avgeig[0] + # bavg[1] = self.avgeig[1] + # bavg[2] = self.avgeig[2] # (nframes x natoms) x naxis - final_layer = one_layer(layer, self.dim_rot_mat_1, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed) + final_layer = one_layer(layer, self.dim_rot_mat_1, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, bavg = bavg, precision = self.fitting_precision) # (nframes x natoms) x naxis final_layer = tf.reshape(final_layer, [tf.shape(inputs)[0] * natoms[2+type_i], self.dim_rot_mat_1]) # (nframes x natoms) x naxis x naxis final_layer = tf.matrix_diag(final_layer) - else : + else : + bavg = np.zeros(self.dim_rot_mat_1*self.dim_rot_mat_1) + # bavg[0*self.dim_rot_mat_1+0] = self.avgeig[0] + # bavg[1*self.dim_rot_mat_1+1] = self.avgeig[1] + # bavg[2*self.dim_rot_mat_1+2] = self.avgeig[2] # (nframes x natoms) x (naxis x naxis) - final_layer = one_layer(layer, self.dim_rot_mat_1*self.dim_rot_mat_1, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed) + final_layer = one_layer(layer, self.dim_rot_mat_1*self.dim_rot_mat_1, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, bavg = bavg, precision = self.fitting_precision) # (nframes x natoms) x naxis x naxis final_layer = tf.reshape(final_layer, 
[tf.shape(inputs)[0] * natoms[2+type_i], self.dim_rot_mat_1, self.dim_rot_mat_1]) # (nframes x natoms) x naxis x naxis @@ -422,6 +523,10 @@ def build (self, final_layer = tf.matmul(rot_mat_i, final_layer, transpose_a = True) # nframes x natoms x 3 x 3 final_layer = tf.reshape(final_layer, [tf.shape(inputs)[0], natoms[2+type_i], 3, 3]) + # shift and scale + sel_type_idx = self.sel_type.index(type_i) + final_layer = final_layer * self.scale[sel_type_idx] + final_layer = final_layer + self.diag_shift[sel_type_idx] * tf.eye(3, batch_shape=[tf.shape(inputs)[0], natoms[2+type_i]], dtype = global_tf_float_precision) # concat the results if count == 0: @@ -430,7 +535,7 @@ def build (self, outs = tf.concat([outs, final_layer], axis = 1) count += 1 - return tf.reshape(outs, [-1]) + return tf.cast(tf.reshape(outs, [-1]), global_tf_float_precision) class GlobalPolarFittingSeA () : @@ -471,12 +576,16 @@ def __init__ (self, jdata, descrpt) : .add('neuron', list, default = [120,120,120], alias = 'n_neuron')\ .add('resnet_dt', bool, default = True)\ .add('sel_type', [list,int], default = [ii for ii in range(self.ntypes)], alias = 'dipole_type')\ - .add('seed', int) + .add('seed', int)\ + .add("activation_function", str, default = "tanh")\ + .add('precision', str, default = "default") class_data = args.parse(jdata) self.n_neuron = class_data['neuron'] self.resnet_dt = class_data['resnet_dt'] self.sel_type = class_data['sel_type'] self.seed = class_data['seed'] + self.fitting_activation_fn = get_activation_func(class_data["activation_function"]) + self.fitting_precision = get_precision(class_data['precision']) self.dim_rot_mat_1 = descrpt.get_dim_rot_mat_1() self.dim_rot_mat = self.dim_rot_mat_1 * 3 self.useBN = False @@ -484,6 +593,9 @@ def __init__ (self, jdata, descrpt) : def get_sel_type(self): return self.sel_type + def get_out_size(self): + return 3 + def build (self, input_d, rot_mat, @@ -491,9 +603,8 @@ def build (self, reuse = None, suffix = '') : start_index = 0 - inputs = 
tf.reshape(input_d, [-1, self.dim_descrpt * natoms[0]]) + inputs = tf.cast(tf.reshape(input_d, [-1, self.dim_descrpt * natoms[0]]), self.fitting_precision) rot_mat = tf.reshape(rot_mat, [-1, self.dim_rot_mat * natoms[0]]) - shape = inputs.get_shape().as_list() count = 0 for type_i in range(self.ntypes): @@ -512,11 +623,11 @@ def build (self, layer = inputs_i for ii in range(0,len(self.n_neuron)) : if ii >= 1 and self.n_neuron[ii] == self.n_neuron[ii-1] : - layer+= one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, use_timestep = self.resnet_dt) + layer+= one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, use_timestep = self.resnet_dt, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision) else : - layer = one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed) + layer = one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision) # (nframes x natoms) x naxis - final_layer = one_layer(layer, self.dim_rot_mat_1, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed) + final_layer = one_layer(layer, self.dim_rot_mat_1, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, precision = self.fitting_precision) # (nframes x natoms) x 1 * naxis final_layer = tf.reshape(final_layer, [tf.shape(inputs)[0] * natoms[2+type_i], 1, self.dim_rot_mat_1]) # (nframes x natoms) x 1 x 3(coord) @@ -531,4 +642,5 @@ def build (self, outs = tf.concat([outs, final_layer], axis = 1) count += 1 - return tf.reshape(outs, [-1]) + return tf.cast(tf.reshape(outs, [-1]), global_tf_float_precision) + # return tf.reshape(outs, [tf.shape(inputs)[0] * natoms[0] * 3 
// 3]) diff --git a/source/train/LearningRate.py b/source/train/LearningRate.py index a26882ef2b..fe66e52516 100644 --- a/source/train/LearningRate.py +++ b/source/train/LearningRate.py @@ -1,4 +1,3 @@ -import os,sys,warnings import numpy as np from deepmd.env import tf from deepmd.common import ClassArg @@ -7,15 +6,25 @@ class LearningRateExp (object) : def __init__ (self, jdata) : args = ClassArg()\ - .add('decay_steps', int, must = True)\ - .add('decay_rate', float, must = True)\ - .add('start_lr', float, must = True) - class_data = args.parse(jdata) - self.decay_steps_ = class_data['decay_steps'] - self.decay_rate_ = class_data['decay_rate'] - self.start_lr_ = class_data['start_lr'] + .add('decay_steps', int, must = False)\ + .add('decay_rate', float, must = False)\ + .add('start_lr', float, must = True)\ + .add('stop_lr', float, must = False) + self.cd = args.parse(jdata) + self.start_lr_ = self.cd['start_lr'] - def build(self, global_step) : + def build(self, global_step, stop_batch = None) : + if stop_batch is None: + self.decay_steps_ = self.cd['decay_steps'] if self.cd['decay_steps'] is not None else 5000 + self.decay_rate_ = self.cd['decay_rate'] if self.cd['decay_rate'] is not None else 0.95 + else: + self.stop_lr_ = self.cd['stop_lr'] if self.cd['stop_lr'] is not None else 5e-8 + default_ds = 100 if stop_batch // 10 > 100 else stop_batch // 100 + 1 + self.decay_steps_ = self.cd['decay_steps'] if self.cd['decay_steps'] is not None else default_ds + if self.decay_steps_ >= stop_batch: + self.decay_steps_ = default_ds + self.decay_rate_ = np.exp(np.log(self.stop_lr_ / self.start_lr_) / (stop_batch / self.decay_steps_)) + return tf.train.exponential_decay(self.start_lr_, global_step, self.decay_steps_, diff --git a/source/train/Loss.py b/source/train/Loss.py index 6d9abf7026..d939273f26 100644 --- a/source/train/Loss.py +++ b/source/train/Loss.py @@ -1,11 +1,7 @@ -import os,sys,warnings import numpy as np from deepmd.env import tf from deepmd.common import 
ClassArg, add_data_requirement -from deepmd.RunOptions import global_tf_float_precision -from deepmd.RunOptions import global_np_float_precision -from deepmd.RunOptions import global_ener_float_precision from deepmd.RunOptions import global_cvt_2_tf_float from deepmd.RunOptions import global_cvt_2_ener_float @@ -42,11 +38,11 @@ def __init__ (self, jdata, **kwarg) : self.has_ae = (self.start_pref_ae != 0 or self.limit_pref_ae != 0) self.has_pf = (self.start_pref_pf != 0 or self.limit_pref_pf != 0) # data required - add_data_requirement('energy', 1, atomic=False, must=False, high_prec=True) - add_data_requirement('force', 3, atomic=True, must=False, high_prec=False) - add_data_requirement('virial', 9, atomic=False, must=False, high_prec=False) - add_data_requirement('atom_ener', 1, atomic=True, must=False, high_prec=False) - add_data_requirement('atom_pref', 1, atomic=True, must=False, high_prec=False, repeat=3) + add_data_requirement('energy', 1, atomic=False, must=self.has_e, high_prec=True) + add_data_requirement('force', 3, atomic=True, must=self.has_f, high_prec=False) + add_data_requirement('virial', 9, atomic=False, must=self.has_v, high_prec=False) + add_data_requirement('atom_ener', 1, atomic=True, must=self.has_ae, high_prec=False) + add_data_requirement('atom_pref', 1, atomic=True, must=self.has_pf, high_prec=False, repeat=3) def build (self, learning_rate, @@ -75,7 +71,7 @@ def build (self, force_hat_reshape = tf.reshape (force_hat, [-1]) atom_pref_reshape = tf.reshape (atom_pref, [-1]) diff_f = force_hat_reshape - force_reshape - if self.relative_f is not None: + if self.relative_f is not None: force_hat_3 = tf.reshape(force_hat, [-1, 3]) norm_f = tf.reshape(tf.norm(force_hat_3, axis = 1), [-1, 1]) + self.relative_f diff_f_3 = tf.reshape(diff_f, [-1, 3]) @@ -173,11 +169,105 @@ def print_on_training(self, if self.has_v : print_str += prop_fmt % (np.sqrt(error_v_test) / natoms[0], np.sqrt(error_v_train) / natoms[0]) if self.has_pf: - print_str += prop_fmt 
% (np.sqrt(error_pf_test) / natoms[0], np.sqrt(error_pf_train) / natoms[0]) + print_str += prop_fmt % (np.sqrt(error_pf_test), np.sqrt(error_pf_train)) return print_str +class EnerDipoleLoss () : + def __init__ (self, jdata, **kwarg) : + self.starter_learning_rate = kwarg['starter_learning_rate'] + args = ClassArg()\ + .add('start_pref_e', float, must = True, default = 0.1) \ + .add('limit_pref_e', float, must = True, default = 1.00)\ + .add('start_pref_ed', float, must = True, default = 1.00)\ + .add('limit_pref_ed', float, must = True, default = 1.00) + class_data = args.parse(jdata) + self.start_pref_e = class_data['start_pref_e'] + self.limit_pref_e = class_data['limit_pref_e'] + self.start_pref_ed = class_data['start_pref_ed'] + self.limit_pref_ed = class_data['limit_pref_ed'] + # data required + add_data_requirement('energy', 1, atomic=False, must=True, high_prec=True) + add_data_requirement('energy_dipole', 3, atomic=False, must=True, high_prec=False) + + def build (self, + learning_rate, + natoms, + model_dict, + label_dict, + suffix): + coord = model_dict['coord'] + energy = model_dict['energy'] + atom_ener = model_dict['atom_ener'] + nframes = tf.shape(atom_ener)[0] + natoms = tf.shape(atom_ener)[1] + # build energy dipole + atom_ener0 = atom_ener - tf.reshape(tf.tile(tf.reshape(energy/global_cvt_2_ener_float(natoms), [-1, 1]), [1, natoms]), [nframes, natoms]) + coord = tf.reshape(coord, [nframes, natoms, 3]) + atom_ener0 = tf.reshape(atom_ener0, [nframes, 1, natoms]) + ener_dipole = tf.matmul(atom_ener0, coord) + ener_dipole = tf.reshape(ener_dipole, [nframes, 3]) + + energy_hat = label_dict['energy'] + ener_dipole_hat = label_dict['energy_dipole'] + find_energy = label_dict['find_energy'] + find_ener_dipole = label_dict['find_energy_dipole'] + + l2_ener_loss = tf.reduce_mean( tf.square(energy - energy_hat), name='l2_'+suffix) + + ener_dipole_reshape = tf.reshape(ener_dipole, [-1]) + ener_dipole_hat_reshape = tf.reshape(ener_dipole_hat, [-1]) + 
l2_ener_dipole_loss = tf.reduce_mean( tf.square(ener_dipole_reshape - ener_dipole_hat_reshape), name='l2_'+suffix) + + # atom_norm_ener = 1./ global_cvt_2_ener_float(natoms[0]) + atom_norm_ener = 1./ global_cvt_2_ener_float(natoms) + pref_e = global_cvt_2_ener_float(find_energy * (self.limit_pref_e + (self.start_pref_e - self.limit_pref_e) * learning_rate / self.starter_learning_rate) ) + pref_ed = global_cvt_2_tf_float(find_ener_dipole * (self.limit_pref_ed + (self.start_pref_ed - self.limit_pref_ed) * learning_rate / self.starter_learning_rate) ) + + l2_loss = 0 + more_loss = {} + l2_loss += atom_norm_ener * (pref_e * l2_ener_loss) + l2_loss += global_cvt_2_ener_float(pref_ed * l2_ener_dipole_loss) + more_loss['l2_ener_loss'] = l2_ener_loss + more_loss['l2_ener_dipole_loss'] = l2_ener_dipole_loss + + self.l2_l = l2_loss + self.l2_more = more_loss + return l2_loss, more_loss + + + def print_header(self) : + prop_fmt = ' %9s %9s' + print_str = '' + print_str += prop_fmt % ('l2_tst', 'l2_trn') + print_str += prop_fmt % ('l2_e_tst', 'l2_e_trn') + print_str += prop_fmt % ('l2_ed_tst', 'l2_ed_trn') + return print_str + + + def print_on_training(self, + sess, + natoms, + feed_dict_test, + feed_dict_batch) : + error_test, error_e_test, error_ed_test\ + = sess.run([self.l2_l, \ + self.l2_more['l2_ener_loss'], \ + self.l2_more['l2_ener_dipole_loss']], + feed_dict=feed_dict_test) + error_train, error_e_train, error_ed_train\ + = sess.run([self.l2_l, \ + self.l2_more['l2_ener_loss'], \ + self.l2_more['l2_ener_dipole_loss']], + feed_dict=feed_dict_batch) + print_str = "" + prop_fmt = " %9.2e %9.2e" + print_str += prop_fmt % (np.sqrt(error_test), np.sqrt(error_train)) + print_str += prop_fmt % (np.sqrt(error_e_test) / natoms[0], np.sqrt(error_e_train) / natoms[0]) + print_str += prop_fmt % (np.sqrt(error_ed_test), np.sqrt(error_ed_train)) + return print_str + class TensorLoss () : def __init__ (self, jdata, **kwarg) : @@ -190,6 +280,10 @@ def __init__ (self, jdata, **kwarg) : 
self.tensor_size = kwarg['tensor_size'] self.label_name = kwarg['label_name'] self.atomic = kwarg.get('atomic', True) + if jdata is not None: + self.scale = jdata.get('scale', 1.0) + else: + self.scale = 1.0 # data required add_data_requirement(self.label_name, self.tensor_size, @@ -206,7 +300,7 @@ def build (self, suffix): polar_hat = label_dict[self.label_name] polar = model_dict[self.tensor_name] - l2_loss = tf.reduce_mean( tf.square(polar - polar_hat), name='l2_'+suffix) + l2_loss = tf.reduce_mean( tf.square(self.scale*(polar - polar_hat)), name='l2_'+suffix) if not self.atomic : atom_norm = 1./ global_cvt_2_tf_float(natoms[0]) l2_loss = l2_loss * atom_norm diff --git a/source/train/Model.py b/source/train/Model.py index 9cbb8e1123..ba48bbc0a0 100644 --- a/source/train/Model.py +++ b/source/train/Model.py @@ -1,26 +1,70 @@ -import os,sys,warnings -import platform import numpy as np from deepmd.env import tf from collections import defaultdict from deepmd.TabInter import TabInter from deepmd.common import ClassArg -from deepmd.RunOptions import global_tf_float_precision -from deepmd.RunOptions import global_np_float_precision -from deepmd.RunOptions import global_ener_float_precision -from deepmd.RunOptions import global_cvt_2_tf_float from deepmd.RunOptions import global_cvt_2_ener_float +from deepmd.env import op_module + + +def _make_all_stat_ref(data, nbatches): + all_stat = defaultdict(list) + for ii in range(data.get_nsystems()) : + for jj in range(nbatches) : + stat_data = data.get_batch (sys_idx = ii) + for dd in stat_data: + if dd == "natoms_vec": + stat_data[dd] = stat_data[dd].astype(np.int32) + all_stat[dd].append(stat_data[dd]) + return all_stat + + +def make_all_stat(data, nbatches, merge_sys = True): + """ + pack data for statistics + Parameters + ---------- + data: + The data + merge_sys: bool (True) + Merge system data + Returns + ------- + all_stat: + A dictionary of list of list storing data for stat. 
+ if merge_sys == False data can be accessed by + all_stat[key][sys_idx][batch_idx][frame_idx] + else merge_sys == True can be accessed by + all_stat[key][batch_idx][frame_idx] + """ + all_stat = defaultdict(list) + for ii in range(data.get_nsystems()) : + sys_stat = defaultdict(list) + for jj in range(nbatches) : + stat_data = data.get_batch (sys_idx = ii) + for dd in stat_data: + if dd == "natoms_vec": + stat_data[dd] = stat_data[dd].astype(np.int32) + sys_stat[dd].append(stat_data[dd]) + for dd in sys_stat: + if merge_sys: + for bb in sys_stat[dd]: + all_stat[dd].append(bb) + else: + all_stat[dd].append(sys_stat[dd]) + return all_stat + +def merge_sys_stat(all_stat): + first_key = list(all_stat.keys())[0] + nsys = len(all_stat[first_key]) + ret = defaultdict(list) + for ii in range(nsys): + for dd in all_stat: + for bb in all_stat[dd][ii]: + ret[dd].append(bb) + return ret -if platform.system() == "Windows": - ext = "dll" -elif platform.system() == "Darwin": - ext = "dylib" -else: - ext = "so" -module_path = os.path.dirname(os.path.realpath(__file__)) + "/" -assert (os.path.isfile (module_path + "libop_abi.{}".format(ext) )), "op module does not exist" -op_module = tf.load_op_library(module_path + "libop_abi.{}".format(ext)) class Model() : model_type = 'ener' @@ -35,14 +79,12 @@ def __init__ (self, jdata, descrpt, fitting): args = ClassArg()\ .add('type_map', list, default = []) \ - .add('rcond', float, default = 1e-3) \ .add('data_stat_nbatch', int, default = 10) \ .add('data_stat_protect',float, default = 1e-2) \ .add('use_srtab', str) class_data = args.parse(jdata) self.type_map = class_data['type_map'] self.srtab_name = class_data['use_srtab'] - self.rcond = class_data['rcond'] self.data_stat_nbatch = class_data['data_stat_nbatch'] self.data_stat_protect = class_data['data_stat_protect'] if self.srtab_name is not None : @@ -68,26 +110,23 @@ def get_type_map (self) : return self.type_map def data_stat(self, data): - all_stat = defaultdict(list) - for ii in 
range(data.get_nsystems()) : - for jj in range(self.data_stat_nbatch) : - stat_data = data.get_batch (sys_idx = ii) - for dd in stat_data: - if dd == "natoms_vec": - stat_data[dd] = stat_data[dd].astype(np.int32) - all_stat[dd].append(stat_data[dd]) - self._compute_dstats (all_stat, protection = self.data_stat_protect) - self.bias_atom_e = data.compute_energy_shift(self.rcond) - - - def _compute_dstats (self, all_stat, protection = 1e-2) : - self.davg, self.dstd \ - = self.descrpt.compute_dstats(all_stat['coord'], - all_stat['box'], - all_stat['type'], - all_stat['natoms_vec'], - all_stat['default_mesh']) - self.fitting.compute_dstats(all_stat, protection = protection) + all_stat = make_all_stat(data, self.data_stat_nbatch, merge_sys = False) + m_all_stat = merge_sys_stat(all_stat) + self._compute_input_stat(m_all_stat, protection = self.data_stat_protect) + self._compute_output_stat(all_stat) + # self.bias_atom_e = data.compute_energy_shift(self.rcond) + + def _compute_input_stat (self, all_stat, protection = 1e-2) : + self.descrpt.compute_input_stats(all_stat['coord'], + all_stat['box'], + all_stat['type'], + all_stat['natoms_vec'], + all_stat['default_mesh']) + self.fitting.compute_input_stats(all_stat, protection = protection) + + def _compute_output_stat (self, all_stat) : + self.fitting.compute_output_stats(all_stat) + def build (self, coord_, @@ -129,8 +168,6 @@ def build (self, natoms, box, mesh, - davg = self.davg, - dstd = self.dstd, suffix = suffix, reuse = reuse) dout = tf.identity(dout, name='o_descriptor') @@ -143,7 +180,6 @@ def build (self, atom_ener = self.fitting.build (dout, input_dict, natoms, - bias_atom_e = self.bias_atom_e, reuse = reuse, suffix = suffix) @@ -219,6 +255,8 @@ def build (self, model_dict['virial'] = virial model_dict['atom_ener'] = energy_raw model_dict['atom_virial'] = atom_virial + model_dict['coord'] = coord + model_dict['atype'] = atype return model_dict @@ -234,10 +272,12 @@ def __init__ (self, jdata, descrpt, fitting, 
var_name): args = ClassArg()\ .add('type_map', list, default = []) \ - .add('data_stat_nbatch', int, default = 10) + .add('data_stat_nbatch', int, default = 10) \ + .add('data_stat_protect',float, default = 1e-2) class_data = args.parse(jdata) self.type_map = class_data['type_map'] self.data_stat_nbatch = class_data['data_stat_nbatch'] + self.data_stat_protect = class_data['data_stat_protect'] def get_rcut (self) : return self.rcut @@ -255,23 +295,23 @@ def get_out_size (self) : return self.fitting.get_out_size() def data_stat(self, data): - all_stat = defaultdict(list) - for ii in range(data.get_nsystems()) : - for jj in range(self.data_stat_nbatch) : - stat_data = data.get_batch (sys_idx = ii) - for dd in stat_data: - if dd == "natoms_vec": - stat_data[dd] = stat_data[dd].astype(np.int32) - all_stat[dd].append(stat_data[dd]) - self._compute_dstats (all_stat) - - def _compute_dstats (self, all_stat) : - self.davg, self.dstd \ - = self.descrpt.compute_dstats(all_stat['coord'], - all_stat['box'], - all_stat['type'], - all_stat['natoms_vec'], - all_stat['default_mesh']) + all_stat = make_all_stat(data, self.data_stat_nbatch, merge_sys = False) + m_all_stat = merge_sys_stat(all_stat) + self._compute_input_stat (m_all_stat, protection = self.data_stat_protect) + self._compute_output_stat(all_stat) + + def _compute_input_stat(self, all_stat, protection = 1e-2) : + self.descrpt.compute_input_stats(all_stat['coord'], + all_stat['box'], + all_stat['type'], + all_stat['natoms_vec'], + all_stat['default_mesh']) + if hasattr(self.fitting, 'compute_input_stats'): + self.fitting.compute_input_stats(all_stat, protection = protection) + + def _compute_output_stat (self, all_stat) : + if hasattr(self.fitting, 'compute_output_stats'): + self.fitting.compute_output_stats(all_stat) def build (self, coord_, @@ -292,9 +332,10 @@ def build (self, t_mt = tf.constant(self.model_type, name = 'model_type', dtype = tf.string) + t_od = tf.constant(self.get_out_size(), + name = 'output_dim', + 
dtype = tf.int32) - coord = tf.reshape (coord_, [-1, natoms[1] * 3]) - atype = tf.reshape (atype_, [-1, natoms[1]]) dout \ = self.descrpt.build(coord_, @@ -302,8 +343,6 @@ def build (self, natoms, box, mesh, - davg = self.davg, - dstd = self.dstd, suffix = suffix, reuse = reuse) dout = tf.identity(dout, name='o_descriptor') diff --git a/source/train/Network.py b/source/train/Network.py index bf8c50775c..ed188c085d 100644 --- a/source/train/Network.py +++ b/source/train/Network.py @@ -1,37 +1,39 @@ -import os,warnings import numpy as np from deepmd.env import tf from deepmd.RunOptions import global_tf_float_precision -from deepmd.RunOptions import global_np_float_precision -from deepmd.RunOptions import global_ener_float_precision def one_layer(inputs, outputs_size, activation_fn=tf.nn.tanh, + precision = global_tf_float_precision, stddev=1.0, bavg=0.0, name='linear', reuse=None, seed=None, use_timestep = False, + trainable = True, useBN = False): with tf.variable_scope(name, reuse=reuse): shape = inputs.get_shape().as_list() w = tf.get_variable('matrix', [shape[1], outputs_size], - global_tf_float_precision, - tf.random_normal_initializer(stddev=stddev/np.sqrt(shape[1]+outputs_size), seed = seed)) + precision, + tf.random_normal_initializer(stddev=stddev/np.sqrt(shape[1]+outputs_size), seed = seed), + trainable = trainable) b = tf.get_variable('bias', [outputs_size], - global_tf_float_precision, - tf.random_normal_initializer(stddev=stddev, mean = bavg, seed = seed)) + precision, + tf.random_normal_initializer(stddev=stddev, mean = bavg, seed = seed), + trainable = trainable) hidden = tf.matmul(inputs, w) + b if activation_fn != None and use_timestep : idt = tf.get_variable('idt', [outputs_size], - global_tf_float_precision, - tf.random_normal_initializer(stddev=0.001, mean = 0.1, seed = seed)) + precision, + tf.random_normal_initializer(stddev=0.001, mean = 0.1, seed = seed), + trainable = trainable) if activation_fn != None: if useBN: None diff --git 
a/source/train/RunOptions.py.in b/source/train/RunOptions.py.in index aa6bf081db..63d3544ca6 100644 --- a/source/train/RunOptions.py.in +++ b/source/train/RunOptions.py.in @@ -1,5 +1,6 @@ import os,sys -import tensorflow as tf +from deepmd.env import tf +from deepmd.env import get_tf_default_nthreads import numpy as np import deepmd.cluster.Slurm as Slurm import deepmd.cluster.Local as Local @@ -28,14 +29,6 @@ global_git_branch='@GIT_BRANCH@' global_tf_include_dir='@TensorFlow_INCLUDE_DIRS@' global_tf_libs='@TensorFlow_LIBRARY@' -def _get_threads_env () : - num_intra_threads = None - if 'OMP_NUM_THREADS' in os.environ : - num_intra_threads = int(os.environ['OMP_NUM_THREADS']) - else : - num_intra_threads = 0 - return num_intra_threads - def _is_slurm() : return "SLURM_JOB_NODELIST" in os.environ @@ -106,10 +99,6 @@ class RunOptions (object) : def __init__ (self, args, try_distrib = False): - # thread settings - self.num_intra_threads = _get_threads_env() - self.num_inter_threads = 0 - # distributed tasks if try_distrib : self._try_init_mpi() @@ -132,8 +121,6 @@ class RunOptions (object) : if args.restart is not None: self.restart = os.path.abspath(args.restart) self.init_mode = "restart" - if args.inter_threads is not None : - self.num_inter_threads = args.inter_threads def message (self, msg) : if self.verbose : @@ -167,28 +154,32 @@ class RunOptions (object) : def print_summary(self) : msg = "" msg += "---Summary of the training---------------------------------------\n" - msg += 'installed to: %s\n' % global_install_prefix - msg += 'source : %s\n' % global_git_summ - msg += 'source brach: %s\n' % global_git_branch - msg += 'source commit: %s\n' % global_git_hash - msg += 'source commit at: %s\n' % global_git_date - msg += 'build float prec: %s\n' % global_float_prec - msg += 'build with tf inc: %s\n' % global_tf_include_dir + msg += 'installed to: %s\n' % global_install_prefix + msg += 'source : %s\n' % global_git_summ + msg += 'source brach: %s\n' % 
global_git_branch + msg += 'source commit: %s\n' % global_git_hash + msg += 'source commit at: %s\n' % global_git_date + msg += 'build float prec: %s\n' % global_float_prec + msg += 'build with tf inc: %s\n' % global_tf_include_dir for idx,ii in enumerate(global_tf_libs.split(';')) : if idx == 0 : - msg += 'build with tf lib: %s\n' % ii + msg += 'build with tf lib: %s\n' % ii else : - msg += ' %s\n' % ii + msg += ' %s\n' % ii if self.is_distrib: msg += "distributed\n" - msg += "ps list: %s\n" % str(self.cluster['ps']) - msg += "worker list: %s\n" % str(self.cluster['worker']) - msg += "chief on: %s\n" % self.nodename + msg += "ps list: %s\n" % str(self.cluster['ps']) + msg += "worker list: %s\n" % str(self.cluster['worker']) + msg += "chief on: %s\n" % self.nodename else : - msg += "running on: %s\n" % self.nodename - msg += "gpu per node: %s\n" % self.gpus - msg += "num_inter_threads: %d\n" % self.num_inter_threads - msg += "num_intra_threads: %d\n" % self.num_intra_threads + msg += "running on: %s\n" % self.nodename + if self.gpus is None: + msg += "CUDA_VISIBLE_DEVICES: unset\n" + else: + msg += "CUDA_VISIBLE_DEVICES: %s\n" % self.gpus + intra, inter = get_tf_default_nthreads() + msg += "num_intra_threads: %d\n" % intra + msg += "num_inter_threads: %d\n" % inter msg += "-----------------------------------------------------------------\n" self.message(msg) diff --git a/source/train/TabInter.py b/source/train/TabInter.py index e6de0bb42e..99190e1191 100644 --- a/source/train/TabInter.py +++ b/source/train/TabInter.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -import os, sys, shutil import numpy as np from scipy.interpolate import CubicSpline diff --git a/source/train/Trainer.py b/source/train/Trainer.py index fb986399e7..50db1adfbc 100644 --- a/source/train/Trainer.py +++ b/source/train/Trainer.py @@ -1,42 +1,25 @@ #!/usr/bin/env python3 import os -import platform -import sys import time import shutil -import warnings import numpy as np from deepmd.env import tf 
+from deepmd.env import default_tf_session_config from deepmd.RunOptions import global_tf_float_precision -from deepmd.RunOptions import global_np_float_precision from deepmd.RunOptions import global_ener_float_precision -from deepmd.RunOptions import global_cvt_2_tf_float -from deepmd.RunOptions import global_cvt_2_ener_float from deepmd.Fitting import EnerFitting, WFCFitting, PolarFittingLocFrame, PolarFittingSeA, GlobalPolarFittingSeA, DipoleFittingSeA from deepmd.DescrptLocFrame import DescrptLocFrame from deepmd.DescrptSeA import DescrptSeA from deepmd.DescrptSeR import DescrptSeR from deepmd.DescrptSeAR import DescrptSeAR from deepmd.Model import Model, WFCModel, DipoleModel, PolarModel, GlobalPolarModel -from deepmd.Loss import EnerStdLoss, TensorLoss +from deepmd.Loss import EnerStdLoss, EnerDipoleLoss, TensorLoss from deepmd.LearningRate import LearningRateExp -from tensorflow.python.framework import ops from tensorflow.python.client import timeline - -# load force module -if platform.system() == "Windows": - ext = "dll" -elif platform.system() == "Darwin": - ext = "dylib" -else: - ext = "so" -module_path = os.path.dirname(os.path.realpath(__file__)) + "/" -assert (os.path.isfile (module_path + "libop_abi.{}".format(ext) )), "op module does not exist" -op_module = tf.load_op_library(module_path + "libop_abi.{}".format(ext)) +from deepmd.env import op_module # load grad of force module -sys.path.append (module_path ) import deepmd._prod_force_grad import deepmd._prod_virial_grad import deepmd._prod_force_se_a_grad @@ -45,10 +28,8 @@ import deepmd._prod_virial_se_r_grad import deepmd._soft_min_force_grad import deepmd._soft_min_virial_grad -from deepmd.RunOptions import RunOptions -from deepmd.TabInter import TabInter -from deepmd.common import j_must_have, ClassArg, add_data_requirement, data_requirement +from deepmd.common import j_must_have, ClassArg def _is_subdir(path, directory): path = os.path.realpath(path) @@ -144,10 +125,18 @@ def _init_param(self, 
jdata): # infer loss type by fitting_type try : loss_param = jdata['loss'] + loss_type = loss_param.get('type', 'std') except: loss_param = None + loss_type = 'std' + if fitting_type == 'ener': - self.loss = EnerStdLoss(loss_param, starter_learning_rate = self.lr.start_lr()) + if loss_type == 'std': + self.loss = EnerStdLoss(loss_param, starter_learning_rate = self.lr.start_lr()) + elif loss_type == 'ener_dipole': + self.loss = EnerDipoleLoss(loss_param, starter_learning_rate = self.lr.start_lr()) + else: + raise RuntimeError('unknow loss type') elif fitting_type == 'wfc': self.loss = TensorLoss(loss_param, model = self.model, @@ -189,7 +178,8 @@ def _init_param(self, jdata): .add('timing_in_training', bool, default = True)\ .add('profiling', bool, default = False)\ .add('profiling_file',str, default = 'timeline.json')\ - .add('sys_weights', list ) + .add('sys_probs', list )\ + .add('auto_prob_style', str, default = "prob_sys_size") tr_data = tr_args.parse(training_param) self.numb_test = tr_data['numb_test'] self.disp_file = tr_data['disp_file'] @@ -200,7 +190,8 @@ def _init_param(self, jdata): self.timing_in_training = tr_data['timing_in_training'] self.profiling = tr_data['profiling'] self.profiling_file = tr_data['profiling_file'] - self.sys_weights = tr_data['sys_weights'] + self.sys_probs = tr_data['sys_probs'] + self.auto_prob_style = tr_data['auto_prob_style'] self.useBN = False if fitting_type == 'ener' and self.fitting.get_numb_fparam() > 0 : self.numb_fparam = self.fitting.get_numb_fparam() @@ -212,9 +203,11 @@ def _message (self, msg) : self.run_opt.message(msg) def build (self, - data) : + data, + stop_batch = 0) : self.ntypes = self.model.get_ntypes() assert (self.ntypes == data.get_ntypes()), "ntypes should match that found in data" + self.stop_batch = stop_batch self.batch_size = data.get_batch_size() @@ -241,7 +234,7 @@ def build (self, def _build_lr(self): self._extra_train_ops = [] self.global_step = tf.train.get_or_create_global_step() - 
self.learning_rate = self.lr.build(self.global_step) + self.learning_rate = self.lr.build(self.global_step, self.stop_batch) self._message("built lr") def _build_network(self, data): @@ -260,7 +253,6 @@ def _build_network(self, data): self.place_holders['natoms_vec'] = tf.placeholder(tf.int32, [self.ntypes+2], name='t_natoms') self.place_holders['default_mesh'] = tf.placeholder(tf.int32, [None], name='t_mesh') self.place_holders['is_training'] = tf.placeholder(tf.bool) - self.model_pred\ = self.model.build (self.place_holders['coord'], self.place_holders['type'], @@ -299,10 +291,7 @@ def _build_training(self): self._message("built training") def _init_sess_serial(self) : - self.sess = tf.Session( - config=tf.ConfigProto(intra_op_parallelism_threads=self.run_opt.num_intra_threads, - inter_op_parallelism_threads=self.run_opt.num_inter_threads - )) + self.sess = tf.Session(config=default_tf_session_config) self.saver = tf.train.Saver() saver = self.saver if self.run_opt.init_mode == 'init_from_scratch' : @@ -373,8 +362,8 @@ def _init_sess_distrib(self): # save_checkpoint_steps = self.save_freq) def train (self, - data, - stop_batch) : + data) : + stop_batch = self.stop_batch if self.run_opt.is_distrib : self._init_sess_distrib() else : @@ -388,9 +377,11 @@ def train (self, cur_batch = self.sess.run(self.global_step) is_first_step = True self.cur_batch = cur_batch - self.run_opt.message("start training at lr %.2e (== %.2e), final lr will be %.2e" % + self.run_opt.message("start training at lr %.2e (== %.2e), decay_step %d, decay_rate %f, final lr will be %.2e" % (self.sess.run(self.learning_rate), self.lr.value(cur_batch), + self.lr.decay_steps_, + self.lr.decay_rate_, self.lr.value(stop_batch)) ) @@ -402,8 +393,9 @@ def train (self, train_time = 0 while cur_batch < stop_batch : - batch_data = data.get_batch (sys_weights = self.sys_weights) - cur_batch_size = batch_data["coord"].shape[0] + batch_data = data.get_batch (sys_probs = self.sys_probs, + auto_prob_style = 
self.auto_prob_style + ) feed_dict_batch = {} for kk in batch_data.keys(): if kk == 'find_type' or kk == 'type' : @@ -465,7 +457,7 @@ def test_on_the_fly (self, fp, data, feed_dict_batch) : - test_data = data.get_test () + test_data = data.get_test(ntests = self.numb_test) feed_dict_test = {} for kk in test_data.keys(): if kk == 'find_type' or kk == 'type' : diff --git a/source/train/calculator.py b/source/train/calculator.py index 1179eef891..37fb7ad412 100644 --- a/source/train/calculator.py +++ b/source/train/calculator.py @@ -32,10 +32,13 @@ class DP(Calculator): name = "DP" implemented_properties = ["energy", "forces", "stress"] - def __init__(self, model, label="DP", **kwargs): + def __init__(self, model, label="DP", type_dict=None, **kwargs): Calculator.__init__(self, label=label, **kwargs) self.dp = DeepPot(model) - self.type_dict = dict(zip(self.dp.get_type_map(), range(self.dp.get_ntypes()))) + if type_dict: + self.type_dict=type_dict + else: + self.type_dict = dict(zip(self.dp.get_type_map(), range(self.dp.get_ntypes()))) def calculate(self, atoms=None, properties=["energy", "forces", "stress"], system_changes=all_changes): coord = atoms.get_positions().reshape([1, -1]) diff --git a/source/train/common.py b/source/train/common.py index fc1f0817a6..c250e4cdfc 100644 --- a/source/train/common.py +++ b/source/train/common.py @@ -1,7 +1,30 @@ -import warnings +import os,warnings,fnmatch +import numpy as np +import math +from deepmd.env import tf +from deepmd.RunOptions import global_tf_float_precision -data_requirement = {} +def gelu(x): + """Gaussian Error Linear Unit. + This is a smoother version of the RELU. + Original paper: https://arxiv.org/abs/1606.08415 + Args: + x: float Tensor to perform activation. + Returns: + `x` with the GELU activation applied. 
+ """ + cdf = 0.5 * (1.0 + tf.tanh((math.sqrt(2 / math.pi) * (x + 0.044715 * tf.pow(x, 3))))) + return x * cdf +data_requirement = {} +activation_fn_dict = { + "relu": tf.nn.relu, + "relu6": tf.nn.relu6, + "softplus": tf.nn.softplus, + "sigmoid": tf.sigmoid, + "tanh": tf.nn.tanh, + "gelu": gelu +} def add_data_requirement(key, ndof, atomic = False, @@ -18,6 +41,37 @@ def add_data_requirement(key, } +def select_idx_map(atom_type, + type_sel): + sort_type_sel = np.sort(type_sel) + idx_map = np.array([], dtype = int) + for ii in sort_type_sel: + idx_map = np.append(idx_map, np.where(atom_type == ii)) + return idx_map + + +def make_default_mesh(test_box, cell_size = 3.0) : + # nframes = test_box.shape[0] + # default_mesh = np.zeros([nframes, 6], dtype = np.int32) + # for ff in range(nframes): + # ncell = np.ones (3, dtype=np.int32) + # for ii in range(3) : + # ncell[ii] = int ( np.linalg.norm(test_box[ff][ii]) / cell_size ) + # if (ncell[ii] < 2) : ncell[ii] = 2 + # default_mesh[ff][3] = ncell[0] + # default_mesh[ff][4] = ncell[1] + # default_mesh[ff][5] = ncell[2] + # return default_mesh + nframes = test_box.shape[0] + lboxv = np.linalg.norm(test_box.reshape([-1, 3, 3]), axis = 2) + avg_lboxv = np.average(lboxv, axis = 0) + ncell = (avg_lboxv / cell_size).astype(np.int32) + ncell[ncell < 2] = 2 + default_mesh = np.zeros (6, dtype = np.int32) + default_mesh[3:6] = ncell + return default_mesh + + class ClassArg () : def __init__ (self) : self.arg_dict = {} @@ -107,4 +161,28 @@ def j_must_have_d (jdata, key, deprecated_key) : def j_have (jdata, key) : return key in jdata.keys() + +def get_activation_func(activation_fn): + if activation_fn not in activation_fn_dict: + raise RuntimeError(activation_fn+" is not a valid activation function") + return activation_fn_dict[activation_fn] + +def expand_sys_str(root_dir): + matches = [] + for root, dirnames, filenames in os.walk(root_dir, followlinks=True): + for filename in fnmatch.filter(filenames, 'type.raw'): + 
matches.append(root) + return matches + +def get_precision(precision): + if precision == "default": + return global_tf_float_precision + elif precision == "float16": + return tf.float16 + elif precision == "float32": + return tf.float32 + elif precision == "float64": + return tf.float64 + else: + raise RuntimeError("%d is not a valid precision" % precision) diff --git a/source/train/compat.py b/source/train/compat.py index 0cf522e500..40ac118e23 100644 --- a/source/train/compat.py +++ b/source/train/compat.py @@ -1,4 +1,4 @@ -import os,json,warnings +import json,warnings from deepmd.common import j_have,j_must_have,j_must_have_d def convert_input_v0_v1(jdata, warning = True, dump = None) : @@ -37,10 +37,6 @@ def _smth_model(jdata): return model def _nonsmth_descriptor(jdata) : - output = {} - seed = None - if j_have (jdata, 'seed') : - seed = jdata['seed'] # model descriptor = {} descriptor['type'] = 'loc_frame' diff --git a/source/train/env.py b/source/train/env.py index 22630724ff..dec9ec8eaa 100644 --- a/source/train/env.py +++ b/source/train/env.py @@ -1,5 +1,6 @@ import os import logging +import platform import numpy as np from imp import reload @@ -10,10 +11,11 @@ except ImportError: import tensorflow as tf -def set_env_if_empty(key, value): +def set_env_if_empty(key, value, verbose=True): if os.environ.get(key) is None: os.environ[key] = value - logging.warn("Environment variable {} is empty. Use the default value {}".format(key, value)) + if verbose: + logging.warn("Environment variable {} is empty. 
Use the default value {}".format(key, value)) def set_mkl(): """Tuning MKL for the best performance @@ -30,3 +32,32 @@ def set_mkl(): set_env_if_empty("KMP_BLOCKTIME", "0") set_env_if_empty("KMP_AFFINITY", "granularity=fine,verbose,compact,1,0") reload(np) + +def set_tf_default_nthreads(): + set_env_if_empty("TF_INTRA_OP_PARALLELISM_THREADS", "0", verbose=False) + set_env_if_empty("TF_INTER_OP_PARALLELISM_THREADS", "0", verbose=False) + +def get_tf_default_nthreads(): + return int(os.environ.get('TF_INTRA_OP_PARALLELISM_THREADS')), int(os.environ.get('TF_INTER_OP_PARALLELISM_THREADS')) + +def get_tf_session_config(): + set_tf_default_nthreads() + intra, inter = get_tf_default_nthreads() + return tf.ConfigProto(intra_op_parallelism_threads=intra, inter_op_parallelism_threads=inter) + +def get_module(module_name): + """Load force module.""" + if platform.system() == "Windows": + ext = "dll" + elif platform.system() == "Darwin": + ext = "dylib" + else: + ext = "so" + module_path = os.path.dirname(os.path.realpath(__file__)) + "/" + assert (os.path.isfile (module_path + "{}.{}".format(module_name, ext) )), "module %s does not exist" % module_name + module = tf.load_op_library(module_path + "{}.{}".format(module_name, ext)) + return module + +op_module = get_module("libop_abi") +op_grads_module = get_module("libop_grads") +default_tf_session_config = get_tf_session_config() diff --git a/source/train/main.py b/source/train/main.py index c0e4a173be..1e35d3bf17 100644 --- a/source/train/main.py +++ b/source/train/main.py @@ -4,6 +4,7 @@ from .freeze import freeze from .config import config from .test import test +from .transform import transform def main () : parser = argparse.ArgumentParser( @@ -15,14 +16,16 @@ def main () : # help="the output json file") default_num_inter_threads = 0 + parser_transform = subparsers.add_parser('transform', help='pass parameters to another model') + parser_transform.add_argument('-r', "--raw-model", default = "raw_frozen_model.pb", 
type=str, + help = "the model receiving parameters") + parser_transform.add_argument("-o","--old-model", default = "old_frozen_model.pb", type=str, + help='the model providing parameters') + parser_transform.add_argument("-n", "--output", default = "frozen_model.pb", type=str, + help = "the model after passing parameters") parser_train = subparsers.add_parser('train', help='train a model') parser_train.add_argument('INPUT', help='the input parameter file in json format') - parser_train.add_argument('-t','--inter-threads', type = int, default = default_num_inter_threads, - help= - 'With default value %d. ' % default_num_inter_threads + - 'Setting the "inter_op_parallelism_threads" key for the tensorflow, ' + - 'the "intra_op_parallelism_threads" will be set by the env variable OMP_NUM_THREADS') parser_train.add_argument('--init-model', type = str, help= 'Initialize the model by the provided checkpoint.') @@ -42,7 +45,7 @@ def main () : parser_tst.add_argument("-m", "--model", default="frozen_model.pb", type=str, help="Frozen model file to import") parser_tst.add_argument("-s", "--system", default=".", type=str, - help="The system dir") + help="The system dir. 
Recursively detect systems in this directory") parser_tst.add_argument("-S", "--set-prefix", default="set", type=str, help="The set prefix") parser_tst.add_argument("-n", "--numb-test", default=100, type=int, @@ -67,5 +70,7 @@ def main () : config(args) elif args.command == 'test' : test(args) + elif args.command == 'transform' : + transform(args) else : raise RuntimeError('unknown command ' + args.command) diff --git a/source/train/print_old_model.py b/source/train/print_old_model.py index 9e47968672..14719723f9 100644 --- a/source/train/print_old_model.py +++ b/source/train/print_old_model.py @@ -12,7 +12,7 @@ from deepmd.DataSystem import DataSystem from deepmd.Model import NNPModel from deepmd.Model import LearingRate -from deepmd.common import j_must_have, j_must_have_d, j_have +from deepmd.common import j_must_have def gen_data() : tmpdata = Data(rand_pert = 0.1, seed = 1) @@ -41,7 +41,6 @@ def compute_efv(jfile): test_size = j_must_have(jdata, 'numb_test') batch_size = 1 test_size = 1 - stop_batch = j_must_have(jdata, 'stop_batch') rcut = j_must_have (jdata, 'rcut') data = DataSystem(systems, set_pfx, batch_size, test_size, rcut, run_opt) diff --git a/source/train/test.py b/source/train/test.py index a5a273de08..ef71b684da 100644 --- a/source/train/test.py +++ b/source/train/test.py @@ -7,6 +7,7 @@ import numpy as np from deepmd.Data import DeepmdData +from deepmd.common import expand_sys_str from deepmd import DeepEval from deepmd import DeepPot from deepmd import DeepDipole @@ -16,25 +17,82 @@ def test (args): de = DeepEval(args.model) + all_sys = expand_sys_str(args.system) + if len(all_sys) == 0: + print('Did not find valid system') + err_coll = [] + siz_coll = [] if de.model_type == 'ener': - test_ener(args) + dp = DeepPot(args.model) elif de.model_type == 'dipole': - test_dipole(args) + dp = DeepDipole(args.model) elif de.model_type == 'polar': - test_polar(args) + dp = DeepPolar(args.model) elif de.model_type == 'wfc': - test_wfc(args) + dp = 
DeepWFC(args.model) else : raise RuntimeError('unknow model type '+de.model_type) + for ii in all_sys: + args.system = ii + print ("# ---------------output of dp test--------------- ") + print ("# testing system : " + ii) + if de.model_type == 'ener': + err, siz = test_ener(dp, args) + elif de.model_type == 'dipole': + err, siz = test_dipole(dp, args) + elif de.model_type == 'polar': + err, siz = test_polar(dp, args) + elif de.model_type == 'wfc': + err, siz = test_wfc(dp, args) + else : + raise RuntimeError('unknow model type '+de.model_type) + print ("# ----------------------------------------------- ") + err_coll.append(err) + siz_coll.append(siz) + avg_err = weighted_average(err_coll, siz_coll) + if len(all_sys) != len(err_coll): + print('Not all systems are tested! Check if the systems are valid') + if len(all_sys) > 1: + print ("# ----------weighted average of errors----------- ") + print ("# number of systems : %d" % len(all_sys)) + if de.model_type == 'ener': + print_ener_sys_avg(avg_err) + elif de.model_type == 'dipole': + print_dipole_sys_avg(avg_err) + elif de.model_type == 'polar': + print_polar_sys_avg(avg_err) + elif de.model_type == 'wfc': + print_wfc_sys_avg(avg_err) + else : + raise RuntimeError('unknow model type '+de.model_type) + print ("# ----------------------------------------------- ") + def l2err (diff) : return np.sqrt(np.average (diff*diff)) -def test_ener (args) : + +def weighted_average(err_coll, siz_coll): + nsys = len(err_coll) + nitems = len(err_coll[0]) + assert(len(err_coll) == len(siz_coll)) + sum_err = np.zeros(nitems) + sum_siz = np.zeros(nitems) + for sys_error, sys_size in zip(err_coll, siz_coll): + for ii in range(nitems): + ee = sys_error[ii] + ss = sys_size [ii] + sum_err[ii] += ee * ee * ss + sum_siz[ii] += ss + for ii in range(nitems): + sum_err[ii] = np.sqrt(sum_err[ii] / sum_siz[ii]) + return sum_err + + +def test_ener (dp, args) : if args.rand_seed is not None : np.random.seed(args.rand_seed % (2**32)) - dp = 
DeepPot(args.model) data = DeepmdData(args.system, args.set_prefix, shuffle_test = args.shuffle_test, type_map = dp.get_type_map()) data.add('energy', 1, atomic=False, must=False, high_prec=True) data.add('force', 3, atomic=True, must=False, high_prec=False) @@ -49,8 +107,11 @@ def test_ener (args) : nframes = test_data["box"].shape[0] numb_test = args.numb_test numb_test = min(nframes, numb_test) + coord = test_data["coord"][:numb_test].reshape([numb_test, -1]) box = test_data["box"][:numb_test] + if not data.pbc: + box = None atype = test_data["type"][0] if dp.get_dim_fparam() > 0: fparam = test_data["fparam"][:numb_test] @@ -60,13 +121,24 @@ def test_ener (args) : aparam = test_data["aparam"][:numb_test] else : aparam = None + detail_file = args.detail_file + if detail_file is not None: + atomic = True + else: + atomic = False - energy, force, virial, ae, av = dp.eval(coord, box, atype, fparam = fparam, aparam = aparam, atomic = True) + ret = dp.eval(coord, box, atype, fparam = fparam, aparam = aparam, atomic = atomic) + energy = ret[0] + force = ret[1] + virial = ret[2] energy = energy.reshape([numb_test,1]) force = force.reshape([numb_test,-1]) virial = virial.reshape([numb_test,9]) - ae = ae.reshape([numb_test,-1]) - av = av.reshape([numb_test,-1]) + if atomic: + ae = ret[3] + av = ret[4] + ae = ae.reshape([numb_test,-1]) + av = av.reshape([numb_test,-1]) l2e = (l2err (energy - test_data["energy"][:numb_test].reshape([-1,1]))) l2f = (l2err (force - test_data["force"] [:numb_test])) @@ -82,7 +154,6 @@ def test_ener (args) : print ("Virial L2err : %e eV" % l2v) print ("Virial L2err/Natoms : %e eV" % l2va) - detail_file = args.detail_file if detail_file is not None : pe = np.concatenate((np.reshape(test_data["energy"][:numb_test], [-1,1]), np.reshape(energy, [-1,1])), @@ -99,13 +170,19 @@ def test_ener (args) : axis = 1) np.savetxt(detail_file+".v.out", pv, header = 'data_vxx data_vxy data_vxz data_vyx data_vyy data_vyz data_vzx data_vzy data_vzz pred_vxx 
pred_vxy pred_vxz pred_vyx pred_vyy pred_vyz pred_vzx pred_vzy pred_vzz') + return [l2ea, l2f, l2va], [energy.size, force.size, virial.size] + +def print_ener_sys_avg(avg): + print ("Energy L2err/Natoms : %e eV" % avg[0]) + print ("Force L2err : %e eV/A" % avg[1]) + print ("Virial L2err/Natoms : %e eV" % avg[2]) -def test_wfc (args) : + +def test_wfc (dp, args) : if args.rand_seed is not None : np.random.seed(args.rand_seed % (2**32)) - dp = DeepWFC(args.model) data = DeepmdData(args.system, args.set_prefix, shuffle_test = args.shuffle_test) data.add('wfc', 12, atomic=True, must=True, high_prec=False, type_sel = dp.get_sel_type()) test_data = data.get_test () @@ -132,13 +209,17 @@ def test_wfc (args) : axis = 1) np.savetxt(detail_file+".out", pe, header = 'ref_wfc(12 dofs) predicted_wfc(12 dofs)') + return [l2f], [wfc.size] + +def print_wfc_sys_avg(avg): + print ("WFC L2err : %e eV/A" % avg[0]) -def test_polar (args) : + +def test_polar (dp, args) : if args.rand_seed is not None : np.random.seed(args.rand_seed % (2**32)) - dp = DeepPolar(args.model) data = DeepmdData(args.system, args.set_prefix, shuffle_test = args.shuffle_test) data.add('polarizability', 9, atomic=True, must=True, high_prec=False, type_sel = dp.get_sel_type()) test_data = data.get_test () @@ -165,13 +246,17 @@ def test_polar (args) : axis = 1) np.savetxt(detail_file+".out", pe, header = 'data_pxx data_pxy data_pxz data_pyx data_pyy data_pyz data_pzx data_pzy data_pzz pred_pxx pred_pxy pred_pxz pred_pyx pred_pyy pred_pyz pred_pzx pred_pzy pred_pzz') + return [l2f], [polar.size] + +def print_polar_sys_avg(avg): + print ("Polarizability L2err : %e eV/A" % avg[0]) -def test_dipole (args) : + +def test_dipole (dp, args) : if args.rand_seed is not None : np.random.seed(args.rand_seed % (2**32)) - dp = DeepDipole(args.model) data = DeepmdData(args.system, args.set_prefix, shuffle_test = args.shuffle_test) data.add('dipole', 3, atomic=True, must=True, high_prec=False, type_sel = dp.get_sel_type()) 
test_data = data.get_test () @@ -198,3 +283,8 @@ def test_dipole (args) : axis = 1) np.savetxt(detail_file+".out", pe, header = 'data_x data_y data_z pred_x pred_y pred_z') + return [l2f], [dipole.size] + + +def print_dipole_sys_avg(avg): + print ("Dipole L2err : %e eV/A" % avg[0]) diff --git a/source/train/train.py b/source/train/train.py index a57cb735f1..c89760fa4d 100755 --- a/source/train/train.py +++ b/source/train/train.py @@ -4,18 +4,14 @@ import sys import time import numpy as np -import argparse import json from deepmd.env import tf from deepmd.compat import convert_input_v0_v1 - -lib_path = os.path.dirname(os.path.realpath(__file__)) + "/../lib/" -sys.path.append (lib_path) - from deepmd.RunOptions import RunOptions -from deepmd.DataSystem import DataSystem, DeepmdDataSystem +from deepmd.DataSystem import DeepmdDataSystem from deepmd.Trainer import NNPTrainer -from deepmd.common import data_requirement +from deepmd.common import data_requirement, expand_sys_str +from deepmd.DataModifier import DipoleChargeModifier def create_done_queue(cluster_spec, task_index): with tf.device("/job:ps/task:%d" % (task_index)): @@ -53,8 +49,8 @@ def j_must_have (jdata, key) : def train (args) : # load json database - fp = open (args.INPUT, 'r') - jdata = json.load (fp) + with open (args.INPUT, 'r') as fp: + jdata = json.load (fp) if not 'model' in jdata.keys(): jdata = convert_input_v0_v1(jdata, warning = True, @@ -92,8 +88,9 @@ def _do_work(jdata, run_opt): # init params and run options assert('training' in jdata) systems = j_must_have(jdata['training'], 'systems') + if type(systems) == str: + systems = expand_sys_str(systems) set_pfx = j_must_have(jdata['training'], 'set_prefix') - numb_sys = len(systems) seed = None if 'seed' in jdata['training'].keys() : seed = jdata['training']['seed'] if seed is not None: @@ -102,19 +99,42 @@ def _do_work(jdata, run_opt): batch_size = j_must_have(jdata['training'], 'batch_size') test_size = j_must_have(jdata['training'], 
'numb_test') stop_batch = j_must_have(jdata['training'], 'stop_batch') + sys_probs = jdata['training'].get('sys_probs') + auto_prob_style = jdata['training'].get('auto_prob_style', 'prob_sys_size') if len(type_map) == 0: # empty type_map ipt_type_map = None else: ipt_type_map = type_map - data = DeepmdDataSystem(systems, batch_size, test_size, rcut, set_prefix=set_pfx, run_opt=run_opt, type_map = ipt_type_map) + # data modifier + modifier = None + modi_data = jdata['model'].get("modifier", None) + if modi_data is not None: + if modi_data['type'] == 'dipole_charge': + modifier = DipoleChargeModifier(modi_data['model_name'], + modi_data['model_charge_map'], + modi_data['sys_charge_map'], + modi_data['ewald_h'], + modi_data['ewald_beta']) + else: + raise RuntimeError('unknown modifier type ' + str(modi_data['type'])) + # init data + data = DeepmdDataSystem(systems, + batch_size, + test_size, + rcut, + set_prefix=set_pfx, + type_map = ipt_type_map, + modifier = modifier) + data.print_summary(run_opt, + sys_probs = sys_probs, + auto_prob_style = auto_prob_style) data.add_dict(data_requirement) # build the model with stats from the first system - model.build (data) + model.build (data, stop_batch) # train the model with the provided systems in a cyclic way start_time = time.time() - cur_batch = 0 - model.train (data, stop_batch) + model.train (data) end_time = time.time() run_opt.message("finished training\nwall time: %.3f s" % (end_time-start_time)) diff --git a/source/train/transform.py b/source/train/transform.py new file mode 100644 index 0000000000..6ae7723f7e --- /dev/null +++ b/source/train/transform.py @@ -0,0 +1,86 @@ +from deepmd.env import tf +import re +import numpy as np +def transform(args): + raw_graph = load_graph(args.raw_model) + old_graph = load_graph(args.old_model) + print("%d ops in the raw graph\n%d ops in the old graph" %(len(raw_graph.as_graph_def().node),len(old_graph.as_graph_def().node))) + new_graph_def = transform_graph(raw_graph,old_graph) 
+ with tf.gfile.GFile(args.output, mode='wb') as f: + f.write(new_graph_def.SerializeToString()) + print("the output model is saved in %s" % args.output) + +def load_graph(graphName): + graph_def = tf.GraphDef() + with open(graphName,"rb") as f: + graph_def.ParseFromString(f.read()) + with tf.Graph().as_default() as graph: + tf.import_graph_def(graph_def,name = "") + return graph + +def transform_graph(raw_graph,old_graph): + precision_dict = {\ + 1:(np.float32, "float32"),\ + 2:(np.float64, "float64"),\ + 19:(np.float16, "float16")\ + } + old_graph_def = old_graph.as_graph_def() + raw_graph_def = raw_graph.as_graph_def() + raw_graph_node = load_transform_node(raw_graph_def) + old_graph_node = load_transform_node(old_graph_def) + + if len(raw_graph_node) != len(old_graph_node): + raise RuntimeError("raw graph and old graph has different network structure") + + for node in raw_graph_def.node: + if node.name in raw_graph_node.keys(): + if precision_dict[old_graph_node[node.name].dtype][1] == "float16" or precision_dict[raw_graph_node[node.name].dtype][1] == "float16": + raise RuntimeError("float16 conversions not currently supported") + + check_dim(raw_graph_node, old_graph_node, node.name) + + if re.fullmatch("final_layer_type_\d+/bias",node.name) == None: + tensor_value = np.frombuffer(old_graph_node[node.name].tensor_content,dtype = precision_dict[old_graph_node[node.name].dtype][0]) + tensor_value = tensor_value.astype(dtype=precision_dict[raw_graph_node[node.name].dtype][0]) + node.attr["value"].tensor.tensor_content = tensor_value.tostring() + + else: + if precision_dict[old_graph_node[node.name].dtype][1] == "float64": + tensor_value = (np.array(old_graph_node[node.name].double_val)).astype(precision_dict[raw_graph_node[node.name].dtype][0]) + node.attr["value"].CopyFrom(tf.AttrValue(tensor=tf.make_tensor_proto(tensor_value,precision_dict[raw_graph_node[node.name].dtype][0], [1]))) + + elif precision_dict[old_graph_node[node.name].dtype][1] == "float32": + 
tensor_value = (np.array(old_graph_node[node.name].float_val)).astype(precision_dict[raw_graph_node[node.name].dtype][0]) + node.attr["value"].CopyFrom(tf.AttrValue(tensor=tf.make_tensor_proto(tensor_value, precision_dict[raw_graph_node[node.name].dtype][0], [1]))) + + elif precision_dict[old_graph_node[node.name].dtype][1] == "float16": + tensor_value = (np.array(old_graph_node[node.name].half_val)).astype(precision_dict[raw_graph_node[node.name].dtype][0]) + node.attr["value"].CopyFrom(tf.AttrValue(tensor=tf.make_tensor_proto(tensor_value, precision_dict[raw_graph_node[node.name].dtype][0], [1]))) + + print("%s is passed from old graph(%s) to raw graph(%s)" % (node.name,precision_dict[old_graph_node[node.name].dtype][1],precision_dict[raw_graph_node[node.name].dtype][1])) + + return raw_graph_def + +def check_dim(raw_graph_node, old_graph_node, node_name): + raw_graph_dim = raw_graph_node[node_name].tensor_shape + old_graph_dim = old_graph_node[node_name].tensor_shape + if raw_graph_dim != old_graph_dim: + raise RuntimeError("old graph and raw graph has different"+node_name+" dim") + + +def load_transform_node(graph): + transform_node = {} + transform_node_pattern = "\ +filter_type_\d+/matrix_\d+_\d+|\ +filter_type_\d+/bias_\d+_\d+|\ +filter_type_\d+/idt_\d+_\d+|\ +layer_\d+_type_\d+/matrix|\ +layer_\d+_type_\d+/bias|\ +layer_\d+_type_\d+/idt|\ +final_layer_type_\d+/bias|\ +final_layer_type_\d+/matrix\ +" + for node in graph.node: + if re.fullmatch(transform_node_pattern,node.name) != None: + transform_node[node.name] = node.attr["value"].tensor + return transform_node