From b5166a81812bedd44660b8bece1b91dc0f3c51e1 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Wed, 3 Feb 2021 00:55:02 +0800 Subject: [PATCH 01/20] model compression model compression through tabulating --- deepmd/common.py | 12 + deepmd/descriptor/se_a.py | 181 +++++++++- deepmd/utils/tabulate.py | 185 ++++++++++ source/lib/include/CustomeOperation.h | 218 +++++++++++- source/lib/include/DeviceFunctor.h | 16 + source/op/CMakeLists.txt | 4 +- source/op/_tabulate_grad.py | 32 ++ source/op/cuda/CMakeLists.txt | 3 +- source/op/cuda/descrpt_se_a.cu | 153 ++++----- source/op/cuda/tabulate.cu | 475 ++++++++++++++++++++++++++ source/op/data_info.cc | 408 ++++++++++++++++++++++ source/op/tabulate.cc | 379 ++++++++++++++++++++ source/op/tabulate_multi_device.cc | 194 +++++++++++ source/op/unaggregated_grad.cc | 320 +++++++++++++++++ source/train/CMakeLists.txt | 2 +- source/train/Model.py | 3 + source/train/Trainer.py | 1 + source/train/compress.py | 52 +++ source/train/main.py | 14 + source/train/transform.py | 6 +- 20 files changed, 2560 insertions(+), 98 deletions(-) create mode 100644 deepmd/utils/tabulate.py create mode 100644 source/op/_tabulate_grad.py create mode 100644 source/op/cuda/tabulate.cu create mode 100644 source/op/data_info.cc create mode 100644 source/op/tabulate.cc create mode 100644 source/op/tabulate_multi_device.cc create mode 100644 source/op/unaggregated_grad.cc create mode 100644 source/train/compress.py diff --git a/deepmd/common.py b/deepmd/common.py index cad4110ac5..e00485b99a 100644 --- a/deepmd/common.py +++ b/deepmd/common.py @@ -216,3 +216,15 @@ def dec(obj): obj.__doc__ = obj.__doc__.format(*sub) return obj return dec + +def get_np_precision(precision): + if precision == "default": + return global_np_float_precision + elif precision == "float16": + return np.float16 + elif precision == "float32": + return np.float32 + elif precision == "float64": + return np.float64 + else: + raise RuntimeError("%d is not a valid precision" % precision) \ 
No newline at end of file diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py index 410cf0d9b4..d7ee320bd4 100644 --- a/deepmd/descriptor/se_a.py +++ b/deepmd/descriptor/se_a.py @@ -1,14 +1,17 @@ +import math import numpy as np from typing import Tuple, List from deepmd.env import tf -from deepmd.common import get_activation_func, get_precision, activation_fn_dict, precision_dict, docstring_parameter +from deepmd.common import get_activation_func, get_precision, activation_fn_dict, precision_dict, docstring_parameter, get_np_precision from deepmd.utils.argcheck import list_to_doc from deepmd.RunOptions import global_tf_float_precision from deepmd.RunOptions import global_np_float_precision from deepmd.env import op_module from deepmd.env import default_tf_session_config from deepmd.utils.network import embedding_net +from deepmd.utils.tabulate import DeepTabulate +from tqdm import tqdm class DescrptSeA (): @@ -26,7 +29,10 @@ def __init__ (self, exclude_types: List[int] = [], set_davg_zero: bool = False, activation_function: str = 'tanh', - precision: str = 'default' + precision: str = 'default', + compress: bool = False, + model_file: str = 'frozen_model.pb', + table_info: list = [5, 0.01, 0.1, -1] ) -> None: """ Constructor @@ -60,6 +66,12 @@ def __init__ (self, The activation function in the embedding net. Supported options are {0} precision The precision of the embedding net parameters. Supported options are {1} + compress + Try to compress the embedding nets. Otherwise, building original embedding nets + model_file + The original frozen model, that will be compressed. + table_info + The data info of tabulation. 
""" self.sel_a = sel self.rcut_r = rcut @@ -71,6 +83,7 @@ def __init__ (self, self.trainable = trainable self.filter_activation_fn = get_activation_func(activation_function) self.filter_precision = get_precision(precision) + self.filter_np_precision = get_np_precision(precision) self.exclude_types = set() for tt in exclude_types: assert(len(tt) == 2) @@ -96,6 +109,13 @@ def __init__ (self, self.useBN = False self.dstd = None self.davg = None + + # compress config + self.compress = compress + self.model_file = model_file + self.table_info = table_info + if (self.compress): + self.table = DeepTabulate(self.model_file, self.filter_np_precision, self.type_one_side) self.place_holders = {} avg_zero = np.zeros([self.ntypes,self.ndescrpt]).astype(global_np_float_precision) @@ -103,7 +123,7 @@ def __init__ (self, sub_graph = tf.Graph() with sub_graph.as_default(): name_pfx = 'd_sea_' - for ii in ['coord', 'box']: + for ii in ['coord', 'box', 'avg', 'std']: self.place_holders[ii] = tf.placeholder(global_np_float_precision, [None, None], name = name_pfx+'t_'+ii) self.place_holders['type'] = tf.placeholder(tf.int32, [None, None], name=name_pfx+'t_type') self.place_holders['natoms_vec'] = tf.placeholder(tf.int32, [self.ntypes+2], name=name_pfx+'t_natoms') @@ -121,6 +141,19 @@ def __init__ (self, rcut_r_smth = self.rcut_r_smth, sel_a = self.sel_a, sel_r = self.sel_r) + descrpt, descrpt_deriv, rij, nlist, self.distance, self.max_nbor_size, self.table_range \ + = op_module.data_info(self.place_holders['coord'], + self.place_holders['type'], + self.place_holders['natoms_vec'], + self.place_holders['box'], + self.place_holders['default_mesh'], + self.place_holders['avg'], + self.place_holders['std'], + rcut_a = self.rcut_a, + rcut_r = self.rcut_r, + rcut_r_smth = self.rcut_r_smth, + sel_a = self.sel_a, + sel_r = self.sel_r) self.sub_sess = tf.Session(graph = sub_graph, config=default_tf_session_config) @@ -324,6 +357,15 @@ def build (self, self.descrpt_deriv = 
tf.identity(self.descrpt_deriv, name = 'o_rmat_deriv') self.rij = tf.identity(self.rij, name = 'o_rij') self.nlist = tf.identity(self.nlist, name = 'o_nlist') + + if (self.compress): + self.lower = math.floor(self.lower) + self.upper = math.ceil(self.upper) + self.table.build(self.lower, + self.upper, + self.upper * self.table_info[0], + self.table_info[1], + self.table_info[2]) self.dout, self.qmat = self._pass_filter(self.descrpt_reshape, atype, @@ -337,6 +379,58 @@ def build (self, tf.summary.histogram('embedding_net_output', self.dout) return self.dout + def data_info(self, data) -> None: + """ + Print the data info(tabulation boundary, the nearest distance of atoms, max neighbor size) of the training data + + Parameters + ---------- + data + The data class that controls input data information + """ + self.lower = 0.0 + self.upper = 0.0 + self.dist = 100.0 + self.max_nbor = 0 + + davg = self.davg + dstd = self.dstd + if davg is None: + davg = np.zeros([self.ntypes, self.ndescrpt]) + if dstd is None: + dstd = np.ones ([self.ntypes, self.ndescrpt]) + + for ii in tqdm(range(len(data.system_dirs)), desc = '# DEEPMD: getting data info'): + for jj in data.data_systems[ii].dirs: + data_set = data.data_systems[ii]._load_set(jj) + for kk in range(np.array(data_set['type']).shape[0]): + dt, mn, tr \ + = self.sub_sess.run([self.distance, self.max_nbor_size, self.table_range], + feed_dict = { + self.place_holders['coord']: np.array(data_set['coord'])[kk].reshape([-1, data.natoms[ii] * 3]), + self.place_holders['type']: np.array(data_set['type'])[kk].reshape([-1, data.natoms[ii]]), + self.place_holders['natoms_vec']: np.array(data.natoms_vec[ii]), + self.place_holders['box']: np.array(data_set['box'])[kk].reshape([-1, 9]), + self.place_holders['default_mesh']: np.array(data.default_mesh[ii]), + self.place_holders['avg']: davg, + self.place_holders['std']: dstd, + }) + dr = np.array([np.min(tr), np.max(tr)]).astype(global_np_float_precision) + dt = np.min(dt) + mn = 
np.max(mn) + if (dr[0] < self.lower): + self.lower = dr[0] + if (dr[1] > self.upper): + self.upper = dr[1] + if (dt < self.dist): + self.dist = dt + if (mn > self.max_nbor): + self.max_nbor = mn + + print('# DEEPMD: training data with lower boundary: ' + str(self.lower)) + print('# DEEPMD: training data with upper boundary: ' + str(self.upper)) + print('# DEEPMD: training data with min distance: ' + str(self.dist)) + print('# DEEPMD: training data with max nborsize: ' + str(self.max_nbor)) def get_rot_mat(self) -> tf.Tensor: """ @@ -413,7 +507,10 @@ def _pass_filter(self, [ 0, start_index* self.ndescrpt], [-1, natoms[2+type_i]* self.ndescrpt] ) inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt]) - layer, qmat = self._filter(tf.cast(inputs_i, self.filter_precision), type_i, name='filter_type_'+str(type_i)+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable, activation_fn = self.filter_activation_fn) + if not self.compress: + layer, qmat = self._filter(tf.cast(inputs_i, self.filter_precision), type_i, name='filter_type_'+str(type_i)+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable, activation_fn = self.filter_activation_fn) + else: + layer, qmat = self._compress_filter(tf.cast(inputs_i, self.filter_precision), type_i, name='filter_type_'+str(type_i)+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable, activation_fn = self.filter_activation_fn) layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_out()]) qmat = tf.reshape(qmat, [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_rot_mat_1() * 3]) output.append(layer) @@ -423,7 +520,10 @@ def _pass_filter(self, inputs_i = inputs inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt]) type_i = -1 - layer, qmat = self._filter(tf.cast(inputs_i, self.filter_precision), type_i, name='filter_type_all'+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable, activation_fn = self.filter_activation_fn) + if 
not self.compress: + layer, qmat = self._filter(tf.cast(inputs_i, self.filter_precision), type_i, name='filter_type_all'+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable, activation_fn = self.filter_activation_fn) + else: + layer, qmat = self._compress_filter(tf.cast(inputs_i, self.filter_precision), type_i, name='filter_type_all'+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable, activation_fn = self.filter_activation_fn) layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[0] * self.get_dim_out()]) qmat = tf.reshape(qmat, [tf.shape(inputs)[0], natoms[0] * self.get_dim_rot_mat_1() * 3]) output.append(layer) @@ -559,3 +659,74 @@ def _filter(self, result = tf.reshape(result, [-1, outputs_size_2 * outputs_size[-1]]) return result, qmat + + def _compress_filter(self, + inputs, + type_input, + natoms, + activation_fn=tf.nn.tanh, + stddev=1.0, + bavg=0.0, + name='linear', + reuse=None, + seed=None, + trainable = True): + # natom x (nei x 4) + shape = inputs.get_shape().as_list() + outputs_size = [1] + self.filter_neuron + outputs_size_2 = self.n_axis_neuron + with tf.variable_scope(name, reuse=reuse): + start_index = 0 + xyz_scatter_total = [] + for type_i in range(self.ntypes): + # cut-out inputs + # with natom x (nei_type_i x 4) + inputs_i = tf.slice (inputs, + [ 0, start_index* 4], + [-1, self.sel_a[type_i]* 4] ) + start_index += self.sel_a[type_i] + shape_i = inputs_i.get_shape().as_list() + # with (natom x nei_type_i) x 4 + inputs_reshape = tf.reshape(inputs_i, [-1, 4]) + xyz_scatter = tf.reshape(tf.slice(inputs_reshape, [0,0],[-1,1]),[-1,1]) + if (type_input, type_i) in self.exclude_types: + w = tf.zeros((outputs_size[0], outputs_size[-1]), dtype=global_tf_float_precision) + xyz_scatter = tf.matmul(xyz_scatter, w) + xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1]//4, outputs_size[-1])) + if type_i == 0: + xyz_scatter_1 = tf.matmul(tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), xyz_scatter, transpose_a = 
True) + else: + xyz_scatter_1 += tf.matmul(tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), xyz_scatter, transpose_a = True) + else: + ti = [self.lower, self.upper, self.upper * self.table_info[0], self.table_info[1], self.table_info[2], self.table_info[3]] + if self.type_one_side: + assert type_input == -1, "Error: when type_one_side was set True, the value of type_input must be -1." + net = 'filter_-1_net_' + str(type_i) + else: + net = 'filter_' + str(type_input) + '_net_' + str(type_i) + if type_i == 0: + xyz_scatter_1 = op_module.tabulate_fusion(self.table.data[net], ti, xyz_scatter, tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), last_layer_size = outputs_size[-1]) + else: + xyz_scatter_1 += op_module.tabulate_fusion(self.table.data[net], ti, xyz_scatter, tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), last_layer_size = outputs_size[-1]) + # not needed any more! + # natom x nei x outputs_size + # xyz_scatter = tf.concat(xyz_scatter_total, axis=1) + # natom x nei x 4 + # inputs_reshape = tf.reshape(inputs, [-1, shape[1]//4, 4]) + # natom x 4 x outputs_size + # xyz_scatter_1 = tf.matmul(inputs_reshape, xyz_scatter, transpose_a = True) + xyz_scatter_1 = xyz_scatter_1 * (4.0 / shape[1]) + # natom x 4 x outputs_size_2 + xyz_scatter_2 = tf.slice(xyz_scatter_1, [0,0,0],[-1,-1,outputs_size_2]) + # # natom x 3 x outputs_size_2 + # qmat = tf.slice(xyz_scatter_2, [0,1,0], [-1, 3, -1]) + # natom x 3 x outputs_size_1 + qmat = tf.slice(xyz_scatter_1, [0,1,0], [-1, 3, -1]) + # natom x outputs_size_2 x 3 + qmat = tf.transpose(qmat, perm = [0, 2, 1]) + # natom x outputs_size x outputs_size_2 + result = tf.matmul(xyz_scatter_1, xyz_scatter_2, transpose_a = True) + # natom x (outputs_size x outputs_size_2) + result = tf.reshape(result, [-1, outputs_size_2 * outputs_size[-1]]) + + return result, qmat \ No newline at end of file diff --git a/deepmd/utils/tabulate.py b/deepmd/utils/tabulate.py new file mode 100644 index 0000000000..04b3bde7e9 --- /dev/null +++ b/deepmd/utils/tabulate.py 
@@ -0,0 +1,185 @@ +import re +import math +import numpy as np +import tensorflow.compat.v1 as tf +from tensorflow.python.platform import gfile +from tensorflow.python.framework import tensor_util +from tqdm import tqdm +from deepmd.env import op_module + +class DeepTabulate(): + def __init__(self, + model_file, + data_type, + type_one_side = False): + + self.model_file = model_file + self.data_type = data_type + self.type_one_side = type_one_side + + self.graph, self.graph_def = self.load_graph() + self.sess = tf.Session(graph = self.graph) + + self.sub_graph, self.sub_graph_def = self.load_sub_graph() + self.sub_sess = tf.Session(graph = self.sub_graph) + + self.sel_a = self.graph.get_operation_by_name('DescrptSeA').get_attr('sel_a') + self.ntypes = self.get_tensor_value(self.graph.get_tensor_by_name ('descrpt_attr/ntypes:0')) + + self.filter_variable_nodes = self.load_matrix_node() + self.layer_size = int(len(self.filter_variable_nodes) / (self.ntypes * self.ntypes * 2)) + if type_one_side : + self.layer_size = int(len(self.filter_variable_nodes) / (self.ntypes * 2)) + # self.value_type = self.filter_variable_nodes["filter_type_0/matrix_1_0"].dtype #"filter_type_0/matrix_1_0" must exit~ + # get trained variables + self.bias = self.get_bias() + self.matrix = self.get_matrix() + # self.matrix_layer_3 must exist + # self.data_type = type(self.matrix["layer_1"][0][0][0]) + assert self.matrix["layer_1"][0].size > 0, "no matrix exist in matrix array!" + self.last_layer_size = self.matrix["layer_" + str(self.layer_size)][0].shape[1] + # define tables + self.data = {} + + # TODO: Need a check function to determine if the current model is properly + # Need be more robust! 
+ + def load_graph(self): + graph_def = tf.GraphDef() + with open(self.model_file, "rb") as f: + graph_def.ParseFromString(f.read()) + with tf.Graph().as_default() as graph: + tf.import_graph_def(graph_def, name = "") + return graph, graph_def + + def load_sub_graph(self): + sub_graph_def = tf.GraphDef() + with tf.Graph().as_default() as sub_graph: + tf.import_graph_def(sub_graph_def, name = "") + return sub_graph, sub_graph_def + + def get_tensor_value(self, tensor) : + with self.sess.as_default(): + self.sess.run(tensor) + value = tensor.eval() + return value + + def load_matrix_node(self): + matrix_node = {} + matrix_node_pattern = "filter_type_\d+/matrix_\d+_\d+|filter_type_\d+/bias_\d+_\d+|filter_type_\d+/idt_\d+_\d+|filter_type_all/matrix_\d+_\d+|filter_type_all/bias_\d+_\d+|filter_type_all/idt_\d+_\d" + for node in self.graph_def.node: + if re.fullmatch(matrix_node_pattern, node.name) != None: + matrix_node[node.name] = node.attr["value"].tensor + for key in matrix_node.keys() : + assert key.find('bias') > 0 or key.find('matrix') > 0, "currently, only support weight matrix and bias matrix at the tabulation op!" 
+ return matrix_node + + def get_bias(self): + bias = {} + for layer in range(1, self.layer_size + 1): + bias["layer_" + str(layer)] = [] + if self.type_one_side: + for ii in range(0, self.ntypes): + tensor_value = np.frombuffer (self.filter_variable_nodes["filter_type_all/bias_" + str(layer) + "_" + str(int(ii))].tensor_content) + tensor_shape = tf.TensorShape(self.filter_variable_nodes["filter_type_all/bias_" + str(layer) + "_" + str(int(ii))].tensor_shape).as_list() + bias["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape).astype(self.data_type)) + else: + for ii in range(0, self.ntypes * self.ntypes): + tensor_value = np.frombuffer(self.filter_variable_nodes["filter_type_" + str(int(ii / self.ntypes)) + "/bias_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_content) + tensor_shape = tf.TensorShape(self.filter_variable_nodes["filter_type_" + str(int(ii / self.ntypes)) + "/bias_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_shape).as_list() + bias["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape).astype(self.data_type)) + return bias + + def get_matrix(self): + matrix = {} + for layer in range(1, self.layer_size + 1): + matrix["layer_" + str(layer)] = [] + if self.type_one_side: + for ii in range(0, self.ntypes): + tensor_value = np.frombuffer (self.filter_variable_nodes["filter_type_all/matrix_" + str(layer) + "_" + str(int(ii))].tensor_content) + tensor_shape = tf.TensorShape(self.filter_variable_nodes["filter_type_all/matrix_" + str(layer) + "_" + str(int(ii))].tensor_shape).as_list() + matrix["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape).astype(self.data_type)) + else: + for ii in range(0, self.ntypes * self.ntypes): + tensor_value = np.frombuffer(self.filter_variable_nodes["filter_type_" + str(int(ii / self.ntypes)) + "/matrix_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_content) + tensor_shape = tf.TensorShape(self.filter_variable_nodes["filter_type_" + str(int(ii 
/ self.ntypes)) + "/matrix_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_shape).as_list() + matrix["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape).astype(self.data_type)) + return matrix + + def build(self, lower, upper, _max, stride0, stride1): + # tabulate range [lower, upper] with stride0 'stride0' + lower = math.floor(lower) + upper = math.ceil(upper) + xx = np.arange(lower, upper, stride0, dtype = self.data_type) + xx = np.append(xx, np.arange(upper, _max, stride1, dtype = self.data_type)) + xx = np.append(xx, np.array([_max], dtype = self.data_type)) + self.nspline = int((upper - lower) / stride0 + (_max - upper) / stride1) + if self.type_one_side: + for ii in range(self.ntypes): + vv, dd, d2 = self.make_data(xx, ii) + net = "filter_-1_net_" + str(int(ii)) + self.data[net] = np.zeros([self.nspline, 6 * self.last_layer_size], dtype = self.data_type) + for jj in tqdm(range(self.nspline), desc = '# DEEPMD: ' + net + ', tabulating'): + for kk in range(self.last_layer_size): + if jj < int((upper - lower) / stride0): + tt = stride0 + else: + tt = stride1 + hh = vv[jj + 1][kk] - vv[jj][kk] + self.data[net][jj][kk * 6 + 0] = vv[jj][kk] + self.data[net][jj][kk * 6 + 1] = dd[jj][kk] + self.data[net][jj][kk * 6 + 2] = 0.5 * d2[jj][kk] + self.data[net][jj][kk * 6 + 3] = (1 / (2 * tt * tt * tt)) * (20 * hh - (8 * dd[jj + 1][kk] + 12 * dd[jj][kk]) * tt - (3 * d2[jj][kk] - d2[jj + 1][kk]) * tt * tt) + self.data[net][jj][kk * 6 + 4] = (1 / (2 * tt * tt * tt * tt)) * (-30 * hh + (14 * dd[jj + 1][kk] + 16 * dd[jj][kk]) * tt + (3 * d2[jj][kk] - 2 * d2[jj + 1][kk]) * tt * tt) + self.data[net][jj][kk * 6 + 5] = (1 / (2 * tt * tt * tt * tt * tt)) * (12 * hh - 6 * (dd[jj + 1][kk] + dd[jj][kk]) * tt + (d2[jj + 1][kk] - d2[jj][kk]) * tt * tt) + else: + for ii in range(self.ntypes * self.ntypes): + vv, dd, d2 = self.make_data(xx, ii) + net = "filter_" + str(int(ii / self.ntypes)) + "_net_" + str(int(ii % self.ntypes)) + self.data[net] = 
np.zeros([self.nspline, 6 * self.last_layer_size], dtype = self.data_type) + for jj in tqdm(range(self.nspline), desc = '# DEEPMD: ' + net + ', tabulating'): + for kk in range(self.last_layer_size): + if jj < int((upper - lower) / stride0): + tt = stride0 + else: + tt = stride1 + hh = vv[jj + 1][kk] - vv[jj][kk] + self.data[net][jj][kk * 6 + 0] = vv[jj][kk] + self.data[net][jj][kk * 6 + 1] = dd[jj][kk] + self.data[net][jj][kk * 6 + 2] = 0.5 * d2[jj][kk] + self.data[net][jj][kk * 6 + 3] = (1 / (2 * tt * tt * tt)) * (20 * hh - (8 * dd[jj + 1][kk] + 12 * dd[jj][kk]) * tt - (3 * d2[jj][kk] - d2[jj + 1][kk]) * tt * tt) + self.data[net][jj][kk * 6 + 4] = (1 / (2 * tt * tt * tt * tt)) * (-30 * hh + (14 * dd[jj + 1][kk] + 16 * dd[jj][kk]) * tt + (3 * d2[jj][kk] - 2 * d2[jj + 1][kk]) * tt * tt) + self.data[net][jj][kk * 6 + 5] = (1 / (2 * tt * tt * tt * tt * tt)) * (12 * hh - 6 * (dd[jj + 1][kk] + dd[jj][kk]) * tt + (d2[jj + 1][kk] - d2[jj][kk]) * tt * tt) + + # one-by-one executions + def make_data(self, xx, idx): + with self.sub_graph.as_default(): + with self.sub_sess.as_default(): + xx = tf.reshape(xx, [xx.size, -1]) + for layer in range(self.layer_size): + if layer == 0: + yy = self.layer_0(xx, self.matrix["layer_" + str(layer + 1)][idx], self.bias["layer_" + str(layer + 1)][idx]) + dy = op_module.unaggregated_dy_dx_s(yy, self.matrix["layer_" + str(layer + 1)][idx]) + dy2 = op_module.unaggregated_dy2_dx_s(yy, dy, self.matrix["layer_" + str(layer + 1)][idx]) + else: + tt, yy = self.layer_1(yy, self.matrix["layer_" + str(layer + 1)][idx], self.bias["layer_" + str(layer + 1)][idx]) + dz = op_module.unaggregated_dy_dx(yy - tt, self.matrix["layer_" + str(layer + 1)][idx], dy) + dy2 = op_module.unaggregated_dy2_dx(yy - tt, self.matrix["layer_" + str(layer + 1)][idx], dz, dy, dy2) + dy = dz + + vv = yy.eval() + dd = dy.eval() + d2 = dy2.eval() + return vv, dd, d2 + + def layer_0(self, x, w, b): + return tf.nn.tanh(tf.matmul(x, w) + b) + + def layer_1(self, x, w, b): + t = 
tf.concat([x, x], axis = 1) + return t, tf.nn.tanh(tf.matmul(x, w) + b) + t + + def save_data(self): + for ii in range(self.ntypes * self.ntypes): + net = "filter_" + str(int(ii / self.ntypes)) + "_net_" + str(int(ii % self.ntypes)) + np.savetxt('data_' + str(int(ii)), self.data[net]) \ No newline at end of file diff --git a/source/lib/include/CustomeOperation.h b/source/lib/include/CustomeOperation.h index 0bdd91c3dd..0c0d891fd4 100644 --- a/source/lib/include/CustomeOperation.h +++ b/source/lib/include/CustomeOperation.h @@ -169,7 +169,7 @@ void compute_descriptor_se_a_cpu ( } template -void DescrptSeACPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int max_nbor_size) { +void DescrptSeACPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { // set & normalize coord std::vector d_coord3(nall * 3); for (int ii = 0; ii < nall; ++ii) { @@ -235,8 +235,8 @@ void DescrptSeACPULauncher(const FPTYPE * coord, const int * type, const int * i #if GOOGLE_CUDA template -void DescrptSeAGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, 
const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int max_nbor_size) { - DescrptSeAGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, max_nbor_size); +void DescrptSeAGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeAGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); } #endif // GOOGLE_CUDA // ****************************************************************************** @@ -432,7 +432,7 @@ void compute_descriptor_se_r_cpu ( } template -void DescrptSeRCPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int max_nbor_size) { +void DescrptSeRCPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const 
int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { // set & normalize coord std::vector d_coord3(nall * 3); for (int ii = 0; ii < nall; ++ii) { @@ -498,8 +498,8 @@ void DescrptSeRCPULauncher(const FPTYPE * coord, const int * type, const int * i #if GOOGLE_CUDA template -void DescrptSeRGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int max_nbor_size) { - DescrptSeRGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, max_nbor_size); +void DescrptSeRGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { + DescrptSeRGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); } #endif // GOOGLE_CUDA // ****************************************************************************** @@ -570,3 +570,209 @@ void ProdVirialSeRGPULauncher(FPTYPE * virial, FPTYPE * atom_virial, const FPTYP // 
****************************************************************************** // end of custome op ProdVirialSeR // ****************************************************************************** + +template +inline FPTYPE dot(FPTYPE a[4], FPTYPE b[4]) { + return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3]; +} + +/* + This inline function was designed to get the table info and bias value for current input xx! + lower: indicate the lower boundary of the first table; + upper: indicate the upper boundary of the first table as well as the lower boundary of the second table; + max: indicate the upper boundary of the second table; + stride0: indicate the stride of the first table; + stride1: indicate the stride of the second table; + xx: indicate the inputs value; + table_idx: indicate the location of table info of input value xx; +*/ +template +inline void locate_xx(const FPTYPE& lower, const FPTYPE& upper, const FPTYPE& max, const FPTYPE& stride0, const FPTYPE& stride1, FPTYPE& xx, int& table_idx) { + if (xx < lower) { + table_idx = 0; + xx = 0; + } + else if (xx < upper) { + table_idx = (int)((xx - lower) / stride0); + xx -= (table_idx * stride0 + lower); + } + else if (xx < max) { + int first_stride = int((upper - lower) / stride0); + table_idx = first_stride + (int)((xx - upper) / stride1); + xx -= ((table_idx - first_stride) * stride1 + upper); + } + else { + table_idx = int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1; + xx = 0; + } +} + +template +void TabulateFusionCPULauncher(const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const int nloc, const int nnei, const int last_layer_size, FPTYPE * out) { + //Currently, Do nothing at all! + // std::cout << "I'm in tabulate @CPU!" 
<< std::endl; + memset(out, 0.0, sizeof(FPTYPE) * nloc * 4 * last_layer_size); + FPTYPE const lower = table_info[0]; + FPTYPE const upper = table_info[1]; + FPTYPE const _max = table_info[2]; + FPTYPE const stride0 = table_info[3]; + FPTYPE const stride1 = table_info[4]; + // for every atom, execute a small gemm~ + // FPTYPE * res = new FPTYPE[4 * last_layer_size]; + #pragma omp parallel for + for (int ii = 0; ii < nloc; ii++) { + FPTYPE ll[4] = {0}; + FPTYPE ago = in[ii * nnei + nnei - 1]; + bool unloop = false; + for (int jj = 0; jj < nnei; jj++) { + ll[0] = ff[ii * nnei * 4 + jj * 4 + 0]; + ll[1] = ff[ii * nnei * 4 + jj * 4 + 1]; + ll[2] = ff[ii * nnei * 4 + jj * 4 + 2]; + ll[3] = ff[ii * nnei * 4 + jj * 4 + 3]; + FPTYPE xx = in[ii * nnei + jj]; + if (ago == xx) { + unloop = true; + } + int table_idx = 0; + locate_xx(lower, upper, _max, stride0, stride1, xx, table_idx); + for (int kk = 0; kk < last_layer_size; kk++) { + // 1.094 timesteps/s + FPTYPE a0 = table[table_idx * last_layer_size * 6 + 6 * kk + 0]; + FPTYPE a1 = table[table_idx * last_layer_size * 6 + 6 * kk + 1]; + FPTYPE a2 = table[table_idx * last_layer_size * 6 + 6 * kk + 2]; + FPTYPE a3 = table[table_idx * last_layer_size * 6 + 6 * kk + 3]; + FPTYPE a4 = table[table_idx * last_layer_size * 6 + 6 * kk + 4]; + FPTYPE a5 = table[table_idx * last_layer_size * 6 + 6 * kk + 5]; + FPTYPE var = a0 + a1 * xx + a2 * xx * xx + a3 * xx * xx * xx + a4 * xx * xx * xx * xx + a5 * xx * xx * xx * xx * xx; + if (unloop) { + out[ii * last_layer_size * 4 + 0 * last_layer_size + kk] += (nnei - jj) * var * ll[0]; + out[ii * last_layer_size * 4 + 1 * last_layer_size + kk] += (nnei - jj) * var * ll[1]; + out[ii * last_layer_size * 4 + 2 * last_layer_size + kk] += (nnei - jj) * var * ll[2]; + out[ii * last_layer_size * 4 + 3 * last_layer_size + kk] += (nnei - jj) * var * ll[3]; + } + else { + out[ii * last_layer_size * 4 + 0 * last_layer_size + kk] += var * ll[0]; + out[ii * last_layer_size * 4 + 1 * last_layer_size + kk] 
+= var * ll[1]; + out[ii * last_layer_size * 4 + 2 * last_layer_size + kk] += var * ll[2]; + out[ii * last_layer_size * 4 + 3 * last_layer_size + kk] += var * ll[3]; + } + } + if (unloop) break; + } + } +} + +template +void TabulateFusionGradCPULauncher(const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const FPTYPE * dy, const int nloc, const int nnei, const int last_layer_size, FPTYPE * dy_dx, FPTYPE * dy_df) { + // std::cout << "I'm in tabulate gradient @CPU!" << std::endl; + memset(dy_dx, 0.0, sizeof(FPTYPE) * nloc * nnei); + memset(dy_df, 0.0, sizeof(FPTYPE) * nloc * nnei * 4); + FPTYPE const lower = table_info[0]; + FPTYPE const upper = table_info[1]; + FPTYPE const _max = table_info[2]; + FPTYPE const stride0 = table_info[3]; + FPTYPE const stride1 = table_info[4]; + // for every atom, execute a small gemm~ + // FPTYPE * res = new FPTYPE[4 * last_layer_size]; + #pragma omp parallel for + for (int ii = 0; ii < nloc; ii++) { + FPTYPE ll[4]; + FPTYPE rr[4]; + FPTYPE ago = in[ii * nnei + nnei - 1]; + bool unloop = false; + for (int jj = 0; jj < nnei; jj++) { + // construct the dy/dx + ll[0] = ff[ii * nnei * 4 + jj * 4 + 0]; + ll[1] = ff[ii * nnei * 4 + jj * 4 + 1]; + ll[2] = ff[ii * nnei * 4 + jj * 4 + 2]; + ll[3] = ff[ii * nnei * 4 + jj * 4 + 3]; + FPTYPE xx = in[ii * nnei + jj]; + if (ago == xx) { + unloop = true; + } + int table_idx = 0; + locate_xx(lower, upper, _max, stride0, stride1, xx, table_idx); + FPTYPE grad = 0.0; + for (int kk = 0; kk < last_layer_size; kk++) { + rr[0] = dy[ii * last_layer_size * 4 + 0 * last_layer_size + kk]; + rr[1] = dy[ii * last_layer_size * 4 + 1 * last_layer_size + kk]; + rr[2] = dy[ii * last_layer_size * 4 + 2 * last_layer_size + kk]; + rr[3] = dy[ii * last_layer_size * 4 + 3 * last_layer_size + kk]; + // 1.094 timesteps/s + FPTYPE a0 = table[table_idx * last_layer_size * 6 + 6 * kk + 0]; + FPTYPE a1 = table[table_idx * last_layer_size * 6 + 6 * kk + 1]; + FPTYPE a2 = table[table_idx * 
last_layer_size * 6 + 6 * kk + 2]; + FPTYPE a3 = table[table_idx * last_layer_size * 6 + 6 * kk + 3]; + FPTYPE a4 = table[table_idx * last_layer_size * 6 + 6 * kk + 4]; + FPTYPE a5 = table[table_idx * last_layer_size * 6 + 6 * kk + 5]; + FPTYPE res = a0 + a1 * xx + a2 * xx * xx + a3 * xx * xx * xx + a4 * xx * xx * xx * xx + a5 * xx * xx * xx * xx * xx; + if (unloop) { + grad += (a1 + 2 * a2 * xx + 3 * a3 * xx * xx + 4 * a4 * xx * xx * xx + 5 * a5 * xx * xx * xx * xx) * dot(ll, rr) * (nnei - jj); + dy_df[ii * nnei * 4 + jj * 4 + 0] += res * rr[0] * (nnei - jj); + dy_df[ii * nnei * 4 + jj * 4 + 1] += res * rr[1] * (nnei - jj); + dy_df[ii * nnei * 4 + jj * 4 + 2] += res * rr[2] * (nnei - jj); + dy_df[ii * nnei * 4 + jj * 4 + 3] += res * rr[3] * (nnei - jj); + } + else { + grad += (a1 + 2 * a2 * xx + 3 * a3 * xx * xx + 4 * a4 * xx * xx * xx + 5 * a5 * xx * xx * xx * xx) * dot(ll, rr); + dy_df[ii * nnei * 4 + jj * 4 + 0] += res * rr[0]; + dy_df[ii * nnei * 4 + jj * 4 + 1] += res * rr[1]; + dy_df[ii * nnei * 4 + jj * 4 + 2] += res * rr[2]; + dy_df[ii * nnei * 4 + jj * 4 + 3] += res * rr[3]; + } + } + dy_dx[ii * nnei + jj] = grad; + if (unloop) break; + } + } +} + +template +void TabulateCheckerCPULauncher(const FPTYPE * table_info, const FPTYPE * in, int * out, const int nloc, const int nnei) { + FPTYPE const lower = table_info[0]; + FPTYPE const upper = table_info[1]; + FPTYPE const _max = table_info[2]; + FPTYPE const stride0 = table_info[3]; + FPTYPE const stride1 = table_info[4]; + // for every atom, execute a small gemm~ + // FPTYPE * res = new FPTYPE[4 * last_layer_size]; + int Csub = 0; // summation of second table approximate; + int Dsub = 0; // summation of the endpoint approximate; + for (int ii = 0; ii < nloc; ii++) { + for (int jj = 0; jj < nnei; jj++) { + FPTYPE xx = in[ii * nnei + jj]; + if (xx < lower || xx > _max) { + Csub += 1; + } + else if (xx >= upper && xx <= _max) { + Dsub += 1; + } + } + } + if(Csub > 0) { + std::cout << "# DEEPMD: warning! 
some values [" << Csub << "/" << nloc * nnei << "] overflow the range of the table, using the endpoint approximate processing.." << std::endl; + } + if(Dsub > 0) { + std::cout << "# DEEPMD: warning! some values [" << Dsub << "/" << nloc * nnei << "] overflow the range of the table, using second table approximate processing.." << std::endl; + } +} + +#if GOOGLE_CUDA +template +void TabulateFusionGPULauncher(const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const int nloc, const int nnei, const int last_layer_size, FPTYPE * out) { + TabulateFusionGPUExecuteFunctor()(table, table_info, in, ff, nloc, nnei, last_layer_size, out); +} + +template +void TabulateFusionGradGPULauncher(const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const FPTYPE * dy, const int nloc, const int nnei, const int last_layer_size, FPTYPE * dy_dx, FPTYPE * dy_df) { + TabulateFusionGradGPUExecuteFunctor()(table, table_info, in, ff, dy, nloc, nnei, last_layer_size, dy_dx, dy_df); +} + +template +void TabulateCheckerGPULauncher(const FPTYPE * table_info, const FPTYPE * in, int * out, const int nloc, const int nnei) { + TabulateCheckerGPUExecuteFunctor()(table_info, in, out, nloc, nnei); +} +#endif // GOOGLE_CUDA +// ****************************************************************************** +// end of custome op Tabulate +// ****************************************************************************** diff --git a/source/lib/include/DeviceFunctor.h b/source/lib/include/DeviceFunctor.h index d51d617f84..f482545df3 100644 --- a/source/lib/include/DeviceFunctor.h +++ b/source/lib/include/DeviceFunctor.h @@ -7,6 +7,7 @@ typedef unsigned long long int_64; #define SQRT_2_PI 0.7978845608028654 +#define TPB 256 #define cudaErrcheck(res) {cudaAssert((res), __FILE__, __LINE__);} inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { @@ -59,4 +60,19 @@ struct GeluGradGPUExecuteFunctor { template 
struct GeluGradGradGPUExecuteFunctor { void operator()(const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, const int size); +}; + +template +struct TabulateFusionGPUExecuteFunctor { + void operator()(const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const int nloc, const int nnei, const int last_layer_size, FPTYPE * out); +}; + +template +struct TabulateFusionGradGPUExecuteFunctor { + void operator()(const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const FPTYPE * dy, const int nloc, const int nnei, const int last_layer_size, FPTYPE * dy_dx, FPTYPE * dy_df); +}; + +template +struct TabulateCheckerGPUExecuteFunctor { + void operator()(const FPTYPE * table_info, const FPTYPE * in, int * out, const int nloc, const int nnei); }; \ No newline at end of file diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index 25f125d6be..0e10c24bb4 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -3,8 +3,8 @@ set(OP_LIB ${PROJECT_SOURCE_DIR}/lib/src/SimulationRegion.cpp ${PROJECT_SOURCE_DIR}/lib/src/NeighborList.cpp) set (OP_CXX_FLAG -D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI} ) -file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_a_ef.cc descrpt_se_a_ef.cc descrpt_se_a_ef_para.cc descrpt_se_a_ef_vert.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu.cc map_aparam.cc) -file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_multi_device.cc descrpt_se_r_multi_device.cc tab_inter.cc prod_force_se_a_multi_device.cc prod_virial_se_a_multi_device.cc prod_force_se_r_multi_device.cc prod_virial_se_r_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc) +file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_a_ef.cc 
descrpt_se_a_ef.cc descrpt_se_a_ef_para.cc descrpt_se_a_ef_vert.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu.cc map_aparam.cc data_info.cc unaggregated_grad.cc tabulate.cc) +file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_multi_device.cc descrpt_se_r_multi_device.cc tab_inter.cc prod_force_se_a_multi_device.cc prod_virial_se_a_multi_device.cc prod_force_se_r_multi_device.cc prod_virial_se_r_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc tabulate_multi_device.cc) file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_se_a_grad.cc prod_force_se_r_grad.cc prod_virial_grad.cc prod_virial_se_a_grad.cc prod_virial_se_r_grad.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) file(GLOB OP_PY *.py) diff --git a/source/op/_tabulate_grad.py b/source/op/_tabulate_grad.py new file mode 100644 index 0000000000..6f8ba1f8bc --- /dev/null +++ b/source/op/_tabulate_grad.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +""" +Gradients for tabulate. +""" + +from tensorflow.python.framework import ops +from deepmd.env import op_module +from deepmd.env import tf +# from deepmd.DescrptSeATabulate import last_layer_size + +# refine is needed! +# accurate gradient is needed! +# 'tabulate_one_side' is needed! +@ops.RegisterGradient("TabulateGrad") +def _tabulate_grad_cc (op, dy): + return [None, dy] + +@ops.RegisterGradient("TabulateFusionGrad") +def _tabulate_grad_cc (op, dy, dy_): + return [None, None, dy, dy_, None, None] + +# old implementations here. 
+ +@ops.RegisterGradient("Tabulate") +def _tabulate_grad_cc (op, dy, dy_): + dy = op_module.tabulate_grad(dy, op.outputs[1]) + return [None, None, dy] + +@ops.RegisterGradient("TabulateFusion") +def _tabulate_fusion_grad_cc (op, dy): + dy_dx, dy_df = op_module.tabulate_fusion_grad(op.inputs[0], op.inputs[1], op.inputs[2], op.inputs[3], dy, op.outputs[0]) + return [None, None, dy_dx, dy_df] \ No newline at end of file diff --git a/source/op/cuda/CMakeLists.txt b/source/op/cuda/CMakeLists.txt index 89dd0b5922..20ef4d672e 100644 --- a/source/op/cuda/CMakeLists.txt +++ b/source/op/cuda/CMakeLists.txt @@ -28,6 +28,7 @@ if (${CUDA_VERSION_MAJOR} GREATER "10") -gencode arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX 1050, GTX 1030, Titan Xp, Tesla P40, Tesla P4, Discrete GPU on the NVIDIA Drive PX2 -gencode arch=compute_70,code=sm_70; # Volta - GV100/Tesla V100, GTX 1180 (GV104) -gencode arch=compute_75,code=sm_75; # Turing - RTX 2080, Titan RTX, Quadro R8000 + -gencode arch=compute_80,code=sm_86; # Anpere - RTX 2080, Titan RTX, Quadro R8000 -O3; -Xcompiler -fPIC; ) elseif (${CUDA_VERSION_MAJOR} STREQUAL "10") @@ -80,7 +81,7 @@ else () endif() set (SOURCE_FILES - descrpt_se_a.cu descrpt_se_r.cu prod_force_se_a.cu prod_force_se_r.cu prod_virial_se_a.cu prod_virial_se_r.cu gelu.cu + descrpt_se_a.cu descrpt_se_r.cu prod_force_se_a.cu prod_force_se_r.cu prod_virial_se_a.cu prod_virial_se_r.cu gelu.cu tabulate.cu ) cuda_add_library(deepmd_op_cuda SHARED ${SOURCE_FILES}) diff --git a/source/op/cuda/descrpt_se_a.cu b/source/op/cuda/descrpt_se_a.cu index a528c4c477..999d4c2b39 100644 --- a/source/op/cuda/descrpt_se_a.cu +++ b/source/op/cuda/descrpt_se_a.cu @@ -147,8 +147,10 @@ __global__ void format_nlist_fill_b_se_a(int * nlist, } //it's ok! 
-template -__global__ void compute_descriptor_se_a (FPTYPE* descript, +template< + typename FPTYPE, + int THREADS_PER_BLOCK> +__global__ void compute_descriptor_se_a(FPTYPE* descript, const int ndescrpt, FPTYPE* descript_deriv, const int descript_deriv_size, @@ -164,67 +166,77 @@ __global__ void compute_descriptor_se_a (FPTYPE* descript, const float rmax, const int sec_a_size) { - // <<>> - const unsigned int idx = blockIdx.x; - const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; - const int idx_deriv = idy * 4 * 3; // 4 components time 3 directions - const int idx_value = idy * 4; // 4 components - if (idy >= sec_a_size) {return;} + // <<>> + const unsigned int bid = blockIdx.x; + const unsigned int tid = threadIdx.x; + // usually false... + if (tid >= sec_a_size) { + return; + } + // const int idx_deriv = idy * 4 * 3; // 4 components time 3 directions + // const int idx_value = idy * 4; // 4 components + int * row_nlist = nlist + bid * nlist_size; + FPTYPE * row_rij = rij + bid * rij_size; + FPTYPE * row_descript = descript + bid * ndescrpt; + FPTYPE * row_descript_deriv = descript_deriv + bid * descript_deriv_size; - // else {return;} - FPTYPE * row_descript = descript + idx * ndescrpt; - FPTYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; - FPTYPE * row_rij = rij + idx * rij_size; - int * row_nlist = nlist + idx * nlist_size; + for (int ii = tid; ii < sec_a_size; ii += THREADS_PER_BLOCK) { + const int idx_value = ii * 4; // 4 components + const int idx_deriv = ii * 12; // 4 components time 3 directions + if (row_nlist[ii] >= 0) { + FPTYPE rr[3] = {0}; + FPTYPE dd[4] = {0}; + FPTYPE vv[12] = {0}; + const int & j_idx = row_nlist[ii]; + for (int kk = 0; kk < 3; kk++) { + rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk]; + row_rij[ii * 3 + kk] = rr[kk]; + } + // const FPTYPE * rr = &row_rij[ii * 3]; + FPTYPE nr2 = dev_dot(rr, rr); + FPTYPE inr = 1./sqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = 
inr2 * inr2; + FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; + spline5_switch(sw, dsw, nr, rmin, rmax); + dd[0] = (1./nr) ;//* sw; + dd[1] = (rr[0] / nr2) ;//* sw; + dd[2] = (rr[1] / nr2) ;//* sw; + dd[3] = (rr[2] / nr2) ;//* sw; - if (row_nlist[idy] >= 0) { - const int & j_idx = row_nlist[idy]; - for (int kk = 0; kk < 3; kk++) { - row_rij[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; + vv[0] = (rr[0] * inr3 * sw - dd[0] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; + vv[1] = (rr[1] * inr3 * sw - dd[0] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; + vv[2] = (rr[2] * inr3 * sw - dd[0] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; + // ****deriv of component x/r2 + vv[3] = ((2. * rr[0] * rr[0] * inr4 - inr2) * sw - dd[1] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 3) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 3) % (ndescrpt * 3)) / 3]; + vv[4] = ((2. * rr[0] * rr[1] * inr4 ) * sw - dd[1] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 4) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 4) % (ndescrpt * 3)) / 3]; + vv[5] = ((2. * rr[0] * rr[2] * inr4 ) * sw - dd[1] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 5) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 5) % (ndescrpt * 3)) / 3]; + // ***deriv of component y/r2 + vv[6] = ((2. * rr[1] * rr[0] * inr4 ) * sw - dd[2] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 6) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 6) % (ndescrpt * 3)) / 3]; + vv[7] = ((2. * rr[1] * rr[1] * inr4 - inr2) * sw - dd[2] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 7) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 7) % (ndescrpt * 3)) / 3]; + vv[8] = ((2. 
* rr[1] * rr[2] * inr4 ) * sw - dd[2] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 8) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 8) % (ndescrpt * 3)) / 3]; + // ***deriv of component z/r2 + vv[9] = ((2. * rr[2] * rr[0] * inr4 ) * sw - dd[3] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 9) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 9) % (ndescrpt * 3)) / 3]; + vv[10]= ((2. * rr[2] * rr[1] * inr4 ) * sw - dd[3] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 10) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 10) % (ndescrpt * 3)) / 3]; + vv[11]= ((2. * rr[2] * rr[2] * inr4 - inr2) * sw - dd[3] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 11) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 11) % (ndescrpt * 3)) / 3]; + // 4 value components + dd[0] *= sw; // * descript[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + idx_value + 0]; + dd[1] *= sw; // * descript[idx * ndescrpt + idx_value + 1]);// - avg[type[idx] * ndescrpt + idx_value + 1]) / std[type[idx] * ndescrpt + idx_value + 1]; + dd[2] *= sw; // * descript[idx * ndescrpt + idx_value + 2]);// - avg[type[idx] * ndescrpt + idx_value + 2]) / std[type[idx] * ndescrpt + idx_value + 2]; + dd[3] *= sw; // * descript[idx * ndescrpt + idx_value + 3]);// - avg[type[idx] * ndescrpt + idx_value + 3]) / std[type[idx] * ndescrpt + idx_value + 3]; + for (int ii = 0; ii < 12; ii++) { + row_descript_deriv[idx_deriv + ii] = vv[ii] / std[type[bid] * ndescrpt + idx_value + ii / 3]; + } + for (int ii = 0; ii < 4; ii++) { + row_descript[idx_value + ii] = (dd[ii] - avg[type[bid] * ndescrpt + idx_value + ii]) / std[type[bid] * ndescrpt + idx_value + ii]; + } + } + else { + // TODO: move it to the memset. 
+ row_descript[idx_value] -= avg[type[bid] * ndescrpt + idx_value] / std[type[bid] * ndescrpt + idx_value]; } - const FPTYPE * rr = &row_rij[idy * 3 + 0]; - FPTYPE nr2 = dev_dot(rr, rr); - FPTYPE inr = 1./sqrt(nr2); - FPTYPE nr = nr2 * inr; - FPTYPE inr2 = inr * inr; - FPTYPE inr4 = inr2 * inr2; - FPTYPE inr3 = inr4 * nr; - FPTYPE sw, dsw; - spline5_switch(sw, dsw, nr, rmin, rmax); - row_descript[idx_value + 0] = (1./nr) ;//* sw; - row_descript[idx_value + 1] = (rr[0] / nr2) ;//* sw; - row_descript[idx_value + 2] = (rr[1] / nr2) ;//* sw; - row_descript[idx_value + 3] = (rr[2] / nr2) ;//* sw; - - row_descript_deriv[idx_deriv + 0] = (rr[0] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; - row_descript_deriv[idx_deriv + 1] = (rr[1] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; - row_descript_deriv[idx_deriv + 2] = (rr[2] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; - // ****deriv of component x/r2 - row_descript_deriv[idx_deriv + 3] = ((2. * rr[0] * rr[0] * inr4 - inr2) * sw - row_descript[idx_value + 1] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 3) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 3) % (ndescrpt * 3)) / 3]; - row_descript_deriv[idx_deriv + 4] = ((2. * rr[0] * rr[1] * inr4 ) * sw - row_descript[idx_value + 1] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 4) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 4) % (ndescrpt * 3)) / 3]; - row_descript_deriv[idx_deriv + 5] = ((2. 
* rr[0] * rr[2] * inr4 ) * sw - row_descript[idx_value + 1] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 5) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 5) % (ndescrpt * 3)) / 3]; - // ***deriv of component y/r2 - row_descript_deriv[idx_deriv + 6] = ((2. * rr[1] * rr[0] * inr4 ) * sw - row_descript[idx_value + 2] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 6) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 6) % (ndescrpt * 3)) / 3]; - row_descript_deriv[idx_deriv + 7] = ((2. * rr[1] * rr[1] * inr4 - inr2) * sw - row_descript[idx_value + 2] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 7) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 7) % (ndescrpt * 3)) / 3]; - row_descript_deriv[idx_deriv + 8] = ((2. * rr[1] * rr[2] * inr4 ) * sw - row_descript[idx_value + 2] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 8) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 8) % (ndescrpt * 3)) / 3]; - // ***deriv of component z/r2 - row_descript_deriv[idx_deriv + 9] = ((2. * rr[2] * rr[0] * inr4 ) * sw - row_descript[idx_value + 3] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 9) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 9) % (ndescrpt * 3)) / 3]; - row_descript_deriv[idx_deriv +10] = ((2. * rr[2] * rr[1] * inr4 ) * sw - row_descript[idx_value + 3] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 10) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 10) % (ndescrpt * 3)) / 3]; - row_descript_deriv[idx_deriv +11] = ((2. 
* rr[2] * rr[2] * inr4 - inr2) * sw - row_descript[idx_value + 3] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 11) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 11) % (ndescrpt * 3)) / 3]; - // 4 value components - row_descript[idx_value + 0] *= sw; // * descript[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + idx_value + 0]; - row_descript[idx_value + 1] *= sw; // * descript[idx * ndescrpt + idx_value + 1]);// - avg[type[idx] * ndescrpt + idx_value + 1]) / std[type[idx] * ndescrpt + idx_value + 1]; - row_descript[idx_value + 2] *= sw; // * descript[idx * ndescrpt + idx_value + 2]);// - avg[type[idx] * ndescrpt + idx_value + 2]) / std[type[idx] * ndescrpt + idx_value + 2]; - row_descript[idx_value + 3] *= sw; // * descript[idx * ndescrpt + idx_value + 3]);// - avg[type[idx] * ndescrpt + idx_value + 3]) / std[type[idx] * ndescrpt + idx_value + 3]; - } - - for (int ii = 0; ii < 4; ii++) { - row_descript[idx_value + ii] = (row_descript[idx_value + ii] - avg[type[idx] * ndescrpt + idx_value + ii]) / std[type[idx] * ndescrpt + idx_value + ii]; - } - // idy nloc, idx ndescrpt * 3 - // descript_deriv[idy * ndescrpt * 3 + idx] = (descript_deriv_dev[idy * (ndescrpt * 3) + idx]) / std[type[idy] * ndescrpt + idx / 3]; - for (int ii = 0; ii < 12; ii++) { - row_descript_deriv[idx_deriv + ii] /= std[type[idx] * ndescrpt + (idx_deriv + ii) / 3]; } } @@ -401,26 +413,7 @@ void DescrptSeAGPUExecuteFunctor::operator()(const FPTYPE * coord, const ); } - const int nblock_ = (sec_a.back() + LEN -1) / LEN; - dim3 block_grid(nloc, nblock_); - dim3 thread_grid(1, LEN); - compute_descriptor_se_a<<>> ( - descript, - ndescrpt, - descript_deriv, - ndescrpt * 3, - rij, - nnei * 3, - type, - avg, - std, - nlist, - nnei, - coord, - rcut_r_smth, - rcut_r, - sec_a.back() - ); + compute_descriptor_se_a <<>> (descript, ndescrpt, descript_deriv, ndescrpt * 3, rij, nnei * 3, type, avg, std, nlist, nnei, coord, rcut_r_smth, rcut_r, 
sec_a.back()); } template struct DescrptSeAGPUExecuteFunctor; diff --git a/source/op/cuda/tabulate.cu b/source/op/cuda/tabulate.cu new file mode 100644 index 0000000000..a7231b413f --- /dev/null +++ b/source/op/cuda/tabulate.cu @@ -0,0 +1,475 @@ +#include +#include +#include +#include +#include // or equivalently +#include +#include "DeviceFunctor.h" + +#define MM 4 +#define KK 4 +#define TPB 256 +#define WARP_SIZE 32 +#define FULL_MASK 0xffffffff + +template +__forceinline__ +__device__ +void locate_xx(const FPTYPE& lower, const FPTYPE& upper, const FPTYPE& max, const FPTYPE& stride0, const FPTYPE& stride1, FPTYPE& xx, int& table_idx) { + if (xx < lower) { + table_idx = 0; + xx = 0; + } + else if (xx < upper) { + table_idx = (int)((xx - lower) / stride0); + xx -= (table_idx * stride0 + lower); + } + else if (xx < max) { + int first_stride = int((upper - lower) / stride0); + table_idx = first_stride + (int)((xx - upper) / stride1); + xx -= ((table_idx - first_stride) * stride1 + upper); + } + else { + table_idx = int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1; + xx = 0; + } +} + +template +__forceinline__ +__device__ +FPTYPE dot(FPTYPE ll[4], FPTYPE rr[4]) { + return ll[0] * rr[0] + ll[1] * rr[1] + ll[2] * rr[2] + ll[3] * rr[3]; +} + +template +__forceinline__ +__device__ +void warp_reduce(FPTYPE & val) { + for (int offset = 16; offset > 0; offset >>= 1) + val += __shfl_down_sync(FULL_MASK, val, offset); +} + +// last_layer_size must larger than MTILE * KTILE! 
+// TODO: A more flexible implementation of sparse +template < + typename FPTYPE, + int MTILE, + int KTILE> +__global__ void tabulate_fusion(const FPTYPE * table, const FPTYPE * in, const FPTYPE * ff, FPTYPE * out, const FPTYPE lower, const FPTYPE upper, const FPTYPE max, const FPTYPE stride0, const FPTYPE stride1, const int nnei, const int last_layer_size) { + extern __shared__ int _data[]; + int const block_idx = blockIdx.x; // nloc + int const thread_idx = threadIdx.x; // last_layer_size + FPTYPE ago = __shfl_sync(0xffffffff, in[block_idx * nnei + nnei - 1], 0); + bool unloop = false; + int breakpoint = nnei - 1; + // int const warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + // int const lane_idx = threadIdx.x % 32; + // iteratorC for data reuse... + FPTYPE * iteratorC = (FPTYPE*) &_data[0]; + for (int kk = 0; kk < MTILE; kk++) + iteratorC[kk * last_layer_size + thread_idx] = 0.f; + __syncthreads(); + + for (int ii = 0; ii < nnei; ii++) { + FPTYPE var[4]; + FPTYPE xx = in[block_idx * nnei + ii]; + + if (ago == xx) { + unloop = true; + breakpoint = ii; + } + int table_idx = 0; + locate_xx(lower, upper, max, stride0, stride1, xx, table_idx); + var[0] = table[table_idx * last_layer_size * 4 + thread_idx * 4 + 0]; + var[1] = table[table_idx * last_layer_size * 4 + thread_idx * 4 + 1]; + var[2] = table[table_idx * last_layer_size * 4 + thread_idx * 4 + 2]; + var[3] = table[table_idx * last_layer_size * 4 + thread_idx * 4 + 3]; + FPTYPE res = ((var[0] * xx + var[1]) * xx + var[2]) * xx + var[3]; + for (int kk = 0; kk < MTILE; kk++) { + iteratorC[kk * last_layer_size + thread_idx] += (nnei - breakpoint) * ff[block_idx * nnei * MTILE + ii * MTILE + kk] * res; + } + if (unloop) break; + } + for (int ii = 0; ii < MTILE; ii++) { + out[block_idx * MTILE * last_layer_size + ii * last_layer_size + thread_idx] = iteratorC[ii * last_layer_size + thread_idx]; + } +} + +// last_layer_size must larger than MTILE * KTILE! 
+// TODO: A more flexible implementation of sparse + + +template < + typename FPTYPE, + int MTILE, + int KTILE> +__global__ void tabulate_fusion_grad_warp_reduce(const FPTYPE * table, const FPTYPE * in, const FPTYPE * ff, const FPTYPE * dy, FPTYPE * dy_dx, FPTYPE * dy_df, const FPTYPE lower, const FPTYPE upper, const FPTYPE max, const FPTYPE stride0, const FPTYPE stride1, const int nnei, const int last_layer_size) { + extern __shared__ int _data[]; + int const block_idx = blockIdx.x; // nloc + int const thread_idx = threadIdx.x; // KTILE * WARP_SIZE, usally 128 here~ + int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int lane_idx = threadIdx.x % 32; + int breakpoint = nnei - 1; + bool unloop = false; + + FPTYPE * iteratorA = (FPTYPE *)&_data[0]; // dy + for (int ii = 0; ii < MTILE; ii++) { + if (thread_idx < last_layer_size) { + iteratorA[ii * last_layer_size + thread_idx] = dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + thread_idx]; + } + } + __syncthreads(); + FPTYPE ago = __shfl_sync(0xffffffff, in[block_idx * nnei + nnei - 1], 0); + for (int ii = 0; ii < nnei; ii += KTILE) { + FPTYPE xx = in[block_idx * nnei + ii + warp_idx]; + // if (ago == xx) { + // unloop = true; + // breakpoint = ii; + // } + + int table_idx = 0; + locate_xx(lower, upper, max, stride0, stride1, xx, table_idx); + FPTYPE sum[KTILE] = {0.f}; + FPTYPE Csub = 0.f; + for (int jj = lane_idx; jj < last_layer_size; jj += WARP_SIZE) { + // load iteratorB through table + FPTYPE var[KTILE]; + var[0] = table[table_idx * last_layer_size * 4 + jj * 4 + 0]; + var[1] = table[table_idx * last_layer_size * 4 + jj * 4 + 1]; + var[2] = table[table_idx * last_layer_size * 4 + jj * 4 + 2]; + var[3] = table[table_idx * last_layer_size * 4 + jj * 4 + 3]; + FPTYPE tmp = (var[0] * xx + var[1]) * xx + var[2]; + for (int kk = 0; kk < KTILE; kk++) { + sum[kk] += (nnei - breakpoint) * iteratorA[kk * last_layer_size + jj] * (tmp * xx + var[3]); + } + var[2] = ff[block_idx * nnei * MTILE + 
(ii + warp_idx) * 4 + 0] * iteratorA[0 * last_layer_size + jj]; + var[2] += ff[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 1] * iteratorA[1 * last_layer_size + jj]; + var[2] += ff[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 2] * iteratorA[2 * last_layer_size + jj]; + var[2] += ff[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 3] * iteratorA[3 * last_layer_size + jj]; + Csub += (nnei - breakpoint) * ((2.0 * var[0] * xx + var[1]) * xx + tmp) * var[2]; + } + __syncwarp(); + for (int kk = 0; kk < KTILE; kk++) { + warp_reduce(sum[kk]); + } + warp_reduce(Csub); + if (lane_idx == 0) { + for (int kk = 0; kk < KTILE; kk++) { + dy_df[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + kk] = sum[kk]; + } + dy_dx[block_idx * nnei + ii + warp_idx] = Csub; + } + if (unloop) break; + } +} + +template < + typename FPTYPE, + int MTILE, + int KTILE> +__global__ void tabulate_fusion_special(const FPTYPE * table, const FPTYPE * in, const FPTYPE * ff, FPTYPE * out, const FPTYPE lower, const FPTYPE upper, const FPTYPE max, const FPTYPE stride0, const FPTYPE stride1, const int nnei, const int last_layer_size) { + extern __shared__ int _data[]; + int const block_idx = blockIdx.x; // nloc + int const thread_idx = threadIdx.x; // last_layer_size + FPTYPE ago = __shfl_sync(0xffffffff, in[block_idx * nnei + nnei - 1], 0); + bool unloop = false; + int breakpoint = nnei - 1; + + FPTYPE * iteratorC = (FPTYPE*) &_data[0]; + for (int kk = 0; kk < MTILE; kk++) + iteratorC[kk * last_layer_size + thread_idx] = 0.f; + __syncthreads(); + + for (int ii = 0; ii < nnei; ii++) { + FPTYPE var[6]; + FPTYPE xx = in[block_idx * nnei + ii]; + if (xx == ago) { + unloop = true; + breakpoint = ii; + } + int table_idx = 0; + locate_xx(lower, upper, max, stride0, stride1, xx, table_idx); + var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0]; + var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1]; + var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2]; + 
var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3]; + var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4]; + var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5]; + FPTYPE res = var[0] + var[1] * xx + var[2] * xx * xx + var[3] * xx * xx * xx + var[4] * xx * xx * xx * xx + var[5] * xx * xx * xx * xx * xx; + for (int kk = 0; kk < MTILE; kk++) { + iteratorC[kk * last_layer_size + thread_idx] += (nnei - breakpoint) * ff[block_idx * nnei * MTILE + ii * MTILE + kk] * res; + } + if (unloop) break; + } + for (int ii = 0; ii < MTILE; ii++) { + out[block_idx * MTILE * last_layer_size + ii * last_layer_size + thread_idx] = iteratorC[ii * last_layer_size + thread_idx]; + } +} + +template < + typename FPTYPE, + int MTILE, + int KTILE> +__global__ void tabulate_fusion_grad_warp_reduce_special(const FPTYPE * table, const FPTYPE * in, const FPTYPE * ff, const FPTYPE * dy, FPTYPE * dy_dx, FPTYPE * dy_df, const FPTYPE lower, const FPTYPE upper, const FPTYPE max, const FPTYPE stride0, const FPTYPE stride1, const int nnei, const int last_layer_size) { + extern __shared__ int _data[]; + int const block_idx = blockIdx.x; // nloc + int const thread_idx = threadIdx.x; // KTILE * WARP_SIZE, usally 128 here~ + int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int lane_idx = threadIdx.x % 32; + int breakpoint = nnei - 1; + bool unloop = false; + + FPTYPE * iteratorA = (FPTYPE *)&_data[0]; // dy + for (int ii = 0; ii < MTILE; ii++) { + if (thread_idx < last_layer_size) { + iteratorA[ii * last_layer_size + thread_idx] = dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + thread_idx]; + } + } + __syncthreads(); + FPTYPE ago = __shfl_sync(0xffffffff, in[block_idx * nnei + nnei - 1], 0); + for (int ii = 0; ii < nnei; ii += KTILE) { + FPTYPE xx = in[block_idx * nnei + ii + warp_idx]; + if (ago == xx) { + unloop = true; + breakpoint = ii; + } + + int table_idx = 0; + locate_xx(lower, upper, max, stride0, stride1, xx, 
table_idx); + FPTYPE sum[KTILE] = {0.f}; + FPTYPE Csub = 0.f; + for (int jj = lane_idx; jj < last_layer_size; jj += WARP_SIZE) { + // load iteratorB through table + FPTYPE a0 = table[table_idx * last_layer_size * 6 + 6 * jj + 0]; + FPTYPE a1 = table[table_idx * last_layer_size * 6 + 6 * jj + 1]; + FPTYPE a2 = table[table_idx * last_layer_size * 6 + 6 * jj + 2]; + FPTYPE a3 = table[table_idx * last_layer_size * 6 + 6 * jj + 3]; + FPTYPE a4 = table[table_idx * last_layer_size * 6 + 6 * jj + 4]; + FPTYPE a5 = table[table_idx * last_layer_size * 6 + 6 * jj + 5]; + FPTYPE res = a0 + a1 * xx + a2 * xx * xx + a3 * xx * xx * xx + a4 * xx * xx * xx * xx + a5 * xx * xx * xx * xx * xx; + for (int kk = 0; kk < KTILE; kk++) { + sum[kk] += (nnei - breakpoint) * iteratorA[kk * last_layer_size + jj] * res; + } + res = ff[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 0] * iteratorA[0 * last_layer_size + jj]; + res += ff[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 1] * iteratorA[1 * last_layer_size + jj]; + res += ff[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 2] * iteratorA[2 * last_layer_size + jj]; + res += ff[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 3] * iteratorA[3 * last_layer_size + jj]; + Csub += (nnei - breakpoint) * (a1 + 2 * a2 * xx + 3 * a3 * xx * xx + 4 * a4 * xx * xx * xx + 5 * a5 * xx * xx * xx * xx) * res; + } + __syncwarp(); + for (int kk = 0; kk < KTILE; kk++) { + warp_reduce(sum[kk]); + } + warp_reduce(Csub); + if (lane_idx == 0) { + for (int kk = 0; kk < KTILE; kk++) { + dy_df[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + kk] = sum[kk]; + } + dy_dx[block_idx * nnei + ii + warp_idx] = Csub; + } + if (unloop) break; + } +} + +template +__global__ void tabulate_checker(const FPTYPE * in, int * out, const FPTYPE lower, const FPTYPE upper, const FPTYPE max, const int nloc, const int nnei) { + __shared__ int Csub[THREADS_PER_BLOCK]; + __shared__ int Dsub[THREADS_PER_BLOCK]; + int const bid = blockIdx.x; + int const tid = threadIdx.x; + + 
Csub[tid] = 0; + Dsub[tid] = 0; + __syncthreads(); + + for (int ii = tid; ii < nnei; ii += THREADS_PER_BLOCK) { + FPTYPE xx = in[bid * nnei + ii]; + if (xx < lower || xx > max) { + Csub[tid] += 1; + printf("# DEEPMD: level 2 overflow, xx:\t%f\n", xx); + } + else if (xx >= upper && xx <= max) { + Dsub[tid] += 1; + // printf("# DEEPMD: level 1 overflow, xx:\t%f\n", xx); + } + } + __syncthreads(); + // do reduction in shared memory + for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 1) { + if (tid < ii) { + Csub[tid] += Csub[tid + ii]; + Dsub[tid] += Dsub[tid + ii]; + } + __syncthreads(); + } + if (tid == 0) { + out[bid] = Csub[0]; + out[nloc + bid] = Dsub[0]; + } +} + +void TabulateFusionLauncher(const double * table, const double * table_info, const double * in, const double * ff, const int nloc, const int nnei, const int last_layer_size, double * out) { + // std::cout << "I'm in tabulate GPU!" << std::endl; + tabulate_fusion_special <<>>(table, in, ff, out, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); +} +void TabulateFusionLauncher(const float * table, const float * table_info, const float * in, const float * ff, const int nloc, const int nnei, const int last_layer_size, float * out) { + tabulate_fusion_special <<>>(table, in, ff, out, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); +} + +void TabulateFusionGradLauncher(const double * table, const double * table_info, const double * in, const double * ff, const double * dy, const int nloc, const int nnei, const int last_layer_size, double * dy_dx, double * dy_df) { + // cudaMemset(dy_df, 0.0, sizeof(double) * nloc * nnei * 4); + cudaMemset(dy_dx, 0.0, sizeof(double) * nloc * nnei); + cudaMemset(dy_df, 0.0, sizeof(double) * nloc * nnei * 4); + tabulate_fusion_grad_warp_reduce_special <<>>(table, in, ff, dy, dy_dx, dy_df, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, 
last_layer_size); +} +void TabulateFusionGradLauncher(const float * table, const float * table_info, const float * in, const float * ff, const float * dy, const int nloc, const int nnei, const int last_layer_size, float * dy_dx, float * dy_df) { + // cudaMemset(dy_df, 0.0, sizeof(float) * nloc * nnei * 4); + cudaMemset(dy_dx, 0.0, sizeof(float) * nloc * nnei); + cudaMemset(dy_df, 0.0, sizeof(float) * nloc * nnei * 4); + tabulate_fusion_grad_warp_reduce_special <<>>(table, in, ff, dy, dy_dx, dy_df, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); +} + +void TabulateCheckerLauncher(const double * table_info, const double * in, int * out, const int nloc, const int nnei) { + tabulate_checker <<>>(in, out, table_info[0], table_info[1], table_info[2], nloc, nnei); + // Declare, allocate, and initialize device-accessible pointers for input and output + int * d_out = NULL; + int * h_out = NULL; + cudaMalloc((void **)&d_out, sizeof(int)); + h_out = (int*)malloc(sizeof(int)); + // Determine temporary device storage requirements + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, out, d_out, nloc); + + // Allocate temporary storage + cudaMalloc(&d_temp_storage, temp_storage_bytes); + + // Run sum-reduction + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, out, d_out, nloc); + + // d_out <-- [38] + cudaMemcpy(h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost); + + if(h_out[0] > 0) { + std::cout << "# DEEPMD: warning! some values [" << h_out[0] << "/" << nloc * nnei << "] overflow the range of the table, using the endpoint approximate processing.." << std::endl; + } + + // Run sum-reduction + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, out + nloc, d_out, nloc); + + // d_out <-- [38] + cudaMemcpy(h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost); + + if(h_out[0] > 0) { + std::cout << "# DEEPMD: warning! 
some values [" << h_out[0] << "/" << nloc * nnei << "] overflow the range of the table, using second table approximate processing.." << std::endl; + } + + // free the temperary storage + cudaFree(d_out); + cudaFree(d_temp_storage); + free(h_out); +} + +void TabulateCheckerLauncher(const float * table_info, const float * in, int * out, const int nloc, const int nnei) { + tabulate_checker <<>>(in, out, table_info[0], table_info[1], table_info[2], nloc, nnei); + // Declare, allocate, and initialize device-accessible pointers for input and output + int * d_out = NULL; + int * h_out = NULL; + cudaMalloc((void **)&d_out, sizeof(int)); + h_out = (int*)malloc(sizeof(int)); + // Determine temporary device storage requirements + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, out, d_out, nloc); + + // Allocate temporary storage + cudaMalloc(&d_temp_storage, temp_storage_bytes); + + // Run sum-reduction + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, out, d_out, nloc); + + // d_out <-- [38] + cudaMemcpy(h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost); + + if(h_out[0] > 0) { + std::cout << "# DEEPMD: warning! some values [" << h_out[0] << "/" << nloc * nnei << "] overflow the range of the table, using the endpoint approximate processing.." << std::endl; + } + + // Run sum-reduction + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, out + nloc, d_out, nloc); + + // d_out <-- [38] + cudaMemcpy(h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost); + + if(h_out[0] > 0) { + std::cout << "# DEEPMD: warning! some values [" << h_out[0] << "/" << nloc * nnei << "] overflow the range of the table, using second table approximate processing.." 
<< std::endl; + } + + // free the temperary storage + cudaFree(d_out); + cudaFree(d_temp_storage); + free(h_out); +} + +template +void TabulateFusionGPUExecuteFunctor::operator()(const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const int nloc, const int nnei, const int last_layer_size, FPTYPE * out) { + tabulate_fusion_special <<>>(table, in, ff, out, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); +} + +template +void TabulateFusionGradGPUExecuteFunctor::operator()(const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const FPTYPE * dy, const int nloc, const int nnei, const int last_layer_size, FPTYPE * dy_dx, FPTYPE * dy_df) { + cudaErrcheck(cudaMemset(dy_dx, 0.0, sizeof(FPTYPE) * nloc * nnei)); + cudaErrcheck(cudaMemset(dy_df, 0.0, sizeof(FPTYPE) * nloc * nnei * 4)); + tabulate_fusion_grad_warp_reduce_special <<>>(table, in, ff, dy, dy_dx, dy_df, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); +} + +template +void TabulateCheckerGPUExecuteFunctor::operator()(const FPTYPE * table_info, const FPTYPE * in, int * out, const int nloc, const int nnei) { + tabulate_checker <<>>(in, out, table_info[0], table_info[1], table_info[2], nloc, nnei); + // Declare, allocate, and initialize device-accessible pointers for input and output + int * d_out = NULL; + int * h_out = NULL; + cudaMalloc((void **)&d_out, sizeof(int)); + h_out = (int*)malloc(sizeof(int)); + // Determine temporary device storage requirements + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, out, d_out, nloc); + + // Allocate temporary storage + cudaMalloc(&d_temp_storage, temp_storage_bytes); + + // Run sum-reduction + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, out, d_out, nloc); + + // d_out <-- [38] + cudaMemcpy(h_out, d_out, sizeof(int), 
cudaMemcpyDeviceToHost); + + if(h_out[0] > 0) { + std::cout << "# DEEPMD: warning! some values [" << h_out[0] << "/" << nloc * nnei << "] overflow the range of the table, using the endpoint approximate processing.." << std::endl; + } + + // Run sum-reduction + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, out + nloc, d_out, nloc); + + // d_out <-- [38] + cudaMemcpy(h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost); + + if(h_out[0] > 0) { + std::cout << "# DEEPMD: warning! some values [" << h_out[0] << "/" << nloc * nnei << "] overflow the range of the table, using second table approximate processing.." << std::endl; + } + + // free the temperary storage + cudaFree(d_out); + cudaFree(d_temp_storage); + free(h_out); +} + +template struct TabulateFusionGPUExecuteFunctor; +template struct TabulateFusionGPUExecuteFunctor; +template struct TabulateFusionGradGPUExecuteFunctor; +template struct TabulateFusionGradGPUExecuteFunctor; +template struct TabulateCheckerGPUExecuteFunctor; +template struct TabulateCheckerGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/data_info.cc b/source/op/data_info.cc new file mode 100644 index 0000000000..08e0f77691 --- /dev/null +++ b/source/op/data_info.cc @@ -0,0 +1,408 @@ +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include + +#include "ComputeDescriptor.h" +#include "NeighborList.h" + +typedef double boxtensor_t ; +typedef double compute_t; + +using namespace tensorflow; +// using namespace std; + +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +REGISTER_OP("DataInfo") + .Attr("T: {float, double}") + .Input("coord: T") //atomic coordinates + .Input("type: int32") //atomic type + .Input("natoms: int32") //local atomic number; each type atomic number; daizheyingxiangqude atomic numbers + .Input("box : T") + .Input("mesh : int32") + .Input("davg: T") //average value 
of data + .Input("dstd: T") //standard deviation + .Attr("rcut_a: float") //no use + .Attr("rcut_r: float") + .Attr("rcut_r_smth: float") + .Attr("sel_a: list(int)") + .Attr("sel_r: list(int)") //all zero + .Output("descrpt: T") + .Output("descrpt_deriv: T") + .Output("rij: T") + .Output("nlist: int32") + .Output("distance: T") + .Output("max_nbor_size: int32") + .Output("table_range: T"); + +template +class DataInfoOp : public OpKernel { +public: + explicit DataInfoOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("rcut_a", &rcut_a)); + OP_REQUIRES_OK(context, context->GetAttr("rcut_r", &rcut_r)); + OP_REQUIRES_OK(context, context->GetAttr("rcut_r_smth", &rcut_r_smth)); + OP_REQUIRES_OK(context, context->GetAttr("sel_a", &sel_a)); + OP_REQUIRES_OK(context, context->GetAttr("sel_r", &sel_r)); + cum_sum (sec_a, sel_a); + cum_sum (sec_r, sel_r); + ndescrpt_a = sec_a.back() * 4; + ndescrpt_r = sec_r.back() * 1; + ndescrpt = ndescrpt_a + ndescrpt_r; + nnei_a = sec_a.back(); + nnei_r = sec_r.back(); + nnei = nnei_a + nnei_r; + fill_nei_a = (rcut_a < 0); + count_nei_idx_overflow = 0; + } + + void Compute(OpKernelContext* context) override { + counter++; + // Grab the input tensor + int context_input_index = 0; + const Tensor& coord_tensor = context->input(context_input_index++); + const Tensor& type_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + const Tensor& box_tensor = context->input(context_input_index++); + const Tensor& mesh_tensor = context->input(context_input_index++); + const Tensor& avg_tensor = context->input(context_input_index++); + const Tensor& std_tensor = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (coord_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of coord should be 2")); + OP_REQUIRES (context, (type_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of type 
should be 2")); + OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + OP_REQUIRES (context, (box_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of box should be 2")); + OP_REQUIRES (context, (mesh_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of mesh should be 1")); + OP_REQUIRES (context, (avg_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of avg should be 2")); + OP_REQUIRES (context, (std_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of std should be 2")); + OP_REQUIRES (context, (fill_nei_a), errors::InvalidArgument ("Rotational free descriptor only support the case rcut_a < 0")); + OP_REQUIRES (context, (sec_r.back() == 0), errors::InvalidArgument ("Rotational free descriptor only support all-angular information: sel_r should be all zero.")); + + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + auto natoms = natoms_tensor .flat(); + int nloc = natoms(0); + int nall = natoms(1); + int ntypes = natoms_tensor.shape().dim_size(0) - 2; + int nsamples = coord_tensor.shape().dim_size(0); + + // check the sizes + OP_REQUIRES (context, (nsamples == type_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nsamples == box_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (ntypes == avg_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of avg should be ntype")); + OP_REQUIRES (context, (ntypes == std_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of std should be ntype")); + + OP_REQUIRES (context, (nall * 3 == coord_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (nall == type_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of 
atoms should match")); + OP_REQUIRES (context, (9 == box_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of box should be 9")); + OP_REQUIRES (context, (ndescrpt == avg_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of avg should be ndescrpt")); + OP_REQUIRES (context, (ndescrpt == std_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of std should be ndescrpt")); + + int nei_mode = 0; + if (mesh_tensor.shape().dim_size(0) == 16) { + // lammps neighbor list + nei_mode = 3; + } + else if (mesh_tensor.shape().dim_size(0) == 12) { + // user provided extended mesh + nei_mode = 2; + } + else if (mesh_tensor.shape().dim_size(0) == 6) { + // manual copied pbc + assert (nloc == nall); + nei_mode = 1; + } + else if (mesh_tensor.shape().dim_size(0) == 0) { + // no pbc + nei_mode = -1; + } + else { + throw std::runtime_error("invalid mesh tensor"); + } + bool b_pbc = true; + // if region is given extended, do not use pbc + if (nei_mode >= 1 || nei_mode == -1) { + b_pbc = false; + } + bool b_norm_atom = false; + if (nei_mode == 1){ + b_norm_atom = true; + } + + // Create an output tensor + TensorShape descrpt_shape ; + descrpt_shape.AddDim (nsamples); + descrpt_shape.AddDim (nloc * ndescrpt); + TensorShape descrpt_deriv_shape ; + descrpt_deriv_shape.AddDim (nsamples); + descrpt_deriv_shape.AddDim (nloc * ndescrpt * 3); + TensorShape rij_shape ; + rij_shape.AddDim (nsamples); + rij_shape.AddDim (nloc * nnei * 3); + TensorShape nlist_shape ; + nlist_shape.AddDim (nsamples); + nlist_shape.AddDim (nloc * nnei); + TensorShape distance_shape ; + distance_shape.AddDim (nloc * nnei); + TensorShape max_nbor_size_shape ; + max_nbor_size_shape.AddDim (nloc); + TensorShape table_range_shape ; + table_range_shape.AddDim (nloc * nnei); + + int context_output_index = 0; + Tensor* descrpt_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + descrpt_shape, + &descrpt_tensor)); + Tensor* descrpt_deriv_tensor = NULL; 
+ OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + descrpt_deriv_shape, + &descrpt_deriv_tensor)); + Tensor* rij_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + rij_shape, + &rij_tensor)); + Tensor* nlist_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + nlist_shape, + &nlist_tensor)); + + Tensor* distance_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + distance_shape, + &distance_tensor)); + Tensor* max_nbor_size_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + max_nbor_size_shape, + &max_nbor_size_tensor)); + Tensor* table_range_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + table_range_shape, + &table_range_tensor)); + + auto coord = coord_tensor .matrix(); + auto type = type_tensor .matrix(); + auto box = box_tensor .matrix(); + auto mesh = mesh_tensor .flat(); + auto avg = avg_tensor .matrix(); + auto std = std_tensor .matrix(); + auto descrpt = descrpt_tensor ->matrix(); + auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); + auto rij = rij_tensor ->matrix(); + auto nlist = nlist_tensor ->matrix(); + auto distance = distance_tensor ->flat(); + // find a potential bug here! 
+ auto max_nbor_size = max_nbor_size_tensor ->flat(); + auto table_range = table_range_tensor ->flat(); + + for (int ii = 0; ii < static_cast(distance_tensor->NumElements()); ii++) { + distance(ii) = 10000.0; + } + for (int ii = 0; ii < static_cast(max_nbor_size_tensor->NumElements()); ii++) { + max_nbor_size(ii) = 0; + } + for (int ii = 0; ii < static_cast(table_range_tensor->NumElements()); ii++) { + table_range(ii) = 0.0; + } + // // check the types + // int max_type_v = 0; + // for (int ii = 0; ii < natoms; ++ii){ + // if (type(0, ii) > max_type_v) max_type_v = type(0, ii); + // } + // int ntypes = max_type_v + 1; + OP_REQUIRES (context, (ntypes == int(sel_a.size())), errors::InvalidArgument ("number of types should match the length of sel array")); + OP_REQUIRES (context, (ntypes == int(sel_r.size())), errors::InvalidArgument ("number of types should match the length of sel array")); + + for (int kk = 0; kk < nsamples; ++kk){ + // set region + boxtensor_t boxt [9] = {0}; + for (int dd = 0; dd < 9; ++dd) { + boxt[dd] = box(kk, dd); + } + SimulationRegion region; + region.reinitBox (boxt); + + // set & normalize coord + std::vector d_coord3 (nall*3); + for (int ii = 0; ii < nall; ++ii){ + for (int dd = 0; dd < 3; ++dd){ + d_coord3[ii*3+dd] = coord(kk, ii*3+dd); + } + if (b_norm_atom){ + compute_t inter[3]; + region.phys2Inter (inter, &d_coord3[3*ii]); + for (int dd = 0; dd < 3; ++dd){ + if (inter[dd] < 0 ) inter[dd] += 1.; + else if (inter[dd] >= 1) inter[dd] -= 1.; + } + region.inter2Phys (&d_coord3[3*ii], inter); + } + } + + // set type + std::vector d_type (nall); + for (int ii = 0; ii < nall; ++ii) d_type[ii] = type(kk, ii); + + // build nlist + std::vector > d_nlist_a; + std::vector > d_nlist_r; + std::vector nlist_map; + bool b_nlist_map = false; + if (nei_mode == 3) { + int * pilist, *pjrange, *pjlist; + memcpy (&pilist, &mesh(4), sizeof(int *)); + memcpy (&pjrange, &mesh(8), sizeof(int *)); + memcpy (&pjlist, &mesh(12), sizeof(int *)); + int inum = 
mesh(1); + assert (inum == nloc); + d_nlist_a.resize (inum); + d_nlist_r.resize (inum); + for (unsigned ii = 0; ii < inum; ++ii){ + d_nlist_r.reserve (pjrange[inum] / inum + 10); + } + for (unsigned ii = 0; ii < inum; ++ii){ + int i_idx = pilist[ii]; + for (unsigned jj = pjrange[ii]; jj < pjrange[ii+1]; ++jj){ + int j_idx = pjlist[jj]; + d_nlist_r[i_idx].push_back (j_idx); + } + } + } + else if (nei_mode == 2) { + // std::cout << "I'm in nei_mode 2" << std::endl; + std::vector nat_stt = {mesh(1-1), mesh(2-1), mesh(3-1)}; + std::vector nat_end = {mesh(4-1), mesh(5-1), mesh(6-1)}; + std::vector ext_stt = {mesh(7-1), mesh(8-1), mesh(9-1)}; + std::vector ext_end = {mesh(10-1), mesh(11-1), mesh(12-1)}; + std::vector global_grid (3); + for (int dd = 0; dd < 3; ++dd) global_grid[dd] = nat_end[dd] - nat_stt[dd]; + ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, nloc, rcut_a, rcut_r, nat_stt, nat_end, ext_stt, ext_end, region, global_grid); + } + else if (nei_mode == 1) { + // std::cout << "I'm in nei_mode 1" << std::endl; + std::vector bk_d_coord3 = d_coord3; + std::vector bk_d_type = d_type; + std::vector ncell, ngcell; + copy_coord(d_coord3, d_type, nlist_map, ncell, ngcell, bk_d_coord3, bk_d_type, rcut_r, region); + b_nlist_map = true; + std::vector nat_stt(3, 0); + std::vector ext_stt(3), ext_end(3); + for (int dd = 0; dd < 3; ++dd){ + ext_stt[dd] = -ngcell[dd]; + ext_end[dd] = ncell[dd] + ngcell[dd]; + } + ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, nloc, rcut_a, rcut_r, nat_stt, ncell, ext_stt, ext_end, region, ncell); + } + else if (nei_mode == -1){ + ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, rcut_a, rcut_r, NULL); + } + else { + throw std::runtime_error("unknow neighbor mode"); + } + + for (int ii = 0; ii < nloc; ii++) { + max_nbor_size(ii) = d_nlist_r[ii].size(); + } + // loop over atoms, compute descriptors for each atom +#pragma omp parallel for + for (int ii = 0; ii < nloc; ++ii){ + std::vector fmt_nlist_a; + std::vector fmt_nlist_r; + int ret = -1; + if 
(fill_nei_a){ + if ((ret = format_nlist_fill_a (fmt_nlist_a, fmt_nlist_r, d_coord3, ntypes, d_type, region, b_pbc, ii, d_nlist_a[ii], d_nlist_r[ii], rcut_r, sec_a, sec_r)) != -1){ + if (count_nei_idx_overflow == 0) { + std::cout << "WARNING: Radial neighbor list length of type " << ret << " is not enough" << std::endl; + flush(std::cout); + count_nei_idx_overflow ++; + } + } + } + + std::vector d_descrpt_a; + std::vector d_descrpt_a_deriv; + std::vector d_descrpt_r; + std::vector d_descrpt_r_deriv; + std::vector d_rij_a; + std::vector d_rij_r; + compute_descriptor_se_a (d_descrpt_a, + d_descrpt_a_deriv, + d_rij_a, + d_coord3, + ntypes, + d_type, + region, + b_pbc, + ii, + fmt_nlist_a, + sec_a, + rcut_r_smth, + rcut_r); + + // check sizes + assert (d_descrpt_a.size() == ndescrpt_a); + assert (d_descrpt_a_deriv.size() == ndescrpt_a * 3); + assert (d_rij_a.size() == nnei_a * 3); + assert (int(fmt_nlist_a.size()) == nnei_a); + // std::cout << "min:\t" << (0 - avg(0, 0)) / std(0, 0) << std::endl; + // if (counter % 1000 == 0) { + // std::cout << "min:\t" << (0 - avg(0, 0)) / std(0, 0) << std::endl; + // } + // record outputs + for (int jj = 0; jj < ndescrpt_a; ++jj) { + descrpt(kk, ii * ndescrpt + jj) = (d_descrpt_a[jj] - avg(d_type[ii], jj)) / std(d_type[ii], jj); + if (jj % 4 == 0) { + table_range(ii * nnei + jj / 4) = descrpt(kk, ii * ndescrpt + jj); + } + } + for (int jj = 0; jj < ndescrpt_a * 3; ++jj) { + descrpt_deriv(kk, ii * ndescrpt * 3 + jj) = d_descrpt_a_deriv[jj] / std(d_type[ii], jj/3); + } + for (int jj = 0; jj < nnei_a * 3; ++jj){ + rij (kk, ii * nnei * 3 + jj) = d_rij_a[jj]; + if (jj % 3 == 0 && d_rij_a[jj] > 0) { + distance(ii * nnei + jj / 3) = sqrt(d_rij_a[jj] * d_rij_a[jj] + d_rij_a[jj + 1] * d_rij_a[jj + 1] + d_rij_a[jj + 2] * d_rij_a[jj + 2]); + } + } + for (int jj = 0; jj < nnei_a; ++jj){ + int record = fmt_nlist_a[jj]; + if (b_nlist_map && record >= 0) { + record = nlist_map[record]; + } + nlist (kk, ii * nnei + jj) = record; + } + } + } + } 
+private: + int counter = -1; + float rcut_a; + float rcut_r; + float rcut_r_smth; + std::vector sel_r; + std::vector sel_a; + std::vector sec_a; + std::vector sec_r; + int ndescrpt, ndescrpt_a, ndescrpt_r; + int nnei, nnei_a, nnei_r; + bool fill_nei_a; + int count_nei_idx_overflow; + void + cum_sum (std::vector & sec, + const std::vector & n_sel) const { + sec.resize (n_sel.size() + 1); + sec[0] = 0; + for (int ii = 1; ii < sec.size(); ++ii){ + sec[ii] = sec[ii-1] + n_sel[ii-1]; + } + } +}; + +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DataInfo").Device(DEVICE_CPU).TypeConstraint("T"), \ + DataInfoOp); +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/tabulate.cc b/source/op/tabulate.cc new file mode 100644 index 0000000000..14e9b51474 --- /dev/null +++ b/source/op/tabulate.cc @@ -0,0 +1,379 @@ +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/shape_inference.h" + +using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +REGISTER_OP("TabulateFusion") + .Attr("T: {float, double}") + .Input("table: T") + .Input("table_info: T") + .Input("input: T") + .Input("ff: T") + .Attr("last_layer_size: int") + .Output("output: T"); + +REGISTER_OP("TabulateFusionGrad") + .Attr("T: {float, double}") + .Input("table: T") + .Input("table_info: T") + .Input("input: T") + .Input("ff: T") + .Input("dy: T") + .Input("output: T") + .Output("dy_dx: T") + .Output("dy_df: T"); + +void TabulateFusionLauncher(const float * table, const float * table_info, const float * in, const float * ff, const int nloc, const int nnei, const int last_layer_size, float * out); +void TabulateFusionLauncher(const double * table, const double * table_info, const double * in, const double * ff, const int nloc, const int nnei, const int last_layer_size, double 
* out); +void TabulateFusionGradLauncher(const float * table, const float * table_info, const float * in, const float * ff, const float * dy, const int nloc, const int nnei, const int last_layer_size, float * dy_dx, float * dy_df); +void TabulateFusionGradLauncher(const double * table, const double * table_info, const double * in, const double * ff, const double * dy, const int nloc, const int nnei, const int last_layer_size, double * dy_dx, double * dy_df); +void TabulateCheckerLauncher(const float * table_info, const float * in, int * out, const int nloc, const int nnei); +void TabulateCheckerLauncher(const double * table_info, const double * in, int * out, const int nloc, const int nnei); + +template +inline FPTYPE dot(FPTYPE a[4], FPTYPE b[4]) { + return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3]; +} + +/* + This inline function was designed to get the table info and bias value for current input xx! + lower: indicate the lower boundary of the first table; + upper: indicate the upper boundary of the first table as well as the lower boundary of the second table; + max: indicate the upper boundary of the second table; + stride0: indicate the stride of the first table; + stride1: indicate the stride of the second table; + xx: indicate the inputs value; + table_idx: indicate the location of table info of input value xx; +*/ +template +inline void locate_xx(const FPTYPE& lower, const FPTYPE& upper, const FPTYPE& max, const FPTYPE& stride0, const FPTYPE& stride1, FPTYPE& xx, int& table_idx) { + if (xx < lower) { + table_idx = 0; + xx = 0; + } + else if (xx < upper) { + table_idx = (int)((xx - lower) / stride0); + xx -= (table_idx * stride0 + lower); + } + else if (xx < max) { + int first_stride = int((upper - lower) / stride0); + table_idx = first_stride + (int)((xx - upper) / stride1); + xx -= ((table_idx - first_stride) * stride1 + upper); + } + else { + table_idx = int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1; + xx = 0; + } +} + 
+template +struct TabulateFusionFunctor { + void operator()(const CPUDevice& d, const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const int nloc, const int nnei, const int last_layer_size, FPTYPE * out) { + //Currently, Do nothing at all! + // std::cout << "I'm in tabulate @CPU!" << std::endl; + memset(out, 0.0, sizeof(FPTYPE) * nloc * 4 * last_layer_size); + FPTYPE const lower = table_info[0]; + FPTYPE const upper = table_info[1]; + FPTYPE const _max = table_info[2]; + FPTYPE const stride0 = table_info[3]; + FPTYPE const stride1 = table_info[4]; + // for every atom, execute a small gemm~ + // FPTYPE * res = new FPTYPE[4 * last_layer_size]; + #pragma omp parallel for + for (int ii = 0; ii < nloc; ii++) { + FPTYPE ll[4] = {0}; + FPTYPE ago = in[ii * nnei + nnei - 1]; + bool unloop = false; + for (int jj = 0; jj < nnei; jj++) { + ll[0] = ff[ii * nnei * 4 + jj * 4 + 0]; + ll[1] = ff[ii * nnei * 4 + jj * 4 + 1]; + ll[2] = ff[ii * nnei * 4 + jj * 4 + 2]; + ll[3] = ff[ii * nnei * 4 + jj * 4 + 3]; + FPTYPE xx = in[ii * nnei + jj]; + if (ago == xx) { + unloop = true; + } + int table_idx = 0; + locate_xx(lower, upper, _max, stride0, stride1, xx, table_idx); + for (int kk = 0; kk < last_layer_size; kk++) { + // 1.094 timesteps/s + FPTYPE a0 = table[table_idx * last_layer_size * 6 + 6 * kk + 0]; + FPTYPE a1 = table[table_idx * last_layer_size * 6 + 6 * kk + 1]; + FPTYPE a2 = table[table_idx * last_layer_size * 6 + 6 * kk + 2]; + FPTYPE a3 = table[table_idx * last_layer_size * 6 + 6 * kk + 3]; + FPTYPE a4 = table[table_idx * last_layer_size * 6 + 6 * kk + 4]; + FPTYPE a5 = table[table_idx * last_layer_size * 6 + 6 * kk + 5]; + FPTYPE var = a0 + a1 * xx + a2 * xx * xx + a3 * xx * xx * xx + a4 * xx * xx * xx * xx + a5 * xx * xx * xx * xx * xx; + if (unloop) { + out[ii * last_layer_size * 4 + 0 * last_layer_size + kk] += (nnei - jj) * var * ll[0]; + out[ii * last_layer_size * 4 + 1 * last_layer_size + kk] += (nnei - jj) * var * ll[1]; + out[ii 
* last_layer_size * 4 + 2 * last_layer_size + kk] += (nnei - jj) * var * ll[2]; + out[ii * last_layer_size * 4 + 3 * last_layer_size + kk] += (nnei - jj) * var * ll[3]; + } + else { + out[ii * last_layer_size * 4 + 0 * last_layer_size + kk] += var * ll[0]; + out[ii * last_layer_size * 4 + 1 * last_layer_size + kk] += var * ll[1]; + out[ii * last_layer_size * 4 + 2 * last_layer_size + kk] += var * ll[2]; + out[ii * last_layer_size * 4 + 3 * last_layer_size + kk] += var * ll[3]; + } + } + if (unloop) break; + } + } + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const int nloc, const int nnei, const int last_layer_size, FPTYPE * out) { + //Currently, Do nothing at all! + TabulateFusionLauncher(table, table_info, in, ff, nloc, nnei, last_layer_size, out); + } + #endif // GOOGLE_CUDA +}; + +template +struct TabulateFusionGradFunctor { + void operator()(const CPUDevice& d, const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const FPTYPE * dy, const int nloc, const int nnei, const int last_layer_size, FPTYPE * dy_dx, FPTYPE * dy_df) { + // std::cout << "I'm in tabulate gradient @CPU!" 
<< std::endl; + memset(dy_dx, 0.0, sizeof(FPTYPE) * nloc * nnei); + memset(dy_df, 0.0, sizeof(FPTYPE) * nloc * nnei * 4); + FPTYPE const lower = table_info[0]; + FPTYPE const upper = table_info[1]; + FPTYPE const _max = table_info[2]; + FPTYPE const stride0 = table_info[3]; + FPTYPE const stride1 = table_info[4]; + // for every atom, execute a small gemm~ + // FPTYPE * res = new FPTYPE[4 * last_layer_size]; + #pragma omp parallel for + for (int ii = 0; ii < nloc; ii++) { + FPTYPE ll[4]; + FPTYPE rr[4]; + FPTYPE ago = in[ii * nnei + nnei - 1]; + bool unloop = false; + for (int jj = 0; jj < nnei; jj++) { + // construct the dy/dx + ll[0] = ff[ii * nnei * 4 + jj * 4 + 0]; + ll[1] = ff[ii * nnei * 4 + jj * 4 + 1]; + ll[2] = ff[ii * nnei * 4 + jj * 4 + 2]; + ll[3] = ff[ii * nnei * 4 + jj * 4 + 3]; + FPTYPE xx = in[ii * nnei + jj]; + if (ago == xx) { + unloop = true; + } + int table_idx = 0; + locate_xx(lower, upper, _max, stride0, stride1, xx, table_idx); + FPTYPE grad = 0.0; + for (int kk = 0; kk < last_layer_size; kk++) { + rr[0] = dy[ii * last_layer_size * 4 + 0 * last_layer_size + kk]; + rr[1] = dy[ii * last_layer_size * 4 + 1 * last_layer_size + kk]; + rr[2] = dy[ii * last_layer_size * 4 + 2 * last_layer_size + kk]; + rr[3] = dy[ii * last_layer_size * 4 + 3 * last_layer_size + kk]; + // 1.094 timesteps/s + FPTYPE a0 = table[table_idx * last_layer_size * 6 + 6 * kk + 0]; + FPTYPE a1 = table[table_idx * last_layer_size * 6 + 6 * kk + 1]; + FPTYPE a2 = table[table_idx * last_layer_size * 6 + 6 * kk + 2]; + FPTYPE a3 = table[table_idx * last_layer_size * 6 + 6 * kk + 3]; + FPTYPE a4 = table[table_idx * last_layer_size * 6 + 6 * kk + 4]; + FPTYPE a5 = table[table_idx * last_layer_size * 6 + 6 * kk + 5]; + FPTYPE res = a0 + a1 * xx + a2 * xx * xx + a3 * xx * xx * xx + a4 * xx * xx * xx * xx + a5 * xx * xx * xx * xx * xx; + + if (unloop) { + grad += (a1 + 2 * a2 * xx + 3 * a3 * xx * xx + 4 * a4 * xx * xx * xx + 5 * a5 * xx * xx * xx * xx) * dot(ll, rr) * (nnei - jj); + 
dy_df[ii * nnei * 4 + jj * 4 + 0] += res * rr[0] * (nnei - jj); + dy_df[ii * nnei * 4 + jj * 4 + 1] += res * rr[1] * (nnei - jj); + dy_df[ii * nnei * 4 + jj * 4 + 2] += res * rr[2] * (nnei - jj); + dy_df[ii * nnei * 4 + jj * 4 + 3] += res * rr[3] * (nnei - jj); + } + else { + grad += (a1 + 2 * a2 * xx + 3 * a3 * xx * xx + 4 * a4 * xx * xx * xx + 5 * a5 * xx * xx * xx * xx) * dot(ll, rr); + dy_df[ii * nnei * 4 + jj * 4 + 0] += res * rr[0]; + dy_df[ii * nnei * 4 + jj * 4 + 1] += res * rr[1]; + dy_df[ii * nnei * 4 + jj * 4 + 2] += res * rr[2]; + dy_df[ii * nnei * 4 + jj * 4 + 3] += res * rr[3]; + } + } + dy_dx[ii * nnei + jj] = grad; + if (unloop) break; + } + } + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const FPTYPE * dy, const int nloc, const int nnei, const int last_layer_size, FPTYPE * dy_dx, FPTYPE * dy_df) { + //Currently, Do nothing at all! + TabulateFusionGradLauncher(table, table_info, in, ff, dy, nloc, nnei, last_layer_size, dy_dx, dy_df); + } + #endif // GOOGLE_CUDA +}; + +template +struct TabulateCheckerFunctor { + void operator()(const CPUDevice& d, const FPTYPE * table_info, const FPTYPE * in, int * out, const int nloc, const int nnei) { + FPTYPE const lower = table_info[0]; + FPTYPE const upper = table_info[1]; + FPTYPE const _max = table_info[2]; + FPTYPE const stride0 = table_info[3]; + FPTYPE const stride1 = table_info[4]; + // for every atom, execute a small gemm~ + // FPTYPE * res = new FPTYPE[4 * last_layer_size]; + int Csub = 0; // summation of second table approximate; + int Dsub = 0; // summation of the endpoint approximate; + for (int ii = 0; ii < nloc; ii++) { + for (int jj = 0; jj < nnei; jj++) { + FPTYPE xx = in[ii * nnei + jj]; + if (xx < lower || xx > _max) { + Csub += 1; + } + else if (xx >= upper && xx <= _max) { + Dsub += 1; + } + } + } + if(Csub > 0) { + std::cout << "# DEEPMD: warning! 
some values [" << Csub << "/" << nloc * nnei << "] overflow the range of the table, using the endpoint approximate processing.." << std::endl; + } + if(Dsub > 0) { + std::cout << "# DEEPMD: warning! some values [" << Dsub << "/" << nloc * nnei << "] overflow the range of the table, using second table approximate processing.." << std::endl; + } + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * table_info, const FPTYPE * in, int * out, const int nloc, const int nnei) { + //Currently, Do nothing at all! + TabulateCheckerLauncher(table_info, in, out, nloc, nnei); + } + #endif // GOOGLE_CUDA +}; + +template +class TabulateFusionOp : public OpKernel { + public: + explicit TabulateFusionOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("last_layer_size", &last_layer_size)); + counter = -1; + } + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& table = context->input(context_input_index++); + const Tensor& table_info = context->input(context_input_index++); + const Tensor& input = context->input(context_input_index++); + const Tensor& ff = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (table.shape().dims() == 2), errors::InvalidArgument ("Dim of table should be 2")); + OP_REQUIRES (context, (input.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (ff.shape().dims() == 3), errors::InvalidArgument ("Dim of input should be 3")); + + TensorShape output_shape; + output_shape.AddDim (ff.shape().dim_size(0)); + output_shape.AddDim (4); + output_shape.AddDim (last_layer_size); + + int context_output_index = 0; + Tensor* output = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + output_shape, + &output)); + + counter++; + if ((int)table_info.flat().data()[5] != -1 && counter % 
(int)table_info.flat().data()[5] == 0) { + Tensor int_temp; + TensorShape int_shape; + int_shape.AddDim(2 * ff.shape().dim_size(0)); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); + TabulateCheckerFunctor()( + context->eigen_device(), + table_info.flat().data(), + input.flat().data(), + int_temp.flat().data(), + ff.shape().dim_size(0), + ff.shape().dim_size(1) + ); + } + + TabulateFusionFunctor()( + context->eigen_device(), // define actually graph execution device + table.flat().data(), + table_info.flat().data(), + input.flat().data(), + ff.flat().data(), + ff.shape().dim_size(0), + ff.shape().dim_size(1), + last_layer_size, + output->flat().data() + ); + } +private: + int counter; + int last_layer_size; +}; + +template +class TabulateFusionGradOp : public OpKernel { + public: + explicit TabulateFusionGradOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // std::cout << "I'm here" << std::endl; + // Grab the input tensor + int context_input_index = 0; + const Tensor& table = context->input(context_input_index++); + const Tensor& table_info = context->input(context_input_index++); + const Tensor& input = context->input(context_input_index++); + const Tensor& ff = context->input(context_input_index++); + const Tensor& dy = context->input(context_input_index++); + const Tensor& output = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (dy.shape().dims() == 3), errors::InvalidArgument ("Dim of table should be 1")); + + int context_output_index = 0; + Tensor* dy_dx = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + input.shape(), + &dy_dx)); + Tensor* dy_df = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + ff.shape(), + &dy_df)); + + TabulateFusionGradFunctor()( + context->eigen_device(), // define actually graph execution device + table.flat().data(), + 
table_info.flat().data(), + input.flat().data(), + ff.flat().data(), + dy.flat().data(), + ff.shape().dim_size(0), + ff.shape().dim_size(1), + output.shape().dim_size(2), + dy_dx->flat().data(), + dy_df->flat().data() + ); + } +private: +}; + +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusion").Device(DEVICE_CPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionGrad").Device(DEVICE_CPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); + +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusion").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionGrad").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionGradOp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA diff --git a/source/op/tabulate_multi_device.cc b/source/op/tabulate_multi_device.cc new file mode 100644 index 0000000000..1e5e8da2a7 --- /dev/null +++ b/source/op/tabulate_multi_device.cc @@ -0,0 +1,194 @@ +#include "common.h" +#include "CustomeOperation.h" + +REGISTER_OP("TabulateFusion") + .Attr("T: {float, double}") + .Input("table: T") + .Input("table_info: T") + .Input("input: T") + .Input("ff: T") + .Attr("last_layer_size: int") + .Output("output: T"); + +REGISTER_OP("TabulateFusionGrad") + .Attr("T: {float, double}") + .Input("table: T") + .Input("table_info: T") + .Input("input: T") + .Input("ff: T") + .Input("dy: T") + .Input("output: T") + .Output("dy_dx: T") + .Output("dy_df: T"); + +template +struct TabulateFusionFunctor { + void operator()(const CPUDevice& d, const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const int nloc, const int nnei, const int last_layer_size, FPTYPE * out) { + TabulateFusionCPULauncher(table, 
table_info, in, ff, nloc, nnei, last_layer_size, out); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const int nloc, const int nnei, const int last_layer_size, FPTYPE * out) { + //Currently, Do nothing at all! + TabulateFusionGPULauncher(table, table_info, in, ff, nloc, nnei, last_layer_size, out); + } + #endif // GOOGLE_CUDA +}; + +template +struct TabulateFusionGradFunctor { + void operator()(const CPUDevice& d, const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const FPTYPE * dy, const int nloc, const int nnei, const int last_layer_size, FPTYPE * dy_dx, FPTYPE * dy_df) { + TabulateFusionGradCPULauncher(table, table_info, in, ff, dy, nloc, nnei, last_layer_size, dy_dx, dy_df); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const FPTYPE * dy, const int nloc, const int nnei, const int last_layer_size, FPTYPE * dy_dx, FPTYPE * dy_df) { + //Currently, Do nothing at all! + TabulateFusionGradGPULauncher(table, table_info, in, ff, dy, nloc, nnei, last_layer_size, dy_dx, dy_df); + } + #endif // GOOGLE_CUDA +}; + +template +struct TabulateCheckerFunctor { + void operator()(const CPUDevice& d, const FPTYPE * table_info, const FPTYPE * in, int * out, const int nloc, const int nnei) { + TabulateCheckerCPULauncher(table_info, in, out, nloc, nnei); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * table_info, const FPTYPE * in, int * out, const int nloc, const int nnei) { + //Currently, Do nothing at all! 
+ TabulateCheckerGPULauncher(table_info, in, out, nloc, nnei); + } + #endif // GOOGLE_CUDA +}; + +template +class TabulateFusionOp : public OpKernel { + public: + explicit TabulateFusionOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("last_layer_size", &last_layer_size)); + counter = -1; + } + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& table = context->input(context_input_index++); + const Tensor& table_info = context->input(context_input_index++); + const Tensor& input = context->input(context_input_index++); + const Tensor& ff = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (table.shape().dims() == 2), errors::InvalidArgument ("Dim of table should be 2")); + OP_REQUIRES (context, (input.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (ff.shape().dims() == 3), errors::InvalidArgument ("Dim of input should be 3")); + + TensorShape output_shape; + output_shape.AddDim (ff.shape().dim_size(0)); + output_shape.AddDim (4); + output_shape.AddDim (last_layer_size); + + int context_output_index = 0; + Tensor* output = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + output_shape, + &output)); + + counter++; + if ((int)table_info.flat().data()[5] != -1 && counter % (int)table_info.flat().data()[5] == 0) { + Tensor int_temp; + TensorShape int_shape; + int_shape.AddDim(2 * ff.shape().dim_size(0)); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); + TabulateCheckerFunctor()( + context->eigen_device(), + table_info.flat().data(), + input.flat().data(), + int_temp.flat().data(), + ff.shape().dim_size(0), + ff.shape().dim_size(1) + ); + } + + TabulateFusionFunctor()( + context->eigen_device(), // define actually graph execution device + table.flat().data(), + table_info.flat().data(), 
+ input.flat().data(), + ff.flat().data(), + ff.shape().dim_size(0), + ff.shape().dim_size(1), + last_layer_size, + output->flat().data() + ); + } +private: + int counter; + int last_layer_size; +}; + +template +class TabulateFusionGradOp : public OpKernel { + public: + explicit TabulateFusionGradOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // std::cout << "I'm here" << std::endl; + // Grab the input tensor + int context_input_index = 0; + const Tensor& table = context->input(context_input_index++); + const Tensor& table_info = context->input(context_input_index++); + const Tensor& input = context->input(context_input_index++); + const Tensor& ff = context->input(context_input_index++); + const Tensor& dy = context->input(context_input_index++); + const Tensor& output = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (dy.shape().dims() == 3), errors::InvalidArgument ("Dim of table should be 1")); + + int context_output_index = 0; + Tensor* dy_dx = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + input.shape(), + &dy_dx)); + Tensor* dy_df = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + ff.shape(), + &dy_df)); + + TabulateFusionGradFunctor()( + context->eigen_device(), // define actually graph execution device + table.flat().data(), + table_info.flat().data(), + input.flat().data(), + ff.flat().data(), + dy.flat().data(), + ff.shape().dim_size(0), + ff.shape().dim_size(1), + output.shape().dim_size(2), + dy_dx->flat().data(), + dy_df->flat().data() + ); + } +private: +}; + +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusion").Device(DEVICE_CPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionGrad").Device(DEVICE_CPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionGradOp); 
+REGISTER_CPU(float); +REGISTER_CPU(double); + +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusion").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionGrad").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionGradOp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA diff --git a/source/op/unaggregated_grad.cc b/source/op/unaggregated_grad.cc new file mode 100644 index 0000000000..bc489c61ff --- /dev/null +++ b/source/op/unaggregated_grad.cc @@ -0,0 +1,320 @@ +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include + +#include "ComputeDescriptor.h" +#include "NeighborList.h" + +using namespace tensorflow; +// using namespace std; + +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +REGISTER_OP("UnaggregatedDyDxS") + .Attr("T: {float, double}") + .Input("y: T") + .Input("w: T") + .Output("dy_dx: T"); + +REGISTER_OP("UnaggregatedDyDx") + .Attr("T: {float, double}") + .Input("z: T") + .Input("w: T") + .Input("dy_dx: T") + .Output("dz_dx: T"); + +REGISTER_OP("UnaggregatedDy2DxS") + .Attr("T: {float, double}") + .Input("y: T") + .Input("dy: T") + .Input("w: T") + .Output("dy2_dx: T"); + +REGISTER_OP("UnaggregatedDy2Dx") + .Attr("T: {float, double}") + .Input("z: T") + .Input("w: T") + .Input("dz_dx: T") + .Input("dy_dx: T") + .Input("dy2_dx: T") + .Output("dz2_dx: T"); + +template +struct UnaggregatedDyDxSFunctor { + void operator()(const CPUDevice& d, const FPTYPE * y, const FPTYPE * w, const int length, const int width, FPTYPE * dy_dx) { + #pragma omp parallel for + for (int ii = 0; ii < length; ii++) { + for (int jj = 0; jj < width; jj++) { + dy_dx[ii * width + jj] = (1 - y[ii * width + jj] * y[ii * width + jj]) * w[jj]; + } + } + } + + #if GOOGLE_CUDA + 
void operator()(const GPUDevice& d, const FPTYPE * y, const FPTYPE * w, const int length, const int width, FPTYPE * dy_dx) { + //Currently, Do nothing at all! + return; + } + #endif // GOOGLE_CUDA +}; + +// calculate the gradient for all variables! +template +struct UnaggregatedDyDxFunctor { + void operator()(const CPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dy_dx, const int length, const int width, const int size, FPTYPE * dz_dx) { + #pragma omp parallel for + for (int kk = 0; kk < length; kk++) { + for (int ii = 0; ii < width; ii++) { + //FPTYPE dz_drou = 1 - (z[kk * width + ii] - y[kk * size + ii % size]) * (z[kk * width + ii] - y[kk * size + ii % size]); + FPTYPE dz_drou = 1 - z[kk * width + ii] * z[kk * width + ii]; + FPTYPE accumulator = 0.0; + for (int jj = 0; jj < size; jj++) { + accumulator += w[jj * width + ii] * dy_dx[kk * size + jj]; + } + dz_drou *= accumulator; + dz_drou += dy_dx[kk * size + ii % size]; + dz_dx[kk * width + ii] = dz_drou; + } + } + } + + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dy_dx, const int length, const int width, const int size, FPTYPE * dz_dx) { + //Currently, Do nothing at all! + return; + } + #endif // GOOGLE_CUDA +}; + +template +struct UnaggregatedDy2DxSFunctor { + void operator()(const CPUDevice& d, const FPTYPE * y, const FPTYPE * dy, const FPTYPE * w, const int length, const int width, FPTYPE * dy2_dx) { + #pragma omp parallel for + for (int ii = 0; ii < length; ii++) { + for (int jj = 0; jj < width; jj++) { + dy2_dx[ii * width + jj] = -2 * w[jj] * y[ii * width + jj] * dy[ii * width + jj]; + } + } + } + + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * y, const FPTYPE * w, const int length, const int width, FPTYPE * dy_dx) { + //Currently, Do nothing at all! + return; + } + #endif // GOOGLE_CUDA +}; + +// calculate the gradient for all variables! 
+template +struct UnaggregatedDy2DxFunctor { + void operator()(const CPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dz_dx, const FPTYPE * dy_dx, const FPTYPE * dy2_dx, const int length, const int width, const int size, FPTYPE * dz2_dx) { + #pragma omp parallel for + for (int kk = 0; kk < length; kk++) { + for (int ii = 0; ii < width; ii++) { + //FPTYPE dz_drou = 1 - (z[kk * width + ii] - y[kk * size + ii % size]) * (z[kk * width + ii] - y[kk * size + ii % size]); + FPTYPE dz_drou = 1 - z[kk * width + ii] * z[kk * width + ii]; + FPTYPE accumulator = 0.0; + for (int jj = 0; jj < size; jj++) { + accumulator += w[jj * width + ii] * dy2_dx[kk * size + jj]; + } + dz_drou *= accumulator; + accumulator = 0.0; + for (int jj = 0; jj < size; jj++) { + accumulator += w[jj * width + ii] * dy_dx[kk * size + jj]; + } + dz_drou -= 2 * z[kk * width + ii] * (dz_dx[kk * width + ii] - dy_dx[kk * size + ii % size]) * accumulator; + dz_drou += dy2_dx[kk * size + ii % size]; + dz2_dx[kk * width + ii] = dz_drou; + } + } + } + + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dz_dx, const FPTYPE * dy_dx, const FPTYPE * dy2_dx, const int length, const int width, const int size, FPTYPE * dz2_dx) { + //Currently, Do nothing at all! 
+ return; + } + #endif // GOOGLE_CUDA +}; + +template +class UnaggregatedDyDxSOp : public OpKernel { + public: + explicit UnaggregatedDyDxSOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& y = context->input(context_input_index++); + const Tensor& w = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (y.shape().dims() == 2), errors::InvalidArgument ("Dim of table should be 1")); + OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + + int context_output_index = 0; + Tensor* dy_dx = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + y.shape(), + &dy_dx)); + + UnaggregatedDyDxSFunctor()( + context->eigen_device(), // define actually graph execution device + y.flat().data(), + w.flat().data(), + y.shape().dim_size(0), + y.shape().dim_size(1), + dy_dx->flat().data() + ); + } +private: +}; + +template +class UnaggregatedDy2DxSOp : public OpKernel { + public: + explicit UnaggregatedDy2DxSOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& y = context->input(context_input_index++); + const Tensor& dy = context->input(context_input_index++); + const Tensor& w = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (y.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (dy.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + + int context_output_index = 0; + Tensor* dy2_dx = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + y.shape(), + 
&dy2_dx)); + + UnaggregatedDy2DxSFunctor()( + context->eigen_device(), // define actually graph execution device + y.flat().data(), + dy.flat().data(), + w.flat().data(), + y.shape().dim_size(0), + y.shape().dim_size(1), + dy2_dx->flat().data() + ); + } +private: +}; + +template +class UnaggregatedDyDxOp : public OpKernel { + public: + explicit UnaggregatedDyDxOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& z = context->input(context_input_index++); + const Tensor& w = context->input(context_input_index++); + const Tensor& dy_dx = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (z.shape().dims() == 2), errors::InvalidArgument ("Dim of table should be 1")); + OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (dy_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + + int context_output_index = 0; + Tensor* dz_dx = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + z.shape(), + &dz_dx)); + + UnaggregatedDyDxFunctor()( + context->eigen_device(), // define actually graph execution device + z.flat().data(), + w.flat().data(), + dy_dx.flat().data(), + z.shape().dim_size(0), + z.shape().dim_size(1), + w.shape().dim_size(0), + dz_dx->flat().data() + ); + } +private: +}; + +template +class UnaggregatedDy2DxOp : public OpKernel { + public: + explicit UnaggregatedDy2DxOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& z = context->input(context_input_index++); + const Tensor& w = context->input(context_input_index++); + const Tensor& dz_dx = context->input(context_input_index++); + const Tensor& dy_dx = 
context->input(context_input_index++); + const Tensor& dy2_dx = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (z.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (dz_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (dy_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (dy2_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + + int context_output_index = 0; + Tensor* dz2_dx = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + z.shape(), + &dz2_dx)); + + UnaggregatedDy2DxFunctor()( + context->eigen_device(), // define actually graph execution device + z.flat().data(), + w.flat().data(), + dz_dx.flat().data(), + dy_dx.flat().data(), + dy2_dx.flat().data(), + z.shape().dim_size(0), + z.shape().dim_size(1), + w.shape().dim_size(0), + dz2_dx->flat().data() + ); + } +private: +}; + +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("UnaggregatedDyDxS").Device(DEVICE_CPU).TypeConstraint("T"), \ + UnaggregatedDyDxSOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("UnaggregatedDyDx").Device(DEVICE_CPU).TypeConstraint("T"), \ + UnaggregatedDyDxOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("UnaggregatedDy2DxS").Device(DEVICE_CPU).TypeConstraint("T"), \ + UnaggregatedDy2DxSOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("UnaggregatedDy2Dx").Device(DEVICE_CPU).TypeConstraint("T"), \ + UnaggregatedDy2DxOp); +REGISTER_CPU(float); +REGISTER_CPU(double); +// Not required in the current situation +// // Register the GPU kernels. 
+// #if GOOGLE_CUDA +// #define REGISTER_GPU(T) \ +// REGISTER_KERNEL_BUILDER( \ +// Name("UnaggregatedDyDxS").Device(DEVICE_GPU).TypeConstraint("T"), \ +// UnaggregatedDyDxSOp); \ +// REGISTER_KERNEL_BUILDER( \ +// Name("UnaggregatedDyDx").Device(DEVICE_GPU).TypeConstraint("T"), \ +// UnaggregatedDyDxOp); +// REGISTER_GPU(float); +// REGISTER_GPU(double); +// #endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/train/CMakeLists.txt b/source/train/CMakeLists.txt index 818b2f4225..176f20d5e4 100644 --- a/source/train/CMakeLists.txt +++ b/source/train/CMakeLists.txt @@ -2,7 +2,7 @@ configure_file("RunOptions.py.in" "${CMAKE_CURRENT_BINARY_DIR}/RunOptions.py" @ONLY) -file(GLOB LIB_PY main.py calculator.py Model*.py Trainer.py ${CMAKE_CURRENT_BINARY_DIR}/RunOptions.py transform.py doc.py) +file(GLOB LIB_PY main.py calculator.py Model*.py Trainer.py ${CMAKE_CURRENT_BINARY_DIR}/RunOptions.py transform.py doc.py compress.py) file(GLOB CLS_PY Local.py Slurm.py) diff --git a/source/train/Model.py b/source/train/Model.py index 9c9cd7ddff..5a6ae6f153 100644 --- a/source/train/Model.py +++ b/source/train/Model.py @@ -114,6 +114,9 @@ def data_stat(self, data): m_all_stat = merge_sys_stat(all_stat) self._compute_input_stat(m_all_stat, protection = self.data_stat_protect) self._compute_output_stat(all_stat) + + if hasattr(self.descrpt, 'compress') and self.descrpt.compress: + self.descrpt.data_info(data) # self.bias_atom_e = data.compute_energy_shift(self.rcond) def _compute_input_stat (self, all_stat, protection = 1e-2) : diff --git a/source/train/Trainer.py b/source/train/Trainer.py index 765a25d206..c16a0e7157 100644 --- a/source/train/Trainer.py +++ b/source/train/Trainer.py @@ -32,6 +32,7 @@ import deepmd._prod_virial_se_r_grad import deepmd._soft_min_force_grad import deepmd._soft_min_virial_grad +import deepmd._tabulate_grad import deepmd._gelu from deepmd.common import j_must_have, ClassArg diff --git a/source/train/compress.py b/source/train/compress.py 
new file mode 100644 index 0000000000..2174171ef6 --- /dev/null +++ b/source/train/compress.py @@ -0,0 +1,52 @@ +import re +import json +import copy +import argparse +import numpy as np +from deepmd.env import tf +from .train import train +from .freeze import freeze +from .transform import transform +from deepmd.common import j_loader +from deepmd.utils.argcheck import normalize + +def compress(args): + jdata = j_loader(args.INPUT) + if not 'model' in jdata.keys(): + jdata = convert_input_v0_v1(jdata, + warning = True, + dump = 'input_v1_compat.json') + + jdata = normalize(jdata) + jdata['model']['descriptor']['compress'] = True + jdata['model']['descriptor']['model_file'] = args.input + jdata['model']['descriptor']['table_info'] = args.table_info + + # check the descriptor type of input file + assert jdata['model']['descriptor']['type'] == 'se_a', 'Model compression error: descriptor type must be se_a!' + + # stage 1: training or refining the model with tabulation + print('\n\n# DEEPMD: stage 1: train or refine the model with tabulation') + args_train = copy.deepcopy(args) + args_train.INPUT = 'compress.json' + args_train.output = 'compress.json' + args_train.init_model = None + args_train.restart = None + jdata['training']['stop_batch'] = jdata['training']['save_freq'] # be careful here, if we want refine the model + with open(args_train.INPUT, 'w') as fp: + json.dump(jdata, fp, indent=4) + train(args_train) + + # stage 2: freeze the model + print('\n\n# DEEPMD: stage 2: freeze the model') + args_frz = copy.deepcopy(args) + args_frz.nodes = None + freeze(args_frz) + + # stage 3: transform the model + print('\n\n# DEEPMD: stage 3: transform the model') + args_transform = copy.deepcopy(args) + args_transform.old_model = args.input + args_transform.raw_model = args.output + args_transform.output = args.output + transform(args_transform) diff --git a/source/train/main.py b/source/train/main.py index b7dd0d0215..248c33f961 100644 --- a/source/train/main.py +++ 
b/source/train/main.py @@ -5,6 +5,7 @@ from .config import config from .test import test from .transform import transform +from .compress import compress from .doc import doc_train_input def main () : @@ -63,6 +64,17 @@ def main () : parser_tst.add_argument("-a", "--atomic-energy", action = 'store_true', help="Test the accuracy of atomic energy") + parser_compress = subparsers.add_parser('compress', help='compress a model') + parser_compress.add_argument('INPUT', + help='the input parameter file in json or yaml format') + parser_compress.add_argument('-i', "--input", default = "frozen_model.pb", type=str, + help = "the original model") + parser_compress.add_argument("-o","--output", default = "frozen_model_tab.pb", type=str, + help='the compressed model') + parser_compress.add_argument('-t', '--table-info', nargs='+', default = [5, 0.01, 0.1, 1], type=float) + parser_compress.add_argument("-d", "--folder", type=str, default = ".", + help="path to checkpoint folder") + parser_train = subparsers.add_parser('doc-train-input', help='print the documentation (in rst format) of input training parameters.') @@ -81,6 +93,8 @@ def main () : test(args) elif args.command == 'transform' : transform(args) + elif args.command == 'compress' : + compress(args) elif args.command == 'doc-train-input' : doc_train_input(args) else : diff --git a/source/train/transform.py b/source/train/transform.py index 1d587ae531..19efd42976 100644 --- a/source/train/transform.py +++ b/source/train/transform.py @@ -44,8 +44,8 @@ def transform_graph(raw_graph,old_graph): raw_graph_node = load_transform_node(raw_graph_def) old_graph_node = load_transform_node(old_graph_def) - if len(raw_graph_node) != len(old_graph_node): - raise RuntimeError("raw graph and old graph has different network structure") + # if len(raw_graph_node) != len(old_graph_node): + # raise RuntimeError("raw graph and old graph has different network structure") for node in raw_graph_def.node: if node.name in raw_graph_node.keys(): 
@@ -108,7 +108,7 @@ def check_dim(raw_graph_node, old_graph_node, node_name): raw_graph_dim = raw_graph_node[node_name].tensor_shape old_graph_dim = old_graph_node[node_name].tensor_shape if raw_graph_dim != old_graph_dim: - raise RuntimeError("old graph and raw graph has different"+node_name+" dim") + raise RuntimeError("old graph " + str(old_graph_dim) + " and raw graph " + str(raw_graph_dim) + " has different " + str(node_name) + " dim") def load_transform_node(graph): From c9525a20252924b8bf3b045f354f92fd1649ea1e Mon Sep 17 00:00:00 2001 From: Denghui Lu Date: Wed, 3 Feb 2021 11:33:55 +0800 Subject: [PATCH 02/20] Update deepmd/utils/tabulate.py Co-authored-by: Jinzhe Zeng --- deepmd/utils/tabulate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepmd/utils/tabulate.py b/deepmd/utils/tabulate.py index 04b3bde7e9..2a453ce584 100644 --- a/deepmd/utils/tabulate.py +++ b/deepmd/utils/tabulate.py @@ -1,7 +1,7 @@ import re import math import numpy as np -import tensorflow.compat.v1 as tf +from deepmd.env import tf from tensorflow.python.platform import gfile from tensorflow.python.framework import tensor_util from tqdm import tqdm @@ -182,4 +182,4 @@ def layer_1(self, x, w, b): def save_data(self): for ii in range(self.ntypes * self.ntypes): net = "filter_" + str(int(ii / self.ntypes)) + "_net_" + str(int(ii % self.ntypes)) - np.savetxt('data_' + str(int(ii)), self.data[net]) \ No newline at end of file + np.savetxt('data_' + str(int(ii)), self.data[net]) From 0e101566a5edef9f598c1927542cb1cdad016b38 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Wed, 3 Feb 2021 15:12:09 +0800 Subject: [PATCH 03/20] add package tqdm into setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b7b104d33c..57335193b5 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ with open(readme_file) as f: readme = f.read() -install_requires=['numpy', 'scipy', 'pyyaml', 'dargs'] +install_requires=['numpy', 
'scipy', 'pyyaml', 'dargs', 'tqdm'] setup_requires=['setuptools_scm', 'scikit-build'] tf_version = os.environ.get('TENSORFLOW_VERSION', '2.3') From a2efd34c80e125cfa67d65de2fcde83d4244d06e Mon Sep 17 00:00:00 2001 From: denghuilu Date: Wed, 3 Feb 2021 16:22:06 +0800 Subject: [PATCH 04/20] add the reference of global_np_float_precision into common.py --- deepmd/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepmd/common.py b/deepmd/common.py index e00485b99a..c46f31bb82 100644 --- a/deepmd/common.py +++ b/deepmd/common.py @@ -3,7 +3,7 @@ import math from deepmd.env import tf from deepmd.env import op_module -from deepmd.RunOptions import global_tf_float_precision +from deepmd.RunOptions import global_tf_float_precision, global_np_float_precision import json import yaml From 20ccc0ea32753c72bc84c0a22075f3e33c7b3582 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Wed, 3 Feb 2021 17:02:58 +0800 Subject: [PATCH 05/20] optimize the performance of gpu implementations of custom ops --- source/op/cuda/descrpt_se_r.cu | 116 ++++++++++++++---------------- source/op/cuda/prod_force_se_a.cu | 53 +++++++++----- source/op/cuda/prod_force_se_r.cu | 53 +++++++++----- 3 files changed, 124 insertions(+), 98 deletions(-) diff --git a/source/op/cuda/descrpt_se_r.cu b/source/op/cuda/descrpt_se_r.cu index 0715f19c5e..33932f4325 100644 --- a/source/op/cuda/descrpt_se_r.cu +++ b/source/op/cuda/descrpt_se_r.cu @@ -147,8 +147,10 @@ __global__ void format_nlist_fill_b_se_r(int * nlist, } //it's ok!
-template -__global__ void compute_descriptor_se_r (FPTYPE* descript, +template< + typename FPTYPE, + int THREADS_PER_BLOCK> +__global__ void compute_descriptor_se_r(FPTYPE* descript, const int ndescrpt, FPTYPE* descript_deriv, const int descript_deriv_size, @@ -164,49 +166,58 @@ __global__ void compute_descriptor_se_r (FPTYPE* descript, const float rmax, const int sec_a_size) { - // <<>> - const unsigned int idx = blockIdx.x; - const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; - const int idx_deriv = idy * 3; // 4 components time 3 directions - const int idx_value = idy; // 4 components - if (idy >= sec_a_size) {return;} + // <<>> + const unsigned int bid = blockIdx.x; + const unsigned int tid = threadIdx.x; + // usually false... + if (tid >= sec_a_size) { + return; + } + // const int idx_deriv = idy * 4 * 3; // 4 components time 3 directions + // const int idx_value = idy * 4; // 4 components + int * row_nlist = nlist + bid * nlist_size; + FPTYPE * row_rij = rij + bid * rij_size; + FPTYPE * row_descript = descript + bid * ndescrpt; + FPTYPE * row_descript_deriv = descript_deriv + bid * descript_deriv_size; - // else {return;} - FPTYPE * row_descript = descript + idx * ndescrpt; - FPTYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; - FPTYPE * row_rij = rij + idx * rij_size; - int * row_nlist = nlist + idx * nlist_size; + for (int ii = tid; ii < sec_a_size; ii += THREADS_PER_BLOCK) { + const int idx_value = ii; // 4 components + const int idx_deriv = ii * 3; // 4 components time 3 directions + if (row_nlist[ii] >= 0) { + FPTYPE rr[3] = {0}; + FPTYPE vv[3] = {0}; + FPTYPE dd = 0; + const int & j_idx = row_nlist[ii]; + for (int kk = 0; kk < 3; kk++) { + rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk]; + row_rij[ii * 3 + kk] = rr[kk]; + } + // const FPTYPE * rr = &row_rij[ii * 3]; + FPTYPE nr2 = dev_dot(rr, rr); + FPTYPE inr = 1./sqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = inr2 * inr2; + 
FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; + spline5_switch(sw, dsw, nr, rmin, rmax); + dd = (1./nr) ;//* sw; - if (row_nlist[idy] >= 0) { - const int & j_idx = row_nlist[idy]; - for (int kk = 0; kk < 3; kk++) { - row_rij[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; + vv[0] = (rr[0] * inr3 * sw - dd * dsw * rr[0] * inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; + vv[1] = (rr[1] * inr3 * sw - dd * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; + vv[2] = (rr[2] * inr3 * sw - dd * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; + + // 4 value components + dd *= sw; // * descript[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + idx_value + 0]; + for (int ii = 0; ii < 3; ii++) { + row_descript_deriv[idx_deriv + ii] = vv[ii] / std[type[bid] * ndescrpt + idx_value + ii / 3]; + } + row_descript[idx_value] = (dd - avg[type[bid] * ndescrpt + idx_value]) / std[type[bid] * ndescrpt + idx_value]; + } + else { + // TODO: move it to the memset. 
+ row_descript[idx_value] -= avg[type[bid] * ndescrpt + idx_value] / std[type[bid] * ndescrpt + idx_value]; } - const FPTYPE * rr = &row_rij[idy * 3 + 0]; - FPTYPE nr2 = dev_dot(rr, rr); - FPTYPE inr = 1./sqrt(nr2); - FPTYPE nr = nr2 * inr; - FPTYPE inr2 = inr * inr; - FPTYPE inr4 = inr2 * inr2; - FPTYPE inr3 = inr4 * nr; - FPTYPE sw, dsw; - spline5_switch(sw, dsw, nr, rmin, rmax); - row_descript[idx_value + 0] = (1./nr) ;//* sw; - - row_descript_deriv[idx_deriv + 0] = (rr[0] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; - row_descript_deriv[idx_deriv + 1] = (rr[1] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; - row_descript_deriv[idx_deriv + 2] = (rr[2] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; - // 4 value components - row_descript[idx_value + 0] *= sw; // * descript[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + idx_value + 0]; - } - - for (int ii = 0; ii < 1; ii++) { - row_descript[idx_value + ii] = (row_descript[idx_value + ii] - avg[type[idx] * ndescrpt + idx_value + ii]) / std[type[idx] * ndescrpt + idx_value + ii]; - } - // idy nloc, idx ndescrpt * 3 - // descript_deriv[idy * ndescrpt * 3 + idx] = (descript_deriv_dev[idy * (ndescrpt * 3) + idx]) / std[type[idy] * ndescrpt + idx / 3]; - for (int ii = 0; ii < 3; ii++) { - row_descript_deriv[idx_deriv + ii] /= std[type[idx] * ndescrpt + (idx_deriv + ii) / 3]; } } @@ -383,26 +394,7 @@ void DescrptSeRGPUExecuteFunctor::operator()(const FPTYPE * coord, const ); } - const int nblock_ = (sec_a.back() + LEN -1) / LEN; - dim3 block_grid(nloc, nblock_); - dim3 thread_grid(1, LEN); - 
compute_descriptor_se_r<<>> ( - descript, - ndescrpt, - descript_deriv, - ndescrpt * 3, - rij, - nnei * 3, - type, - avg, - std, - nlist, - nnei, - coord, - rcut_r_smth, - rcut_r, - sec_a.back() - ); + compute_descriptor_se_r <<>> (descript, ndescrpt, descript_deriv, ndescrpt * 3, rij, nnei * 3, type, avg, std, nlist, nnei, coord, rcut_r_smth, rcut_r, sec_a.back()); } template struct DescrptSeRGPUExecuteFunctor; diff --git a/source/op/cuda/prod_force_se_a.cu b/source/op/cuda/prod_force_se_a.cu index 1667c15f90..84615ff275 100644 --- a/source/op/cuda/prod_force_se_a.cu +++ b/source/op/cuda/prod_force_se_a.cu @@ -14,23 +14,44 @@ static __inline__ __device__ double atomicAdd(double* address, double val) { } #endif -template -__global__ void deriv_wrt_center_atom_se_a(FPTYPE * force, - const FPTYPE * net_deriv, - const FPTYPE * in_deriv, - const int ndescrpt) +template < + typename FPTYPE, + int THREADS_PER_BLOCK> +__global__ void force_deriv_wrt_center_atom_se_a(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int ndescrpt) { - const unsigned int idx = blockIdx.x; - const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; - const unsigned int idz = threadIdx.x; + __shared__ FPTYPE data[THREADS_PER_BLOCK * 3]; + unsigned int bid = blockIdx.x; + unsigned int tid = threadIdx.x; + for (int ii = tid; ii < THREADS_PER_BLOCK * 3; ii += THREADS_PER_BLOCK) { + data[ii] = 0.f; + } - if (idy >= ndescrpt) {return;} - - atomicAdd(force + idx * 3 + idz, -1.0 * net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); + for (int ii = tid; ii < ndescrpt; ii += THREADS_PER_BLOCK) { + for (int jj = 0; jj < 3; jj++) { + data[jj * THREADS_PER_BLOCK + tid] += net_deriv[bid * ndescrpt + ii] * in_deriv[bid * ndescrpt * 3 + ii * 3 + jj]; + } + } + __syncthreads(); + + // do reduction in shared memory + for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 1) { + if (tid < ii) { + for (int jj = 0; jj < 3; jj++) { + data[jj * 
THREADS_PER_BLOCK + tid] += data[jj * THREADS_PER_BLOCK + tid + ii]; + } + } + __syncthreads(); + } + // write result for this block to global memory + if (tid == 0) { + force[bid * 3 + 0] -= data[THREADS_PER_BLOCK * 0]; + force[bid * 3 + 1] -= data[THREADS_PER_BLOCK * 1]; + force[bid * 3 + 2] -= data[THREADS_PER_BLOCK * 2]; + } } template -__global__ void deriv_wrt_neighbors_se_a(FPTYPE * force, +__global__ void force_deriv_wrt_neighbors_se_a(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, @@ -71,17 +92,13 @@ void ProdForceSeAGPUExecuteFunctor::operator()(FPTYPE * force, { // std::cout << "I'm here!" << std::endl; cudaErrcheck(cudaMemset(force, 0.0, sizeof(FPTYPE) * nall * 3)); - const int LEN1 = 256; - const int nblock1 = (ndescrpt + LEN1 -1) / LEN1; - dim3 grid(nloc, nblock1); - dim3 thread(3, LEN1); - deriv_wrt_center_atom_se_a<<>>(force, net_deriv, in_deriv, ndescrpt); + force_deriv_wrt_center_atom_se_a <<>>(force, net_deriv, in_deriv, ndescrpt); const int LEN = 64; int nblock = (nloc + LEN -1) / LEN; dim3 block_grid(nblock, nnei); dim3 thread_grid(LEN, 3, 4); - deriv_wrt_neighbors_se_a<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt, n_a_sel, n_a_shift); + force_deriv_wrt_neighbors_se_a<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt, n_a_sel, n_a_shift); } template struct ProdForceSeAGPUExecuteFunctor; diff --git a/source/op/cuda/prod_force_se_r.cu b/source/op/cuda/prod_force_se_r.cu index 5a4b582dd0..88e2962536 100644 --- a/source/op/cuda/prod_force_se_r.cu +++ b/source/op/cuda/prod_force_se_r.cu @@ -14,23 +14,44 @@ static __inline__ __device__ double atomicAdd(double* address, double val) { } #endif -template -__global__ void deriv_wrt_center_atom_se_r(FPTYPE * force, - const FPTYPE * net_deriv, - const FPTYPE * in_deriv, - const int ndescrpt) +template < + typename FPTYPE, + int THREADS_PER_BLOCK> +__global__ void force_deriv_wrt_center_atom_se_r(FPTYPE * force, const FPTYPE * net_deriv, const 
FPTYPE * in_deriv, const int ndescrpt) { - const unsigned int idx = blockIdx.x; - const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; - const unsigned int idz = threadIdx.x; + __shared__ FPTYPE data[THREADS_PER_BLOCK * 3]; + unsigned int bid = blockIdx.x; + unsigned int tid = threadIdx.x; + for (int ii = tid; ii < THREADS_PER_BLOCK * 3; ii += THREADS_PER_BLOCK) { + data[ii] = 0.f; + } - if (idy >= ndescrpt) {return;} - - atomicAdd(force + idx * 3 + idz, -1.0 * net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); + for (int ii = tid; ii < ndescrpt; ii += THREADS_PER_BLOCK) { + for (int jj = 0; jj < 3; jj++) { + data[jj * THREADS_PER_BLOCK + tid] += net_deriv[bid * ndescrpt + ii] * in_deriv[bid * ndescrpt * 3 + ii * 3 + jj]; + } + } + __syncthreads(); + + // do reduction in shared memory + for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 1) { + if (tid < ii) { + for (int jj = 0; jj < 3; jj++) { + data[jj * THREADS_PER_BLOCK + tid] += data[jj * THREADS_PER_BLOCK + tid + ii]; + } + } + __syncthreads(); + } + // write result for this block to global memory + if (tid == 0) { + force[bid * 3 + 0] -= data[THREADS_PER_BLOCK * 0]; + force[bid * 3 + 1] -= data[THREADS_PER_BLOCK * 1]; + force[bid * 3 + 2] -= data[THREADS_PER_BLOCK * 2]; + } } template -__global__ void deriv_wrt_neighbors_se_r(FPTYPE * force, +__global__ void force_deriv_wrt_neighbors_se_r(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, @@ -66,17 +87,13 @@ void ProdForceSeRGPUExecuteFunctor::operator()(FPTYPE * force, { // std::cout << "I'm here!" 
<< std::endl; cudaErrcheck(cudaMemset(force, 0.0, sizeof(FPTYPE) * nall * 3)); - const int LEN1 = 256; - const int nblock1 = (ndescrpt + LEN1 -1) / LEN1; - dim3 grid(nloc, nblock1); - dim3 thread(3, LEN1); - deriv_wrt_center_atom_se_r<<>>(force, net_deriv, in_deriv, ndescrpt); + force_deriv_wrt_center_atom_se_r <<>>(force, net_deriv, in_deriv, ndescrpt); const int LEN = 64; int nblock = (nloc + LEN -1) / LEN; dim3 block_grid(nblock, nnei); dim3 thread_grid(LEN, 3); - deriv_wrt_neighbors_se_r<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt); + force_deriv_wrt_neighbors_se_r<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt); } template struct ProdForceSeRGPUExecuteFunctor; From ed571e10748f1120c7477d47a21dfcf0f13dc72a Mon Sep 17 00:00:00 2001 From: denghuilu Date: Wed, 3 Feb 2021 19:19:57 +0800 Subject: [PATCH 06/20] throw compression error when resnet_dt is set true for descriptor --- source/train/compress.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/train/compress.py b/source/train/compress.py index 2174171ef6..fb4a505261 100644 --- a/source/train/compress.py +++ b/source/train/compress.py @@ -22,8 +22,9 @@ def compress(args): jdata['model']['descriptor']['model_file'] = args.input jdata['model']['descriptor']['table_info'] = args.table_info - # check the descriptor type of input file + # check the descriptor info of the input file assert jdata['model']['descriptor']['type'] == 'se_a', 'Model compression error: descriptor type must be se_a!' + assert jdata['model']['descriptor']['resnet_dt'] == False, 'Model compression error: descriptor resnet_dt must be false!'
# stage 1: training or refining the model with tabulation print('\n\n# DEEPMD: stage 1: train or refine the model with tabulation') From 043a2f8c178104058e4c9c67157b2e2349c87981 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Wed, 3 Feb 2021 19:28:39 +0800 Subject: [PATCH 07/20] update cmake support for Ampere architecture devices --- source/op/cuda/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/op/cuda/CMakeLists.txt b/source/op/cuda/CMakeLists.txt index 20ef4d672e..96a030909c 100644 --- a/source/op/cuda/CMakeLists.txt +++ b/source/op/cuda/CMakeLists.txt @@ -28,7 +28,8 @@ if (${CUDA_VERSION_MAJOR} GREATER "10") -gencode arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX 1050, GTX 1030, Titan Xp, Tesla P40, Tesla P4, Discrete GPU on the NVIDIA Drive PX2 -gencode arch=compute_70,code=sm_70; # Volta - GV100/Tesla V100, GTX 1180 (GV104) -gencode arch=compute_75,code=sm_75; # Turing - RTX 2080, Titan RTX, Quadro R8000 - -gencode arch=compute_80,code=sm_86; # Anpere - RTX 2080, Titan RTX, Quadro R8000 + -gencode arch=compute_80,code=sm_80; # Ampere - A100 + -gencode arch=compute_86,code=sm_86; # Ampere - RTX 3090 -O3; -Xcompiler -fPIC; ) elseif (${CUDA_VERSION_MAJOR} STREQUAL "10") From b32e57d38e3437041fe43ec357429e4f1b4877d6 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Thu, 4 Feb 2021 22:00:47 +0800 Subject: [PATCH 08/20] optimize the code structure of model compression --- deepmd/descriptor/se_a.py | 207 +++++--------------------- deepmd/utils/data_info.py | 132 ++++++++++++++++ deepmd/utils/tabulate.py | 95 +++++++----- source/lib/include/CustomeOperation.h | 9 +- source/op/cuda/CMakeLists.txt | 4 +- source/op/cuda/tabulate.cu | 21 +-- source/op/data_info.cc | 10 +- source/op/tabulate.cc | 14 +- source/train/Model.py | 3 - source/train/Trainer.py | 24 ++- source/train/compress.py | 2 +- source/train/main.py | 2 +- 12 files changed, 281 insertions(+), 242 deletions(-) create mode 100644
deepmd/utils/data_info.py diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py index d7ee320bd4..2ca03378dd 100644 --- a/deepmd/descriptor/se_a.py +++ b/deepmd/descriptor/se_a.py @@ -11,7 +11,6 @@ from deepmd.env import default_tf_session_config from deepmd.utils.network import embedding_net from deepmd.utils.tabulate import DeepTabulate -from tqdm import tqdm class DescrptSeA (): @@ -114,7 +113,10 @@ def __init__ (self, self.compress = compress self.model_file = model_file self.table_info = table_info - if (self.compress): + if self.compress: + self.distance = 100.0 + self.max_nbor_size = 0 + self.table_range = [-1, 20] self.table = DeepTabulate(self.model_file, self.filter_np_precision, self.type_one_side) self.place_holders = {} @@ -123,7 +125,7 @@ def __init__ (self, sub_graph = tf.Graph() with sub_graph.as_default(): name_pfx = 'd_sea_' - for ii in ['coord', 'box', 'avg', 'std']: + for ii in ['coord', 'box']: self.place_holders[ii] = tf.placeholder(global_np_float_precision, [None, None], name = name_pfx+'t_'+ii) self.place_holders['type'] = tf.placeholder(tf.int32, [None, None], name=name_pfx+'t_type') self.place_holders['natoms_vec'] = tf.placeholder(tf.int32, [self.ntypes+2], name=name_pfx+'t_natoms') @@ -141,19 +143,6 @@ def __init__ (self, rcut_r_smth = self.rcut_r_smth, sel_a = self.sel_a, sel_r = self.sel_r) - descrpt, descrpt_deriv, rij, nlist, self.distance, self.max_nbor_size, self.table_range \ - = op_module.data_info(self.place_holders['coord'], - self.place_holders['type'], - self.place_holders['natoms_vec'], - self.place_holders['box'], - self.place_holders['default_mesh'], - self.place_holders['avg'], - self.place_holders['std'], - rcut_a = self.rcut_a, - rcut_r = self.rcut_r, - rcut_r_smth = self.rcut_r_smth, - sel_a = self.sel_a, - sel_r = self.sel_r) self.sub_sess = tf.Session(graph = sub_graph, config=default_tf_session_config) @@ -358,9 +347,9 @@ def build (self, self.rij = tf.identity(self.rij, name = 'o_rij') self.nlist = 
tf.identity(self.nlist, name = 'o_nlist') - if (self.compress): - self.lower = math.floor(self.lower) - self.upper = math.ceil(self.upper) + if self.compress: + self.lower = math.floor(self.table_range[0]) + self.upper = math.ceil(self.table_range[1]) self.table.build(self.lower, self.upper, self.upper * self.table_info[0], @@ -378,59 +367,6 @@ def build (self, # only used when tensorboard was set as true tf.summary.histogram('embedding_net_output', self.dout) return self.dout - - def data_info(self, data) -> None: - """ - Print the data info(tabulation boundary, the nearest distance of atoms, max neighbor size) of the training data - - Parameters - ---------- - data - The data class that controls input data information - """ - self.lower = 0.0 - self.upper = 0.0 - self.dist = 100.0 - self.max_nbor = 0 - - davg = self.davg - dstd = self.dstd - if davg is None: - davg = np.zeros([self.ntypes, self.ndescrpt]) - if dstd is None: - dstd = np.ones ([self.ntypes, self.ndescrpt]) - - for ii in tqdm(range(len(data.system_dirs)), desc = '# DEEPMD: getting data info'): - for jj in data.data_systems[ii].dirs: - data_set = data.data_systems[ii]._load_set(jj) - for kk in range(np.array(data_set['type']).shape[0]): - dt, mn, tr \ - = self.sub_sess.run([self.distance, self.max_nbor_size, self.table_range], - feed_dict = { - self.place_holders['coord']: np.array(data_set['coord'])[kk].reshape([-1, data.natoms[ii] * 3]), - self.place_holders['type']: np.array(data_set['type'])[kk].reshape([-1, data.natoms[ii]]), - self.place_holders['natoms_vec']: np.array(data.natoms_vec[ii]), - self.place_holders['box']: np.array(data_set['box'])[kk].reshape([-1, 9]), - self.place_holders['default_mesh']: np.array(data.default_mesh[ii]), - self.place_holders['avg']: davg, - self.place_holders['std']: dstd, - }) - dr = np.array([np.min(tr), np.max(tr)]).astype(global_np_float_precision) - dt = np.min(dt) - mn = np.max(mn) - if (dr[0] < self.lower): - self.lower = dr[0] - if (dr[1] > self.upper): - 
self.upper = dr[1] - if (dt < self.dist): - self.dist = dt - if (mn > self.max_nbor): - self.max_nbor = mn - - print('# DEEPMD: training data with lower boundary: ' + str(self.lower)) - print('# DEEPMD: training data with upper boundary: ' + str(self.upper)) - print('# DEEPMD: training data with min distance: ' + str(self.dist)) - print('# DEEPMD: training data with max nborsize: ' + str(self.max_nbor)) def get_rot_mat(self) -> tf.Tensor: """ @@ -507,10 +443,7 @@ def _pass_filter(self, [ 0, start_index* self.ndescrpt], [-1, natoms[2+type_i]* self.ndescrpt] ) inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt]) - if not self.compress: - layer, qmat = self._filter(tf.cast(inputs_i, self.filter_precision), type_i, name='filter_type_'+str(type_i)+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable, activation_fn = self.filter_activation_fn) - else: - layer, qmat = self._compress_filter(tf.cast(inputs_i, self.filter_precision), type_i, name='filter_type_'+str(type_i)+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable, activation_fn = self.filter_activation_fn) + layer, qmat = self._filter(tf.cast(inputs_i, self.filter_precision), type_i, name='filter_type_'+str(type_i)+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable, activation_fn = self.filter_activation_fn) layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_out()]) qmat = tf.reshape(qmat, [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_rot_mat_1() * 3]) output.append(layer) @@ -520,10 +453,7 @@ def _pass_filter(self, inputs_i = inputs inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt]) type_i = -1 - if not self.compress: - layer, qmat = self._filter(tf.cast(inputs_i, self.filter_precision), type_i, name='filter_type_all'+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable, activation_fn = self.filter_activation_fn) - else: - layer, qmat = self._compress_filter(tf.cast(inputs_i, 
self.filter_precision), type_i, name='filter_type_all'+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable, activation_fn = self.filter_activation_fn) + layer, qmat = self._filter(tf.cast(inputs_i, self.filter_precision), type_i, name='filter_type_all'+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable, activation_fn = self.filter_activation_fn) layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[0] * self.get_dim_out()]) qmat = tf.reshape(qmat, [tf.shape(inputs)[0], natoms[0] * self.get_dim_rot_mat_1() * 3]) output.append(layer) @@ -616,99 +546,38 @@ def _filter(self, # with (natom x nei_type_i) x 1 xyz_scatter = tf.reshape(tf.slice(inputs_reshape, [0,0],[-1,1]),[-1,1]) # with (natom x nei_type_i) x out_size - if (type_input, type_i) not in self.exclude_types: - xyz_scatter = embedding_net(xyz_scatter, - self.filter_neuron, - self.filter_precision, - activation_fn = activation_fn, - resnet_dt = self.filter_resnet_dt, - name_suffix = "_"+str(type_i), - stddev = stddev, - bavg = bavg, - seed = seed, - trainable = trainable) - else: - w = tf.zeros((outputs_size[0], outputs_size[-1]), dtype=global_tf_float_precision) - xyz_scatter = tf.matmul(xyz_scatter, w) - # natom x nei_type_i x out_size - xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1]//4, outputs_size[-1])) - - # xyz_scatter_total.append(xyz_scatter) - if type_i == 0 : - xyz_scatter_1 = tf.matmul(tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), xyz_scatter, transpose_a = True) - else : - xyz_scatter_1 += tf.matmul(tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), xyz_scatter, transpose_a = True) - # natom x nei x outputs_size - # xyz_scatter = tf.concat(xyz_scatter_total, axis=1) - # natom x nei x 4 - # inputs_reshape = tf.reshape(inputs, [-1, shape[1]//4, 4]) - # natom x 4 x outputs_size - # xyz_scatter_1 = tf.matmul(inputs_reshape, xyz_scatter, transpose_a = True) - xyz_scatter_1 = xyz_scatter_1 * (4.0 / shape[1]) - # natom x 4 x outputs_size_2 - xyz_scatter_2 
= tf.slice(xyz_scatter_1, [0,0,0],[-1,-1,outputs_size_2]) - # # natom x 3 x outputs_size_2 - # qmat = tf.slice(xyz_scatter_2, [0,1,0], [-1, 3, -1]) - # natom x 3 x outputs_size_1 - qmat = tf.slice(xyz_scatter_1, [0,1,0], [-1, 3, -1]) - # natom x outputs_size_1 x 3 - qmat = tf.transpose(qmat, perm = [0, 2, 1]) - # natom x outputs_size x outputs_size_2 - result = tf.matmul(xyz_scatter_1, xyz_scatter_2, transpose_a = True) - # natom x (outputs_size x outputs_size_2) - result = tf.reshape(result, [-1, outputs_size_2 * outputs_size[-1]]) - - return result, qmat - - def _compress_filter(self, - inputs, - type_input, - natoms, - activation_fn=tf.nn.tanh, - stddev=1.0, - bavg=0.0, - name='linear', - reuse=None, - seed=None, - trainable = True): - # natom x (nei x 4) - shape = inputs.get_shape().as_list() - outputs_size = [1] + self.filter_neuron - outputs_size_2 = self.n_axis_neuron - with tf.variable_scope(name, reuse=reuse): - start_index = 0 - xyz_scatter_total = [] - for type_i in range(self.ntypes): - # cut-out inputs - # with natom x (nei_type_i x 4) - inputs_i = tf.slice (inputs, - [ 0, start_index* 4], - [-1, self.sel_a[type_i]* 4] ) - start_index += self.sel_a[type_i] - shape_i = inputs_i.get_shape().as_list() - # with (natom x nei_type_i) x 4 - inputs_reshape = tf.reshape(inputs_i, [-1, 4]) - xyz_scatter = tf.reshape(tf.slice(inputs_reshape, [0,0],[-1,1]),[-1,1]) - if (type_input, type_i) in self.exclude_types: - w = tf.zeros((outputs_size[0], outputs_size[-1]), dtype=global_tf_float_precision) - xyz_scatter = tf.matmul(xyz_scatter, w) - xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1]//4, outputs_size[-1])) - if type_i == 0: - xyz_scatter_1 = tf.matmul(tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), xyz_scatter, transpose_a = True) - else: - xyz_scatter_1 += tf.matmul(tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), xyz_scatter, transpose_a = True) - else: - ti = [self.lower, self.upper, self.upper * self.table_info[0], self.table_info[1], self.table_info[2], 
self.table_info[3]] + if self.compress and (type_input, type_i) not in self.exclude_types: + info = [self.lower, self.upper, self.upper * self.table_info[0], self.table_info[1], self.table_info[2], self.table_info[3]] if self.type_one_side: - assert type_input == -1, "Error: when type_one_side was set True, the value of type_input must be -1." net = 'filter_-1_net_' + str(type_i) else: net = 'filter_' + str(type_input) + '_net_' + str(type_i) if type_i == 0: - xyz_scatter_1 = op_module.tabulate_fusion(self.table.data[net], ti, xyz_scatter, tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), last_layer_size = outputs_size[-1]) + xyz_scatter_1 = op_module.tabulate_fusion(self.table.data[net], info, xyz_scatter, tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), last_layer_size = outputs_size[-1]) else: - xyz_scatter_1 += op_module.tabulate_fusion(self.table.data[net], ti, xyz_scatter, tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), last_layer_size = outputs_size[-1]) - # not needed any more! + xyz_scatter_1 += op_module.tabulate_fusion(self.table.data[net], info, xyz_scatter, tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), last_layer_size = outputs_size[-1]) + else: + if (type_input, type_i) not in self.exclude_types: + xyz_scatter = embedding_net(xyz_scatter, + self.filter_neuron, + self.filter_precision, + activation_fn = activation_fn, + resnet_dt = self.filter_resnet_dt, + name_suffix = "_"+str(type_i), + stddev = stddev, + bavg = bavg, + seed = seed, + trainable = trainable) + else: + w = tf.zeros((outputs_size[0], outputs_size[-1]), dtype=global_tf_float_precision) + xyz_scatter = tf.matmul(xyz_scatter, w) + # natom x nei_type_i x out_size + xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1]//4, outputs_size[-1])) + # xyz_scatter_total.append(xyz_scatter) + if type_i == 0 : + xyz_scatter_1 = tf.matmul(tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), xyz_scatter, transpose_a = True) + else : + xyz_scatter_1 += tf.matmul(tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), 
xyz_scatter, transpose_a = True) # natom x nei x outputs_size # xyz_scatter = tf.concat(xyz_scatter_total, axis=1) # natom x nei x 4 @@ -722,11 +591,11 @@ def _compress_filter(self, # qmat = tf.slice(xyz_scatter_2, [0,1,0], [-1, 3, -1]) # natom x 3 x outputs_size_1 qmat = tf.slice(xyz_scatter_1, [0,1,0], [-1, 3, -1]) - # natom x outputs_size_2 x 3 + # natom x outputs_size_1 x 3 qmat = tf.transpose(qmat, perm = [0, 2, 1]) # natom x outputs_size x outputs_size_2 result = tf.matmul(xyz_scatter_1, xyz_scatter_2, transpose_a = True) # natom x (outputs_size x outputs_size_2) result = tf.reshape(result, [-1, outputs_size_2 * outputs_size[-1]]) - return result, qmat \ No newline at end of file + return result, qmat diff --git a/deepmd/utils/data_info.py b/deepmd/utils/data_info.py new file mode 100644 index 0000000000..0e760f9c1b --- /dev/null +++ b/deepmd/utils/data_info.py @@ -0,0 +1,132 @@ +import math +import numpy as np +from tqdm import tqdm +from deepmd.env import tf +from typing import Tuple, List +from deepmd.env import op_module +from deepmd.env import default_tf_session_config +from deepmd.RunOptions import global_np_float_precision + +class DataInfo(): + """ + Class for getting training data information. + It loads data from DeepmdData object, and measures the data info, including neareest nbor distance between atoms, max nbor size of atoms and the output data range of the environment matrix. 
+ """ + def __init__(self, + descrpt_type : str, + ntypes : int, + rcut, + rcut_smth, + sel, + davg, + dstd) -> None: + """ + Constructor + + Parameters + ---------- + descrpt_type + The descrpt type of the embedding net + ntypes + The num of atom types + rcut + The cut-off radius + rcut_smth + From where the environment matrix should be smoothed + sel : list[str] + sel[i] specifies the maxmum number of type i atoms in the cut-off radius + davg + Average of training data + dstd + Standard deviation of training data + """ + self.ntypes = ntypes + self.davg = davg + self.dstd = dstd + self.descrpt_type = descrpt_type + assert self.descrpt_type == 'se_a', 'Model compression error: descriptor type must be se_a!' + self.place_holders = {} + sub_graph = tf.Graph() + with sub_graph.as_default(): + for ii in ['coord', 'box', 'avg', 'std']: + self.place_holders[ii] = tf.placeholder(global_np_float_precision, [None, None], name='t_'+ii) + self.place_holders['type'] = tf.placeholder(tf.int32, [None, None], name='t_type') + self.place_holders['natoms_vec'] = tf.placeholder(tf.int32, [self.ntypes+2], name='t_natoms') + self.place_holders['default_mesh'] = tf.placeholder(tf.int32, [None], name='t_mesh') + if self.descrpt_type == 'se_a': + self.rcut_a = -1 + self.rcut_r = rcut + self.rcut_r_smth = rcut_smth + self.sel_a = sel + self.sel_r = [ 0 for ii in range(len(self.sel_a)) ] + descrpt, descrpt_deriv, rij, nlist, self.distance, self.max_nbor_size, self.table_range \ + = op_module.data_info_se_a(self.place_holders['coord'], + self.place_holders['type'], + self.place_holders['natoms_vec'], + self.place_holders['box'], + self.place_holders['default_mesh'], + self.place_holders['avg'], + self.place_holders['std'], + rcut_a = self.rcut_a, + rcut_r = self.rcut_r, + rcut_r_smth = self.rcut_r_smth, + sel_a = self.sel_a, + sel_r = self.sel_r) + self.sub_sess = tf.Session(graph = sub_graph, config=default_tf_session_config) + + def data_info(self, + data) -> Tuple[float, int, list]: + 
""" + get the data info of the training data, including neareest nbor distance between atoms, max nbor size of atoms and the output data range of the environment matrix + + Parameters + ---------- + data + Class for manipulating many data systems. It is implemented with the help of DeepmdData. + """ + self.lower = 0.0 + self.upper = 0.0 + self.dist = 100.0 + self.max_nbor = 0 + + davg = self.davg + dstd = self.dstd + if davg is None: + davg = np.zeros([self.ntypes, self.ndescrpt]) + if dstd is None: + dstd = np.ones ([self.ntypes, self.ndescrpt]) + + for ii in tqdm(range(len(data.system_dirs)), desc = '# DEEPMD: getting data info'): + for jj in data.data_systems[ii].dirs: + data_set = data.data_systems[ii]._load_set(jj) + for kk in range(np.array(data_set['type']).shape[0]): + dt, mn, tr \ + = self.sub_sess.run([self.distance, self.max_nbor_size, self.table_range], + feed_dict = { + self.place_holders['coord']: np.array(data_set['coord'])[kk].reshape([-1, data.natoms[ii] * 3]), + self.place_holders['type']: np.array(data_set['type'])[kk].reshape([-1, data.natoms[ii]]), + self.place_holders['natoms_vec']: np.array(data.natoms_vec[ii]), + self.place_holders['box']: np.array(data_set['box'])[kk].reshape([-1, 9]), + self.place_holders['default_mesh']: np.array(data.default_mesh[ii]), + self.place_holders['avg']: davg, + self.place_holders['std']: dstd, + }) + dr = np.array([np.min(tr), np.max(tr)]).astype(global_np_float_precision) + dt = np.min(dt) + mn = np.max(mn) + if (dr[0] < self.lower): + self.lower = dr[0] + if (dr[1] > self.upper): + self.upper = dr[1] + if (dt < self.dist): + self.dist = dt + if (mn > self.max_nbor): + self.max_nbor = mn + + print('# DEEPMD: training data with lower boundary: ' + str(self.lower)) + print('# DEEPMD: training data with upper boundary: ' + str(self.upper)) + print('# DEEPMD: training data with min distance: ' + str(self.dist)) + print('# DEEPMD: training data with max nborsize: ' + str(self.max_nbor)) + + return self.distance, 
self.max_nbor_size, [self.lower, self.upper] + \ No newline at end of file diff --git a/deepmd/utils/tabulate.py b/deepmd/utils/tabulate.py index 2a453ce584..35a5603ce9 100644 --- a/deepmd/utils/tabulate.py +++ b/deepmd/utils/tabulate.py @@ -1,17 +1,34 @@ import re import math import numpy as np +from tqdm import tqdm from deepmd.env import tf +from deepmd.env import op_module from tensorflow.python.platform import gfile from tensorflow.python.framework import tensor_util -from tqdm import tqdm -from deepmd.env import op_module + class DeepTabulate(): + """ + Class for tabulation. + It reads the trained weights and bias from the frozen model, and builds the table according to the weights and bias. + """ def __init__(self, model_file, data_type, - type_one_side = False): + type_one_side = False) -> None: + """ + Constructor + + Parameters + ---------- + model_file + The frozen model + data_type + The precision of the table. Supported options are {1} + type_one_side + Try to build N_types tables. Otherwise, building N_types^2 tables + """ self.model_file = model_file self.data_type = data_type @@ -28,8 +45,10 @@ def __init__(self, self.filter_variable_nodes = self.load_matrix_node() self.layer_size = int(len(self.filter_variable_nodes) / (self.ntypes * self.ntypes * 2)) + self.table_size = self.ntypes * self.ntypes if type_one_side : self.layer_size = int(len(self.filter_variable_nodes) / (self.ntypes * 2)) + self.table_size = self.ntypes # self.value_type = self.filter_variable_nodes["filter_type_0/matrix_1_0"].dtype #"filter_type_0/matrix_1_0" must exit~ # get trained variables self.bias = self.get_bias() @@ -42,7 +61,6 @@ def __init__(self, self.data = {} # TODO: Need a check function to determine if the current model is properly - # Need be more robust! 
def load_graph(self): graph_def = tf.GraphDef() @@ -107,6 +125,22 @@ def get_matrix(self): return matrix def build(self, lower, upper, _max, stride0, stride1): + """ + Build the tables for model compression + + Parameters + ---------- + lower + The lower boundary of the first table + upper + The upper boundary of the first table as well as the lower boundary of the second table + _max + The upper boundary of the second table + stride0 + The stride of the first table + stride1 + The stride of the second table + """ # tabulate range [lower, upper] with stride0 'stride0' lower = math.floor(lower) upper = math.ceil(upper) @@ -114,42 +148,27 @@ def build(self, lower, upper, _max, stride0, stride1): xx = np.append(xx, np.arange(upper, _max, stride1, dtype = self.data_type)) xx = np.append(xx, np.array([_max], dtype = self.data_type)) self.nspline = int((upper - lower) / stride0 + (_max - upper) / stride1) - if self.type_one_side: - for ii in range(self.ntypes): - vv, dd, d2 = self.make_data(xx, ii) + + for ii in range(self.table_size): + vv, dd, d2 = self.make_data(xx, ii) + if self.type_one_side: net = "filter_-1_net_" + str(int(ii)) - self.data[net] = np.zeros([self.nspline, 6 * self.last_layer_size], dtype = self.data_type) - for jj in tqdm(range(self.nspline), desc = '# DEEPMD: ' + net + ', tabulating'): - for kk in range(self.last_layer_size): - if jj < int((upper - lower) / stride0): - tt = stride0 - else: - tt = stride1 - hh = vv[jj + 1][kk] - vv[jj][kk] - self.data[net][jj][kk * 6 + 0] = vv[jj][kk] - self.data[net][jj][kk * 6 + 1] = dd[jj][kk] - self.data[net][jj][kk * 6 + 2] = 0.5 * d2[jj][kk] - self.data[net][jj][kk * 6 + 3] = (1 / (2 * tt * tt * tt)) * (20 * hh - (8 * dd[jj + 1][kk] + 12 * dd[jj][kk]) * tt - (3 * d2[jj][kk] - d2[jj + 1][kk]) * tt * tt) - self.data[net][jj][kk * 6 + 4] = (1 / (2 * tt * tt * tt * tt)) * (-30 * hh + (14 * dd[jj + 1][kk] + 16 * dd[jj][kk]) * tt + (3 * d2[jj][kk] - 2 * d2[jj + 1][kk]) * tt * tt) - self.data[net][jj][kk * 6 + 5] = 
(1 / (2 * tt * tt * tt * tt * tt)) * (12 * hh - 6 * (dd[jj + 1][kk] + dd[jj][kk]) * tt + (d2[jj + 1][kk] - d2[jj][kk]) * tt * tt) - else: - for ii in range(self.ntypes * self.ntypes): - vv, dd, d2 = self.make_data(xx, ii) + else: net = "filter_" + str(int(ii / self.ntypes)) + "_net_" + str(int(ii % self.ntypes)) - self.data[net] = np.zeros([self.nspline, 6 * self.last_layer_size], dtype = self.data_type) - for jj in tqdm(range(self.nspline), desc = '# DEEPMD: ' + net + ', tabulating'): - for kk in range(self.last_layer_size): - if jj < int((upper - lower) / stride0): - tt = stride0 - else: - tt = stride1 - hh = vv[jj + 1][kk] - vv[jj][kk] - self.data[net][jj][kk * 6 + 0] = vv[jj][kk] - self.data[net][jj][kk * 6 + 1] = dd[jj][kk] - self.data[net][jj][kk * 6 + 2] = 0.5 * d2[jj][kk] - self.data[net][jj][kk * 6 + 3] = (1 / (2 * tt * tt * tt)) * (20 * hh - (8 * dd[jj + 1][kk] + 12 * dd[jj][kk]) * tt - (3 * d2[jj][kk] - d2[jj + 1][kk]) * tt * tt) - self.data[net][jj][kk * 6 + 4] = (1 / (2 * tt * tt * tt * tt)) * (-30 * hh + (14 * dd[jj + 1][kk] + 16 * dd[jj][kk]) * tt + (3 * d2[jj][kk] - 2 * d2[jj + 1][kk]) * tt * tt) - self.data[net][jj][kk * 6 + 5] = (1 / (2 * tt * tt * tt * tt * tt)) * (12 * hh - 6 * (dd[jj + 1][kk] + dd[jj][kk]) * tt + (d2[jj + 1][kk] - d2[jj][kk]) * tt * tt) + self.data[net] = np.zeros([self.nspline, 6 * self.last_layer_size], dtype = self.data_type) + for jj in tqdm(range(self.nspline), desc = '# DEEPMD: ' + net + ', tabulating'): + for kk in range(self.last_layer_size): + if jj < int((upper - lower) / stride0): + tt = stride0 + else: + tt = stride1 + hh = vv[jj + 1][kk] - vv[jj][kk] + self.data[net][jj][kk * 6 + 0] = vv[jj][kk] + self.data[net][jj][kk * 6 + 1] = dd[jj][kk] + self.data[net][jj][kk * 6 + 2] = 0.5 * d2[jj][kk] + self.data[net][jj][kk * 6 + 3] = (1 / (2 * tt * tt * tt)) * (20 * hh - (8 * dd[jj + 1][kk] + 12 * dd[jj][kk]) * tt - (3 * d2[jj][kk] - d2[jj + 1][kk]) * tt * tt) + self.data[net][jj][kk * 6 + 4] = (1 / (2 * tt * tt * tt * 
tt)) * (-30 * hh + (14 * dd[jj + 1][kk] + 16 * dd[jj][kk]) * tt + (3 * d2[jj][kk] - 2 * d2[jj + 1][kk]) * tt * tt) + self.data[net][jj][kk * 6 + 5] = (1 / (2 * tt * tt * tt * tt * tt)) * (12 * hh - 6 * (dd[jj + 1][kk] + dd[jj][kk]) * tt + (d2[jj + 1][kk] - d2[jj][kk]) * tt * tt) # one-by-one executions def make_data(self, xx, idx): diff --git a/source/lib/include/CustomeOperation.h b/source/lib/include/CustomeOperation.h index 0c0d891fd4..98b64e44af 100644 --- a/source/lib/include/CustomeOperation.h +++ b/source/lib/include/CustomeOperation.h @@ -643,7 +643,7 @@ void TabulateFusionCPULauncher(const FPTYPE * table, const FPTYPE * table_info, FPTYPE a3 = table[table_idx * last_layer_size * 6 + 6 * kk + 3]; FPTYPE a4 = table[table_idx * last_layer_size * 6 + 6 * kk + 4]; FPTYPE a5 = table[table_idx * last_layer_size * 6 + 6 * kk + 5]; - FPTYPE var = a0 + a1 * xx + a2 * xx * xx + a3 * xx * xx * xx + a4 * xx * xx * xx * xx + a5 * xx * xx * xx * xx * xx; + FPTYPE var = a0 + (a1 + (a2 + (a3 + (a4 + a5 * xx) * xx) * xx) * xx) * xx; if (unloop) { out[ii * last_layer_size * 4 + 0 * last_layer_size + kk] += (nnei - jj) * var * ll[0]; out[ii * last_layer_size * 4 + 1 * last_layer_size + kk] += (nnei - jj) * var * ll[1]; @@ -705,16 +705,17 @@ void TabulateFusionGradCPULauncher(const FPTYPE * table, const FPTYPE * table_in FPTYPE a3 = table[table_idx * last_layer_size * 6 + 6 * kk + 3]; FPTYPE a4 = table[table_idx * last_layer_size * 6 + 6 * kk + 4]; FPTYPE a5 = table[table_idx * last_layer_size * 6 + 6 * kk + 5]; - FPTYPE res = a0 + a1 * xx + a2 * xx * xx + a3 * xx * xx * xx + a4 * xx * xx * xx * xx + a5 * xx * xx * xx * xx * xx; + FPTYPE res = a0 + (a1 + (a2 + (a3 + (a4 + a5 * xx) * xx) * xx) * xx) * xx; + if (unloop) { - grad += (a1 + 2 * a2 * xx + 3 * a3 * xx * xx + 4 * a4 * xx * xx * xx + 5 * a5 * xx * xx * xx * xx) * dot(ll, rr) * (nnei - jj); + grad += (a1 + (2 * a2 + (3 * a3 + (4 * a4 + 5 * a5 * xx) * xx) * xx) * xx) * dot(ll, rr) * (nnei - jj); dy_df[ii * nnei * 4 + jj 
* 4 + 0] += res * rr[0] * (nnei - jj); dy_df[ii * nnei * 4 + jj * 4 + 1] += res * rr[1] * (nnei - jj); dy_df[ii * nnei * 4 + jj * 4 + 2] += res * rr[2] * (nnei - jj); dy_df[ii * nnei * 4 + jj * 4 + 3] += res * rr[3] * (nnei - jj); } else { - grad += (a1 + 2 * a2 * xx + 3 * a3 * xx * xx + 4 * a4 * xx * xx * xx + 5 * a5 * xx * xx * xx * xx) * dot(ll, rr); + grad += (a1 + (2 * a2 + (3 * a3 + (4 * a4 + 5 * a5 * xx) * xx) * xx) * xx) * dot(ll, rr); dy_df[ii * nnei * 4 + jj * 4 + 0] += res * rr[0]; dy_df[ii * nnei * 4 + jj * 4 + 1] += res * rr[1]; dy_df[ii * nnei * 4 + jj * 4 + 2] += res * rr[2]; diff --git a/source/op/cuda/CMakeLists.txt b/source/op/cuda/CMakeLists.txt index 96a030909c..4f47aa1435 100644 --- a/source/op/cuda/CMakeLists.txt +++ b/source/op/cuda/CMakeLists.txt @@ -28,8 +28,8 @@ if (${CUDA_VERSION_MAJOR} GREATER "10") -gencode arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX 1050, GTX 1030, Titan Xp, Tesla P40, Tesla P4, Discrete GPU on the NVIDIA Drive PX2 -gencode arch=compute_70,code=sm_70; # Volta - GV100/Tesla V100, GTX 1180 (GV104) -gencode arch=compute_75,code=sm_75; # Turing - RTX 2080, Titan RTX, Quadro R8000 - -gencode arch=compute_80,code=sm_80; # Anpere - A100 - -gencode arch=compute_86,code=sm_86; # Anpere - RTX 3090 + -gencode arch=compute_80,code=sm_80; # Ampere - A100 + -gencode arch=compute_86,code=sm_86; # Ampere - RTX 3090 -O3; -Xcompiler -fPIC; ) elseif (${CUDA_VERSION_MAJOR} STREQUAL "10") diff --git a/source/op/cuda/tabulate.cu b/source/op/cuda/tabulate.cu index a7231b413f..83befeb571 100644 --- a/source/op/cuda/tabulate.cu +++ b/source/op/cuda/tabulate.cu @@ -196,7 +196,8 @@ __global__ void tabulate_fusion_special(const FPTYPE * table, const FPTYPE * in, var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3]; var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4]; var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5]; - FPTYPE res = var[0] + var[1] * xx + var[2] * xx 
* xx + var[3] * xx * xx * xx + var[4] * xx * xx * xx * xx + var[5] * xx * xx * xx * xx * xx; + FPTYPE res = var[0] + (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * xx; + for (int kk = 0; kk < MTILE; kk++) { iteratorC[kk * last_layer_size + thread_idx] += (nnei - breakpoint) * ff[block_idx * nnei * MTILE + ii * MTILE + kk] * res; } @@ -240,14 +241,16 @@ __global__ void tabulate_fusion_grad_warp_reduce_special(const FPTYPE * table, c FPTYPE sum[KTILE] = {0.f}; FPTYPE Csub = 0.f; for (int jj = lane_idx; jj < last_layer_size; jj += WARP_SIZE) { + FPTYPE var[6]; // load iteratorB through table - FPTYPE a0 = table[table_idx * last_layer_size * 6 + 6 * jj + 0]; - FPTYPE a1 = table[table_idx * last_layer_size * 6 + 6 * jj + 1]; - FPTYPE a2 = table[table_idx * last_layer_size * 6 + 6 * jj + 2]; - FPTYPE a3 = table[table_idx * last_layer_size * 6 + 6 * jj + 3]; - FPTYPE a4 = table[table_idx * last_layer_size * 6 + 6 * jj + 4]; - FPTYPE a5 = table[table_idx * last_layer_size * 6 + 6 * jj + 5]; - FPTYPE res = a0 + a1 * xx + a2 * xx * xx + a3 * xx * xx * xx + a4 * xx * xx * xx * xx + a5 * xx * xx * xx * xx * xx; + var[0] = table[table_idx * last_layer_size * 6 + 6 * jj + 0]; + var[1] = table[table_idx * last_layer_size * 6 + 6 * jj + 1]; + var[2] = table[table_idx * last_layer_size * 6 + 6 * jj + 2]; + var[3] = table[table_idx * last_layer_size * 6 + 6 * jj + 3]; + var[4] = table[table_idx * last_layer_size * 6 + 6 * jj + 4]; + var[5] = table[table_idx * last_layer_size * 6 + 6 * jj + 5]; + FPTYPE res = var[0] + (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * xx; + for (int kk = 0; kk < KTILE; kk++) { sum[kk] += (nnei - breakpoint) * iteratorA[kk * last_layer_size + jj] * res; } @@ -255,7 +258,7 @@ __global__ void tabulate_fusion_grad_warp_reduce_special(const FPTYPE * table, c res += ff[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 1] * iteratorA[1 * last_layer_size + jj]; res += ff[block_idx * nnei * MTILE + (ii + 
warp_idx) * 4 + 2] * iteratorA[2 * last_layer_size + jj]; res += ff[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 3] * iteratorA[3 * last_layer_size + jj]; - Csub += (nnei - breakpoint) * (a1 + 2 * a2 * xx + 3 * a3 * xx * xx + 4 * a4 * xx * xx * xx + 5 * a5 * xx * xx * xx * xx) * res; + Csub += (nnei - breakpoint) * (var[1] + (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * xx) * res; } __syncwarp(); for (int kk = 0; kk < KTILE; kk++) { diff --git a/source/op/data_info.cc b/source/op/data_info.cc index 08e0f77691..12df043918 100644 --- a/source/op/data_info.cc +++ b/source/op/data_info.cc @@ -15,7 +15,7 @@ using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; -REGISTER_OP("DataInfo") +REGISTER_OP("DataInfoSeA") .Attr("T: {float, double}") .Input("coord: T") //atomic coordinates .Input("type: int32") //atomic type @@ -38,9 +38,9 @@ REGISTER_OP("DataInfo") .Output("table_range: T"); template -class DataInfoOp : public OpKernel { +class DataInfoSeAOp : public OpKernel { public: - explicit DataInfoOp(OpKernelConstruction* context) : OpKernel(context) { + explicit DataInfoSeAOp(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("rcut_a", &rcut_a)); OP_REQUIRES_OK(context, context->GetAttr("rcut_r", &rcut_r)); OP_REQUIRES_OK(context, context->GetAttr("rcut_r_smth", &rcut_r_smth)); @@ -402,7 +402,7 @@ class DataInfoOp : public OpKernel { #define REGISTER_CPU(T) \ REGISTER_KERNEL_BUILDER( \ - Name("DataInfo").Device(DEVICE_CPU).TypeConstraint("T"), \ - DataInfoOp); + Name("DataInfoSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + DataInfoSeAOp); REGISTER_CPU(float); REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/tabulate.cc b/source/op/tabulate.cc index 14e9b51474..ba7cc550fe 100644 --- a/source/op/tabulate.cc +++ b/source/op/tabulate.cc @@ -27,12 +27,14 @@ REGISTER_OP("TabulateFusionGrad") .Output("dy_dx: T") .Output("dy_df: 
T"); +#if GOOGLE_CUDA void TabulateFusionLauncher(const float * table, const float * table_info, const float * in, const float * ff, const int nloc, const int nnei, const int last_layer_size, float * out); void TabulateFusionLauncher(const double * table, const double * table_info, const double * in, const double * ff, const int nloc, const int nnei, const int last_layer_size, double * out); void TabulateFusionGradLauncher(const float * table, const float * table_info, const float * in, const float * ff, const float * dy, const int nloc, const int nnei, const int last_layer_size, float * dy_dx, float * dy_df); void TabulateFusionGradLauncher(const double * table, const double * table_info, const double * in, const double * ff, const double * dy, const int nloc, const int nnei, const int last_layer_size, double * dy_dx, double * dy_df); void TabulateCheckerLauncher(const float * table_info, const float * in, int * out, const int nloc, const int nnei); void TabulateCheckerLauncher(const double * table_info, const double * in, int * out, const int nloc, const int nnei); +#endif template inline FPTYPE dot(FPTYPE a[4], FPTYPE b[4]) { @@ -107,7 +109,8 @@ struct TabulateFusionFunctor { FPTYPE a3 = table[table_idx * last_layer_size * 6 + 6 * kk + 3]; FPTYPE a4 = table[table_idx * last_layer_size * 6 + 6 * kk + 4]; FPTYPE a5 = table[table_idx * last_layer_size * 6 + 6 * kk + 5]; - FPTYPE var = a0 + a1 * xx + a2 * xx * xx + a3 * xx * xx * xx + a4 * xx * xx * xx * xx + a5 * xx * xx * xx * xx * xx; + // FPTYPE var = a0 + a1 * xx + a2 * xx * xx + a3 * xx * xx * xx + a4 * xx * xx * xx * xx + a5 * xx * xx * xx * xx * xx; + FPTYPE var = a0 + (a1 + (a2 + (a3 + (a4 + a5 * xx) * xx) * xx) * xx) * xx; if (unloop) { out[ii * last_layer_size * 4 + 0 * last_layer_size + kk] += (nnei - jj) * var * ll[0]; out[ii * last_layer_size * 4 + 1 * last_layer_size + kk] += (nnei - jj) * var * ll[1]; @@ -177,17 +180,20 @@ struct TabulateFusionGradFunctor { FPTYPE a3 = table[table_idx * 
last_layer_size * 6 + 6 * kk + 3]; FPTYPE a4 = table[table_idx * last_layer_size * 6 + 6 * kk + 4]; FPTYPE a5 = table[table_idx * last_layer_size * 6 + 6 * kk + 5]; - FPTYPE res = a0 + a1 * xx + a2 * xx * xx + a3 * xx * xx * xx + a4 * xx * xx * xx * xx + a5 * xx * xx * xx * xx * xx; + // FPTYPE res = a0 + a1 * xx + a2 * xx * xx + a3 * xx * xx * xx + a4 * xx * xx * xx * xx + a5 * xx * xx * xx * xx * xx; + FPTYPE res = a0 + (a1 + (a2 + (a3 + (a4 + a5 * xx) * xx) * xx) * xx) * xx; if (unloop) { - grad += (a1 + 2 * a2 * xx + 3 * a3 * xx * xx + 4 * a4 * xx * xx * xx + 5 * a5 * xx * xx * xx * xx) * dot(ll, rr) * (nnei - jj); + // grad += (a1 + 2 * a2 * xx + 3 * a3 * xx * xx + 4 * a4 * xx * xx * xx + 5 * a5 * xx * xx * xx * xx) * dot(ll, rr) * (nnei - jj); + grad += (a1 + (2 * a2 + (3 * a3 + (4 * a4 + 5 * a5 * xx) * xx) * xx) * xx) * dot(ll, rr) * (nnei - jj); dy_df[ii * nnei * 4 + jj * 4 + 0] += res * rr[0] * (nnei - jj); dy_df[ii * nnei * 4 + jj * 4 + 1] += res * rr[1] * (nnei - jj); dy_df[ii * nnei * 4 + jj * 4 + 2] += res * rr[2] * (nnei - jj); dy_df[ii * nnei * 4 + jj * 4 + 3] += res * rr[3] * (nnei - jj); } else { - grad += (a1 + 2 * a2 * xx + 3 * a3 * xx * xx + 4 * a4 * xx * xx * xx + 5 * a5 * xx * xx * xx * xx) * dot(ll, rr); + // grad += (a1 + 2 * a2 * xx + 3 * a3 * xx * xx + 4 * a4 * xx * xx * xx + 5 * a5 * xx * xx * xx * xx) * dot(ll, rr); + grad += (a1 + (2 * a2 + (3 * a3 + (4 * a4 + 5 * a5 * xx) * xx) * xx) * xx) * dot(ll, rr); dy_df[ii * nnei * 4 + jj * 4 + 0] += res * rr[0]; dy_df[ii * nnei * 4 + jj * 4 + 1] += res * rr[1]; dy_df[ii * nnei * 4 + jj * 4 + 2] += res * rr[2]; diff --git a/source/train/Model.py b/source/train/Model.py index 5a6ae6f153..9c9cd7ddff 100644 --- a/source/train/Model.py +++ b/source/train/Model.py @@ -114,9 +114,6 @@ def data_stat(self, data): m_all_stat = merge_sys_stat(all_stat) self._compute_input_stat(m_all_stat, protection = self.data_stat_protect) self._compute_output_stat(all_stat) - - if hasattr(self.descrpt, 'compress') and 
self.descrpt.compress: - self.descrpt.data_info(data) # self.bias_atom_e = data.compute_energy_shift(self.rcond) def _compute_input_stat (self, all_stat, protection = 1e-2) : diff --git a/source/train/Trainer.py b/source/train/Trainer.py index c16a0e7157..ce434ae467 100644 --- a/source/train/Trainer.py +++ b/source/train/Trainer.py @@ -19,6 +19,7 @@ from deepmd.Model import Model, WFCModel, DipoleModel, PolarModel, GlobalPolarModel from deepmd.loss import EnerStdLoss, EnerDipoleLoss, TensorLoss from deepmd.utils.learning_rate import LearningRateExp +from deepmd.utils.data_info import DataInfo from tensorflow.python.client import timeline from deepmd.env import op_module @@ -91,11 +92,11 @@ def _init_param(self, jdata): # descriptor try: - descrpt_type = descrpt_param['type'] + self.descrpt_type = descrpt_param['type'] except KeyError: raise KeyError('the type of descriptor should be set by `type`') - if descrpt_type != 'hybrid': + if self.descrpt_type != 'hybrid': self.descrpt = _generate_descrpt_from_param_dict(descrpt_param) else : descrpt_list = [] @@ -115,19 +116,19 @@ def _init_param(self, jdata): # elif fitting_type == 'wfc': # self.fitting = WFCFitting(fitting_param, self.descrpt) elif fitting_type == 'dipole': - if descrpt_type == 'se_a': + if self.descrpt_type == 'se_a': self.fitting = DipoleFittingSeA(**fitting_param) else : raise RuntimeError('fitting dipole only supports descrptors: se_a') elif fitting_type == 'polar': - # if descrpt_type == 'loc_frame': + # if self.descrpt_type == 'loc_frame': # self.fitting = PolarFittingLocFrame(fitting_param, self.descrpt) - if descrpt_type == 'se_a': + if self.descrpt_type == 'se_a': self.fitting = PolarFittingSeA(**fitting_param) else : raise RuntimeError('fitting polar only supports descrptors: loc_frame and se_a') elif fitting_type == 'global_polar': - if descrpt_type == 'se_a': + if self.descrpt_type == 'se_a': self.fitting = GlobalPolarFittingSeA(**fitting_param) else : raise RuntimeError('fitting global_polar 
only supports descrptors: loc_frame and se_a') @@ -272,6 +273,17 @@ def build (self, self.model.data_stat(data) + if hasattr(self.descrpt, 'compress') and self.descrpt.compress: + assert hasattr(self.descrpt, 'distance'), "Compression error: descrpt must have attr distance" + assert hasattr(self.descrpt, 'max_nbor_size'), "Compression error: descrpt must have attr max_nbor_size" + assert hasattr(self.descrpt, 'table_range'), "Compression error: descrpt must have attr table_range" + if self.descrpt_type == 'se_a': + info = DataInfo(self.descrpt_type, self.descrpt.ntypes, self.descrpt.rcut_r, self.descrpt.rcut_r_smth, self.descrpt.sel_a, self.descrpt.davg, self.descrpt.dstd) + else: + raise RuntimeError ("Model compression error: descriptor type must be se_a!") + self.descrpt.distance, self.descrpt.max_nbor_size, self.descrpt.table_range\ + = info.data_info(data) + worker_device = "/job:%s/task:%d/%s" % (self.run_opt.my_job_name, self.run_opt.my_task_index, self.run_opt.my_device) diff --git a/source/train/compress.py b/source/train/compress.py index fb4a505261..64035d53ed 100644 --- a/source/train/compress.py +++ b/source/train/compress.py @@ -33,7 +33,7 @@ def compress(args): args_train.output = 'compress.json' args_train.init_model = None args_train.restart = None - jdata['training']['stop_batch'] = jdata['training']['save_freq'] # be careful here, if we want refine the model + jdata['training']['stop_batch'] = jdata['training']['save_freq'] # be careful here, if one want to refine the model with open(args_train.INPUT, 'w') as fp: json.dump(jdata, fp, indent=4) train(args_train) diff --git a/source/train/main.py b/source/train/main.py index 248c33f961..bd5fbc1d8a 100644 --- a/source/train/main.py +++ b/source/train/main.py @@ -69,7 +69,7 @@ def main () : help='the input parameter file in json or yaml format') parser_compress.add_argument('-i', "--input", default = "frozen_model.pb", type=str, help = "the original model") - 
parser_compress.add_argument("-o","--output", default = "frozen_model_tab.pb", type=str, + parser_compress.add_argument("-o","--output", default = "frozen_model_compress.pb", type=str, help='the compressed model') parser_compress.add_argument('-t', '--table-info', nargs='+', default = [5, 0.01, 0.1, 1], type=float) parser_compress.add_argument("-d", "--folder", type=str, default = ".", From 49afc73506d0ac26154eddc9d0ada60e6d1a535c Mon Sep 17 00:00:00 2001 From: denghuilu Date: Fri, 5 Feb 2021 11:12:03 +0800 Subject: [PATCH 09/20] change the class name from DataInfo to EnvMatStat --- .../utils/{data_info.py => env_mat_stat.py} | 21 +++++--- deepmd/utils/tabulate.py | 49 ++++++++++--------- source/op/CMakeLists.txt | 2 +- source/op/{data_info.cc => env_mat_stat.cc} | 10 ++-- source/train/Trainer.py | 6 +-- 5 files changed, 51 insertions(+), 37 deletions(-) rename deepmd/utils/{data_info.py => env_mat_stat.py} (91%) rename source/op/{data_info.cc => env_mat_stat.cc} (98%) diff --git a/deepmd/utils/data_info.py b/deepmd/utils/env_mat_stat.py similarity index 91% rename from deepmd/utils/data_info.py rename to deepmd/utils/env_mat_stat.py index 0e760f9c1b..a77ce0ab62 100644 --- a/deepmd/utils/data_info.py +++ b/deepmd/utils/env_mat_stat.py @@ -7,7 +7,7 @@ from deepmd.env import default_tf_session_config from deepmd.RunOptions import global_np_float_precision -class DataInfo(): +class EnvMatStat(): """ Class for getting training data information. It loads data from DeepmdData object, and measures the data info, including neareest nbor distance between atoms, max nbor size of atoms and the output data range of the environment matrix. 
@@ -60,7 +60,7 @@ def __init__(self, self.sel_a = sel self.sel_r = [ 0 for ii in range(len(self.sel_a)) ] descrpt, descrpt_deriv, rij, nlist, self.distance, self.max_nbor_size, self.table_range \ - = op_module.data_info_se_a(self.place_holders['coord'], + = op_module.env_mat_stat_se_a(self.place_holders['coord'], self.place_holders['type'], self.place_holders['natoms_vec'], self.place_holders['box'], @@ -74,8 +74,8 @@ def __init__(self, sel_r = self.sel_r) self.sub_sess = tf.Session(graph = sub_graph, config=default_tf_session_config) - def data_info(self, - data) -> Tuple[float, int, list]: + def env_mat_stat(self, + data) -> Tuple[float, int, List[float]]: """ get the data info of the training data, including neareest nbor distance between atoms, max nbor size of atoms and the output data range of the environment matrix @@ -83,6 +83,15 @@ def data_info(self, ---------- data Class for manipulating many data systems. It is implemented with the help of DeepmdData. + + Returns + ------- + distance + The neareest nbor distance between atoms + max_nbor_size + The max nbor size of atoms + table_range + The output data range of the environment matrix """ self.lower = 0.0 self.upper = 0.0 @@ -127,6 +136,6 @@ def data_info(self, print('# DEEPMD: training data with upper boundary: ' + str(self.upper)) print('# DEEPMD: training data with min distance: ' + str(self.dist)) print('# DEEPMD: training data with max nborsize: ' + str(self.max_nbor)) - - return self.distance, self.max_nbor_size, [self.lower, self.upper] + table_range = [self.lower, self.upper] + return self.distance, self.max_nbor_size, table_range \ No newline at end of file diff --git a/deepmd/utils/tabulate.py b/deepmd/utils/tabulate.py index 35a5603ce9..57ffc0cb5e 100644 --- a/deepmd/utils/tabulate.py +++ b/deepmd/utils/tabulate.py @@ -34,16 +34,16 @@ def __init__(self, self.data_type = data_type self.type_one_side = type_one_side - self.graph, self.graph_def = self.load_graph() + self.graph, self.graph_def = 
self._load_graph() self.sess = tf.Session(graph = self.graph) - self.sub_graph, self.sub_graph_def = self.load_sub_graph() + self.sub_graph, self.sub_graph_def = self._load_sub_graph() self.sub_sess = tf.Session(graph = self.sub_graph) self.sel_a = self.graph.get_operation_by_name('DescrptSeA').get_attr('sel_a') - self.ntypes = self.get_tensor_value(self.graph.get_tensor_by_name ('descrpt_attr/ntypes:0')) + self.ntypes = self._get_tensor_value(self.graph.get_tensor_by_name ('descrpt_attr/ntypes:0')) - self.filter_variable_nodes = self.load_matrix_node() + self.filter_variable_nodes = self._load_matrix_node() self.layer_size = int(len(self.filter_variable_nodes) / (self.ntypes * self.ntypes * 2)) self.table_size = self.ntypes * self.ntypes if type_one_side : @@ -51,8 +51,8 @@ def __init__(self, self.table_size = self.ntypes # self.value_type = self.filter_variable_nodes["filter_type_0/matrix_1_0"].dtype #"filter_type_0/matrix_1_0" must exit~ # get trained variables - self.bias = self.get_bias() - self.matrix = self.get_matrix() + self.bias = self._get_bias() + self.matrix = self._get_matrix() # self.matrix_layer_3 must exist # self.data_type = type(self.matrix["layer_1"][0][0][0]) assert self.matrix["layer_1"][0].size > 0, "no matrix exist in matrix array!" 
@@ -62,7 +62,7 @@ def __init__(self, # TODO: Need a check function to determine if the current model is properly - def load_graph(self): + def _load_graph(self): graph_def = tf.GraphDef() with open(self.model_file, "rb") as f: graph_def.ParseFromString(f.read()) @@ -70,19 +70,19 @@ def load_graph(self): tf.import_graph_def(graph_def, name = "") return graph, graph_def - def load_sub_graph(self): + def _load_sub_graph(self): sub_graph_def = tf.GraphDef() with tf.Graph().as_default() as sub_graph: tf.import_graph_def(sub_graph_def, name = "") return sub_graph, sub_graph_def - def get_tensor_value(self, tensor) : + def _get_tensor_value(self, tensor) : with self.sess.as_default(): self.sess.run(tensor) value = tensor.eval() return value - def load_matrix_node(self): + def _load_matrix_node(self): matrix_node = {} matrix_node_pattern = "filter_type_\d+/matrix_\d+_\d+|filter_type_\d+/bias_\d+_\d+|filter_type_\d+/idt_\d+_\d+|filter_type_all/matrix_\d+_\d+|filter_type_all/bias_\d+_\d+|filter_type_all/idt_\d+_\d" for node in self.graph_def.node: @@ -92,7 +92,7 @@ def load_matrix_node(self): assert key.find('bias') > 0 or key.find('matrix') > 0, "currently, only support weight matrix and bias matrix at the tabulation op!" 
return matrix_node - def get_bias(self): + def _get_bias(self): bias = {} for layer in range(1, self.layer_size + 1): bias["layer_" + str(layer)] = [] @@ -108,7 +108,7 @@ def get_bias(self): bias["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape).astype(self.data_type)) return bias - def get_matrix(self): + def _get_matrix(self): matrix = {} for layer in range(1, self.layer_size + 1): matrix["layer_" + str(layer)] = [] @@ -124,7 +124,12 @@ def get_matrix(self): matrix["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape).astype(self.data_type)) return matrix - def build(self, lower, upper, _max, stride0, stride1): + def build(self, + lower, + upper, + _max, + stride0, + stride1) -> None: """ Build the tables for model compression @@ -148,9 +153,9 @@ def build(self, lower, upper, _max, stride0, stride1): xx = np.append(xx, np.arange(upper, _max, stride1, dtype = self.data_type)) xx = np.append(xx, np.array([_max], dtype = self.data_type)) self.nspline = int((upper - lower) / stride0 + (_max - upper) / stride1) - + for ii in range(self.table_size): - vv, dd, d2 = self.make_data(xx, ii) + vv, dd, d2 = self._make_data(xx, ii) if self.type_one_side: net = "filter_-1_net_" + str(int(ii)) else: @@ -171,34 +176,34 @@ def build(self, lower, upper, _max, stride0, stride1): self.data[net][jj][kk * 6 + 5] = (1 / (2 * tt * tt * tt * tt * tt)) * (12 * hh - 6 * (dd[jj + 1][kk] + dd[jj][kk]) * tt + (d2[jj + 1][kk] - d2[jj][kk]) * tt * tt) # one-by-one executions - def make_data(self, xx, idx): + def _make_data(self, xx, idx): with self.sub_graph.as_default(): with self.sub_sess.as_default(): xx = tf.reshape(xx, [xx.size, -1]) for layer in range(self.layer_size): if layer == 0: - yy = self.layer_0(xx, self.matrix["layer_" + str(layer + 1)][idx], self.bias["layer_" + str(layer + 1)][idx]) + yy = self._layer_0(xx, self.matrix["layer_" + str(layer + 1)][idx], self.bias["layer_" + str(layer + 1)][idx]) dy = op_module.unaggregated_dy_dx_s(yy, 
self.matrix["layer_" + str(layer + 1)][idx]) dy2 = op_module.unaggregated_dy2_dx_s(yy, dy, self.matrix["layer_" + str(layer + 1)][idx]) else: - tt, yy = self.layer_1(yy, self.matrix["layer_" + str(layer + 1)][idx], self.bias["layer_" + str(layer + 1)][idx]) + tt, yy = self._layer_1(yy, self.matrix["layer_" + str(layer + 1)][idx], self.bias["layer_" + str(layer + 1)][idx]) dz = op_module.unaggregated_dy_dx(yy - tt, self.matrix["layer_" + str(layer + 1)][idx], dy) dy2 = op_module.unaggregated_dy2_dx(yy - tt, self.matrix["layer_" + str(layer + 1)][idx], dz, dy, dy2) dy = dz - + vv = yy.eval() dd = dy.eval() d2 = dy2.eval() return vv, dd, d2 - def layer_0(self, x, w, b): + def _layer_0(self, x, w, b): return tf.nn.tanh(tf.matmul(x, w) + b) - def layer_1(self, x, w, b): + def _layer_1(self, x, w, b): t = tf.concat([x, x], axis = 1) return t, tf.nn.tanh(tf.matmul(x, w) + b) + t - def save_data(self): + def _save_data(self): for ii in range(self.ntypes * self.ntypes): net = "filter_" + str(int(ii / self.ntypes)) + "_net_" + str(int(ii % self.ntypes)) np.savetxt('data_' + str(int(ii)), self.data[net]) diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index 0e10c24bb4..da18377683 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -3,7 +3,7 @@ set(OP_LIB ${PROJECT_SOURCE_DIR}/lib/src/SimulationRegion.cpp ${PROJECT_SOURCE_DIR}/lib/src/NeighborList.cpp) set (OP_CXX_FLAG -D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI} ) -file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_a_ef.cc descrpt_se_a_ef.cc descrpt_se_a_ef_para.cc descrpt_se_a_ef_vert.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu.cc map_aparam.cc data_info.cc unaggregated_grad.cc tabulate.cc) +file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_a_ef.cc descrpt_se_a_ef.cc descrpt_se_a_ef_para.cc 
descrpt_se_a_ef_vert.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu.cc map_aparam.cc env_mat_stat.cc unaggregated_grad.cc tabulate.cc) file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_multi_device.cc descrpt_se_r_multi_device.cc tab_inter.cc prod_force_se_a_multi_device.cc prod_virial_se_a_multi_device.cc prod_force_se_r_multi_device.cc prod_virial_se_r_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc tabulate_multi_device.cc) file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_se_a_grad.cc prod_force_se_r_grad.cc prod_virial_grad.cc prod_virial_se_a_grad.cc prod_virial_se_r_grad.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) file(GLOB OP_PY *.py) diff --git a/source/op/data_info.cc b/source/op/env_mat_stat.cc similarity index 98% rename from source/op/data_info.cc rename to source/op/env_mat_stat.cc index 12df043918..c90dc0b92f 100644 --- a/source/op/data_info.cc +++ b/source/op/env_mat_stat.cc @@ -15,7 +15,7 @@ using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; -REGISTER_OP("DataInfoSeA") +REGISTER_OP("EnvMatStatSeA") .Attr("T: {float, double}") .Input("coord: T") //atomic coordinates .Input("type: int32") //atomic type @@ -38,9 +38,9 @@ REGISTER_OP("DataInfoSeA") .Output("table_range: T"); template -class DataInfoSeAOp : public OpKernel { +class EnvMatStatSeAOp : public OpKernel { public: - explicit DataInfoSeAOp(OpKernelConstruction* context) : OpKernel(context) { + explicit EnvMatStatSeAOp(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("rcut_a", &rcut_a)); OP_REQUIRES_OK(context, context->GetAttr("rcut_r", &rcut_r)); OP_REQUIRES_OK(context, context->GetAttr("rcut_r_smth", &rcut_r_smth)); @@ -402,7 +402,7 @@ class DataInfoSeAOp : public OpKernel { #define 
REGISTER_CPU(T) \ REGISTER_KERNEL_BUILDER( \ - Name("DataInfoSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ - DataInfoSeAOp); + Name("EnvMatStatSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + EnvMatStatSeAOp); REGISTER_CPU(float); REGISTER_CPU(double); \ No newline at end of file diff --git a/source/train/Trainer.py b/source/train/Trainer.py index ce434ae467..fd4b3eac42 100644 --- a/source/train/Trainer.py +++ b/source/train/Trainer.py @@ -19,7 +19,7 @@ from deepmd.Model import Model, WFCModel, DipoleModel, PolarModel, GlobalPolarModel from deepmd.loss import EnerStdLoss, EnerDipoleLoss, TensorLoss from deepmd.utils.learning_rate import LearningRateExp -from deepmd.utils.data_info import DataInfo +from deepmd.utils.env_mat_stat import EnvMatStat from tensorflow.python.client import timeline from deepmd.env import op_module @@ -278,11 +278,11 @@ def build (self, assert hasattr(self.descrpt, 'max_nbor_size'), "Compression error: descrpt must have attr max_nbor_size" assert hasattr(self.descrpt, 'table_range'), "Compression error: descrpt must have attr table_range" if self.descrpt_type == 'se_a': - info = DataInfo(self.descrpt_type, self.descrpt.ntypes, self.descrpt.rcut_r, self.descrpt.rcut_r_smth, self.descrpt.sel_a, self.descrpt.davg, self.descrpt.dstd) + stat = EnvMatStat(self.descrpt_type, self.descrpt.ntypes, self.descrpt.rcut_r, self.descrpt.rcut_r_smth, self.descrpt.sel_a, self.descrpt.davg, self.descrpt.dstd) else: raise RuntimeError ("Model compression error: descriptor type must be se_a!") self.descrpt.distance, self.descrpt.max_nbor_size, self.descrpt.table_range\ - = info.data_info(data) + = stat.env_mat_stat(data) worker_device = "/job:%s/task:%d/%s" % (self.run_opt.my_job_name, self.run_opt.my_task_index, From 5885eed8c86459c0669785d0a9b4b9969a3c4b81 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Fri, 5 Feb 2021 21:37:47 +0800 Subject: [PATCH 10/20] optimize the interface of EnvMatStat class --- deepmd/descriptor/se_a.py | 65 ++++++++++------- 
deepmd/utils/env_mat_stat.py | 45 +++++------- source/op/env_mat_stat.cc | 137 +++++++++++++++-------------------- source/train/Trainer.py | 37 ++++++---- source/train/compress.py | 7 +- source/train/main.py | 2 +- 6 files changed, 145 insertions(+), 148 deletions(-) diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py index 2ca03378dd..e40210de1a 100644 --- a/deepmd/descriptor/se_a.py +++ b/deepmd/descriptor/se_a.py @@ -28,10 +28,7 @@ def __init__ (self, exclude_types: List[int] = [], set_davg_zero: bool = False, activation_function: str = 'tanh', - precision: str = 'default', - compress: bool = False, - model_file: str = 'frozen_model.pb', - table_info: list = [5, 0.01, 0.1, -1] + precision: str = 'default' ) -> None: """ Constructor @@ -65,12 +62,6 @@ def __init__ (self, The activation function in the embedding net. Supported options are {0} precision The precision of the embedding net parameters. Supported options are {1} - compress - Try to compress the embedding nets. Otherwise, building original embedding nets - model_file - The original frozen model, that will be compressed. - table_info - The data info of tabulation. 
""" self.sel_a = sel self.rcut_r = rcut @@ -108,17 +99,8 @@ def __init__ (self, self.useBN = False self.dstd = None self.davg = None + self.compress = False - # compress config - self.compress = compress - self.model_file = model_file - self.table_info = table_info - if self.compress: - self.distance = 100.0 - self.max_nbor_size = 0 - self.table_range = [-1, 20] - self.table = DeepTabulate(self.model_file, self.filter_np_precision, self.type_one_side) - self.place_holders = {} avg_zero = np.zeros([self.ntypes,self.ndescrpt]).astype(global_np_float_precision) std_ones = np.ones ([self.ntypes,self.ndescrpt]).astype(global_np_float_precision) @@ -248,6 +230,37 @@ def compute_input_stats (self, self.davg = np.array(all_davg) self.dstd = np.array(all_dstd) + def enable_compression(self, + distance, + max_nbor_size, + env_mat_range, + model_file = 'frozon_model.pb', + table_config = [5, 0.01, 0.1, -1] + ) -> None: + """ + Reveive the statisitcs (distance, max_nbor_size and env_mat_range) of the training data. 
+ + Parameters + ---------- + distance + The nearest nbor distance between atoms + max_nbor_size + The max nbor size of atoms + env_mat_range + The output data range of the environment matrix + model_file + The original frozen model, that will be compressed + table_info + The configuration of the tabulation + """ + self.compress = True + self.model_file = model_file + self.table_config = table_config + self.distance = distance + self.max_nbor_size = max_nbor_size + self.env_mat_range = env_mat_range + self.table = DeepTabulate(self.model_file, self.filter_np_precision, self.type_one_side) + def build (self, coord_ : tf.Tensor, @@ -348,13 +361,13 @@ def build (self, self.nlist = tf.identity(self.nlist, name = 'o_nlist') if self.compress: - self.lower = math.floor(self.table_range[0]) - self.upper = math.ceil(self.table_range[1]) + self.lower = math.floor(self.env_mat_range[0]) + self.upper = math.ceil(self.env_mat_range[1]) self.table.build(self.lower, self.upper, - self.upper * self.table_info[0], - self.table_info[1], - self.table_info[2]) + self.upper * self.table_config[0], + self.table_config[1], + self.table_config[2]) self.dout, self.qmat = self._pass_filter(self.descrpt_reshape, atype, @@ -547,7 +560,7 @@ def _filter(self, xyz_scatter = tf.reshape(tf.slice(inputs_reshape, [0,0],[-1,1]),[-1,1]) # with (natom x nei_type_i) x out_size if self.compress and (type_input, type_i) not in self.exclude_types: - info = [self.lower, self.upper, self.upper * self.table_info[0], self.table_info[1], self.table_info[2], self.table_info[3]] + info = [self.lower, self.upper, self.upper * self.table_config[0], self.table_config[1], self.table_config[2], self.table_config[3]] if self.type_one_side: net = 'filter_-1_net_' + str(type_i) else: diff --git a/deepmd/utils/env_mat_stat.py b/deepmd/utils/env_mat_stat.py index a77ce0ab62..4096024007 100644 --- a/deepmd/utils/env_mat_stat.py +++ b/deepmd/utils/env_mat_stat.py @@ -40,9 +40,9 @@ def __init__(self, dstd Standard deviation 
of training data """ - self.ntypes = ntypes self.davg = davg self.dstd = dstd + self.ntypes = ntypes self.descrpt_type = descrpt_type assert self.descrpt_type == 'se_a', 'Model compression error: descriptor type must be se_a!' self.place_holders = {} @@ -53,29 +53,24 @@ def __init__(self, self.place_holders['type'] = tf.placeholder(tf.int32, [None, None], name='t_type') self.place_holders['natoms_vec'] = tf.placeholder(tf.int32, [self.ntypes+2], name='t_natoms') self.place_holders['default_mesh'] = tf.placeholder(tf.int32, [None], name='t_mesh') - if self.descrpt_type == 'se_a': - self.rcut_a = -1 - self.rcut_r = rcut - self.rcut_r_smth = rcut_smth - self.sel_a = sel - self.sel_r = [ 0 for ii in range(len(self.sel_a)) ] - descrpt, descrpt_deriv, rij, nlist, self.distance, self.max_nbor_size, self.table_range \ - = op_module.env_mat_stat_se_a(self.place_holders['coord'], - self.place_holders['type'], - self.place_holders['natoms_vec'], - self.place_holders['box'], - self.place_holders['default_mesh'], - self.place_holders['avg'], - self.place_holders['std'], - rcut_a = self.rcut_a, - rcut_r = self.rcut_r, - rcut_r_smth = self.rcut_r_smth, - sel_a = self.sel_a, - sel_r = self.sel_r) + self.sel = sel + self.rcut = rcut + self.rcut_smth = rcut_smth + self.distance, self.max_nbor_size, self.table_range \ + = op_module.env_mat_stat(self.place_holders['coord'], + self.place_holders['type'], + self.place_holders['natoms_vec'], + self.place_holders['box'], + self.place_holders['default_mesh'], + self.place_holders['avg'], + self.place_holders['std'], + sel = self.sel, + rcut = self.rcut, + rcut_smth = self.rcut_smth) self.sub_sess = tf.Session(graph = sub_graph, config=default_tf_session_config) def env_mat_stat(self, - data) -> Tuple[float, int, List[float]]: + data) -> Tuple[float, int, List[float]]: """ get the data info of the training data, including neareest nbor distance between atoms, max nbor size of atoms and the output data range of the environment matrix @@ 
-87,10 +82,10 @@ def env_mat_stat(self, Returns ------- distance - The neareest nbor distance between atoms + The nearest nbor distance between atoms max_nbor_size The max nbor size of atoms - table_range + env_mat_range The output data range of the environment matrix """ self.lower = 0.0 @@ -136,6 +131,6 @@ def env_mat_stat(self, print('# DEEPMD: training data with upper boundary: ' + str(self.upper)) print('# DEEPMD: training data with min distance: ' + str(self.dist)) print('# DEEPMD: training data with max nborsize: ' + str(self.max_nbor)) - table_range = [self.lower, self.upper] - return self.distance, self.max_nbor_size, table_range + env_mat_range = [self.lower, self.upper] + return self.distance, self.max_nbor_size, env_mat_range \ No newline at end of file diff --git a/source/op/env_mat_stat.cc b/source/op/env_mat_stat.cc index c90dc0b92f..58e555083c 100644 --- a/source/op/env_mat_stat.cc +++ b/source/op/env_mat_stat.cc @@ -15,7 +15,7 @@ using namespace tensorflow; using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; -REGISTER_OP("EnvMatStatSeA") +REGISTER_OP("EnvMatStat") .Attr("T: {float, double}") .Input("coord: T") //atomic coordinates .Input("type: int32") //atomic type @@ -24,37 +24,24 @@ REGISTER_OP("EnvMatStatSeA") .Input("mesh : int32") .Input("davg: T") //average value of data .Input("dstd: T") //standard deviation - .Attr("rcut_a: float") //no use - .Attr("rcut_r: float") - .Attr("rcut_r_smth: float") - .Attr("sel_a: list(int)") - .Attr("sel_r: list(int)") //all zero - .Output("descrpt: T") - .Output("descrpt_deriv: T") - .Output("rij: T") - .Output("nlist: int32") + .Attr("rcut: float") //no use + .Attr("rcut_smth: float") + .Attr("sel: list(int)") .Output("distance: T") .Output("max_nbor_size: int32") - .Output("table_range: T"); + .Output("env_stat_range: T"); template -class EnvMatStatSeAOp : public OpKernel { +class EnvMatStatOp : public OpKernel { public: - explicit EnvMatStatSeAOp(OpKernelConstruction* context) : 
OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("rcut_a", &rcut_a)); - OP_REQUIRES_OK(context, context->GetAttr("rcut_r", &rcut_r)); - OP_REQUIRES_OK(context, context->GetAttr("rcut_r_smth", &rcut_r_smth)); - OP_REQUIRES_OK(context, context->GetAttr("sel_a", &sel_a)); - OP_REQUIRES_OK(context, context->GetAttr("sel_r", &sel_r)); - cum_sum (sec_a, sel_a); - cum_sum (sec_r, sel_r); - ndescrpt_a = sec_a.back() * 4; - ndescrpt_r = sec_r.back() * 1; - ndescrpt = ndescrpt_a + ndescrpt_r; - nnei_a = sec_a.back(); - nnei_r = sec_r.back(); - nnei = nnei_a + nnei_r; - fill_nei_a = (rcut_a < 0); + explicit EnvMatStatOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("rcut", &rcut)); + OP_REQUIRES_OK(context, context->GetAttr("rcut_smth", &rcut_smth)); + OP_REQUIRES_OK(context, context->GetAttr("sel", &sel)); + cum_sum (sec, sel); + ndescrpt = sec.back() * 4; + nnei = sec.back(); + fill_nei_a = true; count_nei_idx_overflow = 0; } @@ -78,8 +65,7 @@ class EnvMatStatSeAOp : public OpKernel { OP_REQUIRES (context, (mesh_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of mesh should be 1")); OP_REQUIRES (context, (avg_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of avg should be 2")); OP_REQUIRES (context, (std_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of std should be 2")); - OP_REQUIRES (context, (fill_nei_a), errors::InvalidArgument ("Rotational free descriptor only support the case rcut_a < 0")); - OP_REQUIRES (context, (sec_r.back() == 0), errors::InvalidArgument ("Rotational free descriptor only support all-angular information: sel_r should be all zero.")); + OP_REQUIRES (context, (fill_nei_a), errors::InvalidArgument ("Rotational free descriptor only support the case -1 < 0")); OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); auto natoms = natoms_tensor .flat(); @@ -151,24 
+137,19 @@ class EnvMatStatSeAOp : public OpKernel { TensorShape table_range_shape ; table_range_shape.AddDim (nloc * nnei); - int context_output_index = 0; - Tensor* descrpt_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - descrpt_shape, - &descrpt_tensor)); - Tensor* descrpt_deriv_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - descrpt_deriv_shape, - &descrpt_deriv_tensor)); - Tensor* rij_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - rij_shape, - &rij_tensor)); - Tensor* nlist_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - nlist_shape, - &nlist_tensor)); + Tensor descrpt_tensor; + OP_REQUIRES_OK(context, context->allocate_temp(DT_DOUBLE, descrpt_shape, &descrpt_tensor)); + Tensor descrpt_deriv_tensor; + OP_REQUIRES_OK(context, context->allocate_temp(DT_DOUBLE, descrpt_deriv_shape, &descrpt_deriv_tensor)); + + Tensor rij_tensor; + OP_REQUIRES_OK(context, context->allocate_temp(DT_DOUBLE, rij_shape, &rij_tensor)); + + Tensor nlist_tensor; + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, nlist_shape, &nlist_tensor)); + + int context_output_index = 0; Tensor* distance_tensor = NULL; OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, distance_shape, @@ -188,10 +169,10 @@ class EnvMatStatSeAOp : public OpKernel { auto mesh = mesh_tensor .flat(); auto avg = avg_tensor .matrix(); auto std = std_tensor .matrix(); - auto descrpt = descrpt_tensor ->matrix(); - auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); - auto rij = rij_tensor ->matrix(); - auto nlist = nlist_tensor ->matrix(); + auto descrpt = descrpt_tensor .matrix(); + auto descrpt_deriv = descrpt_deriv_tensor .matrix(); + auto rij = rij_tensor .matrix(); + auto nlist = nlist_tensor .matrix(); auto distance = distance_tensor ->flat(); // find a potential bug here! 
auto max_nbor_size = max_nbor_size_tensor ->flat(); @@ -212,8 +193,7 @@ class EnvMatStatSeAOp : public OpKernel { // if (type(0, ii) > max_type_v) max_type_v = type(0, ii); // } // int ntypes = max_type_v + 1; - OP_REQUIRES (context, (ntypes == int(sel_a.size())), errors::InvalidArgument ("number of types should match the length of sel array")); - OP_REQUIRES (context, (ntypes == int(sel_r.size())), errors::InvalidArgument ("number of types should match the length of sel array")); + OP_REQUIRES (context, (ntypes == int(sel.size())), errors::InvalidArgument ("number of types should match the length of sel array")); for (int kk = 0; kk < nsamples; ++kk){ // set region @@ -278,14 +258,14 @@ class EnvMatStatSeAOp : public OpKernel { std::vector ext_end = {mesh(10-1), mesh(11-1), mesh(12-1)}; std::vector global_grid (3); for (int dd = 0; dd < 3; ++dd) global_grid[dd] = nat_end[dd] - nat_stt[dd]; - ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, nloc, rcut_a, rcut_r, nat_stt, nat_end, ext_stt, ext_end, region, global_grid); + ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, nloc, -1, rcut, nat_stt, nat_end, ext_stt, ext_end, region, global_grid); } else if (nei_mode == 1) { // std::cout << "I'm in nei_mode 1" << std::endl; std::vector bk_d_coord3 = d_coord3; std::vector bk_d_type = d_type; std::vector ncell, ngcell; - copy_coord(d_coord3, d_type, nlist_map, ncell, ngcell, bk_d_coord3, bk_d_type, rcut_r, region); + copy_coord(d_coord3, d_type, nlist_map, ncell, ngcell, bk_d_coord3, bk_d_type, rcut, region); b_nlist_map = true; std::vector nat_stt(3, 0); std::vector ext_stt(3), ext_end(3); @@ -293,10 +273,10 @@ class EnvMatStatSeAOp : public OpKernel { ext_stt[dd] = -ngcell[dd]; ext_end[dd] = ncell[dd] + ngcell[dd]; } - ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, nloc, rcut_a, rcut_r, nat_stt, ncell, ext_stt, ext_end, region, ncell); + ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, nloc, -1, rcut, nat_stt, ncell, ext_stt, ext_end, region, ncell); } else if (nei_mode == 
-1){ - ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, rcut_a, rcut_r, NULL); + ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, -1, rcut, NULL); } else { throw std::runtime_error("unknow neighbor mode"); @@ -310,9 +290,11 @@ class EnvMatStatSeAOp : public OpKernel { for (int ii = 0; ii < nloc; ++ii){ std::vector fmt_nlist_a; std::vector fmt_nlist_r; + std::vector sec_r(sec.size(), 0); + int ret = -1; if (fill_nei_a){ - if ((ret = format_nlist_fill_a (fmt_nlist_a, fmt_nlist_r, d_coord3, ntypes, d_type, region, b_pbc, ii, d_nlist_a[ii], d_nlist_r[ii], rcut_r, sec_a, sec_r)) != -1){ + if ((ret = format_nlist_fill_a (fmt_nlist_a, fmt_nlist_r, d_coord3, ntypes, d_type, region, b_pbc, ii, d_nlist_a[ii], d_nlist_r[ii], rcut, sec, sec_r)) != -1){ if (count_nei_idx_overflow == 0) { std::cout << "WARNING: Radial neighbor list length of type " << ret << " is not enough" << std::endl; flush(std::cout); @@ -337,36 +319,36 @@ class EnvMatStatSeAOp : public OpKernel { b_pbc, ii, fmt_nlist_a, - sec_a, - rcut_r_smth, - rcut_r); + sec, + rcut_smth, + rcut); // check sizes - assert (d_descrpt_a.size() == ndescrpt_a); - assert (d_descrpt_a_deriv.size() == ndescrpt_a * 3); - assert (d_rij_a.size() == nnei_a * 3); - assert (int(fmt_nlist_a.size()) == nnei_a); + assert (d_descrpt_a.size() == ndescrpt); + assert (d_descrpt_a_deriv.size() == ndescrpt * 3); + assert (d_rij_a.size() == nnei * 3); + assert (int(fmt_nlist_a.size()) == nnei); // std::cout << "min:\t" << (0 - avg(0, 0)) / std(0, 0) << std::endl; // if (counter % 1000 == 0) { // std::cout << "min:\t" << (0 - avg(0, 0)) / std(0, 0) << std::endl; // } // record outputs - for (int jj = 0; jj < ndescrpt_a; ++jj) { + for (int jj = 0; jj < ndescrpt; ++jj) { descrpt(kk, ii * ndescrpt + jj) = (d_descrpt_a[jj] - avg(d_type[ii], jj)) / std(d_type[ii], jj); if (jj % 4 == 0) { table_range(ii * nnei + jj / 4) = descrpt(kk, ii * ndescrpt + jj); } } - for (int jj = 0; jj < ndescrpt_a * 3; ++jj) { + for (int jj = 0; jj < ndescrpt * 3; ++jj) { 
descrpt_deriv(kk, ii * ndescrpt * 3 + jj) = d_descrpt_a_deriv[jj] / std(d_type[ii], jj/3); } - for (int jj = 0; jj < nnei_a * 3; ++jj){ + for (int jj = 0; jj < nnei * 3; ++jj){ rij (kk, ii * nnei * 3 + jj) = d_rij_a[jj]; if (jj % 3 == 0 && d_rij_a[jj] > 0) { distance(ii * nnei + jj / 3) = sqrt(d_rij_a[jj] * d_rij_a[jj] + d_rij_a[jj + 1] * d_rij_a[jj + 1] + d_rij_a[jj + 2] * d_rij_a[jj + 2]); } } - for (int jj = 0; jj < nnei_a; ++jj){ + for (int jj = 0; jj < nnei; ++jj){ int record = fmt_nlist_a[jj]; if (b_nlist_map && record >= 0) { record = nlist_map[record]; @@ -378,15 +360,12 @@ class EnvMatStatSeAOp : public OpKernel { } private: int counter = -1; - float rcut_a; - float rcut_r; - float rcut_r_smth; - std::vector sel_r; - std::vector sel_a; - std::vector sec_a; - std::vector sec_r; - int ndescrpt, ndescrpt_a, ndescrpt_r; - int nnei, nnei_a, nnei_r; + float rcut; + float rcut_smth; + std::vector sel; + std::vector sec; + int ndescrpt; + int nnei; bool fill_nei_a; int count_nei_idx_overflow; void @@ -402,7 +381,7 @@ class EnvMatStatSeAOp : public OpKernel { #define REGISTER_CPU(T) \ REGISTER_KERNEL_BUILDER( \ - Name("EnvMatStatSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ - EnvMatStatSeAOp); + Name("EnvMatStat").Device(DEVICE_CPU).TypeConstraint("T"), \ + EnvMatStatOp); REGISTER_CPU(float); REGISTER_CPU(double); \ No newline at end of file diff --git a/source/train/Trainer.py b/source/train/Trainer.py index fd4b3eac42..671a69e4d0 100644 --- a/source/train/Trainer.py +++ b/source/train/Trainer.py @@ -89,14 +89,19 @@ def _init_param(self, jdata): model_param = j_must_have(jdata, 'model') descrpt_param = j_must_have(model_param, 'descriptor') fitting_param = j_must_have(model_param, 'fitting_net') - + self.model_param = model_param + self.descrpt_param = descrpt_param + self.descrpt_type = descrpt_param['type'] + if 'compress' in model_param: + self.compress_param = model_param['compress'] + # descriptor try: - self.descrpt_type = descrpt_param['type'] + 
descrpt_type = descrpt_param['type'] except KeyError: raise KeyError('the type of descriptor should be set by `type`') - if self.descrpt_type != 'hybrid': + if descrpt_type != 'hybrid': self.descrpt = _generate_descrpt_from_param_dict(descrpt_param) else : descrpt_list = [] @@ -116,19 +121,19 @@ def _init_param(self, jdata): # elif fitting_type == 'wfc': # self.fitting = WFCFitting(fitting_param, self.descrpt) elif fitting_type == 'dipole': - if self.descrpt_type == 'se_a': + if descrpt_type == 'se_a': self.fitting = DipoleFittingSeA(**fitting_param) else : raise RuntimeError('fitting dipole only supports descrptors: se_a') elif fitting_type == 'polar': - # if self.descrpt_type == 'loc_frame': + # if descrpt_type == 'loc_frame': # self.fitting = PolarFittingLocFrame(fitting_param, self.descrpt) - if self.descrpt_type == 'se_a': + if descrpt_type == 'se_a': self.fitting = PolarFittingSeA(**fitting_param) else : raise RuntimeError('fitting polar only supports descrptors: loc_frame and se_a') elif fitting_type == 'global_polar': - if self.descrpt_type == 'se_a': + if descrpt_type == 'se_a': self.fitting = GlobalPolarFittingSeA(**fitting_param) else : raise RuntimeError('fitting global_polar only supports descrptors: loc_frame and se_a') @@ -248,7 +253,6 @@ def _init_param(self, jdata): else : self.numb_fparam = 0 - def _message (self, msg) : self.run_opt.message(msg) @@ -273,16 +277,21 @@ def build (self, self.model.data_stat(data) - if hasattr(self.descrpt, 'compress') and self.descrpt.compress: - assert hasattr(self.descrpt, 'distance'), "Compression error: descrpt must have attr distance" - assert hasattr(self.descrpt, 'max_nbor_size'), "Compression error: descrpt must have attr max_nbor_size" - assert hasattr(self.descrpt, 'table_range'), "Compression error: descrpt must have attr table_range" + if 'compress' in self.model_param and self.compress_param['compress']: + assert hasattr(self.descrpt, 'davg'), "Model compression error: descriptor must have attr davg!" 
+ assert hasattr(self.descrpt, 'dstd'), "Model compression error: descriptor must have attr dstd!" + assert hasattr(self.descrpt, 'ntypes'), "Model compression error: descriptor must have attr ntypes!" + assert 'sel' in self.descrpt_param, "Model compression error: descriptor must have attr sel!" + assert 'rcut' in self.descrpt_param, "Model compression error: descriptor must have attr rcut!" + assert 'rcut_smth' in self.descrpt_param, "Model compression error: descriptor must have attr rcut_smth!" if self.descrpt_type == 'se_a': - stat = EnvMatStat(self.descrpt_type, self.descrpt.ntypes, self.descrpt.rcut_r, self.descrpt.rcut_r_smth, self.descrpt.sel_a, self.descrpt.davg, self.descrpt.dstd) + stat = EnvMatStat(self.descrpt_type, self.descrpt.ntypes, self.descrpt_param['rcut'], self.descrpt_param['rcut_smth'], self.descrpt_param['sel'], self.descrpt.davg, self.descrpt.dstd) else: raise RuntimeError ("Model compression error: descriptor type must be se_a!") - self.descrpt.distance, self.descrpt.max_nbor_size, self.descrpt.table_range\ + + distance, max_nbor_size, env_mat_range\ = stat.env_mat_stat(data) + self.descrpt.enable_compression(distance, max_nbor_size, env_mat_range, self.compress_param['model_file'], self.compress_param['table_config']) # send the statistics of the training data and activate the descriptor compression mode worker_device = "/job:%s/task:%d/%s" % (self.run_opt.my_job_name, self.run_opt.my_task_index, diff --git a/source/train/compress.py b/source/train/compress.py index 64035d53ed..1f36f8e9e1 100644 --- a/source/train/compress.py +++ b/source/train/compress.py @@ -18,9 +18,10 @@ def compress(args): dump = 'input_v1_compat.json') jdata = normalize(jdata) - jdata['model']['descriptor']['compress'] = True - jdata['model']['descriptor']['model_file'] = args.input - jdata['model']['descriptor']['table_info'] = args.table_info + jdata['model']['compress'] = {} + jdata['model']['compress']['compress'] = True + 
jdata['model']['compress']['model_file'] = args.input + jdata['model']['compress']['table_config'] = args.table_config # check the descriptor info of the input file assert jdata['model']['descriptor']['type'] == 'se_a', 'Model compression error: descriptor type must be se_a!' diff --git a/source/train/main.py b/source/train/main.py index bd5fbc1d8a..9baaffce4e 100644 --- a/source/train/main.py +++ b/source/train/main.py @@ -71,7 +71,7 @@ def main () : help = "the original model") parser_compress.add_argument("-o","--output", default = "frozen_model_compress.pb", type=str, help='the compressed model') - parser_compress.add_argument('-t', '--table-info', nargs='+', default = [5, 0.01, 0.1, 1], type=float) + parser_compress.add_argument('-t', '--table-config', nargs='+', default = [5, 0.01, 0.1, 1], type=float) parser_compress.add_argument("-d", "--folder", type=str, default = ".", help="path to checkpoint folder") From 8588554e1d9a8e4eb036494a0d9728d1f0fec9bb Mon Sep 17 00:00:00 2001 From: denghuilu Date: Fri, 5 Feb 2021 22:08:14 +0800 Subject: [PATCH 11/20] use standard tensorflow op style for custome ops --- source/op/descrpt.cc | 2 +- source/op/descrpt_se_a_ef.cc | 70 +++++++++++------------------- source/op/descrpt_se_a_ef_para.cc | 71 +++++++++++-------------------- source/op/descrpt_se_a_ef_vert.cc | 70 +++++++++++------------------- source/op/map_aparam.cc | 36 ++++++---------- 5 files changed, 93 insertions(+), 156 deletions(-) diff --git a/source/op/descrpt.cc b/source/op/descrpt.cc index 71918b4a5a..201169f7f1 100644 --- a/source/op/descrpt.cc +++ b/source/op/descrpt.cc @@ -602,7 +602,7 @@ class DescrptOp : public OpKernel { #define REGISTER_CPU(T) \ REGISTER_KERNEL_BUILDER( \ - Name("Descrpt").Device(DEVICE_CPU).TypeConstraint("T"), \ + Name("Descrpt").Device(DEVICE_CPU).TypeConstraint("T"), \ DescrptOp); REGISTER_CPU(float); REGISTER_CPU(double); diff --git a/source/op/descrpt_se_a_ef.cc b/source/op/descrpt_se_a_ef.cc index b4a631b7cf..9aca7eb720 100644 
--- a/source/op/descrpt_se_a_ef.cc +++ b/source/op/descrpt_se_a_ef.cc @@ -12,52 +12,30 @@ typedef double compute_t; using namespace tensorflow; // using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE ; -#else -typedef float VALUETYPE ; -#endif +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; -#ifdef HIGH_PREC REGISTER_OP("DescrptSeAEf") -.Input("coord: double") +.Attr("T: {float, double}") +.Input("coord: T") .Input("type: int32") .Input("natoms: int32") -.Input("box: double") +.Input("box: T") .Input("mesh: int32") -.Input("ef: double") -.Input("davg: double") -.Input("dstd: double") +.Input("ef: T") +.Input("davg: T") +.Input("dstd: T") .Attr("rcut_a: float") .Attr("rcut_r: float") .Attr("rcut_r_smth: float") .Attr("sel_a: list(int)") .Attr("sel_r: list(int)") -.Output("descrpt: double") -.Output("descrpt_deriv: double") -.Output("rij: double") +.Output("descrpt: T") +.Output("descrpt_deriv: T") +.Output("rij: T") .Output("nlist: int32"); -#else -REGISTER_OP("DescrptSeAEf") -.Input("coord: float") -.Input("type: int32") -.Input("natoms: int32") -.Input("box: float") -.Input("mesh: int32") -.Input("ef: float") -.Input("davg: float") -.Input("dstd: float") -.Attr("rcut_a: float") -.Attr("rcut_r: float") -.Attr("rcut_r_smth: float") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Output("descrpt: float") -.Output("descrpt_deriv: float") -.Output("rij: float") -.Output("nlist: int32"); -#endif +template class DescrptSeAEfOp : public OpKernel { public: explicit DescrptSeAEfOp(OpKernelConstruction* context) : OpKernel(context) { @@ -186,16 +164,16 @@ class DescrptSeAEfOp : public OpKernel { nlist_shape, &nlist_tensor)); - auto coord = coord_tensor .matrix(); + auto coord = coord_tensor .matrix(); auto type = type_tensor .matrix(); - auto box = box_tensor .matrix(); + auto box = box_tensor .matrix(); auto mesh = mesh_tensor .flat(); - auto ef = ef_tensor .matrix(); - auto avg = avg_tensor .matrix(); - auto std = 
std_tensor .matrix(); - auto descrpt = descrpt_tensor ->matrix(); - auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); - auto rij = rij_tensor ->matrix(); + auto ef = ef_tensor .matrix(); + auto avg = avg_tensor .matrix(); + auto std = std_tensor .matrix(); + auto descrpt = descrpt_tensor ->matrix(); + auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); + auto rij = rij_tensor ->matrix(); auto nlist = nlist_tensor ->matrix(); // // check the types @@ -385,5 +363,9 @@ class DescrptSeAEfOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("DescrptSeAEf").Device(DEVICE_CPU), DescrptSeAEfOp); - +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeAEf").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptSeAEfOp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/descrpt_se_a_ef_para.cc b/source/op/descrpt_se_a_ef_para.cc index af17b3ca12..2dccb40ae3 100644 --- a/source/op/descrpt_se_a_ef_para.cc +++ b/source/op/descrpt_se_a_ef_para.cc @@ -11,53 +11,30 @@ typedef double compute_t; using namespace tensorflow; // using namespace std; +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; -#ifdef HIGH_PREC -typedef double VALUETYPE ; -#else -typedef float VALUETYPE ; -#endif - -#ifdef HIGH_PREC -REGISTER_OP("DescrptSeAEfPara") -.Input("coord: double") -.Input("type: int32") -.Input("natoms: int32") -.Input("box: double") -.Input("mesh: int32") -.Input("ef: double") -.Input("davg: double") -.Input("dstd: double") -.Attr("rcut_a: float") -.Attr("rcut_r: float") -.Attr("rcut_r_smth: float") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Output("descrpt: double") -.Output("descrpt_deriv: double") -.Output("rij: double") -.Output("nlist: int32"); -#else REGISTER_OP("DescrptSeAEfPara") -.Input("coord: float") +.Attr("T: {float, double}") +.Input("coord: T") .Input("type: int32") .Input("natoms: int32") -.Input("box: float") +.Input("box: T") .Input("mesh: int32") -.Input("ef: float") -.Input("davg: 
float") -.Input("dstd: float") +.Input("ef: T") +.Input("davg: T") +.Input("dstd: T") .Attr("rcut_a: float") .Attr("rcut_r: float") .Attr("rcut_r_smth: float") .Attr("sel_a: list(int)") .Attr("sel_r: list(int)") -.Output("descrpt: float") -.Output("descrpt_deriv: float") -.Output("rij: float") +.Output("descrpt: T") +.Output("descrpt_deriv: T") +.Output("rij: T") .Output("nlist: int32"); -#endif +template class DescrptSeAEfParaOp : public OpKernel { public: explicit DescrptSeAEfParaOp(OpKernelConstruction* context) : OpKernel(context) { @@ -186,16 +163,16 @@ class DescrptSeAEfParaOp : public OpKernel { nlist_shape, &nlist_tensor)); - auto coord = coord_tensor .matrix(); + auto coord = coord_tensor .matrix(); auto type = type_tensor .matrix(); - auto box = box_tensor .matrix(); + auto box = box_tensor .matrix(); auto mesh = mesh_tensor .flat(); - auto ef = ef_tensor .matrix(); - auto avg = avg_tensor .matrix(); - auto std = std_tensor .matrix(); - auto descrpt = descrpt_tensor ->matrix(); - auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); - auto rij = rij_tensor ->matrix(); + auto ef = ef_tensor .matrix(); + auto avg = avg_tensor .matrix(); + auto std = std_tensor .matrix(); + auto descrpt = descrpt_tensor ->matrix(); + auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); + auto rij = rij_tensor ->matrix(); auto nlist = nlist_tensor ->matrix(); // // check the types @@ -385,5 +362,9 @@ class DescrptSeAEfParaOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("DescrptSeAEfPara").Device(DEVICE_CPU), DescrptSeAEfParaOp); - +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeAEfPara").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptSeAEfParaOp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/descrpt_se_a_ef_vert.cc b/source/op/descrpt_se_a_ef_vert.cc index 1d416864e2..5734a8811f 100644 --- a/source/op/descrpt_se_a_ef_vert.cc +++ b/source/op/descrpt_se_a_ef_vert.cc @@ -12,52 +12,30 @@ typedef double compute_t; using 
namespace tensorflow; // using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE ; -#else -typedef float VALUETYPE ; -#endif +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; -#ifdef HIGH_PREC REGISTER_OP("DescrptSeAEfVert") -.Input("coord: double") +.Attr("T: {float, double}") +.Input("coord: T") .Input("type: int32") .Input("natoms: int32") -.Input("box: double") +.Input("box: T") .Input("mesh: int32") -.Input("ef: double") -.Input("davg: double") -.Input("dstd: double") +.Input("ef: T") +.Input("davg: T") +.Input("dstd: T") .Attr("rcut_a: float") .Attr("rcut_r: float") .Attr("rcut_r_smth: float") .Attr("sel_a: list(int)") .Attr("sel_r: list(int)") -.Output("descrpt: double") -.Output("descrpt_deriv: double") -.Output("rij: double") +.Output("descrpt: T") +.Output("descrpt_deriv: T") +.Output("rij: T") .Output("nlist: int32"); -#else -REGISTER_OP("DescrptSeAEfVert") -.Input("coord: float") -.Input("type: int32") -.Input("natoms: int32") -.Input("box: float") -.Input("mesh: int32") -.Input("ef: float") -.Input("davg: float") -.Input("dstd: float") -.Attr("rcut_a: float") -.Attr("rcut_r: float") -.Attr("rcut_r_smth: float") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Output("descrpt: float") -.Output("descrpt_deriv: float") -.Output("rij: float") -.Output("nlist: int32"); -#endif +template class DescrptSeAEfVertOp : public OpKernel { public: explicit DescrptSeAEfVertOp(OpKernelConstruction* context) : OpKernel(context) { @@ -186,16 +164,16 @@ class DescrptSeAEfVertOp : public OpKernel { nlist_shape, &nlist_tensor)); - auto coord = coord_tensor .matrix(); + auto coord = coord_tensor .matrix(); auto type = type_tensor .matrix(); - auto box = box_tensor .matrix(); + auto box = box_tensor .matrix(); auto mesh = mesh_tensor .flat(); - auto ef = ef_tensor .matrix(); - auto avg = avg_tensor .matrix(); - auto std = std_tensor .matrix(); - auto descrpt = descrpt_tensor ->matrix(); - auto descrpt_deriv = 
descrpt_deriv_tensor ->matrix(); - auto rij = rij_tensor ->matrix(); + auto ef = ef_tensor .matrix(); + auto avg = avg_tensor .matrix(); + auto std = std_tensor .matrix(); + auto descrpt = descrpt_tensor ->matrix(); + auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); + auto rij = rij_tensor ->matrix(); auto nlist = nlist_tensor ->matrix(); // // check the types @@ -385,5 +363,9 @@ class DescrptSeAEfVertOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("DescrptSeAEfVert").Device(DEVICE_CPU), DescrptSeAEfVertOp); - +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeAEfVert").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptSeAEfVertOp); +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/map_aparam.cc b/source/op/map_aparam.cc index 43bc8a011f..608f5f614b 100644 --- a/source/op/map_aparam.cc +++ b/source/op/map_aparam.cc @@ -6,32 +6,19 @@ using namespace tensorflow; // using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; -#ifdef HIGH_PREC REGISTER_OP("MapAparam") -.Input("aparam: double") +.Attr("T: {float, double}") +.Input("aparam: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("output: double"); -#else -REGISTER_OP("MapAparam") -.Input("aparam: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("mapped: float"); -#endif - -using namespace tensorflow; +.Output("output: T"); +template class MapAparamOp : public OpKernel { public: explicit MapAparamOp(OpKernelConstruction* context) : OpKernel(context) { @@ -73,9 +60,9 @@ class MapAparamOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output_tensor)); // flat the tensors - auto aparam = aparam_tensor.flat(); + auto aparam = aparam_tensor.flat(); auto 
nlist = nlist_tensor.flat(); - auto output = output_tensor->flat(); + auto output = output_tensor->flat(); // loop over samples #pragma omp parallel for @@ -110,7 +97,12 @@ class MapAparamOp : public OpKernel { int n_r_sel, n_a_sel, n_a_shift; }; -REGISTER_KERNEL_BUILDER(Name("MapAparam").Device(DEVICE_CPU), MapAparamOp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("MapAparam").Device(DEVICE_CPU).TypeConstraint("T"), \ + MapAparamOp); +REGISTER_CPU(float); +REGISTER_CPU(double); From feb0669b7666ce256040fc8941378e649a083760 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Sat, 6 Feb 2021 19:21:40 +0800 Subject: [PATCH 12/20] optimize code structure for method EnvMatStat --- deepmd/descriptor/se_a.py | 16 ++- deepmd/utils/env_mat_stat.py | 121 +++++++++++++++-------- source/lib/include/ComputeDescriptor.h | 46 +++++++++ source/lib/include/CustomeOperation.h | 4 +- source/op/descrpt.cc | 2 +- source/op/env_mat_stat.cc | 131 ++++--------------------- source/train/Trainer.py | 10 +- 7 files changed, 159 insertions(+), 171 deletions(-) diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py index e40210de1a..3b9a4db0af 100644 --- a/deepmd/descriptor/se_a.py +++ b/deepmd/descriptor/se_a.py @@ -231,8 +231,6 @@ def compute_input_stats (self, self.dstd = np.array(all_dstd) def enable_compression(self, - distance, - max_nbor_size, env_mat_range, model_file = 'frozon_model.pb', table_config = [5, 0.01, 0.1, -1] @@ -242,22 +240,22 @@ def enable_compression(self, Parameters ---------- - distance - The nearest nbor distance between atoms - max_nbor_size - The max nbor size of atoms env_mat_range The output data range of the environment matrix + env_mat_range[0] denotes the lower boundary of environment matrix + env_mat_range[1] denotes the upper boundary of environment matrix model_file The original frozen model, that will be compressed - table_info + table_config The configuration of the tabulation + Table_config[0] denotes the scale of model 
 extrapolation + Table_config[1] denotes the first table stride + Table_config[2] denotes the second table stride + Table_config[3] denotes the overflow check frequency """ self.compress = True self.model_file = model_file self.table_config = table_config - self.distance = distance - self.max_nbor_size = max_nbor_size self.env_mat_range = env_mat_range self.table = DeepTabulate(self.model_file, self.filter_np_precision, self.type_one_side) diff --git a/deepmd/utils/env_mat_stat.py b/deepmd/utils/env_mat_stat.py index 4096024007..1518207de7 100644 --- a/deepmd/utils/env_mat_stat.py +++ b/deepmd/utils/env_mat_stat.py @@ -15,6 +15,7 @@ class EnvMatStat(): def __init__(self, descrpt_type : str, ntypes : int, + ndescrpt : int, rcut, rcut_smth, sel, @@ -29,6 +30,8 @@ def __init__(self, The descrpt type of the embedding net ntypes The num of atom types + ndescrpt + The width of environment matrix rcut The cut-off radius rcut_smth @@ -40,9 +43,15 @@ def __init__(self, dstd Standard deviation of training data """ + self.init_stat = False self.davg = davg self.dstd = dstd self.ntypes = ntypes + self.ndescrpt = ndescrpt + if self.davg is None: + self.davg = np.zeros([self.ntypes, self.ndescrpt]) + if self.dstd is None: + self.dstd = np.ones ([self.ntypes, self.ndescrpt]) self.descrpt_type = descrpt_type assert self.descrpt_type == 'se_a', 'Model compression error: descriptor type must be se_a!' 
self.place_holders = {} @@ -56,23 +65,21 @@ def __init__(self, self.sel = sel self.rcut = rcut self.rcut_smth = rcut_smth - self.distance, self.max_nbor_size, self.table_range \ + self._min_nbor_dist, self._max_nbor_size \ = op_module.env_mat_stat(self.place_holders['coord'], self.place_holders['type'], self.place_holders['natoms_vec'], self.place_holders['box'], self.place_holders['default_mesh'], - self.place_holders['avg'], - self.place_holders['std'], sel = self.sel, rcut = self.rcut, rcut_smth = self.rcut_smth) self.sub_sess = tf.Session(graph = sub_graph, config=default_tf_session_config) - def env_mat_stat(self, - data) -> Tuple[float, int, List[float]]: + def get_env_mat_stat(self, + data) -> Tuple[float, int]: """ - get the data info of the training data, including neareest nbor distance between atoms, max nbor size of atoms and the output data range of the environment matrix + get the data statistics of the training data, including nearest nbor distance between atoms, max nbor size of atoms Parameters ---------- @@ -81,56 +88,90 @@ def env_mat_stat(self, Returns ------- - distance + min_nbor_dist The nearest nbor distance between atoms max_nbor_size The max nbor size of atoms - env_mat_range - The output data range of the environment matrix """ - self.lower = 0.0 - self.upper = 0.0 - self.dist = 100.0 - self.max_nbor = 0 - - davg = self.davg - dstd = self.dstd - if davg is None: - davg = np.zeros([self.ntypes, self.ndescrpt]) - if dstd is None: - dstd = np.ones ([self.ntypes, self.ndescrpt]) + self.max_nbor_size = 0 + self.min_nbor_dist = 100.0 for ii in tqdm(range(len(data.system_dirs)), desc = '# DEEPMD: getting data info'): for jj in data.data_systems[ii].dirs: data_set = data.data_systems[ii]._load_set(jj) for kk in range(np.array(data_set['type']).shape[0]): - dt, mn, tr \ - = self.sub_sess.run([self.distance, self.max_nbor_size, self.table_range], + dt, mn \ + = self.sub_sess.run([self._min_nbor_dist, self._max_nbor_size], feed_dict = { 
self.place_holders['coord']: np.array(data_set['coord'])[kk].reshape([-1, data.natoms[ii] * 3]), self.place_holders['type']: np.array(data_set['type'])[kk].reshape([-1, data.natoms[ii]]), self.place_holders['natoms_vec']: np.array(data.natoms_vec[ii]), self.place_holders['box']: np.array(data_set['box'])[kk].reshape([-1, 9]), self.place_holders['default_mesh']: np.array(data.default_mesh[ii]), - self.place_holders['avg']: davg, - self.place_holders['std']: dstd, }) - dr = np.array([np.min(tr), np.max(tr)]).astype(global_np_float_precision) dt = np.min(dt) mn = np.max(mn) - if (dr[0] < self.lower): - self.lower = dr[0] - if (dr[1] > self.upper): - self.upper = dr[1] - if (dt < self.dist): - self.dist = dt - if (mn > self.max_nbor): - self.max_nbor = mn + if (dt < self.min_nbor_dist): + self.min_nbor_dist = dt + if (mn > self.max_nbor_size): + self.max_nbor_size = mn + self.init_stat = True + return self.min_nbor_dist, self.max_nbor_size + + def get_env_mat_range(self, + data) -> List[float]: + """ + get the data statistics of the training data, including the output data range of the environment matrix + + Parameters + ---------- + data + Class for manipulating many data systems. It is implemented with the help of DeepmdData. 
+ + Returns + ------- + env_mat_range + The output data range of the environment matrix + env_mat_range[0] denotes the lower boundary of environment matrix + env_mat_range[1] denotes the upper boundary of environment matrix + """ + if self.init_stat: + min_nbor_dist = self.min_nbor_dist + max_nbor_size = self.max_nbor_size + else: + min_nbor_dist, max_nbor_size = self.get_env_mat_stat(data) + self.env_mat_range = self._get_internal_env_mat_range(min_nbor_dist, max_nbor_size) + print('# DEEPMD: training data with lower boundary: ' + str(self.env_mat_range[0])) + print('# DEEPMD: training data with upper boundary: ' + str(self.env_mat_range[1])) + print('# DEEPMD: training data with min distance: ' + str(self.min_nbor_dist)) + print('# DEEPMD: training data with max nborsize: ' + str(self.max_nbor_size)) + return self.env_mat_range + + def _get_internal_env_mat_range(self, + min_nbor_dist, + max_nbor_size): + """ + Warning: different descrpt_type may have different method to get the mat range + """ + lower = 100.0 + upper = -10.0 + sw = self._spline5_switch(self.min_nbor_dist, self.rcut_smth, self.rcut) + for ii in range(self.ntypes): + if lower > -self.davg[ii][0] / self.dstd[ii][0]: + lower = -self.davg[ii][0] / self.dstd[ii][0] + if upper < ((1 / self.min_nbor_dist) * sw - self.davg[ii][0]) / self.dstd[ii][0]: + upper = ((1 / self.min_nbor_dist) * sw - self.davg[ii][0]) / self.dstd[ii][0] + return [lower, upper] - print('# DEEPMD: training data with lower boundary: ' + str(self.lower)) - print('# DEEPMD: training data with upper boundary: ' + str(self.upper)) - print('# DEEPMD: training data with min distance: ' + str(self.dist)) - print('# DEEPMD: training data with max nborsize: ' + str(self.max_nbor)) - env_mat_range = [self.lower, self.upper] - return self.distance, self.max_nbor_size, env_mat_range - \ No newline at end of file + def _spline5_switch(self, + xx, + rmin, + rmax): + if xx < rmin: + vv = 1 + elif xx < rmax: + uu = (xx - rmin) / (rmax - rmin) + vv 
= uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1 + else: + vv = 0 + return vv diff --git a/source/lib/include/ComputeDescriptor.h b/source/lib/include/ComputeDescriptor.h index b01f9ed44b..bab9032828 100644 --- a/source/lib/include/ComputeDescriptor.h +++ b/source/lib/include/ComputeDescriptor.h @@ -143,6 +143,18 @@ void compute_descriptor_se_r (std::vector & descrpt_r, const double & rmin, const double & rmax); +inline +void get_rij(std::vector & rij_a, + const std::vector & posi, + const int & ntypes, + const std::vector & type, + const SimulationRegion & region, + const bool & b_pbc, + const int & i_idx, + const std::vector & fmt_nlist_a, + const std::vector & sec_a, + const double & rmin, + const double & rmax); struct NeighborInfo { @@ -1063,6 +1075,40 @@ void compute_descriptor_se_a (std::vector & descrpt_a, } } +void get_rij(std::vector & rij_a, + const std::vector & posi, + const int & ntypes, + const std::vector & type, + const SimulationRegion & region, + const bool & b_pbc, + const int & i_idx, + const std::vector & fmt_nlist_a, + const std::vector & sec_a, + const double & rmin, + const double & rmax) +{ + // compute the diff of the neighbors + std::vector > sel_a_diff (sec_a.back()); + rij_a.resize (sec_a.back() * 3); + fill (rij_a.begin(), rij_a.end(), 0.0); + for (int ii = 0; ii < int(sec_a.size()) - 1; ++ii){ + for (int jj = sec_a[ii]; jj < sec_a[ii+1]; ++jj){ + if (fmt_nlist_a[jj] < 0) break; + sel_a_diff[jj].resize(3); + const int & j_idx = fmt_nlist_a[jj]; + if (b_pbc){ + region.diffNearestNeighbor (posi[j_idx*3+0], posi[j_idx*3+1], posi[j_idx*3+2], + posi[i_idx*3+0], posi[i_idx*3+1], posi[i_idx*3+2], + sel_a_diff[jj][0], sel_a_diff[jj][1], sel_a_diff[jj][2]); + } + else { + for (int dd = 0; dd < 3; ++dd) sel_a_diff[jj][dd] = posi[j_idx*3+dd] - posi[i_idx*3+dd]; + } + for (int dd = 0; dd < 3; ++dd) rij_a[jj*3+dd] = sel_a_diff[jj][dd]; + } + } +} + void compute_descriptor_se_r (std::vector & descrpt, std::vector & descrpt_deriv, diff --git 
a/source/lib/include/CustomeOperation.h b/source/lib/include/CustomeOperation.h index 98b64e44af..5871175dd7 100644 --- a/source/lib/include/CustomeOperation.h +++ b/source/lib/include/CustomeOperation.h @@ -169,7 +169,7 @@ void compute_descriptor_se_a_cpu ( } template -void DescrptSeACPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { +void DescrptSeACPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int max_nbor_size) { // set & normalize coord std::vector d_coord3(nall * 3); for (int ii = 0; ii < nall; ++ii) { @@ -235,7 +235,7 @@ void DescrptSeACPULauncher(const FPTYPE * coord, const int * type, const int * i #if GOOGLE_CUDA template -void DescrptSeAGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { +void DescrptSeAGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned 
long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int max_nbor_size) { DescrptSeAGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, max_nbor_size); } #endif // GOOGLE_CUDA diff --git a/source/op/descrpt.cc b/source/op/descrpt.cc index 201169f7f1..71918b4a5a 100644 --- a/source/op/descrpt.cc +++ b/source/op/descrpt.cc @@ -602,7 +602,7 @@ class DescrptOp : public OpKernel { #define REGISTER_CPU(T) \ REGISTER_KERNEL_BUILDER( \ - Name("Descrpt").Device(DEVICE_CPU).TypeConstraint("T"), \ + Name("Descrpt").Device(DEVICE_CPU).TypeConstraint("T"), \ DescrptOp); REGISTER_CPU(float); REGISTER_CPU(double); diff --git a/source/op/env_mat_stat.cc b/source/op/env_mat_stat.cc index 58e555083c..0db739272f 100644 --- a/source/op/env_mat_stat.cc +++ b/source/op/env_mat_stat.cc @@ -22,14 +22,11 @@ REGISTER_OP("EnvMatStat") .Input("natoms: int32") //local atomic number; each type atomic number; daizheyingxiangqude atomic numbers .Input("box : T") .Input("mesh : int32") - .Input("davg: T") //average value of data - .Input("dstd: T") //standard deviation .Attr("rcut: float") //no use .Attr("rcut_smth: float") .Attr("sel: list(int)") - .Output("distance: T") - .Output("max_nbor_size: int32") - .Output("env_stat_range: T"); + .Output("min_nbor_dist: T") + .Output("max_nbor_size: int32"); template class EnvMatStatOp : public OpKernel { public: explicit EnvMatStatOp(OpKernelConstruction* context) : OpKernel(context) { @@ -41,8 +38,6 @@ class EnvMatStatOp : public OpKernel { cum_sum (sec, sel); ndescrpt = sec.back() * 4; nnei = sec.back(); - fill_nei_a = true; - count_nei_idx_overflow = 0; } void Compute(OpKernelContext* context) override { @@ -54,8 +49,6 @@ class EnvMatStatOp : 
public OpKernel { const Tensor& natoms_tensor = context->input(context_input_index++); const Tensor& box_tensor = context->input(context_input_index++); const Tensor& mesh_tensor = context->input(context_input_index++); - const Tensor& avg_tensor = context->input(context_input_index++); - const Tensor& std_tensor = context->input(context_input_index++); // set size of the sample OP_REQUIRES (context, (coord_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of coord should be 2")); @@ -63,10 +56,6 @@ class EnvMatStatOp : public OpKernel { OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); OP_REQUIRES (context, (box_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of box should be 2")); OP_REQUIRES (context, (mesh_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of mesh should be 1")); - OP_REQUIRES (context, (avg_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of avg should be 2")); - OP_REQUIRES (context, (std_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of std should be 2")); - OP_REQUIRES (context, (fill_nei_a), errors::InvalidArgument ("Rotational free descriptor only support the case -1 < 0")); - OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); auto natoms = natoms_tensor .flat(); int nloc = natoms(0); @@ -77,14 +66,9 @@ class EnvMatStatOp : public OpKernel { // check the sizes OP_REQUIRES (context, (nsamples == type_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); OP_REQUIRES (context, (nsamples == box_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); - OP_REQUIRES (context, (ntypes == avg_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of avg should be ntype")); - OP_REQUIRES (context, (ntypes == std_tensor.shape().dim_size(0)), errors::InvalidArgument 
("number of std should be ntype")); - OP_REQUIRES (context, (nall * 3 == coord_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); OP_REQUIRES (context, (nall == type_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); OP_REQUIRES (context, (9 == box_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of box should be 9")); - OP_REQUIRES (context, (ndescrpt == avg_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of avg should be ndescrpt")); - OP_REQUIRES (context, (ndescrpt == std_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of std should be ndescrpt")); int nei_mode = 0; if (mesh_tensor.shape().dim_size(0) == 16) { @@ -117,83 +101,35 @@ class EnvMatStatOp : public OpKernel { b_norm_atom = true; } - // Create an output tensor - TensorShape descrpt_shape ; - descrpt_shape.AddDim (nsamples); - descrpt_shape.AddDim (nloc * ndescrpt); - TensorShape descrpt_deriv_shape ; - descrpt_deriv_shape.AddDim (nsamples); - descrpt_deriv_shape.AddDim (nloc * ndescrpt * 3); - TensorShape rij_shape ; - rij_shape.AddDim (nsamples); - rij_shape.AddDim (nloc * nnei * 3); - TensorShape nlist_shape ; - nlist_shape.AddDim (nsamples); - nlist_shape.AddDim (nloc * nnei); - TensorShape distance_shape ; - distance_shape.AddDim (nloc * nnei); + TensorShape min_nbor_dist_shape ; + min_nbor_dist_shape.AddDim (nloc * nnei); TensorShape max_nbor_size_shape ; max_nbor_size_shape.AddDim (nloc); - TensorShape table_range_shape ; - table_range_shape.AddDim (nloc * nnei); - - Tensor descrpt_tensor; - OP_REQUIRES_OK(context, context->allocate_temp(DT_DOUBLE, descrpt_shape, &descrpt_tensor)); - - Tensor descrpt_deriv_tensor; - OP_REQUIRES_OK(context, context->allocate_temp(DT_DOUBLE, descrpt_deriv_shape, &descrpt_deriv_tensor)); - - Tensor rij_tensor; - OP_REQUIRES_OK(context, context->allocate_temp(DT_DOUBLE, rij_shape, &rij_tensor)); - Tensor nlist_tensor; - OP_REQUIRES_OK(context, 
context->allocate_temp(DT_INT32, nlist_shape, &nlist_tensor)); - int context_output_index = 0; - Tensor* distance_tensor = NULL; + Tensor* min_nbor_dist_tensor = NULL; OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - distance_shape, - &distance_tensor)); + min_nbor_dist_shape, + &min_nbor_dist_tensor)); Tensor* max_nbor_size_tensor = NULL; OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, max_nbor_size_shape, &max_nbor_size_tensor)); - Tensor* table_range_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - table_range_shape, - &table_range_tensor)); auto coord = coord_tensor .matrix(); auto type = type_tensor .matrix(); auto box = box_tensor .matrix(); auto mesh = mesh_tensor .flat(); - auto avg = avg_tensor .matrix(); - auto std = std_tensor .matrix(); - auto descrpt = descrpt_tensor .matrix(); - auto descrpt_deriv = descrpt_deriv_tensor .matrix(); - auto rij = rij_tensor .matrix(); - auto nlist = nlist_tensor .matrix(); - auto distance = distance_tensor ->flat(); + auto min_nbor_dist = min_nbor_dist_tensor ->flat(); // find a potential bug here! 
auto max_nbor_size = max_nbor_size_tensor ->flat(); - auto table_range = table_range_tensor ->flat(); - for (int ii = 0; ii < static_cast(distance_tensor->NumElements()); ii++) { - distance(ii) = 10000.0; + for (int ii = 0; ii < static_cast(min_nbor_dist_tensor->NumElements()); ii++) { + min_nbor_dist(ii) = 10000.0; } for (int ii = 0; ii < static_cast(max_nbor_size_tensor->NumElements()); ii++) { max_nbor_size(ii) = 0; } - for (int ii = 0; ii < static_cast(table_range_tensor->NumElements()); ii++) { - table_range(ii) = 0.0; - } - // // check the types - // int max_type_v = 0; - // for (int ii = 0; ii < natoms; ++ii){ - // if (type(0, ii) > max_type_v) max_type_v = type(0, ii); - // } - // int ntypes = max_type_v + 1; - OP_REQUIRES (context, (ntypes == int(sel.size())), errors::InvalidArgument ("number of types should match the length of sel array")); for (int kk = 0; kk < nsamples; ++kk){ // set region @@ -293,25 +229,16 @@ class EnvMatStatOp : public OpKernel { std::vector sec_r(sec.size(), 0); int ret = -1; - if (fill_nei_a){ - if ((ret = format_nlist_fill_a (fmt_nlist_a, fmt_nlist_r, d_coord3, ntypes, d_type, region, b_pbc, ii, d_nlist_a[ii], d_nlist_r[ii], rcut, sec, sec_r)) != -1){ - if (count_nei_idx_overflow == 0) { - std::cout << "WARNING: Radial neighbor list length of type " << ret << " is not enough" << std::endl; - flush(std::cout); - count_nei_idx_overflow ++; - } + if ((ret = format_nlist_fill_a (fmt_nlist_a, fmt_nlist_r, d_coord3, ntypes, d_type, region, b_pbc, ii, d_nlist_a[ii], d_nlist_r[ii], rcut, sec, sec_r)) != -1){ + if (count_nei_idx_overflow == 0) { + std::cout << "WARNING: Radial neighbor list length of type " << ret << " is not enough" << std::endl; + flush(std::cout); + count_nei_idx_overflow ++; } } - std::vector d_descrpt_a; - std::vector d_descrpt_a_deriv; - std::vector d_descrpt_r; - std::vector d_descrpt_r_deriv; std::vector d_rij_a; - std::vector d_rij_r; - compute_descriptor_se_a (d_descrpt_a, - d_descrpt_a_deriv, - d_rij_a, + 
get_rij (d_rij_a, d_coord3, ntypes, d_type, @@ -324,37 +251,13 @@ class EnvMatStatOp : public OpKernel { rcut); // check sizes - assert (d_descrpt_a.size() == ndescrpt); - assert (d_descrpt_a_deriv.size() == ndescrpt * 3); assert (d_rij_a.size() == nnei * 3); assert (int(fmt_nlist_a.size()) == nnei); - // std::cout << "min:\t" << (0 - avg(0, 0)) / std(0, 0) << std::endl; - // if (counter % 1000 == 0) { - // std::cout << "min:\t" << (0 - avg(0, 0)) / std(0, 0) << std::endl; - // } - // record outputs - for (int jj = 0; jj < ndescrpt; ++jj) { - descrpt(kk, ii * ndescrpt + jj) = (d_descrpt_a[jj] - avg(d_type[ii], jj)) / std(d_type[ii], jj); - if (jj % 4 == 0) { - table_range(ii * nnei + jj / 4) = descrpt(kk, ii * ndescrpt + jj); - } - } - for (int jj = 0; jj < ndescrpt * 3; ++jj) { - descrpt_deriv(kk, ii * ndescrpt * 3 + jj) = d_descrpt_a_deriv[jj] / std(d_type[ii], jj/3); - } for (int jj = 0; jj < nnei * 3; ++jj){ - rij (kk, ii * nnei * 3 + jj) = d_rij_a[jj]; if (jj % 3 == 0 && d_rij_a[jj] > 0) { - distance(ii * nnei + jj / 3) = sqrt(d_rij_a[jj] * d_rij_a[jj] + d_rij_a[jj + 1] * d_rij_a[jj + 1] + d_rij_a[jj + 2] * d_rij_a[jj + 2]); + min_nbor_dist(ii * nnei + jj / 3) = sqrt(d_rij_a[jj] * d_rij_a[jj] + d_rij_a[jj + 1] * d_rij_a[jj + 1] + d_rij_a[jj + 2] * d_rij_a[jj + 2]); } } - for (int jj = 0; jj < nnei; ++jj){ - int record = fmt_nlist_a[jj]; - if (b_nlist_map && record >= 0) { - record = nlist_map[record]; - } - nlist (kk, ii * nnei + jj) = record; - } } } } diff --git a/source/train/Trainer.py b/source/train/Trainer.py index 671a69e4d0..e0a1b80c36 100644 --- a/source/train/Trainer.py +++ b/source/train/Trainer.py @@ -281,17 +281,17 @@ def build (self, assert hasattr(self.descrpt, 'davg'), "Model compression error: descriptor must have attr davg!" assert hasattr(self.descrpt, 'dstd'), "Model compression error: descriptor must have attr dstd!" assert hasattr(self.descrpt, 'ntypes'), "Model compression error: descriptor must have attr ntypes!" 
+ assert hasattr(self.descrpt, 'ndescrpt'), "Model compression error: descriptor must have attr ndescrpt!" assert 'sel' in self.descrpt_param, "Model compression error: descriptor must have attr sel!" assert 'rcut' in self.descrpt_param, "Model compression error: descriptor must have attr rcut!" assert 'rcut_smth' in self.descrpt_param, "Model compression error: descriptor must have attr rcut_smth!" if self.descrpt_type == 'se_a': - stat = EnvMatStat(self.descrpt_type, self.descrpt.ntypes, self.descrpt_param['rcut'], self.descrpt_param['rcut_smth'], self.descrpt_param['sel'], self.descrpt.davg, self.descrpt.dstd) + stat = EnvMatStat(self.descrpt_type, self.descrpt.ntypes, self.descrpt.ndescrpt, self.descrpt_param['rcut'], self.descrpt_param['rcut_smth'], self.descrpt_param['sel'], self.descrpt.davg, self.descrpt.dstd) else: raise RuntimeError ("Model compression error: descriptor type must be se_a!") - - distance, max_nbor_size, env_mat_range\ - = stat.env_mat_stat(data) - self.descrpt.enable_compression(distance, max_nbor_size, env_mat_range, self.compress_param['model_file'], self.compress_param['table_config']) # send the statistics of the training data and activate the descriptor compression mode + env_mat_range\ + = stat.get_env_mat_range(data) + self.descrpt.enable_compression(env_mat_range, self.compress_param['model_file'], self.compress_param['table_config']) # send the statistics of the training data and activate the descriptor compression mode worker_device = "/job:%s/task:%d/%s" % (self.run_opt.my_job_name, self.run_opt.my_task_index, From 0a6bba2f91085d2f63581b49af0b60bc12eff6b4 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Sat, 6 Feb 2021 19:28:07 +0800 Subject: [PATCH 13/20] Update CustomeOperation.h --- source/lib/include/CustomeOperation.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/source/lib/include/CustomeOperation.h b/source/lib/include/CustomeOperation.h index 5871175dd7..e8669c9f28 100644 --- 
a/source/lib/include/CustomeOperation.h +++ b/source/lib/include/CustomeOperation.h @@ -236,7 +236,7 @@ void DescrptSeACPULauncher(const FPTYPE * coord, const int * type, const int * i #if GOOGLE_CUDA template void DescrptSeAGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int max_nbor_size) { - DescrptSeAGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); + DescrptSeAGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, max_nbor_size); } #endif // GOOGLE_CUDA // ****************************************************************************** @@ -432,7 +432,7 @@ void compute_descriptor_se_r_cpu ( } template -void DescrptSeRCPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { +void DescrptSeRCPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, 
const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int max_nbor_size) { // set & normalize coord std::vector d_coord3(nall * 3); for (int ii = 0; ii < nall; ++ii) { @@ -498,8 +498,8 @@ void DescrptSeRCPULauncher(const FPTYPE * coord, const int * type, const int * i #if GOOGLE_CUDA template -void DescrptSeRGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int magic_number) { - DescrptSeRGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number); +void DescrptSeRGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector sec_a, const bool fill_nei_a, const int max_nbor_size) { + DescrptSeRGPUExecuteFunctor()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, max_nbor_size); } #endif // GOOGLE_CUDA // ****************************************************************************** From e22fdefdc0fe237abdfc517f44f372c9ae803dc4 Mon Sep 17 00:00:00 2001 From: 
denghuilu Date: Sun, 7 Feb 2021 14:16:44 +0800 Subject: [PATCH 14/20] optimize code structure of model compression --- deepmd/descriptor/se_a.py | 32 +++----- deepmd/utils/env_mat_stat.py | 105 ++++-------------------- deepmd/utils/tabulate.py | 149 ++++++++++++++++++++++------------- source/op/env_mat_stat.cc | 6 +- source/train/Trainer.py | 27 +++---- source/train/compress.py | 5 +- source/train/main.py | 22 ++++-- 7 files changed, 152 insertions(+), 194 deletions(-) diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py index 3b9a4db0af..b137906a50 100644 --- a/deepmd/descriptor/se_a.py +++ b/deepmd/descriptor/se_a.py @@ -100,7 +100,6 @@ def __init__ (self, self.dstd = None self.davg = None self.compress = False - self.place_holders = {} avg_zero = np.zeros([self.ntypes,self.ndescrpt]).astype(global_np_float_precision) std_ones = np.ones ([self.ntypes,self.ndescrpt]).astype(global_np_float_precision) @@ -231,7 +230,7 @@ def compute_input_stats (self, self.dstd = np.array(all_dstd) def enable_compression(self, - env_mat_range, + min_nbor_dist, model_file = 'frozon_model.pb', table_config = [5, 0.01, 0.1, -1] ) -> None: @@ -240,25 +239,23 @@ def enable_compression(self, Parameters ---------- - env_mat_range - The output data range of the environment matrix - env_mat_range[0] denotes the lower boundary of environment matrix - env_mat_range[1] denotes the upper boundary of environment matrix + min_nbor_dist + The nearest distance between atoms model_file - The original frozen model, that will be compressed + The original frozen model, which will be compressed by the program table_config - The configuration of the tabulation + The configuration including: Table_config[0] denotes the scale of model extrapolation - Table_config[1] denotes the first table stride - Table_config[2] denotes the second table stride + Table_config[1] denotes the uniform stride of the first table + Table_config[2] denotes the uniform stride of the second table Table_config[3] 
denotes the overflow check frequency - """ + """ self.compress = True self.model_file = model_file self.table_config = table_config - self.env_mat_range = env_mat_range self.table = DeepTabulate(self.model_file, self.filter_np_precision, self.type_one_side) - + self.lower, self.upper \ + = self.table.build(min_nbor_dist, self.rcut_r, self.rcut_r_smth, self.table_config[0], self.table_config[1], self.table_config[2]) def build (self, coord_ : tf.Tensor, @@ -357,15 +354,6 @@ def build (self, self.descrpt_deriv = tf.identity(self.descrpt_deriv, name = 'o_rmat_deriv') self.rij = tf.identity(self.rij, name = 'o_rij') self.nlist = tf.identity(self.nlist, name = 'o_nlist') - - if self.compress: - self.lower = math.floor(self.env_mat_range[0]) - self.upper = math.ceil(self.env_mat_range[1]) - self.table.build(self.lower, - self.upper, - self.upper * self.table_config[0], - self.table_config[1], - self.table_config[2]) self.dout, self.qmat = self._pass_filter(self.descrpt_reshape, atype, diff --git a/deepmd/utils/env_mat_stat.py b/deepmd/utils/env_mat_stat.py index 1518207de7..b8f73959eb 100644 --- a/deepmd/utils/env_mat_stat.py +++ b/deepmd/utils/env_mat_stat.py @@ -13,51 +13,29 @@ class EnvMatStat(): It loads data from DeepmdData object, and measures the data info, including neareest nbor distance between atoms, max nbor size of atoms and the output data range of the environment matrix. 
""" def __init__(self, - descrpt_type : str, ntypes : int, - ndescrpt : int, rcut, rcut_smth, - sel, - davg, - dstd) -> None: + sel) -> None: """ Constructor Parameters ---------- - descrpt_type - The descrpt type of the embedding net ntypes The num of atom types - ndescrpt - The width of environment matrix rcut The cut-off radius rcut_smth From where the environment matrix should be smoothed sel : list[str] sel[i] specifies the maxmum number of type i atoms in the cut-off radius - davg - Average of training data - dstd - Standard deviation of training data """ - self.init_stat = False - self.davg = davg - self.dstd = dstd - if self.davg is None: - self.davg = np.zeros([self.ntypes, self.ndescrpt]) - if self.dstd is None: - self.dstd = np.ones ([self.ntypes, self.ndescrpt]) self.ntypes = ntypes - self.ndescrpt = ndescrpt - self.descrpt_type = descrpt_type - assert self.descrpt_type == 'se_a', 'Model compression error: descriptor type must be se_a!' self.place_holders = {} sub_graph = tf.Graph() with sub_graph.as_default(): - for ii in ['coord', 'box', 'avg', 'std']: + for ii in ['coord', 'box']: self.place_holders[ii] = tf.placeholder(global_np_float_precision, [None, None], name='t_'+ii) self.place_holders['type'] = tf.placeholder(tf.int32, [None, None], name='t_type') self.place_holders['natoms_vec'] = tf.placeholder(tf.int32, [self.ntypes+2], name='t_natoms') @@ -77,7 +55,7 @@ def __init__(self, self.sub_sess = tf.Session(graph = sub_graph, config=default_tf_session_config) def get_env_mat_stat(self, - data) -> Tuple[float, int]: + data) -> Tuple[float, List[int]]: """ get the data statistics of the training data, including nearest nbor distance between atoms, max nbor size of atoms @@ -89,12 +67,12 @@ def get_env_mat_stat(self, Returns ------- min_nbor_dist - The nearest nbor distance between atoms + The nearest distance between neighbor atoms max_nbor_size - The max nbor size of atoms + A list with ntypes integers, denotes the actual achieved max sel """ - 
self.max_nbor_size = 0 self.min_nbor_dist = 100.0 + self.max_nbor_size = [0] * self.ntypes for ii in tqdm(range(len(data.system_dirs)), desc = '# DEEPMD: getting data info'): for jj in data.data_systems[ii].dirs: @@ -110,68 +88,13 @@ def get_env_mat_stat(self, self.place_holders['default_mesh']: np.array(data.default_mesh[ii]), }) dt = np.min(dt) - mn = np.max(mn) - if (dt < self.min_nbor_dist): + if dt < self.min_nbor_dist: self.min_nbor_dist = dt - if (mn > self.max_nbor_size): - self.max_nbor_size = mn - self.init_stat = True - return self.min_nbor_dist, self.max_nbor_size - - def get_env_mat_range(self, - data) -> List[float]: - """ - get the data statistics of the training data, including the output data range of the environment matrix - - Parameters - ---------- - data - Class for manipulating many data systems. It is implemented with the help of DeepmdData. - - Returns - ------- - env_mat_range - The output data range of the environment matrix - env_mat_range[0] denotes the lower boundary of environment matrix - env_mat_range[1] denotes the upper boundary of environment matrix - """ - if self.init_stat: - min_nbor_dist = self.min_nbor_dist - max_nbor_size = self.max_nbor_size - else: - min_nbor_dist, max_nbor_size = self.get_env_mat_stat(data) - self.env_mat_range = self._get_internal_env_mat_range(min_nbor_dist, max_nbor_size) - print('# DEEPMD: training data with lower boundary: ' + str(self.env_mat_range[0])) - print('# DEEPMD: training data with upper boundary: ' + str(self.env_mat_range[1])) - print('# DEEPMD: training data with min distance: ' + str(self.min_nbor_dist)) - print('# DEEPMD: training data with max nborsize: ' + str(self.max_nbor_size)) - return self.env_mat_range + for ww in range(self.ntypes): + var = np.max(mn[:, ww]) + if var > self.max_nbor_size[ww]: + self.max_nbor_size[ww] = var - def _get_internal_env_mat_range(self, - min_nbor_dist, - max_nbor_size): - """ - Warning: different descrpt_type may have different method to get the mat 
range - """ - lower = 100.0 - upper = -10.0 - sw = self._spline5_switch(self.min_nbor_dist, self.rcut_smth, self.rcut) - for ii in range(self.ntypes): - if lower > -self.davg[ii][0] / self.dstd[ii][0]: - lower = -self.davg[ii][0] / self.dstd[ii][0] - if upper < ((1 / self.min_nbor_dist) * sw - self.davg[ii][0]) / self.dstd[ii][0]: - upper = ((1 / self.min_nbor_dist) * sw - self.davg[ii][0]) / self.dstd[ii][0] - return [lower, upper] - - def _spline5_switch(self, - xx, - rmin, - rmax): - if xx < rmin: - vv = 1 - elif xx < rmax: - uu = (xx - rmin) / (rmax - rmin) - vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1 - else: - vv = 0 - return vv + print('# DEEPMD: training data with min nbor dist: ' + str(self.min_nbor_dist)) + print('# DEEPMD: training data with max nbor size: ' + str(self.max_nbor_size)) + return self.min_nbor_dist, self.max_nbor_size diff --git a/deepmd/utils/tabulate.py b/deepmd/utils/tabulate.py index 57ffc0cb5e..84255f6e16 100644 --- a/deepmd/utils/tabulate.py +++ b/deepmd/utils/tabulate.py @@ -2,16 +2,18 @@ import math import numpy as np from tqdm import tqdm +from typing import Tuple, List from deepmd.env import tf from deepmd.env import op_module from tensorflow.python.platform import gfile from tensorflow.python.framework import tensor_util - class DeepTabulate(): """ Class for tabulation. - It reads the trained weights and bias from the frozen model, and builds the table according to the weights and bias. + Compress a model, which includes tabulating the embedding-net. + The table is composed of fifth-order polynomial coefficients and is assembled from two sub-tables. The first table takes the stride(parameter) as its uniform stride, while the second table takes 10 * stride as its uniform stride + The range of the first table is automatically detected by deepmd-kit, while the second table ranges from the first table\'s upper boundary(upper) to the extrapolate(parameter) * upper.
""" def __init__(self, model_file, @@ -43,6 +45,9 @@ def __init__(self, self.sel_a = self.graph.get_operation_by_name('DescrptSeA').get_attr('sel_a') self.ntypes = self._get_tensor_value(self.graph.get_tensor_by_name ('descrpt_attr/ntypes:0')) + self.davg = self._get_tensor_value(self.graph.get_tensor_by_name ('descrpt_attr/t_avg:0')) + self.dstd = self._get_tensor_value(self.graph.get_tensor_by_name ('descrpt_attr/t_std:0')) + self.filter_variable_nodes = self._load_matrix_node() self.layer_size = int(len(self.filter_variable_nodes) / (self.ntypes * self.ntypes * 2)) self.table_size = self.ntypes * self.ntypes @@ -62,6 +67,66 @@ def __init__(self, # TODO: Need a check function to determine if the current model is properly + def build(self, + min_nbor_dist, + rcut, + rcut_smth, + extrapolate, + stride0, + stride1) -> Tuple[int, int]: + """ + Build the tables for model compression + + Parameters + ---------- + min_nbor_dist + The nearest distance between neighbor atoms + rcut + The cut-off radius + rcut_smth + From where the environment matrix should be smoothed + extrapolate + The scale of model extrapolation + stride0 + The uniform stride of the first table + stride1 + The uniform stride of the second table + + Returns + ---------- + lower + The lower boundary of environment matrix + upper + The upper boundary of environment matrix + """ + # tabulate range [lower, upper] with stride0 'stride0' + lower, upper = self._get_env_mat_range(min_nbor_dist, rcut, rcut_smth) + xx = np.arange(lower, upper, stride0, dtype = self.data_type) + xx = np.append(xx, np.arange(upper, extrapolate * upper, stride1, dtype = self.data_type)) + xx = np.append(xx, np.array([extrapolate * upper], dtype = self.data_type)) + self.nspline = int((upper - lower) / stride0 + (extrapolate * upper - upper) / stride1) + for ii in range(self.table_size): + vv, dd, d2 = self._make_data(xx, ii) + if self.type_one_side: + net = "filter_-1_net_" + str(int(ii)) + else: + net = "filter_" + str(int(ii / 
self.ntypes)) + "_net_" + str(int(ii % self.ntypes)) + self.data[net] = np.zeros([self.nspline, 6 * self.last_layer_size], dtype = self.data_type) + for jj in tqdm(range(self.nspline), desc = '# DEEPMD: ' + net + ', tabulating'): + for kk in range(self.last_layer_size): + if jj < int((upper - lower) / stride0): + tt = stride0 + else: + tt = stride1 + hh = vv[jj + 1][kk] - vv[jj][kk] + self.data[net][jj][kk * 6 + 0] = vv[jj][kk] + self.data[net][jj][kk * 6 + 1] = dd[jj][kk] + self.data[net][jj][kk * 6 + 2] = 0.5 * d2[jj][kk] + self.data[net][jj][kk * 6 + 3] = (1 / (2 * tt * tt * tt)) * (20 * hh - (8 * dd[jj + 1][kk] + 12 * dd[jj][kk]) * tt - (3 * d2[jj][kk] - d2[jj + 1][kk]) * tt * tt) + self.data[net][jj][kk * 6 + 4] = (1 / (2 * tt * tt * tt * tt)) * (-30 * hh + (14 * dd[jj + 1][kk] + 16 * dd[jj][kk]) * tt + (3 * d2[jj][kk] - 2 * d2[jj + 1][kk]) * tt * tt) + self.data[net][jj][kk * 6 + 5] = (1 / (2 * tt * tt * tt * tt * tt)) * (12 * hh - 6 * (dd[jj + 1][kk] + dd[jj][kk]) * tt + (d2[jj + 1][kk] - d2[jj][kk]) * tt * tt) + return lower, upper + def _load_graph(self): graph_def = tf.GraphDef() with open(self.model_file, "rb") as f: @@ -124,57 +189,6 @@ def _get_matrix(self): matrix["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape).astype(self.data_type)) return matrix - def build(self, - lower, - upper, - _max, - stride0, - stride1) -> None: - """ - Build the tables for model compression - - Parameters - ---------- - lower - The lower boundary of the first table - upper - The upper boundary of the first table as well as the lower boundary of the second table - _max - The upper boundary of the second table - stride0 - The stride of the first table - stride1 - The stride of the second table - """ - # tabulate range [lower, upper] with stride0 'stride0' - lower = math.floor(lower) - upper = math.ceil(upper) - xx = np.arange(lower, upper, stride0, dtype = self.data_type) - xx = np.append(xx, np.arange(upper, _max, stride1, dtype = self.data_type)) - xx = 
np.append(xx, np.array([_max], dtype = self.data_type)) - self.nspline = int((upper - lower) / stride0 + (_max - upper) / stride1) - - for ii in range(self.table_size): - vv, dd, d2 = self._make_data(xx, ii) - if self.type_one_side: - net = "filter_-1_net_" + str(int(ii)) - else: - net = "filter_" + str(int(ii / self.ntypes)) + "_net_" + str(int(ii % self.ntypes)) - self.data[net] = np.zeros([self.nspline, 6 * self.last_layer_size], dtype = self.data_type) - for jj in tqdm(range(self.nspline), desc = '# DEEPMD: ' + net + ', tabulating'): - for kk in range(self.last_layer_size): - if jj < int((upper - lower) / stride0): - tt = stride0 - else: - tt = stride1 - hh = vv[jj + 1][kk] - vv[jj][kk] - self.data[net][jj][kk * 6 + 0] = vv[jj][kk] - self.data[net][jj][kk * 6 + 1] = dd[jj][kk] - self.data[net][jj][kk * 6 + 2] = 0.5 * d2[jj][kk] - self.data[net][jj][kk * 6 + 3] = (1 / (2 * tt * tt * tt)) * (20 * hh - (8 * dd[jj + 1][kk] + 12 * dd[jj][kk]) * tt - (3 * d2[jj][kk] - d2[jj + 1][kk]) * tt * tt) - self.data[net][jj][kk * 6 + 4] = (1 / (2 * tt * tt * tt * tt)) * (-30 * hh + (14 * dd[jj + 1][kk] + 16 * dd[jj][kk]) * tt + (3 * d2[jj][kk] - 2 * d2[jj + 1][kk]) * tt * tt) - self.data[net][jj][kk * 6 + 5] = (1 / (2 * tt * tt * tt * tt * tt)) * (12 * hh - 6 * (dd[jj + 1][kk] + dd[jj][kk]) * tt + (d2[jj + 1][kk] - d2[jj][kk]) * tt * tt) - # one-by-one executions def _make_data(self, xx, idx): with self.sub_graph.as_default(): @@ -207,3 +221,32 @@ def _save_data(self): for ii in range(self.ntypes * self.ntypes): net = "filter_" + str(int(ii / self.ntypes)) + "_net_" + str(int(ii % self.ntypes)) np.savetxt('data_' + str(int(ii)), self.data[net]) + + def _get_env_mat_range(self, + min_nbor_dist, + rcut, + rcut_smth): + lower = 100.0 + upper = -10.0 + sw = self._spline5_switch(min_nbor_dist, rcut_smth, rcut) + for ii in range(self.ntypes): + if lower > -self.davg[ii][0] / self.dstd[ii][0]: + lower = -self.davg[ii][0] / self.dstd[ii][0] + if upper < ((1 / min_nbor_dist) * sw - 
self.davg[ii][0]) / self.dstd[ii][0]: + upper = ((1 / min_nbor_dist) * sw - self.davg[ii][0]) / self.dstd[ii][0] + print('# DEEPMD: training data with lower boundary: ' + str(lower)) + print('# DEEPMD: training data with upper boundary: ' + str(upper)) + return math.floor(lower), math.ceil(upper) + + def _spline5_switch(self, + xx, + rmin, + rmax): + if xx < rmin: + vv = 1 + elif xx < rmax: + uu = (xx - rmin) / (rmax - rmin) + vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1 + else: + vv = 0 + return vv \ No newline at end of file diff --git a/source/op/env_mat_stat.cc b/source/op/env_mat_stat.cc index 0db739272f..526bf7d4f2 100644 --- a/source/op/env_mat_stat.cc +++ b/source/op/env_mat_stat.cc @@ -105,6 +105,7 @@ class EnvMatStatOp : public OpKernel { min_nbor_dist_shape.AddDim (nloc * nnei); TensorShape max_nbor_size_shape ; max_nbor_size_shape.AddDim (nloc); + max_nbor_size_shape.AddDim (ntypes); int context_output_index = 0; Tensor* min_nbor_dist_tensor = NULL; @@ -219,7 +220,10 @@ class EnvMatStatOp : public OpKernel { } for (int ii = 0; ii < nloc; ii++) { - max_nbor_size(ii) = d_nlist_r[ii].size(); + for (int jj = 0; jj < d_nlist_r[ii].size(); jj++) { + int type = d_type[d_nlist_r[ii][jj]]; + max_nbor_size(ii * ntypes + type) += 1; + } } // loop over atoms, compute descriptors for each atom #pragma omp parallel for diff --git a/source/train/Trainer.py b/source/train/Trainer.py index e0a1b80c36..8a6afcccf1 100644 --- a/source/train/Trainer.py +++ b/source/train/Trainer.py @@ -91,9 +91,6 @@ def _init_param(self, jdata): fitting_param = j_must_have(model_param, 'fitting_net') self.model_param = model_param self.descrpt_param = descrpt_param - self.descrpt_type = descrpt_param['type'] - if 'compress' in model_param: - self.compress_param = model_param['compress'] # descriptor try: @@ -277,21 +274,15 @@ def build (self, self.model.data_stat(data) - if 'compress' in self.model_param and self.compress_param['compress']: - assert hasattr(self.descrpt, 'davg'), "Model 
compression error: descriptor must have attr davg!" - assert hasattr(self.descrpt, 'dstd'), "Model compression error: descriptor must have attr dstd!" - assert hasattr(self.descrpt, 'ntypes'), "Model compression error: descriptor must have attr ntypes!" - assert hasattr(self.descrpt, 'ndescrpt'), "Model compression error: descriptor must have attr ndescrpt!" - assert 'sel' in self.descrpt_param, "Model compression error: descriptor must have attr sel!" - assert 'rcut' in self.descrpt_param, "Model compression error: descriptor must have attr rcut!" - assert 'rcut_smth' in self.descrpt_param, "Model compression error: descriptor must have attr rcut_smth!" - if self.descrpt_type == 'se_a': - stat = EnvMatStat(self.descrpt_type, self.descrpt.ntypes, self.descrpt.ndescrpt, self.descrpt_param['rcut'], self.descrpt_param['rcut_smth'], self.descrpt_param['sel'], self.descrpt.davg, self.descrpt.dstd) - else: - raise RuntimeError ("Model compression error: descriptor type must be se_a!") - env_mat_range\ - = stat.get_env_mat_range(data) - self.descrpt.enable_compression(env_mat_range, self.compress_param['model_file'], self.compress_param['table_config']) # send the statistics of the training data and activate the descriptor compression mode + if 'compress' in self.model_param and self.model_param['compress']['compress']: + assert 'sel' in self.descrpt_param, "Error: descriptor must have attr sel!" + assert 'rcut' in self.descrpt_param, "Error: descriptor must have attr rcut!" + assert 'rcut_smth' in self.descrpt_param, "Error: descriptor must have attr rcut_smth!" 
+ self.env_mat_stat \ + = EnvMatStat(self.ntypes, self.descrpt_param['rcut'], self.descrpt_param['rcut_smth'], self.descrpt_param['sel']) + self.min_nbor_dist, self.max_nbor_size \ + = self.env_mat_stat.get_env_mat_stat(data) + self.descrpt.enable_compression(self.min_nbor_dist, self.model_param['compress']['model_file'], self.model_param['compress']['table_config']) worker_device = "/job:%s/task:%d/%s" % (self.run_opt.my_job_name, self.run_opt.my_task_index, diff --git a/source/train/compress.py b/source/train/compress.py index 1f36f8e9e1..cd1ba79466 100644 --- a/source/train/compress.py +++ b/source/train/compress.py @@ -16,13 +16,12 @@ def compress(args): jdata = convert_input_v0_v1(jdata, warning = True, dump = 'input_v1_compat.json') - jdata = normalize(jdata) jdata['model']['compress'] = {} jdata['model']['compress']['compress'] = True jdata['model']['compress']['model_file'] = args.input - jdata['model']['compress']['table_config'] = args.table_config - + jdata['model']['compress']['table_config'] = [args.extrapolate, args.stride, 10 * args.stride, args.frequency] + # check the descriptor info of the input file assert jdata['model']['descriptor']['type'] == 'se_a', 'Model compression error: descriptor type must be se_a!' assert jdata['model']['descriptor']['resnet_dt'] == False, 'Model compression error: descriptor resnet_dt must be false!' diff --git a/source/train/main.py b/source/train/main.py index 9baaffce4e..2cd262f7d7 100644 --- a/source/train/main.py +++ b/source/train/main.py @@ -63,16 +63,26 @@ def main () : help="The file containing details of energy force and virial accuracy") parser_tst.add_argument("-a", "--atomic-energy", action = 'store_true', help="Test the accuracy of atomic energy") - + + """ + Compress a model, which includes tabulating the embedding-net. + The table is composed of fifth-order polynomial coefficients and is assembled from two sub-tables.
The first table takes the stride(parameter) as its uniform stride, while the second table takes 10 * stride as its uniform stride + The range of the first table is automatically detected by deepmd-kit, while the second table ranges from the first table\'s upper boundary(upper) to the extrapolate(parameter) * upper. + """ parser_compress = subparsers.add_parser('compress', help='compress a model') parser_compress.add_argument('INPUT', - help='the input parameter file in json or yaml format') + help='The input parameter file in json or yaml format, which should be consistent with the original model parameter file') parser_compress.add_argument('-i', "--input", default = "frozen_model.pb", type=str, - help = "the original model") + help = "The original frozen model, which will be compressed by the deepmd-kit") parser_compress.add_argument("-o","--output", default = "frozen_model_compress.pb", type=str, - help='the compressed model') - parser_compress.add_argument('-t', '--table-config', nargs='+', default = [5, 0.01, 0.1, 1], type=float) - parser_compress.add_argument("-d", "--folder", type=str, default = ".", + help='The compressed model') + parser_compress.add_argument('-e', '--extrapolate', default=5, type=int, + help="The scale of model extrapolation") + parser_compress.add_argument('-s', '--stride', default=0.01, type=float, + help="The uniform stride of tabulation's first table, the second table will use 10 * stride as it's uniform stride") + parser_compress.add_argument('-f', '--frequency', default=-1, type=int, + help="The frequency of tabulation overflow check(If the input environment matrix overflow the first or second table range).
By default do not check the overflow") + parser_compress.add_argument("-d", "--folder", type=str, default = ".", help="path to checkpoint folder") parser_train = subparsers.add_parser('doc-train-input', From 7231fb0d8cc9a16aa4c0285dd7a9ecf375bf09fa Mon Sep 17 00:00:00 2001 From: denghuilu Date: Mon, 8 Feb 2021 12:57:16 +0800 Subject: [PATCH 15/20] optimize code structure of model compression --- deepmd/descriptor/se_a.py | 23 +- .../{env_mat_stat.py => neighbor_stat.py} | 30 +- deepmd/utils/tabulate.py | 35 +-- source/lib/include/ComputeDescriptor.h | 47 --- source/op/CMakeLists.txt | 2 +- source/op/env_mat_stat.cc | 294 ------------------ source/op/neighbor_stat.cc | 192 ++++++++++++ source/train/Trainer.py | 12 +- 8 files changed, 238 insertions(+), 397 deletions(-) rename deepmd/utils/{env_mat_stat.py => neighbor_stat.py} (81%) delete mode 100644 source/op/env_mat_stat.cc create mode 100644 source/op/neighbor_stat.cc diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py index 74ea3afbf1..09adba8dcc 100644 --- a/deepmd/descriptor/se_a.py +++ b/deepmd/descriptor/se_a.py @@ -232,7 +232,10 @@ def compute_input_stats (self, def enable_compression(self, min_nbor_dist, model_file = 'frozon_model.pb', - table_config = [5, 0.01, 0.1, -1] + table_extrapolate = 5, + table_stride_1 = 0.01, + table_stride_2 = 0.1, + check_frequency = -1 ) -> None: """ Reveive the statisitcs (distance, max_nbor_size and env_mat_range) of the training data. 
@@ -243,19 +246,21 @@ def enable_compression(self, The nearest distance between atoms model_file The original frozen model, which will be compressed by the program - table_config - The configuration including: - Table_config[0] denotes the scale of model extrapolation - Table_config[1] denotes the uniform stride of the first table - Table_config[2] denotes the uniform stride of the second table - Table_config[3] denotes the overflow check frequency + table_extrapolate + The scale of model extrapolation + table_stride_1 + The uniform stride of the first table + table_stride_2 + The uniform stride of the second table + check_frequency + The overflow check frequency """ self.compress = True self.model_file = model_file - self.table_config = table_config + self.table_config = [table_extrapolate, table_stride_1, table_stride_2, check_frequency] self.table = DeepTabulate(self.model_file, self.filter_np_precision, self.type_one_side) self.lower, self.upper \ - = self.table.build(min_nbor_dist, self.rcut_r, self.rcut_r_smth, self.table_config[0], self.table_config[1], self.table_config[2]) + = self.table.build(min_nbor_dist, table_extrapolate, table_stride_1, table_stride_2) def build (self, coord_ : tf.Tensor, diff --git a/deepmd/utils/env_mat_stat.py b/deepmd/utils/neighbor_stat.py similarity index 81% rename from deepmd/utils/env_mat_stat.py rename to deepmd/utils/neighbor_stat.py index b8f73959eb..9ccb3a5d0e 100644 --- a/deepmd/utils/env_mat_stat.py +++ b/deepmd/utils/neighbor_stat.py @@ -7,16 +7,14 @@ from deepmd.env import default_tf_session_config from deepmd.RunOptions import global_np_float_precision -class EnvMatStat(): +class NeighborStat(): """ Class for getting training data information. It loads data from DeepmdData object, and measures the data info, including neareest nbor distance between atoms, max nbor size of atoms and the output data range of the environment matrix. 
""" def __init__(self, ntypes : int, - rcut, - rcut_smth, - sel) -> None: + rcut) -> None: """ Constructor @@ -26,11 +24,8 @@ def __init__(self, The num of atom types rcut The cut-off radius - rcut_smth - From where the environment matrix should be smoothed - sel : list[str] - sel[i] specifies the maxmum number of type i atoms in the cut-off radius """ + self.rcut = rcut self.ntypes = ntypes self.place_holders = {} sub_graph = tf.Graph() @@ -40,22 +35,17 @@ def __init__(self, self.place_holders['type'] = tf.placeholder(tf.int32, [None, None], name='t_type') self.place_holders['natoms_vec'] = tf.placeholder(tf.int32, [self.ntypes+2], name='t_natoms') self.place_holders['default_mesh'] = tf.placeholder(tf.int32, [None], name='t_mesh') - self.sel = sel - self.rcut = rcut - self.rcut_smth = rcut_smth - self._min_nbor_dist, self._max_nbor_size \ - = op_module.env_mat_stat(self.place_holders['coord'], + self._max_nbor_size, self._min_nbor_dist \ + = op_module.neighbor_stat(self.place_holders['coord'], self.place_holders['type'], self.place_holders['natoms_vec'], self.place_holders['box'], self.place_holders['default_mesh'], - sel = self.sel, - rcut = self.rcut, - rcut_smth = self.rcut_smth) + rcut = self.rcut) self.sub_sess = tf.Session(graph = sub_graph, config=default_tf_session_config) - def get_env_mat_stat(self, - data) -> Tuple[float, List[int]]: + def get_stat(self, + data) -> Tuple[float, List[int]]: """ get the data statistics of the training data, including nearest nbor distance between atoms, max nbor size of atoms @@ -78,8 +68,8 @@ def get_env_mat_stat(self, for jj in data.data_systems[ii].dirs: data_set = data.data_systems[ii]._load_set(jj) for kk in range(np.array(data_set['type']).shape[0]): - dt, mn \ - = self.sub_sess.run([self._min_nbor_dist, self._max_nbor_size], + mn, dt \ + = self.sub_sess.run([self._max_nbor_size, self._min_nbor_dist], feed_dict = { self.place_holders['coord']: np.array(data_set['coord'])[kk].reshape([-1, data.natoms[ii] * 3]), 
self.place_holders['type']: np.array(data_set['type'])[kk].reshape([-1, data.natoms[ii]]), diff --git a/deepmd/utils/tabulate.py b/deepmd/utils/tabulate.py index 84255f6e16..c7e61a380b 100644 --- a/deepmd/utils/tabulate.py +++ b/deepmd/utils/tabulate.py @@ -27,13 +27,13 @@ def __init__(self, model_file The frozen model data_type - The precision of the table. Supported options are {1} + The precision of the tables. Supported options are {1} type_one_side Try to build N_types tables. Otherwise, building N_types^2 tables """ self.model_file = model_file - self.data_type = data_type + self.np_data_type = data_type self.type_one_side = type_one_side self.graph, self.graph_def = self._load_graph() @@ -48,6 +48,10 @@ def __init__(self, self.davg = self._get_tensor_value(self.graph.get_tensor_by_name ('descrpt_attr/t_avg:0')) self.dstd = self._get_tensor_value(self.graph.get_tensor_by_name ('descrpt_attr/t_std:0')) + self.descrpt = self.graph.get_operation_by_name ('DescrptSeA') + self.rcut = self.descrpt.get_attr('rcut_r') + self.rcut_smth = self.descrpt.get_attr('rcut_r_smth') + self.filter_variable_nodes = self._load_matrix_node() self.layer_size = int(len(self.filter_variable_nodes) / (self.ntypes * self.ntypes * 2)) self.table_size = self.ntypes * self.ntypes @@ -58,8 +62,8 @@ def __init__(self, # get trained variables self.bias = self._get_bias() self.matrix = self._get_matrix() - # self.matrix_layer_3 must exist - # self.data_type = type(self.matrix["layer_1"][0][0][0]) + + self.data_type = type(self.matrix["layer_1"][0][0][0]) assert self.matrix["layer_1"][0].size > 0, "no matrix exist in matrix array!" 
self.last_layer_size = self.matrix["layer_" + str(self.layer_size)][0].shape[1] # define tables @@ -69,8 +73,6 @@ def __init__(self, def build(self, min_nbor_dist, - rcut, - rcut_smth, extrapolate, stride0, stride1) -> Tuple[int, int]: @@ -81,10 +83,6 @@ def build(self, ---------- min_nbor_dist The nearest distance between neighbor atoms - rcut - The cut-off radius - rcut_smth - From where the environment matrix should be smoothed extrapolate The scale of model extrapolation stride0 @@ -100,7 +98,7 @@ def build(self, The upper boundary of environment matrix """ # tabulate range [lower, upper] with stride0 'stride0' - lower, upper = self._get_env_mat_range(min_nbor_dist, rcut, rcut_smth) + lower, upper = self._get_env_mat_range(min_nbor_dist) xx = np.arange(lower, upper, stride0, dtype = self.data_type) xx = np.append(xx, np.arange(upper, extrapolate * upper, stride1, dtype = self.data_type)) xx = np.append(xx, np.array([extrapolate * upper], dtype = self.data_type)) @@ -125,6 +123,7 @@ def build(self, self.data[net][jj][kk * 6 + 3] = (1 / (2 * tt * tt * tt)) * (20 * hh - (8 * dd[jj + 1][kk] + 12 * dd[jj][kk]) * tt - (3 * d2[jj][kk] - d2[jj + 1][kk]) * tt * tt) self.data[net][jj][kk * 6 + 4] = (1 / (2 * tt * tt * tt * tt)) * (-30 * hh + (14 * dd[jj + 1][kk] + 16 * dd[jj][kk]) * tt + (3 * d2[jj][kk] - 2 * d2[jj + 1][kk]) * tt * tt) self.data[net][jj][kk * 6 + 5] = (1 / (2 * tt * tt * tt * tt * tt)) * (12 * hh - 6 * (dd[jj + 1][kk] + dd[jj][kk]) * tt + (d2[jj + 1][kk] - d2[jj][kk]) * tt * tt) + self.data[net].astype(self.np_data_type) return lower, upper def _load_graph(self): @@ -165,12 +164,12 @@ def _get_bias(self): for ii in range(0, self.ntypes): tensor_value = np.frombuffer (self.filter_variable_nodes["filter_type_all/bias_" + str(layer) + "_" + str(int(ii))].tensor_content) tensor_shape = tf.TensorShape(self.filter_variable_nodes["filter_type_all/bias_" + str(layer) + "_" + str(int(ii))].tensor_shape).as_list() - bias["layer_" + 
str(layer)].append(np.reshape(tensor_value, tensor_shape).astype(self.data_type)) + bias["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape)) else: for ii in range(0, self.ntypes * self.ntypes): tensor_value = np.frombuffer(self.filter_variable_nodes["filter_type_" + str(int(ii / self.ntypes)) + "/bias_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_content) tensor_shape = tf.TensorShape(self.filter_variable_nodes["filter_type_" + str(int(ii / self.ntypes)) + "/bias_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_shape).as_list() - bias["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape).astype(self.data_type)) + bias["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape)) return bias def _get_matrix(self): @@ -181,12 +180,12 @@ def _get_matrix(self): for ii in range(0, self.ntypes): tensor_value = np.frombuffer (self.filter_variable_nodes["filter_type_all/matrix_" + str(layer) + "_" + str(int(ii))].tensor_content) tensor_shape = tf.TensorShape(self.filter_variable_nodes["filter_type_all/matrix_" + str(layer) + "_" + str(int(ii))].tensor_shape).as_list() - matrix["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape).astype(self.data_type)) + matrix["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape)) else: for ii in range(0, self.ntypes * self.ntypes): tensor_value = np.frombuffer(self.filter_variable_nodes["filter_type_" + str(int(ii / self.ntypes)) + "/matrix_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_content) tensor_shape = tf.TensorShape(self.filter_variable_nodes["filter_type_" + str(int(ii / self.ntypes)) + "/matrix_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_shape).as_list() - matrix["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape).astype(self.data_type)) + matrix["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape)) return matrix # one-by-one executions @@ -223,12 +222,10 @@ def 
_save_data(self): np.savetxt('data_' + str(int(ii)), self.data[net]) def _get_env_mat_range(self, - min_nbor_dist, - rcut, - rcut_smth): + min_nbor_dist): lower = 100.0 upper = -10.0 - sw = self._spline5_switch(min_nbor_dist, rcut_smth, rcut) + sw = self._spline5_switch(min_nbor_dist, self.rcut_smth, self.rcut) for ii in range(self.ntypes): if lower > -self.davg[ii][0] / self.dstd[ii][0]: lower = -self.davg[ii][0] / self.dstd[ii][0] diff --git a/source/lib/include/ComputeDescriptor.h b/source/lib/include/ComputeDescriptor.h index bab9032828..3b85f931dc 100644 --- a/source/lib/include/ComputeDescriptor.h +++ b/source/lib/include/ComputeDescriptor.h @@ -143,19 +143,6 @@ void compute_descriptor_se_r (std::vector & descrpt_r, const double & rmin, const double & rmax); -inline -void get_rij(std::vector & rij_a, - const std::vector & posi, - const int & ntypes, - const std::vector & type, - const SimulationRegion & region, - const bool & b_pbc, - const int & i_idx, - const std::vector & fmt_nlist_a, - const std::vector & sec_a, - const double & rmin, - const double & rmax); - struct NeighborInfo { int type; @@ -1075,40 +1062,6 @@ void compute_descriptor_se_a (std::vector & descrpt_a, } } -void get_rij(std::vector & rij_a, - const std::vector & posi, - const int & ntypes, - const std::vector & type, - const SimulationRegion & region, - const bool & b_pbc, - const int & i_idx, - const std::vector & fmt_nlist_a, - const std::vector & sec_a, - const double & rmin, - const double & rmax) -{ - // compute the diff of the neighbors - std::vector > sel_a_diff (sec_a.back()); - rij_a.resize (sec_a.back() * 3); - fill (rij_a.begin(), rij_a.end(), 0.0); - for (int ii = 0; ii < int(sec_a.size()) - 1; ++ii){ - for (int jj = sec_a[ii]; jj < sec_a[ii+1]; ++jj){ - if (fmt_nlist_a[jj] < 0) break; - sel_a_diff[jj].resize(3); - const int & j_idx = fmt_nlist_a[jj]; - if (b_pbc){ - region.diffNearestNeighbor (posi[j_idx*3+0], posi[j_idx*3+1], posi[j_idx*3+2], - posi[i_idx*3+0], 
posi[i_idx*3+1], posi[i_idx*3+2], - sel_a_diff[jj][0], sel_a_diff[jj][1], sel_a_diff[jj][2]); - } - else { - for (int dd = 0; dd < 3; ++dd) sel_a_diff[jj][dd] = posi[j_idx*3+dd] - posi[i_idx*3+dd]; - } - for (int dd = 0; dd < 3; ++dd) rij_a[jj*3+dd] = sel_a_diff[jj][dd]; - } - } -} - void compute_descriptor_se_r (std::vector & descrpt, std::vector & descrpt_deriv, diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index da18377683..244eab6e75 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -3,7 +3,7 @@ set(OP_LIB ${PROJECT_SOURCE_DIR}/lib/src/SimulationRegion.cpp ${PROJECT_SOURCE_DIR}/lib/src/NeighborList.cpp) set (OP_CXX_FLAG -D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI} ) -file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_a_ef.cc descrpt_se_a_ef.cc descrpt_se_a_ef_para.cc descrpt_se_a_ef_vert.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu.cc map_aparam.cc env_mat_stat.cc unaggregated_grad.cc tabulate.cc) +file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_a_ef.cc descrpt_se_a_ef.cc descrpt_se_a_ef_para.cc descrpt_se_a_ef_vert.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu.cc map_aparam.cc neighbor_stat.cc unaggregated_grad.cc tabulate.cc) file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_multi_device.cc descrpt_se_r_multi_device.cc tab_inter.cc prod_force_se_a_multi_device.cc prod_virial_se_a_multi_device.cc prod_force_se_r_multi_device.cc prod_virial_se_r_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc tabulate_multi_device.cc) file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_se_a_grad.cc prod_force_se_r_grad.cc prod_virial_grad.cc 
prod_virial_se_a_grad.cc prod_virial_se_r_grad.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) file(GLOB OP_PY *.py) diff --git a/source/op/env_mat_stat.cc b/source/op/env_mat_stat.cc deleted file mode 100644 index 526bf7d4f2..0000000000 --- a/source/op/env_mat_stat.cc +++ /dev/null @@ -1,294 +0,0 @@ -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" -#include - -#include "ComputeDescriptor.h" -#include "NeighborList.h" - -typedef double boxtensor_t ; -typedef double compute_t; - -using namespace tensorflow; -// using namespace std; - -using CPUDevice = Eigen::ThreadPoolDevice; -using GPUDevice = Eigen::GpuDevice; - -REGISTER_OP("EnvMatStat") - .Attr("T: {float, double}") - .Input("coord: T") //atomic coordinates - .Input("type: int32") //atomic type - .Input("natoms: int32") //local atomic number; each type atomic number; daizheyingxiangqude atomic numbers - .Input("box : T") - .Input("mesh : int32") - .Attr("rcut: float") //no use - .Attr("rcut_smth: float") - .Attr("sel: list(int)") - .Output("min_nbor_dist: T") - .Output("max_nbor_size: int32"); - -template -class EnvMatStatOp : public OpKernel { -public: - explicit EnvMatStatOp(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("rcut", &rcut)); - OP_REQUIRES_OK(context, context->GetAttr("rcut_smth", &rcut_smth)); - OP_REQUIRES_OK(context, context->GetAttr("sel", &sel)); - cum_sum (sec, sel); - ndescrpt = sec.back() * 4; - nnei = sec.back(); - } - - void Compute(OpKernelContext* context) override { - counter++; - // Grab the input tensor - int context_input_index = 0; - const Tensor& coord_tensor = context->input(context_input_index++); - const Tensor& type_tensor = context->input(context_input_index++); - const Tensor& natoms_tensor = context->input(context_input_index++); - const Tensor& box_tensor = context->input(context_input_index++); - const Tensor& 
mesh_tensor = context->input(context_input_index++); - - // set size of the sample - OP_REQUIRES (context, (coord_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of coord should be 2")); - OP_REQUIRES (context, (type_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of type should be 2")); - OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); - OP_REQUIRES (context, (box_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of box should be 2")); - OP_REQUIRES (context, (mesh_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of mesh should be 1")); - OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); - auto natoms = natoms_tensor .flat(); - int nloc = natoms(0); - int nall = natoms(1); - int ntypes = natoms_tensor.shape().dim_size(0) - 2; - int nsamples = coord_tensor.shape().dim_size(0); - - // check the sizes - OP_REQUIRES (context, (nsamples == type_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); - OP_REQUIRES (context, (nsamples == box_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); - OP_REQUIRES (context, (nall * 3 == coord_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); - OP_REQUIRES (context, (nall == type_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); - OP_REQUIRES (context, (9 == box_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of box should be 9")); - - int nei_mode = 0; - if (mesh_tensor.shape().dim_size(0) == 16) { - // lammps neighbor list - nei_mode = 3; - } - else if (mesh_tensor.shape().dim_size(0) == 12) { - // user provided extended mesh - nei_mode = 2; - } - else if (mesh_tensor.shape().dim_size(0) == 6) { - // manual copied pbc - assert (nloc == nall); - nei_mode = 1; - } - 
else if (mesh_tensor.shape().dim_size(0) == 0) { - // no pbc - nei_mode = -1; - } - else { - throw std::runtime_error("invalid mesh tensor"); - } - bool b_pbc = true; - // if region is given extended, do not use pbc - if (nei_mode >= 1 || nei_mode == -1) { - b_pbc = false; - } - bool b_norm_atom = false; - if (nei_mode == 1){ - b_norm_atom = true; - } - - TensorShape min_nbor_dist_shape ; - min_nbor_dist_shape.AddDim (nloc * nnei); - TensorShape max_nbor_size_shape ; - max_nbor_size_shape.AddDim (nloc); - max_nbor_size_shape.AddDim (ntypes); - - int context_output_index = 0; - Tensor* min_nbor_dist_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - min_nbor_dist_shape, - &min_nbor_dist_tensor)); - Tensor* max_nbor_size_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, - max_nbor_size_shape, - &max_nbor_size_tensor)); - - auto coord = coord_tensor .matrix(); - auto type = type_tensor .matrix(); - auto box = box_tensor .matrix(); - auto mesh = mesh_tensor .flat(); - auto min_nbor_dist = min_nbor_dist_tensor ->flat(); - // find a potential bug here! 
- auto max_nbor_size = max_nbor_size_tensor ->flat(); - - for (int ii = 0; ii < static_cast(min_nbor_dist_tensor->NumElements()); ii++) { - min_nbor_dist(ii) = 10000.0; - } - for (int ii = 0; ii < static_cast(max_nbor_size_tensor->NumElements()); ii++) { - max_nbor_size(ii) = 0; - } - - for (int kk = 0; kk < nsamples; ++kk){ - // set region - boxtensor_t boxt [9] = {0}; - for (int dd = 0; dd < 9; ++dd) { - boxt[dd] = box(kk, dd); - } - SimulationRegion region; - region.reinitBox (boxt); - - // set & normalize coord - std::vector d_coord3 (nall*3); - for (int ii = 0; ii < nall; ++ii){ - for (int dd = 0; dd < 3; ++dd){ - d_coord3[ii*3+dd] = coord(kk, ii*3+dd); - } - if (b_norm_atom){ - compute_t inter[3]; - region.phys2Inter (inter, &d_coord3[3*ii]); - for (int dd = 0; dd < 3; ++dd){ - if (inter[dd] < 0 ) inter[dd] += 1.; - else if (inter[dd] >= 1) inter[dd] -= 1.; - } - region.inter2Phys (&d_coord3[3*ii], inter); - } - } - - // set type - std::vector d_type (nall); - for (int ii = 0; ii < nall; ++ii) d_type[ii] = type(kk, ii); - - // build nlist - std::vector > d_nlist_a; - std::vector > d_nlist_r; - std::vector nlist_map; - bool b_nlist_map = false; - if (nei_mode == 3) { - int * pilist, *pjrange, *pjlist; - memcpy (&pilist, &mesh(4), sizeof(int *)); - memcpy (&pjrange, &mesh(8), sizeof(int *)); - memcpy (&pjlist, &mesh(12), sizeof(int *)); - int inum = mesh(1); - assert (inum == nloc); - d_nlist_a.resize (inum); - d_nlist_r.resize (inum); - for (unsigned ii = 0; ii < inum; ++ii){ - d_nlist_r.reserve (pjrange[inum] / inum + 10); - } - for (unsigned ii = 0; ii < inum; ++ii){ - int i_idx = pilist[ii]; - for (unsigned jj = pjrange[ii]; jj < pjrange[ii+1]; ++jj){ - int j_idx = pjlist[jj]; - d_nlist_r[i_idx].push_back (j_idx); - } - } - } - else if (nei_mode == 2) { - // std::cout << "I'm in nei_mode 2" << std::endl; - std::vector nat_stt = {mesh(1-1), mesh(2-1), mesh(3-1)}; - std::vector nat_end = {mesh(4-1), mesh(5-1), mesh(6-1)}; - std::vector ext_stt = {mesh(7-1), 
mesh(8-1), mesh(9-1)}; - std::vector ext_end = {mesh(10-1), mesh(11-1), mesh(12-1)}; - std::vector global_grid (3); - for (int dd = 0; dd < 3; ++dd) global_grid[dd] = nat_end[dd] - nat_stt[dd]; - ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, nloc, -1, rcut, nat_stt, nat_end, ext_stt, ext_end, region, global_grid); - } - else if (nei_mode == 1) { - // std::cout << "I'm in nei_mode 1" << std::endl; - std::vector bk_d_coord3 = d_coord3; - std::vector bk_d_type = d_type; - std::vector ncell, ngcell; - copy_coord(d_coord3, d_type, nlist_map, ncell, ngcell, bk_d_coord3, bk_d_type, rcut, region); - b_nlist_map = true; - std::vector nat_stt(3, 0); - std::vector ext_stt(3), ext_end(3); - for (int dd = 0; dd < 3; ++dd){ - ext_stt[dd] = -ngcell[dd]; - ext_end[dd] = ncell[dd] + ngcell[dd]; - } - ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, nloc, -1, rcut, nat_stt, ncell, ext_stt, ext_end, region, ncell); - } - else if (nei_mode == -1){ - ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, -1, rcut, NULL); - } - else { - throw std::runtime_error("unknow neighbor mode"); - } - - for (int ii = 0; ii < nloc; ii++) { - for (int jj = 0; jj < d_nlist_r[ii].size(); jj++) { - int type = d_type[d_nlist_r[ii][jj]]; - max_nbor_size(ii * ntypes + type) += 1; - } - } - // loop over atoms, compute descriptors for each atom -#pragma omp parallel for - for (int ii = 0; ii < nloc; ++ii){ - std::vector fmt_nlist_a; - std::vector fmt_nlist_r; - std::vector sec_r(sec.size(), 0); - - int ret = -1; - if ((ret = format_nlist_fill_a (fmt_nlist_a, fmt_nlist_r, d_coord3, ntypes, d_type, region, b_pbc, ii, d_nlist_a[ii], d_nlist_r[ii], rcut, sec, sec_r)) != -1){ - if (count_nei_idx_overflow == 0) { - std::cout << "WARNING: Radial neighbor list length of type " << ret << " is not enough" << std::endl; - flush(std::cout); - count_nei_idx_overflow ++; - } - } - - std::vector d_rij_a; - get_rij (d_rij_a, - d_coord3, - ntypes, - d_type, - region, - b_pbc, - ii, - fmt_nlist_a, - sec, - rcut_smth, - rcut); - - // 
check sizes - assert (d_rij_a.size() == nnei * 3); - assert (int(fmt_nlist_a.size()) == nnei); - for (int jj = 0; jj < nnei * 3; ++jj){ - if (jj % 3 == 0 && d_rij_a[jj] > 0) { - min_nbor_dist(ii * nnei + jj / 3) = sqrt(d_rij_a[jj] * d_rij_a[jj] + d_rij_a[jj + 1] * d_rij_a[jj + 1] + d_rij_a[jj + 2] * d_rij_a[jj + 2]); - } - } - } - } - } -private: - int counter = -1; - float rcut; - float rcut_smth; - std::vector sel; - std::vector sec; - int ndescrpt; - int nnei; - bool fill_nei_a; - int count_nei_idx_overflow; - void - cum_sum (std::vector & sec, - const std::vector & n_sel) const { - sec.resize (n_sel.size() + 1); - sec[0] = 0; - for (int ii = 1; ii < sec.size(); ++ii){ - sec[ii] = sec[ii-1] + n_sel[ii-1]; - } - } -}; - -#define REGISTER_CPU(T) \ -REGISTER_KERNEL_BUILDER( \ - Name("EnvMatStat").Device(DEVICE_CPU).TypeConstraint("T"), \ - EnvMatStatOp); -REGISTER_CPU(float); -REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/neighbor_stat.cc b/source/op/neighbor_stat.cc new file mode 100644 index 0000000000..1be0ba23d5 --- /dev/null +++ b/source/op/neighbor_stat.cc @@ -0,0 +1,192 @@ +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include + +#include "NeighborList.h" + +typedef double boxtensor_t ; +typedef double compute_t; + +using namespace tensorflow; +// using namespace std; + +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +REGISTER_OP("NeighborStat") + .Attr("T: {float, double}") + .Input("coord: T") + .Input("type: int32") + .Input("natoms: int32") + .Input("box : T") + .Input("mesh : int32") + .Attr("rcut: float") + .Output("max_nbor_size: int32") + .Output("min_nbor_dist: T"); + +template +class NeighborStatOp : public OpKernel { +public: + explicit NeighborStatOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("rcut", &rcut)); + } + + void 
Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& coord_tensor = context->input(context_input_index++); + const Tensor& type_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + const Tensor& box_tensor = context->input(context_input_index++); + const Tensor& mesh_tensor = context->input(context_input_index++); + + + OP_REQUIRES (context, (coord_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of coord should be 2")); + OP_REQUIRES (context, (type_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of type should be 2")); + OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + OP_REQUIRES (context, (box_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of box should be 2")); + OP_REQUIRES (context, (mesh_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of mesh should be 1")); + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + int nloc = natoms_tensor.flat().data()[0]; + int nall = natoms_tensor.flat().data()[1]; + int nsamples = coord_tensor.shape().dim_size(0); + int ntypes = natoms_tensor.shape().dim_size(0) - 2; + // check the sizes + OP_REQUIRES (context, (nsamples == type_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nsamples == box_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nall * 3 == coord_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (nall == type_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (9 == box_tensor.shape().dim_size(1)), errors::InvalidArgument 
("number of box should be 9")); + + int nei_mode = 0; + if (mesh_tensor.shape().dim_size(0) == 6) { + // manual copied pbc + assert (nloc == nall); + nei_mode = 1; + } + else if (mesh_tensor.shape().dim_size(0) == 0) { + // no pbc + nei_mode = -1; + } + else { + throw std::runtime_error("invalid mesh tensor"); + } + // if region is given extended, do not use pbc + bool b_pbc = (nei_mode >= 1 || nei_mode == -1) ? false : true; + bool b_norm_atom = (nei_mode == 1) ? true : false; + + TensorShape max_nbor_size_shape ; + max_nbor_size_shape.AddDim (nloc); + max_nbor_size_shape.AddDim (ntypes); + + int context_output_index = 0; + Tensor* max_nbor_size_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + max_nbor_size_shape, + &max_nbor_size_tensor)); + + const FPTYPE* coord = coord_tensor.flat().data(); + const int* type = type_tensor .flat().data(); + const FPTYPE* box = box_tensor .flat().data(); + const int* mesh = mesh_tensor .flat().data(); + int* max_nbor_size = max_nbor_size_tensor ->flat().data(); + + for (int ii = 0; ii < static_cast(max_nbor_size_tensor->NumElements()); ii++) { + max_nbor_size[ii] = 0; + } + + // set region + boxtensor_t boxt [9] = {0}; + for (int dd = 0; dd < 9; ++dd) { + boxt[dd] = box[dd]; + } + SimulationRegion region; + region.reinitBox (boxt); + // set & normalize coord + std::vector d_coord3 (nall * 3); + for (int ii = 0; ii < nall; ++ii) { + for (int dd = 0; dd < 3; ++dd) { + d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; + } + if (b_norm_atom) { + compute_t inter[3]; + region.phys2Inter (inter, &d_coord3[3 * ii]); + for (int dd = 0; dd < 3; ++dd) { + if (inter[dd] < 0 ) inter[dd] += 1.; + else if (inter[dd] >= 1) inter[dd] -= 1.; + } + region.inter2Phys (&d_coord3[3 * ii], inter); + } + } + + // set type + std::vector d_type (nall); + for (int ii = 0; ii < nall; ++ii) d_type[ii] = type[ii]; + + // build nlist + std::vector > d_nlist_a; + std::vector > d_nlist_r; + std::vector nlist_map; + bool 
b_nlist_map = false; + + if (nei_mode == 1) { + // std::cout << "I'm in nei_mode 1" << std::endl; + std::vector bk_d_coord3 = d_coord3; + std::vector bk_d_type = d_type; + std::vector ncell, ngcell; + copy_coord(d_coord3, d_type, nlist_map, ncell, ngcell, bk_d_coord3, bk_d_type, rcut, region); + b_nlist_map = true; + std::vector nat_stt(3, 0); + std::vector ext_stt(3), ext_end(3); + for (int dd = 0; dd < 3; ++dd) { + ext_stt[dd] = -ngcell[dd]; + ext_end[dd] = ncell[dd] + ngcell[dd]; + } + ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, nloc, -1, rcut, nat_stt, ncell, ext_stt, ext_end, region, ncell); + } + else if (nei_mode == -1) { + ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, -1, rcut, NULL); + } + else { + throw std::runtime_error("unknow neighbor mode"); + } + + int MAX_NNEI = 0; + for (int ii = 0; ii < nloc; ii++) { + MAX_NNEI = MAX_NNEI < d_nlist_r[ii].size() ? d_nlist_r[ii].size() : MAX_NNEI; + } + // allocate output tensor for deepmd-kit + TensorShape min_nbor_dist_shape; + min_nbor_dist_shape.AddDim (nloc * MAX_NNEI); + Tensor* min_nbor_dist_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + min_nbor_dist_shape, + &min_nbor_dist_tensor)); + FPTYPE* min_nbor_dist = min_nbor_dist_tensor ->flat().data(); + for (int ii = 0; ii < static_cast(min_nbor_dist_tensor->NumElements()); ii++) { + min_nbor_dist[ii] = 10000.0; + } + + #pragma omp parallel for + for (int ii = 0; ii < nloc; ii++) { + for (int jj = 0; jj < d_nlist_r[ii].size(); jj++) { + int type = d_type[d_nlist_r[ii][jj]]; + max_nbor_size[ii * ntypes + type] += 1; + compute_t rij[3] = {d_coord3[d_nlist_r[ii][jj] * 3 + 0] - d_coord3[ii * 3 + 0], d_coord3[d_nlist_r[ii][jj] * 3 + 1] - d_coord3[ii * 3 + 1], d_coord3[d_nlist_r[ii][jj] * 3 + 2] - d_coord3[ii * 3 + 2]}; + min_nbor_dist[ii * MAX_NNEI + jj] = sqrt(rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2]); + } + } + } + +private: + int nnei; + float rcut; +}; + +#define REGISTER_CPU(T) \ 
+REGISTER_KERNEL_BUILDER( \ + Name("NeighborStat").Device(DEVICE_CPU).TypeConstraint("T"), \ + NeighborStatOp); +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/train/Trainer.py b/source/train/Trainer.py index 8a6afcccf1..7c95fda5bc 100644 --- a/source/train/Trainer.py +++ b/source/train/Trainer.py @@ -19,7 +19,7 @@ from deepmd.Model import Model, WFCModel, DipoleModel, PolarModel, GlobalPolarModel from deepmd.loss import EnerStdLoss, EnerDipoleLoss, TensorLoss from deepmd.utils.learning_rate import LearningRateExp -from deepmd.utils.env_mat_stat import EnvMatStat +from deepmd.utils.neighbor_stat import NeighborStat from tensorflow.python.client import timeline from deepmd.env import op_module @@ -275,14 +275,12 @@ def build (self, self.model.data_stat(data) if 'compress' in self.model_param and self.model_param['compress']['compress']: - assert 'sel' in self.descrpt_param, "Error: descriptor must have attr sel!" assert 'rcut' in self.descrpt_param, "Error: descriptor must have attr rcut!" - assert 'rcut_smth' in self.descrpt_param, "Error: descriptor must have attr rcut_smth!" 
- self.env_mat_stat \ - = EnvMatStat(self.ntypes, self.descrpt_param['rcut'], self.descrpt_param['rcut_smth'], self.descrpt_param['sel']) + self.neighbor_stat \ + = NeighborStat(self.ntypes, self.descrpt_param['rcut']) self.min_nbor_dist, self.max_nbor_size \ - = self.env_mat_stat.get_env_mat_stat(data) - self.descrpt.enable_compression(self.min_nbor_dist, self.model_param['compress']['model_file'], self.model_param['compress']['table_config']) + = self.neighbor_stat.get_stat(data) + self.descrpt.enable_compression(self.min_nbor_dist, self.model_param['compress']['model_file'], self.model_param['compress']['table_config'][0], self.model_param['compress']['table_config'][1], self.model_param['compress']['table_config'][2], self.model_param['compress']['table_config'][3]) worker_device = "/job:%s/task:%d/%s" % (self.run_opt.my_job_name, self.run_opt.my_task_index, From 7dacd889e33f848bc523b3a0f97f360798248878 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Mon, 8 Feb 2021 21:03:02 +0800 Subject: [PATCH 16/20] move the table precision control into descriptor --- deepmd/descriptor/se_a.py | 6 +++--- deepmd/utils/tabulate.py | 6 +----- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py index 09adba8dcc..9be99d3cd0 100644 --- a/deepmd/descriptor/se_a.py +++ b/deepmd/descriptor/se_a.py @@ -258,7 +258,7 @@ def enable_compression(self, self.compress = True self.model_file = model_file self.table_config = [table_extrapolate, table_stride_1, table_stride_2, check_frequency] - self.table = DeepTabulate(self.model_file, self.filter_np_precision, self.type_one_side) + self.table = DeepTabulate(self.model_file, self.type_one_side) self.lower, self.upper \ = self.table.build(min_nbor_dist, table_extrapolate, table_stride_1, table_stride_2) @@ -557,9 +557,9 @@ def _filter(self, else: net = 'filter_' + str(type_input) + '_net_' + str(type_i) if type_i == 0: - xyz_scatter_1 = 
op_module.tabulate_fusion(self.table.data[net], info, xyz_scatter, tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), last_layer_size = outputs_size[-1]) + xyz_scatter_1 = op_module.tabulate_fusion(self.table.data[net].astype(self.filter_np_precision), info, xyz_scatter, tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), last_layer_size = outputs_size[-1]) else: - xyz_scatter_1 += op_module.tabulate_fusion(self.table.data[net], info, xyz_scatter, tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), last_layer_size = outputs_size[-1]) + xyz_scatter_1 += op_module.tabulate_fusion(self.table.data[net].astype(self.filter_np_precision), info, xyz_scatter, tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), last_layer_size = outputs_size[-1]) else: if (type_input, type_i) not in self.exclude_types: xyz_scatter = embedding_net(xyz_scatter, diff --git a/deepmd/utils/tabulate.py b/deepmd/utils/tabulate.py index c7e61a380b..fc0ba118fd 100644 --- a/deepmd/utils/tabulate.py +++ b/deepmd/utils/tabulate.py @@ -17,7 +17,6 @@ class DeepTabulate(): """ def __init__(self, model_file, - data_type, type_one_side = False) -> None: """ Constructor @@ -26,14 +25,11 @@ def __init__(self, ---------- model_file The frozen model - data_type - The precision of the tables. Supported options are {1} type_one_side Try to build N_types tables. 
Otherwise, building N_types^2 tables """ self.model_file = model_file - self.np_data_type = data_type self.type_one_side = type_one_side self.graph, self.graph_def = self._load_graph() @@ -123,7 +119,7 @@ def build(self, self.data[net][jj][kk * 6 + 3] = (1 / (2 * tt * tt * tt)) * (20 * hh - (8 * dd[jj + 1][kk] + 12 * dd[jj][kk]) * tt - (3 * d2[jj][kk] - d2[jj + 1][kk]) * tt * tt) self.data[net][jj][kk * 6 + 4] = (1 / (2 * tt * tt * tt * tt)) * (-30 * hh + (14 * dd[jj + 1][kk] + 16 * dd[jj][kk]) * tt + (3 * d2[jj][kk] - 2 * d2[jj + 1][kk]) * tt * tt) self.data[net][jj][kk * 6 + 5] = (1 / (2 * tt * tt * tt * tt * tt)) * (12 * hh - 6 * (dd[jj + 1][kk] + dd[jj][kk]) * tt + (d2[jj + 1][kk] - d2[jj][kk]) * tt * tt) - self.data[net].astype(self.np_data_type) + self.data[net] return lower, upper def _load_graph(self): From 478603b1095758571151e428bbe36dd02450e9a0 Mon Sep 17 00:00:00 2001 From: Denghui Lu Date: Tue, 9 Feb 2021 02:32:47 +0800 Subject: [PATCH 17/20] Update use-deepmd-kit.md --- doc/use-deepmd-kit.md | 56 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/doc/use-deepmd-kit.md b/doc/use-deepmd-kit.md index df942a4994..e4cfff5aa6 100644 --- a/doc/use-deepmd-kit.md +++ b/doc/use-deepmd-kit.md @@ -5,6 +5,7 @@ - [The DeepPot-SE model](#the-deeppot-se-model) - [Freeze a model](#freeze-a-model) - [Test a model](#test-a-model) + - [Compress a model](#compress-a-model) - [Model inference](#model-inference) - [Run MD with Lammps](#run-md-with-lammps) - [Include deepmd in the pair style](#include-deepmd-in-the-pair-style) @@ -19,7 +20,8 @@ In this text, we will call the deep neural network that is used to represent the 2. Train a model 3. Freeze the model 4. Test the model -5. Inference with the model +5. Compress the model +6. 
Inference with the model ## Prepare data One needs to provide the following information to train a model: the atom type, the simulation box, the atom coordinate, the atom force, system energy and virial. A snapshot of a system that contains these information is called a **frame**. We use the following convention of units: @@ -270,6 +272,58 @@ optional arguments: accuracy ``` +## Compress a model + +Once the frozen model is obtained from deepmd-kit, we can get the neural network structure and its parameters (weights, biases, etc.) from the trained model, and compress it in the following way: +```bash +dp compress input.json -i graph.pb -o graph-compress.pb +``` +where input.json denotes the original training input script, `-i` gives the original frozen model, `-o` gives the compressed model. Several other command line options can be passed to `dp compress`, which can be checked with +```bash +$ dp compress --help +``` +An explanation will be provided +``` +usage: dp compress [-h] [-i INPUT] [-o OUTPUT] [-e EXTRAPOLATE] [-s STRIDE] + [-f FREQUENCY] [-d FOLDER] + INPUT + +positional arguments: + INPUT The input parameter file in json or yaml format, which + should be consistent with the original model parameter + file + +optional arguments: + -h, --help show this help message and exit + -i INPUT, --input INPUT + The original frozen model, which will be compressed by + the deepmd-kit + -o OUTPUT, --output OUTPUT + The compressed model + -e EXTRAPOLATE, --extrapolate EXTRAPOLATE + The scale of model extrapolation + -s STRIDE, --stride STRIDE + The uniform stride of tabulation's first table, the + second table will use 10 * stride as it's uniform + stride + -f FREQUENCY, --frequency FREQUENCY + The frequency of tabulation overflow check(If the + input environment matrix overflow the first or second + table range). 
 By default do not check the overflow
+  -d FOLDER, --folder FOLDER
+                        path to checkpoint folder
+```
+**Parameter explanation**
+
+Model compression works by tabulating the embedding-net.
+The table is composed of fifth-order polynomial coefficients and is assembled from two sub-tables. The first sub-table takes the stride (parameter) as its uniform stride, while the second sub-table takes 10 * stride as its uniform stride.
+The range of the first table is automatically detected by deepmd-kit, while the second table ranges from the first table's upper boundary (upper) to extrapolate (parameter) * upper.
+Finally, we added a check frequency parameter. It indicates how often the program checks for overflow (i.e. whether the input environment matrix overflows the first or second table range) during the MD inference.
+
+**Justification of model compression**
+
+Model compression, with little loss of accuracy, can greatly speed up MD inference time. Depending on the simulation system and training parameters, the speedup can reach more than 10 times on both CPU and GPU devices. At the same time, model compression can greatly reduce the memory usage, by as much as 20 times under the same hardware conditions.
+
 ## Model inference
 One may use the python interface of DeePMD-kit for model inference, an example is given as follows
 ```python

From 675270dfb59aeb06ddc5cea8e93086cdeaf7ea1e Mon Sep 17 00:00:00 2001
From: Denghui Lu
Date: Tue, 9 Feb 2021 02:36:25 +0800
Subject: [PATCH 18/20] add intros of model compression to the README

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7f9b02b628..7d72de9bc9 100644
--- a/README.md
+++ b/README.md
@@ -81,7 +81,8 @@ The typical procedure of using DeePMD-kit includes 5 steps
 3. [Analyze training with Tensorboard](doc/tensorboard.md)
 4. [Freeze the model](doc/use-deepmd-kit.md#freeze-a-model)
 5. [Test the model](doc/use-deepmd-kit.md#test-a-model)
-6. 
[Inference the model in python](doc/use-deepmd-kit.md#model-inference) or using the model in other molecular simulation packages like [LAMMPS](doc/use-deepmd-kit.md#run-md-with-lammps), [i-PI](doc/use-deepmd-kit.md#run-path-integral-md-with-i-pi) or [ASE](doc/use-deepmd-kit.md#use-deep-potential-with-ase). +6. [Compress the model](doc/use-deepmd-kit.md#compress-a-model) +7. [Inference the model in python](doc/use-deepmd-kit.md#model-inference) or using the model in other molecular simulation packages like [LAMMPS](doc/use-deepmd-kit.md#run-md-with-lammps), [i-PI](doc/use-deepmd-kit.md#run-path-integral-md-with-i-pi) or [ASE](doc/use-deepmd-kit.md#use-deep-potential-with-ase). A quick-start on using DeePMD-kit can be found [here](doc/use-deepmd-kit.md). From f6f25bbd78ece1a2344b438bc8a93172cf9d1c89 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Wed, 10 Feb 2021 12:09:19 +0800 Subject: [PATCH 19/20] add type hint and mute warning message add type hint for class NeighborStat and DeepTabulate and mute warning message from custome op tabulate --- deepmd/descriptor/se_a.py | 17 ++++++++++------- deepmd/utils/neighbor_stat.py | 2 +- deepmd/utils/tabulate.py | 12 ++++++------ source/op/cuda/tabulate.cu | 2 +- source/op/tabulate.cc | 30 +++++++++++++++--------------- source/op/tabulate_multi_device.cc | 30 +++++++++++++++--------------- source/train/compress.py | 2 +- 7 files changed, 49 insertions(+), 46 deletions(-) diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py index 9be99d3cd0..c123a4c90f 100644 --- a/deepmd/descriptor/se_a.py +++ b/deepmd/descriptor/se_a.py @@ -230,12 +230,12 @@ def compute_input_stats (self, self.dstd = np.array(all_dstd) def enable_compression(self, - min_nbor_dist, - model_file = 'frozon_model.pb', - table_extrapolate = 5, - table_stride_1 = 0.01, - table_stride_2 = 0.1, - check_frequency = -1 + min_nbor_dist : float, + model_file : str = 'frozon_model.pb', + table_extrapolate : float = 5, + table_stride_1 : float = 0.01, + 
table_stride_2 : float = 0.1, + check_frequency : int = -1 ) -> None: """ Reveive the statisitcs (distance, max_nbor_size and env_mat_range) of the training data. @@ -260,7 +260,10 @@ def enable_compression(self, self.table_config = [table_extrapolate, table_stride_1, table_stride_2, check_frequency] self.table = DeepTabulate(self.model_file, self.type_one_side) self.lower, self.upper \ - = self.table.build(min_nbor_dist, table_extrapolate, table_stride_1, table_stride_2) + = self.table.build(min_nbor_dist, + table_extrapolate, + table_stride_1, + table_stride_2) def build (self, coord_ : tf.Tensor, diff --git a/deepmd/utils/neighbor_stat.py b/deepmd/utils/neighbor_stat.py index 9ccb3a5d0e..606002e4d3 100644 --- a/deepmd/utils/neighbor_stat.py +++ b/deepmd/utils/neighbor_stat.py @@ -14,7 +14,7 @@ class NeighborStat(): """ def __init__(self, ntypes : int, - rcut) -> None: + rcut: float) -> None: """ Constructor diff --git a/deepmd/utils/tabulate.py b/deepmd/utils/tabulate.py index fc0ba118fd..4291a8127b 100644 --- a/deepmd/utils/tabulate.py +++ b/deepmd/utils/tabulate.py @@ -16,8 +16,8 @@ class DeepTabulate(): The range of the first table is automatically detected by deepmd-kit, while the second table ranges from the first table\'s upper boundary(upper) to the extrapolate(parameter) * upper. 
""" def __init__(self, - model_file, - type_one_side = False) -> None: + model_file : str, + type_one_side : bool = False) -> None: """ Constructor @@ -68,10 +68,10 @@ def __init__(self, # TODO: Need a check function to determine if the current model is properly def build(self, - min_nbor_dist, - extrapolate, - stride0, - stride1) -> Tuple[int, int]: + min_nbor_dist : float, + extrapolate : float, + stride0 : float, + stride1 : float) -> Tuple[int, int]: """ Build the tables for model compression diff --git a/source/op/cuda/tabulate.cu b/source/op/cuda/tabulate.cu index 83befeb571..db1b917f02 100644 --- a/source/op/cuda/tabulate.cu +++ b/source/op/cuda/tabulate.cu @@ -291,7 +291,7 @@ __global__ void tabulate_checker(const FPTYPE * in, int * out, const FPTYPE lowe FPTYPE xx = in[bid * nnei + ii]; if (xx < lower || xx > max) { Csub[tid] += 1; - printf("# DEEPMD: level 2 overflow, xx:\t%f\n", xx); + // printf("# DEEPMD: level 2 overflow, xx:\t%f\n", xx); } else if (xx >= upper && xx <= max) { Dsub[tid] += 1; diff --git a/source/op/tabulate.cc b/source/op/tabulate.cc index ba7cc550fe..1ca5b774ea 100644 --- a/source/op/tabulate.cc +++ b/source/op/tabulate.cc @@ -283,21 +283,21 @@ class TabulateFusionOp : public OpKernel { output_shape, &output)); - counter++; - if ((int)table_info.flat().data()[5] != -1 && counter % (int)table_info.flat().data()[5] == 0) { - Tensor int_temp; - TensorShape int_shape; - int_shape.AddDim(2 * ff.shape().dim_size(0)); - OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); - TabulateCheckerFunctor()( - context->eigen_device(), - table_info.flat().data(), - input.flat().data(), - int_temp.flat().data(), - ff.shape().dim_size(0), - ff.shape().dim_size(1) - ); - } + // counter++; + // if ((int)table_info.flat().data()[5] != -1 && counter % (int)table_info.flat().data()[5] == 0) { + // Tensor int_temp; + // TensorShape int_shape; + // int_shape.AddDim(2 * ff.shape().dim_size(0)); + // OP_REQUIRES_OK(context, 
context->allocate_temp(DT_INT32, int_shape, &int_temp)); + // TabulateCheckerFunctor()( + // context->eigen_device(), + // table_info.flat().data(), + // input.flat().data(), + // int_temp.flat().data(), + // ff.shape().dim_size(0), + // ff.shape().dim_size(1) + // ); + // } TabulateFusionFunctor()( context->eigen_device(), // define actually graph execution device diff --git a/source/op/tabulate_multi_device.cc b/source/op/tabulate_multi_device.cc index 1e5e8da2a7..4cce523def 100644 --- a/source/op/tabulate_multi_device.cc +++ b/source/op/tabulate_multi_device.cc @@ -92,21 +92,21 @@ class TabulateFusionOp : public OpKernel { output_shape, &output)); - counter++; - if ((int)table_info.flat().data()[5] != -1 && counter % (int)table_info.flat().data()[5] == 0) { - Tensor int_temp; - TensorShape int_shape; - int_shape.AddDim(2 * ff.shape().dim_size(0)); - OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); - TabulateCheckerFunctor()( - context->eigen_device(), - table_info.flat().data(), - input.flat().data(), - int_temp.flat().data(), - ff.shape().dim_size(0), - ff.shape().dim_size(1) - ); - } + // counter++; + // if ((int)table_info.flat().data()[5] != -1 && counter % (int)table_info.flat().data()[5] == 0) { + // Tensor int_temp; + // TensorShape int_shape; + // int_shape.AddDim(2 * ff.shape().dim_size(0)); + // OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); + // TabulateCheckerFunctor()( + // context->eigen_device(), + // table_info.flat().data(), + // input.flat().data(), + // int_temp.flat().data(), + // ff.shape().dim_size(0), + // ff.shape().dim_size(1) + // ); + // } TabulateFusionFunctor()( context->eigen_device(), // define actually graph execution device diff --git a/source/train/compress.py b/source/train/compress.py index cd1ba79466..24a3a7997a 100644 --- a/source/train/compress.py +++ b/source/train/compress.py @@ -20,7 +20,7 @@ def compress(args): jdata['model']['compress'] = {} 
jdata['model']['compress']['compress'] = True jdata['model']['compress']['model_file'] = args.input - jdata['model']['compress']['table_config'] = [args.extrapolate, args.stride, 10 * args.stride, args.frequency] + jdata['model']['compress']['table_config'] = [args.extrapolate, args.stride, 10 * args.stride, int(args.frequency)] # check the descriptor info of the input file assert jdata['model']['descriptor']['type'] == 'se_a', 'Model compression error: descriptor type must be se_a!' From 44aa2496b072bdc994ff4ff1fcaef46e4b35a0a8 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Wed, 10 Feb 2021 14:35:44 +0800 Subject: [PATCH 20/20] add type hint for class NrighborStat --- deepmd/utils/neighbor_stat.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deepmd/utils/neighbor_stat.py b/deepmd/utils/neighbor_stat.py index 606002e4d3..d8fed56e03 100644 --- a/deepmd/utils/neighbor_stat.py +++ b/deepmd/utils/neighbor_stat.py @@ -6,6 +6,7 @@ from deepmd.env import op_module from deepmd.env import default_tf_session_config from deepmd.RunOptions import global_np_float_precision +from deepmd.utils.data_system import DeepmdDataSystem class NeighborStat(): """ @@ -45,7 +46,7 @@ def __init__(self, self.sub_sess = tf.Session(graph = sub_graph, config=default_tf_session_config) def get_stat(self, - data) -> Tuple[float, List[int]]: + data : DeepmdDataSystem) -> Tuple[float, List[int]]: """ get the data statistics of the training data, including nearest nbor distance between atoms, max nbor size of atoms @@ -61,6 +62,7 @@ def get_stat(self, max_nbor_size A list with ntypes integers, denotes the actual achieved max sel """ + print(type(data)) self.min_nbor_dist = 100.0 self.max_nbor_size = [0] * self.ntypes