diff --git a/README.md b/README.md index 7f9b02b628..7d72de9bc9 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,8 @@ The typical procedure of using DeePMD-kit includes 5 steps 3. [Analyze training with Tensorboard](doc/tensorboard.md) 4. [Freeze the model](doc/use-deepmd-kit.md#freeze-a-model) 5. [Test the model](doc/use-deepmd-kit.md#test-a-model) -6. [Inference the model in python](doc/use-deepmd-kit.md#model-inference) or using the model in other molecular simulation packages like [LAMMPS](doc/use-deepmd-kit.md#run-md-with-lammps), [i-PI](doc/use-deepmd-kit.md#run-path-integral-md-with-i-pi) or [ASE](doc/use-deepmd-kit.md#use-deep-potential-with-ase). +6. [Compress the model](doc/use-deepmd-kit.md#compress-a-model) +7. [Inference the model in python](doc/use-deepmd-kit.md#model-inference) or using the model in other molecular simulation packages like [LAMMPS](doc/use-deepmd-kit.md#run-md-with-lammps), [i-PI](doc/use-deepmd-kit.md#run-path-integral-md-with-i-pi) or [ASE](doc/use-deepmd-kit.md#use-deep-potential-with-ase). A quick-start on using DeePMD-kit can be found [here](doc/use-deepmd-kit.md). 
diff --git a/deepmd/common.py b/deepmd/common.py index 0e6db4c363..fcd6fd5eed 100644 --- a/deepmd/common.py +++ b/deepmd/common.py @@ -20,7 +20,7 @@ import yaml from deepmd.env import op_module, tf -from deepmd.RunOptions import global_tf_float_precision +from deepmd.RunOptions import global_tf_float_precision, global_np_float_precision if TYPE_CHECKING: _DICT_VAL = TypeVar("_DICT_VAL") @@ -451,3 +451,15 @@ def dec(obj: "_OBJ") -> "_OBJ": return obj return dec + +def get_np_precision(precision): + if precision == "default": + return global_np_float_precision + elif precision == "float16": + return np.float16 + elif precision == "float32": + return np.float32 + elif precision == "float64": + return np.float64 + else: + raise RuntimeError("%s is not a valid precision" % precision) \ No newline at end of file diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py index 369a57d677..c123a4c90f 100644 --- a/deepmd/descriptor/se_a.py +++ b/deepmd/descriptor/se_a.py @@ -1,14 +1,16 @@ +import math import numpy as np from typing import Tuple, List from deepmd.env import tf -from deepmd.common import get_activation_func, get_precision, ACTIVATION_FN_DICT, PRECISION_DICT, docstring_parameter +from deepmd.common import get_activation_func, get_precision, ACTIVATION_FN_DICT, PRECISION_DICT, docstring_parameter, get_np_precision from deepmd.utils.argcheck import list_to_doc from deepmd.RunOptions import global_tf_float_precision from deepmd.RunOptions import global_np_float_precision from deepmd.env import op_module from deepmd.env import default_tf_session_config from deepmd.utils.network import embedding_net +from deepmd.utils.tabulate import DeepTabulate class DescrptSeA (): @@ -71,6 +73,7 @@ def __init__ (self, self.trainable = trainable self.filter_activation_fn = get_activation_func(activation_function) self.filter_precision = get_precision(precision) + self.filter_np_precision = get_np_precision(precision) self.exclude_types = set() for tt in exclude_types: 
assert(len(tt) == 2) @@ -96,7 +99,7 @@ def __init__ (self, self.useBN = False self.dstd = None self.davg = None - + self.compress = False self.place_holders = {} avg_zero = np.zeros([self.ntypes,self.ndescrpt]).astype(global_np_float_precision) std_ones = np.ones ([self.ntypes,self.ndescrpt]).astype(global_np_float_precision) @@ -226,6 +229,41 @@ def compute_input_stats (self, self.davg = np.array(all_davg) self.dstd = np.array(all_dstd) + def enable_compression(self, + min_nbor_dist : float, + model_file : str = 'frozon_model.pb', + table_extrapolate : float = 5, + table_stride_1 : float = 0.01, + table_stride_2 : float = 0.1, + check_frequency : int = -1 + ) -> None: + """ + Reveive the statisitcs (distance, max_nbor_size and env_mat_range) of the training data. + + Parameters + ---------- + min_nbor_dist + The nearest distance between atoms + model_file + The original frozen model, which will be compressed by the program + table_extrapolate + The scale of model extrapolation + table_stride_1 + The uniform stride of the first table + table_stride_2 + The uniform stride of the second table + check_frequency + The overflow check frequency + """ + self.compress = True + self.model_file = model_file + self.table_config = [table_extrapolate, table_stride_1, table_stride_2, check_frequency] + self.table = DeepTabulate(self.model_file, self.type_one_side) + self.lower, self.upper \ + = self.table.build(min_nbor_dist, + table_extrapolate, + table_stride_1, + table_stride_2) def build (self, coord_ : tf.Tensor, @@ -336,7 +374,6 @@ def build (self, # only used when tensorboard was set as true tf.summary.histogram('embedding_net_output', self.dout) return self.dout - def get_rot_mat(self) -> tf.Tensor: """ @@ -516,28 +553,38 @@ def _filter(self, # with (natom x nei_type_i) x 1 xyz_scatter = tf.reshape(tf.slice(inputs_reshape, [0,0],[-1,1]),[-1,1]) # with (natom x nei_type_i) x out_size - if (type_input, type_i) not in self.exclude_types: - xyz_scatter = 
embedding_net(xyz_scatter, - self.filter_neuron, - self.filter_precision, - activation_fn = activation_fn, - resnet_dt = self.filter_resnet_dt, - name_suffix = "_"+str(type_i), - stddev = stddev, - bavg = bavg, - seed = seed, - trainable = trainable) + if self.compress and (type_input, type_i) not in self.exclude_types: + info = [self.lower, self.upper, self.upper * self.table_config[0], self.table_config[1], self.table_config[2], self.table_config[3]] + if self.type_one_side: + net = 'filter_-1_net_' + str(type_i) + else: + net = 'filter_' + str(type_input) + '_net_' + str(type_i) + if type_i == 0: + xyz_scatter_1 = op_module.tabulate_fusion(self.table.data[net].astype(self.filter_np_precision), info, xyz_scatter, tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), last_layer_size = outputs_size[-1]) + else: + xyz_scatter_1 += op_module.tabulate_fusion(self.table.data[net].astype(self.filter_np_precision), info, xyz_scatter, tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), last_layer_size = outputs_size[-1]) else: - w = tf.zeros((outputs_size[0], outputs_size[-1]), dtype=global_tf_float_precision) - xyz_scatter = tf.matmul(xyz_scatter, w) - # natom x nei_type_i x out_size - xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1]//4, outputs_size[-1])) - - # xyz_scatter_total.append(xyz_scatter) - if type_i == 0 : - xyz_scatter_1 = tf.matmul(tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), xyz_scatter, transpose_a = True) - else : - xyz_scatter_1 += tf.matmul(tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), xyz_scatter, transpose_a = True) + if (type_input, type_i) not in self.exclude_types: + xyz_scatter = embedding_net(xyz_scatter, + self.filter_neuron, + self.filter_precision, + activation_fn = activation_fn, + resnet_dt = self.filter_resnet_dt, + name_suffix = "_"+str(type_i), + stddev = stddev, + bavg = bavg, + seed = seed, + trainable = trainable) + else: + w = tf.zeros((outputs_size[0], outputs_size[-1]), dtype=global_tf_float_precision) + xyz_scatter = 
tf.matmul(xyz_scatter, w) + # natom x nei_type_i x out_size + xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1]//4, outputs_size[-1])) + # xyz_scatter_total.append(xyz_scatter) + if type_i == 0 : + xyz_scatter_1 = tf.matmul(tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), xyz_scatter, transpose_a = True) + else : + xyz_scatter_1 += tf.matmul(tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), xyz_scatter, transpose_a = True) # natom x nei x outputs_size # xyz_scatter = tf.concat(xyz_scatter_total, axis=1) # natom x nei x 4 diff --git a/deepmd/utils/neighbor_stat.py b/deepmd/utils/neighbor_stat.py new file mode 100644 index 0000000000..d8fed56e03 --- /dev/null +++ b/deepmd/utils/neighbor_stat.py @@ -0,0 +1,92 @@ +import math +import numpy as np +from tqdm import tqdm +from deepmd.env import tf +from typing import Tuple, List +from deepmd.env import op_module +from deepmd.env import default_tf_session_config +from deepmd.RunOptions import global_np_float_precision +from deepmd.utils.data_system import DeepmdDataSystem + +class NeighborStat(): + """ + Class for getting training data information. + It loads data from DeepmdData object, and measures the data info, including neareest nbor distance between atoms, max nbor size of atoms and the output data range of the environment matrix. 
+ """ + def __init__(self, + ntypes : int, + rcut: float) -> None: + """ + Constructor + + Parameters + ---------- + ntypes + The num of atom types + rcut + The cut-off radius + """ + self.rcut = rcut + self.ntypes = ntypes + self.place_holders = {} + sub_graph = tf.Graph() + with sub_graph.as_default(): + for ii in ['coord', 'box']: + self.place_holders[ii] = tf.placeholder(global_np_float_precision, [None, None], name='t_'+ii) + self.place_holders['type'] = tf.placeholder(tf.int32, [None, None], name='t_type') + self.place_holders['natoms_vec'] = tf.placeholder(tf.int32, [self.ntypes+2], name='t_natoms') + self.place_holders['default_mesh'] = tf.placeholder(tf.int32, [None], name='t_mesh') + self._max_nbor_size, self._min_nbor_dist \ + = op_module.neighbor_stat(self.place_holders['coord'], + self.place_holders['type'], + self.place_holders['natoms_vec'], + self.place_holders['box'], + self.place_holders['default_mesh'], + rcut = self.rcut) + self.sub_sess = tf.Session(graph = sub_graph, config=default_tf_session_config) + + def get_stat(self, + data : DeepmdDataSystem) -> Tuple[float, List[int]]: + """ + get the data statistics of the training data, including nearest nbor distance between atoms, max nbor size of atoms + + Parameters + ---------- + data + Class for manipulating many data systems. It is implemented with the help of DeepmdData. 
+ + Returns + ------- + min_nbor_dist + The nearest distance between neighbor atoms + max_nbor_size + A list with ntypes integers, denotes the actual achieved max sel + """ + print(type(data)) + self.min_nbor_dist = 100.0 + self.max_nbor_size = [0] * self.ntypes + + for ii in tqdm(range(len(data.system_dirs)), desc = '# DEEPMD: getting data info'): + for jj in data.data_systems[ii].dirs: + data_set = data.data_systems[ii]._load_set(jj) + for kk in range(np.array(data_set['type']).shape[0]): + mn, dt \ + = self.sub_sess.run([self._max_nbor_size, self._min_nbor_dist], + feed_dict = { + self.place_holders['coord']: np.array(data_set['coord'])[kk].reshape([-1, data.natoms[ii] * 3]), + self.place_holders['type']: np.array(data_set['type'])[kk].reshape([-1, data.natoms[ii]]), + self.place_holders['natoms_vec']: np.array(data.natoms_vec[ii]), + self.place_holders['box']: np.array(data_set['box'])[kk].reshape([-1, 9]), + self.place_holders['default_mesh']: np.array(data.default_mesh[ii]), + }) + dt = np.min(dt) + if dt < self.min_nbor_dist: + self.min_nbor_dist = dt + for ww in range(self.ntypes): + var = np.max(mn[:, ww]) + if var > self.max_nbor_size[ww]: + self.max_nbor_size[ww] = var + + print('# DEEPMD: training data with min nbor dist: ' + str(self.min_nbor_dist)) + print('# DEEPMD: training data with max nbor size: ' + str(self.max_nbor_size)) + return self.min_nbor_dist, self.max_nbor_size diff --git a/deepmd/utils/tabulate.py b/deepmd/utils/tabulate.py new file mode 100644 index 0000000000..4291a8127b --- /dev/null +++ b/deepmd/utils/tabulate.py @@ -0,0 +1,245 @@ +import re +import math +import numpy as np +from tqdm import tqdm +from typing import Tuple, List +from deepmd.env import tf +from deepmd.env import op_module +from tensorflow.python.platform import gfile +from tensorflow.python.framework import tensor_util + +class DeepTabulate(): + """ + Class for tabulation. + Compress a model, which including tabulating the embedding-net. 
+ The table is composed of fifth-order polynomial coefficients and is assembled from two sub-tables. The first table takes the stride(parameter) as it\'s uniform stride, while the second table takes 10 * stride as it\s uniform stride + The range of the first table is automatically detected by deepmd-kit, while the second table ranges from the first table\'s upper boundary(upper) to the extrapolate(parameter) * upper. + """ + def __init__(self, + model_file : str, + type_one_side : bool = False) -> None: + """ + Constructor + + Parameters + ---------- + model_file + The frozen model + type_one_side + Try to build N_types tables. Otherwise, building N_types^2 tables + """ + + self.model_file = model_file + self.type_one_side = type_one_side + + self.graph, self.graph_def = self._load_graph() + self.sess = tf.Session(graph = self.graph) + + self.sub_graph, self.sub_graph_def = self._load_sub_graph() + self.sub_sess = tf.Session(graph = self.sub_graph) + + self.sel_a = self.graph.get_operation_by_name('DescrptSeA').get_attr('sel_a') + self.ntypes = self._get_tensor_value(self.graph.get_tensor_by_name ('descrpt_attr/ntypes:0')) + + self.davg = self._get_tensor_value(self.graph.get_tensor_by_name ('descrpt_attr/t_avg:0')) + self.dstd = self._get_tensor_value(self.graph.get_tensor_by_name ('descrpt_attr/t_std:0')) + + self.descrpt = self.graph.get_operation_by_name ('DescrptSeA') + self.rcut = self.descrpt.get_attr('rcut_r') + self.rcut_smth = self.descrpt.get_attr('rcut_r_smth') + + self.filter_variable_nodes = self._load_matrix_node() + self.layer_size = int(len(self.filter_variable_nodes) / (self.ntypes * self.ntypes * 2)) + self.table_size = self.ntypes * self.ntypes + if type_one_side : + self.layer_size = int(len(self.filter_variable_nodes) / (self.ntypes * 2)) + self.table_size = self.ntypes + # self.value_type = self.filter_variable_nodes["filter_type_0/matrix_1_0"].dtype #"filter_type_0/matrix_1_0" must exit~ + # get trained variables + self.bias = 
self._get_bias() + self.matrix = self._get_matrix() + + self.data_type = type(self.matrix["layer_1"][0][0][0]) + assert self.matrix["layer_1"][0].size > 0, "no matrix exist in matrix array!" + self.last_layer_size = self.matrix["layer_" + str(self.layer_size)][0].shape[1] + # define tables + self.data = {} + + # TODO: Need a check function to determine if the current model is properly + + def build(self, + min_nbor_dist : float, + extrapolate : float, + stride0 : float, + stride1 : float) -> Tuple[int, int]: + """ + Build the tables for model compression + + Parameters + ---------- + min_nbor_dist + The nearest distance between neighbor atoms + extrapolate + The scale of model extrapolation + stride0 + The uniform stride of the first table + stride1 + The uniform stride of the second table + + Returns + ---------- + lower + The lower boundary of environment matrix + upper + The upper boundary of environment matrix + """ + # tabulate range [lower, upper] with stride0 'stride0' + lower, upper = self._get_env_mat_range(min_nbor_dist) + xx = np.arange(lower, upper, stride0, dtype = self.data_type) + xx = np.append(xx, np.arange(upper, extrapolate * upper, stride1, dtype = self.data_type)) + xx = np.append(xx, np.array([extrapolate * upper], dtype = self.data_type)) + self.nspline = int((upper - lower) / stride0 + (extrapolate * upper - upper) / stride1) + for ii in range(self.table_size): + vv, dd, d2 = self._make_data(xx, ii) + if self.type_one_side: + net = "filter_-1_net_" + str(int(ii)) + else: + net = "filter_" + str(int(ii / self.ntypes)) + "_net_" + str(int(ii % self.ntypes)) + self.data[net] = np.zeros([self.nspline, 6 * self.last_layer_size], dtype = self.data_type) + for jj in tqdm(range(self.nspline), desc = '# DEEPMD: ' + net + ', tabulating'): + for kk in range(self.last_layer_size): + if jj < int((upper - lower) / stride0): + tt = stride0 + else: + tt = stride1 + hh = vv[jj + 1][kk] - vv[jj][kk] + self.data[net][jj][kk * 6 + 0] = vv[jj][kk] + 
self.data[net][jj][kk * 6 + 1] = dd[jj][kk] + self.data[net][jj][kk * 6 + 2] = 0.5 * d2[jj][kk] + self.data[net][jj][kk * 6 + 3] = (1 / (2 * tt * tt * tt)) * (20 * hh - (8 * dd[jj + 1][kk] + 12 * dd[jj][kk]) * tt - (3 * d2[jj][kk] - d2[jj + 1][kk]) * tt * tt) + self.data[net][jj][kk * 6 + 4] = (1 / (2 * tt * tt * tt * tt)) * (-30 * hh + (14 * dd[jj + 1][kk] + 16 * dd[jj][kk]) * tt + (3 * d2[jj][kk] - 2 * d2[jj + 1][kk]) * tt * tt) + self.data[net][jj][kk * 6 + 5] = (1 / (2 * tt * tt * tt * tt * tt)) * (12 * hh - 6 * (dd[jj + 1][kk] + dd[jj][kk]) * tt + (d2[jj + 1][kk] - d2[jj][kk]) * tt * tt) + self.data[net] + return lower, upper + + def _load_graph(self): + graph_def = tf.GraphDef() + with open(self.model_file, "rb") as f: + graph_def.ParseFromString(f.read()) + with tf.Graph().as_default() as graph: + tf.import_graph_def(graph_def, name = "") + return graph, graph_def + + def _load_sub_graph(self): + sub_graph_def = tf.GraphDef() + with tf.Graph().as_default() as sub_graph: + tf.import_graph_def(sub_graph_def, name = "") + return sub_graph, sub_graph_def + + def _get_tensor_value(self, tensor) : + with self.sess.as_default(): + self.sess.run(tensor) + value = tensor.eval() + return value + + def _load_matrix_node(self): + matrix_node = {} + matrix_node_pattern = "filter_type_\d+/matrix_\d+_\d+|filter_type_\d+/bias_\d+_\d+|filter_type_\d+/idt_\d+_\d+|filter_type_all/matrix_\d+_\d+|filter_type_all/bias_\d+_\d+|filter_type_all/idt_\d+_\d" + for node in self.graph_def.node: + if re.fullmatch(matrix_node_pattern, node.name) != None: + matrix_node[node.name] = node.attr["value"].tensor + for key in matrix_node.keys() : + assert key.find('bias') > 0 or key.find('matrix') > 0, "currently, only support weight matrix and bias matrix at the tabulation op!" 
+ return matrix_node + + def _get_bias(self): + bias = {} + for layer in range(1, self.layer_size + 1): + bias["layer_" + str(layer)] = [] + if self.type_one_side: + for ii in range(0, self.ntypes): + tensor_value = np.frombuffer (self.filter_variable_nodes["filter_type_all/bias_" + str(layer) + "_" + str(int(ii))].tensor_content) + tensor_shape = tf.TensorShape(self.filter_variable_nodes["filter_type_all/bias_" + str(layer) + "_" + str(int(ii))].tensor_shape).as_list() + bias["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape)) + else: + for ii in range(0, self.ntypes * self.ntypes): + tensor_value = np.frombuffer(self.filter_variable_nodes["filter_type_" + str(int(ii / self.ntypes)) + "/bias_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_content) + tensor_shape = tf.TensorShape(self.filter_variable_nodes["filter_type_" + str(int(ii / self.ntypes)) + "/bias_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_shape).as_list() + bias["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape)) + return bias + + def _get_matrix(self): + matrix = {} + for layer in range(1, self.layer_size + 1): + matrix["layer_" + str(layer)] = [] + if self.type_one_side: + for ii in range(0, self.ntypes): + tensor_value = np.frombuffer (self.filter_variable_nodes["filter_type_all/matrix_" + str(layer) + "_" + str(int(ii))].tensor_content) + tensor_shape = tf.TensorShape(self.filter_variable_nodes["filter_type_all/matrix_" + str(layer) + "_" + str(int(ii))].tensor_shape).as_list() + matrix["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape)) + else: + for ii in range(0, self.ntypes * self.ntypes): + tensor_value = np.frombuffer(self.filter_variable_nodes["filter_type_" + str(int(ii / self.ntypes)) + "/matrix_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_content) + tensor_shape = tf.TensorShape(self.filter_variable_nodes["filter_type_" + str(int(ii / self.ntypes)) + "/matrix_" + str(layer) + "_" + str(int(ii % 
self.ntypes))].tensor_shape).as_list() + matrix["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape)) + return matrix + + # one-by-one executions + def _make_data(self, xx, idx): + with self.sub_graph.as_default(): + with self.sub_sess.as_default(): + xx = tf.reshape(xx, [xx.size, -1]) + for layer in range(self.layer_size): + if layer == 0: + yy = self._layer_0(xx, self.matrix["layer_" + str(layer + 1)][idx], self.bias["layer_" + str(layer + 1)][idx]) + dy = op_module.unaggregated_dy_dx_s(yy, self.matrix["layer_" + str(layer + 1)][idx]) + dy2 = op_module.unaggregated_dy2_dx_s(yy, dy, self.matrix["layer_" + str(layer + 1)][idx]) + else: + tt, yy = self._layer_1(yy, self.matrix["layer_" + str(layer + 1)][idx], self.bias["layer_" + str(layer + 1)][idx]) + dz = op_module.unaggregated_dy_dx(yy - tt, self.matrix["layer_" + str(layer + 1)][idx], dy) + dy2 = op_module.unaggregated_dy2_dx(yy - tt, self.matrix["layer_" + str(layer + 1)][idx], dz, dy, dy2) + dy = dz + + vv = yy.eval() + dd = dy.eval() + d2 = dy2.eval() + return vv, dd, d2 + + def _layer_0(self, x, w, b): + return tf.nn.tanh(tf.matmul(x, w) + b) + + def _layer_1(self, x, w, b): + t = tf.concat([x, x], axis = 1) + return t, tf.nn.tanh(tf.matmul(x, w) + b) + t + + def _save_data(self): + for ii in range(self.ntypes * self.ntypes): + net = "filter_" + str(int(ii / self.ntypes)) + "_net_" + str(int(ii % self.ntypes)) + np.savetxt('data_' + str(int(ii)), self.data[net]) + + def _get_env_mat_range(self, + min_nbor_dist): + lower = 100.0 + upper = -10.0 + sw = self._spline5_switch(min_nbor_dist, self.rcut_smth, self.rcut) + for ii in range(self.ntypes): + if lower > -self.davg[ii][0] / self.dstd[ii][0]: + lower = -self.davg[ii][0] / self.dstd[ii][0] + if upper < ((1 / min_nbor_dist) * sw - self.davg[ii][0]) / self.dstd[ii][0]: + upper = ((1 / min_nbor_dist) * sw - self.davg[ii][0]) / self.dstd[ii][0] + print('# DEEPMD: training data with lower boundary: ' + str(lower)) + print('# DEEPMD: training data 
with upper boundary: ' + str(upper)) + return math.floor(lower), math.ceil(upper) + + def _spline5_switch(self, + xx, + rmin, + rmax): + if xx < rmin: + vv = 1 + elif xx < rmax: + uu = (xx - rmin) / (rmax - rmin) + vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1 + else: + vv = 0 + return vv \ No newline at end of file diff --git a/doc/use-deepmd-kit.md b/doc/use-deepmd-kit.md index df942a4994..e4cfff5aa6 100644 --- a/doc/use-deepmd-kit.md +++ b/doc/use-deepmd-kit.md @@ -5,6 +5,7 @@ - [The DeepPot-SE model](#the-deeppot-se-model) - [Freeze a model](#freeze-a-model) - [Test a model](#test-a-model) + - [Compress a model](#compress-a-model) - [Model inference](#model-inference) - [Run MD with Lammps](#run-md-with-lammps) - [Include deepmd in the pair style](#include-deepmd-in-the-pair-style) @@ -19,7 +20,8 @@ In this text, we will call the deep neural network that is used to represent the 2. Train a model 3. Freeze the model 4. Test the model -5. Inference with the model +5. Compress the model +6. Inference with the model ## Prepare data One needs to provide the following information to train a model: the atom type, the simulation box, the atom coordinate, the atom force, system energy and virial. A snapshot of a system that contains these information is called a **frame**. We use the following convention of units: @@ -270,6 +272,58 @@ optional arguments: accuracy ``` +## Compress a model + +Once the frozen model is obtained from deepmd-kit, we can get the neural network structure and its parameters (weights, biases, etc.) from the trained model, and compress it in the following way: +```bash +dp compress input.json -i graph.pb -o graph-compress.pb +``` +where input.json denotes the original training input script, `-i` gives the original frozen model, `-o` gives the compressed model. 
Several other command line options can be passed to `dp compress`, which can be checked with +```bash +$ dp compress --help +``` +An explanation will be provided +``` +usage: dp compress [-h] [-i INPUT] [-o OUTPUT] [-e EXTRAPOLATE] [-s STRIDE] + [-f FREQUENCY] [-d FOLDER] + INPUT + +positional arguments: + INPUT The input parameter file in json or yaml format, which + should be consistent with the original model parameter + file + +optional arguments: + -h, --help show this help message and exit + -i INPUT, --input INPUT + The original frozen model, which will be compressed by + the deepmd-kit + -o OUTPUT, --output OUTPUT + The compressed model + -e EXTRAPOLATE, --extrapolate EXTRAPOLATE + The scale of model extrapolation + -s STRIDE, --stride STRIDE + The uniform stride of tabulation's first table, the + second table will use 10 * stride as it's uniform + stride + -f FREQUENCY, --frequency FREQUENCY + The frequency of tabulation overflow check(If the + input environment matrix overflow the first or second + table range). By default do not check the overflow + -d FOLDER, --folder FOLDER + path to checkpoint folder +``` +**Parameter explanation** + +Model compression, which includes tabulating the embedding-net. +The table is composed of fifth-order polynomial coefficients and is assembled from two sub-tables. The first sub-table takes the stride(parameter) as its uniform stride, while the second sub-table takes 10 * stride as its uniform stride. +The range of the first table is automatically detected by deepmd-kit, while the second table ranges from the first table's upper boundary(upper) to the extrapolate(parameter) * upper. +Finally, we added a check frequency parameter. It indicates how often the program checks for overflow(if the input environment matrix overflow the first or second table range) during the MD inference. + +**Justification of model compression** + +Model compression, with little loss of accuracy, can greatly speed up MD inference time. 
According to different simulation systems and training parameters, the speedup can reach more than 10 times at both CPU and GPU devices. At the same time, model compression can greatly change the memory usage, reducing as much as 20 times under the same hardware conditions. + ## Model inference One may use the python interface of DeePMD-kit for model inference, an example is given as follows ```python diff --git a/requirements.txt b/requirements.txt index 0e8ce85ec8..3b80afd9c0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ numpy scipy pyyaml dargs +tqdm typing_extensions \ No newline at end of file diff --git a/source/lib/include/ComputeDescriptor.h b/source/lib/include/ComputeDescriptor.h index b01f9ed44b..3b85f931dc 100644 --- a/source/lib/include/ComputeDescriptor.h +++ b/source/lib/include/ComputeDescriptor.h @@ -143,7 +143,6 @@ void compute_descriptor_se_r (std::vector & descrpt_r, const double & rmin, const double & rmax); - struct NeighborInfo { int type; diff --git a/source/lib/include/CustomeOperation.h b/source/lib/include/CustomeOperation.h index 0bdd91c3dd..e8669c9f28 100644 --- a/source/lib/include/CustomeOperation.h +++ b/source/lib/include/CustomeOperation.h @@ -570,3 +570,210 @@ void ProdVirialSeRGPULauncher(FPTYPE * virial, FPTYPE * atom_virial, const FPTYP // ****************************************************************************** // end of custome op ProdVirialSeR // ****************************************************************************** + +template +inline FPTYPE dot(FPTYPE a[4], FPTYPE b[4]) { + return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3]; +} + +/* + This inline function was designed to get the table info and bias value for current input xx! 
+ lower: indicate the lower boundary of the first table; + upper: indicate the upper boundary of the first table as well as the lower boundary of the second table; + max: indicate the upper boundary of the second table; + stride0: indicate the stride of the first table; + stride1: indicate the stride of the second table; + xx: indicate the inputs value; + table_idx: indicate the location of table info of input value xx; +*/ +template +inline void locate_xx(const FPTYPE& lower, const FPTYPE& upper, const FPTYPE& max, const FPTYPE& stride0, const FPTYPE& stride1, FPTYPE& xx, int& table_idx) { + if (xx < lower) { + table_idx = 0; + xx = 0; + } + else if (xx < upper) { + table_idx = (int)((xx - lower) / stride0); + xx -= (table_idx * stride0 + lower); + } + else if (xx < max) { + int first_stride = int((upper - lower) / stride0); + table_idx = first_stride + (int)((xx - upper) / stride1); + xx -= ((table_idx - first_stride) * stride1 + upper); + } + else { + table_idx = int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1; + xx = 0; + } +} + +template +void TabulateFusionCPULauncher(const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const int nloc, const int nnei, const int last_layer_size, FPTYPE * out) { + //Currently, Do nothing at all! + // std::cout << "I'm in tabulate @CPU!" 
<< std::endl; + memset(out, 0.0, sizeof(FPTYPE) * nloc * 4 * last_layer_size); + FPTYPE const lower = table_info[0]; + FPTYPE const upper = table_info[1]; + FPTYPE const _max = table_info[2]; + FPTYPE const stride0 = table_info[3]; + FPTYPE const stride1 = table_info[4]; + // for every atom, execute a small gemm~ + // FPTYPE * res = new FPTYPE[4 * last_layer_size]; + #pragma omp parallel for + for (int ii = 0; ii < nloc; ii++) { + FPTYPE ll[4] = {0}; + FPTYPE ago = in[ii * nnei + nnei - 1]; + bool unloop = false; + for (int jj = 0; jj < nnei; jj++) { + ll[0] = ff[ii * nnei * 4 + jj * 4 + 0]; + ll[1] = ff[ii * nnei * 4 + jj * 4 + 1]; + ll[2] = ff[ii * nnei * 4 + jj * 4 + 2]; + ll[3] = ff[ii * nnei * 4 + jj * 4 + 3]; + FPTYPE xx = in[ii * nnei + jj]; + if (ago == xx) { + unloop = true; + } + int table_idx = 0; + locate_xx(lower, upper, _max, stride0, stride1, xx, table_idx); + for (int kk = 0; kk < last_layer_size; kk++) { + // 1.094 timesteps/s + FPTYPE a0 = table[table_idx * last_layer_size * 6 + 6 * kk + 0]; + FPTYPE a1 = table[table_idx * last_layer_size * 6 + 6 * kk + 1]; + FPTYPE a2 = table[table_idx * last_layer_size * 6 + 6 * kk + 2]; + FPTYPE a3 = table[table_idx * last_layer_size * 6 + 6 * kk + 3]; + FPTYPE a4 = table[table_idx * last_layer_size * 6 + 6 * kk + 4]; + FPTYPE a5 = table[table_idx * last_layer_size * 6 + 6 * kk + 5]; + FPTYPE var = a0 + (a1 + (a2 + (a3 + (a4 + a5 * xx) * xx) * xx) * xx) * xx; + if (unloop) { + out[ii * last_layer_size * 4 + 0 * last_layer_size + kk] += (nnei - jj) * var * ll[0]; + out[ii * last_layer_size * 4 + 1 * last_layer_size + kk] += (nnei - jj) * var * ll[1]; + out[ii * last_layer_size * 4 + 2 * last_layer_size + kk] += (nnei - jj) * var * ll[2]; + out[ii * last_layer_size * 4 + 3 * last_layer_size + kk] += (nnei - jj) * var * ll[3]; + } + else { + out[ii * last_layer_size * 4 + 0 * last_layer_size + kk] += var * ll[0]; + out[ii * last_layer_size * 4 + 1 * last_layer_size + kk] += var * ll[1]; + out[ii * last_layer_size 
* 4 + 2 * last_layer_size + kk] += var * ll[2]; + out[ii * last_layer_size * 4 + 3 * last_layer_size + kk] += var * ll[3]; + } + } + if (unloop) break; + } + } +} + +template +void TabulateFusionGradCPULauncher(const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const FPTYPE * dy, const int nloc, const int nnei, const int last_layer_size, FPTYPE * dy_dx, FPTYPE * dy_df) { + // std::cout << "I'm in tabulate gradient @CPU!" << std::endl; + memset(dy_dx, 0.0, sizeof(FPTYPE) * nloc * nnei); + memset(dy_df, 0.0, sizeof(FPTYPE) * nloc * nnei * 4); + FPTYPE const lower = table_info[0]; + FPTYPE const upper = table_info[1]; + FPTYPE const _max = table_info[2]; + FPTYPE const stride0 = table_info[3]; + FPTYPE const stride1 = table_info[4]; + // for every atom, execute a small gemm~ + // FPTYPE * res = new FPTYPE[4 * last_layer_size]; + #pragma omp parallel for + for (int ii = 0; ii < nloc; ii++) { + FPTYPE ll[4]; + FPTYPE rr[4]; + FPTYPE ago = in[ii * nnei + nnei - 1]; + bool unloop = false; + for (int jj = 0; jj < nnei; jj++) { + // construct the dy/dx + ll[0] = ff[ii * nnei * 4 + jj * 4 + 0]; + ll[1] = ff[ii * nnei * 4 + jj * 4 + 1]; + ll[2] = ff[ii * nnei * 4 + jj * 4 + 2]; + ll[3] = ff[ii * nnei * 4 + jj * 4 + 3]; + FPTYPE xx = in[ii * nnei + jj]; + if (ago == xx) { + unloop = true; + } + int table_idx = 0; + locate_xx(lower, upper, _max, stride0, stride1, xx, table_idx); + FPTYPE grad = 0.0; + for (int kk = 0; kk < last_layer_size; kk++) { + rr[0] = dy[ii * last_layer_size * 4 + 0 * last_layer_size + kk]; + rr[1] = dy[ii * last_layer_size * 4 + 1 * last_layer_size + kk]; + rr[2] = dy[ii * last_layer_size * 4 + 2 * last_layer_size + kk]; + rr[3] = dy[ii * last_layer_size * 4 + 3 * last_layer_size + kk]; + // 1.094 timesteps/s + FPTYPE a0 = table[table_idx * last_layer_size * 6 + 6 * kk + 0]; + FPTYPE a1 = table[table_idx * last_layer_size * 6 + 6 * kk + 1]; + FPTYPE a2 = table[table_idx * last_layer_size * 6 + 6 * kk + 2]; + FPTYPE a3 
= table[table_idx * last_layer_size * 6 + 6 * kk + 3]; + FPTYPE a4 = table[table_idx * last_layer_size * 6 + 6 * kk + 4]; + FPTYPE a5 = table[table_idx * last_layer_size * 6 + 6 * kk + 5]; + FPTYPE res = a0 + (a1 + (a2 + (a3 + (a4 + a5 * xx) * xx) * xx) * xx) * xx; + + if (unloop) { + grad += (a1 + (2 * a2 + (3 * a3 + (4 * a4 + 5 * a5 * xx) * xx) * xx) * xx) * dot(ll, rr) * (nnei - jj); + dy_df[ii * nnei * 4 + jj * 4 + 0] += res * rr[0] * (nnei - jj); + dy_df[ii * nnei * 4 + jj * 4 + 1] += res * rr[1] * (nnei - jj); + dy_df[ii * nnei * 4 + jj * 4 + 2] += res * rr[2] * (nnei - jj); + dy_df[ii * nnei * 4 + jj * 4 + 3] += res * rr[3] * (nnei - jj); + } + else { + grad += (a1 + (2 * a2 + (3 * a3 + (4 * a4 + 5 * a5 * xx) * xx) * xx) * xx) * dot(ll, rr); + dy_df[ii * nnei * 4 + jj * 4 + 0] += res * rr[0]; + dy_df[ii * nnei * 4 + jj * 4 + 1] += res * rr[1]; + dy_df[ii * nnei * 4 + jj * 4 + 2] += res * rr[2]; + dy_df[ii * nnei * 4 + jj * 4 + 3] += res * rr[3]; + } + } + dy_dx[ii * nnei + jj] = grad; + if (unloop) break; + } + } +} + +template +void TabulateCheckerCPULauncher(const FPTYPE * table_info, const FPTYPE * in, int * out, const int nloc, const int nnei) { + FPTYPE const lower = table_info[0]; + FPTYPE const upper = table_info[1]; + FPTYPE const _max = table_info[2]; + FPTYPE const stride0 = table_info[3]; + FPTYPE const stride1 = table_info[4]; + // for every atom, execute a small gemm~ + // FPTYPE * res = new FPTYPE[4 * last_layer_size]; + int Csub = 0; // summation of second table approximate; + int Dsub = 0; // summation of the endpoint approximate; + for (int ii = 0; ii < nloc; ii++) { + for (int jj = 0; jj < nnei; jj++) { + FPTYPE xx = in[ii * nnei + jj]; + if (xx < lower || xx > _max) { + Csub += 1; + } + else if (xx >= upper && xx <= _max) { + Dsub += 1; + } + } + } + if(Csub > 0) { + std::cout << "# DEEPMD: warning! some values [" << Csub << "/" << nloc * nnei << "] overflow the range of the table, using the endpoint approximate processing.." 
<< std::endl; + } + if(Dsub > 0) { + std::cout << "# DEEPMD: warning! some values [" << Dsub << "/" << nloc * nnei << "] overflow the range of the table, using second table approximate processing.." << std::endl; + } +} + +#if GOOGLE_CUDA +template +void TabulateFusionGPULauncher(const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const int nloc, const int nnei, const int last_layer_size, FPTYPE * out) { + TabulateFusionGPUExecuteFunctor()(table, table_info, in, ff, nloc, nnei, last_layer_size, out); +} + +template +void TabulateFusionGradGPULauncher(const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const FPTYPE * dy, const int nloc, const int nnei, const int last_layer_size, FPTYPE * dy_dx, FPTYPE * dy_df) { + TabulateFusionGradGPUExecuteFunctor()(table, table_info, in, ff, dy, nloc, nnei, last_layer_size, dy_dx, dy_df); +} + +template +void TabulateCheckerGPULauncher(const FPTYPE * table_info, const FPTYPE * in, int * out, const int nloc, const int nnei) { + TabulateCheckerGPUExecuteFunctor()(table_info, in, out, nloc, nnei); +} +#endif // GOOGLE_CUDA +// ****************************************************************************** +// end of custome op Tabulate +// ****************************************************************************** diff --git a/source/lib/include/DeviceFunctor.h b/source/lib/include/DeviceFunctor.h index d51d617f84..f482545df3 100644 --- a/source/lib/include/DeviceFunctor.h +++ b/source/lib/include/DeviceFunctor.h @@ -7,6 +7,7 @@ typedef unsigned long long int_64; #define SQRT_2_PI 0.7978845608028654 +#define TPB 256 #define cudaErrcheck(res) {cudaAssert((res), __FILE__, __LINE__);} inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { @@ -59,4 +60,19 @@ struct GeluGradGPUExecuteFunctor { template struct GeluGradGradGPUExecuteFunctor { void operator()(const FPTYPE * dy, const FPTYPE * dy_, const FPTYPE * in, FPTYPE * out, const 
int size); +}; + +template +struct TabulateFusionGPUExecuteFunctor { + void operator()(const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const int nloc, const int nnei, const int last_layer_size, FPTYPE * out); +}; + +template +struct TabulateFusionGradGPUExecuteFunctor { + void operator()(const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const FPTYPE * dy, const int nloc, const int nnei, const int last_layer_size, FPTYPE * dy_dx, FPTYPE * dy_df); +}; + +template +struct TabulateCheckerGPUExecuteFunctor { + void operator()(const FPTYPE * table_info, const FPTYPE * in, int * out, const int nloc, const int nnei); }; \ No newline at end of file diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index 25f125d6be..244eab6e75 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -3,8 +3,8 @@ set(OP_LIB ${PROJECT_SOURCE_DIR}/lib/src/SimulationRegion.cpp ${PROJECT_SOURCE_DIR}/lib/src/NeighborList.cpp) set (OP_CXX_FLAG -D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI} ) -file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_a_ef.cc descrpt_se_a_ef.cc descrpt_se_a_ef_para.cc descrpt_se_a_ef_vert.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu.cc map_aparam.cc) -file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_multi_device.cc descrpt_se_r_multi_device.cc tab_inter.cc prod_force_se_a_multi_device.cc prod_virial_se_a_multi_device.cc prod_force_se_r_multi_device.cc prod_virial_se_r_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc) +file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_a_ef.cc descrpt_se_a_ef.cc descrpt_se_a_ef_para.cc descrpt_se_a_ef_vert.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc 
prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu.cc map_aparam.cc neighbor_stat.cc unaggregated_grad.cc tabulate.cc) +file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_multi_device.cc descrpt_se_r_multi_device.cc tab_inter.cc prod_force_se_a_multi_device.cc prod_virial_se_a_multi_device.cc prod_force_se_r_multi_device.cc prod_virial_se_r_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc tabulate_multi_device.cc) file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_se_a_grad.cc prod_force_se_r_grad.cc prod_virial_grad.cc prod_virial_se_a_grad.cc prod_virial_se_r_grad.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) file(GLOB OP_PY *.py) diff --git a/source/op/_tabulate_grad.py b/source/op/_tabulate_grad.py new file mode 100644 index 0000000000..6f8ba1f8bc --- /dev/null +++ b/source/op/_tabulate_grad.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +""" +Gradients for tabulate. +""" + +from tensorflow.python.framework import ops +from deepmd.env import op_module +from deepmd.env import tf +# from deepmd.DescrptSeATabulate import last_layer_size + +# refine is needed! +# accurate gradient is needed! +# 'tabulate_one_side' is needed! +@ops.RegisterGradient("TabulateGrad") +def _tabulate_grad_cc (op, dy): + return [None, dy] + +@ops.RegisterGradient("TabulateFusionGrad") +def _tabulate_grad_cc (op, dy, dy_): + return [None, None, dy, dy_, None, None] + +# old implementations here. 
+ +@ops.RegisterGradient("Tabulate") +def _tabulate_grad_cc (op, dy, dy_): + dy = op_module.tabulate_grad(dy, op.outputs[1]) + return [None, None, dy] + +@ops.RegisterGradient("TabulateFusion") +def _tabulate_fusion_grad_cc (op, dy): + dy_dx, dy_df = op_module.tabulate_fusion_grad(op.inputs[0], op.inputs[1], op.inputs[2], op.inputs[3], dy, op.outputs[0]) + return [None, None, dy_dx, dy_df] \ No newline at end of file diff --git a/source/op/cuda/CMakeLists.txt b/source/op/cuda/CMakeLists.txt index 89dd0b5922..4f47aa1435 100644 --- a/source/op/cuda/CMakeLists.txt +++ b/source/op/cuda/CMakeLists.txt @@ -28,6 +28,8 @@ if (${CUDA_VERSION_MAJOR} GREATER "10") -gencode arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX 1050, GTX 1030, Titan Xp, Tesla P40, Tesla P4, Discrete GPU on the NVIDIA Drive PX2 -gencode arch=compute_70,code=sm_70; # Volta - GV100/Tesla V100, GTX 1180 (GV104) -gencode arch=compute_75,code=sm_75; # Turing - RTX 2080, Titan RTX, Quadro R8000 + -gencode arch=compute_80,code=sm_80; # Ampere - A100 + -gencode arch=compute_86,code=sm_86; # Ampere - RTX 3090 -O3; -Xcompiler -fPIC; ) elseif (${CUDA_VERSION_MAJOR} STREQUAL "10") @@ -80,7 +82,7 @@ else () endif() set (SOURCE_FILES - descrpt_se_a.cu descrpt_se_r.cu prod_force_se_a.cu prod_force_se_r.cu prod_virial_se_a.cu prod_virial_se_r.cu gelu.cu + descrpt_se_a.cu descrpt_se_r.cu prod_force_se_a.cu prod_force_se_r.cu prod_virial_se_a.cu prod_virial_se_r.cu gelu.cu tabulate.cu ) cuda_add_library(deepmd_op_cuda SHARED ${SOURCE_FILES}) diff --git a/source/op/cuda/descrpt_se_a.cu b/source/op/cuda/descrpt_se_a.cu index a528c4c477..999d4c2b39 100644 --- a/source/op/cuda/descrpt_se_a.cu +++ b/source/op/cuda/descrpt_se_a.cu @@ -147,8 +147,10 @@ __global__ void format_nlist_fill_b_se_a(int * nlist, } //it's ok! 
-template -__global__ void compute_descriptor_se_a (FPTYPE* descript, +template< + typename FPTYPE, + int THREADS_PER_BLOCK> +__global__ void compute_descriptor_se_a(FPTYPE* descript, const int ndescrpt, FPTYPE* descript_deriv, const int descript_deriv_size, @@ -164,67 +166,77 @@ __global__ void compute_descriptor_se_a (FPTYPE* descript, const float rmax, const int sec_a_size) { - // <<>> - const unsigned int idx = blockIdx.x; - const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; - const int idx_deriv = idy * 4 * 3; // 4 components time 3 directions - const int idx_value = idy * 4; // 4 components - if (idy >= sec_a_size) {return;} + // <<>> + const unsigned int bid = blockIdx.x; + const unsigned int tid = threadIdx.x; + // usually false... + if (tid >= sec_a_size) { + return; + } + // const int idx_deriv = idy * 4 * 3; // 4 components time 3 directions + // const int idx_value = idy * 4; // 4 components + int * row_nlist = nlist + bid * nlist_size; + FPTYPE * row_rij = rij + bid * rij_size; + FPTYPE * row_descript = descript + bid * ndescrpt; + FPTYPE * row_descript_deriv = descript_deriv + bid * descript_deriv_size; - // else {return;} - FPTYPE * row_descript = descript + idx * ndescrpt; - FPTYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; - FPTYPE * row_rij = rij + idx * rij_size; - int * row_nlist = nlist + idx * nlist_size; + for (int ii = tid; ii < sec_a_size; ii += THREADS_PER_BLOCK) { + const int idx_value = ii * 4; // 4 components + const int idx_deriv = ii * 12; // 4 components time 3 directions + if (row_nlist[ii] >= 0) { + FPTYPE rr[3] = {0}; + FPTYPE dd[4] = {0}; + FPTYPE vv[12] = {0}; + const int & j_idx = row_nlist[ii]; + for (int kk = 0; kk < 3; kk++) { + rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk]; + row_rij[ii * 3 + kk] = rr[kk]; + } + // const FPTYPE * rr = &row_rij[ii * 3]; + FPTYPE nr2 = dev_dot(rr, rr); + FPTYPE inr = 1./sqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = 
inr2 * inr2; + FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; + spline5_switch(sw, dsw, nr, rmin, rmax); + dd[0] = (1./nr) ;//* sw; + dd[1] = (rr[0] / nr2) ;//* sw; + dd[2] = (rr[1] / nr2) ;//* sw; + dd[3] = (rr[2] / nr2) ;//* sw; - if (row_nlist[idy] >= 0) { - const int & j_idx = row_nlist[idy]; - for (int kk = 0; kk < 3; kk++) { - row_rij[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; + vv[0] = (rr[0] * inr3 * sw - dd[0] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; + vv[1] = (rr[1] * inr3 * sw - dd[0] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; + vv[2] = (rr[2] * inr3 * sw - dd[0] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; + // ****deriv of component x/r2 + vv[3] = ((2. * rr[0] * rr[0] * inr4 - inr2) * sw - dd[1] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 3) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 3) % (ndescrpt * 3)) / 3]; + vv[4] = ((2. * rr[0] * rr[1] * inr4 ) * sw - dd[1] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 4) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 4) % (ndescrpt * 3)) / 3]; + vv[5] = ((2. * rr[0] * rr[2] * inr4 ) * sw - dd[1] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 5) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 5) % (ndescrpt * 3)) / 3]; + // ***deriv of component y/r2 + vv[6] = ((2. * rr[1] * rr[0] * inr4 ) * sw - dd[2] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 6) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 6) % (ndescrpt * 3)) / 3]; + vv[7] = ((2. * rr[1] * rr[1] * inr4 - inr2) * sw - dd[2] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 7) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 7) % (ndescrpt * 3)) / 3]; + vv[8] = ((2. 
* rr[1] * rr[2] * inr4 ) * sw - dd[2] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 8) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 8) % (ndescrpt * 3)) / 3]; + // ***deriv of component z/r2 + vv[9] = ((2. * rr[2] * rr[0] * inr4 ) * sw - dd[3] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 9) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 9) % (ndescrpt * 3)) / 3]; + vv[10]= ((2. * rr[2] * rr[1] * inr4 ) * sw - dd[3] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 10) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 10) % (ndescrpt * 3)) / 3]; + vv[11]= ((2. * rr[2] * rr[2] * inr4 - inr2) * sw - dd[3] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 11) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 11) % (ndescrpt * 3)) / 3]; + // 4 value components + dd[0] *= sw; // * descript[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + idx_value + 0]; + dd[1] *= sw; // * descript[idx * ndescrpt + idx_value + 1]);// - avg[type[idx] * ndescrpt + idx_value + 1]) / std[type[idx] * ndescrpt + idx_value + 1]; + dd[2] *= sw; // * descript[idx * ndescrpt + idx_value + 2]);// - avg[type[idx] * ndescrpt + idx_value + 2]) / std[type[idx] * ndescrpt + idx_value + 2]; + dd[3] *= sw; // * descript[idx * ndescrpt + idx_value + 3]);// - avg[type[idx] * ndescrpt + idx_value + 3]) / std[type[idx] * ndescrpt + idx_value + 3]; + for (int ii = 0; ii < 12; ii++) { + row_descript_deriv[idx_deriv + ii] = vv[ii] / std[type[bid] * ndescrpt + idx_value + ii / 3]; + } + for (int ii = 0; ii < 4; ii++) { + row_descript[idx_value + ii] = (dd[ii] - avg[type[bid] * ndescrpt + idx_value + ii]) / std[type[bid] * ndescrpt + idx_value + ii]; + } + } + else { + // TODO: move it to the memset. 
+ row_descript[idx_value] -= avg[type[bid] * ndescrpt + idx_value] / std[type[bid] * ndescrpt + idx_value]; } - const FPTYPE * rr = &row_rij[idy * 3 + 0]; - FPTYPE nr2 = dev_dot(rr, rr); - FPTYPE inr = 1./sqrt(nr2); - FPTYPE nr = nr2 * inr; - FPTYPE inr2 = inr * inr; - FPTYPE inr4 = inr2 * inr2; - FPTYPE inr3 = inr4 * nr; - FPTYPE sw, dsw; - spline5_switch(sw, dsw, nr, rmin, rmax); - row_descript[idx_value + 0] = (1./nr) ;//* sw; - row_descript[idx_value + 1] = (rr[0] / nr2) ;//* sw; - row_descript[idx_value + 2] = (rr[1] / nr2) ;//* sw; - row_descript[idx_value + 3] = (rr[2] / nr2) ;//* sw; - - row_descript_deriv[idx_deriv + 0] = (rr[0] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; - row_descript_deriv[idx_deriv + 1] = (rr[1] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; - row_descript_deriv[idx_deriv + 2] = (rr[2] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; - // ****deriv of component x/r2 - row_descript_deriv[idx_deriv + 3] = ((2. * rr[0] * rr[0] * inr4 - inr2) * sw - row_descript[idx_value + 1] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 3) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 3) % (ndescrpt * 3)) / 3]; - row_descript_deriv[idx_deriv + 4] = ((2. * rr[0] * rr[1] * inr4 ) * sw - row_descript[idx_value + 1] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 4) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 4) % (ndescrpt * 3)) / 3]; - row_descript_deriv[idx_deriv + 5] = ((2. 
* rr[0] * rr[2] * inr4 ) * sw - row_descript[idx_value + 1] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 5) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 5) % (ndescrpt * 3)) / 3]; - // ***deriv of component y/r2 - row_descript_deriv[idx_deriv + 6] = ((2. * rr[1] * rr[0] * inr4 ) * sw - row_descript[idx_value + 2] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 6) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 6) % (ndescrpt * 3)) / 3]; - row_descript_deriv[idx_deriv + 7] = ((2. * rr[1] * rr[1] * inr4 - inr2) * sw - row_descript[idx_value + 2] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 7) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 7) % (ndescrpt * 3)) / 3]; - row_descript_deriv[idx_deriv + 8] = ((2. * rr[1] * rr[2] * inr4 ) * sw - row_descript[idx_value + 2] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 8) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 8) % (ndescrpt * 3)) / 3]; - // ***deriv of component z/r2 - row_descript_deriv[idx_deriv + 9] = ((2. * rr[2] * rr[0] * inr4 ) * sw - row_descript[idx_value + 3] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 9) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 9) % (ndescrpt * 3)) / 3]; - row_descript_deriv[idx_deriv +10] = ((2. * rr[2] * rr[1] * inr4 ) * sw - row_descript[idx_value + 3] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 10) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 10) % (ndescrpt * 3)) / 3]; - row_descript_deriv[idx_deriv +11] = ((2. 
* rr[2] * rr[2] * inr4 - inr2) * sw - row_descript[idx_value + 3] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 11) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 11) % (ndescrpt * 3)) / 3]; - // 4 value components - row_descript[idx_value + 0] *= sw; // * descript[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + idx_value + 0]; - row_descript[idx_value + 1] *= sw; // * descript[idx * ndescrpt + idx_value + 1]);// - avg[type[idx] * ndescrpt + idx_value + 1]) / std[type[idx] * ndescrpt + idx_value + 1]; - row_descript[idx_value + 2] *= sw; // * descript[idx * ndescrpt + idx_value + 2]);// - avg[type[idx] * ndescrpt + idx_value + 2]) / std[type[idx] * ndescrpt + idx_value + 2]; - row_descript[idx_value + 3] *= sw; // * descript[idx * ndescrpt + idx_value + 3]);// - avg[type[idx] * ndescrpt + idx_value + 3]) / std[type[idx] * ndescrpt + idx_value + 3]; - } - - for (int ii = 0; ii < 4; ii++) { - row_descript[idx_value + ii] = (row_descript[idx_value + ii] - avg[type[idx] * ndescrpt + idx_value + ii]) / std[type[idx] * ndescrpt + idx_value + ii]; - } - // idy nloc, idx ndescrpt * 3 - // descript_deriv[idy * ndescrpt * 3 + idx] = (descript_deriv_dev[idy * (ndescrpt * 3) + idx]) / std[type[idy] * ndescrpt + idx / 3]; - for (int ii = 0; ii < 12; ii++) { - row_descript_deriv[idx_deriv + ii] /= std[type[idx] * ndescrpt + (idx_deriv + ii) / 3]; } } @@ -401,26 +413,7 @@ void DescrptSeAGPUExecuteFunctor::operator()(const FPTYPE * coord, const ); } - const int nblock_ = (sec_a.back() + LEN -1) / LEN; - dim3 block_grid(nloc, nblock_); - dim3 thread_grid(1, LEN); - compute_descriptor_se_a<<>> ( - descript, - ndescrpt, - descript_deriv, - ndescrpt * 3, - rij, - nnei * 3, - type, - avg, - std, - nlist, - nnei, - coord, - rcut_r_smth, - rcut_r, - sec_a.back() - ); + compute_descriptor_se_a <<>> (descript, ndescrpt, descript_deriv, ndescrpt * 3, rij, nnei * 3, type, avg, std, nlist, nnei, coord, rcut_r_smth, rcut_r, 
sec_a.back()); } template struct DescrptSeAGPUExecuteFunctor; diff --git a/source/op/cuda/descrpt_se_r.cu b/source/op/cuda/descrpt_se_r.cu index 0715f19c5e..33932f4325 100644 --- a/source/op/cuda/descrpt_se_r.cu +++ b/source/op/cuda/descrpt_se_r.cu @@ -147,8 +147,10 @@ __global__ void format_nlist_fill_b_se_r(int * nlist, } //it's ok! -template -__global__ void compute_descriptor_se_r (FPTYPE* descript, +template< + typename FPTYPE, + int THREADS_PER_BLOCK> +__global__ void compute_descriptor_se_r(FPTYPE* descript, const int ndescrpt, FPTYPE* descript_deriv, const int descript_deriv_size, @@ -164,49 +166,58 @@ __global__ void compute_descriptor_se_r (FPTYPE* descript, const float rmax, const int sec_a_size) { - // <<>> - const unsigned int idx = blockIdx.x; - const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; - const int idx_deriv = idy * 3; // 4 components time 3 directions - const int idx_value = idy; // 4 components - if (idy >= sec_a_size) {return;} + // <<>> + const unsigned int bid = blockIdx.x; + const unsigned int tid = threadIdx.x; + // usually false... 
+ if (tid >= sec_a_size) { + return; + } + // const int idx_deriv = idy * 4 * 3; // 4 components time 3 directions + // const int idx_value = idy * 4; // 4 components + int * row_nlist = nlist + bid * nlist_size; + FPTYPE * row_rij = rij + bid * rij_size; + FPTYPE * row_descript = descript + bid * ndescrpt; + FPTYPE * row_descript_deriv = descript_deriv + bid * descript_deriv_size; - // else {return;} - FPTYPE * row_descript = descript + idx * ndescrpt; - FPTYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; - FPTYPE * row_rij = rij + idx * rij_size; - int * row_nlist = nlist + idx * nlist_size; + for (int ii = tid; ii < sec_a_size; ii += THREADS_PER_BLOCK) { + const int idx_value = ii; // 4 components + const int idx_deriv = ii * 3; // 4 components time 3 directions + if (row_nlist[ii] >= 0) { + FPTYPE rr[3] = {0}; + FPTYPE vv[3] = {0}; + FPTYPE dd = 0; + const int & j_idx = row_nlist[ii]; + for (int kk = 0; kk < 3; kk++) { + rr[kk] = coord[j_idx * 3 + kk] - coord[bid * 3 + kk]; + row_rij[ii * 3 + kk] = rr[kk]; + } + // const FPTYPE * rr = &row_rij[ii * 3]; + FPTYPE nr2 = dev_dot(rr, rr); + FPTYPE inr = 1./sqrt(nr2); + FPTYPE nr = nr2 * inr; + FPTYPE inr2 = inr * inr; + FPTYPE inr4 = inr2 * inr2; + FPTYPE inr3 = inr4 * nr; + FPTYPE sw, dsw; + spline5_switch(sw, dsw, nr, rmin, rmax); + dd = (1./nr) ;//* sw; - if (row_nlist[idy] >= 0) { - const int & j_idx = row_nlist[idy]; - for (int kk = 0; kk < 3; kk++) { - row_rij[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; + vv[0] = (rr[0] * inr3 * sw - dd * dsw * rr[0] * inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; + vv[1] = (rr[1] * inr3 * sw - dd * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; + vv[2] = (rr[2] * inr3 * sw - dd * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; + + // 4 
value components + dd *= sw; // * descript[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + idx_value + 0]; + for (int ii = 0; ii < 3; ii++) { + row_descript_deriv[idx_deriv + ii] = vv[ii] / std[type[bid] * ndescrpt + idx_value + ii / 3]; + } + row_descript[idx_value] = (dd - avg[type[bid] * ndescrpt + idx_value]) / std[type[bid] * ndescrpt + idx_value]; + } + else { + // TODO: move it to the memset. + row_descript[idx_value] -= avg[type[bid] * ndescrpt + idx_value] / std[type[bid] * ndescrpt + idx_value]; } - const FPTYPE * rr = &row_rij[idy * 3 + 0]; - FPTYPE nr2 = dev_dot(rr, rr); - FPTYPE inr = 1./sqrt(nr2); - FPTYPE nr = nr2 * inr; - FPTYPE inr2 = inr * inr; - FPTYPE inr4 = inr2 * inr2; - FPTYPE inr3 = inr4 * nr; - FPTYPE sw, dsw; - spline5_switch(sw, dsw, nr, rmin, rmax); - row_descript[idx_value + 0] = (1./nr) ;//* sw; - - row_descript_deriv[idx_deriv + 0] = (rr[0] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[0] * inr); // avg[type[(idx_deriv + 0) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 0) % (ndescrpt * 3)) / 3]; - row_descript_deriv[idx_deriv + 1] = (rr[1] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[1] * inr); // avg[type[(idx_deriv + 1) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 1) % (ndescrpt * 3)) / 3]; - row_descript_deriv[idx_deriv + 2] = (rr[2] * inr3 * sw - row_descript[idx_value + 0] * dsw * rr[2] * inr); // avg[type[(idx_deriv + 2) / (ndescrpt * 3)] * ndescrpt + ((idx_deriv + 2) % (ndescrpt * 3)) / 3]; - // 4 value components - row_descript[idx_value + 0] *= sw; // * descript[idx * ndescrpt + idx_value + 0]);// - avg[type[idx] * ndescrpt + idx_value + 0]) / std[type[idx] * ndescrpt + idx_value + 0]; - } - - for (int ii = 0; ii < 1; ii++) { - row_descript[idx_value + ii] = (row_descript[idx_value + ii] - avg[type[idx] * ndescrpt + idx_value + ii]) / std[type[idx] * ndescrpt + idx_value + ii]; - } - // idy nloc, idx ndescrpt * 3 - // descript_deriv[idy * ndescrpt * 3 
+ idx] = (descript_deriv_dev[idy * (ndescrpt * 3) + idx]) / std[type[idy] * ndescrpt + idx / 3]; - for (int ii = 0; ii < 3; ii++) { - row_descript_deriv[idx_deriv + ii] /= std[type[idx] * ndescrpt + (idx_deriv + ii) / 3]; } } @@ -383,26 +394,7 @@ void DescrptSeRGPUExecuteFunctor::operator()(const FPTYPE * coord, const ); } - const int nblock_ = (sec_a.back() + LEN -1) / LEN; - dim3 block_grid(nloc, nblock_); - dim3 thread_grid(1, LEN); - compute_descriptor_se_r<<>> ( - descript, - ndescrpt, - descript_deriv, - ndescrpt * 3, - rij, - nnei * 3, - type, - avg, - std, - nlist, - nnei, - coord, - rcut_r_smth, - rcut_r, - sec_a.back() - ); + compute_descriptor_se_r <<>> (descript, ndescrpt, descript_deriv, ndescrpt * 3, rij, nnei * 3, type, avg, std, nlist, nnei, coord, rcut_r_smth, rcut_r, sec_a.back()); } template struct DescrptSeRGPUExecuteFunctor; diff --git a/source/op/cuda/prod_force_se_a.cu b/source/op/cuda/prod_force_se_a.cu index 1667c15f90..84615ff275 100644 --- a/source/op/cuda/prod_force_se_a.cu +++ b/source/op/cuda/prod_force_se_a.cu @@ -14,23 +14,44 @@ static __inline__ __device__ double atomicAdd(double* address, double val) { } #endif -template -__global__ void deriv_wrt_center_atom_se_a(FPTYPE * force, - const FPTYPE * net_deriv, - const FPTYPE * in_deriv, - const int ndescrpt) +template < + typename FPTYPE, + int THREADS_PER_BLOCK> +__global__ void force_deriv_wrt_center_atom_se_a(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int ndescrpt) { - const unsigned int idx = blockIdx.x; - const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; - const unsigned int idz = threadIdx.x; + __shared__ FPTYPE data[THREADS_PER_BLOCK * 3]; + unsigned int bid = blockIdx.x; + unsigned int tid = threadIdx.x; + for (int ii = tid; ii < THREADS_PER_BLOCK * 3; ii += THREADS_PER_BLOCK) { + data[ii] = 0.f; + } - if (idy >= ndescrpt) {return;} - - atomicAdd(force + idx * 3 + idz, -1.0 * net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt 
* 3 + idy * 3 + idz]); + for (int ii = tid; ii < ndescrpt; ii += THREADS_PER_BLOCK) { + for (int jj = 0; jj < 3; jj++) { + data[jj * THREADS_PER_BLOCK + tid] += net_deriv[bid * ndescrpt + ii] * in_deriv[bid * ndescrpt * 3 + ii * 3 + jj]; + } + } + __syncthreads(); + + // do reduction in shared memory + for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 1) { + if (tid < ii) { + for (int jj = 0; jj < 3; jj++) { + data[jj * THREADS_PER_BLOCK + tid] += data[jj * THREADS_PER_BLOCK + tid + ii]; + } + } + __syncthreads(); + } + // write result for this block to global memory + if (tid == 0) { + force[bid * 3 + 0] -= data[THREADS_PER_BLOCK * 0]; + force[bid * 3 + 1] -= data[THREADS_PER_BLOCK * 1]; + force[bid * 3 + 2] -= data[THREADS_PER_BLOCK * 2]; + } } template -__global__ void deriv_wrt_neighbors_se_a(FPTYPE * force, +__global__ void force_deriv_wrt_neighbors_se_a(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, @@ -71,17 +92,13 @@ void ProdForceSeAGPUExecuteFunctor::operator()(FPTYPE * force, { // std::cout << "I'm here!" 
<< std::endl; cudaErrcheck(cudaMemset(force, 0.0, sizeof(FPTYPE) * nall * 3)); - const int LEN1 = 256; - const int nblock1 = (ndescrpt + LEN1 -1) / LEN1; - dim3 grid(nloc, nblock1); - dim3 thread(3, LEN1); - deriv_wrt_center_atom_se_a<<>>(force, net_deriv, in_deriv, ndescrpt); + force_deriv_wrt_center_atom_se_a <<>>(force, net_deriv, in_deriv, ndescrpt); const int LEN = 64; int nblock = (nloc + LEN -1) / LEN; dim3 block_grid(nblock, nnei); dim3 thread_grid(LEN, 3, 4); - deriv_wrt_neighbors_se_a<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt, n_a_sel, n_a_shift); + force_deriv_wrt_neighbors_se_a<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt, n_a_sel, n_a_shift); } template struct ProdForceSeAGPUExecuteFunctor; diff --git a/source/op/cuda/prod_force_se_r.cu b/source/op/cuda/prod_force_se_r.cu index 5a4b582dd0..88e2962536 100644 --- a/source/op/cuda/prod_force_se_r.cu +++ b/source/op/cuda/prod_force_se_r.cu @@ -14,23 +14,44 @@ static __inline__ __device__ double atomicAdd(double* address, double val) { } #endif -template -__global__ void deriv_wrt_center_atom_se_r(FPTYPE * force, - const FPTYPE * net_deriv, - const FPTYPE * in_deriv, - const int ndescrpt) +template < + typename FPTYPE, + int THREADS_PER_BLOCK> +__global__ void force_deriv_wrt_center_atom_se_r(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int ndescrpt) { - const unsigned int idx = blockIdx.x; - const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; - const unsigned int idz = threadIdx.x; + __shared__ FPTYPE data[THREADS_PER_BLOCK * 3]; + unsigned int bid = blockIdx.x; + unsigned int tid = threadIdx.x; + for (int ii = tid; ii < THREADS_PER_BLOCK * 3; ii += THREADS_PER_BLOCK) { + data[ii] = 0.f; + } - if (idy >= ndescrpt) {return;} - - atomicAdd(force + idx * 3 + idz, -1.0 * net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); + for (int ii = tid; ii < ndescrpt; ii += THREADS_PER_BLOCK) { + for (int jj = 0; jj < 3; 
jj++) { + data[jj * THREADS_PER_BLOCK + tid] += net_deriv[bid * ndescrpt + ii] * in_deriv[bid * ndescrpt * 3 + ii * 3 + jj]; + } + } + __syncthreads(); + + // do reduction in shared memory + for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 1) { + if (tid < ii) { + for (int jj = 0; jj < 3; jj++) { + data[jj * THREADS_PER_BLOCK + tid] += data[jj * THREADS_PER_BLOCK + tid + ii]; + } + } + __syncthreads(); + } + // write result for this block to global memory + if (tid == 0) { + force[bid * 3 + 0] -= data[THREADS_PER_BLOCK * 0]; + force[bid * 3 + 1] -= data[THREADS_PER_BLOCK * 1]; + force[bid * 3 + 2] -= data[THREADS_PER_BLOCK * 2]; + } } template -__global__ void deriv_wrt_neighbors_se_r(FPTYPE * force, +__global__ void force_deriv_wrt_neighbors_se_r(FPTYPE * force, const FPTYPE * net_deriv, const FPTYPE * in_deriv, const int * nlist, @@ -66,17 +87,13 @@ void ProdForceSeRGPUExecuteFunctor::operator()(FPTYPE * force, { // std::cout << "I'm here!" << std::endl; cudaErrcheck(cudaMemset(force, 0.0, sizeof(FPTYPE) * nall * 3)); - const int LEN1 = 256; - const int nblock1 = (ndescrpt + LEN1 -1) / LEN1; - dim3 grid(nloc, nblock1); - dim3 thread(3, LEN1); - deriv_wrt_center_atom_se_r<<>>(force, net_deriv, in_deriv, ndescrpt); + force_deriv_wrt_center_atom_se_r <<>>(force, net_deriv, in_deriv, ndescrpt); const int LEN = 64; int nblock = (nloc + LEN -1) / LEN; dim3 block_grid(nblock, nnei); dim3 thread_grid(LEN, 3); - deriv_wrt_neighbors_se_r<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt); + force_deriv_wrt_neighbors_se_r<<>>(force, net_deriv, in_deriv, nlist, nloc, nnei, ndescrpt); } template struct ProdForceSeRGPUExecuteFunctor; diff --git a/source/op/cuda/tabulate.cu b/source/op/cuda/tabulate.cu new file mode 100644 index 0000000000..db1b917f02 --- /dev/null +++ b/source/op/cuda/tabulate.cu @@ -0,0 +1,478 @@ +#include +#include +#include +#include +#include // or equivalently +#include +#include "DeviceFunctor.h" + +#define MM 4 +#define KK 4 +#define TPB 
256 +#define WARP_SIZE 32 +#define FULL_MASK 0xffffffff + +template +__forceinline__ +__device__ +void locate_xx(const FPTYPE& lower, const FPTYPE& upper, const FPTYPE& max, const FPTYPE& stride0, const FPTYPE& stride1, FPTYPE& xx, int& table_idx) { + if (xx < lower) { + table_idx = 0; + xx = 0; + } + else if (xx < upper) { + table_idx = (int)((xx - lower) / stride0); + xx -= (table_idx * stride0 + lower); + } + else if (xx < max) { + int first_stride = int((upper - lower) / stride0); + table_idx = first_stride + (int)((xx - upper) / stride1); + xx -= ((table_idx - first_stride) * stride1 + upper); + } + else { + table_idx = int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1; + xx = 0; + } +} + +template +__forceinline__ +__device__ +FPTYPE dot(FPTYPE ll[4], FPTYPE rr[4]) { + return ll[0] * rr[0] + ll[1] * rr[1] + ll[2] * rr[2] + ll[3] * rr[3]; +} + +template +__forceinline__ +__device__ +void warp_reduce(FPTYPE & val) { + for (int offset = 16; offset > 0; offset >>= 1) + val += __shfl_down_sync(FULL_MASK, val, offset); +} + +// last_layer_size must larger than MTILE * KTILE! +// TODO: A more flexible implementation of sparse +template < + typename FPTYPE, + int MTILE, + int KTILE> +__global__ void tabulate_fusion(const FPTYPE * table, const FPTYPE * in, const FPTYPE * ff, FPTYPE * out, const FPTYPE lower, const FPTYPE upper, const FPTYPE max, const FPTYPE stride0, const FPTYPE stride1, const int nnei, const int last_layer_size) { + extern __shared__ int _data[]; + int const block_idx = blockIdx.x; // nloc + int const thread_idx = threadIdx.x; // last_layer_size + FPTYPE ago = __shfl_sync(0xffffffff, in[block_idx * nnei + nnei - 1], 0); + bool unloop = false; + int breakpoint = nnei - 1; + // int const warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + // int const lane_idx = threadIdx.x % 32; + // iteratorC for data reuse... 
+ FPTYPE * iteratorC = (FPTYPE*) &_data[0]; + for (int kk = 0; kk < MTILE; kk++) + iteratorC[kk * last_layer_size + thread_idx] = 0.f; + __syncthreads(); + + for (int ii = 0; ii < nnei; ii++) { + FPTYPE var[4]; + FPTYPE xx = in[block_idx * nnei + ii]; + + if (ago == xx) { + unloop = true; + breakpoint = ii; + } + int table_idx = 0; + locate_xx(lower, upper, max, stride0, stride1, xx, table_idx); + var[0] = table[table_idx * last_layer_size * 4 + thread_idx * 4 + 0]; + var[1] = table[table_idx * last_layer_size * 4 + thread_idx * 4 + 1]; + var[2] = table[table_idx * last_layer_size * 4 + thread_idx * 4 + 2]; + var[3] = table[table_idx * last_layer_size * 4 + thread_idx * 4 + 3]; + FPTYPE res = ((var[0] * xx + var[1]) * xx + var[2]) * xx + var[3]; + for (int kk = 0; kk < MTILE; kk++) { + iteratorC[kk * last_layer_size + thread_idx] += (nnei - breakpoint) * ff[block_idx * nnei * MTILE + ii * MTILE + kk] * res; + } + if (unloop) break; + } + for (int ii = 0; ii < MTILE; ii++) { + out[block_idx * MTILE * last_layer_size + ii * last_layer_size + thread_idx] = iteratorC[ii * last_layer_size + thread_idx]; + } +} + +// last_layer_size must larger than MTILE * KTILE! 
+// TODO: A more flexible implementation of sparse + + +template < + typename FPTYPE, + int MTILE, + int KTILE> +__global__ void tabulate_fusion_grad_warp_reduce(const FPTYPE * table, const FPTYPE * in, const FPTYPE * ff, const FPTYPE * dy, FPTYPE * dy_dx, FPTYPE * dy_df, const FPTYPE lower, const FPTYPE upper, const FPTYPE max, const FPTYPE stride0, const FPTYPE stride1, const int nnei, const int last_layer_size) { + extern __shared__ int _data[]; + int const block_idx = blockIdx.x; // nloc + int const thread_idx = threadIdx.x; // KTILE * WARP_SIZE, usally 128 here~ + int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int lane_idx = threadIdx.x % 32; + int breakpoint = nnei - 1; + bool unloop = false; + + FPTYPE * iteratorA = (FPTYPE *)&_data[0]; // dy + for (int ii = 0; ii < MTILE; ii++) { + if (thread_idx < last_layer_size) { + iteratorA[ii * last_layer_size + thread_idx] = dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + thread_idx]; + } + } + __syncthreads(); + FPTYPE ago = __shfl_sync(0xffffffff, in[block_idx * nnei + nnei - 1], 0); + for (int ii = 0; ii < nnei; ii += KTILE) { + FPTYPE xx = in[block_idx * nnei + ii + warp_idx]; + // if (ago == xx) { + // unloop = true; + // breakpoint = ii; + // } + + int table_idx = 0; + locate_xx(lower, upper, max, stride0, stride1, xx, table_idx); + FPTYPE sum[KTILE] = {0.f}; + FPTYPE Csub = 0.f; + for (int jj = lane_idx; jj < last_layer_size; jj += WARP_SIZE) { + // load iteratorB through table + FPTYPE var[KTILE]; + var[0] = table[table_idx * last_layer_size * 4 + jj * 4 + 0]; + var[1] = table[table_idx * last_layer_size * 4 + jj * 4 + 1]; + var[2] = table[table_idx * last_layer_size * 4 + jj * 4 + 2]; + var[3] = table[table_idx * last_layer_size * 4 + jj * 4 + 3]; + FPTYPE tmp = (var[0] * xx + var[1]) * xx + var[2]; + for (int kk = 0; kk < KTILE; kk++) { + sum[kk] += (nnei - breakpoint) * iteratorA[kk * last_layer_size + jj] * (tmp * xx + var[3]); + } + var[2] = ff[block_idx * nnei * MTILE + 
(ii + warp_idx) * 4 + 0] * iteratorA[0 * last_layer_size + jj]; + var[2] += ff[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 1] * iteratorA[1 * last_layer_size + jj]; + var[2] += ff[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 2] * iteratorA[2 * last_layer_size + jj]; + var[2] += ff[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 3] * iteratorA[3 * last_layer_size + jj]; + Csub += (nnei - breakpoint) * ((2.0 * var[0] * xx + var[1]) * xx + tmp) * var[2]; + } + __syncwarp(); + for (int kk = 0; kk < KTILE; kk++) { + warp_reduce(sum[kk]); + } + warp_reduce(Csub); + if (lane_idx == 0) { + for (int kk = 0; kk < KTILE; kk++) { + dy_df[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + kk] = sum[kk]; + } + dy_dx[block_idx * nnei + ii + warp_idx] = Csub; + } + if (unloop) break; + } +} + +template < + typename FPTYPE, + int MTILE, + int KTILE> +__global__ void tabulate_fusion_special(const FPTYPE * table, const FPTYPE * in, const FPTYPE * ff, FPTYPE * out, const FPTYPE lower, const FPTYPE upper, const FPTYPE max, const FPTYPE stride0, const FPTYPE stride1, const int nnei, const int last_layer_size) { + extern __shared__ int _data[]; + int const block_idx = blockIdx.x; // nloc + int const thread_idx = threadIdx.x; // last_layer_size + FPTYPE ago = __shfl_sync(0xffffffff, in[block_idx * nnei + nnei - 1], 0); + bool unloop = false; + int breakpoint = nnei - 1; + + FPTYPE * iteratorC = (FPTYPE*) &_data[0]; + for (int kk = 0; kk < MTILE; kk++) + iteratorC[kk * last_layer_size + thread_idx] = 0.f; + __syncthreads(); + + for (int ii = 0; ii < nnei; ii++) { + FPTYPE var[6]; + FPTYPE xx = in[block_idx * nnei + ii]; + if (xx == ago) { + unloop = true; + breakpoint = ii; + } + int table_idx = 0; + locate_xx(lower, upper, max, stride0, stride1, xx, table_idx); + var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0]; + var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1]; + var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2]; + 
var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3]; + var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4]; + var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5]; + FPTYPE res = var[0] + (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * xx; + + for (int kk = 0; kk < MTILE; kk++) { + iteratorC[kk * last_layer_size + thread_idx] += (nnei - breakpoint) * ff[block_idx * nnei * MTILE + ii * MTILE + kk] * res; + } + if (unloop) break; + } + for (int ii = 0; ii < MTILE; ii++) { + out[block_idx * MTILE * last_layer_size + ii * last_layer_size + thread_idx] = iteratorC[ii * last_layer_size + thread_idx]; + } +} + +template < + typename FPTYPE, + int MTILE, + int KTILE> +__global__ void tabulate_fusion_grad_warp_reduce_special(const FPTYPE * table, const FPTYPE * in, const FPTYPE * ff, const FPTYPE * dy, FPTYPE * dy_dx, FPTYPE * dy_df, const FPTYPE lower, const FPTYPE upper, const FPTYPE max, const FPTYPE stride0, const FPTYPE stride1, const int nnei, const int last_layer_size) { + extern __shared__ int _data[]; + int const block_idx = blockIdx.x; // nloc + int const thread_idx = threadIdx.x; // KTILE * WARP_SIZE, usally 128 here~ + int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int lane_idx = threadIdx.x % 32; + int breakpoint = nnei - 1; + bool unloop = false; + + FPTYPE * iteratorA = (FPTYPE *)&_data[0]; // dy + for (int ii = 0; ii < MTILE; ii++) { + if (thread_idx < last_layer_size) { + iteratorA[ii * last_layer_size + thread_idx] = dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + thread_idx]; + } + } + __syncthreads(); + FPTYPE ago = __shfl_sync(0xffffffff, in[block_idx * nnei + nnei - 1], 0); + for (int ii = 0; ii < nnei; ii += KTILE) { + FPTYPE xx = in[block_idx * nnei + ii + warp_idx]; + if (ago == xx) { + unloop = true; + breakpoint = ii; + } + + int table_idx = 0; + locate_xx(lower, upper, max, stride0, stride1, xx, table_idx); + FPTYPE sum[KTILE] = {0.f}; + 
FPTYPE Csub = 0.f; + for (int jj = lane_idx; jj < last_layer_size; jj += WARP_SIZE) { + FPTYPE var[6]; + // load iteratorB through table + var[0] = table[table_idx * last_layer_size * 6 + 6 * jj + 0]; + var[1] = table[table_idx * last_layer_size * 6 + 6 * jj + 1]; + var[2] = table[table_idx * last_layer_size * 6 + 6 * jj + 2]; + var[3] = table[table_idx * last_layer_size * 6 + 6 * jj + 3]; + var[4] = table[table_idx * last_layer_size * 6 + 6 * jj + 4]; + var[5] = table[table_idx * last_layer_size * 6 + 6 * jj + 5]; + FPTYPE res = var[0] + (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * xx; + + for (int kk = 0; kk < KTILE; kk++) { + sum[kk] += (nnei - breakpoint) * iteratorA[kk * last_layer_size + jj] * res; + } + res = ff[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 0] * iteratorA[0 * last_layer_size + jj]; + res += ff[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 1] * iteratorA[1 * last_layer_size + jj]; + res += ff[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 2] * iteratorA[2 * last_layer_size + jj]; + res += ff[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + 3] * iteratorA[3 * last_layer_size + jj]; + Csub += (nnei - breakpoint) * (var[1] + (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * xx) * res; + } + __syncwarp(); + for (int kk = 0; kk < KTILE; kk++) { + warp_reduce(sum[kk]); + } + warp_reduce(Csub); + if (lane_idx == 0) { + for (int kk = 0; kk < KTILE; kk++) { + dy_df[block_idx * nnei * MTILE + (ii + warp_idx) * 4 + kk] = sum[kk]; + } + dy_dx[block_idx * nnei + ii + warp_idx] = Csub; + } + if (unloop) break; + } +} + +template +__global__ void tabulate_checker(const FPTYPE * in, int * out, const FPTYPE lower, const FPTYPE upper, const FPTYPE max, const int nloc, const int nnei) { + __shared__ int Csub[THREADS_PER_BLOCK]; + __shared__ int Dsub[THREADS_PER_BLOCK]; + int const bid = blockIdx.x; + int const tid = threadIdx.x; + + Csub[tid] = 0; + Dsub[tid] = 0; + __syncthreads(); + + for (int ii = 
tid; ii < nnei; ii += THREADS_PER_BLOCK) { + FPTYPE xx = in[bid * nnei + ii]; + if (xx < lower || xx > max) { + Csub[tid] += 1; + // printf("# DEEPMD: level 2 overflow, xx:\t%f\n", xx); + } + else if (xx >= upper && xx <= max) { + Dsub[tid] += 1; + // printf("# DEEPMD: level 1 overflow, xx:\t%f\n", xx); + } + } + __syncthreads(); + // do reduction in shared memory + for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 1) { + if (tid < ii) { + Csub[tid] += Csub[tid + ii]; + Dsub[tid] += Dsub[tid + ii]; + } + __syncthreads(); + } + if (tid == 0) { + out[bid] = Csub[0]; + out[nloc + bid] = Dsub[0]; + } +} + +void TabulateFusionLauncher(const double * table, const double * table_info, const double * in, const double * ff, const int nloc, const int nnei, const int last_layer_size, double * out) { + // std::cout << "I'm in tabulate GPU!" << std::endl; + tabulate_fusion_special <<>>(table, in, ff, out, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); +} +void TabulateFusionLauncher(const float * table, const float * table_info, const float * in, const float * ff, const int nloc, const int nnei, const int last_layer_size, float * out) { + tabulate_fusion_special <<>>(table, in, ff, out, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); +} + +void TabulateFusionGradLauncher(const double * table, const double * table_info, const double * in, const double * ff, const double * dy, const int nloc, const int nnei, const int last_layer_size, double * dy_dx, double * dy_df) { + // cudaMemset(dy_df, 0.0, sizeof(double) * nloc * nnei * 4); + cudaMemset(dy_dx, 0.0, sizeof(double) * nloc * nnei); + cudaMemset(dy_df, 0.0, sizeof(double) * nloc * nnei * 4); + tabulate_fusion_grad_warp_reduce_special <<>>(table, in, ff, dy, dy_dx, dy_df, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); +} +void TabulateFusionGradLauncher(const float * table, 
const float * table_info, const float * in, const float * ff, const float * dy, const int nloc, const int nnei, const int last_layer_size, float * dy_dx, float * dy_df) { + // cudaMemset(dy_df, 0.0, sizeof(float) * nloc * nnei * 4); + cudaMemset(dy_dx, 0.0, sizeof(float) * nloc * nnei); + cudaMemset(dy_df, 0.0, sizeof(float) * nloc * nnei * 4); + tabulate_fusion_grad_warp_reduce_special <<>>(table, in, ff, dy, dy_dx, dy_df, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); +} + +void TabulateCheckerLauncher(const double * table_info, const double * in, int * out, const int nloc, const int nnei) { + tabulate_checker <<>>(in, out, table_info[0], table_info[1], table_info[2], nloc, nnei); + // Declare, allocate, and initialize device-accessible pointers for input and output + int * d_out = NULL; + int * h_out = NULL; + cudaMalloc((void **)&d_out, sizeof(int)); + h_out = (int*)malloc(sizeof(int)); + // Determine temporary device storage requirements + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, out, d_out, nloc); + + // Allocate temporary storage + cudaMalloc(&d_temp_storage, temp_storage_bytes); + + // Run sum-reduction + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, out, d_out, nloc); + + // d_out <-- [38] + cudaMemcpy(h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost); + + if(h_out[0] > 0) { + std::cout << "# DEEPMD: warning! some values [" << h_out[0] << "/" << nloc * nnei << "] overflow the range of the table, using the endpoint approximate processing.." << std::endl; + } + + // Run sum-reduction + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, out + nloc, d_out, nloc); + + // d_out <-- [38] + cudaMemcpy(h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost); + + if(h_out[0] > 0) { + std::cout << "# DEEPMD: warning! 
some values [" << h_out[0] << "/" << nloc * nnei << "] overflow the range of the table, using second table approximate processing.." << std::endl; + } + + // free the temperary storage + cudaFree(d_out); + cudaFree(d_temp_storage); + free(h_out); +} + +void TabulateCheckerLauncher(const float * table_info, const float * in, int * out, const int nloc, const int nnei) { + tabulate_checker <<>>(in, out, table_info[0], table_info[1], table_info[2], nloc, nnei); + // Declare, allocate, and initialize device-accessible pointers for input and output + int * d_out = NULL; + int * h_out = NULL; + cudaMalloc((void **)&d_out, sizeof(int)); + h_out = (int*)malloc(sizeof(int)); + // Determine temporary device storage requirements + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, out, d_out, nloc); + + // Allocate temporary storage + cudaMalloc(&d_temp_storage, temp_storage_bytes); + + // Run sum-reduction + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, out, d_out, nloc); + + // d_out <-- [38] + cudaMemcpy(h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost); + + if(h_out[0] > 0) { + std::cout << "# DEEPMD: warning! some values [" << h_out[0] << "/" << nloc * nnei << "] overflow the range of the table, using the endpoint approximate processing.." << std::endl; + } + + // Run sum-reduction + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, out + nloc, d_out, nloc); + + // d_out <-- [38] + cudaMemcpy(h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost); + + if(h_out[0] > 0) { + std::cout << "# DEEPMD: warning! some values [" << h_out[0] << "/" << nloc * nnei << "] overflow the range of the table, using second table approximate processing.." 
<< std::endl; + } + + // free the temperary storage + cudaFree(d_out); + cudaFree(d_temp_storage); + free(h_out); +} + +template +void TabulateFusionGPUExecuteFunctor::operator()(const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const int nloc, const int nnei, const int last_layer_size, FPTYPE * out) { + tabulate_fusion_special <<>>(table, in, ff, out, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); +} + +template +void TabulateFusionGradGPUExecuteFunctor::operator()(const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const FPTYPE * dy, const int nloc, const int nnei, const int last_layer_size, FPTYPE * dy_dx, FPTYPE * dy_df) { + cudaErrcheck(cudaMemset(dy_dx, 0.0, sizeof(FPTYPE) * nloc * nnei)); + cudaErrcheck(cudaMemset(dy_df, 0.0, sizeof(FPTYPE) * nloc * nnei * 4)); + tabulate_fusion_grad_warp_reduce_special <<>>(table, in, ff, dy, dy_dx, dy_df, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); +} + +template +void TabulateCheckerGPUExecuteFunctor::operator()(const FPTYPE * table_info, const FPTYPE * in, int * out, const int nloc, const int nnei) { + tabulate_checker <<>>(in, out, table_info[0], table_info[1], table_info[2], nloc, nnei); + // Declare, allocate, and initialize device-accessible pointers for input and output + int * d_out = NULL; + int * h_out = NULL; + cudaMalloc((void **)&d_out, sizeof(int)); + h_out = (int*)malloc(sizeof(int)); + // Determine temporary device storage requirements + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, out, d_out, nloc); + + // Allocate temporary storage + cudaMalloc(&d_temp_storage, temp_storage_bytes); + + // Run sum-reduction + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, out, d_out, nloc); + + // d_out <-- [38] + cudaMemcpy(h_out, d_out, sizeof(int), 
cudaMemcpyDeviceToHost); + + if(h_out[0] > 0) { + std::cout << "# DEEPMD: warning! some values [" << h_out[0] << "/" << nloc * nnei << "] overflow the range of the table, using the endpoint approximate processing.." << std::endl; + } + + // Run sum-reduction + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, out + nloc, d_out, nloc); + + // d_out <-- [38] + cudaMemcpy(h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost); + + if(h_out[0] > 0) { + std::cout << "# DEEPMD: warning! some values [" << h_out[0] << "/" << nloc * nnei << "] overflow the range of the table, using second table approximate processing.." << std::endl; + } + + // free the temperary storage + cudaFree(d_out); + cudaFree(d_temp_storage); + free(h_out); +} + +template struct TabulateFusionGPUExecuteFunctor; +template struct TabulateFusionGPUExecuteFunctor; +template struct TabulateFusionGradGPUExecuteFunctor; +template struct TabulateFusionGradGPUExecuteFunctor; +template struct TabulateCheckerGPUExecuteFunctor; +template struct TabulateCheckerGPUExecuteFunctor; \ No newline at end of file diff --git a/source/op/descrpt_se_a_ef.cc b/source/op/descrpt_se_a_ef.cc index b4a631b7cf..9aca7eb720 100644 --- a/source/op/descrpt_se_a_ef.cc +++ b/source/op/descrpt_se_a_ef.cc @@ -12,52 +12,30 @@ typedef double compute_t; using namespace tensorflow; // using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE ; -#else -typedef float VALUETYPE ; -#endif +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; -#ifdef HIGH_PREC REGISTER_OP("DescrptSeAEf") -.Input("coord: double") +.Attr("T: {float, double}") +.Input("coord: T") .Input("type: int32") .Input("natoms: int32") -.Input("box: double") +.Input("box: T") .Input("mesh: int32") -.Input("ef: double") -.Input("davg: double") -.Input("dstd: double") +.Input("ef: T") +.Input("davg: T") +.Input("dstd: T") .Attr("rcut_a: float") .Attr("rcut_r: float") .Attr("rcut_r_smth: float") .Attr("sel_a: list(int)") .Attr("sel_r: 
list(int)") -.Output("descrpt: double") -.Output("descrpt_deriv: double") -.Output("rij: double") +.Output("descrpt: T") +.Output("descrpt_deriv: T") +.Output("rij: T") .Output("nlist: int32"); -#else -REGISTER_OP("DescrptSeAEf") -.Input("coord: float") -.Input("type: int32") -.Input("natoms: int32") -.Input("box: float") -.Input("mesh: int32") -.Input("ef: float") -.Input("davg: float") -.Input("dstd: float") -.Attr("rcut_a: float") -.Attr("rcut_r: float") -.Attr("rcut_r_smth: float") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Output("descrpt: float") -.Output("descrpt_deriv: float") -.Output("rij: float") -.Output("nlist: int32"); -#endif +template class DescrptSeAEfOp : public OpKernel { public: explicit DescrptSeAEfOp(OpKernelConstruction* context) : OpKernel(context) { @@ -186,16 +164,16 @@ class DescrptSeAEfOp : public OpKernel { nlist_shape, &nlist_tensor)); - auto coord = coord_tensor .matrix(); + auto coord = coord_tensor .matrix(); auto type = type_tensor .matrix(); - auto box = box_tensor .matrix(); + auto box = box_tensor .matrix(); auto mesh = mesh_tensor .flat(); - auto ef = ef_tensor .matrix(); - auto avg = avg_tensor .matrix(); - auto std = std_tensor .matrix(); - auto descrpt = descrpt_tensor ->matrix(); - auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); - auto rij = rij_tensor ->matrix(); + auto ef = ef_tensor .matrix(); + auto avg = avg_tensor .matrix(); + auto std = std_tensor .matrix(); + auto descrpt = descrpt_tensor ->matrix(); + auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); + auto rij = rij_tensor ->matrix(); auto nlist = nlist_tensor ->matrix(); // // check the types @@ -385,5 +363,9 @@ class DescrptSeAEfOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("DescrptSeAEf").Device(DEVICE_CPU), DescrptSeAEfOp); - +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeAEf").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptSeAEfOp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git 
a/source/op/descrpt_se_a_ef_para.cc b/source/op/descrpt_se_a_ef_para.cc index af17b3ca12..2dccb40ae3 100644 --- a/source/op/descrpt_se_a_ef_para.cc +++ b/source/op/descrpt_se_a_ef_para.cc @@ -11,53 +11,30 @@ typedef double compute_t; using namespace tensorflow; // using namespace std; +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; -#ifdef HIGH_PREC -typedef double VALUETYPE ; -#else -typedef float VALUETYPE ; -#endif - -#ifdef HIGH_PREC -REGISTER_OP("DescrptSeAEfPara") -.Input("coord: double") -.Input("type: int32") -.Input("natoms: int32") -.Input("box: double") -.Input("mesh: int32") -.Input("ef: double") -.Input("davg: double") -.Input("dstd: double") -.Attr("rcut_a: float") -.Attr("rcut_r: float") -.Attr("rcut_r_smth: float") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Output("descrpt: double") -.Output("descrpt_deriv: double") -.Output("rij: double") -.Output("nlist: int32"); -#else REGISTER_OP("DescrptSeAEfPara") -.Input("coord: float") +.Attr("T: {float, double}") +.Input("coord: T") .Input("type: int32") .Input("natoms: int32") -.Input("box: float") +.Input("box: T") .Input("mesh: int32") -.Input("ef: float") -.Input("davg: float") -.Input("dstd: float") +.Input("ef: T") +.Input("davg: T") +.Input("dstd: T") .Attr("rcut_a: float") .Attr("rcut_r: float") .Attr("rcut_r_smth: float") .Attr("sel_a: list(int)") .Attr("sel_r: list(int)") -.Output("descrpt: float") -.Output("descrpt_deriv: float") -.Output("rij: float") +.Output("descrpt: T") +.Output("descrpt_deriv: T") +.Output("rij: T") .Output("nlist: int32"); -#endif +template class DescrptSeAEfParaOp : public OpKernel { public: explicit DescrptSeAEfParaOp(OpKernelConstruction* context) : OpKernel(context) { @@ -186,16 +163,16 @@ class DescrptSeAEfParaOp : public OpKernel { nlist_shape, &nlist_tensor)); - auto coord = coord_tensor .matrix(); + auto coord = coord_tensor .matrix(); auto type = type_tensor .matrix(); - auto box = box_tensor .matrix(); + auto box = 
box_tensor .matrix(); auto mesh = mesh_tensor .flat(); - auto ef = ef_tensor .matrix(); - auto avg = avg_tensor .matrix(); - auto std = std_tensor .matrix(); - auto descrpt = descrpt_tensor ->matrix(); - auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); - auto rij = rij_tensor ->matrix(); + auto ef = ef_tensor .matrix(); + auto avg = avg_tensor .matrix(); + auto std = std_tensor .matrix(); + auto descrpt = descrpt_tensor ->matrix(); + auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); + auto rij = rij_tensor ->matrix(); auto nlist = nlist_tensor ->matrix(); // // check the types @@ -385,5 +362,9 @@ class DescrptSeAEfParaOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("DescrptSeAEfPara").Device(DEVICE_CPU), DescrptSeAEfParaOp); - +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeAEfPara").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptSeAEfParaOp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/descrpt_se_a_ef_vert.cc b/source/op/descrpt_se_a_ef_vert.cc index 1d416864e2..5734a8811f 100644 --- a/source/op/descrpt_se_a_ef_vert.cc +++ b/source/op/descrpt_se_a_ef_vert.cc @@ -12,52 +12,30 @@ typedef double compute_t; using namespace tensorflow; // using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE ; -#else -typedef float VALUETYPE ; -#endif +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; -#ifdef HIGH_PREC REGISTER_OP("DescrptSeAEfVert") -.Input("coord: double") +.Attr("T: {float, double}") +.Input("coord: T") .Input("type: int32") .Input("natoms: int32") -.Input("box: double") +.Input("box: T") .Input("mesh: int32") -.Input("ef: double") -.Input("davg: double") -.Input("dstd: double") +.Input("ef: T") +.Input("davg: T") +.Input("dstd: T") .Attr("rcut_a: float") .Attr("rcut_r: float") .Attr("rcut_r_smth: float") .Attr("sel_a: list(int)") .Attr("sel_r: list(int)") -.Output("descrpt: double") -.Output("descrpt_deriv: double") -.Output("rij: double") 
+.Output("descrpt: T") +.Output("descrpt_deriv: T") +.Output("rij: T") .Output("nlist: int32"); -#else -REGISTER_OP("DescrptSeAEfVert") -.Input("coord: float") -.Input("type: int32") -.Input("natoms: int32") -.Input("box: float") -.Input("mesh: int32") -.Input("ef: float") -.Input("davg: float") -.Input("dstd: float") -.Attr("rcut_a: float") -.Attr("rcut_r: float") -.Attr("rcut_r_smth: float") -.Attr("sel_a: list(int)") -.Attr("sel_r: list(int)") -.Output("descrpt: float") -.Output("descrpt_deriv: float") -.Output("rij: float") -.Output("nlist: int32"); -#endif +template class DescrptSeAEfVertOp : public OpKernel { public: explicit DescrptSeAEfVertOp(OpKernelConstruction* context) : OpKernel(context) { @@ -186,16 +164,16 @@ class DescrptSeAEfVertOp : public OpKernel { nlist_shape, &nlist_tensor)); - auto coord = coord_tensor .matrix(); + auto coord = coord_tensor .matrix(); auto type = type_tensor .matrix(); - auto box = box_tensor .matrix(); + auto box = box_tensor .matrix(); auto mesh = mesh_tensor .flat(); - auto ef = ef_tensor .matrix(); - auto avg = avg_tensor .matrix(); - auto std = std_tensor .matrix(); - auto descrpt = descrpt_tensor ->matrix(); - auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); - auto rij = rij_tensor ->matrix(); + auto ef = ef_tensor .matrix(); + auto avg = avg_tensor .matrix(); + auto std = std_tensor .matrix(); + auto descrpt = descrpt_tensor ->matrix(); + auto descrpt_deriv = descrpt_deriv_tensor ->matrix(); + auto rij = rij_tensor ->matrix(); auto nlist = nlist_tensor ->matrix(); // // check the types @@ -385,5 +363,9 @@ class DescrptSeAEfVertOp : public OpKernel { } }; -REGISTER_KERNEL_BUILDER(Name("DescrptSeAEfVert").Device(DEVICE_CPU), DescrptSeAEfVertOp); - +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeAEfVert").Device(DEVICE_CPU).TypeConstraint("T"), \ + DescrptSeAEfVertOp); +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/map_aparam.cc 
b/source/op/map_aparam.cc index 43bc8a011f..608f5f614b 100644 --- a/source/op/map_aparam.cc +++ b/source/op/map_aparam.cc @@ -6,32 +6,19 @@ using namespace tensorflow; // using namespace std; -#ifdef HIGH_PREC -typedef double VALUETYPE; -#else -typedef float VALUETYPE; -#endif +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; -#ifdef HIGH_PREC REGISTER_OP("MapAparam") -.Input("aparam: double") +.Attr("T: {float, double}") +.Input("aparam: T") .Input("nlist: int32") .Input("natoms: int32") .Attr("n_a_sel: int") .Attr("n_r_sel: int") -.Output("output: double"); -#else -REGISTER_OP("MapAparam") -.Input("aparam: float") -.Input("nlist: int32") -.Input("natoms: int32") -.Attr("n_a_sel: int") -.Attr("n_r_sel: int") -.Output("mapped: float"); -#endif - -using namespace tensorflow; +.Output("output: T"); +template class MapAparamOp : public OpKernel { public: explicit MapAparamOp(OpKernelConstruction* context) : OpKernel(context) { @@ -73,9 +60,9 @@ class MapAparamOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output_tensor)); // flat the tensors - auto aparam = aparam_tensor.flat(); + auto aparam = aparam_tensor.flat(); auto nlist = nlist_tensor.flat(); - auto output = output_tensor->flat(); + auto output = output_tensor->flat(); // loop over samples #pragma omp parallel for @@ -110,7 +97,12 @@ class MapAparamOp : public OpKernel { int n_r_sel, n_a_sel, n_a_shift; }; -REGISTER_KERNEL_BUILDER(Name("MapAparam").Device(DEVICE_CPU), MapAparamOp); +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("MapAparam").Device(DEVICE_CPU).TypeConstraint("T"), \ + MapAparamOp); +REGISTER_CPU(float); +REGISTER_CPU(double); diff --git a/source/op/neighbor_stat.cc b/source/op/neighbor_stat.cc new file mode 100644 index 0000000000..1be0ba23d5 --- /dev/null +++ b/source/op/neighbor_stat.cc @@ -0,0 +1,192 @@ +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include 
"tensorflow/core/framework/shape_inference.h" +#include + +#include "NeighborList.h" + +typedef double boxtensor_t ; +typedef double compute_t; + +using namespace tensorflow; +// using namespace std; + +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +REGISTER_OP("NeighborStat") + .Attr("T: {float, double}") + .Input("coord: T") + .Input("type: int32") + .Input("natoms: int32") + .Input("box : T") + .Input("mesh : int32") + .Attr("rcut: float") + .Output("max_nbor_size: int32") + .Output("min_nbor_dist: T"); + +template +class NeighborStatOp : public OpKernel { +public: + explicit NeighborStatOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("rcut", &rcut)); + } + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& coord_tensor = context->input(context_input_index++); + const Tensor& type_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + const Tensor& box_tensor = context->input(context_input_index++); + const Tensor& mesh_tensor = context->input(context_input_index++); + + + OP_REQUIRES (context, (coord_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of coord should be 2")); + OP_REQUIRES (context, (type_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of type should be 2")); + OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + OP_REQUIRES (context, (box_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of box should be 2")); + OP_REQUIRES (context, (mesh_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of mesh should be 1")); + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + int nloc = natoms_tensor.flat().data()[0]; + int nall = 
natoms_tensor.flat().data()[1]; + int nsamples = coord_tensor.shape().dim_size(0); + int ntypes = natoms_tensor.shape().dim_size(0) - 2; + // check the sizes + OP_REQUIRES (context, (nsamples == type_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nsamples == box_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nall * 3 == coord_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (nall == type_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (9 == box_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of box should be 9")); + + int nei_mode = 0; + if (mesh_tensor.shape().dim_size(0) == 6) { + // manual copied pbc + assert (nloc == nall); + nei_mode = 1; + } + else if (mesh_tensor.shape().dim_size(0) == 0) { + // no pbc + nei_mode = -1; + } + else { + throw std::runtime_error("invalid mesh tensor"); + } + // if region is given extended, do not use pbc + bool b_pbc = (nei_mode >= 1 || nei_mode == -1) ? false : true; + bool b_norm_atom = (nei_mode == 1) ? 
true : false; + + TensorShape max_nbor_size_shape ; + max_nbor_size_shape.AddDim (nloc); + max_nbor_size_shape.AddDim (ntypes); + + int context_output_index = 0; + Tensor* max_nbor_size_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + max_nbor_size_shape, + &max_nbor_size_tensor)); + + const FPTYPE* coord = coord_tensor.flat().data(); + const int* type = type_tensor .flat().data(); + const FPTYPE* box = box_tensor .flat().data(); + const int* mesh = mesh_tensor .flat().data(); + int* max_nbor_size = max_nbor_size_tensor ->flat().data(); + + for (int ii = 0; ii < static_cast(max_nbor_size_tensor->NumElements()); ii++) { + max_nbor_size[ii] = 0; + } + + // set region + boxtensor_t boxt [9] = {0}; + for (int dd = 0; dd < 9; ++dd) { + boxt[dd] = box[dd]; + } + SimulationRegion region; + region.reinitBox (boxt); + // set & normalize coord + std::vector d_coord3 (nall * 3); + for (int ii = 0; ii < nall; ++ii) { + for (int dd = 0; dd < 3; ++dd) { + d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; + } + if (b_norm_atom) { + compute_t inter[3]; + region.phys2Inter (inter, &d_coord3[3 * ii]); + for (int dd = 0; dd < 3; ++dd) { + if (inter[dd] < 0 ) inter[dd] += 1.; + else if (inter[dd] >= 1) inter[dd] -= 1.; + } + region.inter2Phys (&d_coord3[3 * ii], inter); + } + } + + // set type + std::vector d_type (nall); + for (int ii = 0; ii < nall; ++ii) d_type[ii] = type[ii]; + + // build nlist + std::vector > d_nlist_a; + std::vector > d_nlist_r; + std::vector nlist_map; + bool b_nlist_map = false; + + if (nei_mode == 1) { + // std::cout << "I'm in nei_mode 1" << std::endl; + std::vector bk_d_coord3 = d_coord3; + std::vector bk_d_type = d_type; + std::vector ncell, ngcell; + copy_coord(d_coord3, d_type, nlist_map, ncell, ngcell, bk_d_coord3, bk_d_type, rcut, region); + b_nlist_map = true; + std::vector nat_stt(3, 0); + std::vector ext_stt(3), ext_end(3); + for (int dd = 0; dd < 3; ++dd) { + ext_stt[dd] = -ngcell[dd]; + ext_end[dd] = ncell[dd] 
+ ngcell[dd]; + } + ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, nloc, -1, rcut, nat_stt, ncell, ext_stt, ext_end, region, ncell); + } + else if (nei_mode == -1) { + ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, -1, rcut, NULL); + } + else { + throw std::runtime_error("unknow neighbor mode"); + } + + int MAX_NNEI = 0; + for (int ii = 0; ii < nloc; ii++) { + MAX_NNEI = MAX_NNEI < d_nlist_r[ii].size() ? d_nlist_r[ii].size() : MAX_NNEI; + } + // allocate output tensor for deepmd-kit + TensorShape min_nbor_dist_shape; + min_nbor_dist_shape.AddDim (nloc * MAX_NNEI); + Tensor* min_nbor_dist_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + min_nbor_dist_shape, + &min_nbor_dist_tensor)); + FPTYPE* min_nbor_dist = min_nbor_dist_tensor ->flat().data(); + for (int ii = 0; ii < static_cast(min_nbor_dist_tensor->NumElements()); ii++) { + min_nbor_dist[ii] = 10000.0; + } + + #pragma omp parallel for + for (int ii = 0; ii < nloc; ii++) { + for (int jj = 0; jj < d_nlist_r[ii].size(); jj++) { + int type = d_type[d_nlist_r[ii][jj]]; + max_nbor_size[ii * ntypes + type] += 1; + compute_t rij[3] = {d_coord3[d_nlist_r[ii][jj] * 3 + 0] - d_coord3[ii * 3 + 0], d_coord3[d_nlist_r[ii][jj] * 3 + 1] - d_coord3[ii * 3 + 1], d_coord3[d_nlist_r[ii][jj] * 3 + 2] - d_coord3[ii * 3 + 2]}; + min_nbor_dist[ii * MAX_NNEI + jj] = sqrt(rij[0] * rij[0] + rij[1] * rij[1] + rij[2] * rij[2]); + } + } + } + +private: + int nnei; + float rcut; +}; + +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("NeighborStat").Device(DEVICE_CPU).TypeConstraint("T"), \ + NeighborStatOp); +REGISTER_CPU(float); +REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/tabulate.cc b/source/op/tabulate.cc new file mode 100644 index 0000000000..1ca5b774ea --- /dev/null +++ b/source/op/tabulate.cc @@ -0,0 +1,385 @@ +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include 
"tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/shape_inference.h" + +using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +REGISTER_OP("TabulateFusion") + .Attr("T: {float, double}") + .Input("table: T") + .Input("table_info: T") + .Input("input: T") + .Input("ff: T") + .Attr("last_layer_size: int") + .Output("output: T"); + +REGISTER_OP("TabulateFusionGrad") + .Attr("T: {float, double}") + .Input("table: T") + .Input("table_info: T") + .Input("input: T") + .Input("ff: T") + .Input("dy: T") + .Input("output: T") + .Output("dy_dx: T") + .Output("dy_df: T"); + +#if GOOGLE_CUDA +void TabulateFusionLauncher(const float * table, const float * table_info, const float * in, const float * ff, const int nloc, const int nnei, const int last_layer_size, float * out); +void TabulateFusionLauncher(const double * table, const double * table_info, const double * in, const double * ff, const int nloc, const int nnei, const int last_layer_size, double * out); +void TabulateFusionGradLauncher(const float * table, const float * table_info, const float * in, const float * ff, const float * dy, const int nloc, const int nnei, const int last_layer_size, float * dy_dx, float * dy_df); +void TabulateFusionGradLauncher(const double * table, const double * table_info, const double * in, const double * ff, const double * dy, const int nloc, const int nnei, const int last_layer_size, double * dy_dx, double * dy_df); +void TabulateCheckerLauncher(const float * table_info, const float * in, int * out, const int nloc, const int nnei); +void TabulateCheckerLauncher(const double * table_info, const double * in, int * out, const int nloc, const int nnei); +#endif + +template +inline FPTYPE dot(FPTYPE a[4], FPTYPE b[4]) { + return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3]; +} + +/* + This inline function was designed to get the table info and bias value for current input xx! 
+ lower: indicate the lower boundary of the first table; + upper: indicate the upper boundary of the first table as well as the lower boundary of the second table; + max: indicate the upper boundary of the second table; + stride0: indicate the stride of the first table; + stride1: indicate the stride of the second table; + xx: indicate the inputs value; + table_idx: indicate the location of table info of input value xx; +*/ +template +inline void locate_xx(const FPTYPE& lower, const FPTYPE& upper, const FPTYPE& max, const FPTYPE& stride0, const FPTYPE& stride1, FPTYPE& xx, int& table_idx) { + if (xx < lower) { + table_idx = 0; + xx = 0; + } + else if (xx < upper) { + table_idx = (int)((xx - lower) / stride0); + xx -= (table_idx * stride0 + lower); + } + else if (xx < max) { + int first_stride = int((upper - lower) / stride0); + table_idx = first_stride + (int)((xx - upper) / stride1); + xx -= ((table_idx - first_stride) * stride1 + upper); + } + else { + table_idx = int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1; + xx = 0; + } +} + +template +struct TabulateFusionFunctor { + void operator()(const CPUDevice& d, const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const int nloc, const int nnei, const int last_layer_size, FPTYPE * out) { + //Currently, Do nothing at all! + // std::cout << "I'm in tabulate @CPU!" 
<< std::endl; + memset(out, 0.0, sizeof(FPTYPE) * nloc * 4 * last_layer_size); + FPTYPE const lower = table_info[0]; + FPTYPE const upper = table_info[1]; + FPTYPE const _max = table_info[2]; + FPTYPE const stride0 = table_info[3]; + FPTYPE const stride1 = table_info[4]; + // for every atom, execute a small gemm~ + // FPTYPE * res = new FPTYPE[4 * last_layer_size]; + #pragma omp parallel for + for (int ii = 0; ii < nloc; ii++) { + FPTYPE ll[4] = {0}; + FPTYPE ago = in[ii * nnei + nnei - 1]; + bool unloop = false; + for (int jj = 0; jj < nnei; jj++) { + ll[0] = ff[ii * nnei * 4 + jj * 4 + 0]; + ll[1] = ff[ii * nnei * 4 + jj * 4 + 1]; + ll[2] = ff[ii * nnei * 4 + jj * 4 + 2]; + ll[3] = ff[ii * nnei * 4 + jj * 4 + 3]; + FPTYPE xx = in[ii * nnei + jj]; + if (ago == xx) { + unloop = true; + } + int table_idx = 0; + locate_xx(lower, upper, _max, stride0, stride1, xx, table_idx); + for (int kk = 0; kk < last_layer_size; kk++) { + // 1.094 timesteps/s + FPTYPE a0 = table[table_idx * last_layer_size * 6 + 6 * kk + 0]; + FPTYPE a1 = table[table_idx * last_layer_size * 6 + 6 * kk + 1]; + FPTYPE a2 = table[table_idx * last_layer_size * 6 + 6 * kk + 2]; + FPTYPE a3 = table[table_idx * last_layer_size * 6 + 6 * kk + 3]; + FPTYPE a4 = table[table_idx * last_layer_size * 6 + 6 * kk + 4]; + FPTYPE a5 = table[table_idx * last_layer_size * 6 + 6 * kk + 5]; + // FPTYPE var = a0 + a1 * xx + a2 * xx * xx + a3 * xx * xx * xx + a4 * xx * xx * xx * xx + a5 * xx * xx * xx * xx * xx; + FPTYPE var = a0 + (a1 + (a2 + (a3 + (a4 + a5 * xx) * xx) * xx) * xx) * xx; + if (unloop) { + out[ii * last_layer_size * 4 + 0 * last_layer_size + kk] += (nnei - jj) * var * ll[0]; + out[ii * last_layer_size * 4 + 1 * last_layer_size + kk] += (nnei - jj) * var * ll[1]; + out[ii * last_layer_size * 4 + 2 * last_layer_size + kk] += (nnei - jj) * var * ll[2]; + out[ii * last_layer_size * 4 + 3 * last_layer_size + kk] += (nnei - jj) * var * ll[3]; + } + else { + out[ii * last_layer_size * 4 + 0 * last_layer_size + 
kk] += var * ll[0]; + out[ii * last_layer_size * 4 + 1 * last_layer_size + kk] += var * ll[1]; + out[ii * last_layer_size * 4 + 2 * last_layer_size + kk] += var * ll[2]; + out[ii * last_layer_size * 4 + 3 * last_layer_size + kk] += var * ll[3]; + } + } + if (unloop) break; + } + } + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const int nloc, const int nnei, const int last_layer_size, FPTYPE * out) { + //Currently, Do nothing at all! + TabulateFusionLauncher(table, table_info, in, ff, nloc, nnei, last_layer_size, out); + } + #endif // GOOGLE_CUDA +}; + +template +struct TabulateFusionGradFunctor { + void operator()(const CPUDevice& d, const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const FPTYPE * dy, const int nloc, const int nnei, const int last_layer_size, FPTYPE * dy_dx, FPTYPE * dy_df) { + // std::cout << "I'm in tabulate gradient @CPU!" << std::endl; + memset(dy_dx, 0.0, sizeof(FPTYPE) * nloc * nnei); + memset(dy_df, 0.0, sizeof(FPTYPE) * nloc * nnei * 4); + FPTYPE const lower = table_info[0]; + FPTYPE const upper = table_info[1]; + FPTYPE const _max = table_info[2]; + FPTYPE const stride0 = table_info[3]; + FPTYPE const stride1 = table_info[4]; + // for every atom, execute a small gemm~ + // FPTYPE * res = new FPTYPE[4 * last_layer_size]; + #pragma omp parallel for + for (int ii = 0; ii < nloc; ii++) { + FPTYPE ll[4]; + FPTYPE rr[4]; + FPTYPE ago = in[ii * nnei + nnei - 1]; + bool unloop = false; + for (int jj = 0; jj < nnei; jj++) { + // construct the dy/dx + ll[0] = ff[ii * nnei * 4 + jj * 4 + 0]; + ll[1] = ff[ii * nnei * 4 + jj * 4 + 1]; + ll[2] = ff[ii * nnei * 4 + jj * 4 + 2]; + ll[3] = ff[ii * nnei * 4 + jj * 4 + 3]; + FPTYPE xx = in[ii * nnei + jj]; + if (ago == xx) { + unloop = true; + } + int table_idx = 0; + locate_xx(lower, upper, _max, stride0, stride1, xx, table_idx); + FPTYPE grad = 0.0; + for (int kk = 0; kk 
< last_layer_size; kk++) { + rr[0] = dy[ii * last_layer_size * 4 + 0 * last_layer_size + kk]; + rr[1] = dy[ii * last_layer_size * 4 + 1 * last_layer_size + kk]; + rr[2] = dy[ii * last_layer_size * 4 + 2 * last_layer_size + kk]; + rr[3] = dy[ii * last_layer_size * 4 + 3 * last_layer_size + kk]; + // 1.094 timesteps/s + FPTYPE a0 = table[table_idx * last_layer_size * 6 + 6 * kk + 0]; + FPTYPE a1 = table[table_idx * last_layer_size * 6 + 6 * kk + 1]; + FPTYPE a2 = table[table_idx * last_layer_size * 6 + 6 * kk + 2]; + FPTYPE a3 = table[table_idx * last_layer_size * 6 + 6 * kk + 3]; + FPTYPE a4 = table[table_idx * last_layer_size * 6 + 6 * kk + 4]; + FPTYPE a5 = table[table_idx * last_layer_size * 6 + 6 * kk + 5]; + // FPTYPE res = a0 + a1 * xx + a2 * xx * xx + a3 * xx * xx * xx + a4 * xx * xx * xx * xx + a5 * xx * xx * xx * xx * xx; + FPTYPE res = a0 + (a1 + (a2 + (a3 + (a4 + a5 * xx) * xx) * xx) * xx) * xx; + + if (unloop) { + // grad += (a1 + 2 * a2 * xx + 3 * a3 * xx * xx + 4 * a4 * xx * xx * xx + 5 * a5 * xx * xx * xx * xx) * dot(ll, rr) * (nnei - jj); + grad += (a1 + (2 * a2 + (3 * a3 + (4 * a4 + 5 * a5 * xx) * xx) * xx) * xx) * dot(ll, rr) * (nnei - jj); + dy_df[ii * nnei * 4 + jj * 4 + 0] += res * rr[0] * (nnei - jj); + dy_df[ii * nnei * 4 + jj * 4 + 1] += res * rr[1] * (nnei - jj); + dy_df[ii * nnei * 4 + jj * 4 + 2] += res * rr[2] * (nnei - jj); + dy_df[ii * nnei * 4 + jj * 4 + 3] += res * rr[3] * (nnei - jj); + } + else { + // grad += (a1 + 2 * a2 * xx + 3 * a3 * xx * xx + 4 * a4 * xx * xx * xx + 5 * a5 * xx * xx * xx * xx) * dot(ll, rr); + grad += (a1 + (2 * a2 + (3 * a3 + (4 * a4 + 5 * a5 * xx) * xx) * xx) * xx) * dot(ll, rr); + dy_df[ii * nnei * 4 + jj * 4 + 0] += res * rr[0]; + dy_df[ii * nnei * 4 + jj * 4 + 1] += res * rr[1]; + dy_df[ii * nnei * 4 + jj * 4 + 2] += res * rr[2]; + dy_df[ii * nnei * 4 + jj * 4 + 3] += res * rr[3]; + } + } + dy_dx[ii * nnei + jj] = grad; + if (unloop) break; + } + } + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& 
d, const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const FPTYPE * dy, const int nloc, const int nnei, const int last_layer_size, FPTYPE * dy_dx, FPTYPE * dy_df) { + //Currently, Do nothing at all! + TabulateFusionGradLauncher(table, table_info, in, ff, dy, nloc, nnei, last_layer_size, dy_dx, dy_df); + } + #endif // GOOGLE_CUDA +}; + +template +struct TabulateCheckerFunctor { + void operator()(const CPUDevice& d, const FPTYPE * table_info, const FPTYPE * in, int * out, const int nloc, const int nnei) { + FPTYPE const lower = table_info[0]; + FPTYPE const upper = table_info[1]; + FPTYPE const _max = table_info[2]; + FPTYPE const stride0 = table_info[3]; + FPTYPE const stride1 = table_info[4]; + // for every atom, execute a small gemm~ + // FPTYPE * res = new FPTYPE[4 * last_layer_size]; + int Csub = 0; // summation of second table approximate; + int Dsub = 0; // summation of the endpoint approximate; + for (int ii = 0; ii < nloc; ii++) { + for (int jj = 0; jj < nnei; jj++) { + FPTYPE xx = in[ii * nnei + jj]; + if (xx < lower || xx > _max) { + Csub += 1; + } + else if (xx >= upper && xx <= _max) { + Dsub += 1; + } + } + } + if(Csub > 0) { + std::cout << "# DEEPMD: warning! some values [" << Csub << "/" << nloc * nnei << "] overflow the range of the table, using the endpoint approximate processing.." << std::endl; + } + if(Dsub > 0) { + std::cout << "# DEEPMD: warning! some values [" << Dsub << "/" << nloc * nnei << "] overflow the range of the table, using second table approximate processing.." << std::endl; + } + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * table_info, const FPTYPE * in, int * out, const int nloc, const int nnei) { + //Currently, Do nothing at all! 
+ TabulateCheckerLauncher(table_info, in, out, nloc, nnei); + } + #endif // GOOGLE_CUDA +}; + +template +class TabulateFusionOp : public OpKernel { + public: + explicit TabulateFusionOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("last_layer_size", &last_layer_size)); + counter = -1; + } + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& table = context->input(context_input_index++); + const Tensor& table_info = context->input(context_input_index++); + const Tensor& input = context->input(context_input_index++); + const Tensor& ff = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (table.shape().dims() == 2), errors::InvalidArgument ("Dim of table should be 2")); + OP_REQUIRES (context, (input.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (ff.shape().dims() == 3), errors::InvalidArgument ("Dim of input should be 3")); + + TensorShape output_shape; + output_shape.AddDim (ff.shape().dim_size(0)); + output_shape.AddDim (4); + output_shape.AddDim (last_layer_size); + + int context_output_index = 0; + Tensor* output = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + output_shape, + &output)); + + // counter++; + // if ((int)table_info.flat().data()[5] != -1 && counter % (int)table_info.flat().data()[5] == 0) { + // Tensor int_temp; + // TensorShape int_shape; + // int_shape.AddDim(2 * ff.shape().dim_size(0)); + // OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); + // TabulateCheckerFunctor()( + // context->eigen_device(), + // table_info.flat().data(), + // input.flat().data(), + // int_temp.flat().data(), + // ff.shape().dim_size(0), + // ff.shape().dim_size(1) + // ); + // } + + TabulateFusionFunctor()( + context->eigen_device(), // define actually graph execution device + 
table.flat().data(), + table_info.flat().data(), + input.flat().data(), + ff.flat().data(), + ff.shape().dim_size(0), + ff.shape().dim_size(1), + last_layer_size, + output->flat().data() + ); + } +private: + int counter; + int last_layer_size; +}; + +template +class TabulateFusionGradOp : public OpKernel { + public: + explicit TabulateFusionGradOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // std::cout << "I'm here" << std::endl; + // Grab the input tensor + int context_input_index = 0; + const Tensor& table = context->input(context_input_index++); + const Tensor& table_info = context->input(context_input_index++); + const Tensor& input = context->input(context_input_index++); + const Tensor& ff = context->input(context_input_index++); + const Tensor& dy = context->input(context_input_index++); + const Tensor& output = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (dy.shape().dims() == 3), errors::InvalidArgument ("Dim of table should be 1")); + + int context_output_index = 0; + Tensor* dy_dx = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + input.shape(), + &dy_dx)); + Tensor* dy_df = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + ff.shape(), + &dy_df)); + + TabulateFusionGradFunctor()( + context->eigen_device(), // define actually graph execution device + table.flat().data(), + table_info.flat().data(), + input.flat().data(), + ff.flat().data(), + dy.flat().data(), + ff.shape().dim_size(0), + ff.shape().dim_size(1), + output.shape().dim_size(2), + dy_dx->flat().data(), + dy_df->flat().data() + ); + } +private: +}; + +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusion").Device(DEVICE_CPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionOp); \ +REGISTER_KERNEL_BUILDER( \ + 
Name("TabulateFusionGrad").Device(DEVICE_CPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); + +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusion").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionGrad").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionGradOp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA diff --git a/source/op/tabulate_multi_device.cc b/source/op/tabulate_multi_device.cc new file mode 100644 index 0000000000..4cce523def --- /dev/null +++ b/source/op/tabulate_multi_device.cc @@ -0,0 +1,194 @@ +#include "common.h" +#include "CustomeOperation.h" + +REGISTER_OP("TabulateFusion") + .Attr("T: {float, double}") + .Input("table: T") + .Input("table_info: T") + .Input("input: T") + .Input("ff: T") + .Attr("last_layer_size: int") + .Output("output: T"); + +REGISTER_OP("TabulateFusionGrad") + .Attr("T: {float, double}") + .Input("table: T") + .Input("table_info: T") + .Input("input: T") + .Input("ff: T") + .Input("dy: T") + .Input("output: T") + .Output("dy_dx: T") + .Output("dy_df: T"); + +template +struct TabulateFusionFunctor { + void operator()(const CPUDevice& d, const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const int nloc, const int nnei, const int last_layer_size, FPTYPE * out) { + TabulateFusionCPULauncher(table, table_info, in, ff, nloc, nnei, last_layer_size, out); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const int nloc, const int nnei, const int last_layer_size, FPTYPE * out) { + //Currently, Do nothing at all! 
+ TabulateFusionGPULauncher(table, table_info, in, ff, nloc, nnei, last_layer_size, out); + } + #endif // GOOGLE_CUDA +}; + +template +struct TabulateFusionGradFunctor { + void operator()(const CPUDevice& d, const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const FPTYPE * dy, const int nloc, const int nnei, const int last_layer_size, FPTYPE * dy_dx, FPTYPE * dy_df) { + TabulateFusionGradCPULauncher(table, table_info, in, ff, dy, nloc, nnei, last_layer_size, dy_dx, dy_df); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * table, const FPTYPE * table_info, const FPTYPE * in, const FPTYPE * ff, const FPTYPE * dy, const int nloc, const int nnei, const int last_layer_size, FPTYPE * dy_dx, FPTYPE * dy_df) { + //Currently, Do nothing at all! + TabulateFusionGradGPULauncher(table, table_info, in, ff, dy, nloc, nnei, last_layer_size, dy_dx, dy_df); + } + #endif // GOOGLE_CUDA +}; + +template +struct TabulateCheckerFunctor { + void operator()(const CPUDevice& d, const FPTYPE * table_info, const FPTYPE * in, int * out, const int nloc, const int nnei) { + TabulateCheckerCPULauncher(table_info, in, out, nloc, nnei); + } + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * table_info, const FPTYPE * in, int * out, const int nloc, const int nnei) { + //Currently, Do nothing at all! 
+ TabulateCheckerGPULauncher(table_info, in, out, nloc, nnei); + } + #endif // GOOGLE_CUDA +}; + +template +class TabulateFusionOp : public OpKernel { + public: + explicit TabulateFusionOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("last_layer_size", &last_layer_size)); + counter = -1; + } + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& table = context->input(context_input_index++); + const Tensor& table_info = context->input(context_input_index++); + const Tensor& input = context->input(context_input_index++); + const Tensor& ff = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (table.shape().dims() == 2), errors::InvalidArgument ("Dim of table should be 2")); + OP_REQUIRES (context, (input.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (ff.shape().dims() == 3), errors::InvalidArgument ("Dim of input should be 3")); + + TensorShape output_shape; + output_shape.AddDim (ff.shape().dim_size(0)); + output_shape.AddDim (4); + output_shape.AddDim (last_layer_size); + + int context_output_index = 0; + Tensor* output = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + output_shape, + &output)); + + // counter++; + // if ((int)table_info.flat().data()[5] != -1 && counter % (int)table_info.flat().data()[5] == 0) { + // Tensor int_temp; + // TensorShape int_shape; + // int_shape.AddDim(2 * ff.shape().dim_size(0)); + // OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); + // TabulateCheckerFunctor()( + // context->eigen_device(), + // table_info.flat().data(), + // input.flat().data(), + // int_temp.flat().data(), + // ff.shape().dim_size(0), + // ff.shape().dim_size(1) + // ); + // } + + TabulateFusionFunctor()( + context->eigen_device(), // define actually graph execution device + 
table.flat().data(), + table_info.flat().data(), + input.flat().data(), + ff.flat().data(), + ff.shape().dim_size(0), + ff.shape().dim_size(1), + last_layer_size, + output->flat().data() + ); + } +private: + int counter; + int last_layer_size; +}; + +template +class TabulateFusionGradOp : public OpKernel { + public: + explicit TabulateFusionGradOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // std::cout << "I'm here" << std::endl; + // Grab the input tensor + int context_input_index = 0; + const Tensor& table = context->input(context_input_index++); + const Tensor& table_info = context->input(context_input_index++); + const Tensor& input = context->input(context_input_index++); + const Tensor& ff = context->input(context_input_index++); + const Tensor& dy = context->input(context_input_index++); + const Tensor& output = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (dy.shape().dims() == 3), errors::InvalidArgument ("Dim of table should be 1")); + + int context_output_index = 0; + Tensor* dy_dx = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + input.shape(), + &dy_dx)); + Tensor* dy_df = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + ff.shape(), + &dy_df)); + + TabulateFusionGradFunctor()( + context->eigen_device(), // define actually graph execution device + table.flat().data(), + table_info.flat().data(), + input.flat().data(), + ff.flat().data(), + dy.flat().data(), + ff.shape().dim_size(0), + ff.shape().dim_size(1), + output.shape().dim_size(2), + dy_dx->flat().data(), + dy_df->flat().data() + ); + } +private: +}; + +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusion").Device(DEVICE_CPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionOp); \ +REGISTER_KERNEL_BUILDER( \ + 
Name("TabulateFusionGrad").Device(DEVICE_CPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionGradOp); +REGISTER_CPU(float); +REGISTER_CPU(double); + +#if GOOGLE_CUDA +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusion").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionGrad").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionGradOp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif // GOOGLE_CUDA diff --git a/source/op/unaggregated_grad.cc b/source/op/unaggregated_grad.cc new file mode 100644 index 0000000000..bc489c61ff --- /dev/null +++ b/source/op/unaggregated_grad.cc @@ -0,0 +1,320 @@ +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include + +#include "ComputeDescriptor.h" +#include "NeighborList.h" + +using namespace tensorflow; +// using namespace std; + +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +REGISTER_OP("UnaggregatedDyDxS") + .Attr("T: {float, double}") + .Input("y: T") + .Input("w: T") + .Output("dy_dx: T"); + +REGISTER_OP("UnaggregatedDyDx") + .Attr("T: {float, double}") + .Input("z: T") + .Input("w: T") + .Input("dy_dx: T") + .Output("dz_dx: T"); + +REGISTER_OP("UnaggregatedDy2DxS") + .Attr("T: {float, double}") + .Input("y: T") + .Input("dy: T") + .Input("w: T") + .Output("dy2_dx: T"); + +REGISTER_OP("UnaggregatedDy2Dx") + .Attr("T: {float, double}") + .Input("z: T") + .Input("w: T") + .Input("dz_dx: T") + .Input("dy_dx: T") + .Input("dy2_dx: T") + .Output("dz2_dx: T"); + +template +struct UnaggregatedDyDxSFunctor { + void operator()(const CPUDevice& d, const FPTYPE * y, const FPTYPE * w, const int length, const int width, FPTYPE * dy_dx) { + #pragma omp parallel for + for (int ii = 0; ii < length; ii++) { + for (int jj = 0; jj < width; jj++) 
{ + dy_dx[ii * width + jj] = (1 - y[ii * width + jj] * y[ii * width + jj]) * w[jj]; + } + } + } + + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * y, const FPTYPE * w, const int length, const int width, FPTYPE * dy_dx) { + //Currently, Do nothing at all! + return; + } + #endif // GOOGLE_CUDA +}; + +// calculate the gradient for all variables! +template +struct UnaggregatedDyDxFunctor { + void operator()(const CPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dy_dx, const int length, const int width, const int size, FPTYPE * dz_dx) { + #pragma omp parallel for + for (int kk = 0; kk < length; kk++) { + for (int ii = 0; ii < width; ii++) { + //FPTYPE dz_drou = 1 - (z[kk * width + ii] - y[kk * size + ii % size]) * (z[kk * width + ii] - y[kk * size + ii % size]); + FPTYPE dz_drou = 1 - z[kk * width + ii] * z[kk * width + ii]; + FPTYPE accumulator = 0.0; + for (int jj = 0; jj < size; jj++) { + accumulator += w[jj * width + ii] * dy_dx[kk * size + jj]; + } + dz_drou *= accumulator; + dz_drou += dy_dx[kk * size + ii % size]; + dz_dx[kk * width + ii] = dz_drou; + } + } + } + + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dy_dx, const int length, const int width, const int size, FPTYPE * dz_dx) { + //Currently, Do nothing at all! + return; + } + #endif // GOOGLE_CUDA +}; + +template +struct UnaggregatedDy2DxSFunctor { + void operator()(const CPUDevice& d, const FPTYPE * y, const FPTYPE * dy, const FPTYPE * w, const int length, const int width, FPTYPE * dy2_dx) { + #pragma omp parallel for + for (int ii = 0; ii < length; ii++) { + for (int jj = 0; jj < width; jj++) { + dy2_dx[ii * width + jj] = -2 * w[jj] * y[ii * width + jj] * dy[ii * width + jj]; + } + } + } + + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * y, const FPTYPE * w, const int length, const int width, FPTYPE * dy_dx) { + //Currently, Do nothing at all! 
+ return; + } + #endif // GOOGLE_CUDA +}; + +// calculate the gradient for all variables! +template +struct UnaggregatedDy2DxFunctor { + void operator()(const CPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dz_dx, const FPTYPE * dy_dx, const FPTYPE * dy2_dx, const int length, const int width, const int size, FPTYPE * dz2_dx) { + #pragma omp parallel for + for (int kk = 0; kk < length; kk++) { + for (int ii = 0; ii < width; ii++) { + //FPTYPE dz_drou = 1 - (z[kk * width + ii] - y[kk * size + ii % size]) * (z[kk * width + ii] - y[kk * size + ii % size]); + FPTYPE dz_drou = 1 - z[kk * width + ii] * z[kk * width + ii]; + FPTYPE accumulator = 0.0; + for (int jj = 0; jj < size; jj++) { + accumulator += w[jj * width + ii] * dy2_dx[kk * size + jj]; + } + dz_drou *= accumulator; + accumulator = 0.0; + for (int jj = 0; jj < size; jj++) { + accumulator += w[jj * width + ii] * dy_dx[kk * size + jj]; + } + dz_drou -= 2 * z[kk * width + ii] * (dz_dx[kk * width + ii] - dy_dx[kk * size + ii % size]) * accumulator; + dz_drou += dy2_dx[kk * size + ii % size]; + dz2_dx[kk * width + ii] = dz_drou; + } + } + } + + #if GOOGLE_CUDA + void operator()(const GPUDevice& d, const FPTYPE * z, const FPTYPE * w, const FPTYPE * dz_dx, const FPTYPE * dy_dx, const FPTYPE * dy2_dx, const int length, const int width, const int size, FPTYPE * dz2_dx) { + //Currently, Do nothing at all! 
+ return; + } + #endif // GOOGLE_CUDA +}; + +template +class UnaggregatedDyDxSOp : public OpKernel { + public: + explicit UnaggregatedDyDxSOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& y = context->input(context_input_index++); + const Tensor& w = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (y.shape().dims() == 2), errors::InvalidArgument ("Dim of table should be 1")); + OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + + int context_output_index = 0; + Tensor* dy_dx = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + y.shape(), + &dy_dx)); + + UnaggregatedDyDxSFunctor()( + context->eigen_device(), // define actually graph execution device + y.flat().data(), + w.flat().data(), + y.shape().dim_size(0), + y.shape().dim_size(1), + dy_dx->flat().data() + ); + } +private: +}; + +template +class UnaggregatedDy2DxSOp : public OpKernel { + public: + explicit UnaggregatedDy2DxSOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& y = context->input(context_input_index++); + const Tensor& dy = context->input(context_input_index++); + const Tensor& w = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (y.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (dy.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + + int context_output_index = 0; + Tensor* dy2_dx = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + y.shape(), + 
&dy2_dx)); + + UnaggregatedDy2DxSFunctor()( + context->eigen_device(), // define actually graph execution device + y.flat().data(), + dy.flat().data(), + w.flat().data(), + y.shape().dim_size(0), + y.shape().dim_size(1), + dy2_dx->flat().data() + ); + } +private: +}; + +template +class UnaggregatedDyDxOp : public OpKernel { + public: + explicit UnaggregatedDyDxOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& z = context->input(context_input_index++); + const Tensor& w = context->input(context_input_index++); + const Tensor& dy_dx = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (z.shape().dims() == 2), errors::InvalidArgument ("Dim of table should be 1")); + OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (dy_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + + int context_output_index = 0; + Tensor* dz_dx = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + z.shape(), + &dz_dx)); + + UnaggregatedDyDxFunctor()( + context->eigen_device(), // define actually graph execution device + z.flat().data(), + w.flat().data(), + dy_dx.flat().data(), + z.shape().dim_size(0), + z.shape().dim_size(1), + w.shape().dim_size(0), + dz_dx->flat().data() + ); + } +private: +}; + +template +class UnaggregatedDy2DxOp : public OpKernel { + public: + explicit UnaggregatedDy2DxOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& z = context->input(context_input_index++); + const Tensor& w = context->input(context_input_index++); + const Tensor& dz_dx = context->input(context_input_index++); + const Tensor& dy_dx = 
context->input(context_input_index++); + const Tensor& dy2_dx = context->input(context_input_index++); + + // set size of the sample + OP_REQUIRES (context, (z.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (w.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (dz_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (dy_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (dy2_dx.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + + int context_output_index = 0; + Tensor* dz2_dx = NULL; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + z.shape(), + &dz2_dx)); + + UnaggregatedDy2DxFunctor()( + context->eigen_device(), // define actually graph execution device + z.flat().data(), + w.flat().data(), + dz_dx.flat().data(), + dy_dx.flat().data(), + dy2_dx.flat().data(), + z.shape().dim_size(0), + z.shape().dim_size(1), + w.shape().dim_size(0), + dz2_dx->flat().data() + ); + } +private: +}; + +// Register the CPU kernels. +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("UnaggregatedDyDxS").Device(DEVICE_CPU).TypeConstraint("T"), \ + UnaggregatedDyDxSOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("UnaggregatedDyDx").Device(DEVICE_CPU).TypeConstraint("T"), \ + UnaggregatedDyDxOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("UnaggregatedDy2DxS").Device(DEVICE_CPU).TypeConstraint("T"), \ + UnaggregatedDy2DxSOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("UnaggregatedDy2Dx").Device(DEVICE_CPU).TypeConstraint("T"), \ + UnaggregatedDy2DxOp); +REGISTER_CPU(float); +REGISTER_CPU(double); +// Not required in the current situation +// // Register the GPU kernels. 
+// #if GOOGLE_CUDA +// #define REGISTER_GPU(T) \ +// REGISTER_KERNEL_BUILDER( \ +// Name("UnaggregatedDyDxS").Device(DEVICE_GPU).TypeConstraint("T"), \ +// UnaggregatedDyDxSOp); \ +// REGISTER_KERNEL_BUILDER( \ +// Name("UnaggregatedDyDx").Device(DEVICE_GPU).TypeConstraint("T"), \ +// UnaggregatedDyDxOp); +// REGISTER_GPU(float); +// REGISTER_GPU(double); +// #endif // GOOGLE_CUDA \ No newline at end of file diff --git a/source/train/CMakeLists.txt b/source/train/CMakeLists.txt index 818b2f4225..176f20d5e4 100644 --- a/source/train/CMakeLists.txt +++ b/source/train/CMakeLists.txt @@ -2,7 +2,7 @@ configure_file("RunOptions.py.in" "${CMAKE_CURRENT_BINARY_DIR}/RunOptions.py" @ONLY) -file(GLOB LIB_PY main.py calculator.py Model*.py Trainer.py ${CMAKE_CURRENT_BINARY_DIR}/RunOptions.py transform.py doc.py) +file(GLOB LIB_PY main.py calculator.py Model*.py Trainer.py ${CMAKE_CURRENT_BINARY_DIR}/RunOptions.py transform.py doc.py compress.py) file(GLOB CLS_PY Local.py Slurm.py) diff --git a/source/train/Trainer.py b/source/train/Trainer.py index 765a25d206..7c95fda5bc 100644 --- a/source/train/Trainer.py +++ b/source/train/Trainer.py @@ -19,6 +19,7 @@ from deepmd.Model import Model, WFCModel, DipoleModel, PolarModel, GlobalPolarModel from deepmd.loss import EnerStdLoss, EnerDipoleLoss, TensorLoss from deepmd.utils.learning_rate import LearningRateExp +from deepmd.utils.neighbor_stat import NeighborStat from tensorflow.python.client import timeline from deepmd.env import op_module @@ -32,6 +33,7 @@ import deepmd._prod_virial_se_r_grad import deepmd._soft_min_force_grad import deepmd._soft_min_virial_grad +import deepmd._tabulate_grad import deepmd._gelu from deepmd.common import j_must_have, ClassArg @@ -87,7 +89,9 @@ def _init_param(self, jdata): model_param = j_must_have(jdata, 'model') descrpt_param = j_must_have(model_param, 'descriptor') fitting_param = j_must_have(model_param, 'fitting_net') - + self.model_param = model_param + self.descrpt_param = descrpt_param + # 
descriptor try: descrpt_type = descrpt_param['type'] @@ -246,7 +250,6 @@ def _init_param(self, jdata): else : self.numb_fparam = 0 - def _message (self, msg) : self.run_opt.message(msg) @@ -271,6 +274,14 @@ def build (self, self.model.data_stat(data) + if 'compress' in self.model_param and self.model_param['compress']['compress']: + assert 'rcut' in self.descrpt_param, "Error: descriptor must have attr rcut!" + self.neighbor_stat \ + = NeighborStat(self.ntypes, self.descrpt_param['rcut']) + self.min_nbor_dist, self.max_nbor_size \ + = self.neighbor_stat.get_stat(data) + self.descrpt.enable_compression(self.min_nbor_dist, self.model_param['compress']['model_file'], self.model_param['compress']['table_config'][0], self.model_param['compress']['table_config'][1], self.model_param['compress']['table_config'][2], self.model_param['compress']['table_config'][3]) + worker_device = "/job:%s/task:%d/%s" % (self.run_opt.my_job_name, self.run_opt.my_task_index, self.run_opt.my_device) diff --git a/source/train/compress.py b/source/train/compress.py new file mode 100644 index 0000000000..24a3a7997a --- /dev/null +++ b/source/train/compress.py @@ -0,0 +1,53 @@ +import re +import json +import copy +import argparse +import numpy as np +from deepmd.env import tf +from .train import train +from .freeze import freeze +from .transform import transform +from deepmd.common import j_loader +from deepmd.utils.argcheck import normalize + +def compress(args): + jdata = j_loader(args.INPUT) + if not 'model' in jdata.keys(): + jdata = convert_input_v0_v1(jdata, + warning = True, + dump = 'input_v1_compat.json') + jdata = normalize(jdata) + jdata['model']['compress'] = {} + jdata['model']['compress']['compress'] = True + jdata['model']['compress']['model_file'] = args.input + jdata['model']['compress']['table_config'] = [args.extrapolate, args.stride, 10 * args.stride, int(args.frequency)] + + # check the descriptor info of the input file + assert jdata['model']['descriptor']['type'] == 
'se_a', 'Model compression error: descriptor type must be se_a!' + assert jdata['model']['descriptor']['resnet_dt'] == False, 'Model compression error: descriptor resnet_dt must be false!' + + # stage 1: training or refining the model with tabulation + print('\n\n# DEEPMD: stage 1: train or refine the model with tabulation') + args_train = copy.deepcopy(args) + args_train.INPUT = 'compress.json' + args_train.output = 'compress.json' + args_train.init_model = None + args_train.restart = None + jdata['training']['stop_batch'] = jdata['training']['save_freq'] # be careful here, if one want to refine the model + with open(args_train.INPUT, 'w') as fp: + json.dump(jdata, fp, indent=4) + train(args_train) + + # stage 2: freeze the model + print('\n\n# DEEPMD: stage 2: freeze the model') + args_frz = copy.deepcopy(args) + args_frz.nodes = None + freeze(args_frz) + + # stage 3: transform the model + print('\n\n# DEEPMD: stage 3: transform the model') + args_transform = copy.deepcopy(args) + args_transform.old_model = args.input + args_transform.raw_model = args.output + args_transform.output = args.output + transform(args_transform) diff --git a/source/train/main.py b/source/train/main.py index b7dd0d0215..2cd262f7d7 100644 --- a/source/train/main.py +++ b/source/train/main.py @@ -5,6 +5,7 @@ from .config import config from .test import test from .transform import transform +from .compress import compress from .doc import doc_train_input def main () : @@ -62,6 +63,27 @@ def main () : help="The file containing details of energy force and virial accuracy") parser_tst.add_argument("-a", "--atomic-energy", action = 'store_true', help="Test the accuracy of atomic energy") + + """ + Compress a model, which including tabulating the embedding-net. + The table is composed of fifth-order polynomial coefficients and is assembled from two sub-tables. 
The first table takes the stride(parameter) as its uniform stride, while the second table takes 10 * stride as its uniform stride + The range of the first table is automatically detected by deepmd-kit, while the second table ranges from the first table\'s upper boundary(upper) to the extrapolate(parameter) * upper. + """ + parser_compress = subparsers.add_parser('compress', help='compress a model') + parser_compress.add_argument('INPUT', + help='The input parameter file in json or yaml format, which should be consistent with the original model parameter file') + parser_compress.add_argument('-i', "--input", default = "frozen_model.pb", type=str, + help = "The original frozen model, which will be compressed by the deepmd-kit") + parser_compress.add_argument("-o","--output", default = "frozen_model_compress.pb", type=str, + help='The compressed model') + parser_compress.add_argument('-e', '--extrapolate', default=5, type=int, + help="The scale of model extrapolation") + parser_compress.add_argument('-s', '--stride', default=0.01, type=float, + help="The uniform stride of tabulation's first table, the second table will use 10 * stride as it's uniform stride") + parser_compress.add_argument('-f', '--frequency', default=-1, type=int, + help="The frequency of tabulation overflow check(If the input environment matrix overflow the first or second table range).
By default do not check the overflow") + parser_compress.add_argument("-d", "--folder", type=str, default = ".", + help="path to checkpoint folder") parser_train = subparsers.add_parser('doc-train-input', help='print the documentation (in rst format) of input training parameters.') @@ -81,6 +103,8 @@ def main () : test(args) elif args.command == 'transform' : transform(args) + elif args.command == 'compress' : + compress(args) elif args.command == 'doc-train-input' : doc_train_input(args) else : diff --git a/source/train/transform.py b/source/train/transform.py index 1d587ae531..19efd42976 100644 --- a/source/train/transform.py +++ b/source/train/transform.py @@ -44,8 +44,8 @@ def transform_graph(raw_graph,old_graph): raw_graph_node = load_transform_node(raw_graph_def) old_graph_node = load_transform_node(old_graph_def) - if len(raw_graph_node) != len(old_graph_node): - raise RuntimeError("raw graph and old graph has different network structure") + # if len(raw_graph_node) != len(old_graph_node): + # raise RuntimeError("raw graph and old graph has different network structure") for node in raw_graph_def.node: if node.name in raw_graph_node.keys(): @@ -108,7 +108,7 @@ def check_dim(raw_graph_node, old_graph_node, node_name): raw_graph_dim = raw_graph_node[node_name].tensor_shape old_graph_dim = old_graph_node[node_name].tensor_shape if raw_graph_dim != old_graph_dim: - raise RuntimeError("old graph and raw graph has different"+node_name+" dim") + raise RuntimeError("old graph " + str(old_graph_dim) + " and raw graph " + str(raw_graph_dim) + " has different " + str(node_name) + " dim") def load_transform_node(graph):