diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml index 2d522e5cf3..ce72d9cc41 100644 --- a/.github/workflows/build_wheel.yml +++ b/.github/workflows/build_wheel.yml @@ -25,8 +25,9 @@ jobs: - name: Build wheels env: - CIBW_BUILD: "cp36-* cp37-* cp38-*" - CIBW_BEFORE_BUILD: pip install tensorflow && sed -i 's/libresolv.so.2"/libresolv.so.2", "libtensorflow_framework.so.2"/g' /opt/_internal/tools/lib/python*/site-packages/auditwheel/policy/policy.json + CIBW_BUILD: "cp36-* cp37-* cp38-* cp39-*" + CIBW_MANYLINUX_X86_64_IMAGE: ghcr.io/deepmodeling/manylinux2010_x86_64_tensorflow + CIBW_BEFORE_BUILD: pip install tensorflow CIBW_SKIP: "*-win32 *-manylinux_i686" run: | python -m cibuildwheel --output-dir wheelhouse diff --git a/deepmd/__init__.py b/deepmd/__init__.py index 64386b754c..3a295dcbef 100644 --- a/deepmd/__init__.py +++ b/deepmd/__init__.py @@ -4,7 +4,7 @@ from . import cluster, descriptor, fit, loss, utils from .env import set_mkl -from .infer import DeepPotential +from .infer import DeepEval, DeepPotential from .infer.data_modifier import DipoleChargeModifier set_mkl() diff --git a/deepmd/cluster/local.py b/deepmd/cluster/local.py index 37a2e99b6e..0bc9ec7698 100644 --- a/deepmd/cluster/local.py +++ b/deepmd/cluster/local.py @@ -1,10 +1,38 @@ -"""Get local GPU resources from `CUDA_VISIBLE_DEVICES` enviroment variable.""" +"""Get local GPU resources.""" import os import socket +import subprocess as sp +import sys + +from deepmd.env import tf from typing import List, Tuple, Optional -__all__ = ["get_resource"] + +__all__ = ["get_gpus", "get_resource"] + + +def get_gpus(): + """Get available IDs of GPU cards at local. + These IDs are valid when used as the TensorFlow device ID. + + Returns: + ------- + Optional[List[int]] + List of available GPU IDs. Otherwise, None. 
+ """ + test_cmd = 'from tensorflow.python.client import device_lib; ' \ + 'devices = device_lib.list_local_devices(); ' \ + 'gpus = [d.name for d in devices if d.device_type == "GPU"]; ' \ + 'print(len(gpus))' + with sp.Popen([sys.executable, "-c", test_cmd], stderr=sp.PIPE, stdout=sp.PIPE) as p: + stdout, stderr = p.communicate() + if p.returncode != 0: + decoded = stderr.decode('UTF-8') + raise RuntimeError('Failed to detect availbe GPUs due to:\n%s' % decoded) + decoded = stdout.decode('UTF-8').strip() + num_gpus = int(decoded) + return list(range(num_gpus)) if num_gpus > 0 else None def get_resource() -> Tuple[str, List[str], Optional[List[int]]]: @@ -17,10 +45,5 @@ def get_resource() -> Tuple[str, List[str], Optional[List[int]]]: """ nodename = socket.gethostname() nodelist = [nodename] - gpus_env = os.getenv("CUDA_VISIBLE_DEVICES", None) - if not gpus_env: - gpus = None - else: - gpus = [gpu for gpu in gpus_env.split(",")] - + gpus = get_gpus() return nodename, nodelist, gpus diff --git a/deepmd/cluster/slurm.py b/deepmd/cluster/slurm.py index df4ac3dbf9..feafd84117 100644 --- a/deepmd/cluster/slurm.py +++ b/deepmd/cluster/slurm.py @@ -5,9 +5,11 @@ https://github.com/deepsense-ai/tensorflow_on_slurm #### """ -import re +import hostlist import os -from typing import List, Tuple, Optional, Iterable + +from deepmd.cluster import local +from typing import List, Tuple, Optional __all__ = ["get_resource"] @@ -29,7 +31,7 @@ def get_resource() -> Tuple[str, List[str], Optional[List[int]]]: ValueError if current nodename is not found in node list """ - nodelist = _expand_nodelist(os.environ["SLURM_JOB_NODELIST"]) + nodelist = hostlist.expand_hostlist(os.environ["SLURM_JOB_NODELIST"]) nodename = os.environ["SLURMD_NODENAME"] num_nodes_env = os.getenv("SLURM_JOB_NUM_NODES") if num_nodes_env: @@ -45,40 +47,5 @@ def get_resource() -> Tuple[str, List[str], Optional[List[int]]]: raise ValueError( f"Nodename({nodename}) not in nodelist({nodelist}). This should not happen!" 
) - gpus_env = os.getenv("CUDA_VISIBLE_DEVICES") - if not gpus_env: - gpus = None - else: - gpus = [int(gpu) for gpu in gpus_env.split(",")] + gpus = local.get_gpus() return nodename, nodelist, gpus - - -def _pad_zeros(iterable: Iterable, length: int): - return (str(t).rjust(length, "0") for t in iterable) - - -def _expand_ids(ids: str) -> List[str]: - result = [] - for _id in ids.split(","): - if "-" in _id: - str_end = _id.split("-")[1] - begin, end = [int(token) for token in _id.split("-")] - result.extend(_pad_zeros(range(begin, end + 1), len(str_end))) - else: - result.append(_id) - return result - - -def _expand_nodelist(nodelist: str) -> List[str]: - result = [] - interval_list = nodelist.split(",") - for interval in interval_list: - match = re.search(r"(.*)\[(.*)\]", interval) - if match: - prefix = match.group(1) - ids = match.group(2) - ids_list = _expand_ids(ids) - result.extend([f"{prefix}{_id}" for _id in ids_list]) - else: - result.append(interval) - return result diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py index 3b5d1d0922..60de701886 100644 --- a/deepmd/descriptor/se_a.py +++ b/deepmd/descriptor/se_a.py @@ -586,6 +586,7 @@ def _filter_lower( [ 0, start_index* 4], [-1, incrs_index* 4] ) shape_i = inputs_i.get_shape().as_list() + natom = tf.shape(inputs_i)[0] # with (natom x nei_type_i) x 4 inputs_reshape = tf.reshape(inputs_i, [-1, 4]) # with (natom x nei_type_i) x 1 @@ -603,7 +604,7 @@ def _filter_lower( net = 'filter_-1_net_' + str(type_i) else: net = 'filter_' + str(type_input) + '_net_' + str(type_i) - return op_module.tabulate_fusion(self.table.data[net].astype(self.filter_np_precision), info, xyz_scatter, tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), last_layer_size = outputs_size[-1]) + return op_module.tabulate_fusion(self.table.data[net].astype(self.filter_np_precision), info, xyz_scatter, tf.reshape(inputs_i, [natom, shape_i[1]//4, 4]), last_layer_size = outputs_size[-1]) else: if (not is_exclude): xyz_scatter = 
embedding_net( @@ -620,11 +621,16 @@ def _filter_lower( uniform_seed = self.uniform_seed) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift else: - w = tf.zeros((outputs_size[0], outputs_size[-1]), dtype=GLOBAL_TF_FLOAT_PRECISION) - xyz_scatter = tf.matmul(xyz_scatter, w) + # we can safely return the final xyz_scatter filled with zero directly + return tf.cast(tf.fill((natom, 4, outputs_size[-1]), 0.), GLOBAL_TF_FLOAT_PRECISION) # natom x nei_type_i x out_size xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1]//4, outputs_size[-1])) - return tf.matmul(tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), xyz_scatter, transpose_a = True) + # When using tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]) below + # [588 24] -> [588 6 4] correct + # but if sel is zero + # [588 0] -> [147 0 4] incorrect; the correct one is [588 0 4] + # So we need to explicitly assign the shape to tf.shape(inputs_i)[0] instead of -1 + return tf.matmul(tf.reshape(inputs_i, [natom, shape_i[1]//4, 4]), xyz_scatter, transpose_a = True) def _filter( @@ -644,6 +650,18 @@ def _filter( shape = inputs.get_shape().as_list() outputs_size = [1] + self.filter_neuron outputs_size_2 = self.n_axis_neuron + all_excluded = all([(type_input, type_i) in self.exclude_types for type_i in range(self.ntypes)]) + if all_excluded: + # all types are excluded so result and qmat should be zeros + # we can safaly return a zero matrix... 
+ # See also https://stackoverflow.com/a/34725458/9567349 + # result: natom x outputs_size x outputs_size_2 + # qmat: natom x outputs_size x 3 + natom = tf.shape(inputs)[0] + result = tf.cast(tf.fill((natom, outputs_size_2, outputs_size[-1]), 0.), GLOBAL_TF_FLOAT_PRECISION) + qmat = tf.cast(tf.fill((natom, outputs_size[-1], 3), 0.), GLOBAL_TF_FLOAT_PRECISION) + return result, qmat + with tf.variable_scope(name, reuse=reuse): start_index = 0 type_i = 0 @@ -665,7 +683,8 @@ def _filter( suffix = "_"+str(type_i)) if type_i == 0: xyz_scatter_1 = ret - else: + elif (type_input, type_i) not in self.exclude_types: + # add zero is meaningless; skip xyz_scatter_1+= ret start_index += self.sel_a[type_i] else : diff --git a/deepmd/descriptor/se_r.py b/deepmd/descriptor/se_r.py index 40bbc21593..f362302f11 100644 --- a/deepmd/descriptor/se_r.py +++ b/deepmd/descriptor/se_r.py @@ -478,11 +478,11 @@ def _filter_r(self, trainable = trainable, uniform_seed = self.uniform_seed) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift + # natom x nei_type_i x out_size + xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1], outputs_size[-1])) else: - w = tf.zeros((outputs_size[0], outputs_size[-1]), dtype=GLOBAL_TF_FLOAT_PRECISION) - xyz_scatter = tf.matmul(xyz_scatter, w) - # natom x nei_type_i x out_size - xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1], outputs_size[-1])) + natom = tf.shape(inputs)[0] + xyz_scatter = tf.cast(tf.fill((natom, shape_i[1], outputs_size[-1]), 0.), GLOBAL_TF_FLOAT_PRECISION) xyz_scatter_total.append(xyz_scatter) # natom x nei x outputs_size diff --git a/deepmd/entrypoints/__init__.py b/deepmd/entrypoints/__init__.py index d92cbf3076..80d0431f8c 100644 --- a/deepmd/entrypoints/__init__.py +++ b/deepmd/entrypoints/__init__.py @@ -17,7 +17,7 @@ "doc_train_input", "freeze", "test", - "train", + "train_dp", "transfer", "compress", "doc_train_input", diff --git a/deepmd/entrypoints/compress.py b/deepmd/entrypoints/compress.py 
index 3c80965bce..6b85999426 100644 --- a/deepmd/entrypoints/compress.py +++ b/deepmd/entrypoints/compress.py @@ -6,7 +6,7 @@ from deepmd.common import j_loader from deepmd.utils.argcheck import normalize -from deepmd.utils.compat import convert_input_v0_v1 +from deepmd.utils.compat import updata_deepmd_input from deepmd.utils.errors import GraphTooLargeError from .freeze import freeze @@ -65,7 +65,7 @@ def compress( """ jdata = j_loader(INPUT) if "model" not in jdata.keys(): - jdata = convert_input_v0_v1(jdata, warning=True, dump="input_v1_compat.json") + jdata = updata_deepmd_input(jdata, warning=True, dump="input_v2_compat.json") jdata["model"]["compress"] = {} jdata["model"]["compress"]["type"] = 'se_e2_a' jdata["model"]["compress"]["compress"] = True diff --git a/deepmd/entrypoints/doc.py b/deepmd/entrypoints/doc.py index 941efd61c2..0cb555e4d1 100644 --- a/deepmd/entrypoints/doc.py +++ b/deepmd/entrypoints/doc.py @@ -1,11 +1,16 @@ """Module that prints train input arguments docstrings.""" -from deepmd.utils.argcheck import gen_doc +from deepmd.utils.argcheck import gen_doc, gen_json __all__ = ["doc_train_input"] -def doc_train_input(): +def doc_train_input(*, out_type: str = "rst", **kwargs): """Print out trining input arguments to console.""" - doc_str = gen_doc(make_anchor=True) + if out_type == "rst": + doc_str = gen_doc(make_anchor=True) + elif out_type == "json": + doc_str = gen_json() + else: + raise RuntimeError("Unsupported out type %s" % out_type) print(doc_str) diff --git a/deepmd/entrypoints/main.py b/deepmd/entrypoints/main.py index cda9856a24..9557976bf6 100644 --- a/deepmd/entrypoints/main.py +++ b/deepmd/entrypoints/main.py @@ -313,12 +313,18 @@ def parse_args(args: Optional[List[str]] = None): ) # * print docs script ************************************************************** - subparsers.add_parser( + parsers_doc = subparsers.add_parser( "doc-train-input", parents=[parser_log], help="print the documentation (in rst format) of input 
training parameters.", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) + parsers_doc.add_argument( + "--out-type", + default="rst", + type=str, + help="The output type" + ) # * make model deviation *********************************************************** parser_model_devi = subparsers.add_parser( @@ -428,7 +434,7 @@ def main(): elif args.command == "compress": compress(**dict_args) elif args.command == "doc-train-input": - doc_train_input() + doc_train_input(**dict_args) elif args.command == "model-devi": make_model_devi(**dict_args) elif args.command == "convert-from": diff --git a/deepmd/entrypoints/train.py b/deepmd/entrypoints/train.py index cffac25947..bb0b6d20c0 100755 --- a/deepmd/entrypoints/train.py +++ b/deepmd/entrypoints/train.py @@ -7,11 +7,11 @@ import logging import time import os -from typing import Dict, TYPE_CHECKING, List, Optional, Any +from typing import Dict, List, Optional, Any import numpy as np from deepmd.common import data_requirement, expand_sys_str, j_loader, j_must_have -from deepmd.env import tf +from deepmd.env import reset_default_tf_session_config from deepmd.infer.data_modifier import DipoleChargeModifier from deepmd.train.run_options import BUILD, CITATION, WELCOME, RunOptions from deepmd.train.trainer import DPTrainer @@ -21,118 +21,11 @@ from deepmd.utils.sess import run_sess from deepmd.utils.neighbor_stat import NeighborStat -if TYPE_CHECKING: - from deepmd.run_options import TFServerV1 - __all__ = ["train"] log = logging.getLogger(__name__) -def create_done_queue( - cluster_spec: tf.train.ClusterSpec, task_index: int -) -> tf.FIFOQueue: - """Create FIFO queue for distributed tasks. 
- - Parameters - ---------- - cluster_spec : tf.train.ClusterSpec - tf cluster specification object - task_index : int - identifying index of a task - - Returns - ------- - tf.FIFOQueue - tf distributed FIFI queue - """ - with tf.device(f"/job:ps/task:{task_index:d}"): - queue = tf.FIFOQueue( - cluster_spec.num_tasks("worker"), - tf.int32, - shared_name=f"done_queue{task_index}", - ) - return queue - - -def wait_done_queue( - cluster_spec: tf.train.ClusterSpec, - server: "TFServerV1", - queue: tf.FIFOQueue, - task_index: int, -): - """Wait until all enqued operation in tf distributed queue are finished. - - Parameters - ---------- - cluster_spec : tf.train.ClusterSpec - tf cluster specification object - server : TFServerV1 - tf server specification object - queue : tf.FIFOQueue - tf distributed queue - task_index : int - identifying index of a task - """ - with tf.Session(server.target) as sess: - for i in range(cluster_spec.num_tasks("worker")): - run_sess(sess, queue.dequeue()) - log.debug(f"ps:{task_index:d} received done from worker:{i:d}") - log.debug(f"ps:{task_index:f} quitting") - - -def connect_done_queue( - cluster_spec: tf.train.ClusterSpec, task_index: int -) -> List[tf.Operation]: - """Create tf FIFO queue filling operations. - - Parameters - ---------- - cluster_spec : tf.train.ClusterSpec - tf cluster specification object - task_index : int - identifying index of a task - - Returns - ------- - List[tf.Operation] - list of tf operations that will populate the queue - """ - done_ops = [] - for i in range(cluster_spec.num_tasks("ps")): - with tf.device(f"/job:ps/task:{i:d}"): - queue = tf.FIFOQueue( - cluster_spec.num_tasks("worker"), tf.int32, shared_name=f"done_queue{i}" - ) - done_ops.append(queue.enqueue(task_index)) - return done_ops - - -def fill_done_queue( - cluster_spec: tf.train.ClusterSpec, - server: "TFServerV1", - done_ops: List[tf.Operation], - task_index: int, -): - """Run specified operations that will fill the tf distributed FIFO queue. 
- - Parameters - ---------- - cluster_spec : tf.train.ClusterSpec - tf cluster specification object - server : TFServerV1 - tf server specification object - done_ops : List[tf.Operation] - a list of tf operations that will fill the queue - task_index : int - identifying index of a task - """ - with tf.Session(server.target) as sess: - for i in range(cluster_spec.num_tasks("ps")): - run_sess(sess, done_ops[i]) - log.debug(f"worker:{task_index:d} sending done to ps:{i:d}") - - def train( *, INPUT: str, @@ -186,34 +79,14 @@ def train( restart=restart, log_path=log_path, log_level=log_level, - mpi_log=mpi_log, - try_distrib=jdata.get("with_distrib", False), + mpi_log=mpi_log ) for message in WELCOME + CITATION + BUILD: log.info(message) run_opt.print_resource_summary() - - if run_opt.is_distrib: - # distributed training - if run_opt.my_job_name == "ps": - queue = create_done_queue(run_opt.cluster_spec, run_opt.my_task_index) - wait_done_queue( - run_opt.cluster_spec, run_opt.server, queue, run_opt.my_task_index - ) - # server.join() - elif run_opt.my_job_name == "worker": - done_ops = connect_done_queue(run_opt.cluster_spec, run_opt.my_task_index) - _do_work(jdata, run_opt) - fill_done_queue( - run_opt.cluster_spec, run_opt.server, done_ops, run_opt.my_task_index - ) - else: - raise RuntimeError("unknown job name") - else: - # serial training - _do_work(jdata, run_opt) + _do_work(jdata, run_opt) def _do_work(jdata: Dict[str, Any], run_opt: RunOptions): @@ -234,6 +107,10 @@ def _do_work(jdata: Dict[str, Any], run_opt: RunOptions): # make necessary checks assert "training" in jdata + # avoid conflict of visible gpus among multipe tf sessions in one process + if run_opt.is_distrib and len(run_opt.gpus or []) > 1: + reset_default_tf_session_config(cpu_only=True) + # init the model model = DPTrainer(jdata, run_opt=run_opt) rcut = model.model.get_rcut() @@ -352,7 +229,7 @@ def get_sel(jdata, rcut): max_rcut = get_rcut(jdata) type_map = get_type_map(jdata) - if len(type_map) 
== 0: + if type_map is not None and len(type_map) == 0: type_map = None train_data = get_data(jdata["training"]["training_data"], max_rcut, type_map, None) train_data.get_batch() @@ -399,11 +276,22 @@ def wrap_up_4(xx): def update_one_sel(jdata, descriptor): + rcut = descriptor['rcut'] + tmp_sel = get_sel(jdata, rcut) if parse_auto_sel(descriptor['sel']) : ratio = parse_auto_sel_ratio(descriptor['sel']) - rcut = descriptor['rcut'] - tmp_sel = get_sel(jdata, rcut) descriptor['sel'] = [int(wrap_up_4(ii * ratio)) for ii in tmp_sel] + else: + # sel is set by user + for ii, (tt, dd) in enumerate(zip(tmp_sel, descriptor['sel'])): + if dd and tt > dd: + # we may skip warning for sel=0, where the user is likely + # to exclude such type in the descriptor + log.warning( + "sel of type %d is not enough! The expected value is " + "not less than %d, but you set it to %d. The accuracy" + " of your model may get worse." %(ii, tt, dd) + ) return descriptor diff --git a/deepmd/env.py b/deepmd/env.py index 5f5c344031..ace9d3b76e 100644 --- a/deepmd/env.py +++ b/deepmd/env.py @@ -32,6 +32,7 @@ "MODEL_VERSION", "SHARED_LIB_MODULE", "default_tf_session_config", + "reset_default_tf_session_config", "op_module", "op_grads_module", ] @@ -117,14 +118,31 @@ def get_tf_session_config() -> Any: """ set_tf_default_nthreads() intra, inter = get_tf_default_nthreads() - return tf.ConfigProto( + config = tf.ConfigProto( intra_op_parallelism_threads=intra, inter_op_parallelism_threads=inter ) + return config default_tf_session_config = get_tf_session_config() +def reset_default_tf_session_config(cpu_only: bool): + """Limit tensorflow session to CPU or not. + + Parameters + ---------- + cpu_only : bool + If enabled, no GPU device is visible to the TensorFlow Session. 
+ """ + global default_tf_session_config + if cpu_only: + default_tf_session_config.device_count['GPU'] = 0 + else: + if 'GPU' in default_tf_session_config.device_count: + del default_tf_session_config.device_count['GPU'] + + def get_module(module_name: str) -> "ModuleType": """Load force module. @@ -235,16 +253,25 @@ def _get_package_constants( op_module = get_module("libop_abi") op_grads_module = get_module("libop_grads") -if GLOBAL_CONFIG["precision"] == "-DHIGH_PREC": +# FLOAT_PREC +dp_float_prec = os.environ.get("DP_INTERFACE_PREC", "high").lower() +if dp_float_prec in ("high", ""): + # default is high GLOBAL_TF_FLOAT_PRECISION = tf.float64 GLOBAL_NP_FLOAT_PRECISION = np.float64 GLOBAL_ENER_FLOAT_PRECISION = np.float64 global_float_prec = "double" -else: +elif dp_float_prec == "low": GLOBAL_TF_FLOAT_PRECISION = tf.float32 GLOBAL_NP_FLOAT_PRECISION = np.float32 GLOBAL_ENER_FLOAT_PRECISION = np.float64 global_float_prec = "float" +else: + raise RuntimeError( + "Unsupported float precision option: %s. Supported: high," + "low. Please set precision with environmental variable " + "DP_INTERFACE_PREC." % dp_float_prec + ) def global_cvt_2_tf_float(xx: tf.Tensor) -> tf.Tensor: diff --git a/deepmd/loggers/loggers.py b/deepmd/loggers/loggers.py index f787ff1e1a..3bb9e9fa4c 100644 --- a/deepmd/loggers/loggers.py +++ b/deepmd/loggers/loggers.py @@ -137,8 +137,7 @@ def setStream(self, stream): def set_log_handles( level: int, log_path: Optional["Path"] = None, - mpi_log: Optional[str] = None, - MPI: Optional["MPI"] = None, + mpi_log: Optional[str] = None ): """Set desired level for package loggers and add file handlers. @@ -154,16 +153,13 @@ def set_log_handles( only from rank==0. `collect` will write messages from all ranks to one file opened under rank==0 and to console. `workers` will open one log file for each worker designated by its rank, console behaviour is the same as for `collect`. - If this argument is specified than also `MPI` object must be passed in. 
- by default None - MPI : Optional[MPI, optional] - `MPI` communicator object, must be specified if `mpi_log` is specified, + If this argument is specified, package 'mpi4py' must be already installed. by default None Raises ------ RuntimeError - if only one of the arguments `mpi_log`, `MPI` is specified + If the argument `mpi_log` is specified, package `mpi4py` is not installed. References ---------- @@ -204,8 +200,12 @@ def set_log_handles( root_log.removeHandler(hdlr) # check if arguments are present - if (mpi_log and not MPI) or (not mpi_log and MPI): - raise RuntimeError("You cannot specify only one of 'mpi_log', 'MPI' arguments") + MPI = None + if mpi_log: + try: + from mpi4py import MPI + except ImportError as e: + raise RuntimeError("You cannot specify 'mpi_log' when mpi4py not installed") from e # * add console handler ************************************************************ ch = logging.StreamHandler() diff --git a/deepmd/train/run_options.py b/deepmd/train/run_options.py index 25029c4308..1ad5fc4574 100644 --- a/deepmd/train/run_options.py +++ b/deepmd/train/run_options.py @@ -11,18 +11,7 @@ from deepmd.loggers import set_log_handles if TYPE_CHECKING: - from mpi4py import MPI - - try: - from typing import Protocol # python >=3.8 - except ImportError: - from typing_extensions import Protocol # type: ignore - - class TFServerV1(Protocol): - """Prococol mimicking parser object.""" - - server_def: tf.train.ServerDef - target: str + import horovod.tensorflow as HVD __all__ = [ @@ -63,137 +52,36 @@ class TFServerV1(Protocol): ) -def _is_distributed(MPI: "MPI") -> bool: - """Check if there are more than one MPI processes. 
- - Parameters - ---------- - MPI : MPI - MPI object - - Returns - ------- - bool - True if we have more than 1 MPI process - """ - return MPI.COMM_WORLD.Get_size() > 1 - - -def _distributed_task_config( - MPI: "MPI", - node_name: str, - node_list_: List[str], - gpu_list: Optional[List[int]] = None, - default_port: int = 2222, -) -> Tuple[Dict[str, List[str]], str, int, str, str]: - """Create configuration for distributed tensorflow session. - - Parameters - ---------- - MPI : mpi4py.MPI - MPI module - node_name : str - the name of current node - node_list_ : List[str] - the list of nodes of the current mpirun - gpu_list : Optional[List[int]], optional - the list of GPUs on each node, by default None - default_port : int, optional - the default port for socket communication, by default 2222 - - Returns - ------- - Tuple[Dict[str, List[str]], str, int, str, str] - cluster specification, job name of this task, index of this task, - hostname:port socket of this task, the device for this task - """ - # setup cluster - node_list = list(set(node_list_)) - node_list.sort() - node_color = node_list.index(node_name) - world_idx = MPI.COMM_WORLD.Get_rank() - node_comm = MPI.COMM_WORLD.Split(node_color, world_idx) - node_task_idx = node_comm.Get_rank() - node_numb_task = node_comm.Get_size() - - socket_list = [] - for ii in node_list: - for jj in range(node_numb_task): - socket_list.append(f"{ii}:{default_port + jj}") - ps_map = socket_list[0:1] - worker_map = socket_list[1:] - - if node_color == 0 and node_task_idx == 0: - my_job = "ps" - my_socket = ps_map[0] - my_task_idx = ps_map.index(my_socket) - else: - my_job = "worker" - my_socket = f"{node_name}:{default_port - node_task_idx}" - assert my_socket in worker_map - my_task_idx = worker_map.index(my_socket) - - # setup gpu/cpu devices - if gpu_list is not None: - numb_gpu = len(gpu_list) - gpu_idx = node_numb_task - node_task_idx - 1 - if gpu_idx >= numb_gpu: - my_device = "cpu:0" # "cpu:%d" % node_task_idx - else: - 
my_device = f"gpu:{gpu_idx:d}" - else: - my_device = "cpu:0" # "cpu:%d" % node_task_idx - - cluster = {"worker": worker_map, "ps": ps_map} - return cluster, my_job, my_task_idx, my_socket, my_device - - class RunOptions: """Class with inf oon how to run training (cluster, MPI and GPU config). Attributes ---------- - cluster: Optional[Dict[str, List[str]]] - cluster informations as dict - cluster_spec: Optional[tf.train.ClusterSpec] - `tf.train.ClusterSpec` or None if training is serial gpus: Optional[List[int]] list of GPUs if any are present else None is_chief: bool in distribured training it is true for tha main MPI process in serail it is always true - my_job_name: str - name of the training job - my_socket: Optional[str] - communication socket for distributed training - my_task_index: int + world_size: int + total worker count + my_rank: int index of the MPI task nodename: str name of the node - num_ps: Optional[int] - number of ps - num_workers: Optional[int] - number of workers - server: Optional[tf.train.Server] - `tf.train.Server` or `None` for serial training + node_list_ : List[str] + the list of nodes of the current mpirun my_device: str deviice type - gpu or cpu """ - cluster: Optional[Dict[str, List[str]]] - cluster_spec: Optional[tf.train.ClusterSpec] gpus: Optional[List[int]] - is_chief: bool - my_job_name: str - my_socket: Optional[str] - my_task_index: int + world_size: int + my_rank: int nodename: str - num_ps: Optional[int] - num_workers: Optional[int] - server: Optional["TFServerV1"] + nodelist: List[int] my_device: str - _MPI: Optional["MPI"] + _HVD: Optional["HVD"] _log_handles_already_set: bool = False def __init__( @@ -202,15 +90,9 @@ def __init__( restart: Optional[str] = None, log_path: Optional[str] = None, log_level: int = 0, - mpi_log: str = "master", - try_distrib: bool = False + mpi_log: str = "master" ): - # distributed tasks - if try_distrib: - self._try_init_mpi() - else: - self.is_distrib = False - self._init_serial() + 
self._try_init_distrib() if all((init_model, restart)): raise RuntimeError( @@ -231,20 +113,24 @@ def __init__( self._setup_logger(Path(log_path) if log_path else None, log_level, mpi_log) + @property + def is_chief(self): + """Whether my rank is 0.""" + return self.my_rank == 0 + def print_resource_summary(self): """Print build and current running cluster configuration summary.""" log.info("---Summary of the training---------------------------------------") if self.is_distrib: log.info("distributed") - log.info(f"ps list: {self.cluster['ps']}") - log.info(f"worker list: {self.cluster['worker']}") - log.info(f"chief on: {self.nodename}") - else: - log.info(f"running on: {self.nodename}") - if self.gpus is None: - log.info(f"CUDA_VISIBLE_DEVICES: unset") - else: - log.info(f"CUDA_VISIBLE_DEVICES: {self.gpus}") + log.info(f"world size: {self.world_size}") + log.info(f"my rank: {self.my_rank}") + log.info(f"node list: {self.nodelist}") + log.info(f"running on: {self.nodename}") + log.info(f"computing device: {self.my_device}") + env_value = os.environ.get('CUDA_VISIBLE_DEVICES', 'unset') + log.info(f"CUDA_VISIBLE_DEVICES: {env_value}") + log.info(f"Count of visible GPU: {len(self.gpus or [])}") intra, inter = get_tf_default_nthreads() log.info(f"num_intra_threads: {intra:d}") log.info(f"num_inter_threads: {inter:d}") @@ -270,84 +156,73 @@ def _setup_logger( console only from rank==0. `collect` will write messages from all ranks to one file opened under rank==0 and to console. `workers` will open one log file for each worker designated by its rank, console behaviour is the same - as for `collect`. If this argument is specified than also `MPI` object must - be passed in. by default None + as for `collect`. 
""" if not self._log_handles_already_set: - if not self._MPI: + if not self._HVD: mpi_log = None - set_log_handles(log_level, log_path, mpi_log=mpi_log, MPI=self._MPI) + set_log_handles(log_level, log_path, mpi_log=mpi_log) self._log_handles_already_set = True log.debug("Log handles were successfully set") else: log.warning( f"Log handles have already been set. It is not advisable to " - f"reset them{', especially when runnig with MPI!' if self._MPI else ''}" + f"reset them{', especially when runnig with MPI!' if self._HVD else ''}" ) - def _try_init_mpi(self): + def _try_init_distrib(self): try: - from mpi4py import MPI + import horovod.tensorflow as HVD + HVD.init() + self.is_distrib = HVD.size() > 1 except ImportError: - raise RuntimeError( - "cannot import mpi4py module, cannot do distributed simulation" - ) + log.warning("Switch to serial execution due to lack of horovod module.") + self.is_distrib = False + + # Do real intialization + if self.is_distrib: + self._init_distributed(HVD) + self._HVD = HVD else: - self.is_distrib = _is_distributed(MPI) - if self.is_distrib: - self._init_distributed(MPI) - self._MPI = MPI - else: - self._init_serial() - self._MPI = None - - def _init_distributed(self, MPI: "MPI"): + self._init_serial() + self._HVD = None + + def _init_distributed(self, HVD: "HVD"): """Initialize settings for distributed training. 
Parameters ---------- - MPI : MPI - MPI object + HVD : HVD + horovod object """ nodename, nodelist, gpus = get_resource() self.nodename = nodename + self.nodelist = nodelist self.gpus = gpus - ( - self.cluster, - self.my_job_name, - self.my_task_index, - self.my_socket, - self.my_device, - ) = _distributed_task_config(MPI, nodename, nodelist, gpus) - self.is_chief = self.my_job_name == "worker" and self.my_task_index == 0 - self.num_ps = len(self.cluster["ps"]) - self.num_workers = len(self.cluster["worker"]) - self.cluster_spec = tf.train.ClusterSpec(self.cluster) - self.server = tf.train.Server( - server_or_cluster_def=self.cluster_spec, - job_name=self.my_job_name, - task_index=self.my_task_index, - ) + self.my_rank = HVD.rank() + self.world_size = HVD.size() + + if gpus is not None: + gpu_idx = HVD.local_rank() + if gpu_idx >= len(gpus): + raise RuntimeError('Count of local processes is larger than that of available GPUs!') + self.my_device = f"gpu:{gpu_idx:d}" + else: + self.my_device = "cpu:0" def _init_serial(self): """Initialize setting for serial training.""" nodename, _, gpus = get_resource() - self.cluster = None - self.cluster_spec = None self.gpus = gpus - self.is_chief = True - self.my_job_name = nodename - self.my_socket = None - self.my_task_index = 0 + self.world_size = 1 + self.my_rank = 0 self.nodename = nodename - self.num_ps = None - self.num_workers = None - self.server = None + self.nodelist = [nodename] if gpus is not None: - self.my_device = "gpu:" + str(gpus[0]) + self.my_device = "gpu:0" else: self.my_device = "cpu:0" - self._MPI = None + self._HVD = None diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 8f283b61cf..4526c2d469 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -6,7 +6,7 @@ import google.protobuf.message import numpy as np from deepmd.env import tf -from deepmd.env import default_tf_session_config +from deepmd.env import get_tf_session_config from deepmd.env import 
GLOBAL_TF_FLOAT_PRECISION from deepmd.env import GLOBAL_ENER_FLOAT_PRECISION from deepmd.fit import EnerFitting, WFCFitting, PolarFittingLocFrame, PolarFittingSeA, GlobalPolarFittingSeA, DipoleFittingSeA @@ -261,9 +261,9 @@ def _init_param(self, jdata): self.save_ckpt = tr_data.get('save_ckpt', 'model.ckpt') self.display_in_training = tr_data.get('disp_training', True) self.timing_in_training = tr_data.get('time_training', True) - self.profiling = tr_data.get('profiling', False) + self.profiling = self.run_opt.is_chief and tr_data.get('profiling', False) self.profiling_file = tr_data.get('profiling_file', 'timeline.json') - self.tensorboard = tr_data.get('tensorboard', False) + self.tensorboard = self.run_opt.is_chief and tr_data.get('tensorboard', False) self.tensorboard_log_dir = tr_data.get('tensorboard_log_dir', 'log') # self.sys_probs = tr_data['sys_probs'] # self.auto_prob_style = tr_data['auto_prob'] @@ -308,15 +308,9 @@ def build (self, = self.neighbor_stat.get_stat(data) self.descrpt.enable_compression(self.min_nbor_dist, self.model_param['compress']['model_file'], self.model_param['compress']['table_config'][0], self.model_param['compress']['table_config'][1], self.model_param['compress']['table_config'][2], self.model_param['compress']['table_config'][3]) - worker_device = "/job:%s/task:%d/%s" % (self.run_opt.my_job_name, - self.run_opt.my_task_index, - self.run_opt.my_device) - - with tf.device(tf.train.replica_device_setter(worker_device = worker_device, - cluster = self.run_opt.cluster_spec)): - self._build_lr() - self._build_network(data) - self._build_training() + self._build_lr() + self._build_network(data) + self._build_training() def _build_lr(self): @@ -362,14 +356,11 @@ def _build_network(self, data): def _build_training(self): trainable_variables = tf.trainable_variables() - optimizer = tf.train.AdamOptimizer(learning_rate = self.learning_rate) - if self.run_opt.is_distrib : - optimizer = tf.train.SyncReplicasOptimizer( - optimizer, - 
replicas_to_aggregate = self.run_opt.cluster_spec.num_tasks("worker"), - total_num_replicas = self.run_opt.cluster_spec.num_tasks("worker"), - name = "sync_replicas") - self.sync_replicas_hook = optimizer.make_session_run_hook(self.run_opt.is_chief) + if self.run_opt.is_distrib: + optimizer = tf.train.AdamOptimizer(learning_rate = self.learning_rate*self.run_opt.world_size) + optimizer = self.run_opt._HVD.DistributedOptimizer(optimizer) + else: + optimizer = tf.train.AdamOptimizer(learning_rate = self.learning_rate) grads = tf.gradients(self.l2_l, trainable_variables) apply_op = optimizer.apply_gradients (zip (grads, trainable_variables), global_step=self.global_step, @@ -378,76 +369,48 @@ def _build_training(self): self.train_op = tf.group(*train_ops) log.info("built training") - def _init_sess_serial(self) : - self.sess = tf.Session(config=default_tf_session_config) - self.saver = tf.train.Saver() - saver = self.saver - if self.run_opt.init_mode == 'init_from_scratch' : - log.info("initialize model from scratch") - init_op = tf.global_variables_initializer() - run_sess(self.sess, init_op) - fp = open(self.disp_file, "w") - fp.close () - elif self.run_opt.init_mode == 'init_from_model' : - log.info("initialize from model %s" % self.run_opt.init_model) - init_op = tf.global_variables_initializer() - run_sess(self.sess, init_op) - saver.restore (self.sess, self.run_opt.init_model) - run_sess(self.sess, self.global_step.assign(0)) - fp = open(self.disp_file, "w") - fp.close () - elif self.run_opt.init_mode == 'restart' : - log.info("restart from model %s" % self.run_opt.restart) - init_op = tf.global_variables_initializer() - run_sess(self.sess, init_op) - saver.restore (self.sess, self.run_opt.restart) - else : - raise RuntimeError ("unkown init mode") - - def _init_sess_distrib(self): - ckpt_dir = os.path.join(os.getcwd(), self.save_ckpt) - assert(_is_subdir(ckpt_dir, os.getcwd())), "the checkpoint dir must be a subdir of the current dir" - if 
self.run_opt.init_mode == 'init_from_scratch' : - log.info("initialize model from scratch") - if self.run_opt.is_chief : - if os.path.exists(ckpt_dir): - shutil.rmtree(ckpt_dir) - if not os.path.exists(ckpt_dir) : - os.makedirs(ckpt_dir) + def _init_session(self): + config = get_tf_session_config() + device, idx = self.run_opt.my_device.split(":", 1) + if device == "gpu": + config.gpu_options.allow_growth = True + config.gpu_options.visible_device_list = idx + self.sess = tf.Session(config=config) + + # Initializes or restore global variables + init_op = tf.global_variables_initializer() + if self.run_opt.is_chief: + self.saver = tf.train.Saver() + if self.run_opt.init_mode == 'init_from_scratch' : + log.info("initialize model from scratch") + run_sess(self.sess, init_op) fp = open(self.disp_file, "w") fp.close () - elif self.run_opt.init_mode == 'init_from_model' : - raise RuntimeError("distributed training does not support %s" % self.run_opt.init_mode) - elif self.run_opt.init_mode == 'restart' : - log.info("restart from model %s" % ckpt_dir) - if self.run_opt.is_chief : - assert(os.path.isdir(ckpt_dir)), "the checkpoint dir %s should exists" % ckpt_dir - else : - raise RuntimeError ("unkown init mode") - - saver = tf.train.Saver(max_to_keep = 1) - self.saver = None - # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5) - # config = tf.ConfigProto(allow_soft_placement=True, - # gpu_options = gpu_options, - # intra_op_parallelism_threads=self.run_opt.num_intra_threads, - # inter_op_parallelism_threads=self.run_opt.num_inter_threads) - config = tf.ConfigProto(intra_op_parallelism_threads=self.run_opt.num_intra_threads, - inter_op_parallelism_threads=self.run_opt.num_inter_threads) - # The stop_hook handles stopping after running given steps - # stop_hook = tf.train.StopAtStepHook(last_step = stop_batch) - # hooks = [self.sync_replicas_hook, stop_hook] - hooks = [self.sync_replicas_hook] - scaffold = tf.train.Scaffold(saver=saver) - # Use monitor 
session for distributed computation - self.sess = tf.train.MonitoredTrainingSession(master = self.run_opt.server.target, - is_chief = self.run_opt.is_chief, - config = config, - hooks = hooks, - scaffold = scaffold, - checkpoint_dir = ckpt_dir) - # , - # save_checkpoint_steps = self.save_freq) + elif self.run_opt.init_mode == 'init_from_model' : + log.info("initialize from model %s" % self.run_opt.init_model) + run_sess(self.sess, init_op) + self.saver.restore (self.sess, self.run_opt.init_model) + run_sess(self.sess, self.global_step.assign(0)) + fp = open(self.disp_file, "w") + fp.close () + elif self.run_opt.init_mode == 'restart' : + log.info("restart from model %s" % self.run_opt.restart) + run_sess(self.sess, init_op) + self.saver.restore (self.sess, self.run_opt.restart) + else : + raise RuntimeError ("unkown init mode") + else: + run_sess(self.sess, init_op) + self.saver = None + + # Ensure variable consistency among tasks when training starts + if self.run_opt.is_distrib: + bcast_op = self.run_opt._HVD.broadcast_global_variables(0) + if self.run_opt.is_chief: + log.info('broadcast global variables to other tasks') + else: + log.info('receive global variables from task#0') + run_sess(self.sess, bcast_op) def train (self, train_data, valid_data=None) : @@ -455,11 +418,9 @@ def train (self, train_data, valid_data=None) : # valid_data = train_data # using training set as validation set. 
stop_batch = self.stop_batch - if self.run_opt.is_distrib : - self._init_sess_distrib() - else : - self._init_sess_serial() + self._init_session() + # Before data shard is enabled, only cheif do evaluation and record it # self.print_head() fp = None if self.run_opt.is_chief : @@ -478,12 +439,12 @@ def train (self, train_data, valid_data=None) : prf_options = None prf_run_metadata = None - if self.profiling : + if self.profiling: prf_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) prf_run_metadata = tf.RunMetadata() # set tensorboard execution environment - if self.tensorboard : + if self.tensorboard: summary_merged_op = tf.summary.merge_all() # Remove TB old logging directory from previous run try: @@ -510,8 +471,9 @@ def train (self, train_data, valid_data=None) : # first round validation: train_batch = train_data.get_batch() if self.display_in_training and is_first_step: - valid_batches = [valid_data.get_batch() for ii in range(self.valid_numb_batch)] if valid_data is not None else None - self.valid_on_the_fly(fp, [train_batch], valid_batches, print_header=True) + if self.run_opt.is_chief: + valid_batches = [valid_data.get_batch() for ii in range(self.valid_numb_batch)] if valid_data is not None else None + self.valid_on_the_fly(fp, [train_batch], valid_batches, print_header=True) is_first_step = False if self.timing_in_training: tic = time.time() @@ -534,25 +496,25 @@ def train (self, train_data, valid_data=None) : if self.display_in_training and (cur_batch % self.disp_freq == 0): if self.timing_in_training: tic = time.time() - valid_batches = [valid_data.get_batch() for ii in range(self.valid_numb_batch)] if valid_data is not None else None - self.valid_on_the_fly(fp, [train_batch], valid_batches) + if self.run_opt.is_chief: + valid_batches = [valid_data.get_batch() for ii in range(self.valid_numb_batch)] if valid_data is not None else None + self.valid_on_the_fly(fp, [train_batch], valid_batches) if self.timing_in_training: toc = time.time() 
test_time = toc - tic log.info("batch %7d training time %.2f s, testing time %.2f s" % (cur_batch, train_time, test_time)) train_time = 0 - if self.save_freq > 0 and cur_batch % self.save_freq == 0 and self.run_opt.is_chief : - if self.saver is not None : - try: - self.saver.save (self.sess, os.getcwd() + "/" + self.save_ckpt) - except google.protobuf.message.DecodeError as e: - raise GraphTooLargeError( - "The graph size exceeds 2 GB, the hard limitation of protobuf." - " Then a DecodeError was raised by protobuf. You should " - "reduce the size of your model." - ) from e - log.info("saved checkpoint %s" % self.save_ckpt) + if self.save_freq > 0 and cur_batch % self.save_freq == 0 and self.saver is not None: + try: + self.saver.save (self.sess, os.getcwd() + "/" + self.save_ckpt) + except google.protobuf.message.DecodeError as e: + raise GraphTooLargeError( + "The graph size exceeds 2 GB, the hard limitation of protobuf." + " Then a DecodeError was raised by protobuf. You should " + "reduce the size of your model." 
+ ) from e + log.info("saved checkpoint %s" % self.save_ckpt) if self.run_opt.is_chief: fp.close () if self.profiling and self.run_opt.is_chief : diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 4dfd552d89..3f75b39394 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -1,5 +1,6 @@ -from dargs import dargs, Argument, Variant +from dargs import dargs, Argument, Variant, ArgumentEncoder from deepmd.common import ACTIVATION_FN_DICT, PRECISION_DICT +import json def list_to_doc(xx): @@ -622,6 +623,13 @@ def gen_doc(*, make_anchor=True, make_link=True, **kwargs): return "\n\n".join(ptr) +def gen_json(**kwargs): + return json.dumps(( + model_args(), + learning_rate_args(), + loss_args(), + training_args(), + ), cls=ArgumentEncoder) def normalize_hybrid_list(hy_list): new_list = [] diff --git a/deepmd/utils/compat.py b/deepmd/utils/compat.py index 861a00439c..e3fd0c3177 100644 --- a/deepmd/utils/compat.py +++ b/deepmd/utils/compat.py @@ -27,8 +27,6 @@ def convert_input_v0_v1( """ output = {} - if "with_distrib" in jdata: - output["with_distrib"] = jdata["with_distrib"] output["model"] = _model(jdata, jdata["use_smooth"]) output["learning_rate"] = _learning_rate(jdata) output["loss"] = _loss(jdata) diff --git a/deepmd/utils/data.py b/deepmd/utils/data.py index fa488f8a33..903190fb54 100644 --- a/deepmd/utils/data.py +++ b/deepmd/utils/data.py @@ -57,8 +57,7 @@ def __init__ (self, if type_map is not None and self.type_map is not None: atom_type_ = [type_map.index(self.type_map[ii]) for ii in self.atom_type] self.atom_type = np.array(atom_type_, dtype = np.int32) - ntypes = len(self.type_map) - self.type_map = type_map[:ntypes] + self.type_map = type_map # make idx map self.idx_map = self._make_idx_map(self.atom_type) # train dirs diff --git a/deepmd/utils/neighbor_stat.py b/deepmd/utils/neighbor_stat.py index 11a466faac..9b867fe85b 100644 --- a/deepmd/utils/neighbor_stat.py +++ b/deepmd/utils/neighbor_stat.py @@ -82,8 +82,20 @@ def 
get_stat(self, self.place_holders['box']: np.array(data_set['box'])[kk].reshape([-1, 9]), self.place_holders['default_mesh']: np.array(data.default_mesh[ii]), }) - dt = np.min(dt) + if dt.size != 0: + dt = np.min(dt) + else: + dt = self.rcut + log.warning("Atoms with no neighbors found in %s. Please make sure it's what you expected."%jj) + if dt < self.min_nbor_dist: + if math.isclose(dt, 0., rel_tol=1e-6): + # it's unexpected that the distance between two atoms is zero + # zero distance will cause nan (#874) + raise RuntimeError( + "Some atoms in %s are overlapping. Please check your" + " training data to remove duplicated atoms." % jj + ) self.min_nbor_dist = dt for ww in range(self.ntypes): var = np.max(mn[:, ww]) diff --git a/doc/getting-started.md b/doc/getting-started.md index 76ed8acd5b..005926bef4 100644 --- a/doc/getting-started.md +++ b/doc/getting-started.md @@ -5,6 +5,7 @@ In this text, we will call the deep neural network that is used to represent the 2. [Train a model](#train-a-model) - [Write the input script](#write-the-input-script) - [Training](#training) + - [Parallel training](#parallel-training) - [Training analysis with Tensorboard](#training-analysis-with-tensorboard) 3. [Freeze a model](#freeze-a-model) 4. [Test a model](#test-a-model) @@ -134,6 +135,57 @@ export TF_INTER_OP_PARALLELISM_THREADS=2 dp train input.json ``` +One can set other environmental variables: + +| Environment variables | Allowed value | Default value | Usage | +| --------------------- | ---------------------- | ------------- | -------------------------- | +| DP_INTERFACE_PREC | `high`, `low` | `high` | Control high (double) or low (float) precision of training. | + + +### Parallel training + +Currently, parallel training is enabled in a sychoronized way with help of [Horovod](https://github.com/horovod/horovod). DeePMD-kit will decide parallel training or not according to MPI context. Thus, there is no difference in your json/yaml input file. 
+ +Testing `examples/water/se_e2_a` on an 8-GPU host, linear acceleration can be observed with increasing number of cards. +| Num of GPU cards | Seconds every 100 samples | Samples per second | Speed up | +| -- | -- | -- | -- | +| 1 | 1.6116 | 62.05 | 1.00 | +| 2 | 1.6310 | 61.31 | 1.98 | +| 4 | 1.6168 | 61.85 | 3.99 | +| 8 | 1.6212 | 61.68 | 7.95 | + +To experience this powerful feature, please install Horovod and [mpi4py](https://github.com/mpi4py/mpi4py) first. For better performance on GPU, please follow tuning steps in [Horovod on GPU](https://github.com/horovod/horovod/blob/master/docs/gpus.rst). +```bash +# By default, MPI is used as communicator. +HOROVOD_WITHOUT_GLOO=1 HOROVOD_WITH_TENSORFLOW=1 pip install horovod mpi4py +``` + +Horovod works in the data-parallel mode resulting in a larger global batch size. For example, the real batch size is 8 when `batch_size` is set to 2 in the input file and you launch 4 workers. Thus, `learning_rate` is automatically scaled by the number of workers for better convergence. Technical details of such heuristic rule are discussed at [Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour](https://arxiv.org/abs/1706.02677). + +With dependencies installed, have a quick try! +```bash +# Launch 4 processes on the same host +CUDA_VISIBLE_DEVICES=4,5,6,7 horovodrun -np 4 \ + dp train --mpi-log=workers input.json +``` + +Need to mention, environment variable `CUDA_VISIBLE_DEVICES` must be set to control parallelism on the occupied host where one process is bound to one GPU card. + +What's more, 2 command-line arguments are defined to control the logging behavior. +``` +optional arguments: + -l LOG_PATH, --log-path LOG_PATH + set log file to log messages to disk, if not + specified, the logs will only be output to console + (default: None) + -m {master,collect,workers}, --mpi-log {master,collect,workers} + Set the manner of logging when running with MPI. 
+ 'master' logs only on main process, 'collect' + broadcasts logs from workers to master and 'workers' + means each process will output its own log (default: + master) +``` + ### Training analysis with Tensorboard If enbled in json/yaml input file DeePMD-kit will create log files which can be @@ -353,7 +405,7 @@ where `e`, `f` and `v` are predicted energy, force and virial of the system, res You can compile `infer_water.cpp` using `gcc`: ```sh -gcc infer_water.cpp -D HIGH_PREC -L $deepmd_root/lib -L $tensorflow_root/lib -I $deepmd_root/include -I $tensorflow_root/lib -Wl,--no-as-needed -ldeepmd_op -ldeepmd -ldeepmd_cc -ltensorflow_cc -ltensorflow_framework -lstdc++ -Wl,-rpath=$deepmd_root/lib -Wl,-rpath=$tensorflow_root/lib -o infer_water +gcc infer_water.cpp -D HIGH_PREC -L $deepmd_root/lib -L $tensorflow_root/lib -I $deepmd_root/include -I $tensorflow_root/include -Wl,--no-as-needed -ldeepmd_op -ldeepmd -ldeepmd_cc -ltensorflow_cc -ltensorflow_framework -lstdc++ -Wl,-rpath=$deepmd_root/lib -Wl,-rpath=$tensorflow_root/lib -o infer_water ``` and then run the program: ```sh @@ -365,9 +417,11 @@ and then run the program: Note that the model for MD simulations is required to be compatible with the DeePMD-kit package. See [Model compatibility](troubleshooting/model-compatability.md) for details. ### Run MD with LAMMPS -Include deepmd in the pair_style -#### Syntax +#### pair_style `deepmd` + +The DeePMD-kit package provides the pair_style `deepmd` + ``` pair_style deepmd models ... keyword value ... ``` @@ -387,14 +441,14 @@ pair_style deepmd models ... keyword value ... 
level = The level parameter for computing the relative model deviation -#### Examples +##### Examples ``` pair_style deepmd graph.pb pair_style deepmd graph.pb fparam 1.2 pair_style deepmd graph_0.pb graph_1.pb graph_2.pb out_file md.out out_freq 10 atomic relative 1.0 ``` -#### Description +##### Description Evaluate the interaction of the system by using [Deep Potential][DP] or [Deep Potential Smooth Edition][DP-SE]. It is noticed that deep potential is not a "pairwise" interaction, but a multi-body interaction. This pair style takes the deep potential defined in a model file that usually has the .pb extension. The model can be trained and frozen by package [DeePMD-kit](https://github.com/deepmodeling/deepmd-kit). @@ -409,11 +463,35 @@ Ef_i = ------------- ``` where `Df_i` is the absolute model deviation of the force on atom `i`, `|f_i|` is the norm of the the force and `level` is provided as the parameter of the keyword `relative`. - -#### Restrictions +##### Restrictions - The `deepmd` pair style is provided in the USER-DEEPMD package, which is compiled from the DeePMD-kit, visit the [DeePMD-kit website](https://github.com/deepmodeling/deepmd-kit) for more information. +#### Compute tensorial properties + +The DeePMD-kit package provides the compute `deeptensor/atom` for computing atomic tensorial properties. + +``` +compute ID group-ID deeptensor/atom model_file +``` +- ID: user-assigned name of the computation +- group-ID: ID of the group of atoms to compute +- deeptensor/atom: the style of this compute +- model_file: the name of the binary model file. 
+ +##### Examples +``` +compute dipole all deeptensor/atom dipole.pb +``` +The result of the compute can be dumped to a trajectory file by +``` +dump 1 all custom 100 water.dump id type c_dipole[1] c_dipole[2] c_dipole[3] +``` + +##### Restrictions +- The `deeptensor/atom` compute is provided in the USER-DEEPMD package, which is compiled from the DeePMD-kit, visit the [DeePMD-kit website](https://github.com/deepmodeling/deepmd-kit) for more information. + + #### Long-range interaction The reciprocal space part of the long-range interaction can be calculated by LAMMPS command `kspace_style`. To use it with DeePMD-kit, one writes ```bash @@ -425,7 +503,7 @@ kspace_modify gewald 0.45 Please notice that the DeePMD does nothing to the direct space part of the electrostatic interaction, because this part is assumed to be fitted in the DeePMD model (the direct space cut-off is thus the cut-off of the DeePMD model). The splitting parameter `gewald` is modified by the `kspace_modify` command. ### Run path-integral MD with i-PI -The i-PI works in a client-server model. The i-PI provides the server for integrating the replica positions of atoms, while the DeePMD-kit provides a client named `dp_ipi` that computes the interactions (including energy, force and virial). The server and client communicates via the Unix domain socket or the Internet socket. Installation instructions of i-PI can be found [here](install.md#install-i-pi). The client can be started by +The i-PI works in a client-server model. The i-PI provides the server for integrating the replica positions of atoms, while the DeePMD-kit provides a client named `dp_ipi` (or `dp_ipi_low` for low precision) that computes the interactions (including energy, force and virial). The server and client communicate via the Unix domain socket or the Internet socket. Installation instructions of i-PI can be found [here](install.md#install-i-pi). 
The client can be started by ```bash i-pi input.xml & dp_ipi water.json diff --git a/doc/install.md b/doc/install.md index a45d8c7d9f..89c2d4ad2c 100644 --- a/doc/install.md +++ b/doc/install.md @@ -21,7 +21,7 @@ Both CPU and GPU version offline packages are avaiable in [the Releases page](ht Some packages are splited into two files due to size limit of GitHub. One may merge them into one after downloading: ```bash -cat deepmd-kit-2.0.0-cuda11.1_gpu-Linux-x86_64.sh.0 deepmd-kit-2.0.0-cuda11.1_gpu-Linux-x86_64.sh.1 > deepmd-kit-2.0.0-cuda11.1_gpu-Linux-x86_64.sh +cat deepmd-kit-2.0.0-cuda11.3_gpu-Linux-x86_64.sh.0 deepmd-kit-2.0.0-cuda11.3_gpu-Linux-x86_64.sh.1 > deepmd-kit-2.0.0-cuda11.3_gpu-Linux-x86_64.sh ``` ### Install with conda @@ -29,18 +29,18 @@ DeePMD-kit is avaiable with [conda](https://github.com/conda/conda). Install [An One may create an environment that contains the CPU version of DeePMD-kit and LAMMPS: ```bash -conda create -n deepmd deepmd-kit=*=*cpu lammps-dp=*=*cpu -c https://conda.deepmodeling.org +conda create -n deepmd deepmd-kit=*=*cpu libdeepmd=*=*cpu lammps-dp -c https://conda.deepmodeling.org ``` Or one may want to create a GPU environment containing [CUDA Toolkit](https://docs.nvidia.com/deploy/cuda-compatibility/index.html#binary-compatibility__table-toolkit-driver): ```bash -conda create -n deepmd deepmd-kit=*=*gpu lammps-dp=*=*gpu cudatoolkit=11.1 -c https://conda.deepmodeling.org -c nvidia +conda create -n deepmd deepmd-kit=*=*gpu libdeepmd=*=*gpu lammps-dp cudatoolkit=11.3 -c https://conda.deepmodeling.org ``` -One could change the CUDA Toolkit version from `11.1` to `10.1` or `10.0`. +One could change the CUDA Toolkit version from `10.1` or `11.3`. 
One may speficy the DeePMD-kit version such as `2.0.0` using ```bash -conda create -n deepmd deepmd-kit=2.0.0=*cpu lammps-dp=2.0.0=*cpu -c https://conda.deepmodeling.org +conda create -n deepmd deepmd-kit=2.0.0=*cpu libdeepmd=2.0.0=*cpu lammps-dp=2.0.0 -c https://conda.deepmodeling.org ``` One may enable the environment using @@ -136,7 +136,6 @@ One may set the following environment variables before executing `pip`: | Environment variables | Allowed value | Default value | Usage | | --------------------- | ---------------------- | ------------- | -------------------------- | | DP_VARIANT | `cpu`, `cuda`, `rocm` | `cpu` | Build CPU variant or GPU variant with CUDA or ROCM support. | -| DP_FLOAT_PREC | `high`, `low` | `high` | Build high (double) or low (float) precision. | | CUDA_TOOLKIT_ROOT_DIR | Path | Detected automatically | The path to the CUDA toolkit directory. | | ROCM_ROOT | Path | Detected automatically | The path to the ROCM toolkit directory. | @@ -201,7 +200,6 @@ One may add the following arguments to `cmake`: | ------------------------ | ------------------- | ------------- | ------------------------| | -DTENSORFLOW_ROOT=<value> | Path | - | The Path to TensorFlow's C++ interface. | | -DCMAKE_INSTALL_PREFIX=<value> | Path | - | The Path where DeePMD-kit will be installed. | -| -DFLOAT_PREC=<value> | `high` or `low` | `high` | Build high (double) or low (float) precision. | | -DUSE_CUDA_TOOLKIT=<value> | `TRUE` or `FALSE` | `FALSE` | If `TRUE`, Build GPU support with CUDA toolkit. | | -DCUDA_TOOLKIT_ROOT_DIR=<value> | Path | Detected automatically | The path to the CUDA toolkit directory. | | -DUSE_ROCM_TOOLKIT=<value> | `TRUE` or `FALSE` | `FALSE` | If `TRUE`, Build GPU support with ROCM toolkit. | @@ -228,7 +226,7 @@ DeePMD-kit provide module for running MD simulation with LAMMPS. Now make the De cd $deepmd_source_dir/source/build make lammps ``` -DeePMD-kit will generate a module called `USER-DEEPMD` in the `build` directory. 
Now download the LAMMPS code (`29Oct2020` or later), and uncompress it: +DeePMD-kit will generate a module called `USER-DEEPMD` in the `build` directory. If you need low precision version, move `env_low.sh` to `env.sh` in the directory. Now download the LAMMPS code (`29Oct2020` or later), and uncompress it: ```bash cd /some/workspace wget https://github.com/lammps/lammps/archive/stable_29Oct2020.tar.gz diff --git a/requirements.txt b/requirements.txt index e3a8f501ab..50b597f2fe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ numpy scipy pyyaml -dargs >= 0.2.2 +dargs >= 0.2.6 +python-hostlist >= 1.21 typing_extensions; python_version < "3.7" diff --git a/setup.py b/setup.py index 7f917ba11d..e48f9a3f3d 100644 --- a/setup.py +++ b/setup.py @@ -52,16 +52,6 @@ else: raise RuntimeError("Unsupported DP_VARIANT option: %s" % dp_variant) -# FLOAT_PREC -dp_float_prec = os.environ.get("DP_FLOAT_PREC", "").lower() -if dp_float_prec in ["high", "low"]: - cmake_args.append("-DFLOAT_PREC:STRING=%s" % dp_float_prec) -elif dp_float_prec == "": - # default is high - cmake_args.append("-DFLOAT_PREC:STRING=high") -else: - raise RuntimeError("Unsupported float precision option: %s" % dp_float_prec) - # get tensorflow spec tf_spec = find_spec("tensorflow") if not tf_spec: @@ -135,7 +125,7 @@ cmake_minimum_required_version="3.0", extras_require={ "test": ["dpdata>=0.1.9", "ase", "pytest", "pytest-cov", "pytest-sugar"], - "docs": ["sphinx", "recommonmark", "sphinx_rtd_theme", "sphinx_markdown_tables", "myst-parser", "breathe", "exhale"], + "docs": ["sphinx<4.1.0", "recommonmark", "sphinx_rtd_theme", "sphinx_markdown_tables", "myst-parser", "breathe", "exhale"], **extras_require, }, entry_points={"console_scripts": ["dp = deepmd.entrypoints.main:main"]}, diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 1c89c3cf25..81acb5bf8f 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -162,17 +162,11 @@ if ((NOT DEFINED CMAKE_BUILD_TYPE) OR 
CMAKE_BUILD_TYPE STREQUAL "") endif () # set op prec -if (DEFINED FLOAT_PREC) - string ( TOLOWER ${FLOAT_PREC} lower_float_prec ) - if (lower_float_prec STREQUAL "high") - set(PREC_DEF "-DHIGH_PREC") - else () - set(PREC_DEF "") - endif () -else () - set(PREC_DEF "-DHIGH_PREC") -endif() -add_definitions (${PREC_DEF}) +set(HIGH_PREC_DEF "HIGH_PREC") +# this defination doesn't work, but leaving it empty will cause error +set(LOW_PREC_DEF "LOW_PREC") +set(HIGH_PREC_VARIANT "") +set(LOW_PREC_VARIANT "_low") # find openmp find_package(OpenMP) diff --git a/source/api_cc/CMakeLists.txt b/source/api_cc/CMakeLists.txt index cfdfce9b0e..4389fdfc92 100644 --- a/source/api_cc/CMakeLists.txt +++ b/source/api_cc/CMakeLists.txt @@ -1,5 +1,4 @@ # libmd -set (libname ${LIB_DEEPMD_CC}) configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/include/version.h.in @@ -18,6 +17,10 @@ endif() file(GLOB LIB_SRC src/*.cc src/*.cpp) file(GLOB INC_SRC include/*.h ${CMAKE_CURRENT_BINARY_DIR}/version.h) + +function(_add_libapicc_variant variant_name prec_def) +set (libname "${LIB_DEEPMD_CC}${variant_name}") + add_library(${libname} SHARED ${LIB_SRC}) if (USE_CUDA_TOOLKIT) @@ -28,6 +31,11 @@ if (USE_ROCM_TOOLKIT) target_link_libraries (${libname} ${ROCM_LIBRARIES}) endif() +set_target_properties( + ${libname} + PROPERTIES + COMPILE_DEFINITIONS ${prec_def} +) install(TARGETS ${libname} DESTINATION lib/) @@ -36,3 +44,7 @@ install( DESTINATION include/deepmd ) +endfunction() + +_add_libapicc_variant("${HIGH_PREC_VARIANT}" "${HIGH_PREC_DEF}") +_add_libapicc_variant("${LOW_PREC_VARIANT}" "${LOW_PREC_DEF}") diff --git a/source/api_cc/src/DeepPot.cc b/source/api_cc/src/DeepPot.cc index 404ee9e1de..a8890498e5 100644 --- a/source/api_cc/src/DeepPot.cc +++ b/source/api_cc/src/DeepPot.cc @@ -1,38 +1,11 @@ #include "DeepPot.h" #include "AtomMap.h" #include +#include "device.h" using namespace tensorflow; using namespace deepmd; -#if GOOGLE_CUDA -#include "cuda_runtime.h" - -#define cudaErrcheck(res) { 
cudaAssert((res), __FILE__, __LINE__); } -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} -#endif - -#if TENSORFLOW_USE_ROCM -#include - -#define hipErrcheck(res) { hipAssert((res), __FILE__, __LINE__); } -inline void hipAssert(hipError_t code, const char *file, int line, bool abort=true) -{ - if (code != hipSuccess) - { - fprintf(stderr,"hip assert: %s %s %d\n", hipGetErrorString(code), file, line); - if (abort) exit(code); - } -} -#endif //TENSORFLOW_USE_ROCM - static std::vector cum_sum (const std::vector & n_sel) { std::vector sec; @@ -218,32 +191,18 @@ init (const std::string & model, const int & gpu_rank, const std::string & file_ else graph_def.ParseFromString(file_content); int gpu_num = -1; - #if GOOGLE_CUDA - cudaGetDeviceCount(&gpu_num); // check current device environment + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + DPGetDeviceCount(gpu_num); // check current device environment if (gpu_num > 0) { options.config.set_allow_soft_placement(true); options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); options.config.mutable_gpu_options()->set_allow_growth(true); - cudaErrcheck(cudaSetDevice(gpu_rank % gpu_num)); + DPErrcheck(DPSetDevice(gpu_rank % gpu_num)); std::string str = "/gpu:"; str += std::to_string(gpu_rank % gpu_num); graph::SetDefaultDevice(str, &graph_def); } - #endif // GOOGLE_CUDA - - #if TENSORFLOW_USE_ROCM - hipGetDeviceCount(&gpu_num); // check current device environment - if (gpu_num > 0) { - options.config.set_allow_soft_placement(true); - options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); - options.config.mutable_gpu_options()->set_allow_growth(true); - hipErrcheck(hipSetDevice(gpu_rank % gpu_num)); - std::string str = "/gpu:"; - str += std::to_string(gpu_rank % gpu_num); - graph::SetDefaultDevice(str, 
&graph_def); - } - #endif // TENSORFLOW_USE_ROCM - + #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM check_status (NewSession(options, &session)); check_status (session->Create(graph_def)); rcut = get_scalar("descrpt_attr/rcut"); @@ -552,13 +511,9 @@ init (const std::vector & models, const int & gpu_rank, const std:: graph_defs.resize(numb_models); int gpu_num = -1; - #if GOOGLE_CUDA - cudaGetDeviceCount(&gpu_num); - #endif // GOOGLE_CUDA - - #if TENSORFLOW_USE_ROCM - hipGetDeviceCount(&gpu_num); - #endif //TENSORFLOW_USE_ROCM + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + DPGetDeviceCount(gpu_num); + #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM SessionOptions options; options.config.set_inter_op_parallelism_threads(num_inter_nthreads); @@ -569,24 +524,14 @@ init (const std::vector & models, const int & gpu_rank, const std:: else graph_defs[ii].ParseFromString(file_contents[ii]); } - #if GOOGLE_CUDA - if (gpu_num > 0) { - options.config.set_allow_soft_placement(true); - options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); - options.config.mutable_gpu_options()->set_allow_growth(true); - cudaErrcheck(cudaSetDevice(gpu_rank % gpu_num)); - } - #endif // GOOGLE_CUDA - - - #if TENSORFLOW_USE_ROCM + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM if (gpu_num > 0) { options.config.set_allow_soft_placement(true); options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.9); options.config.mutable_gpu_options()->set_allow_growth(true); - hipErrcheck(hipSetDevice(gpu_rank % gpu_num)); + DPErrcheck(DPSetDevice(gpu_rank % gpu_num)); } - #endif // TENSORFLOW_USE_ROCM + #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM for (unsigned ii = 0; ii < numb_models; ++ii) { if (gpu_num > 0) { diff --git a/source/api_cc/tests/CMakeLists.txt b/source/api_cc/tests/CMakeLists.txt index 6768ff2ee6..1a5b56fca0 100644 --- a/source/api_cc/tests/CMakeLists.txt +++ b/source/api_cc/tests/CMakeLists.txt @@ -37,7 +37,7 @@ configure_file( set(opname "deepmd_op") 
set(OP_BASE_DIR ${CMAKE_SOURCE_DIR}/../../op) # file(GLOB OP_SRC ${OP_BASE_DIR}/*.cc) -file(GLOB OP_SRC ${OP_BASE_DIR}/prod_force.cc ${OP_BASE_DIR}/prod_virial.cc ${OP_BASE_DIR}/descrpt.cc ${OP_BASE_DIR}/descrpt_se_a_ef.cc ${OP_BASE_DIR}/descrpt_se_a_ef.cc ${OP_BASE_DIR}/descrpt_se_a_ef_para.cc ${OP_BASE_DIR}/descrpt_se_a_ef_vert.cc ${OP_BASE_DIR}/pair_tab.cc ${OP_BASE_DIR}/prod_force_multi_device.cc ${OP_BASE_DIR}/prod_virial_multi_device.cc ${OP_BASE_DIR}/soft_min.cc ${OP_BASE_DIR}/soft_min_force.cc ${OP_BASE_DIR}/soft_min_virial.cc ${OP_BASE_DIR}/ewald_recp.cc ${OP_BASE_DIR}/gelu_multi_device.cc ${OP_BASE_DIR}/map_aparam.cc ${OP_BASE_DIR}/neighbor_stat.cc ${OP_BASE_DIR}/unaggregated_grad.cc ${OP_BASE_DIR}/tabulate_multi_device.cc ${OP_BASE_DIR}/prod_env_mat_multi_device.cc) +file(GLOB OP_SRC ${OP_BASE_DIR}/custom_op.cc ${OP_BASE_DIR}/prod_force.cc ${OP_BASE_DIR}/prod_virial.cc ${OP_BASE_DIR}/descrpt.cc ${OP_BASE_DIR}/descrpt_se_a_ef.cc ${OP_BASE_DIR}/descrpt_se_a_ef.cc ${OP_BASE_DIR}/descrpt_se_a_ef_para.cc ${OP_BASE_DIR}/descrpt_se_a_ef_vert.cc ${OP_BASE_DIR}/pair_tab.cc ${OP_BASE_DIR}/prod_force_multi_device.cc ${OP_BASE_DIR}/prod_virial_multi_device.cc ${OP_BASE_DIR}/soft_min.cc ${OP_BASE_DIR}/soft_min_force.cc ${OP_BASE_DIR}/soft_min_virial.cc ${OP_BASE_DIR}/ewald_recp.cc ${OP_BASE_DIR}/gelu_multi_device.cc ${OP_BASE_DIR}/map_aparam.cc ${OP_BASE_DIR}/neighbor_stat.cc ${OP_BASE_DIR}/unaggregated_grad.cc ${OP_BASE_DIR}/tabulate_multi_device.cc ${OP_BASE_DIR}/prod_env_mat_multi_device.cc) add_library(${opname} SHARED ${OP_SRC}) list (APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/../../cmake/) diff --git a/source/cmake/cmake_lammps.cmake.in b/source/cmake/cmake_lammps.cmake.in index bee237a1c7..44ad206d9c 100644 --- a/source/cmake/cmake_lammps.cmake.in +++ b/source/cmake/cmake_lammps.cmake.in @@ -13,7 +13,12 @@ endforeach () file ( INSTALL DESTINATION "${LMP_INSTALL_PREFIX}" TYPE FILE - FILES "@CMAKE_BINARY_DIR@/lmp/env.sh" + FILES 
"@CMAKE_BINARY_DIR@/lmp/env@HIGH_PREC_VARIANT@.sh" +) +file ( + INSTALL DESTINATION "${LMP_INSTALL_PREFIX}" + TYPE FILE + FILES "@CMAKE_BINARY_DIR@/lmp/env@LOW_PREC_VARIANT@.sh" ) file ( diff --git a/source/config/run_config.ini b/source/config/run_config.ini index bb04319e47..87f54ef169 100644 --- a/source/config/run_config.ini +++ b/source/config/run_config.ini @@ -8,5 +8,4 @@ TF_INCLUDE_DIR = @TensorFlow_INCLUDE_DIRS@ TF_LIBS = @TensorFlow_LIBRARY@ TF_VERSION = @TENSORFLOW_VERSION@ TF_CXX11_ABI_FLAG = @OP_CXX_ABI@ -PRECISION = @PREC_DEF@ MODEL_VERSION=@MODEL_VERSION@ diff --git a/source/install/build_cc.sh b/source/install/build_cc.sh index 2a606c61f8..dafcde7606 100755 --- a/source/install/build_cc.sh +++ b/source/install/build_cc.sh @@ -1,10 +1,5 @@ set -e -if [ -z "$FLOAT_PREC" ] -then - FLOAT_PREC=high -fi - if [ "$DP_VARIANT" == "cuda" ] then CUDA_ARGS="-DUSE_CUDA_TOOLKIT=TRUE" @@ -25,7 +20,7 @@ NPROC=$(nproc --all) BUILD_TMP_DIR=${SCRIPT_PATH}/../build mkdir -p ${BUILD_TMP_DIR} cd ${BUILD_TMP_DIR} -cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DFLOAT_PREC=${FLOAT_PREC} -DINSTALL_TENSORFLOW=TRUE ${CUDA_ARGS} .. +cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DINSTALL_TENSORFLOW=TRUE ${CUDA_ARGS} .. 
make -j${NPROC} make install diff --git a/source/install/build_lammps.sh b/source/install/build_lammps.sh index a4fee95f6f..6bcec33685 100755 --- a/source/install/build_lammps.sh +++ b/source/install/build_lammps.sh @@ -6,6 +6,14 @@ if [ -z "$FLOAT_PREC" ] then FLOAT_PREC=high fi + +if [ ${FLOAT_PREC} == "high" ]; then + PREC_DEF="-DHIGH_PREC" + PREC_SUFFIX="" +else + PREC_DEF="-DLOW_PREC" + PREC_SUFFIX="_low" +fi #------------------ SCRIPT_PATH=$(dirname $(realpath -s $0)) @@ -43,10 +51,7 @@ cp -r ${BUILD_TMP_DIR2}/USER-DEEPMD/* ${BUILD_TMP_DIR}/lammps-${LAMMPS_VERSION}/ mkdir -p ${BUILD_TMP_DIR}/lammps-${LAMMPS_VERSION}/build cd ${BUILD_TMP_DIR}/lammps-${LAMMPS_VERSION}/build -if [ ${FLOAT_PREC} == "high" ]; then - export PREC_DEF="-DHIGH_PREC" -fi -cmake -C ../cmake/presets/all_off.cmake -D PKG_USER-DEEPMD=ON -D PKG_KSPACE=ON -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -D CMAKE_CXX_FLAGS="${PREC_DEF} -I${INSTALL_PREFIX}/include -L${INSTALL_PREFIX}/lib -Wl,--no-as-needed -lrt -ldeepmd_op -ldeepmd -ldeepmd_cc -ltensorflow_cc -ltensorflow_framework -Wl,-rpath=${INSTALL_PREFIX}/lib" ../cmake +cmake -C ../cmake/presets/all_off.cmake -D PKG_USER-DEEPMD=ON -D PKG_KSPACE=ON -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -D CMAKE_CXX_FLAGS="${PREC_DEF} -I${INSTALL_PREFIX}/include -L${INSTALL_PREFIX}/lib -Wl,--no-as-needed -lrt -ldeepmd_op -ldeepmd -ldeepmd_cc${PREC_SUFFIX} -ltensorflow_cc -ltensorflow_framework -Wl,-rpath=${INSTALL_PREFIX}/lib" ../cmake make -j${NPROC} make install diff --git a/source/ipi/CMakeLists.txt b/source/ipi/CMakeLists.txt index c60122da78..cdb3f80b08 100644 --- a/source/ipi/CMakeLists.txt +++ b/source/ipi/CMakeLists.txt @@ -4,24 +4,32 @@ list (APPEND MD_INCLUDE_PATH "include") include_directories (${MD_INCLUDE_PATH}) file(GLOB IN_SRC src/*.cc src/*.c) -add_library(${LIB_DEEPMD_IPI} SHARED ${IN_SRC}) + +function(_add_ipi_variant variant_name prec_def) +set (ipiname "dp_ipi${variant_name}") +set (libipiname "${LIB_DEEPMD_IPI}${variant_name}") 
+add_library(${libipiname} SHARED ${IN_SRC}) set(DRIVER_SOURCE_FILES driver.cc) -add_executable(dp_ipi ${DRIVER_SOURCE_FILES}) -target_link_libraries(dp_ipi ${LIB_DEEPMD_IPI} ${LIB_DEEPMD_OP} ${LIB_DEEPMD_CC} ${LIB_DEEPMD} ${TensorFlow_LIBRARY}) +add_executable(${ipiname} ${DRIVER_SOURCE_FILES}) +target_link_libraries(${ipiname} ${libipiname} ${LIB_DEEPMD_OP} ${LIB_DEEPMD_CC}${variant_name} ${LIB_DEEPMD} ${TensorFlow_LIBRARY}) set_target_properties( - dp_ipi + ${ipiname} PROPERTIES LINK_FLAGS "-Wl,-rpath,'$ORIGIN'/../lib -Wl,-z,defs" INSTALL_RPATH "$ORIGIN/../lib:${TensorFlow_LIBRARY_PATH}" + COMPILE_DEFINITIONS ${prec_def} ) install( - TARGETS ${LIB_DEEPMD_IPI} + TARGETS ${libipiname} DESTINATION lib/ ) install( - TARGETS dp_ipi + TARGETS ${ipiname} DESTINATION bin/ ) +endfunction() +_add_ipi_variant("${HIGH_PREC_VARIANT}" "${HIGH_PREC_DEF}") +_add_ipi_variant("${LOW_PREC_VARIANT}" "${LOW_PREC_DEF}") \ No newline at end of file diff --git a/source/lib/include/SimulationRegion_Impl.h b/source/lib/include/SimulationRegion_Impl.h index 5b7b8248fd..528402b7d6 100644 --- a/source/lib/include/SimulationRegion_Impl.h +++ b/source/lib/include/SimulationRegion_Impl.h @@ -6,6 +6,7 @@ #include #include #include +#include "errors.h" // using namespace std; @@ -502,7 +503,7 @@ computeVolume() boxt[0*3+2] * (boxt[1*3+0]*boxt[2*3+1] - boxt[2*3+0]*boxt[1*3+1]); volumei = static_cast(1.)/volume; if (volume < 0) { - throw std::runtime_error("Negative volume detected. Please make sure the simulation cell obeys the right-hand rule."); + throw deepmd::deepmd_exception("Negative volume detected. 
Please make sure the simulation cell obeys the right-hand rule."); } } diff --git a/source/lib/include/errors.h b/source/lib/include/errors.h new file mode 100644 index 0000000000..fe0a21fc50 --- /dev/null +++ b/source/lib/include/errors.h @@ -0,0 +1,20 @@ +#pragma once + +#include +#include + +namespace deepmd{ + struct + deepmd_exception: public std::runtime_error { + public: + deepmd_exception(): runtime_error("DeePMD-kit Error!") {}; + deepmd_exception(const std::string& msg): runtime_error(std::string("DeePMD-kit Error: ") + msg) {}; + }; + + struct + deepmd_exception_oom: public std::runtime_error{ + public: + deepmd_exception_oom(): runtime_error("DeePMD-kit OOM!") {}; + deepmd_exception_oom(const std::string& msg): runtime_error(std::string("DeePMD-kit OOM: ") + msg) {}; + }; +}; \ No newline at end of file diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h index 8464a8f46c..8a2b617c95 100644 --- a/source/lib/include/gpu_cuda.h +++ b/source/lib/include/gpu_cuda.h @@ -3,15 +3,16 @@ #include #include #include +#include "errors.h" #define GPU_MAX_NBOR_SIZE 4096 -#define cudaErrcheck(res) {cudaAssert((res), __FILE__, __LINE__);} -inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { +#define DPErrcheck(res) {DPAssert((res), __FILE__, __LINE__);} +inline void DPAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ if (code != cudaSuccess) { fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); if (code == 2) { // out of memory - // TODO: I have no idea how to thorw errors back to Python interface fprintf(stderr, "Your memory is not enough, thus an error has been raised " \ "above. You need to take the following actions:\n" \ "1. Check if the network size of the model is too large.\n" \ @@ -21,18 +22,19 @@ inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort= "4. 
Check if another program is using the same GPU by execuating `nvidia-smi`. " \ "The usage of GPUs is controlled by `CUDA_VISIBLE_DEVICES` " \ "environment variable.\n"); + if (abort) throw deepmd::deepmd_exception_oom("CUDA Assert"); } - if (abort) exit(code); + if (abort) throw deepmd::deepmd_exception("CUDA Assert"); } } #define nborErrcheck(res) {nborAssert((res), __FILE__, __LINE__);} -inline void nborAssert(cudaError_t code, const char *file, int line, bool abort=true) { +inline void nborAssert(cudaError_t code, const char *file, int line, bool abort=true) +{ if (code != cudaSuccess) { fprintf(stderr,"cuda assert: %s %s %d\n", "DeePMD-kit:\tillegal nbor list sorting", file, line); if (code == 2) { // out of memory - // TODO: I have no idea how to thorw errors back to Python interface fprintf(stderr, "Your memory is not enough, thus an error has been raised " \ "above. You need to take the following actions:\n" \ "1. Check if the network size of the model is too large.\n" \ @@ -42,8 +44,9 @@ inline void nborAssert(cudaError_t code, const char *file, int line, bool abort= "4. Check if another program is using the same GPU by execuating `nvidia-smi`. 
" \ "The usage of GPUs is controlled by `CUDA_VISIBLE_DEVICES` " \ "environment variable.\n"); + if (abort) throw deepmd::deepmd_exception_oom("CUDA Assert"); } - if (abort) exit(code); + if (abort) throw deepmd::deepmd_exception("CUDA Assert"); } } @@ -65,12 +68,17 @@ static __inline__ __device__ double atomicAdd( #endif namespace deepmd { + +inline void DPGetDeviceCount(int &gpu_num) { cudaGetDeviceCount(&gpu_num) ;} + +inline cudaError_t DPSetDevice(int rank) { return cudaSetDevice(rank); } + template void memcpy_host_to_device( FPTYPE * device, const std::vector &host) { - cudaErrcheck(cudaMemcpy(device, &host[0], sizeof(FPTYPE) * host.size(), cudaMemcpyHostToDevice)); + DPErrcheck(cudaMemcpy(device, &host[0], sizeof(FPTYPE) * host.size(), cudaMemcpyHostToDevice)); } template @@ -79,7 +87,7 @@ void memcpy_host_to_device( const FPTYPE * host, const int size) { - cudaErrcheck(cudaMemcpy(device, host, sizeof(FPTYPE) * size, cudaMemcpyHostToDevice)); + DPErrcheck(cudaMemcpy(device, host, sizeof(FPTYPE) * size, cudaMemcpyHostToDevice)); } template @@ -87,7 +95,7 @@ void memcpy_device_to_host( const FPTYPE * device, std::vector &host) { - cudaErrcheck(cudaMemcpy(&host[0], device, sizeof(FPTYPE) * host.size(), cudaMemcpyDeviceToHost)); + DPErrcheck(cudaMemcpy(&host[0], device, sizeof(FPTYPE) * host.size(), cudaMemcpyDeviceToHost)); } template @@ -96,7 +104,7 @@ void memcpy_device_to_host( FPTYPE * host, const int size) { - cudaErrcheck(cudaMemcpy(host, device, sizeof(FPTYPE) * size, cudaMemcpyDeviceToHost)); + DPErrcheck(cudaMemcpy(host, device, sizeof(FPTYPE) * size, cudaMemcpyDeviceToHost)); } template @@ -104,7 +112,7 @@ void malloc_device_memory( FPTYPE * &device, const std::vector &host) { - cudaErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * host.size())); + DPErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * host.size())); } template @@ -112,7 +120,7 @@ void malloc_device_memory( FPTYPE * &device, const int size) { - cudaErrcheck(cudaMalloc((void 
**)&device, sizeof(FPTYPE) * size)); + DPErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * size)); } template @@ -120,7 +128,7 @@ void malloc_device_memory_sync( FPTYPE * &device, const std::vector &host) { - cudaErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * host.size())); + DPErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * host.size())); memcpy_host_to_device(device, host); } @@ -130,7 +138,7 @@ void malloc_device_memory_sync( const FPTYPE * host, const int size) { - cudaErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * size)); + DPErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * size)); memcpy_host_to_device(device, host, size); } @@ -139,7 +147,7 @@ void delete_device_memory( FPTYPE * &device) { if (device != NULL) { - cudaErrcheck(cudaFree(device)); + DPErrcheck(cudaFree(device)); } } @@ -149,6 +157,6 @@ void memset_device_memory( const FPTYPE var, const int size) { - cudaErrcheck(cudaMemset(device, var, sizeof(FPTYPE) * size)); + DPErrcheck(cudaMemset(device, var, sizeof(FPTYPE) * size)); } } // end of namespace deepmd \ No newline at end of file diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h index ee3e88ee9e..b6439c3bb8 100644 --- a/source/lib/include/gpu_rocm.h +++ b/source/lib/include/gpu_rocm.h @@ -5,14 +5,15 @@ #include //#include //#include +#include "errors.h" #define GPU_MAX_NBOR_SIZE 4096 -#define hipErrcheck(res) { hipAssert((res), __FILE__, __LINE__); } -inline void hipAssert(hipError_t code, const char *file, int line, bool abort=true) { +#define DPErrcheck(res) { DPAssert((res), __FILE__, __LINE__); } +inline void DPAssert(hipError_t code, const char *file, int line, bool abort=true) { if (code != hipSuccess) { fprintf(stderr,"hip assert: %s %s %d\n", hipGetErrorString(code), file, line); - if (abort) exit(code); + if (abort) throw deepmd::deepmd_exception("CUDA Assert"); } } @@ -20,17 +21,22 @@ inline void hipAssert(hipError_t code, const char *file, int line, bool abort=tr inline void 
nborAssert(hipError_t code, const char *file, int line, bool abort=true) { if (code != hipSuccess) { fprintf(stderr,"hip assert: %s %s %d\n", "DeePMD-kit:\tillegal nbor list sorting", file, line); - if (abort) exit(code); + if (abort) throw deepmd::deepmd_exception("CUDA Assert"); } } + namespace deepmd { +inline void DPGetDeviceCount(int &gpu_num) { hipGetDeviceCount(&gpu_num) ;} + +inline hipError_t DPSetDevice(int rank) { return hipSetDevice(rank); } + template void memcpy_host_to_device( FPTYPE * device, std::vector &host) { - hipErrcheck(hipMemcpy(device, &host[0], sizeof(FPTYPE) * host.size(), hipMemcpyHostToDevice)); + DPErrcheck(hipMemcpy(device, &host[0], sizeof(FPTYPE) * host.size(), hipMemcpyHostToDevice)); } template @@ -39,7 +45,7 @@ void memcpy_host_to_device( const FPTYPE * host, const int size) { - hipErrcheck(hipMemcpy(device, host, sizeof(FPTYPE) * size, hipMemcpyHostToDevice)); + DPErrcheck(hipMemcpy(device, host, sizeof(FPTYPE) * size, hipMemcpyHostToDevice)); } template @@ -47,7 +53,7 @@ void memcpy_device_to_host( FPTYPE * device, std::vector &host) { - hipErrcheck(hipMemcpy(&host[0], device, sizeof(FPTYPE) * host.size(), hipMemcpyDeviceToHost)); + DPErrcheck(hipMemcpy(&host[0], device, sizeof(FPTYPE) * host.size(), hipMemcpyDeviceToHost)); } template void memcpy_device_to_host( @@ -55,7 +61,7 @@ void memcpy_device_to_host( FPTYPE * host, const int size) { - hipErrcheck(hipMemcpy(host, device, sizeof(FPTYPE) * size, hipMemcpyDeviceToHost)); + DPErrcheck(hipMemcpy(host, device, sizeof(FPTYPE) * size, hipMemcpyDeviceToHost)); } template @@ -63,7 +69,7 @@ void malloc_device_memory( FPTYPE * &device, std::vector &host) { - hipErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * host.size())); + DPErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * host.size())); } template @@ -71,7 +77,7 @@ void malloc_device_memory( FPTYPE * &device, const int size) { - hipErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * size)); + 
DPErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * size)); } template @@ -79,7 +85,7 @@ void malloc_device_memory_sync( FPTYPE * &device, std::vector &host) { - hipErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * host.size())); + DPErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * host.size())); memcpy_host_to_device(device, host); } template @@ -88,7 +94,7 @@ void malloc_device_memory_sync( const FPTYPE * host, const int size) { - hipErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * size)); + DPErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * size)); memcpy_host_to_device(device, host, size); } @@ -97,7 +103,7 @@ void delete_device_memory( FPTYPE * &device) { if (device != NULL) { - hipErrcheck(hipFree(device)); + DPErrcheck(hipFree(device)); } } @@ -107,7 +113,7 @@ void memset_device_memory( const FPTYPE var, const int size) { - hipErrcheck(hipMemset(device,var,sizeof(FPTYPE)*size)); + DPErrcheck(hipMemset(device,var,sizeof(FPTYPE)*size)); } } diff --git a/source/lib/include/neighbor_list.h b/source/lib/include/neighbor_list.h index bc717255b9..e0901c26d0 100644 --- a/source/lib/include/neighbor_list.h +++ b/source/lib/include/neighbor_list.h @@ -63,16 +63,25 @@ build_nlist_cpu( const int & mem_size, const float & rcut); -#if GOOGLE_CUDA -void convert_nlist_gpu_cuda( +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +void convert_nlist_gpu_device( InputNlist & gpu_nlist, InputNlist & cpu_nlist, int* & gpu_memory, const int & max_nbor_size); -void free_nlist_gpu_cuda( +void free_nlist_gpu_device( InputNlist & gpu_nlist); +void use_nlist_map( + int * nlist, + const int * nlist_map, + const int nloc, + const int nnei); + +#endif //GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#if GOOGLE_CUDA // build neighbor list. 
// outputs // nlist, max_list_size @@ -96,25 +105,10 @@ build_nlist_gpu( const int & mem_size, const float & rcut); -void use_nlist_map( - int * nlist, - const int * nlist_map, - const int nloc, - const int nnei); - #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM -void convert_nlist_gpu_rocm( - InputNlist & gpu_nlist, - InputNlist & cpu_nlist, - int* & gpu_memory, - const int & max_nbor_size); - -void free_nlist_gpu_rocm( - InputNlist & gpu_nlist); - // build neighbor list. // outputs // nlist, max_list_size @@ -137,12 +131,6 @@ build_nlist_gpu_rocm( const int & nall, const int & mem_size, const float & rcut); - -void use_nlist_map( - int * nlist, - const int * nlist_map, - const int nloc, - const int nnei); #endif // TENSORFLOW_USE_ROCM diff --git a/source/lib/src/cuda/CMakeLists.txt b/source/lib/src/cuda/CMakeLists.txt index 71afbee9b6..2d9aa03cf6 100644 --- a/source/lib/src/cuda/CMakeLists.txt +++ b/source/lib/src/cuda/CMakeLists.txt @@ -15,7 +15,12 @@ SET(CMAKE_CUDA_STANDARD 11) # nvcc -o libdeepmd_op_cuda.so -I/usr/local/cub-1.8.0 -rdc=true -DHIGH_PREC=true -gencode arch=compute_61,code=sm_61 -shared -Xcompiler -fPIC deepmd_op.cu -L/usr/local/cuda/lib64 -lcudadevrt # very important here! Include path to cub. 
# for searching device compute capability, https://developer.nvidia.com/cuda-gpus + +# cub has been included in CUDA Toolkit 11, we do not need to include it any more +# see https://github.com/NVIDIA/cub +if (${CUDA_VERSION_MAJOR} LESS_EQUAL "10") include_directories(cub) +endif () message(STATUS "CUDA major version is " ${CUDA_VERSION_MAJOR}) diff --git a/source/lib/src/cuda/coord.cu b/source/lib/src/cuda/coord.cu index c11609c57f..982e603f42 100644 --- a/source/lib/src/cuda/coord.cu +++ b/source/lib/src/cuda/coord.cu @@ -291,14 +291,20 @@ void compute_int_data( const int nblock_loc=(nloc+TPB-1)/TPB; _fill_idx_cellmap<<>>(idx_cellmap, idx_cellmap_noshift, in_c, rec_boxt, nat_stt, nat_end, ext_stt, ext_end, nloc); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int nblock_loc_cellnum=(loc_cellnum+TPB-1)/TPB; _fill_loc_cellnum_map<<>>(temp_idx_order, loc_cellnum_map, idx_cellmap_noshift, nloc, loc_cellnum); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int nblock_total_cellnum=(total_cellnum+TPB-1)/TPB; _fill_total_cellnum_map<<>>(total_cellnum_map, mask_cellnum_map, cell_map, cell_shift_map, nat_stt, nat_end, ext_stt, ext_end, loc_cellnum_map, total_cellnum); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } void build_loc_clist( @@ -313,6 +319,8 @@ void build_loc_clist( const int * sec_loc_cellnum_map=temp_idx_order+nloc+loc_cellnum+2*total_cellnum+total_cellnum+3*total_cellnum; int * loc_clist=int_data+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3+loc_cellnum+1+total_cellnum+1; _build_loc_clist<<>>(loc_clist, idx_cellmap_noshift, temp_idx_order, sec_loc_cellnum_map, nloc); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -340,6 +348,8 @@ void copy_coord( const FPTYPE *rec_boxt = region.rec_boxt; _copy_coord<<>>(out_c, out_t, mapping, in_c, in_t, cell_map, cell_shift_map, sec_loc_cellnum_map, sec_total_cellnum_map, loc_clist, nloc, 
nall, total_cellnum, boxt, rec_boxt); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } namespace deepmd { @@ -354,6 +364,8 @@ normalize_coord_gpu( const FPTYPE * rec_boxt=region.rec_boxt; const int nblock=(natom+TPB-1)/TPB; normalize_one<<>>(coord, boxt, rec_boxt, natom); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } // int_data(temp cuda memory):idx_map,idx_map_noshift,temp_idx_order,loc_cellnum_map,total_cellnum_map,mask_cellnum_map, @@ -377,7 +389,7 @@ copy_coord_gpu( { compute_int_data(int_data, in_c, cell_info, region, nloc, loc_cellnum, total_cellnum); int * int_data_cpu=new int [loc_cellnum+2*total_cellnum+loc_cellnum+1+total_cellnum+1];//loc_cellnum_map,total_cellnum_map,mask_cellnum_map,sec_loc_cellnum_map,sec_total_cellnum_map - cudaErrcheck(cudaMemcpy(int_data_cpu, int_data+3*nloc, sizeof(int) * (loc_cellnum + 2 * total_cellnum), cudaMemcpyDeviceToHost)); + DPErrcheck(cudaMemcpy(int_data_cpu, int_data+3*nloc, sizeof(int) * (loc_cellnum + 2 * total_cellnum), cudaMemcpyDeviceToHost)); int * loc_cellnum_map=int_data_cpu; int * total_cellnum_map=loc_cellnum_map+loc_cellnum; int * mask_cellnum_map=total_cellnum_map+total_cellnum; @@ -399,7 +411,7 @@ copy_coord_gpu( return 1; } else{ - cudaErrcheck(cudaMemcpy(int_data+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3, + DPErrcheck(cudaMemcpy(int_data+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3, sec_loc_cellnum_map, sizeof(int) * (loc_cellnum+1+total_cellnum+1), cudaMemcpyHostToDevice)); delete[] int_data_cpu; build_loc_clist(int_data, nloc, loc_cellnum, total_cellnum); diff --git a/source/lib/src/cuda/gelu.cu b/source/lib/src/cuda/gelu.cu index ba9cdad4b7..ca96751895 100644 --- a/source/lib/src/cuda/gelu.cu +++ b/source/lib/src/cuda/gelu.cu @@ -62,6 +62,8 @@ void gelu_gpu_cuda( const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu<<>>(out, xx, size); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template 
@@ -78,6 +80,8 @@ void gelu_grad_gpu_cuda( const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu_grad<<>>(out, xx, dy, size); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -95,6 +99,8 @@ void gelu_grad_grad_gpu_cuda( const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; gelu_grad_grad<<>>(out, xx, dy, dy_2, size); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template void gelu_gpu_cuda(float * out, const float * x, const int size); diff --git a/source/lib/src/cuda/neighbor_list.cu b/source/lib/src/cuda/neighbor_list.cu index 33bf33aa3c..66bd122079 100644 --- a/source/lib/src/cuda/neighbor_list.cu +++ b/source/lib/src/cuda/neighbor_list.cu @@ -124,7 +124,7 @@ int build_nlist_gpu( int * ilist = nlist.ilist; int * numneigh = nlist.numneigh; int ** firstneigh = nlist.firstneigh; - cudaErrcheck(cudaMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size)); + DPErrcheck(cudaMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size)); int * temp_nlist = nlist_data; //nloc*mem_size int * nei_order = temp_nlist + nloc * mem_size; nlist.inum = nloc; @@ -141,6 +141,8 @@ int build_nlist_gpu( nloc, nall, mem_size); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int nblock_ = (nloc+TPB-1)/TPB; scan_nlist<<>>( numneigh, @@ -149,15 +151,18 @@ int build_nlist_gpu( mem_size, nloc, nall); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); fill_nlist<<>>( firstneigh, temp_nlist, nei_order, mem_size, - nall - ); + nall); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); int * numneigh_host = new int[nloc]; - cudaErrcheck(cudaMemcpy(numneigh_host, numneigh, sizeof(int) * nloc, cudaMemcpyDeviceToHost)); + DPErrcheck(cudaMemcpy(numneigh_host, numneigh, sizeof(int) * nloc, cudaMemcpyDeviceToHost)); int max_nei = 0; for(int ii=0;iimax_nei)max_nei=numneigh_host[ii]; @@ -177,6 +182,8 @@ void use_nlist_map( dim3 
block_grid(nloc, nblock); dim3 thread_grid(1, TPB); map_nlist<<>>(nlist, nlist_map, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template int build_nlist_gpu(InputNlist & nlist, int * max_list_size, int * nlist_data, const float * c_cpy, const int & nloc, const int & nall, const int & mem_size, const float & rcut); diff --git a/source/lib/src/cuda/prod_env_mat.cu b/source/lib/src/cuda/prod_env_mat.cu index ccea65d448..ce9bd82638 100644 --- a/source/lib/src/cuda/prod_env_mat.cu +++ b/source/lib/src/cuda/prod_env_mat.cu @@ -144,6 +144,29 @@ __global__ void format_nlist_fill_a( } } +template +__global__ void fill_nei_iter( + int * nei_iter_dev, + const FPTYPE * key, + const int nloc, + const int max_nbor_size, + const int sec_size) +{ + int row = blockIdx.x; + int col = blockIdx.y * blockDim.x + threadIdx.x; + const FPTYPE * key_out = key + nloc * max_nbor_size + row * max_nbor_size; + int nei_type_cur = -1, nbor_idx_cur = 0; + int nei_type_pre = -1, nbor_idx_pre = 0; + if (col < max_nbor_size && key_out[col] != key_out[max_nbor_size - 1]){ + if (col >= 1) + decoding_nbor_info(nei_type_pre, nbor_idx_pre, key_out[col - 1]); + decoding_nbor_info(nei_type_cur, nbor_idx_cur, key_out[col]); + } + if (nei_type_cur != nei_type_pre){ + nei_iter_dev[row * sec_size + nei_type_cur] = col; + } +} + template __global__ void format_nlist_fill_b( int * nlist, @@ -155,23 +178,19 @@ __global__ void format_nlist_fill_b( int * nei_iter_dev, const int max_nbor_size) { - const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; - if(idx >= nloc) { - return; - } - - int * row_nlist = nlist + idx * nlist_size; - int * nei_iter = nei_iter_dev + idx * sec_size; - FPTYPE * key_out = key + nloc * max_nbor_size + idx * max_nbor_size; - for (int ii = 0; ii < sec_size; ii++) { - nei_iter[ii] = sec[ii]; - } - - int nei_type = 0, nbor_idx = 0; - for (unsigned int kk = 0; key_out[kk] != key_out[max_nbor_size - 1]; kk++) { - decoding_nbor_info(nei_type, 
nbor_idx, key_out[kk]); - if (nei_iter[nei_type] < sec[nei_type + 1]) { - row_nlist[nei_iter[nei_type]++] = nbor_idx; + int row = blockIdx.x; + int col = blockIdx.y * blockDim.x + threadIdx.x; + int * nei_iter = nei_iter_dev + row * sec_size; + FPTYPE * key_out = key + nloc * max_nbor_size + row * max_nbor_size; + int * row_nlist = nlist + row * nlist_size; + if (col < max_nbor_size){ + if (key_out[col] != key_out[max_nbor_size - 1]){ + int nei_type = 0, nbor_idx = 0; + decoding_nbor_info(nei_type, nbor_idx, key_out[col]); + int out_indx = col - nei_iter[nei_type] + sec[nei_type]; + if (out_indx < sec[nei_type + 1]){ + row_nlist[out_indx] = nbor_idx; + } } } } @@ -213,12 +232,16 @@ void format_nbor_list_1024 ( format_nlist_fill_a<<>> ( key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int ITEMS_PER_THREAD = 8; const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; // BlockSortKernel<<>> ( BlockSortKernel <<>> ( key, key + nloc * MAX_NBOR_SIZE); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -239,12 +262,16 @@ void format_nbor_list_2048 ( format_nlist_fill_a<<>> ( key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int ITEMS_PER_THREAD = 8; const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; // BlockSortKernel<<>> ( BlockSortKernel <<>> ( key, key + nloc * MAX_NBOR_SIZE); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -265,12 +292,16 @@ void format_nbor_list_4096 ( format_nlist_fill_a<<>> ( key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int ITEMS_PER_THREAD = 16; const int BLOCK_THREADS = MAX_NBOR_SIZE / 
ITEMS_PER_THREAD; // BlockSortKernel<<>> ( BlockSortKernel <<>> ( key, key + nloc * MAX_NBOR_SIZE); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template< @@ -449,13 +480,15 @@ void format_nbor_list_gpu_cuda( int * i_idx = array_int + sec.size() + nloc * sec.size(); uint_64 * key = array_longlong; assert(max_nbor_size == 1024 || max_nbor_size == 2048 || max_nbor_size == 4096); - cudaErrcheck(cudaMemset(nlist, -1, sizeof(int) * nloc * nnei)); - cudaErrcheck(cudaMemset(key, 0xffffffff, sizeof(uint_64) * nloc * max_nbor_size)); - cudaErrcheck(cudaMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), cudaMemcpyHostToDevice)); + DPErrcheck(cudaMemset(nlist, -1, sizeof(int) * nloc * nnei)); + DPErrcheck(cudaMemset(key, 0xffffffff, sizeof(uint_64) * nloc * max_nbor_size)); + DPErrcheck(cudaMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), cudaMemcpyHostToDevice)); get_i_idx<<>>( i_idx, nloc, gpu_inlist.ilist); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); if (max_nbor_size == 1024) { format_nbor_list_1024 ( @@ -473,9 +506,15 @@ void format_nbor_list_gpu_cuda( coord, type, gpu_inlist, nloc, rcut, i_idx); } - format_nlist_fill_b<<>> ( + fill_nei_iter <<>> ( + nei_iter, + key, nloc, max_nbor_size, sec.size()); + + format_nlist_fill_b <<>> ( nlist, nnei, nloc, key, sec_dev, sec.size(), nei_iter, max_nbor_size); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -500,9 +539,9 @@ void prod_env_mat_a_gpu_cuda( { const int nnei = sec.back(); const int ndescrpt = nnei * 4; - cudaErrcheck(cudaMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); - cudaErrcheck(cudaMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3)); - cudaErrcheck(cudaMemset(rij, 0., sizeof(FPTYPE) * nloc * nnei * 3)); + DPErrcheck(cudaMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); + DPErrcheck(cudaMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3)); + DPErrcheck(cudaMemset(rij, 0., sizeof(FPTYPE) * nloc 
* nnei * 3)); format_nbor_list_gpu_cuda( nlist, @@ -513,6 +552,8 @@ void prod_env_mat_a_gpu_cuda( compute_env_mat_a <<>> ( em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -537,9 +578,9 @@ void prod_env_mat_r_gpu_cuda( { const int nnei = sec.back(); const int ndescrpt = nnei * 1; - cudaErrcheck(cudaMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); - cudaErrcheck(cudaMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3)); - cudaErrcheck(cudaMemset(rij, 0., sizeof(FPTYPE) * nloc * nnei * 3)); + DPErrcheck(cudaMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); + DPErrcheck(cudaMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3)); + DPErrcheck(cudaMemset(rij, 0., sizeof(FPTYPE) * nloc * nnei * 3)); format_nbor_list_gpu_cuda( nlist, @@ -550,6 +591,8 @@ void prod_env_mat_r_gpu_cuda( compute_env_mat_r <<>> ( em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -566,6 +609,8 @@ void test_encoding_decoding_nbor_info_gpu_cuda( encoding_decoding_nbor_info<<>> ( key, out_type, out_index, in_type, in_dist, in_index, size_of_array); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template void prod_env_mat_a_gpu_cuda(float * em, float * em_deriv, float * rij, int * nlist, const float * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const float * avg, const float * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); diff --git a/source/lib/src/cuda/prod_force.cu b/source/lib/src/cuda/prod_force.cu index 62c7ce8926..124f6c806d 100644 --- a/source/lib/src/cuda/prod_force.cu +++ b/source/lib/src/cuda/prod_force.cu @@ -108,13 +108,15 @@ void prod_force_a_gpu_cuda( const int nnei) { const int 
ndescrpt = nnei * 4; - cudaErrcheck(cudaMemset( + DPErrcheck(cudaMemset( force, 0.0, sizeof(FPTYPE) * nall * 3)); force_deriv_wrt_center_atom <<>>( force, net_deriv, in_deriv, ndescrpt); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int LEN = 64; const int nblock = (nnei + LEN - 1) / LEN; @@ -123,6 +125,8 @@ void prod_force_a_gpu_cuda( force_deriv_wrt_neighbors_a<<>>( force, net_deriv, in_deriv, nlist, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -136,13 +140,15 @@ void prod_force_r_gpu_cuda( const int nnei) { const int ndescrpt = nnei * 1; - cudaErrcheck(cudaMemset( + DPErrcheck(cudaMemset( force, 0.0, sizeof(FPTYPE) * nall * 3)); force_deriv_wrt_center_atom <<>>( force, net_deriv, in_deriv, ndescrpt); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int LEN = 64; const int nblock = (nnei + LEN - 1) / LEN; @@ -151,6 +157,8 @@ void prod_force_r_gpu_cuda( force_deriv_wrt_neighbors_r<<>>( force, net_deriv, in_deriv, nlist, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template void prod_force_a_gpu_cuda(float * force, const float * net_deriv, const float * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei); diff --git a/source/lib/src/cuda/prod_force_grad.cu b/source/lib/src/cuda/prod_force_grad.cu index 7fd9359cfe..11af70c245 100644 --- a/source/lib/src/cuda/prod_force_grad.cu +++ b/source/lib/src/cuda/prod_force_grad.cu @@ -88,7 +88,7 @@ void prod_force_grad_a_gpu_cuda( const int nnei) { const int ndescrpt = nnei * 4; - cudaErrcheck(cudaMemset( + DPErrcheck(cudaMemset( grad_net, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); const int nblock = (ndescrpt + TPB - 1) / TPB; @@ -97,6 +97,8 @@ void prod_force_grad_a_gpu_cuda( force_grad_wrt_center_atom<<>>( grad_net, grad, env_deriv, ndescrpt); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int LEN = 128; const 
int nblock_ = (nloc + LEN -1) / LEN; @@ -105,6 +107,8 @@ void prod_force_grad_a_gpu_cuda( force_grad_wrt_neighbors_a<<>>( grad_net, grad, env_deriv, nlist, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -117,7 +121,7 @@ void prod_force_grad_r_gpu_cuda( const int nnei) { const int ndescrpt = nnei * 1; - cudaErrcheck(cudaMemset( + DPErrcheck(cudaMemset( grad_net, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); const int nblock = (ndescrpt + TPB - 1) / TPB; @@ -126,6 +130,8 @@ void prod_force_grad_r_gpu_cuda( force_grad_wrt_center_atom<<>>( grad_net, grad, env_deriv, ndescrpt); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int LEN = 128; const int nblock_ = (nloc + LEN -1) / LEN; @@ -134,6 +140,8 @@ void prod_force_grad_r_gpu_cuda( force_grad_wrt_neighbors_r<<>>( grad_net, grad, env_deriv, nlist, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template void prod_force_grad_a_gpu_cuda(float * grad_net, const float * grad, const float * env_deriv, const int * nlist, const int nloc, const int nnei); diff --git a/source/lib/src/cuda/prod_virial.cu b/source/lib/src/cuda/prod_virial.cu index 08a64d71fe..06d1cefa42 100644 --- a/source/lib/src/cuda/prod_virial.cu +++ b/source/lib/src/cuda/prod_virial.cu @@ -115,10 +115,10 @@ void prod_virial_a_gpu_cuda( const int nall, const int nnei) { - cudaErrcheck(cudaMemset( + DPErrcheck(cudaMemset( virial, 0.0, sizeof(FPTYPE) * 9)); - cudaErrcheck(cudaMemset( + DPErrcheck(cudaMemset( atom_virial, 0.0, sizeof(FPTYPE) * 9 * nall)); @@ -130,10 +130,14 @@ void prod_virial_a_gpu_cuda( virial_deriv_wrt_neighbors_a<<>>( virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); // reduction atom_virial to virial atom_virial_reduction <<<9, TPB>>>( virial, atom_virial, nall); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); 
} template @@ -148,10 +152,10 @@ void prod_virial_r_gpu_cuda( const int nall, const int nnei) { - cudaErrcheck(cudaMemset( + DPErrcheck(cudaMemset( virial, 0.0, sizeof(FPTYPE) * 9)); - cudaErrcheck(cudaMemset( + DPErrcheck(cudaMemset( atom_virial, 0.0, sizeof(FPTYPE) * 9 * nall)); @@ -163,10 +167,14 @@ void prod_virial_r_gpu_cuda( virial_deriv_wrt_neighbors_r<<>>( virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); // reduction atom_virial to virial atom_virial_reduction <<<9, TPB>>>( virial, atom_virial, nall); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template void prod_virial_a_gpu_cuda(float * virial, float * atom_virial, const float * net_deriv, const float * in_deriv, const float * rij, const int * nlist, const int nloc, const int nall, const int nnei); diff --git a/source/lib/src/cuda/prod_virial_grad.cu b/source/lib/src/cuda/prod_virial_grad.cu index 2cdd25ec38..0b9affa948 100644 --- a/source/lib/src/cuda/prod_virial_grad.cu +++ b/source/lib/src/cuda/prod_virial_grad.cu @@ -99,7 +99,7 @@ void prod_virial_grad_a_gpu_cuda( const int nnei) { const int ndescrpt = nnei * 4; - cudaErrcheck(cudaMemset( + DPErrcheck(cudaMemset( grad_net, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); const int LEN = 128; @@ -109,6 +109,8 @@ void prod_virial_grad_a_gpu_cuda( virial_grad_wrt_neighbors_a<<>>( grad_net, grad, env_deriv, rij, nlist, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -122,7 +124,7 @@ void prod_virial_grad_r_gpu_cuda( const int nnei) { const int ndescrpt = nnei; - cudaErrcheck(cudaMemset( + DPErrcheck(cudaMemset( grad_net, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); const int LEN = 128; @@ -132,6 +134,8 @@ void prod_virial_grad_r_gpu_cuda( virial_grad_wrt_neighbors_r<<>>( grad_net, grad, env_deriv, rij, nlist, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } 
template void prod_virial_grad_a_gpu_cuda(float * grad_net, const float * grad, const float * env_deriv, const float * rij, const int * nlist, const int nloc, const int nnei); diff --git a/source/lib/src/cuda/region.cu b/source/lib/src/cuda/region.cu index 4a95e5f9da..99e203cfcc 100644 --- a/source/lib/src/cuda/region.cu +++ b/source/lib/src/cuda/region.cu @@ -39,6 +39,8 @@ convert_to_inter_gpu( const FPTYPE * rp) { _phys2Inter<<<1, 1>>>(ri, rp, region.rec_boxt); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -49,6 +51,8 @@ convert_to_phys_gpu( const FPTYPE * ri) { _inter2Phys<<<1, 1>>>(rp, ri, region.boxt); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -58,6 +62,8 @@ volume_gpu( const Region & region) { _compute_volume<<<1, 1>>>(volume, region.boxt); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template void convert_to_inter_gpu(float * ri, const Region & region, const float * rp); diff --git a/source/lib/src/cuda/tabulate.cu b/source/lib/src/cuda/tabulate.cu index b71a989819..2cab6d75a7 100644 --- a/source/lib/src/cuda/tabulate.cu +++ b/source/lib/src/cuda/tabulate.cu @@ -203,6 +203,8 @@ void tabulate_fusion_gpu_cuda( tabulate_fusion_fifth_order_polynomial <<>>( out, table, em_x, em, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template @@ -219,16 +221,18 @@ void tabulate_fusion_grad_gpu_cuda( const int last_layer_size) { if (nloc <= 0) {return;} - cudaErrcheck(cudaMemset( + DPErrcheck(cudaMemset( dy_dem_x, 0.0, sizeof(FPTYPE) * nloc * nnei)); - cudaErrcheck(cudaMemset( + DPErrcheck(cudaMemset( dy_dem, 0.0, sizeof(FPTYPE) * nloc * nnei * 4)); tabulate_fusion_grad_fifth_order_polynomial <<>>( dy_dem_x, dy_dem, table, em_x, em, dy, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); 
+ DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); } template void tabulate_fusion_gpu_cuda(float * out, const float * table, const float * table_info, const float * em_x, const float * em, const int nloc, const int nnei, const int last_layer_size); diff --git a/source/lib/src/fmt_nlist.cc b/source/lib/src/fmt_nlist.cc index add83dadcf..35155d77d1 100644 --- a/source/lib/src/fmt_nlist.cc +++ b/source/lib/src/fmt_nlist.cc @@ -4,6 +4,7 @@ #include "fmt_nlist.h" #include "SimulationRegion.h" #include +#include "errors.h" using namespace deepmd; @@ -185,7 +186,7 @@ format_nlist_cpu ( << fmt_ilist.size() << " which does not match " << nnei << std::endl; - exit(1); + throw deepmd::deepmd_exception(); } std::copy(fmt_ilist.begin(), fmt_ilist.end(), cur_nlist); } diff --git a/source/lib/src/neighbor_list.cc b/source/lib/src/neighbor_list.cc index a41784bc29..c3cd376fbe 100644 --- a/source/lib/src/neighbor_list.cc +++ b/source/lib/src/neighbor_list.cc @@ -843,8 +843,8 @@ build_nlist_cpu( const int & mem_size, const float & rcut); -#if GOOGLE_CUDA -void deepmd::convert_nlist_gpu_cuda( +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +void deepmd::convert_nlist_gpu_device( InputNlist & gpu_nlist, InputNlist & cpu_nlist, int* & gpu_memory, @@ -867,44 +867,11 @@ void deepmd::convert_nlist_gpu_cuda( free(_firstneigh); } -void deepmd::free_nlist_gpu_cuda( +void deepmd::free_nlist_gpu_device( InputNlist & gpu_nlist) { delete_device_memory(gpu_nlist.ilist); delete_device_memory(gpu_nlist.numneigh); delete_device_memory(gpu_nlist.firstneigh); } -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM -void deepmd::convert_nlist_gpu_rocm( - InputNlist & gpu_nlist, - InputNlist & cpu_nlist, - int* & gpu_memory, - const int & max_nbor_size) -{ - const int inum = cpu_nlist.inum; - gpu_nlist.inum = inum; - malloc_device_memory(gpu_nlist.ilist, inum); - malloc_device_memory(gpu_nlist.numneigh, inum); - malloc_device_memory(gpu_nlist.firstneigh, inum); - 
memcpy_host_to_device(gpu_nlist.ilist, cpu_nlist.ilist, inum); - memcpy_host_to_device(gpu_nlist.numneigh, cpu_nlist.numneigh, inum); - int ** _firstneigh = NULL; - _firstneigh = (int**)malloc(sizeof(int*) * inum); - for (int ii = 0; ii < inum; ii++) { - memcpy_host_to_device(gpu_memory + ii * max_nbor_size, cpu_nlist.firstneigh[ii], cpu_nlist.numneigh[ii]); - _firstneigh[ii] = gpu_memory + ii * max_nbor_size; - } - memcpy_host_to_device(gpu_nlist.firstneigh, _firstneigh, inum); - free(_firstneigh); -} - -void deepmd::free_nlist_gpu_rocm( - InputNlist & gpu_nlist) -{ - delete_device_memory(gpu_nlist.ilist); - delete_device_memory(gpu_nlist.numneigh); - delete_device_memory(gpu_nlist.firstneigh); -} -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/source/lib/src/pair_tab.cc b/source/lib/src/pair_tab.cc index 5137e17ac9..2c48ce957a 100644 --- a/source/lib/src/pair_tab.cc +++ b/source/lib/src/pair_tab.cc @@ -3,6 +3,7 @@ #include #include #include "pair_tab.h" +#include "errors.h" inline void _pair_tabulated_inter ( @@ -25,7 +26,7 @@ void _pair_tabulated_inter ( // std::cout << rr << " " << rmin << " " << hh << " " << uu << std::endl; if (uu < 0) { std::cerr << "coord go beyond table lower boundary" << std::endl; - exit(1); + throw deepmd::deepmd_exception(); } int idx = uu; if (idx >= nspline) { diff --git a/source/lib/src/prod_env_mat.cc b/source/lib/src/prod_env_mat.cc index 4ef5af49e3..1ac944786c 100644 --- a/source/lib/src/prod_env_mat.cc +++ b/source/lib/src/prod_env_mat.cc @@ -256,7 +256,7 @@ prod_env_mat_r_cpu( const float rcut_smth, const std::vector sec); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM void deepmd::env_mat_nbor_update( InputNlist &inlist, InputNlist &gpu_inlist, @@ -266,7 +266,7 @@ void deepmd::env_mat_nbor_update( const int size) { int *mesh_host = new int[size]; - cudaErrcheck(cudaMemcpy(mesh_host, mesh, sizeof(int) * size, cudaMemcpyDeviceToHost)); + memcpy_device_to_host(mesh, mesh_host, 
size); memcpy(&inlist.ilist, 4 + mesh_host, sizeof(int *)); memcpy(&inlist.numneigh, 8 + mesh_host, sizeof(int *)); memcpy(&inlist.firstneigh, 12 + mesh_host, sizeof(int **)); @@ -319,69 +319,4 @@ void deepmd::env_mat_nbor_update( } delete [] mesh_host; } -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM -void deepmd::env_mat_nbor_update( - InputNlist &inlist, - InputNlist &gpu_inlist, - int &max_nbor_size, - int* &nbor_list_dev, - const int * mesh, - const int size) -{ - int *mesh_host = new int[size]; - hipErrcheck(hipMemcpy(mesh_host, mesh, sizeof(int) * size, hipMemcpyDeviceToHost)); - memcpy(&inlist.ilist, 4 + mesh_host, sizeof(int *)); - memcpy(&inlist.numneigh, 8 + mesh_host, sizeof(int *)); - memcpy(&inlist.firstneigh, 12 + mesh_host, sizeof(int **)); - const int ago = mesh_host[0]; - if (ago == 0) { - const int inum = inlist.inum; - if (gpu_inlist.inum < inum) { - delete_device_memory(gpu_inlist.ilist); - delete_device_memory(gpu_inlist.numneigh); - delete_device_memory(gpu_inlist.firstneigh); - malloc_device_memory(gpu_inlist.ilist, inum); - malloc_device_memory(gpu_inlist.numneigh, inum); - malloc_device_memory(gpu_inlist.firstneigh, inum); - } - memcpy_host_to_device(gpu_inlist.ilist, inlist.ilist, inum); - memcpy_host_to_device(gpu_inlist.numneigh, inlist.numneigh, inum); - int _max_nbor_size = max_numneigh(inlist); - if (_max_nbor_size <= 1024) { - _max_nbor_size = 1024; - } - else if (_max_nbor_size <= 2048) { - _max_nbor_size = 2048; - } - else { - _max_nbor_size = 4096; - } - if ( nbor_list_dev == NULL - || _max_nbor_size > max_nbor_size - || inum > gpu_inlist.inum) - { - delete_device_memory(nbor_list_dev); - malloc_device_memory(nbor_list_dev, inum * _max_nbor_size); - } - // update info - gpu_inlist.inum = inum; - max_nbor_size = _max_nbor_size; - - // copy nbor list from host to the device - std::vector nbor_list_host(inum * max_nbor_size, 0); - int ** _firstneigh = (int**)malloc(sizeof(int*) * inum); - for (int ii = 0; ii < inum; ii++) { - 
_firstneigh[ii] = nbor_list_dev + ii * max_nbor_size; - for (int jj = 0; jj < inlist.numneigh[ii]; jj++) { - nbor_list_host[ii * max_nbor_size + jj] = inlist.firstneigh[ii][jj]; - } - } - memcpy_host_to_device(nbor_list_dev, &nbor_list_host[0], inum * max_nbor_size); - memcpy_host_to_device(gpu_inlist.firstneigh, _firstneigh, inum); - free(_firstneigh); - } - delete [] mesh_host; -} -#endif // TENSORFLOW_USE_ROCM \ No newline at end of file +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM \ No newline at end of file diff --git a/source/lib/src/prod_force.cc b/source/lib/src/prod_force.cc index ffe177e16c..e9784d3409 100644 --- a/source/lib/src/prod_force.cc +++ b/source/lib/src/prod_force.cc @@ -1,6 +1,7 @@ #include #include #include "prod_force.h" +#include "errors.h" inline void make_index_range ( @@ -14,7 +15,7 @@ make_index_range ( idx_end = nei_idx * 4 + 4; } else { - throw std::runtime_error("should no reach here"); + throw deepmd::deepmd_exception("should no reach here"); } } diff --git a/source/lib/src/prod_force_grad.cc b/source/lib/src/prod_force_grad.cc index 7872ea5c55..110bf790f4 100644 --- a/source/lib/src/prod_force_grad.cc +++ b/source/lib/src/prod_force_grad.cc @@ -2,6 +2,7 @@ #include #include #include "prod_force_grad.h" +#include "errors.h" inline void make_index_range ( @@ -15,7 +16,7 @@ make_index_range ( idx_end = nei_idx * 4 + 4; } else { - throw std::runtime_error("should no reach here"); + throw deepmd::deepmd_exception("should no reach here"); } } diff --git a/source/lib/src/prod_virial.cc b/source/lib/src/prod_virial.cc index 086bc94245..f1c598c807 100644 --- a/source/lib/src/prod_virial.cc +++ b/source/lib/src/prod_virial.cc @@ -2,6 +2,7 @@ #include #include #include "prod_virial.h" +#include "errors.h" inline void make_index_range ( @@ -15,7 +16,7 @@ make_index_range ( idx_end = nei_idx * 4 + 4; } else { - throw std::runtime_error("should no reach here"); + throw deepmd::deepmd_exception("should no reach here"); } } diff --git 
a/source/lib/src/prod_virial_grad.cc b/source/lib/src/prod_virial_grad.cc index 59c3192fc0..8e225c0793 100644 --- a/source/lib/src/prod_virial_grad.cc +++ b/source/lib/src/prod_virial_grad.cc @@ -1,6 +1,7 @@ #include #include #include "prod_virial_grad.h" +#include "errors.h" inline void make_index_range ( @@ -14,7 +15,7 @@ make_index_range ( idx_end = nei_idx * 4 + 4; } else { - throw std::runtime_error("should no reach here"); + throw deepmd::deepmd_exception("should no reach here"); } } diff --git a/source/lib/src/region.cc b/source/lib/src/region.cc index 62dcdb9b68..90704016c2 100644 --- a/source/lib/src/region.cc +++ b/source/lib/src/region.cc @@ -1,6 +1,7 @@ #include #include #include "region.h" +#include "errors.h" #define BOXT_DIM 9 using namespace deepmd; @@ -33,7 +34,7 @@ compute_volume(const FPTYPE * boxt) boxt[0*3+1] * (boxt[1*3+0]*boxt[2*3+2] - boxt[2*3+0]*boxt[1*3+2]) + boxt[0*3+2] * (boxt[1*3+0]*boxt[2*3+1] - boxt[2*3+0]*boxt[1*3+1]); if (volume < 0) { - throw std::runtime_error("Negative volume detected. Please make sure the simulation cell obeys the right-hand rule."); + throw deepmd::deepmd_exception("Negative volume detected. 
Please make sure the simulation cell obeys the right-hand rule."); } return volume; } diff --git a/source/lib/src/rocm/coord.hip.cu b/source/lib/src/rocm/coord.hip.cu index 12c797ab3e..1d01e1da30 100644 --- a/source/lib/src/rocm/coord.hip.cu +++ b/source/lib/src/rocm/coord.hip.cu @@ -292,14 +292,20 @@ void compute_int_data( const int nblock_loc=(nloc+TPB-1)/TPB; hipLaunchKernelGGL(_fill_idx_cellmap, nblock_loc, TPB, 0, 0, idx_cellmap, idx_cellmap_noshift, in_c, rec_boxt, nat_stt, nat_end, ext_stt, ext_end, nloc); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); const int nblock_loc_cellnum=(loc_cellnum+TPB-1)/TPB; hipLaunchKernelGGL(_fill_loc_cellnum_map, nblock_loc_cellnum, TPB, 0, 0, temp_idx_order, loc_cellnum_map, idx_cellmap_noshift, nloc, loc_cellnum); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); const int nblock_total_cellnum=(total_cellnum+TPB-1)/TPB; hipLaunchKernelGGL(_fill_total_cellnum_map, nblock_total_cellnum, TPB, 0, 0, total_cellnum_map, mask_cellnum_map, cell_map, cell_shift_map, nat_stt, nat_end, ext_stt, ext_end, loc_cellnum_map, total_cellnum); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } void build_loc_clist( @@ -314,6 +320,8 @@ void build_loc_clist( const int * sec_loc_cellnum_map=temp_idx_order+nloc+loc_cellnum+2*total_cellnum+total_cellnum+3*total_cellnum; int * loc_clist=int_data+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3+loc_cellnum+1+total_cellnum+1; hipLaunchKernelGGL(_build_loc_clist, nblock, TPB, 0, 0, loc_clist, idx_cellmap_noshift, temp_idx_order, sec_loc_cellnum_map, nloc); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -341,6 +349,8 @@ void copy_coord( const FPTYPE *rec_boxt = region.rec_boxt; hipLaunchKernelGGL(_copy_coord, nblock, TPB, 0, 0, out_c, out_t, mapping, in_c, in_t, cell_map, cell_shift_map, sec_loc_cellnum_map, sec_total_cellnum_map, loc_clist, nloc, nall, total_cellnum, boxt, rec_boxt); + 
DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } namespace deepmd { @@ -355,6 +365,8 @@ normalize_coord_gpu_rocm( const FPTYPE * rec_boxt=region.rec_boxt; const int nblock=(natom+TPB-1)/TPB; hipLaunchKernelGGL(normalize_one, nblock, TPB, 0, 0, coord, boxt, rec_boxt, natom); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -376,7 +388,9 @@ copy_coord_gpu_rocm( { compute_int_data(int_data, in_c, cell_info, region, nloc, loc_cellnum, total_cellnum); int * int_data_cpu=new int [loc_cellnum+2*total_cellnum+loc_cellnum+1+total_cellnum+1];//loc_cellnum_map,total_cellnum_map,mask_cellnum_map,sec_loc_cellnum_map,sec_total_cellnum_map - hipErrcheck(hipMemcpy(int_data_cpu, int_data+3*nloc, sizeof(int) * (loc_cellnum + 2 * total_cellnum), hipMemcpyDeviceToHost)); + DPErrcheck(hipMemcpy(int_data_cpu, int_data+3*nloc, sizeof(int) * (loc_cellnum + 2 * total_cellnum), hipMemcpyDeviceToHost)); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); int * loc_cellnum_map=int_data_cpu; int * total_cellnum_map=loc_cellnum_map+loc_cellnum; int * mask_cellnum_map=total_cellnum_map+total_cellnum; @@ -398,7 +412,7 @@ copy_coord_gpu_rocm( return 1; } else{ - hipErrcheck(hipMemcpy(int_data+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3, + DPErrcheck(hipMemcpy(int_data+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3, sec_loc_cellnum_map, sizeof(int) * (loc_cellnum+1+total_cellnum+1), hipMemcpyHostToDevice)); delete[] int_data_cpu; build_loc_clist(int_data, nloc, loc_cellnum, total_cellnum); diff --git a/source/lib/src/rocm/gelu.hip.cu b/source/lib/src/rocm/gelu.hip.cu index 83e7a3be6d..9cabb53c08 100644 --- a/source/lib/src/rocm/gelu.hip.cu +++ b/source/lib/src/rocm/gelu.hip.cu @@ -64,6 +64,8 @@ namespace deepmd { const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; hipLaunchKernelGGL(gelu, BLOCK_NUMS, THREAD_ITEMS, 0, 0, out, xx, size); + DPErrcheck(hipGetLastError()); + 
DPErrcheck(hipDeviceSynchronize()); } template @@ -81,6 +83,8 @@ namespace deepmd { const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; hipLaunchKernelGGL(gelu_grad, BLOCK_NUMS, THREAD_ITEMS, 0, 0, out, xx, dy, size); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -99,6 +103,8 @@ namespace deepmd { const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; hipLaunchKernelGGL(gelu_grad_grad, BLOCK_NUMS, THREAD_ITEMS, 0, 0, out, xx, dy, dy_2, size); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template void gelu_gpu_rocm(float * out, const float * x, const int size); diff --git a/source/lib/src/rocm/neighbor_list.hip.cu b/source/lib/src/rocm/neighbor_list.hip.cu index a0da866d12..243ea0507a 100644 --- a/source/lib/src/rocm/neighbor_list.hip.cu +++ b/source/lib/src/rocm/neighbor_list.hip.cu @@ -124,7 +124,7 @@ int build_nlist_gpu_rocm( int * ilist = nlist.ilist; int * numneigh = nlist.numneigh; int ** firstneigh = nlist.firstneigh; - hipErrcheck(hipMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size)); + DPErrcheck(hipMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size)); int * temp_nlist = nlist_data; //nloc*mem_size int * nei_order = temp_nlist + nloc * mem_size; nlist.inum = nloc; @@ -141,6 +141,8 @@ int build_nlist_gpu_rocm( nloc, nall, mem_size); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); const int nblock_ = (nloc+TPB-1)/TPB; hipLaunchKernelGGL(scan_nlist, nblock_, TPB, 0, 0, numneigh, @@ -149,15 +151,18 @@ int build_nlist_gpu_rocm( mem_size, nloc, nall); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); hipLaunchKernelGGL(fill_nlist, block_grid, thread_grid, 0, 0, firstneigh, temp_nlist, nei_order, mem_size, - nall - ); + nall); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); int * numneigh_host = new int[nloc]; - hipErrcheck(hipMemcpy(numneigh_host, numneigh, sizeof(int) * nloc, 
hipMemcpyDeviceToHost)); + DPErrcheck(hipMemcpy(numneigh_host, numneigh, sizeof(int) * nloc, hipMemcpyDeviceToHost)); int max_nei = 0; for(int ii=0;iimax_nei)max_nei=numneigh_host[ii]; @@ -177,6 +182,8 @@ void use_nlist_map( dim3 block_grid(nloc, nblock); dim3 thread_grid(1, TPB); hipLaunchKernelGGL(map_nlist, block_grid, thread_grid, 0, 0, nlist, nlist_map, nloc, nnei); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template int build_nlist_gpu_rocm(InputNlist & nlist, int * max_list_size, int * nlist_data, const float * c_cpy, const int & nloc, const int & nall, const int & mem_size, const float & rcut); diff --git a/source/lib/src/rocm/prod_env_mat.hip.cu b/source/lib/src/rocm/prod_env_mat.hip.cu index b1251c6a42..f3cd3bf31a 100644 --- a/source/lib/src/rocm/prod_env_mat.hip.cu +++ b/source/lib/src/rocm/prod_env_mat.hip.cu @@ -234,12 +234,16 @@ void format_nbor_list_1024 ( hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); const int ITEMS_PER_THREAD = 8; const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), g_grid_size, BLOCK_THREADS, 0, 0, hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), nloc, BLOCK_THREADS, 0, 0, key, key + nloc * MAX_NBOR_SIZE); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -260,12 +264,16 @@ void format_nbor_list_2048 ( hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); const int ITEMS_PER_THREAD = 8; const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), g_grid_size, BLOCK_THREADS, 0, 0, 
hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), nloc, BLOCK_THREADS, 0, 0, key, key + nloc * MAX_NBOR_SIZE); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -286,12 +294,16 @@ void format_nbor_list_4096 ( hipLaunchKernelGGL(format_nlist_fill_a, block_grid, thread_grid, 0, 0, key, coord, type, gpu_inlist.numneigh, gpu_inlist.firstneigh, rcut, i_idx, MAX_NBOR_SIZE); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); const int ITEMS_PER_THREAD = 16; const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD; // hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), g_grid_size, BLOCK_THREADS, 0, 0, hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockSortKernel), nloc, BLOCK_THREADS, 0, 0, key, key + nloc * MAX_NBOR_SIZE); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template< @@ -470,13 +482,15 @@ void format_nbor_list_gpu_rocm( int * i_idx = array_int + sec.size() + nloc * sec.size(); uint_64 * key = array_longlong; assert(max_nbor_size == 1024 || max_nbor_size == 2048 || max_nbor_size == 4096); - hipErrcheck(hipMemset(nlist, -1, sizeof(int) * nloc * nnei)); - hipErrcheck(hipMemset(key, 0xffffffff, sizeof(uint_64) * nloc * max_nbor_size)); - hipErrcheck(hipMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), hipMemcpyHostToDevice)); + DPErrcheck(hipMemset(nlist, -1, sizeof(int) * nloc * nnei)); + DPErrcheck(hipMemset(key, 0xffffffff, sizeof(uint_64) * nloc * max_nbor_size)); + DPErrcheck(hipMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), hipMemcpyHostToDevice)); hipLaunchKernelGGL(get_i_idx, nblock, LEN, 0, 0, i_idx, nloc, gpu_inlist.ilist); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); if (max_nbor_size == 1024) { format_nbor_list_1024 ( @@ -501,6 +515,8 @@ void format_nbor_list_gpu_rocm( hipLaunchKernelGGL(format_nlist_fill_b, dim3(nloc, (max_nbor_size + LEN - 1) / LEN), LEN, 0, 0, nlist, nnei, nloc, key, sec_dev, sec.size(), nei_iter, max_nbor_size); + 
DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -525,9 +541,9 @@ void prod_env_mat_a_gpu_rocm( { const int nnei = sec.back(); const int ndescrpt = nnei * 4; - hipErrcheck(hipMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); - hipErrcheck(hipMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3)); - hipErrcheck(hipMemset(rij, 0.0, sizeof(FPTYPE) * nloc * nnei * 3)); + DPErrcheck(hipMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); + DPErrcheck(hipMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3)); + DPErrcheck(hipMemset(rij, 0.0, sizeof(FPTYPE) * nloc * nnei * 3)); format_nbor_list_gpu_rocm( nlist, @@ -538,6 +554,8 @@ void prod_env_mat_a_gpu_rocm( hipLaunchKernelGGL(HIP_KERNEL_NAME(compute_env_mat_a), nloc, TPB, 0, 0, em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -562,9 +580,9 @@ void prod_env_mat_r_gpu_rocm( { const int nnei = sec.back(); const int ndescrpt = nnei * 1; - hipErrcheck(hipMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); - hipErrcheck(hipMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3)); - hipErrcheck(hipMemset(rij, 0.0, sizeof(FPTYPE) * nloc * nnei * 3)); + DPErrcheck(hipMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); + DPErrcheck(hipMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3)); + DPErrcheck(hipMemset(rij, 0.0, sizeof(FPTYPE) * nloc * nnei * 3)); format_nbor_list_gpu_rocm( nlist, @@ -575,6 +593,8 @@ void prod_env_mat_r_gpu_rocm( hipLaunchKernelGGL(HIP_KERNEL_NAME(compute_env_mat_r), nloc, TPB, 0, 0, em, em_deriv, rij, coord, avg, std, type, nlist, nnei, rcut_smth, rcut); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -591,6 +611,8 @@ void test_encoding_decoding_nbor_info_gpu_rocm( hipLaunchKernelGGL(encoding_decoding_nbor_info, nblock, TPB, 0, 0, key, out_type, out_index, in_type, in_dist, in_index, size_of_array); + 
DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template void prod_env_mat_a_gpu_rocm(float * em, float * em_deriv, float * rij, int * nlist, const float * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const float * avg, const float * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); diff --git a/source/lib/src/rocm/prod_force.hip.cu b/source/lib/src/rocm/prod_force.hip.cu index 48b12dfa50..815fed44c5 100644 --- a/source/lib/src/rocm/prod_force.hip.cu +++ b/source/lib/src/rocm/prod_force.hip.cu @@ -109,13 +109,15 @@ namespace deepmd { const int nnei) { const int ndescrpt = nnei * 4; - hipErrcheck(hipMemset( + DPErrcheck(hipMemset( force, 0.0, sizeof(FPTYPE) * nall * 3)); hipLaunchKernelGGL(HIP_KERNEL_NAME(force_deriv_wrt_center_atom), nloc, TPB, 0, 0, force, net_deriv, in_deriv, ndescrpt); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); const int LEN = 64; const int nblock = (nnei + LEN - 1) / LEN; @@ -124,6 +126,8 @@ namespace deepmd { hipLaunchKernelGGL(force_deriv_wrt_neighbors_a, block_grid, thread_grid, 0, 0, force, net_deriv, in_deriv, nlist, nloc, nnei); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -137,13 +141,15 @@ namespace deepmd { const int nnei) { const int ndescrpt = nnei * 1; - hipErrcheck(hipMemset( + DPErrcheck(hipMemset( force, 0.0, sizeof(FPTYPE) * nall * 3)); hipLaunchKernelGGL(HIP_KERNEL_NAME(force_deriv_wrt_center_atom), nloc, TPB, 0, 0, force, net_deriv, in_deriv, ndescrpt); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); const int LEN = 64; const int nblock = (nnei + LEN -1) / LEN; @@ -152,6 +158,8 @@ namespace deepmd { hipLaunchKernelGGL(force_deriv_wrt_neighbors_r, block_grid, thread_grid, 0, 0, force, net_deriv, in_deriv, nlist, nloc, nnei); + DPErrcheck(hipGetLastError()); + 
DPErrcheck(hipDeviceSynchronize()); } template void prod_force_a_gpu_rocm(float * force, const float * net_deriv, const float * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei); diff --git a/source/lib/src/rocm/prod_force_grad.hip.cu b/source/lib/src/rocm/prod_force_grad.hip.cu index 3a3fd8ea58..f7af9a1746 100644 --- a/source/lib/src/rocm/prod_force_grad.hip.cu +++ b/source/lib/src/rocm/prod_force_grad.hip.cu @@ -88,7 +88,7 @@ void prod_force_grad_a_gpu_rocm( const int nnei) { const int ndescrpt = nnei * 4; - hipErrcheck(hipMemset( + DPErrcheck(hipMemset( grad_net, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); const int nblock = (ndescrpt + TPB - 1) / TPB; @@ -97,7 +97,8 @@ void prod_force_grad_a_gpu_rocm( hipLaunchKernelGGL(force_grad_wrt_center_atom, block_grid, thread_grid, 0, 0, grad_net, grad, env_deriv, ndescrpt); - + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); const int LEN = 128; const int nblock_ = (nloc + LEN -1) / LEN; dim3 block_grid_(nblock_, nnei); @@ -105,6 +106,8 @@ void prod_force_grad_a_gpu_rocm( hipLaunchKernelGGL(force_grad_wrt_neighbors_a, block_grid_, thread_grid_, 0, 0, grad_net, grad, env_deriv, nlist, nloc, nnei); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -117,7 +120,7 @@ void prod_force_grad_r_gpu_rocm( const int nnei) { const int ndescrpt = nnei * 1; - hipErrcheck(hipMemset( + DPErrcheck(hipMemset( grad_net, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); const int nblock = (ndescrpt + TPB - 1) / TPB; @@ -126,6 +129,8 @@ void prod_force_grad_r_gpu_rocm( hipLaunchKernelGGL(force_grad_wrt_center_atom, block_grid, thread_grid, 0, 0, grad_net, grad, env_deriv, ndescrpt); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); const int LEN = 128; const int nblock_ = (nloc + LEN -1) / LEN; @@ -134,6 +139,8 @@ void prod_force_grad_r_gpu_rocm( hipLaunchKernelGGL(force_grad_wrt_neighbors_r, block_grid_, thread_grid_, 0, 0, grad_net, grad, env_deriv, 
nlist, nloc, nnei); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template void prod_force_grad_a_gpu_rocm(float * grad_net, const float * grad, const float * env_deriv, const int * nlist, const int nloc, const int nnei); diff --git a/source/lib/src/rocm/prod_virial.hip.cu b/source/lib/src/rocm/prod_virial.hip.cu index ff8017a687..9c4ec5010b 100644 --- a/source/lib/src/rocm/prod_virial.hip.cu +++ b/source/lib/src/rocm/prod_virial.hip.cu @@ -113,10 +113,10 @@ void prod_virial_a_gpu_rocm( const int nall, const int nnei) { - hipErrcheck(hipMemset( - virial, - 0.0, sizeof(FPTYPE) * 9)); - hipErrcheck(hipMemset( + DPErrcheck(hipMemset( + virial, + 0.0, sizeof(FPTYPE) * 9)); + DPErrcheck(hipMemset( atom_virial, 0.0, sizeof(FPTYPE) * 9 * nall)); @@ -128,10 +128,14 @@ void prod_virial_a_gpu_rocm( hipLaunchKernelGGL(virial_deriv_wrt_neighbors_a, block_grid, thread_grid, 0, 0, virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); // reduction atom_virial to virial hipLaunchKernelGGL(HIP_KERNEL_NAME(atom_virial_reduction), 9, TPB, 0, 0, virial, atom_virial, nall); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -146,10 +150,10 @@ void prod_virial_r_gpu_rocm( const int nall, const int nnei) { - hipErrcheck(hipMemset( - virial, - 0.0, sizeof(FPTYPE) * 9)); - hipErrcheck(hipMemset( + DPErrcheck(hipMemset( + virial, + 0.0, sizeof(FPTYPE) * 9)); + DPErrcheck(hipMemset( atom_virial, 0.0, sizeof(FPTYPE) * 9 * nall)); @@ -161,10 +165,14 @@ void prod_virial_r_gpu_rocm( hipLaunchKernelGGL(virial_deriv_wrt_neighbors_r, block_grid, thread_grid, 0, 0, virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); // reduction atom_virial to virial hipLaunchKernelGGL(HIP_KERNEL_NAME(atom_virial_reduction), 9, TPB, 0, 0, virial, atom_virial, nall); + 
DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template void prod_virial_a_gpu_rocm(float * virial, float * atom_virial, const float * net_deriv, const float * in_deriv, const float * rij, const int * nlist, const int nloc, const int nall, const int nnei); diff --git a/source/lib/src/rocm/prod_virial_grad.hip.cu b/source/lib/src/rocm/prod_virial_grad.hip.cu index c4d8a5c19a..4c729453f7 100644 --- a/source/lib/src/rocm/prod_virial_grad.hip.cu +++ b/source/lib/src/rocm/prod_virial_grad.hip.cu @@ -99,7 +99,7 @@ void prod_virial_grad_a_gpu_rocm( const int nnei) { const int ndescrpt = nnei * 4; - hipErrcheck(hipMemset( + DPErrcheck(hipMemset( grad_net, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); const int LEN = 128; @@ -109,6 +109,8 @@ void prod_virial_grad_a_gpu_rocm( hipLaunchKernelGGL(virial_grad_wrt_neighbors_a, block_grid, thread_grid, 0, 0, grad_net, grad, env_deriv, rij, nlist, nloc, nnei); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -122,7 +124,7 @@ void prod_virial_grad_r_gpu_rocm( const int nnei) { const int ndescrpt = nnei; - hipErrcheck(hipMemset( + DPErrcheck(hipMemset( grad_net, 0.0, sizeof(FPTYPE) * nloc * ndescrpt)); const int LEN = 128; @@ -132,6 +134,8 @@ void prod_virial_grad_r_gpu_rocm( hipLaunchKernelGGL(virial_grad_wrt_neighbors_r, block_grid, thread_grid, 0, 0, grad_net, grad, env_deriv, rij, nlist, nloc, nnei); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template void prod_virial_grad_a_gpu_rocm(float * grad_net, const float * grad, const float * env_deriv, const float * rij, const int * nlist, const int nloc, const int nnei); diff --git a/source/lib/src/rocm/region.hip.cu b/source/lib/src/rocm/region.hip.cu index 7f883b14c3..ab40f6bf20 100644 --- a/source/lib/src/rocm/region.hip.cu +++ b/source/lib/src/rocm/region.hip.cu @@ -39,6 +39,8 @@ convert_to_inter_gpu_rocm( const FPTYPE * rp) { hipLaunchKernelGGL(_phys2Inter, 1, 1, 0, 0, ri, rp, region.rec_boxt); + 
DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -49,6 +51,8 @@ convert_to_phys_gpu_rocm( const FPTYPE * ri) { hipLaunchKernelGGL(_inter2Phys, 1, 1, 0, 0, rp, ri, region.boxt); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -58,6 +62,8 @@ volume_gpu_rocm( const Region & region) { hipLaunchKernelGGL(_compute_volume, 1, 1, 0, 0, volume, region.boxt); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template void convert_to_inter_gpu_rocm(float * ri, const Region & region, const float * rp); diff --git a/source/lib/src/rocm/tabulate.hip.cu b/source/lib/src/rocm/tabulate.hip.cu index b4097083b0..497f4931cd 100644 --- a/source/lib/src/rocm/tabulate.hip.cu +++ b/source/lib/src/rocm/tabulate.hip.cu @@ -213,6 +213,8 @@ template hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_fifth_order_polynomial), nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, out, table, em_x, em, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template @@ -229,16 +231,18 @@ template const int last_layer_size) { if( nloc<=0 ) { return;} - hipErrcheck(hipMemset( + DPErrcheck(hipMemset( dy_dem_x, 0.0, sizeof(FPTYPE) * nloc * nnei)); - hipErrcheck(hipMemset( + DPErrcheck(hipMemset( dy_dem, 0.0, sizeof(FPTYPE) * nloc * nnei * 4)); hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_grad_fifth_order_polynomial), nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size, 0, dy_dem_x, dy_dem, table, em_x, em, dy, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); } template void tabulate_fusion_gpu_rocm(float * out, const float * table, const float * table_info, const float * em_x, const float * em, const int nloc, const int nnei, const int 
last_layer_size); diff --git a/source/lib/tests/test_env_mat_a.cc b/source/lib/tests/test_env_mat_a.cc index cfb70acfe9..df7b5f38a0 100644 --- a/source/lib/tests/test_env_mat_a.cc +++ b/source/lib/tests/test_env_mat_a.cc @@ -557,7 +557,7 @@ TEST_F(TestEnvMatA, prod_gpu_cuda) deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); - deepmd::convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); deepmd::prod_env_mat_a_gpu_cuda( em_dev, @@ -588,7 +588,7 @@ TEST_F(TestEnvMatA, prod_gpu_cuda) deepmd::delete_device_memory(avg_dev); deepmd::delete_device_memory(std_dev); deepmd::delete_device_memory(memory_dev); - deepmd::free_nlist_gpu_cuda(gpu_inlist); + deepmd::free_nlist_gpu_device(gpu_inlist); for(int ii = 0; ii < nloc; ++ii){ for (int jj = 0; jj < nnei; ++jj){ @@ -648,7 +648,7 @@ TEST_F(TestEnvMatA, prod_gpu_cuda_equal_cpu) deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); - deepmd::convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); deepmd::prod_env_mat_a_gpu_cuda( em_dev, @@ -682,7 +682,7 @@ TEST_F(TestEnvMatA, prod_gpu_cuda_equal_cpu) deepmd::delete_device_memory(avg_dev); deepmd::delete_device_memory(std_dev); deepmd::delete_device_memory(memory_dev); - deepmd::free_nlist_gpu_cuda(gpu_inlist); + deepmd::free_nlist_gpu_device(gpu_inlist); std::vector fmt_nlist_a_1, fmt_nlist_r_1; std::vector env_1, env_deriv_1, rij_a_1; @@ -770,7 +770,7 @@ TEST_F(TestEnvMatA, prod_gpu_rocm) 
deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); - deepmd::convert_nlist_gpu_rocm(gpu_inlist, inlist, memory_dev, max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); deepmd::prod_env_mat_a_gpu_rocm( em_dev, @@ -801,7 +801,7 @@ TEST_F(TestEnvMatA, prod_gpu_rocm) deepmd::delete_device_memory(avg_dev); deepmd::delete_device_memory(std_dev); deepmd::delete_device_memory(memory_dev); - deepmd::free_nlist_gpu_rocm(gpu_inlist); + deepmd::free_nlist_gpu_device(gpu_inlist); for(int ii = 0; ii < nloc; ++ii){ for (int jj = 0; jj < nnei; ++jj){ @@ -861,7 +861,7 @@ TEST_F(TestEnvMatA, prod_gpu_rocm_equal_cpu) deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); - deepmd::convert_nlist_gpu_rocm(gpu_inlist, inlist, memory_dev, max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); deepmd::prod_env_mat_a_gpu_rocm( em_dev, @@ -895,7 +895,7 @@ TEST_F(TestEnvMatA, prod_gpu_rocm_equal_cpu) deepmd::delete_device_memory(avg_dev); deepmd::delete_device_memory(std_dev); deepmd::delete_device_memory(memory_dev); - deepmd::free_nlist_gpu_rocm(gpu_inlist); + deepmd::free_nlist_gpu_device(gpu_inlist); std::vector fmt_nlist_a_1, fmt_nlist_r_1; std::vector env_1, env_deriv_1, rij_a_1; diff --git a/source/lib/tests/test_env_mat_r.cc b/source/lib/tests/test_env_mat_r.cc index 1b232e28b5..39d36be42d 100644 --- a/source/lib/tests/test_env_mat_r.cc +++ b/source/lib/tests/test_env_mat_r.cc @@ -400,7 +400,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda) deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); 
deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); - deepmd::convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); deepmd::prod_env_mat_r_gpu_cuda( em_dev, @@ -431,7 +431,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda) deepmd::delete_device_memory(avg_dev); deepmd::delete_device_memory(std_dev); deepmd::delete_device_memory(memory_dev); - deepmd::free_nlist_gpu_cuda(gpu_inlist); + deepmd::free_nlist_gpu_device(gpu_inlist); for(int ii = 0; ii < nloc; ++ii){ for (int jj = 0; jj < nnei; ++jj){ @@ -490,7 +490,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda_equal_cpu) deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); - deepmd::convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); deepmd::prod_env_mat_r_gpu_cuda( em_dev, @@ -524,7 +524,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda_equal_cpu) deepmd::delete_device_memory(avg_dev); deepmd::delete_device_memory(std_dev); deepmd::delete_device_memory(memory_dev); - deepmd::free_nlist_gpu_cuda(gpu_inlist); + deepmd::free_nlist_gpu_device(gpu_inlist); std::vector fmt_nlist_a_1, fmt_nlist_r_1; std::vector env_1, env_deriv_1, rij_a_1; @@ -603,7 +603,7 @@ TEST_F(TestEnvMatR, prod_gpu_rocm) deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); - deepmd::convert_nlist_gpu_rocm(gpu_inlist, inlist, memory_dev, max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); 
deepmd::prod_env_mat_r_gpu_rocm( em_dev, @@ -634,7 +634,7 @@ TEST_F(TestEnvMatR, prod_gpu_rocm) deepmd::delete_device_memory(avg_dev); deepmd::delete_device_memory(std_dev); deepmd::delete_device_memory(memory_dev); - deepmd::free_nlist_gpu_rocm(gpu_inlist); + deepmd::free_nlist_gpu_device(gpu_inlist); for(int ii = 0; ii < nloc; ++ii){ for (int jj = 0; jj < nnei; ++jj){ @@ -693,7 +693,7 @@ TEST_F(TestEnvMatR, prod_gpu_rocm_equal_cpu) deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); - deepmd::convert_nlist_gpu_rocm(gpu_inlist, inlist, memory_dev, max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); deepmd::prod_env_mat_r_gpu_rocm( em_dev, @@ -727,7 +727,7 @@ TEST_F(TestEnvMatR, prod_gpu_rocm_equal_cpu) deepmd::delete_device_memory(avg_dev); deepmd::delete_device_memory(std_dev); deepmd::delete_device_memory(memory_dev); - deepmd::free_nlist_gpu_rocm(gpu_inlist); + deepmd::free_nlist_gpu_device(gpu_inlist); std::vector fmt_nlist_a_1, fmt_nlist_r_1; std::vector env_1, env_deriv_1, rij_a_1; diff --git a/source/lib/tests/test_fmt_nlist.cc b/source/lib/tests/test_fmt_nlist.cc index d2de6e8855..844d110de5 100644 --- a/source/lib/tests/test_fmt_nlist.cc +++ b/source/lib/tests/test_fmt_nlist.cc @@ -381,7 +381,7 @@ TEST_F(TestFormatNlist, gpu_cuda) deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); - deepmd::convert_nlist_gpu_cuda(gpu_inlist, in_nlist, memory_dev, max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, in_nlist, memory_dev, max_nbor_size); // format nlist format_nbor_list_gpu_cuda( nlist_dev, @@ -393,7 +393,7 @@ TEST_F(TestFormatNlist, gpu_cuda) 
deepmd::delete_device_memory(array_int_dev); deepmd::delete_device_memory(array_longlong_dev); deepmd::delete_device_memory(memory_dev); - deepmd::free_nlist_gpu_cuda(gpu_inlist); + deepmd::free_nlist_gpu_device(gpu_inlist); // validate for(int ii = 0; ii < nlist.size(); ++ii){ EXPECT_EQ(nlist[ii], expect_nlist_cpy[ii]); @@ -437,7 +437,7 @@ TEST_F(TestFormatNlistShortSel, gpu_cuda) deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); - deepmd::convert_nlist_gpu_cuda(gpu_inlist, in_nlist, memory_dev, max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, in_nlist, memory_dev, max_nbor_size); // format nlist format_nbor_list_gpu_cuda( nlist_dev, @@ -449,7 +449,7 @@ TEST_F(TestFormatNlistShortSel, gpu_cuda) deepmd::delete_device_memory(array_int_dev); deepmd::delete_device_memory(array_longlong_dev); deepmd::delete_device_memory(memory_dev); - deepmd::free_nlist_gpu_cuda(gpu_inlist); + deepmd::free_nlist_gpu_device(gpu_inlist); // validate for(int ii = 0; ii < nlist.size(); ++ii){ EXPECT_EQ(nlist[ii], expect_nlist_cpy[ii]); @@ -565,7 +565,7 @@ TEST_F(TestFormatNlist, gpu_rocm) deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); - deepmd::convert_nlist_gpu_rocm(gpu_inlist, in_nlist, memory_dev, max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, in_nlist, memory_dev, max_nbor_size); // format nlist format_nbor_list_gpu_rocm( nlist_dev, @@ -577,7 +577,7 @@ TEST_F(TestFormatNlist, gpu_rocm) deepmd::delete_device_memory(array_int_dev); deepmd::delete_device_memory(array_longlong_dev); deepmd::delete_device_memory(memory_dev); - deepmd::free_nlist_gpu_rocm(gpu_inlist); + 
deepmd::free_nlist_gpu_device(gpu_inlist); // validate for(int ii = 0; ii < nlist.size(); ++ii){ EXPECT_EQ(nlist[ii], expect_nlist_cpy[ii]); @@ -621,7 +621,7 @@ TEST_F(TestFormatNlistShortSel, gpu_rocm) deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); - deepmd::convert_nlist_gpu_rocm(gpu_inlist, in_nlist, memory_dev, max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, in_nlist, memory_dev, max_nbor_size); // format nlist format_nbor_list_gpu_rocm( nlist_dev, @@ -633,7 +633,7 @@ TEST_F(TestFormatNlistShortSel, gpu_rocm) deepmd::delete_device_memory(array_int_dev); deepmd::delete_device_memory(array_longlong_dev); deepmd::delete_device_memory(memory_dev); - deepmd::free_nlist_gpu_rocm(gpu_inlist); + deepmd::free_nlist_gpu_device(gpu_inlist); // validate for(int ii = 0; ii < nlist.size(); ++ii){ EXPECT_EQ(nlist[ii], expect_nlist_cpy[ii]); diff --git a/source/lmp/CMakeLists.txt b/source/lmp/CMakeLists.txt index bc52c8fa81..2c68352385 100644 --- a/source/lmp/CMakeLists.txt +++ b/source/lmp/CMakeLists.txt @@ -7,6 +7,11 @@ list (APPEND LMP_INSTALL_FILES ${LMP_HEADER}) list (APPEND LMP_INSTALL_FILES ${LMP_SRC}) list (APPEND LMP_INSTALL_FILES ${LMP_SHSCRIPT}) -configure_file("env.sh.in" "env.sh" @ONLY) +function(_add_lmp_variant variant_name prec_def) +configure_file("env.sh.in" "env${variant_name}.sh" @ONLY) +endfunction() +_add_lmp_variant("${HIGH_PREC_VARIANT}" "${HIGH_PREC_DEF}") +_add_lmp_variant("${LOW_PREC_VARIANT}" "${LOW_PREC_DEF}") + configure_file("pair_deepmd.h.in" "pair_deepmd.h" @ONLY) configure_file("lammps_install_list.txt.in" "lammps_install_list.txt" @ONLY) diff --git a/source/lmp/compute_deeptensor_atom.cpp b/source/lmp/compute_deeptensor_atom.cpp new file mode 100644 index 0000000000..85f7d0a332 --- /dev/null +++ b/source/lmp/compute_deeptensor_atom.cpp @@ -0,0 
+1,172 @@ +#include "compute_deeptensor_atom.h" +#include +#include +#include "atom.h" +#include "update.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "comm.h" +#include "force.h" +#include "pair.h" +#include "fix.h" +#include "memory.h" +#include "error.h" + +#include "domain.h" +#include "update.h" +#include "modify.h" +#include "fix.h" + +using namespace LAMMPS_NS; + +#ifdef HIGH_PREC +#define VALUETYPE double +#else +#define VALUETYPE float +#endif + +/* ---------------------------------------------------------------------- */ + +ComputeDeeptensorAtom::ComputeDeeptensorAtom(LAMMPS *lmp, int narg, char **arg) : + Compute(lmp, narg, arg), + dp(lmp), + tensor(nullptr) +{ + if (narg < 4) error->all(FLERR,"Illegal compute deeptensor/atom command"); + + // parse args + std::string model_file = std::string(arg[3]); + + // initialize deeptensor + int gpu_rank = dp.get_node_rank(); + std::string model_file_content = dp.get_file_content(model_file); + dt.init(model_file, gpu_rank); + sel_types = dt.sel_types(); + std::sort(sel_types.begin(), sel_types.end()); + + peratom_flag = 1; + size_peratom_cols = dt.output_dim(); + pressatomflag = 0; + timeflag = 1; + + nmax = 0; +} + +/* ---------------------------------------------------------------------- */ + +ComputeDeeptensorAtom::~ComputeDeeptensorAtom() +{ + memory->destroy(tensor); +} + +/* ---------------------------------------------------------------------- */ + +void ComputeDeeptensorAtom::init() +{ + // need an occasional full neighbor list + + int irequest = neighbor->request(this,instance_me); + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->pair = 0; + neighbor->requests[irequest]->compute = 1; + // neighbor->requests[irequest]->full = 1; + neighbor->requests[irequest]->occasional = 1; +} + +void ComputeDeeptensorAtom::init_list(int /*id*/, NeighList *ptr) +{ + list = ptr; +} + +/* 
---------------------------------------------------------------------- */ + +void ComputeDeeptensorAtom::compute_peratom() +{ + invoked_peratom = update->ntimestep; + + // grow local tensor array if necessary + // needs to be atom->nmax in length + if (atom->nmax > nmax) { + memory->destroy(tensor); + nmax = atom->nmax; + memory->create(tensor, nmax, size_peratom_cols, "deeptensor/atom:tensor"); + array_atom = tensor; + } + + double **x = atom->x; + double **f = atom->f; + int *type = atom->type; + int *mask = atom->mask; + int nlocal = atom->nlocal; + int nghost = atom->nghost; + int nall = nlocal + nghost; + int newton_pair = force->newton_pair; + + std::vector dcoord (nall * 3, 0.); + std::vector dbox (9, 0) ; + std::vector dtype (nall); + // get type + for (int ii = 0; ii < nall; ++ii){ + dtype[ii] = type[ii] - 1; + } + // get box + dbox[0] = domain->h[0]; // xx + dbox[4] = domain->h[1]; // yy + dbox[8] = domain->h[2]; // zz + dbox[7] = domain->h[3]; // zy + dbox[6] = domain->h[4]; // zx + dbox[3] = domain->h[5]; // yx + // get coord + for (int ii = 0; ii < nall; ++ii){ + for (int dd = 0; dd < 3; ++dd){ + dcoord[ii*3+dd] = x[ii][dd] - domain->boxlo[dd]; + } + } + + // invoke full neighbor list (will copy or build if necessary) + neighbor->build_one(list); + deepmd::InputNlist lmp_list (list->inum, list->ilist, list->numneigh, list->firstneigh); + + // declare outputs + std::vector gtensor, force, virial, atensor, avirial; + + // compute tensors + dt.compute (gtensor, force, virial, atensor, avirial, + dcoord, dtype, dbox, nghost, lmp_list); + + // store the result in tensor + int iter_tensor = 0; + for(int ii = 0; ii < nlocal; ++ii){ + std::vector::iterator _it = + std::find(sel_types.begin(), sel_types.end(), dtype[ii]); + bool selected = (_it != sel_types.end()); + bool ingroup = (mask[ii] & groupbit); + // record when selected and in group + if (selected && ingroup){ + for(int jj = 0; jj < size_peratom_cols; ++jj){ + tensor[ii][jj] = atensor[iter_tensor+jj]; 
+ } + } + // if not selected or not in group set to 0. + else{ + for(int jj = 0; jj < size_peratom_cols; ++jj){ + tensor[ii][jj] = 0.0; + } + } + if (selected) { + iter_tensor += size_peratom_cols; + } + } +} + + +/* ---------------------------------------------------------------------- + memory usage of local atom-based array +------------------------------------------------------------------------- */ + +double ComputeDeeptensorAtom::memory_usage() +{ + double bytes = nmax*size_peratom_cols * sizeof(double); + return bytes; +} diff --git a/source/lmp/compute_deeptensor_atom.h b/source/lmp/compute_deeptensor_atom.h new file mode 100644 index 0000000000..2499e5fda0 --- /dev/null +++ b/source/lmp/compute_deeptensor_atom.h @@ -0,0 +1,38 @@ +#ifdef COMPUTE_CLASS + +ComputeStyle(deeptensor/atom,ComputeDeeptensorAtom) + +#else + +#ifndef LMP_COMPUTE_DEEPTENSOR_ATOM_H +#define LMP_COMPUTE_DEEPTENSOR_ATOM_H + +#include "compute.h" +#include "pair_deepmd.h" +#include "deepmd/DeepTensor.h" + +namespace LAMMPS_NS { + +class ComputeDeeptensorAtom : public Compute { + public: + ComputeDeeptensorAtom(class LAMMPS *, int, char **); + ~ComputeDeeptensorAtom(); + void init(); + void compute_peratom(); + double memory_usage(); + void init_list(int, class NeighList *); + + private: + int nmax; + double **tensor; + PairDeepMD dp; + class NeighList *list; + deepmd::DeepTensor dt; + std::vector sel_types; +}; + +} + +#endif +#endif + diff --git a/source/lmp/env.sh.in b/source/lmp/env.sh.in index 8157ddb8dd..25ca8dc90c 100644 --- a/source/lmp/env.sh.in +++ b/source/lmp/env.sh.in @@ -6,6 +6,6 @@ TF_INCLUDE_DIRS=`echo $TENSORFLOW_INCLUDE_DIRS | sed "s/;/ -I/g"` TF_LIBRARY_PATH=`echo $TENSORFLOW_LIBRARY_PATH | sed "s/;/ -L/g"` TF_RPATH=`echo $TENSORFLOW_LIBRARY_PATH | sed "s/;/ -Wl,-rpath=/g"` -NNP_INC=" -std=c++11 @PREC_DEF@ @TTM_DEF@ @OLD_LMP_PPPM_DEF@ -I$TF_INCLUDE_DIRS -I$DEEPMD_ROOT/include/ " +NNP_INC=" -std=c++11 -D@prec_def@ @TTM_DEF@ @OLD_LMP_PPPM_DEF@ -I$TF_INCLUDE_DIRS 
-I$DEEPMD_ROOT/include/ " NNP_PATH=" -L$TF_LIBRARY_PATH -L$DEEPMD_ROOT/lib" -NNP_LIB=" -Wl,--no-as-needed -l@LIB_DEEPMD_OP_DEVICE@ -l@LIB_DEEPMD_OP@ -l@LIB_DEEPMD_CC@ -l@LIB_DEEPMD@ -ltensorflow_cc -ltensorflow_framework -Wl,-rpath=$TF_RPATH -Wl,-rpath=$DEEPMD_ROOT/lib" +NNP_LIB=" -Wl,--no-as-needed -l@LIB_DEEPMD_OP_DEVICE@ -l@LIB_DEEPMD_OP@ -l@LIB_DEEPMD_CC@@variant_name@ -l@LIB_DEEPMD@ -ltensorflow_cc -ltensorflow_framework -Wl,-rpath=$TF_RPATH -Wl,-rpath=$DEEPMD_ROOT/lib" diff --git a/source/md/CMakeLists.txt b/source/md/CMakeLists.txt index 2db205dda2..4de0c00d6e 100644 --- a/source/md/CMakeLists.txt +++ b/source/md/CMakeLists.txt @@ -17,33 +17,38 @@ if (MAKE_FF_AD) set(MDFF_SOURCE_FILES mdff.cc) endif() -add_executable(dp_mdnn ${MDNN_SOURCE_FILES}) +function(_add_md_variant variant_name prec_def) +set (dp_mdnn_name "dp_mdnn${variant_name}") +set (dp_mdff_name "dp_mdff${variant_name}") +set (dp_mdad_name "dp_mdad${variant_name}") + +add_executable(${dp_mdnn_name} ${MDNN_SOURCE_FILES}) if (MAKE_FF_AD) - add_executable(dp_mdff ${MDFF_SOURCE_FILES}) - add_executable(dp_mdad ${MDAD_SOURCE_FILES}) + add_executable(${dp_mdff_name} ${MDFF_SOURCE_FILES}) + add_executable(${dp_mdad_name} ${MDAD_SOURCE_FILES}) endif() -target_link_libraries(dp_mdnn ${LIB_DEEPMD_NATIVE} ${LIB_DEEPMD_OP} ${LIB_DEEPMD} ${XDRFILE_LIBRARIES} ${TensorFlow_LIBRARY}) +target_link_libraries(${dp_mdnn_name} ${LIB_DEEPMD_NATIVE} ${LIB_DEEPMD_OP} ${LIB_DEEPMD} ${XDRFILE_LIBRARIES} ${TensorFlow_LIBRARY}) if (MAKE_FF_AD) - target_link_libraries(dp_mdad ${LIB_DEEPMD_NATIVE} ${LIB_DEEPMD_OP} ${LIB_DEEPMD} ${XDRFILE_LIBRARIES} ${TensorFlow_LIBRARY}) - target_link_libraries(dp_mdff ${LIB_DEEPMD_NATIVE} ${LIB_DEEPMD} ${XDRFILE_LIBRARIES} ${TensorFlow_LIBRARY}) + target_link_libraries(${dp_mdad_name} ${LIB_DEEPMD_NATIVE} ${LIB_DEEPMD_OP} ${LIB_DEEPMD} ${XDRFILE_LIBRARIES} ${TensorFlow_LIBRARY}) + target_link_libraries(${dp_mdff_name} ${LIB_DEEPMD_NATIVE} ${LIB_DEEPMD} ${XDRFILE_LIBRARIES} 
${TensorFlow_LIBRARY}) endif() set_target_properties( - dp_mdnn + ${dp_mdnn_name} PROPERTIES LINK_FLAGS "-Wl,-rpath,'$ORIGIN'/../lib -Wl,-z,defs" INSTALL_RPATH "$ORIGIN/../lib:${TensorFlow_LIBRARY_PATH}" ) if (MAKE_FF_AD) set_target_properties( - dp_mdad + ${dp_mdad_name} PROPERTIES LINK_FLAGS "-Wl,-rpath,'$ORIGIN'/../lib -Wl,-z,defs" INSTALL_RPATH "$ORIGIN/../lib:${TensorFlow_LIBRARY_PATH}" ) set_target_properties( - dp_mdff + ${dp_mdff_name} PROPERTIES LINK_FLAGS "-Wl,-rpath,'$ORIGIN'/../lib -Wl,-z,defs" INSTALL_RPATH "$ORIGIN/../lib:${TensorFlow_LIBRARY_PATH}" @@ -55,16 +60,19 @@ install( DESTINATION lib/ ) install( - TARGETS dp_mdnn + TARGETS ${dp_mdnn_name} DESTINATION bin/ ) if (MAKE_FF_AD) install( - TARGETS dp_mdad + TARGETS ${dp_mdad_name} DESTINATION bin/ ) install( - TARGETS dp_mdff + TARGETS ${dp_mdff_name} DESTINATION bin/ ) endif() +endfunction() +_add_md_variant("${HIGH_PREC_VARIANT}" "${HIGH_PREC_DEF}") +_add_md_variant("${LOW_PREC_VARIANT}" "${LOW_PREC_DEF}") diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index 340c5601fb..1075847953 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -3,10 +3,10 @@ set(OP_LIB ${PROJECT_SOURCE_DIR}/lib/src/SimulationRegion.cpp ${PROJECT_SOURCE_DIR}/lib/src/neighbor_list.cc) set (OP_CXX_FLAG -D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI} ) -file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_ef.cc descrpt_se_a_ef.cc descrpt_se_a_ef_para.cc descrpt_se_a_ef_vert.cc pair_tab.cc prod_force_multi_device.cc prod_virial_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu_multi_device.cc map_aparam.cc neighbor_stat.cc unaggregated_grad.cc tabulate_multi_device.cc prod_env_mat_multi_device.cc) -file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc prod_env_mat_multi_device.cc pair_tab.cc prod_force_multi_device.cc prod_virial_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc tabulate_multi_device.cc) 
-file(GLOB OP_ROCM_SRC prod_force.cc prod_virial.cc descrpt.cc prod_env_mat_multi_device.cc pair_tab.cc prod_force_multi_device.cc prod_virial_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc tabulate_multi_device.cc) -file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_grad_multi_device.cc prod_virial_grad.cc prod_virial_grad_multi_device.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) +file(GLOB OP_SRC custom_op.cc prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_ef.cc descrpt_se_a_ef.cc descrpt_se_a_ef_para.cc descrpt_se_a_ef_vert.cc pair_tab.cc prod_force_multi_device.cc prod_virial_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu_multi_device.cc map_aparam.cc neighbor_stat.cc unaggregated_grad.cc tabulate_multi_device.cc prod_env_mat_multi_device.cc) +file(GLOB OP_CUDA_SRC custom_op.cc prod_force.cc prod_virial.cc descrpt.cc prod_env_mat_multi_device.cc pair_tab.cc prod_force_multi_device.cc prod_virial_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc tabulate_multi_device.cc) +file(GLOB OP_ROCM_SRC custom_op.cc prod_force.cc prod_virial.cc descrpt.cc prod_env_mat_multi_device.cc pair_tab.cc prod_force_multi_device.cc prod_virial_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc tabulate_multi_device.cc) +file(GLOB OP_GRADS_SRC custom_op.cc prod_force_grad.cc prod_force_grad_multi_device.cc prod_virial_grad.cc prod_virial_grad_multi_device.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) file(GLOB OP_PY *.py) if (BUILD_CPP_IF) diff --git a/source/op/custom_op.cc b/source/op/custom_op.cc new file mode 100644 index 0000000000..741fb3ace6 --- /dev/null +++ b/source/op/custom_op.cc @@ -0,0 +1,20 @@ +#include "custom_op.h" +#include "errors.h" + +namespace deepmd { + void safe_compute(OpKernelContext* context, std::function ff) { + try{ + ff(context); + } catch (deepmd::deepmd_exception_oom& e){ + 
OP_REQUIRES_OK( + context, + errors::ResourceExhausted("Operation received an exception: ", e.what(), + ", in file ",__FILE__, ":", __LINE__)); + } catch (deepmd::deepmd_exception& e) { + OP_REQUIRES_OK( + context, + errors::Internal("Operation received an exception: ", e.what(), + ", in file ",__FILE__, ":", __LINE__)); + } + } +}; \ No newline at end of file diff --git a/source/op/custom_op.h b/source/op/custom_op.h index e4f9211e61..8482e92b03 100644 --- a/source/op/custom_op.h +++ b/source/op/custom_op.h @@ -26,4 +26,8 @@ struct DeviceFunctor { device = "GPU"; } #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +}; + +namespace deepmd { + void safe_compute(OpKernelContext* context, std::function ff); }; \ No newline at end of file diff --git a/source/op/descrpt.cc b/source/op/descrpt.cc index 10ba125594..7fdf81d986 100644 --- a/source/op/descrpt.cc +++ b/source/op/descrpt.cc @@ -2,6 +2,7 @@ #include "ComputeDescriptor.h" #include "neighbor_list.h" #include "fmt_nlist.h" +#include "errors.h" typedef double boxtensor_t ; typedef double compute_t; @@ -49,6 +50,10 @@ class DescrptOp : public OpKernel { } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor const Tensor& coord_tensor = context->input(0); const Tensor& type_tensor = context->input(1); @@ -105,7 +110,7 @@ class DescrptOp : public OpKernel { nei_mode = -1; } else { - throw std::runtime_error("invalid mesh tensor"); + throw deepmd::deepmd_exception("invalid mesh tensor"); } bool b_pbc = true; // if region is given extended, do not use pbc @@ -254,7 +259,7 @@ class DescrptOp : public OpKernel { ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, rcut_a, rcut_r, NULL); } else { - throw std::runtime_error("unknow neighbor mode"); + throw deepmd::deepmd_exception("unknow neighbor mode"); } // loop over atoms, compute descriptors for each atom diff --git 
a/source/op/descrpt_se_a_ef.cc b/source/op/descrpt_se_a_ef.cc index 3ba41624d9..121205c9cf 100644 --- a/source/op/descrpt_se_a_ef.cc +++ b/source/op/descrpt_se_a_ef.cc @@ -3,6 +3,7 @@ #include "ComputeDescriptor.h" #include "neighbor_list.h" #include "fmt_nlist.h" +#include "errors.h" typedef double boxtensor_t ; typedef double compute_t; @@ -49,6 +50,10 @@ class DescrptSeAEfOp : public OpKernel { } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& coord_tensor = context->input(context_input_index++); @@ -112,7 +117,7 @@ class DescrptSeAEfOp : public OpKernel { nei_mode = -1; } else { - throw std::runtime_error("invalid mesh tensor"); + throw deepmd::deepmd_exception("invalid mesh tensor"); } bool b_pbc = true; // if region is given extended, do not use pbc @@ -267,7 +272,7 @@ class DescrptSeAEfOp : public OpKernel { ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, rcut_a, rcut_r, NULL); } else { - throw std::runtime_error("unknow neighbor mode"); + throw deepmd::deepmd_exception("unknow neighbor mode"); } // loop over atoms, compute descriptors for each atom diff --git a/source/op/descrpt_se_a_ef_para.cc b/source/op/descrpt_se_a_ef_para.cc index 2cb3b3445c..952c53d473 100644 --- a/source/op/descrpt_se_a_ef_para.cc +++ b/source/op/descrpt_se_a_ef_para.cc @@ -2,6 +2,7 @@ #include "ComputeDescriptor.h" #include "neighbor_list.h" #include "fmt_nlist.h" +#include "errors.h" typedef double boxtensor_t ; typedef double compute_t; @@ -48,6 +49,10 @@ class DescrptSeAEfParaOp : public OpKernel { } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& coord_tensor 
= context->input(context_input_index++); @@ -111,7 +116,7 @@ class DescrptSeAEfParaOp : public OpKernel { nei_mode = -1; } else { - throw std::runtime_error("invalid mesh tensor"); + throw deepmd::deepmd_exception("invalid mesh tensor"); } bool b_pbc = true; // if region is given extended, do not use pbc @@ -266,7 +271,7 @@ class DescrptSeAEfParaOp : public OpKernel { ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, rcut_a, rcut_r, NULL); } else { - throw std::runtime_error("unknow neighbor mode"); + throw deepmd::deepmd_exception("unknow neighbor mode"); } // loop over atoms, compute descriptors for each atom diff --git a/source/op/descrpt_se_a_ef_vert.cc b/source/op/descrpt_se_a_ef_vert.cc index 615b153bf3..4ef76f8e0f 100644 --- a/source/op/descrpt_se_a_ef_vert.cc +++ b/source/op/descrpt_se_a_ef_vert.cc @@ -2,6 +2,7 @@ #include "ComputeDescriptor.h" #include "neighbor_list.h" #include "fmt_nlist.h" +#include "errors.h" typedef double boxtensor_t ; typedef double compute_t; @@ -48,6 +49,10 @@ class DescrptSeAEfVertOp : public OpKernel { } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& coord_tensor = context->input(context_input_index++); @@ -111,7 +116,7 @@ class DescrptSeAEfVertOp : public OpKernel { nei_mode = -1; } else { - throw std::runtime_error("invalid mesh tensor"); + throw deepmd::deepmd_exception("invalid mesh tensor"); } bool b_pbc = true; // if region is given extended, do not use pbc @@ -266,7 +271,7 @@ class DescrptSeAEfVertOp : public OpKernel { ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, rcut_a, rcut_r, NULL); } else { - throw std::runtime_error("unknow neighbor mode"); + throw deepmd::deepmd_exception("unknow neighbor mode"); } // loop over atoms, compute descriptors for each atom diff --git a/source/op/ewald_recp.cc 
b/source/op/ewald_recp.cc index 9159dc5931..c9cc22b480 100644 --- a/source/op/ewald_recp.cc +++ b/source/op/ewald_recp.cc @@ -28,6 +28,10 @@ class EwaldRecpOp : public OpKernel { } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int cc = 0; const Tensor& coord_tensor = context->input(cc++); diff --git a/source/op/gelu_multi_device.cc b/source/op/gelu_multi_device.cc index 508f60ccef..dc86ab6c8d 100644 --- a/source/op/gelu_multi_device.cc +++ b/source/op/gelu_multi_device.cc @@ -26,6 +26,10 @@ class GeluOp : public OpKernel { public : explicit GeluOp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor const Tensor& x_tensor = context->input(0); Tensor * output_tensor = NULL; @@ -73,6 +77,10 @@ class GeluGradOp : public OpKernel { public : explicit GeluGradOp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor const Tensor& dy_tensor = context->input(0); const Tensor& x_tensor = context->input(1); @@ -122,6 +130,10 @@ class GeluGradGradOp : public OpKernel { public : explicit GeluGradGradOp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor const Tensor& dy_tensor = context->input(0); const Tensor& dy_2_tensor = context->input(1); diff --git 
a/source/op/legacy/descrpt_se_a.cc b/source/op/legacy/descrpt_se_a.cc index 51b8e26e0f..cd7abf8a76 100644 --- a/source/op/legacy/descrpt_se_a.cc +++ b/source/op/legacy/descrpt_se_a.cc @@ -3,6 +3,7 @@ #include "neighbor_list.h" #include "fmt_nlist.h" #include "env_mat.h" +#include "errors.h" typedef double boxtensor_t ; typedef double compute_t; @@ -107,7 +108,7 @@ class DescrptSeAOp : public OpKernel { nei_mode = -1; } else { - throw std::runtime_error("invalid mesh tensor"); + throw deepmd::deepmd_exception("invalid mesh tensor"); } bool b_pbc = true; // if region is given extended, do not use pbc @@ -253,7 +254,7 @@ class DescrptSeAOp : public OpKernel { ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, rcut_a, rcut_r, NULL); } else { - throw std::runtime_error("unknow neighbor mode"); + throw deepmd::deepmd_exception("unknow neighbor mode"); } // loop over atoms, compute descriptors for each atom diff --git a/source/op/legacy/descrpt_se_r.cc b/source/op/legacy/descrpt_se_r.cc index 7031ed20e8..408818fbee 100644 --- a/source/op/legacy/descrpt_se_r.cc +++ b/source/op/legacy/descrpt_se_r.cc @@ -3,6 +3,7 @@ #include "neighbor_list.h" #include "fmt_nlist.h" #include "env_mat.h" +#include "errors.h" typedef double boxtensor_t ; typedef double compute_t; @@ -99,7 +100,7 @@ class DescrptSeROp : public OpKernel { nei_mode = -1; } else { - throw std::runtime_error("invalid mesh tensor"); + throw deepmd::deepmd_exception("invalid mesh tensor"); } bool b_pbc = true; // if region is given extended, do not use pbc @@ -238,7 +239,7 @@ class DescrptSeROp : public OpKernel { ::build_nlist (d_nlist_null, d_nlist, d_coord3, -1, rcut, NULL); } else { - throw std::runtime_error("unknow neighbor mode"); + throw deepmd::deepmd_exception("unknow neighbor mode"); } // loop over atoms, compute descriptors for each atom diff --git a/source/op/map_aparam.cc b/source/op/map_aparam.cc index f1c98bdc9c..cd70435f99 100644 --- a/source/op/map_aparam.cc +++ b/source/op/map_aparam.cc @@ -20,6 +20,10 
@@ class MapAparamOp : public OpKernel { } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& aparam_tensor = context->input(context_input_index++); diff --git a/source/op/neighbor_stat.cc b/source/op/neighbor_stat.cc index 11f991b4b7..fad4617cc5 100644 --- a/source/op/neighbor_stat.cc +++ b/source/op/neighbor_stat.cc @@ -1,5 +1,6 @@ #include "custom_op.h" #include "neighbor_list.h" +#include "errors.h" typedef double boxtensor_t ; typedef double compute_t; @@ -23,6 +24,10 @@ class NeighborStatOp : public OpKernel { } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& coord_tensor = context->input(context_input_index++); @@ -60,7 +65,7 @@ class NeighborStatOp : public OpKernel { nei_mode = -1; } else { - throw std::runtime_error("invalid mesh tensor"); + throw deepmd::deepmd_exception("invalid mesh tensor"); } // if region is given extended, do not use pbc bool b_pbc = (nei_mode >= 1 || nei_mode == -1) ? 
false : true; @@ -139,7 +144,7 @@ class NeighborStatOp : public OpKernel { ::build_nlist (d_nlist_a, d_nlist_r, d_coord3, -1, rcut, NULL); } else { - throw std::runtime_error("unknow neighbor mode"); + throw deepmd::deepmd_exception("unknow neighbor mode"); } int MAX_NNEI = 0; diff --git a/source/op/pair_tab.cc b/source/op/pair_tab.cc index e09ef460b4..2a22e17102 100644 --- a/source/op/pair_tab.cc +++ b/source/op/pair_tab.cc @@ -34,6 +34,10 @@ class PairTabOp : public OpKernel { } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int tmp_idx = 0; const Tensor& table_info_tensor = context->input(tmp_idx++); diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc index 7c7130cda0..69e08eaa5e 100644 --- a/source/op/prod_env_mat_multi_device.cc +++ b/source/op/prod_env_mat_multi_device.cc @@ -4,6 +4,7 @@ #include "region.h" #include "neighbor_list.h" #include "prod_env_mat.h" +#include "errors.h" REGISTER_OP("ProdEnvMatA") .Attr("T: {float, double} = DT_DOUBLE") @@ -321,6 +322,10 @@ class ProdEnvMatAOp : public OpKernel { } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& coord_tensor = context->input(context_input_index++); @@ -382,7 +387,7 @@ class ProdEnvMatAOp : public OpKernel { nei_mode = -1; } else { - throw std::runtime_error("invalid mesh tensor"); + throw deepmd::deepmd_exception("invalid mesh tensor"); } // Create output tensors @@ -584,6 +589,10 @@ class ProdEnvMatROp : public OpKernel { } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void 
_Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& coord_tensor = context->input(context_input_index++); @@ -642,7 +651,7 @@ class ProdEnvMatROp : public OpKernel { nei_mode = -1; } else { - throw std::runtime_error("invalid mesh tensor"); + throw deepmd::deepmd_exception("invalid mesh tensor"); } // Create an output tensor @@ -995,7 +1004,7 @@ _norm_copy_coord_gpu( FPTYPE_shape.AddDim(nall*3); context->allocate_temp(DataTypeToEnum::value, FPTYPE_shape, tensor_list); FPTYPE * tmp_coord = (*tensor_list).flat().data(); - cudaErrcheck(cudaMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, cudaMemcpyDeviceToDevice)); + DPErrcheck(cudaMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, cudaMemcpyDeviceToDevice)); deepmd::Region region; init_region_cpu(region, box); @@ -1210,7 +1219,7 @@ _norm_copy_coord_gpu_rocm( FPTYPE_shape.AddDim(nall*3); context->allocate_temp(DataTypeToEnum::value, FPTYPE_shape, tensor_list); FPTYPE * tmp_coord = (*tensor_list).flat().data(); - hipErrcheck(hipMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, hipMemcpyDeviceToDevice)); + DPErrcheck(hipMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, hipMemcpyDeviceToDevice)); deepmd::Region region; init_region_cpu(region, box); diff --git a/source/op/prod_force.cc b/source/op/prod_force.cc index 307d00a85d..a97fb6c575 100644 --- a/source/op/prod_force.cc +++ b/source/op/prod_force.cc @@ -26,6 +26,10 @@ class ProdForceOp : public OpKernel { } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor const Tensor& net_deriv_tensor = context->input(0); const Tensor& in_deriv_tensor = context->input(1); diff --git a/source/op/prod_force_grad.cc b/source/op/prod_force_grad.cc index 52c8ed845f..67423d7489 100644 --- a/source/op/prod_force_grad.cc +++ b/source/op/prod_force_grad.cc @@ 
-25,6 +25,10 @@ class ProdForceGradOp : public OpKernel } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor const Tensor& grad_tensor = context->input(0); const Tensor& net_deriv_tensor = context->input(1); diff --git a/source/op/prod_force_grad_multi_device.cc b/source/op/prod_force_grad_multi_device.cc index 5aff4bbbef..533f6cbf14 100644 --- a/source/op/prod_force_grad_multi_device.cc +++ b/source/op/prod_force_grad_multi_device.cc @@ -31,6 +31,10 @@ class ProdForceSeAGradOp : public OpKernel { } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& grad_tensor = context->input(context_input_index++); @@ -139,6 +143,10 @@ class ProdForceSeRGradOp : public OpKernel explicit ProdForceSeRGradOp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& grad_tensor = context->input(context_input_index++); diff --git a/source/op/prod_force_multi_device.cc b/source/op/prod_force_multi_device.cc index 63e6945906..8df25636f6 100644 --- a/source/op/prod_force_multi_device.cc +++ b/source/op/prod_force_multi_device.cc @@ -25,6 +25,10 @@ class ProdForceSeAOp : public OpKernel { explicit ProdForceSeAOp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab 
the input tensor int context_input_index = 0; const Tensor& net_deriv_tensor = context->input(context_input_index++); diff --git a/source/op/prod_force_se_a_grad.cc b/source/op/prod_force_se_a_grad.cc index 7617c244ed..84b2a7ed3b 100644 --- a/source/op/prod_force_se_a_grad.cc +++ b/source/op/prod_force_se_a_grad.cc @@ -25,6 +25,10 @@ class ProdForceSeAGradOp : public OpKernel } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& grad_tensor = context->input(context_input_index++); diff --git a/source/op/prod_force_se_r_grad.cc b/source/op/prod_force_se_r_grad.cc index 9fff3724ed..e02f0c8750 100644 --- a/source/op/prod_force_se_r_grad.cc +++ b/source/op/prod_force_se_r_grad.cc @@ -20,6 +20,10 @@ class ProdForceSeRGradOp : public OpKernel } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& grad_tensor = context->input(context_input_index++); diff --git a/source/op/prod_virial.cc b/source/op/prod_virial.cc index d83ab27225..a8df2bc848 100644 --- a/source/op/prod_virial.cc +++ b/source/op/prod_virial.cc @@ -28,6 +28,10 @@ class ProdVirialOp : public OpKernel { } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor const Tensor& net_deriv_tensor = context->input(0); const Tensor& in_deriv_tensor = context->input(1); diff --git a/source/op/prod_virial_grad.cc b/source/op/prod_virial_grad.cc index d07a661cb9..33fa0348dc 100644 --- a/source/op/prod_virial_grad.cc +++ 
b/source/op/prod_virial_grad.cc @@ -26,6 +26,10 @@ class ProdVirialGradOp : public OpKernel } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor const Tensor& grad_tensor = context->input(0); const Tensor& net_deriv_tensor = context->input(1); diff --git a/source/op/prod_virial_grad_multi_device.cc b/source/op/prod_virial_grad_multi_device.cc index 7a37da9b38..9afd4462eb 100644 --- a/source/op/prod_virial_grad_multi_device.cc +++ b/source/op/prod_virial_grad_multi_device.cc @@ -34,6 +34,10 @@ class ProdVirialSeAGradOp : public OpKernel } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& grad_tensor = context->input(context_input_index++); @@ -153,6 +157,10 @@ class ProdVirialSeRGradOp : public OpKernel explicit ProdVirialSeRGradOp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& grad_tensor = context->input(context_input_index++); diff --git a/source/op/prod_virial_multi_device.cc b/source/op/prod_virial_multi_device.cc index 02c212a2d9..33c263ef84 100644 --- a/source/op/prod_virial_multi_device.cc +++ b/source/op/prod_virial_multi_device.cc @@ -28,6 +28,10 @@ class ProdVirialSeAOp : public OpKernel { public: explicit ProdVirialSeAOp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + 
} + + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& net_deriv_tensor = context->input(context_input_index++); @@ -120,6 +124,10 @@ class ProdVirialSeROp : public OpKernel { public: explicit ProdVirialSeROp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& net_deriv_tensor = context->input(context_input_index++); diff --git a/source/op/prod_virial_se_a_grad.cc b/source/op/prod_virial_se_a_grad.cc index cb76d29512..00a88e0f76 100644 --- a/source/op/prod_virial_se_a_grad.cc +++ b/source/op/prod_virial_se_a_grad.cc @@ -26,6 +26,10 @@ class ProdVirialSeAGradOp : public OpKernel } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& grad_tensor = context->input(context_input_index++); diff --git a/source/op/prod_virial_se_r_grad.cc b/source/op/prod_virial_se_r_grad.cc index 247f2ee909..7f9005abe4 100644 --- a/source/op/prod_virial_se_r_grad.cc +++ b/source/op/prod_virial_se_r_grad.cc @@ -21,6 +21,10 @@ class ProdVirialSeRGradOp : public OpKernel } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& grad_tensor = context->input(context_input_index++); diff --git a/source/op/soft_min.cc b/source/op/soft_min.cc index c30d9c409a..f7770ab58b 100644 --- a/source/op/soft_min.cc +++ b/source/op/soft_min.cc @@ -37,6 +37,10 @@ class 
SoftMinSwitchOp : public OpKernel { } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int tmp_idx = 0; const Tensor& type_tensor = context->input(tmp_idx++); diff --git a/source/op/soft_min_force.cc b/source/op/soft_min_force.cc index 7d09da6613..f10a48dc26 100644 --- a/source/op/soft_min_force.cc +++ b/source/op/soft_min_force.cc @@ -24,6 +24,10 @@ class SoftMinForceOp : public OpKernel { } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor const Tensor& du_tensor = context->input(0); const Tensor& sw_deriv_tensor = context->input(1); diff --git a/source/op/soft_min_force_grad.cc b/source/op/soft_min_force_grad.cc index a7328734b6..d5095d1005 100644 --- a/source/op/soft_min_force_grad.cc +++ b/source/op/soft_min_force_grad.cc @@ -24,6 +24,10 @@ class SoftMinForceGradOp : public OpKernel } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& grad_tensor = context->input(context_input_index++); diff --git a/source/op/soft_min_virial.cc b/source/op/soft_min_virial.cc index 3273160fe3..72d4a21e55 100644 --- a/source/op/soft_min_virial.cc +++ b/source/op/soft_min_virial.cc @@ -26,6 +26,10 @@ class SoftMinVirialOp : public OpKernel { } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& du_tensor = 
context->input(context_input_index++); diff --git a/source/op/soft_min_virial_grad.cc b/source/op/soft_min_virial_grad.cc index 034aeb7a09..f92ac2a5c9 100644 --- a/source/op/soft_min_virial_grad.cc +++ b/source/op/soft_min_virial_grad.cc @@ -25,6 +25,10 @@ class SoftMinVirialGradOp : public OpKernel } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& grad_tensor = context->input(context_input_index++); diff --git a/source/op/tabulate_multi_device.cc b/source/op/tabulate_multi_device.cc index 6fafa5698e..3d5765b843 100644 --- a/source/op/tabulate_multi_device.cc +++ b/source/op/tabulate_multi_device.cc @@ -28,6 +28,10 @@ class TabulateFusionOp : public OpKernel { OP_REQUIRES_OK(context, context->GetAttr("last_layer_size", &last_layer_size)); } void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& table_tensor = context->input(context_input_index++); @@ -90,6 +94,10 @@ class TabulateFusionGradOp : public OpKernel { public: explicit TabulateFusionGradOp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& table_tensor = context->input(context_input_index++); diff --git a/source/op/unaggregated_grad.cc b/source/op/unaggregated_grad.cc index 56502efc55..343a339a92 100644 --- a/source/op/unaggregated_grad.cc +++ b/source/op/unaggregated_grad.cc @@ -136,6 +136,10 @@ class UnaggregatedDyDxSOp : public 
OpKernel { explicit UnaggregatedDyDxSOp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& y = context->input(context_input_index++); @@ -169,6 +173,10 @@ class UnaggregatedDy2DxSOp : public OpKernel { explicit UnaggregatedDy2DxSOp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& y = context->input(context_input_index++); @@ -205,6 +213,10 @@ class UnaggregatedDyDxOp : public OpKernel { explicit UnaggregatedDyDxOp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& z = context->input(context_input_index++); @@ -242,6 +254,10 @@ class UnaggregatedDy2DxOp : public OpKernel { explicit UnaggregatedDy2DxOp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; const Tensor& z = context->input(context_input_index++); diff --git a/source/tests/compat_inputs/water_v0.json b/source/tests/compat_inputs/water_v0.json index 88f868ff47..70eedcf72b 100644 --- a/source/tests/compat_inputs/water_v0.json +++ b/source/tests/compat_inputs/water_v0.json @@ -1,5 
+1,4 @@ { - "with_distrib": false, "_comment": " model parameters", "use_smooth": false, "sel_a": [16, 32], diff --git a/source/tests/compat_inputs/water_v1.json b/source/tests/compat_inputs/water_v1.json index e5f2032ea2..e8b1d8a196 100644 --- a/source/tests/compat_inputs/water_v1.json +++ b/source/tests/compat_inputs/water_v1.json @@ -1,5 +1,4 @@ { - "with_distrib": false, "model":{ "descriptor": { "type": "loc_frame", diff --git a/source/tests/compat_inputs/water_v2.json b/source/tests/compat_inputs/water_v2.json index e49add4467..0bb1281f55 100644 --- a/source/tests/compat_inputs/water_v2.json +++ b/source/tests/compat_inputs/water_v2.json @@ -1,5 +1,4 @@ { - "with_distrib": false, "model":{ "descriptor": { "type": "loc_frame", diff --git a/source/tests/data_modifier/dipole.json b/source/tests/data_modifier/dipole.json index 9e968ba98c..5bd8b505f4 100644 --- a/source/tests/data_modifier/dipole.json +++ b/source/tests/data_modifier/dipole.json @@ -1,5 +1,4 @@ { - "with_distrib": false, "_comment": " model parameters", "model":{ "type_map": ["O", "H"], diff --git a/source/tests/polar_se_a.json b/source/tests/polar_se_a.json index 5e831e19d8..7b3362dbe7 100644 --- a/source/tests/polar_se_a.json +++ b/source/tests/polar_se_a.json @@ -1,5 +1,4 @@ { - "with_distrib": false, "_comment": " model parameters", "model":{ "type": "polar", diff --git a/source/tests/test_argument_parser.py b/source/tests/test_argument_parser.py index f9f28fb81b..615199603a 100644 --- a/source/tests/test_argument_parser.py +++ b/source/tests/test_argument_parser.py @@ -282,7 +282,9 @@ def test_parser_compress(self): def test_parser_doc(self): """Test doc subparser.""" - ARGS = {} + ARGS = { + "--out-type": dict(type=str, value="rst"), + } self.run_test(command="doc-train-input", mapping=ARGS) diff --git a/source/tests/test_cluster.py b/source/tests/test_cluster.py new file mode 100644 index 0000000000..01e128b401 --- /dev/null +++ b/source/tests/test_cluster.py @@ -0,0 +1,115 @@ +import 
unittest + +from deepmd.cluster import local, slurm +from unittest import mock + + +kHostName = 'compute-b24-1' + + +class FakePopen(object): + def __init__(self, stdout=b'', stderr=b'', returncode=0): + self._stdout = stdout + self._stderr = stderr + self._returncode = returncode + + def communicate(self): + return self._stdout, self._stderr + + @property + def returncode(self): + return self._returncode + + +class TestGPU(unittest.TestCase): + @mock.patch('subprocess.Popen') + def test_none(self, mock_Popen): + mock_Popen.return_value.__enter__.return_value = FakePopen(b'0', b'') + gpus = local.get_gpus() + self.assertIsNone(gpus) + + @mock.patch('subprocess.Popen') + def test_valid(self, mock_Popen): + mock_Popen.return_value.__enter__.return_value = FakePopen(b'2', b'') + gpus = local.get_gpus() + self.assertEqual(gpus, [0, 1]) + + @mock.patch('subprocess.Popen') + def test_error(self, mock_Popen): + mock_Popen.return_value.__enter__.return_value = \ + FakePopen(stderr=b'!', returncode=1) + with self.assertRaises(RuntimeError) as cm: + _ = local.get_gpus() + self.assertIn('Failed to detect', str(cm.exception)) + + +class TestLocal(unittest.TestCase): + @mock.patch('socket.gethostname') + def test_resource(self, mock_gethostname): + mock_gethostname.return_value = kHostName + nodename, nodelist, _ = local.get_resource() + self.assertEqual(nodename, kHostName) + self.assertEqual(nodelist, [kHostName]) + + +class TestSlurm(unittest.TestCase): + @mock.patch.dict('os.environ', values={ + 'SLURM_JOB_NODELIST': kHostName, + 'SLURMD_NODENAME': kHostName, + 'SLURM_JOB_NUM_NODES': '1' + }) + def test_single(self): + nodename, nodelist, _ = slurm.get_resource() + self.assertEqual(nodename, kHostName) + self.assertEqual(nodelist, [kHostName]) + + @mock.patch.dict('os.environ', values={ + 'SLURM_JOB_NODELIST': 'compute-b24-[1-3,5-9],compute-b25-[4,8]', + 'SLURMD_NODENAME': 'compute-b24-2', + 'SLURM_JOB_NUM_NODES': '10' + }) + def test_multiple(self): + nodename, nodelist, _ 
= slurm.get_resource() + self.assertEqual(nodename, 'compute-b24-2') + self.assertEqual(nodelist, [ + 'compute-b24-1', + 'compute-b24-2', + 'compute-b24-3', + 'compute-b24-5', + 'compute-b24-6', + 'compute-b24-7', + 'compute-b24-8', + 'compute-b24-9', + 'compute-b25-4', + 'compute-b25-8' + ]) + + def test_illegal(self): + environ = { + 'SLURM_JOB_NODELIST': 'compute-b24-[3-5]', + 'SLURMD_NODENAME': 'compute-b24-4' + } + with mock.patch.dict('os.environ', environ): + with self.assertRaises(RuntimeError) as cm: + _ = slurm.get_resource() + self.assertIn('Could not get SLURM number', str(cm.exception)) + + environ = { + 'SLURM_JOB_NODELIST': 'compute-b24-1,compute-b25-2', + 'SLURMD_NODENAME': 'compute-b25-2', + 'SLURM_JOB_NUM_NODES': '4' + } + with mock.patch.dict('os.environ', environ): + with self.assertRaises(ValueError) as cm: + _ = slurm.get_resource() + self.assertIn('Number of slurm nodes 2', str(cm.exception)) + + environ = { + 'SLURM_JOB_NODELIST': 'compute-b24-1,compute-b25-3', + 'SLURMD_NODENAME': 'compute-b25-2', + 'SLURM_JOB_NUM_NODES': '2' + } + with mock.patch.dict('os.environ', environ): + with self.assertRaises(ValueError) as cm: + _ = slurm.get_resource() + self.assertIn('Nodename(compute-b25-2', str(cm.exception)) diff --git a/source/tests/test_data_modifier.py b/source/tests/test_data_modifier.py index 829a589d7e..977df9a2b6 100644 --- a/source/tests/test_data_modifier.py +++ b/source/tests/test_data_modifier.py @@ -44,8 +44,7 @@ def _setUp(self): init_model=None, log_path=None, log_level=30, - mpi_log="master", - try_distrib=False + mpi_log="master" ) jdata = j_loader(INPUT) diff --git a/source/tests/test_data_modifier_shuffle.py b/source/tests/test_data_modifier_shuffle.py index bd4ab58132..c14b6dd105 100644 --- a/source/tests/test_data_modifier_shuffle.py +++ b/source/tests/test_data_modifier_shuffle.py @@ -49,8 +49,7 @@ def _setUp(self): init_model=None, log_path=None, log_level=30, - mpi_log="master", - try_distrib=False + mpi_log="master" ) 
jdata = self._setUp_jdata() self._setUp_data() diff --git a/source/tests/test_deepmd_data.py b/source/tests/test_deepmd_data.py index 3b1352ad93..78d0a36cf2 100644 --- a/source/tests/test_deepmd_data.py +++ b/source/tests/test_deepmd_data.py @@ -147,7 +147,7 @@ def test_init_type_map (self) : self.assertEqual(dd.idx_map[1], 1) self.assertEqual(dd.atom_type[0], 0) self.assertEqual(dd.atom_type[1], 1) - self.assertEqual(dd.type_map, ['bar', 'foo']) + self.assertEqual(dd.type_map, ['bar', 'foo', 'tar']) def test_load_set(self) : dd = DeepmdData(self.data_name)\ diff --git a/source/tests/test_env.py b/source/tests/test_env.py new file mode 100644 index 0000000000..ea886046f4 --- /dev/null +++ b/source/tests/test_env.py @@ -0,0 +1,42 @@ +import unittest + +from deepmd import env +from unittest import mock + + +class TestTFThreadCount(unittest.TestCase): + @mock.patch.dict('os.environ', values={}) + def test_empty(self): + intra, inter = env.get_tf_default_nthreads() + self.assertEqual(intra, 0) + self.assertEqual(inter, 0) + + @mock.patch.dict('os.environ', values={ + 'TF_INTRA_OP_PARALLELISM_THREADS': '5', + 'TF_INTER_OP_PARALLELISM_THREADS': '3' + }) + def test_given(self): + intra, inter = env.get_tf_default_nthreads() + self.assertEqual(intra, 5) + self.assertEqual(inter, 3) + + +class TestTFSessionConfig(unittest.TestCase): + def test_default(self): + shared = env.default_tf_session_config + new = env.get_tf_session_config() + self.assertNotEqual(id(shared), id(new)) + + @mock.patch('deepmd.env.get_tf_default_nthreads') + def test_get(self, mock_method): + mock_method.return_value = (5, 3) + config = env.get_tf_session_config() + self.assertEqual(config.intra_op_parallelism_threads, 5) + self.assertEqual(config.inter_op_parallelism_threads, 3) + + def test_reset(self): + shared = env.default_tf_session_config + env.reset_default_tf_session_config(True) + self.assertEqual(shared.device_count['GPU'], 0) + env.reset_default_tf_session_config(False) + 
self.assertEqual(len(shared.device_count), 0) diff --git a/source/tests/water.json b/source/tests/water.json index b4817fecf0..f4909a0971 100644 --- a/source/tests/water.json +++ b/source/tests/water.json @@ -1,5 +1,4 @@ { - "with_distrib": false, "_comment": " model parameters", "model" :{ "descriptor":{ diff --git a/source/tests/wfc.json b/source/tests/wfc.json index 556ef2a992..ab2ba7fc99 100644 --- a/source/tests/wfc.json +++ b/source/tests/wfc.json @@ -1,5 +1,4 @@ { - "with_distrib": false, "_comment": " model parameters", "model":{ "type": "polar", diff --git a/source/tests/yaml_inputs/water_v1.json b/source/tests/yaml_inputs/water_v1.json index e5f2032ea2..e8b1d8a196 100644 --- a/source/tests/yaml_inputs/water_v1.json +++ b/source/tests/yaml_inputs/water_v1.json @@ -1,5 +1,4 @@ { - "with_distrib": false, "model":{ "descriptor": { "type": "loc_frame", diff --git a/source/tests/yaml_inputs/water_v1.yaml b/source/tests/yaml_inputs/water_v1.yaml index 5121a961b0..9ddbb89f9c 100644 --- a/source/tests/yaml_inputs/water_v1.yaml +++ b/source/tests/yaml_inputs/water_v1.yaml @@ -1,4 +1,3 @@ -with_distrib: false model: descriptor: type: loc_frame