Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
94635ba
also convert input v1 to v2 for dp compress (#844)
njzjz Jul 13, 2021
78123a5
Synchronize format_nlist_b in CUDA with ROCm (#845)
BaoHhhhhhan Jul 15, 2021
1f01650
pin sphinx to a previous version (#848)
njzjz Jul 15, 2021
9b5e2b9
remove cub include for CUDA>=11 (#866)
njzjz Jul 19, 2021
4985932
Add Errcheck after every kernel function runs And merge redundant cod…
galeselee Jul 22, 2021
851fa96
Fix the empty neighbor distance array in neighbor_stat.py (#882)
nicklin96 Jul 26, 2021
c4b9c9e
adapt changes to auditwheel directory in manylinux (#889)
njzjz Jul 28, 2021
953621f
Update getting-started.md (#898)
Jul 29, 2021
4a229fe
build low and high precision at the same time (#879)
njzjz Jul 30, 2021
31f1ef6
Replace PS-Worker mode with multi-worker one. (#892)
shishaochen Jul 30, 2021
70508a5
fix `InvalidArgumentError` caused by zero `sel` and optimize zero mat…
njzjz Jul 31, 2021
043ac86
enhance the cli to generate doc json file (#891)
njzjz Jul 31, 2021
7b9f303
Find available GPUs in an elegant way. (#905)
shishaochen Aug 1, 2021
b5b15fa
fix 'NoneType' has no len() in `auto_sel` (#911)
njzjz Aug 1, 2021
689ffa4
raise warning before training if sel is not enough (#914)
njzjz Aug 2, 2021
ee0ed99
Fix the expanding logic of `SLURM_JOB_NODELIST` and add unit tests fo…
shishaochen Aug 2, 2021
0c66c92
update conda install documents (#925)
njzjz Aug 6, 2021
3ae80b3
Fix member declartion of `deepmd` and `deepmd.entrypoints`. (#922)
shishaochen Aug 6, 2021
4ced020
set input DeepmdData.type_map to input type_map (#924)
amcadmus Aug 6, 2021
4be5eb1
Passing error to TF instead of exit (#918)
njzjz Aug 6, 2021
b30a75e
add lammps compute style for atomic deep tensor (#927)
amcadmus Aug 8, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .github/workflows/build_wheel.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@ jobs:

- name: Build wheels
env:
CIBW_BUILD: "cp36-* cp37-* cp38-*"
CIBW_BEFORE_BUILD: pip install tensorflow && sed -i 's/libresolv.so.2"/libresolv.so.2", "libtensorflow_framework.so.2"/g' /opt/_internal/tools/lib/python*/site-packages/auditwheel/policy/policy.json
CIBW_BUILD: "cp36-* cp37-* cp38-* cp39-*"
CIBW_MANYLINUX_X86_64_IMAGE: ghcr.io/deepmodeling/manylinux2010_x86_64_tensorflow
CIBW_BEFORE_BUILD: pip install tensorflow
CIBW_SKIP: "*-win32 *-manylinux_i686"
run: |
python -m cibuildwheel --output-dir wheelhouse
Expand Down
2 changes: 1 addition & 1 deletion deepmd/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from . import cluster, descriptor, fit, loss, utils
from .env import set_mkl
from .infer import DeepPotential
from .infer import DeepEval, DeepPotential
from .infer.data_modifier import DipoleChargeModifier

set_mkl()
Expand Down
39 changes: 31 additions & 8 deletions deepmd/cluster/local.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,38 @@
"""Get local GPU resources from `CUDA_VISIBLE_DEVICES` environment variable."""
"""Get local GPU resources."""

import os
import socket
import subprocess as sp
import sys

from deepmd.env import tf
from typing import List, Tuple, Optional

__all__ = ["get_resource"]

__all__ = ["get_gpus", "get_resource"]


def get_gpus():
    """Get available IDs of GPU cards at local.

    These IDs are valid when used as the TensorFlow device ID.

    Returns
    -------
    Optional[List[int]]
        List of available GPU IDs, or None when no GPU is found.

    Raises
    ------
    RuntimeError
        if the GPU-probing subprocess exits with a non-zero status
    """
    # Probe devices in a subprocess so that importing TensorFlow (which may
    # grab GPU memory on import) does not affect the current process.
    test_cmd = 'from tensorflow.python.client import device_lib; ' \
               'devices = device_lib.list_local_devices(); ' \
               'gpus = [d.name for d in devices if d.device_type == "GPU"]; ' \
               'print(len(gpus))'
    with sp.Popen([sys.executable, "-c", test_cmd], stderr=sp.PIPE, stdout=sp.PIPE) as p:
        stdout, stderr = p.communicate()
        if p.returncode != 0:
            decoded = stderr.decode('UTF-8')
            # fixed typo in user-facing message: "availbe" -> "available"
            raise RuntimeError('Failed to detect available GPUs due to:\n%s' % decoded)
        decoded = stdout.decode('UTF-8').strip()
        num_gpus = int(decoded)
        return list(range(num_gpus)) if num_gpus > 0 else None


def get_resource() -> Tuple[str, List[str], Optional[List[int]]]:
Expand All @@ -17,10 +45,5 @@ def get_resource() -> Tuple[str, List[str], Optional[List[int]]]:
"""
nodename = socket.gethostname()
nodelist = [nodename]
gpus_env = os.getenv("CUDA_VISIBLE_DEVICES", None)
if not gpus_env:
gpus = None
else:
gpus = [gpu for gpu in gpus_env.split(",")]

gpus = get_gpus()
return nodename, nodelist, gpus
45 changes: 6 additions & 39 deletions deepmd/cluster/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@
https://github.com/deepsense-ai/tensorflow_on_slurm ####
"""

import re
import hostlist
import os
from typing import List, Tuple, Optional, Iterable

from deepmd.cluster import local
from typing import List, Tuple, Optional

__all__ = ["get_resource"]

Expand All @@ -29,7 +31,7 @@ def get_resource() -> Tuple[str, List[str], Optional[List[int]]]:
ValueError
if current nodename is not found in node list
"""
nodelist = _expand_nodelist(os.environ["SLURM_JOB_NODELIST"])
nodelist = hostlist.expand_hostlist(os.environ["SLURM_JOB_NODELIST"])
nodename = os.environ["SLURMD_NODENAME"]
num_nodes_env = os.getenv("SLURM_JOB_NUM_NODES")
if num_nodes_env:
Expand All @@ -45,40 +47,5 @@ def get_resource() -> Tuple[str, List[str], Optional[List[int]]]:
raise ValueError(
f"Nodename({nodename}) not in nodelist({nodelist}). This should not happen!"
)
gpus_env = os.getenv("CUDA_VISIBLE_DEVICES")
if not gpus_env:
gpus = None
else:
gpus = [int(gpu) for gpu in gpus_env.split(",")]
gpus = local.get_gpus()
return nodename, nodelist, gpus


def _pad_zeros(iterable: Iterable, length: int):
return (str(t).rjust(length, "0") for t in iterable)


def _expand_ids(ids: str) -> List[str]:
    """Expand a comma-separated id spec, honoring zero-padded ranges like '03-05'."""
    expanded = []
    for token in ids.split(","):
        if "-" not in token:
            expanded.append(token)
            continue
        # pad width follows the textual width of the range's upper bound
        low, high = token.split("-")
        expanded.extend(_pad_zeros(range(int(low), int(high) + 1), len(high)))
    return expanded


def _expand_nodelist(nodelist: str) -> List[str]:
    """Expand a SLURM-style nodelist string (e.g. 'node[01-03],head') into hostnames."""
    hosts = []
    for chunk in nodelist.split(","):
        bracket = re.search(r"(.*)\[(.*)\]", chunk)
        if bracket is None:
            # plain hostname, no range to expand
            hosts.append(chunk)
        else:
            prefix = bracket.group(1)
            suffixes = _expand_ids(bracket.group(2))
            hosts.extend(f"{prefix}{suffix}" for suffix in suffixes)
    return hosts
29 changes: 24 additions & 5 deletions deepmd/descriptor/se_a.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,6 +586,7 @@ def _filter_lower(
[ 0, start_index* 4],
[-1, incrs_index* 4] )
shape_i = inputs_i.get_shape().as_list()
natom = tf.shape(inputs_i)[0]
# with (natom x nei_type_i) x 4
inputs_reshape = tf.reshape(inputs_i, [-1, 4])
# with (natom x nei_type_i) x 1
Expand All @@ -603,7 +604,7 @@ def _filter_lower(
net = 'filter_-1_net_' + str(type_i)
else:
net = 'filter_' + str(type_input) + '_net_' + str(type_i)
return op_module.tabulate_fusion(self.table.data[net].astype(self.filter_np_precision), info, xyz_scatter, tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), last_layer_size = outputs_size[-1])
return op_module.tabulate_fusion(self.table.data[net].astype(self.filter_np_precision), info, xyz_scatter, tf.reshape(inputs_i, [natom, shape_i[1]//4, 4]), last_layer_size = outputs_size[-1])
else:
if (not is_exclude):
xyz_scatter = embedding_net(
Expand All @@ -620,11 +621,16 @@ def _filter_lower(
uniform_seed = self.uniform_seed)
if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift
else:
w = tf.zeros((outputs_size[0], outputs_size[-1]), dtype=GLOBAL_TF_FLOAT_PRECISION)
xyz_scatter = tf.matmul(xyz_scatter, w)
# we can safely return the final xyz_scatter filled with zero directly
return tf.cast(tf.fill((natom, 4, outputs_size[-1]), 0.), GLOBAL_TF_FLOAT_PRECISION)
# natom x nei_type_i x out_size
xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1]//4, outputs_size[-1]))
return tf.matmul(tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), xyz_scatter, transpose_a = True)
# When using tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]) below
# [588 24] -> [588 6 4] correct
# but if sel is zero
# [588 0] -> [147 0 4] incorrect; the correct one is [588 0 4]
# So we need to explicitly assign the shape to tf.shape(inputs_i)[0] instead of -1
return tf.matmul(tf.reshape(inputs_i, [natom, shape_i[1]//4, 4]), xyz_scatter, transpose_a = True)


def _filter(
Expand All @@ -644,6 +650,18 @@ def _filter(
shape = inputs.get_shape().as_list()
outputs_size = [1] + self.filter_neuron
outputs_size_2 = self.n_axis_neuron
all_excluded = all([(type_input, type_i) in self.exclude_types for type_i in range(self.ntypes)])
if all_excluded:
# all types are excluded so result and qmat should be zeros
# we can safely return a zero matrix...
# See also https://stackoverflow.com/a/34725458/9567349
# result: natom x outputs_size x outputs_size_2
# qmat: natom x outputs_size x 3
natom = tf.shape(inputs)[0]
result = tf.cast(tf.fill((natom, outputs_size_2, outputs_size[-1]), 0.), GLOBAL_TF_FLOAT_PRECISION)
qmat = tf.cast(tf.fill((natom, outputs_size[-1], 3), 0.), GLOBAL_TF_FLOAT_PRECISION)
return result, qmat

with tf.variable_scope(name, reuse=reuse):
start_index = 0
type_i = 0
Expand All @@ -665,7 +683,8 @@ def _filter(
suffix = "_"+str(type_i))
if type_i == 0:
xyz_scatter_1 = ret
else:
elif (type_input, type_i) not in self.exclude_types:
# add zero is meaningless; skip
xyz_scatter_1+= ret
start_index += self.sel_a[type_i]
else :
Expand Down
8 changes: 4 additions & 4 deletions deepmd/descriptor/se_r.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,11 +478,11 @@ def _filter_r(self,
trainable = trainable,
uniform_seed = self.uniform_seed)
if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift
# natom x nei_type_i x out_size
xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1], outputs_size[-1]))
else:
w = tf.zeros((outputs_size[0], outputs_size[-1]), dtype=GLOBAL_TF_FLOAT_PRECISION)
xyz_scatter = tf.matmul(xyz_scatter, w)
# natom x nei_type_i x out_size
xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1], outputs_size[-1]))
natom = tf.shape(inputs)[0]
xyz_scatter = tf.cast(tf.fill((natom, shape_i[1], outputs_size[-1]), 0.), GLOBAL_TF_FLOAT_PRECISION)
xyz_scatter_total.append(xyz_scatter)

# natom x nei x outputs_size
Expand Down
2 changes: 1 addition & 1 deletion deepmd/entrypoints/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"doc_train_input",
"freeze",
"test",
"train",
"train_dp",
"transfer",
"compress",
"doc_train_input",
Expand Down
4 changes: 2 additions & 2 deletions deepmd/entrypoints/compress.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from deepmd.common import j_loader
from deepmd.utils.argcheck import normalize
from deepmd.utils.compat import convert_input_v0_v1
from deepmd.utils.compat import updata_deepmd_input
from deepmd.utils.errors import GraphTooLargeError

from .freeze import freeze
Expand Down Expand Up @@ -65,7 +65,7 @@ def compress(
"""
jdata = j_loader(INPUT)
if "model" not in jdata.keys():
jdata = convert_input_v0_v1(jdata, warning=True, dump="input_v1_compat.json")
jdata = updata_deepmd_input(jdata, warning=True, dump="input_v2_compat.json")
jdata["model"]["compress"] = {}
jdata["model"]["compress"]["type"] = 'se_e2_a'
jdata["model"]["compress"]["compress"] = True
Expand Down
11 changes: 8 additions & 3 deletions deepmd/entrypoints/doc.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
"""Module that prints train input arguments docstrings."""

from deepmd.utils.argcheck import gen_doc
from deepmd.utils.argcheck import gen_doc, gen_json

__all__ = ["doc_train_input"]


def doc_train_input(*, out_type: str = "rst", **kwargs):
    """Print out training input arguments to console.

    Parameters
    ----------
    out_type : str
        output format of the generated documentation, either "rst" or "json"

    Raises
    ------
    RuntimeError
        if an unsupported output type is requested
    """
    if out_type == "rst":
        doc_str = gen_doc(make_anchor=True)
    elif out_type == "json":
        doc_str = gen_json()
    else:
        raise RuntimeError("Unsupported out type %s" % out_type)
    print(doc_str)
10 changes: 8 additions & 2 deletions deepmd/entrypoints/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,12 +313,18 @@ def parse_args(args: Optional[List[str]] = None):
)

# * print docs script **************************************************************
subparsers.add_parser(
parsers_doc = subparsers.add_parser(
"doc-train-input",
parents=[parser_log],
help="print the documentation (in rst format) of input training parameters.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parsers_doc.add_argument(
"--out-type",
default="rst",
type=str,
help="The output type"
)

# * make model deviation ***********************************************************
parser_model_devi = subparsers.add_parser(
Expand Down Expand Up @@ -428,7 +434,7 @@ def main():
elif args.command == "compress":
compress(**dict_args)
elif args.command == "doc-train-input":
doc_train_input()
doc_train_input(**dict_args)
elif args.command == "model-devi":
make_model_devi(**dict_args)
elif args.command == "convert-from":
Expand Down
Loading