Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
94635ba
also convert input v1 to v2 for dp compress (#844)
njzjz Jul 13, 2021
78123a5
Synchronize format_nlist_b in CUDA with ROCm (#845)
BaoHhhhhhan Jul 15, 2021
1f01650
pin sphinx to a previous version (#848)
njzjz Jul 15, 2021
9b5e2b9
remove cub include for CUDA>=11 (#866)
njzjz Jul 19, 2021
4985932
Add Errcheck after every kernel function runs And merge redundant cod…
galeselee Jul 22, 2021
851fa96
Fix the empty neighbor distance array in neighbor_stat.py (#882)
nicklin96 Jul 26, 2021
c4b9c9e
adapt changes to auditwheel directory in manylinux (#889)
njzjz Jul 28, 2021
953621f
Update getting-started.md (#898)
Jul 29, 2021
4a229fe
build low and high precision at the same time (#879)
njzjz Jul 30, 2021
31f1ef6
Replace PS-Worker mode with multi-worker one. (#892)
shishaochen Jul 30, 2021
70508a5
fix `InvalidArgumentError` caused by zero `sel` and optimize zero mat…
njzjz Jul 31, 2021
043ac86
enhance the cli to generate doc json file (#891)
njzjz Jul 31, 2021
7b9f303
Find available GPUs in an elegant way. (#905)
shishaochen Aug 1, 2021
b5b15fa
fix 'NoneType' has no len() in `auto_sel` (#911)
njzjz Aug 1, 2021
689ffa4
raise warning before training if sel is not enough (#914)
njzjz Aug 2, 2021
ee0ed99
Fix the expanding logic of `SLURM_JOB_NODELIST` and add unit tests fo…
shishaochen Aug 2, 2021
0c66c92
update conda install documents (#925)
njzjz Aug 6, 2021
3ae80b3
Fix member declartion of `deepmd` and `deepmd.entrypoints`. (#922)
shishaochen Aug 6, 2021
4ced020
set input DeepmdData.type_map to input type_map (#924)
amcadmus Aug 6, 2021
4be5eb1
Passing error to TF instead of exit (#918)
njzjz Aug 6, 2021
b30a75e
add lammps compute style for atomic deep tensor (#927)
amcadmus Aug 8, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .github/workflows/build_wheel.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@ jobs:

- name: Build wheels
env:
CIBW_BUILD: "cp36-* cp37-* cp38-*"
CIBW_BEFORE_BUILD: pip install tensorflow && sed -i 's/libresolv.so.2"/libresolv.so.2", "libtensorflow_framework.so.2"/g' /opt/_internal/tools/lib/python*/site-packages/auditwheel/policy/policy.json
CIBW_BUILD: "cp36-* cp37-* cp38-* cp39-*"
CIBW_MANYLINUX_X86_64_IMAGE: ghcr.io/deepmodeling/manylinux2010_x86_64_tensorflow
CIBW_BEFORE_BUILD: pip install tensorflow
CIBW_SKIP: "*-win32 *-manylinux_i686"
run: |
python -m cibuildwheel --output-dir wheelhouse
Expand Down
2 changes: 1 addition & 1 deletion deepmd/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from . import cluster, descriptor, fit, loss, utils
from .env import set_mkl
from .infer import DeepPotential
from .infer import DeepEval, DeepPotential
from .infer.data_modifier import DipoleChargeModifier

set_mkl()
Expand Down
39 changes: 31 additions & 8 deletions deepmd/cluster/local.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,38 @@
"""Get local GPU resources from `CUDA_VISIBLE_DEVICES` environment variable."""
"""Get local GPU resources."""

import os
import socket
import subprocess as sp
import sys

from deepmd.env import tf
from typing import List, Tuple, Optional

__all__ = ["get_resource"]

__all__ = ["get_gpus", "get_resource"]


def get_gpus():
    """Get available IDs of GPU cards at local.

    These IDs are valid when used as the TensorFlow device ID.

    Returns
    -------
    Optional[List[int]]
        List of available GPU IDs, or None when no GPU is found.

    Raises
    ------
    RuntimeError
        if the GPU-probing subprocess exits with a non-zero status
    """
    # Probe devices in a subprocess so that importing TensorFlow (which may
    # grab GPU memory on import) does not affect the current process.
    test_cmd = 'from tensorflow.python.client import device_lib; ' \
               'devices = device_lib.list_local_devices(); ' \
               'gpus = [d.name for d in devices if d.device_type == "GPU"]; ' \
               'print(len(gpus))'
    with sp.Popen([sys.executable, "-c", test_cmd], stderr=sp.PIPE, stdout=sp.PIPE) as p:
        stdout, stderr = p.communicate()
        if p.returncode != 0:
            decoded = stderr.decode('UTF-8')
            # fixed typo in user-facing message: "availbe" -> "available"
            raise RuntimeError('Failed to detect available GPUs due to:\n%s' % decoded)
        decoded = stdout.decode('UTF-8').strip()
        num_gpus = int(decoded)
        return list(range(num_gpus)) if num_gpus > 0 else None


def get_resource() -> Tuple[str, List[str], Optional[List[int]]]:
Expand All @@ -17,10 +45,5 @@ def get_resource() -> Tuple[str, List[str], Optional[List[int]]]:
"""
nodename = socket.gethostname()
nodelist = [nodename]
gpus_env = os.getenv("CUDA_VISIBLE_DEVICES", None)
if not gpus_env:
gpus = None
else:
gpus = [gpu for gpu in gpus_env.split(",")]

gpus = get_gpus()
return nodename, nodelist, gpus
45 changes: 6 additions & 39 deletions deepmd/cluster/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@
https://github.com/deepsense-ai/tensorflow_on_slurm ####
"""

import re
import hostlist
import os
from typing import List, Tuple, Optional, Iterable

from deepmd.cluster import local
from typing import List, Tuple, Optional

__all__ = ["get_resource"]

Expand All @@ -29,7 +31,7 @@ def get_resource() -> Tuple[str, List[str], Optional[List[int]]]:
ValueError
if current nodename is not found in node list
"""
nodelist = _expand_nodelist(os.environ["SLURM_JOB_NODELIST"])
nodelist = hostlist.expand_hostlist(os.environ["SLURM_JOB_NODELIST"])
nodename = os.environ["SLURMD_NODENAME"]
num_nodes_env = os.getenv("SLURM_JOB_NUM_NODES")
if num_nodes_env:
Expand All @@ -45,40 +47,5 @@ def get_resource() -> Tuple[str, List[str], Optional[List[int]]]:
raise ValueError(
f"Nodename({nodename}) not in nodelist({nodelist}). This should not happen!"
)
gpus_env = os.getenv("CUDA_VISIBLE_DEVICES")
if not gpus_env:
gpus = None
else:
gpus = [int(gpu) for gpu in gpus_env.split(",")]
gpus = local.get_gpus()
return nodename, nodelist, gpus


def _pad_zeros(iterable: Iterable, length: int):
return (str(t).rjust(length, "0") for t in iterable)


def _expand_ids(ids: str) -> List[str]:
    """Expand a comma-separated id spec, honoring zero-padded ranges like '03-05'."""
    expanded = []
    for token in ids.split(","):
        if "-" not in token:
            expanded.append(token)
            continue
        # pad width follows the textual width of the range's upper bound
        low, high = token.split("-")
        expanded.extend(_pad_zeros(range(int(low), int(high) + 1), len(high)))
    return expanded


def _expand_nodelist(nodelist: str) -> List[str]:
    """Expand a SLURM-style nodelist string (e.g. 'node[01-03],head') into hostnames."""
    hosts = []
    for chunk in nodelist.split(","):
        bracket = re.search(r"(.*)\[(.*)\]", chunk)
        if bracket is None:
            # plain hostname, no range to expand
            hosts.append(chunk)
        else:
            prefix = bracket.group(1)
            suffixes = _expand_ids(bracket.group(2))
            hosts.extend(f"{prefix}{suffix}" for suffix in suffixes)
    return hosts
29 changes: 24 additions & 5 deletions deepmd/descriptor/se_a.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,6 +586,7 @@ def _filter_lower(
[ 0, start_index* 4],
[-1, incrs_index* 4] )
shape_i = inputs_i.get_shape().as_list()
natom = tf.shape(inputs_i)[0]
# with (natom x nei_type_i) x 4
inputs_reshape = tf.reshape(inputs_i, [-1, 4])
# with (natom x nei_type_i) x 1
Expand All @@ -603,7 +604,7 @@ def _filter_lower(
net = 'filter_-1_net_' + str(type_i)
else:
net = 'filter_' + str(type_input) + '_net_' + str(type_i)
return op_module.tabulate_fusion(self.table.data[net].astype(self.filter_np_precision), info, xyz_scatter, tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), last_layer_size = outputs_size[-1])
return op_module.tabulate_fusion(self.table.data[net].astype(self.filter_np_precision), info, xyz_scatter, tf.reshape(inputs_i, [natom, shape_i[1]//4, 4]), last_layer_size = outputs_size[-1])
else:
if (not is_exclude):
xyz_scatter = embedding_net(
Expand All @@ -620,11 +621,16 @@ def _filter_lower(
uniform_seed = self.uniform_seed)
if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift
else:
w = tf.zeros((outputs_size[0], outputs_size[-1]), dtype=GLOBAL_TF_FLOAT_PRECISION)
xyz_scatter = tf.matmul(xyz_scatter, w)
# we can safely return the final xyz_scatter filled with zero directly
return tf.cast(tf.fill((natom, 4, outputs_size[-1]), 0.), GLOBAL_TF_FLOAT_PRECISION)
# natom x nei_type_i x out_size
xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1]//4, outputs_size[-1]))
return tf.matmul(tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]), xyz_scatter, transpose_a = True)
# When using tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]) below
# [588 24] -> [588 6 4] correct
# but if sel is zero
# [588 0] -> [147 0 4] incorrect; the correct one is [588 0 4]
# So we need to explicitly assign the shape to tf.shape(inputs_i)[0] instead of -1
return tf.matmul(tf.reshape(inputs_i, [natom, shape_i[1]//4, 4]), xyz_scatter, transpose_a = True)


def _filter(
Expand All @@ -644,6 +650,18 @@ def _filter(
shape = inputs.get_shape().as_list()
outputs_size = [1] + self.filter_neuron
outputs_size_2 = self.n_axis_neuron
all_excluded = all([(type_input, type_i) in self.exclude_types for type_i in range(self.ntypes)])
if all_excluded:
# all types are excluded so result and qmat should be zeros
# we can safely return a zero matrix...
# See also https://stackoverflow.com/a/34725458/9567349
# result: natom x outputs_size x outputs_size_2
# qmat: natom x outputs_size x 3
natom = tf.shape(inputs)[0]
result = tf.cast(tf.fill((natom, outputs_size_2, outputs_size[-1]), 0.), GLOBAL_TF_FLOAT_PRECISION)
qmat = tf.cast(tf.fill((natom, outputs_size[-1], 3), 0.), GLOBAL_TF_FLOAT_PRECISION)
return result, qmat

with tf.variable_scope(name, reuse=reuse):
start_index = 0
type_i = 0
Expand All @@ -665,7 +683,8 @@ def _filter(
suffix = "_"+str(type_i))
if type_i == 0:
xyz_scatter_1 = ret
else:
elif (type_input, type_i) not in self.exclude_types:
# add zero is meaningless; skip
xyz_scatter_1+= ret
start_index += self.sel_a[type_i]
else :
Expand Down
8 changes: 4 additions & 4 deletions deepmd/descriptor/se_r.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,11 +478,11 @@ def _filter_r(self,
trainable = trainable,
uniform_seed = self.uniform_seed)
if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift
# natom x nei_type_i x out_size
xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1], outputs_size[-1]))
else:
w = tf.zeros((outputs_size[0], outputs_size[-1]), dtype=GLOBAL_TF_FLOAT_PRECISION)
xyz_scatter = tf.matmul(xyz_scatter, w)
# natom x nei_type_i x out_size
xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1], outputs_size[-1]))
natom = tf.shape(inputs)[0]
xyz_scatter = tf.cast(tf.fill((natom, shape_i[1], outputs_size[-1]), 0.), GLOBAL_TF_FLOAT_PRECISION)
xyz_scatter_total.append(xyz_scatter)

# natom x nei x outputs_size
Expand Down
2 changes: 1 addition & 1 deletion deepmd/entrypoints/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"doc_train_input",
"freeze",
"test",
"train",
"train_dp",
"transfer",
"compress",
"doc_train_input",
Expand Down
4 changes: 2 additions & 2 deletions deepmd/entrypoints/compress.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from deepmd.common import j_loader
from deepmd.utils.argcheck import normalize
from deepmd.utils.compat import convert_input_v0_v1
from deepmd.utils.compat import updata_deepmd_input
from deepmd.utils.errors import GraphTooLargeError

from .freeze import freeze
Expand Down Expand Up @@ -65,7 +65,7 @@ def compress(
"""
jdata = j_loader(INPUT)
if "model" not in jdata.keys():
jdata = convert_input_v0_v1(jdata, warning=True, dump="input_v1_compat.json")
jdata = updata_deepmd_input(jdata, warning=True, dump="input_v2_compat.json")
jdata["model"]["compress"] = {}
jdata["model"]["compress"]["type"] = 'se_e2_a'
jdata["model"]["compress"]["compress"] = True
Expand Down
11 changes: 8 additions & 3 deletions deepmd/entrypoints/doc.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
"""Module that prints train input arguments docstrings."""

from deepmd.utils.argcheck import gen_doc
from deepmd.utils.argcheck import gen_doc, gen_json

__all__ = ["doc_train_input"]


def doc_train_input(*, out_type: str = "rst", **kwargs):
    """Print out training input arguments to console.

    Parameters
    ----------
    out_type : str
        output format of the generated documentation, either "rst" or "json"

    Raises
    ------
    RuntimeError
        if an unsupported output type is requested
    """
    if out_type == "rst":
        doc_str = gen_doc(make_anchor=True)
    elif out_type == "json":
        doc_str = gen_json()
    else:
        raise RuntimeError("Unsupported out type %s" % out_type)
    print(doc_str)
10 changes: 8 additions & 2 deletions deepmd/entrypoints/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,12 +313,18 @@ def parse_args(args: Optional[List[str]] = None):
)

# * print docs script **************************************************************
subparsers.add_parser(
parsers_doc = subparsers.add_parser(
"doc-train-input",
parents=[parser_log],
help="print the documentation (in rst format) of input training parameters.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parsers_doc.add_argument(
"--out-type",
default="rst",
type=str,
help="The output type"
)

# * make model deviation ***********************************************************
parser_model_devi = subparsers.add_parser(
Expand Down Expand Up @@ -428,7 +434,7 @@ def main():
elif args.command == "compress":
compress(**dict_args)
elif args.command == "doc-train-input":
doc_train_input()
doc_train_input(**dict_args)
elif args.command == "model-devi":
make_model_devi(**dict_args)
elif args.command == "convert-from":
Expand Down
Loading