From 492fdca3587314ea2ab82aa6573362bf5c1a8fd2 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Mon, 14 Feb 2022 03:12:25 -0500 Subject: [PATCH 1/5] calculate neighbor statistics from CLI In many cases, we need to confirm neighbor statistics before training. This command provides such CLI option. --- deepmd/entrypoints/__init__.py | 2 ++ deepmd/entrypoints/main.py | 33 +++++++++++++++++++ deepmd/entrypoints/neighbor_stat.py | 50 +++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+) create mode 100644 deepmd/entrypoints/neighbor_stat.py diff --git a/deepmd/entrypoints/__init__.py b/deepmd/entrypoints/__init__.py index 80d0431f8c..d7559ad74b 100644 --- a/deepmd/entrypoints/__init__.py +++ b/deepmd/entrypoints/__init__.py @@ -11,6 +11,7 @@ from .transfer import transfer from ..infer.model_devi import make_model_devi from .convert import convert +from .neighbor_stat import neighbor_stat __all__ = [ "config", @@ -23,4 +24,5 @@ "doc_train_input", "make_model_devi", "convert", + "neighbor_stat", ] diff --git a/deepmd/entrypoints/main.py b/deepmd/entrypoints/main.py index 0f7372e0b9..3a9d3991ce 100644 --- a/deepmd/entrypoints/main.py +++ b/deepmd/entrypoints/main.py @@ -16,6 +16,7 @@ transfer, make_model_devi, convert, + neighbor_stat, ) from deepmd.loggers import set_log_handles @@ -408,6 +409,36 @@ def parse_args(args: Optional[List[str]] = None): type=str, help='the output model', ) + + # neighbor_stat + parser_neighbor_stat = subparsers.add_parser( + 'neighbor-stat', + parents=[parser_log], + help='Calculate neighbor statistics', + ) + parser_neighbor_stat.add_argument( + "-s", + "--system", + default=".", + type=str, + help="The system dir. Recursively detect systems in this directory", + ) + parser_neighbor_stat.add_argument( + "-r", + "--rcut", + type=float, + required=True, + help="cutoff radius", + ) + parser_neighbor_stat.add_argument( + "-t", + "--type-map", + type=str, + nargs='+', + required=True, + help="type map", + ) + # --version parser.add_argument('--version', action='version', version='DeePMD-kit v%s' % __version__) @@ -456,6 +487,8 @@ def main(): make_model_devi(**dict_args) elif args.command == "convert-from": convert(**dict_args) + elif args.command == "neighbor-stat": + neighbor_stat(**dict_args) elif args.command is None: pass else: diff --git a/deepmd/entrypoints/neighbor_stat.py b/deepmd/entrypoints/neighbor_stat.py new file mode 100644 index 0000000000..5471817973 --- /dev/null +++ b/deepmd/entrypoints/neighbor_stat.py @@ -0,0 +1,50 @@ +import logging +from typing import List + +from deepmd.common import data_requirement, expand_sys_str +from deepmd.utils.data_system import DeepmdDataSystem +from deepmd.utils.neighbor_stat import NeighborStat + +log = logging.getLogger(__name__) + +def neighbor_stat( + *, + system: str, + rcut: float, + type_map: List[str], + **kwargs, +): + """Calculate neighbor statistics. + + Parameters + ---------- + system : str + system to stat + rcut : float + cutoff radius + type_map : list[str] + type map + + Examples + -------- + >>> neighbor_stat(system='.', rcut=6., type_map=["C", "H", "O", "N", "P", "S", "Mg", "Na", "HW", "OW", "mNa", "mCl", "mC", "mH", "mMg", "mN", "mO", "mP"]) + min_nbor_dist: 0.6599510670195264 + max_nbor_size: [23, 26, 19, 16, 2, 2, 1, 1, 72, 37, 5, 0, 31, 29, 1, 21, 20, 5] + """ + all_sys = expand_sys_str(system) + if not len(all_sys): + raise RuntimeError("Did not find valid system") + data = DeepmdDataSystem( + systems=all_sys, + batch_size=1, + test_size=1, + rcut=rcut, + type_map=type_map, + ) + data.add_dict(data_requirement) + data.get_batch() + nei = NeighborStat(data.get_ntypes(), rcut) + min_nbor_dist, max_nbor_size = nei.get_stat(data) + log.info("min_nbor_dist: %f" % min_nbor_dist) + log.info("max_nbor_size: %s" % str(max_nbor_size)) + From a0c1013407384c75012e1dc8d739417fb3de252b Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 15 Feb 2022 15:44:58 -0500 Subject: [PATCH 2/5] add UT and docs --- README.md | 1 + deepmd/entrypoints/neighbor_stat.py | 2 +- doc/model/index.md | 1 + doc/model/index.rst | 1 + doc/model/sel.md | 13 ++++++++ source/tests/test_neighbor_stat.py | 47 +++++++++++++++++++++++++++++ 6 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 doc/model/sel.md create mode 100644 source/tests/test_neighbor_stat.py diff --git a/README.md b/README.md index 0b84c705dd..995ac6e914 100644 --- a/README.md +++ b/README.md @@ -94,6 +94,7 @@ A full [document](doc/train/train-input-auto.rst) on options in the training inp - [Descriptor `"se_e2_r"`](doc/model/train-se-e2-r.md) - [Descriptor `"se_e3"`](doc/model/train-se-e3.md) - [Descriptor `"hybrid"`](doc/model/train-hybrid.md) + - [Descriptor `sel`](doc/model/sel.md) - [Fit energy](doc/model/train-energy.md) - [Fit `tensor` like `Dipole` and `Polarizability`](doc/model/train-fitting-tensor.md) - [Train a Deep Potential model using `type embedding` approach](doc/model/train-se-e2-a-tebd.md) diff --git a/deepmd/entrypoints/neighbor_stat.py b/deepmd/entrypoints/neighbor_stat.py index 5471817973..b366c95d71 100644 --- a/deepmd/entrypoints/neighbor_stat.py +++ b/deepmd/entrypoints/neighbor_stat.py @@ -47,4 +47,4 @@ def neighbor_stat( min_nbor_dist, max_nbor_size = nei.get_stat(data) log.info("min_nbor_dist: %f" % min_nbor_dist) log.info("max_nbor_size: %s" % str(max_nbor_size)) - + return min_nbor_dist, max_nbor_size diff --git a/doc/model/index.md b/doc/model/index.md index cce3ec57ec..94575f67c6 100644 --- a/doc/model/index.md +++ b/doc/model/index.md @@ -5,6 +5,7 @@ - [Descriptor `"se_e2_r"`](train-se-e2-r.md) - [Descriptor `"se_e3"`](train-se-e3.md) - [Descriptor `"hybrid"`](train-hybrid.md) +- [Descriptor `sel`](sel.md) - [Fit energy](train-energy.md) - [Fit `tensor` like `Dipole` and `Polarizability`](train-fitting-tensor.md) - [Train a Deep Potential model using `type embedding` approach](train-se-e2-a-tebd.md) diff --git a/doc/model/index.rst b/doc/model/index.rst index 1cc0f7e6ea..10f41b375c 100644 --- a/doc/model/index.rst +++ b/doc/model/index.rst @@ -9,6 +9,7 @@ Model train-se-e2-r train-se-e3 train-hybrid + sel train-energy train-fitting-tensor train-se-e2-a-tebd diff --git a/doc/model/sel.md b/doc/model/sel.md new file mode 100644 index 0000000000..7713789f7b --- /dev/null +++ b/doc/model/sel.md @@ -0,0 +1,13 @@ +# Determine `sel` + +All descriptors require to set `sel`, which means the expected maximum number of type-i neighbors of an atom. DeePMD-kit will allocate memory according to `sel`. + +`sel` should not be too large or too small. If `sel` is too large, the computing will become much slower and cost more memory. If `sel` is not enough, the energy will be not conserved, making the accuracy of the model worse. + +To determine a proper `sel`, one can calculate the neighbor stat of the training data before training: +```sh +dp neighbor-stat -s data -r 6.0 -t O H +``` +where `data` is the directory of data, `6.0` is the cutoff radius, and `O` and `H` is the type map.The program will give the `max_nbor_size`. For example, `max_nbor_size` of the water example is `[38, 72]`, meaning an atom may have 38 O neighbors and 72 H neighbors in the training data. + +The `sel` should be set to a higher value than that of the training data, considering there may be some extreme geometries during MD simulations. As a result, we set to `[46, 92]` in the water example. diff --git a/source/tests/test_neighbor_stat.py b/source/tests/test_neighbor_stat.py new file mode 100644 index 0000000000..2bc97cb4d5 --- /dev/null +++ b/source/tests/test_neighbor_stat.py @@ -0,0 +1,47 @@ +import shutil +import numpy as np +import unittest +import dpdata + +from deepmd.entrypoints.neighbor_stat import neighbor_stat + +def gen_sys(nframes): + natoms = 1000 + data = {} + X, Y, Z = np.mgrid[0:9:10j, 0:9:10j, 0:9:10j] + positions = np.vstack([X.ravel(), Y.ravel(), Z.ravel()]).T #+ 0.1 + data['coords'] = np.repeat(positions[np.newaxis, :, :], nframes, axis=0) + data['forces'] = np.random.random([nframes, natoms, 3]) + data['cells'] = np.array([10., 0., 0., 0., 10., 0., 0., 0., 10.]).reshape(1,3,3) + data['energies'] = np.random.random([nframes, 1]) + data['atom_names'] = ['TYPE'] + data['atom_numbs'] = [1000] + data['atom_types'] = np.repeat(0, 1000) + return data + + +class TestEnerShift(unittest.TestCase): + def setUp(self): + data0 = gen_sys(1) + sys0 = dpdata.LabeledSystem() + sys0.data = data0 + sys0.to_deepmd_npy('system_0', set_size = 10) + + def tearDown(self): + shutil.rmtree('system_0') + + def test_ener_shift(self): + # set rcut to 0. will cause a core dumped + # TODO: check what is wrong + for rcut in (3., 6., 11.): + with self.subTest(): + rcut += 1e-3 # prevent numerical errors + min_nbor_dist, max_nbor_size = neighbor_stat(system="system_0", rcut=rcut, type_map=["TYPE"]) + upper = np.ceil(rcut)+1 + X, Y, Z = np.mgrid[-upper:upper, -upper:upper, -upper:upper] + positions = np.vstack([X.ravel(), Y.ravel(), Z.ravel()]).T + # distance to (0,0,0) + distance = np.linalg.norm(positions, axis=1) + expected_neighbors = np.count_nonzero(np.logical_and(distance > 0, distance <= rcut)) + self.assertAlmostEqual(min_nbor_dist, 1.0, 6) + self.assertEqual(max_nbor_size, [expected_neighbors]) From fc7229fb4da7d5b34e16cf7e259a3b661386d82b Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 15 Feb 2022 15:46:01 -0500 Subject: [PATCH 3/5] fix typo --- doc/model/sel.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/model/sel.md b/doc/model/sel.md index 7713789f7b..dce1b8f860 100644 --- a/doc/model/sel.md +++ b/doc/model/sel.md @@ -8,6 +8,6 @@ To determine a proper `sel`, one can calculate the neighbor stat of the training ```sh dp neighbor-stat -s data -r 6.0 -t O H ``` -where `data` is the directory of data, `6.0` is the cutoff radius, and `O` and `H` is the type map.The program will give the `max_nbor_size`. For example, `max_nbor_size` of the water example is `[38, 72]`, meaning an atom may have 38 O neighbors and 72 H neighbors in the training data. +where `data` is the directory of data, `6.0` is the cutoff radius, and `O` and `H` is the type map. The program will give the `max_nbor_size`. For example, `max_nbor_size` of the water example is `[38, 72]`, meaning an atom may have 38 O neighbors and 72 H neighbors in the training data. The `sel` should be set to a higher value than that of the training data, considering there may be some extreme geometries during MD simulations. As a result, we set to `[46, 92]` in the water example. From c7ccc082dcff4c0a66b65d1049ff7b21078c6712 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 15 Feb 2022 15:51:14 -0500 Subject: [PATCH 4/5] correct name of the class --- source/tests/test_neighbor_stat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/tests/test_neighbor_stat.py b/source/tests/test_neighbor_stat.py index 2bc97cb4d5..2c044c76a4 100644 --- a/source/tests/test_neighbor_stat.py +++ b/source/tests/test_neighbor_stat.py @@ -20,7 +20,7 @@ def gen_sys(nframes): return data -class TestEnerShift(unittest.TestCase): +class TestNeighborStat(unittest.TestCase): def setUp(self): data0 = gen_sys(1) sys0 = dpdata.LabeledSystem() @@ -30,7 +30,7 @@ def setUp(self): def tearDown(self): shutil.rmtree('system_0') - def test_ener_shift(self): + def test_neighbor_stat(self): # set rcut to 0. will cause a core dumped # TODO: check what is wrong for rcut in (3., 6., 11.): From 0618f61d530edd17af67182d3124b56e05dca19e Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 15 Feb 2022 16:06:06 -0500 Subject: [PATCH 5/5] remove data_requirement --- deepmd/entrypoints/neighbor_stat.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/deepmd/entrypoints/neighbor_stat.py b/deepmd/entrypoints/neighbor_stat.py index b366c95d71..eaf8b3efe3 100644 --- a/deepmd/entrypoints/neighbor_stat.py +++ b/deepmd/entrypoints/neighbor_stat.py @@ -1,7 +1,7 @@ import logging from typing import List -from deepmd.common import data_requirement, expand_sys_str +from deepmd.common import expand_sys_str from deepmd.utils.data_system import DeepmdDataSystem from deepmd.utils.neighbor_stat import NeighborStat @@ -41,7 +41,6 @@ def neighbor_stat( rcut=rcut, type_map=type_map, ) - data.add_dict(data_requirement) data.get_batch() nei = NeighborStat(data.get_ntypes(), rcut) min_nbor_dist, max_nbor_size = nei.get_stat(data)