From 492fdca3587314ea2ab82aa6573362bf5c1a8fd2 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Mon, 14 Feb 2022 03:12:25 -0500
Subject: [PATCH 1/5] calculate neighbor statistics from CLI

In many cases, we need to confirm neighbor statistics before training.
This command provides such CLI option.
---
 deepmd/entrypoints/__init__.py      |  2 ++
 deepmd/entrypoints/main.py          | 33 +++++++++++++++++++
 deepmd/entrypoints/neighbor_stat.py | 50 +++++++++++++++++++++++++++++
 3 files changed, 85 insertions(+)
 create mode 100644 deepmd/entrypoints/neighbor_stat.py

diff --git a/deepmd/entrypoints/__init__.py b/deepmd/entrypoints/__init__.py
index 80d0431f8c..d7559ad74b 100644
--- a/deepmd/entrypoints/__init__.py
+++ b/deepmd/entrypoints/__init__.py
@@ -11,6 +11,7 @@
 from .transfer import transfer
 from ..infer.model_devi import make_model_devi
 from .convert import convert
+from .neighbor_stat import neighbor_stat
 
 __all__ = [
     "config",
@@ -23,4 +24,5 @@
     "doc_train_input",
     "make_model_devi",
     "convert",
+    "neighbor_stat",
 ]
diff --git a/deepmd/entrypoints/main.py b/deepmd/entrypoints/main.py
index 0f7372e0b9..3a9d3991ce 100644
--- a/deepmd/entrypoints/main.py
+++ b/deepmd/entrypoints/main.py
@@ -16,6 +16,7 @@
     transfer,
     make_model_devi,
     convert,
+    neighbor_stat,
 )
 from deepmd.loggers import set_log_handles
 
@@ -408,6 +409,36 @@ def parse_args(args: Optional[List[str]] = None):
         type=str, 
 		help='the output model',
     )
+
+    # neighbor_stat
+    parser_neighbor_stat = subparsers.add_parser(
+        'neighbor-stat',
+        parents=[parser_log],
+        help='Calculate neighbor statistics',
+    )
+    parser_neighbor_stat.add_argument(
+        "-s",
+        "--system",
+        default=".",
+        type=str,
+        help="The system dir. Recursively detect systems in this directory",
+    )
+    parser_neighbor_stat.add_argument(
+        "-r",
+        "--rcut",
+        type=float,
+        required=True,
+        help="cutoff radius",
+    )
+    parser_neighbor_stat.add_argument(
+        "-t",
+        "--type-map",
+        type=str,
+        nargs='+',
+        required=True,
+        help="type map",
+    )
+        
     # --version
     parser.add_argument('--version', action='version', version='DeePMD-kit v%s' % __version__)
 
@@ -456,6 +487,8 @@ def main():
         make_model_devi(**dict_args)
     elif args.command == "convert-from":
         convert(**dict_args)
+    elif args.command == "neighbor-stat":
+        neighbor_stat(**dict_args)
     elif args.command is None:
         pass
     else:
diff --git a/deepmd/entrypoints/neighbor_stat.py b/deepmd/entrypoints/neighbor_stat.py
new file mode 100644
index 0000000000..5471817973
--- /dev/null
+++ b/deepmd/entrypoints/neighbor_stat.py
@@ -0,0 +1,50 @@
+import logging
+from typing import List
+
+from deepmd.common import data_requirement, expand_sys_str
+from deepmd.utils.data_system import DeepmdDataSystem
+from deepmd.utils.neighbor_stat import NeighborStat
+
+log = logging.getLogger(__name__)
+
+def neighbor_stat(
+    *,
+    system: str,
+    rcut: float,
+    type_map: List[str],
+    **kwargs,
+):
+    """Calculate neighbor statistics.
+
+    Parameters
+    ----------
+    system : str
+        system to stat
+    rcut : float
+        cutoff radius
+    type_map : list[str]
+        type map
+
+    Examples
+    --------
+    >>> neighbor_stat(system='.', rcut=6., type_map=["C", "H", "O", "N", "P", "S", "Mg", "Na", "HW", "OW", "mNa", "mCl", "mC", "mH", "mMg", "mN", "mO", "mP"])
+    min_nbor_dist: 0.6599510670195264
+    max_nbor_size: [23, 26, 19, 16, 2, 2, 1, 1, 72, 37, 5, 0, 31, 29, 1, 21, 20, 5]
+    """ 
+    all_sys = expand_sys_str(system)
+    if not len(all_sys):
+        raise RuntimeError("Did not find valid system")
+    data = DeepmdDataSystem(
+        systems=all_sys,
+        batch_size=1,
+        test_size=1,
+        rcut=rcut,
+        type_map=type_map,
+    )
+    data.add_dict(data_requirement)
+    data.get_batch()
+    nei = NeighborStat(data.get_ntypes(), rcut)
+    min_nbor_dist, max_nbor_size = nei.get_stat(data)
+    log.info("min_nbor_dist: %f" % min_nbor_dist)
+    log.info("max_nbor_size: %s" % str(max_nbor_size))
+

From a0c1013407384c75012e1dc8d739417fb3de252b Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Tue, 15 Feb 2022 15:44:58 -0500
Subject: [PATCH 2/5] add UT and docs

---
 README.md                           |  1 +
 deepmd/entrypoints/neighbor_stat.py |  2 +-
 doc/model/index.md                  |  1 +
 doc/model/index.rst                 |  1 +
 doc/model/sel.md                    | 13 ++++++++
 source/tests/test_neighbor_stat.py  | 47 +++++++++++++++++++++++++++++
 6 files changed, 64 insertions(+), 1 deletion(-)
 create mode 100644 doc/model/sel.md
 create mode 100644 source/tests/test_neighbor_stat.py

diff --git a/README.md b/README.md
index 0b84c705dd..995ac6e914 100644
--- a/README.md
+++ b/README.md
@@ -94,6 +94,7 @@ A full [document](doc/train/train-input-auto.rst) on options in the training inp
     - [Descriptor `"se_e2_r"`](doc/model/train-se-e2-r.md)
     - [Descriptor `"se_e3"`](doc/model/train-se-e3.md)
     - [Descriptor `"hybrid"`](doc/model/train-hybrid.md)
+    - [Descriptor `sel`](doc/model/sel.md)
     - [Fit energy](doc/model/train-energy.md)
     - [Fit `tensor` like `Dipole` and `Polarizability`](doc/model/train-fitting-tensor.md)
     - [Train a Deep Potential model using `type embedding` approach](doc/model/train-se-e2-a-tebd.md)
diff --git a/deepmd/entrypoints/neighbor_stat.py b/deepmd/entrypoints/neighbor_stat.py
index 5471817973..b366c95d71 100644
--- a/deepmd/entrypoints/neighbor_stat.py
+++ b/deepmd/entrypoints/neighbor_stat.py
@@ -47,4 +47,4 @@ def neighbor_stat(
     min_nbor_dist, max_nbor_size = nei.get_stat(data)
     log.info("min_nbor_dist: %f" % min_nbor_dist)
     log.info("max_nbor_size: %s" % str(max_nbor_size))
-
+    return min_nbor_dist, max_nbor_size
diff --git a/doc/model/index.md b/doc/model/index.md
index cce3ec57ec..94575f67c6 100644
--- a/doc/model/index.md
+++ b/doc/model/index.md
@@ -5,6 +5,7 @@
 - [Descriptor `"se_e2_r"`](train-se-e2-r.md)
 - [Descriptor `"se_e3"`](train-se-e3.md)
 - [Descriptor `"hybrid"`](train-hybrid.md)
+- [Descriptor `sel`](sel.md)
 - [Fit energy](train-energy.md)
 - [Fit `tensor` like `Dipole` and `Polarizability`](train-fitting-tensor.md)
 - [Train a Deep Potential model using `type embedding` approach](train-se-e2-a-tebd.md)
diff --git a/doc/model/index.rst b/doc/model/index.rst
index 1cc0f7e6ea..10f41b375c 100644
--- a/doc/model/index.rst
+++ b/doc/model/index.rst
@@ -9,6 +9,7 @@ Model
    train-se-e2-r
    train-se-e3
    train-hybrid
+   sel
    train-energy
    train-fitting-tensor
    train-se-e2-a-tebd
diff --git a/doc/model/sel.md b/doc/model/sel.md
new file mode 100644
index 0000000000..7713789f7b
--- /dev/null
+++ b/doc/model/sel.md
@@ -0,0 +1,13 @@
+# Determine `sel`
+
+All descriptors require to set `sel`, which means the expected maximum number of type-i neighbors of an atom. DeePMD-kit will allocate memory according to `sel`.
+
+`sel` should not be too large or too small. If `sel` is too large, the computing will become much slower and cost more memory. If `sel` is not enough, the energy will be not conserved, making the accuracy of the model worse.
+
+To determine a proper `sel`, one can calculate the neighbor stat of the training data before training:
+```sh
+dp neighbor-stat -s data -r 6.0 -t O H
+```
+where `data` is the directory of data, `6.0` is the cutoff radius, and `O` and `H` is the type map.The program will give the `max_nbor_size`. For example, `max_nbor_size` of the water example is `[38, 72]`, meaning an atom may have 38 O neighbors and 72 H neighbors in the training data.
+
+The `sel` should be set to a higher value than that of the training data, considering there may be some extreme geometries during MD simulations. As a result, we set to `[46, 92]` in the water example.
diff --git a/source/tests/test_neighbor_stat.py b/source/tests/test_neighbor_stat.py
new file mode 100644
index 0000000000..2bc97cb4d5
--- /dev/null
+++ b/source/tests/test_neighbor_stat.py
@@ -0,0 +1,47 @@
+import shutil
+import numpy as np
+import unittest
+import dpdata
+
+from deepmd.entrypoints.neighbor_stat import neighbor_stat
+
+def gen_sys(nframes):
+    natoms = 1000
+    data = {}
+    X, Y, Z = np.mgrid[0:9:10j, 0:9:10j, 0:9:10j]
+    positions = np.vstack([X.ravel(), Y.ravel(), Z.ravel()]).T #+ 0.1
+    data['coords'] = np.repeat(positions[np.newaxis, :, :], nframes, axis=0)
+    data['forces'] = np.random.random([nframes, natoms, 3])
+    data['cells'] = np.array([10., 0., 0., 0., 10., 0., 0., 0., 10.]).reshape(1,3,3)
+    data['energies'] = np.random.random([nframes, 1])
+    data['atom_names'] = ['TYPE']
+    data['atom_numbs'] = [1000]
+    data['atom_types'] = np.repeat(0, 1000)
+    return data
+
+
+class TestEnerShift(unittest.TestCase):
+    def setUp(self):
+        data0 = gen_sys(1)
+        sys0 = dpdata.LabeledSystem()
+        sys0.data = data0
+        sys0.to_deepmd_npy('system_0', set_size = 10)
+        
+    def tearDown(self):
+        shutil.rmtree('system_0')
+
+    def test_ener_shift(self):
+        # set rcut to 0. will cause a core dumped
+        # TODO: check what is wrong
+        for rcut in (3., 6., 11.):
+            with self.subTest():
+                rcut += 1e-3 # prevent numerical errors
+                min_nbor_dist, max_nbor_size = neighbor_stat(system="system_0", rcut=rcut, type_map=["TYPE"])
+                upper = np.ceil(rcut)+1
+                X, Y, Z = np.mgrid[-upper:upper, -upper:upper, -upper:upper]
+                positions = np.vstack([X.ravel(), Y.ravel(), Z.ravel()]).T
+                # distance to (0,0,0)
+                distance = np.linalg.norm(positions, axis=1)
+                expected_neighbors = np.count_nonzero(np.logical_and(distance > 0, distance <= rcut))
+                self.assertAlmostEqual(min_nbor_dist, 1.0, 6)
+                self.assertEqual(max_nbor_size, [expected_neighbors])

From fc7229fb4da7d5b34e16cf7e259a3b661386d82b Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Tue, 15 Feb 2022 15:46:01 -0500
Subject: [PATCH 3/5] fix typo

---
 doc/model/sel.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/model/sel.md b/doc/model/sel.md
index 7713789f7b..dce1b8f860 100644
--- a/doc/model/sel.md
+++ b/doc/model/sel.md
@@ -8,6 +8,6 @@ To determine a proper `sel`, one can calculate the neighbor stat of the training
 ```sh
 dp neighbor-stat -s data -r 6.0 -t O H
 ```
-where `data` is the directory of data, `6.0` is the cutoff radius, and `O` and `H` is the type map.The program will give the `max_nbor_size`. For example, `max_nbor_size` of the water example is `[38, 72]`, meaning an atom may have 38 O neighbors and 72 H neighbors in the training data.
+where `data` is the directory of data, `6.0` is the cutoff radius, and `O` and `H` is the type map. The program will give the `max_nbor_size`. For example, `max_nbor_size` of the water example is `[38, 72]`, meaning an atom may have 38 O neighbors and 72 H neighbors in the training data.
 
 The `sel` should be set to a higher value than that of the training data, considering there may be some extreme geometries during MD simulations. As a result, we set to `[46, 92]` in the water example.

From c7ccc082dcff4c0a66b65d1049ff7b21078c6712 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Tue, 15 Feb 2022 15:51:14 -0500
Subject: [PATCH 4/5] correct name of the class

---
 source/tests/test_neighbor_stat.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/source/tests/test_neighbor_stat.py b/source/tests/test_neighbor_stat.py
index 2bc97cb4d5..2c044c76a4 100644
--- a/source/tests/test_neighbor_stat.py
+++ b/source/tests/test_neighbor_stat.py
@@ -20,7 +20,7 @@ def gen_sys(nframes):
     return data
 
 
-class TestEnerShift(unittest.TestCase):
+class TestNeighborStat(unittest.TestCase):
     def setUp(self):
         data0 = gen_sys(1)
         sys0 = dpdata.LabeledSystem()
@@ -30,7 +30,7 @@ def setUp(self):
     def tearDown(self):
         shutil.rmtree('system_0')
 
-    def test_ener_shift(self):
+    def test_neighbor_stat(self):
         # set rcut to 0. will cause a core dumped
         # TODO: check what is wrong
         for rcut in (3., 6., 11.):

From 0618f61d530edd17af67182d3124b56e05dca19e Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Tue, 15 Feb 2022 16:06:06 -0500
Subject: [PATCH 5/5] remove data_requirement

---
 deepmd/entrypoints/neighbor_stat.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/deepmd/entrypoints/neighbor_stat.py b/deepmd/entrypoints/neighbor_stat.py
index b366c95d71..eaf8b3efe3 100644
--- a/deepmd/entrypoints/neighbor_stat.py
+++ b/deepmd/entrypoints/neighbor_stat.py
@@ -1,7 +1,7 @@
 import logging
 from typing import List
 
-from deepmd.common import data_requirement, expand_sys_str
+from deepmd.common import expand_sys_str
 from deepmd.utils.data_system import DeepmdDataSystem
 from deepmd.utils.neighbor_stat import NeighborStat
 
@@ -41,7 +41,6 @@ def neighbor_stat(
         rcut=rcut,
         type_map=type_map,
     )
-    data.add_dict(data_requirement)
     data.get_batch()
     nei = NeighborStat(data.get_ntypes(), rcut)
     min_nbor_dist, max_nbor_size = nei.get_stat(data)