From e3dc0ddcc9301e326a260267e416412b1d0e8347 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Tue, 7 Mar 2023 19:35:30 -0500
Subject: [PATCH 1/6] mixed system training

---
 deepmd/utils/data_system.py | 99 ++++++++++++++++++++++++++++++++-----
 1 file changed, 88 insertions(+), 11 deletions(-)

diff --git a/deepmd/utils/data_system.py b/deepmd/utils/data_system.py
index 28dc799bf8..a0e5fd0dc0 100644
--- a/deepmd/utils/data_system.py
+++ b/deepmd/utils/data_system.py
@@ -112,6 +112,7 @@ def __init__(
         # batch size
         self.batch_size = batch_size
         is_auto_bs = False
+        self.mixed_systems = False
         if isinstance(self.batch_size, int):
             self.batch_size = self.batch_size * np.ones(self.nsystems, dtype=int)
         elif isinstance(self.batch_size, str):
@@ -121,9 +122,16 @@ def __init__(
                 rule = 32
                 if len(words) == 2:
                     rule = int(words[1])
+                self.batch_size = self._make_auto_bs(rule)
+            elif "mixed" == words[0]:
+                self.mixed_systems = True
+                if len(words) == 2:
+                    rule = int(words[1])
+                else:
+                    raise RuntimeError("batch size must be specified for mixed systems")
+                self.batch_size = rule * np.ones(self.nsystems, dtype=int)
             else:
                 raise RuntimeError("unknown batch_size rule " + words[0])
-            self.batch_size = self._make_auto_bs(rule)
         elif isinstance(self.batch_size, list):
             pass
         else:
@@ -361,7 +369,7 @@ def _get_sys_probs(self, sys_probs, auto_prob_style):  # depreciated
             prob = self._process_sys_probs(sys_probs)
         return prob
 
-    def get_batch(self, sys_idx: Optional[int] = None):
+    def get_batch(self, sys_idx: Optional[int] = None) -> dict:
         # batch generation style altered by Ziyao Li:
         # one should specify the "sys_prob" and "auto_prob_style" params
         # via set_sys_prob() function. The sys_probs this function uses is
@@ -375,19 +383,88 @@ def get_batch(self, sys_idx: Optional[int] = None):
             The index of system from which the batch is get.
             If sys_idx is not None, `sys_probs` and `auto_prob_style` are ignored
             If sys_idx is None, automatically determine the system according to `sys_probs` or `auto_prob_style`, see the following.
+            This option does not work for mixed systems.
+
+        Returns
+        -------
+        dict
+            The batch data
         """
         if not hasattr(self, "default_mesh"):
             self._make_default_mesh()
-        if sys_idx is not None:
-            self.pick_idx = sys_idx
+        if not self.mixed_systems:
+            if sys_idx is not None:
+                self.pick_idx = sys_idx
+            else:
+                # prob = self._get_sys_probs(sys_probs, auto_prob_style)
+                self.pick_idx = dp_random.choice(
+                    np.arange(self.nsystems), p=self.sys_probs
+                )
+            b_data = self.data_systems[self.pick_idx].get_batch(
+                self.batch_size[self.pick_idx]
+            )
+            b_data["natoms_vec"] = self.natoms_vec[self.pick_idx]
+            b_data["default_mesh"] = self.default_mesh[self.pick_idx]
         else:
-            # prob = self._get_sys_probs(sys_probs, auto_prob_style)
-            self.pick_idx = dp_random.choice(np.arange(self.nsystems), p=self.sys_probs)
-        b_data = self.data_systems[self.pick_idx].get_batch(
-            self.batch_size[self.pick_idx]
-        )
-        b_data["natoms_vec"] = self.natoms_vec[self.pick_idx]
-        b_data["default_mesh"] = self.default_mesh[self.pick_idx]
+            # mixed systems have a global batch size
+            batch_size = self.batch_size[0]
+            batch_data = []
+            for _ in range(batch_size):
+                self.pick_idx = dp_random.choice(
+                    np.arange(self.nsystems), p=self.sys_probs
+                )
+                bb_data = self.data_systems[self.pick_idx].get_batch(1)
+                bb_data["natoms_vec"] = self.natoms_vec[self.pick_idx]
+                bb_data["default_mesh"] = self.default_mesh[self.pick_idx]
+                batch_data.append(bb_data)
+            b_data = self._merge_batch_data(batch_data)
+        return b_data
+
+    def _merge_batch_data(self, batch_data: List[dict]) -> dict:
+        """Merge per-system batches into one batch padded to a common size.
+
+        Parameters
+        ----------
+        batch_data : list of dict
+            Batches to merge; each needs `natoms_vec`, `type`, `default_mesh`.
+
+        Returns
+        -------
+        dict
+            The merged batch, padded to the largest atom count in the inputs.
+        """
+        b_data = {}
+        max_natoms = max(bb["natoms_vec"][0] for bb in batch_data)
+        # natoms_vec: first three entries hold the padded atom count, rest stay 0
+        natoms_vec = np.zeros(2 + self.get_ntypes(), dtype=int)
+        natoms_vec[0:3] = max_natoms
+        b_data["natoms_vec"] = natoms_vec
+        # real_natoms_vec: one row per input, keeping its unpadded natoms_vec
+        real_natoms_vec = np.vstack([bb["natoms_vec"] for bb in batch_data])
+        b_data["real_natoms_vec"] = real_natoms_vec
+        # type: rows shorter than max_natoms are padded with -1
+        type_vec = np.full((len(batch_data), max_natoms), -1, dtype=int)
+        for ii, bb in enumerate(batch_data):
+            type_vec[ii, : bb["type"].shape[1]] = bb["type"][0]
+        b_data["type"] = type_vec
+        # default_mesh: element-wise mean over the inputs
+        default_mesh = np.mean([bb["default_mesh"] for bb in batch_data], axis=0)
+        b_data["default_mesh"] = default_mesh
+        # other data: concatenate non-atomic items, zero-pad atomic items
+        data_dict = self.get_data_dict(0)
+        for kk, vv in data_dict.items():
+            if kk not in batch_data[0]:
+                continue
+            b_data["find_" + kk] = batch_data[0]["find_" + kk]
+            if not vv["atomic"]:
+                b_data[kk] = np.concatenate([bb[kk] for bb in batch_data], axis=0)
+            else:
+                b_data[kk] = np.zeros(
+                    (len(batch_data), max_natoms * vv["ndof"] * vv["repeat"]),
+                    dtype=batch_data[0][kk].dtype,
+                )
+                for ii, bb in enumerate(batch_data):
+                    b_data[kk][ii, : bb[kk].shape[1]] = bb[kk][0]
         return b_data
 
     # ! altered by Marián Rynik

From f86f61e28c060ea22ec7b9187b2085c72362118a Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Tue, 7 Mar 2023 21:38:01 -0500
Subject: [PATCH 2/6] improve doc

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 deepmd/utils/argcheck.py        |  3 +-
 examples/nopbc/mixed/input.json | 71 +++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 1 deletion(-)
 create mode 100644 examples/nopbc/mixed/input.json

diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py
index 09c43e14a3..933223cc66 100644
--- a/deepmd/utils/argcheck.py
+++ b/deepmd/utils/argcheck.py
@@ -972,7 +972,8 @@ def training_data_args():  # ! added by Ziyao: new specification style for data
 - list: the length of which is the same as the {link_sys}. The batch size of each system is given by the elements of the list.\n\n\
 - int: all {link_sys} use the same batch size.\n\n\
 - string "auto": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than 32.\n\n\
-- string "auto:N": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than N.'
+- string "auto:N": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than N.\n\n\
+- string "mixed:N": the batch data will be sampled from all systems and merged into a mixed system with the batch size N. Only support the se_atten descriptor.'
     doc_auto_prob_style = 'Determine the probability of systems automatically. The method is assigned by this key and can be\n\n\
 - "prob_uniform"  : the probability all the systems are equal, namely 1.0/self.get_nsystems()\n\n\
 - "prob_sys_size" : the probability of a system is proportional to the number of batches in the system\n\n\
diff --git a/examples/nopbc/mixed/input.json b/examples/nopbc/mixed/input.json
new file mode 100644
index 0000000000..ab49b4e442
--- /dev/null
+++ b/examples/nopbc/mixed/input.json
@@ -0,0 +1,71 @@
+{
+  "_comment": " model parameters",
+  "model": {
+    "type_map": [
+      "C",
+      "H",
+      "O"
+    ],
+    "descriptor": {
+      "type": "se_atten",
+      "sel": 120,
+      "rcut_smth": 1.00,
+      "rcut": 6.00,
+      "neuron": [
+        25,
+        50,
+        100
+      ],
+      "resnet_dt": false,
+      "axis_neuron": 12,
+      "seed": 1,
+      "_comment": " that's all"
+    },
+    "fitting_net": {
+      "neuron": [
+        240,
+        240,
+        240
+      ],
+      "resnet_dt": true,
+      "seed": 1,
+      "_comment": " that's all"
+    },
+    "_comment": " that's all"
+  },
+
+  "learning_rate": {
+    "type": "exp",
+    "decay_steps": 4000,
+    "start_lr": 0.001,
+    "stop_lr": 3.51e-8,
+    "_comment": "that's all"
+  },
+
+  "loss": {
+    "type": "ener",
+    "start_pref_e": 0.02,
+    "limit_pref_e": 1,
+    "start_pref_f": 1000,
+    "limit_pref_f": 1,
+    "start_pref_v": 0,
+    "limit_pref_v": 0,
+    "_comment": " that's all"
+  },
+
+  "training": {
+    "training_data": {
+      "systems": "../data/",
+      "batch_size": "mixed:4",
+      "_comment": "that's all"
+    },
+    "numb_steps": 4000000,
+    "seed": 10,
+    "disp_file": "lcurve.out",
+    "disp_freq": 100,
+    "save_freq": 1000,
+    "_comment": "that's all"
+  },
+
+  "_comment": "that's all"
+}

From 0d4e0201eb9c1eb497484093f3b9a1379ed29ae9 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Tue, 7 Mar 2023 22:03:53 -0500
Subject: [PATCH 3/6] add tests

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/tests/test_deepmd_data_sys.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/source/tests/test_deepmd_data_sys.py b/source/tests/test_deepmd_data_sys.py
index d24291114d..9df48c3f57 100644
--- a/source/tests/test_deepmd_data_sys.py
+++ b/source/tests/test_deepmd_data_sys.py
@@ -7,6 +7,9 @@
 from deepmd.env import (
     GLOBAL_NP_FLOAT_PRECISION,
 )
+from deepmd.utils import (
+    random,
+)
 from deepmd.utils.data_system import (
     DeepmdDataSystem,
 )
@@ -398,3 +401,24 @@ def test_sys_prob_floating_point_error(self):
         ]
         ds = DeepmdDataSystem(self.sys_name, 3, 2, 2.0, sys_probs=sys_probs)
         self.assertEqual(ds.sys_probs.size, len(sys_probs))
+
+    def test_get_mixed_batch(self):
+        """Test that get_batch pads and merges frames for a mixed batch size."""
+        batch_size = "mixed:3"
+        test_size = 2
+
+        ds = DeepmdDataSystem(self.sys_name, batch_size, test_size, 2.0)
+        ds.add("test", self.test_ndof, atomic=True, must=True)
+        ds.add("null", self.test_ndof, atomic=True, must=False)
+        random.seed(114514)
+        # with this seed, the batch is fixed, with natoms 3, 6, 6
+        data = ds.get_batch()
+        np.testing.assert_equal(data["natoms_vec"], np.array([6, 6, 6, 0, 0]))
+        np.testing.assert_equal(data["real_natoms_vec"][:, 0], np.array([3, 6, 6]))
+        np.testing.assert_equal(data["type"][0, 3:6], np.array([-1, -1, -1]))
+        np.testing.assert_equal(data["coord"][0, 9:18], np.zeros(9))
+        for kk in ("test", "null"):
+            np.testing.assert_equal(
+                data[kk][0, 3 * self.test_ndof : 6 * self.test_ndof],
+                np.zeros(3 * self.test_ndof),
+            )

From 9bd3575b05a1d23538b7daba072c6029e9ac0918 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Wed, 15 Mar 2023 15:25:05 -0400
Subject: [PATCH 4/6] split the method

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 deepmd/utils/data_system.py | 77 +++++++++++++++++++++++++------------
 1 file changed, 53 insertions(+), 24 deletions(-)

diff --git a/deepmd/utils/data_system.py b/deepmd/utils/data_system.py
index a0e5fd0dc0..f55a4d3298 100644
--- a/deepmd/utils/data_system.py
+++ b/deepmd/utils/data_system.py
@@ -393,31 +393,60 @@ def get_batch(self, sys_idx: Optional[int] = None) -> dict:
         if not hasattr(self, "default_mesh"):
             self._make_default_mesh()
         if not self.mixed_systems:
-            if sys_idx is not None:
-                self.pick_idx = sys_idx
-            else:
-                # prob = self._get_sys_probs(sys_probs, auto_prob_style)
-                self.pick_idx = dp_random.choice(
-                    np.arange(self.nsystems), p=self.sys_probs
-                )
-            b_data = self.data_systems[self.pick_idx].get_batch(
-                self.batch_size[self.pick_idx]
-            )
-            b_data["natoms_vec"] = self.natoms_vec[self.pick_idx]
-            b_data["default_mesh"] = self.default_mesh[self.pick_idx]
+            self.get_batch_standard(sys_idx)
         else:
-            # mixed systems have a global batch size
-            batch_size = self.batch_size[0]
-            batch_data = []
-            for _ in range(batch_size):
-                self.pick_idx = dp_random.choice(
-                    np.arange(self.nsystems), p=self.sys_probs
-                )
-                bb_data = self.data_systems[self.pick_idx].get_batch(1)
-                bb_data["natoms_vec"] = self.natoms_vec[self.pick_idx]
-                bb_data["default_mesh"] = self.default_mesh[self.pick_idx]
-                batch_data.append(bb_data)
-            b_data = self._merge_batch_data(batch_data)
+            self.get_batch_mixed()
+        return b_data
+
+    def get_batch_standard(self, sys_idx: Optional[int] = None) -> dict:
+        """Get a batch of data from the data systems in the standard way.
+
+        Parameters
+        ----------
+        sys_idx : int
+            The index of the system from which the batch is drawn.
+            If sys_idx is not None, `sys_probs` and `auto_prob_style` are ignored.
+            If sys_idx is None, the system is determined automatically according to `sys_probs` or `auto_prob_style`.
+
+        Returns
+        -------
+        dict
+            The batch data
+        """
+        if sys_idx is not None:
+            self.pick_idx = sys_idx
+        else:
+            # prob = self._get_sys_probs(sys_probs, auto_prob_style)
+            self.pick_idx = dp_random.choice(
+                np.arange(self.nsystems), p=self.sys_probs
+            )
+        b_data = self.data_systems[self.pick_idx].get_batch(
+            self.batch_size[self.pick_idx]
+        )
+        b_data["natoms_vec"] = self.natoms_vec[self.pick_idx]
+        b_data["default_mesh"] = self.default_mesh[self.pick_idx]
+        return b_data
+
+    def get_batch_mixed(self) -> dict:
+        """Get a batch of data from the data systems in the mixed way.
+
+        Returns
+        -------
+        dict
+            The batch data
+        """
+        # mixed systems have a global batch size
+        batch_size = self.batch_size[0]
+        batch_data = []
+        for _ in range(batch_size):
+            self.pick_idx = dp_random.choice(
+                np.arange(self.nsystems), p=self.sys_probs
+            )
+            bb_data = self.data_systems[self.pick_idx].get_batch(1)
+            bb_data["natoms_vec"] = self.natoms_vec[self.pick_idx]
+            bb_data["default_mesh"] = self.default_mesh[self.pick_idx]
+            batch_data.append(bb_data)
+        b_data = self._merge_batch_data(batch_data)
         return b_data
 
     def _merge_batch_data(self, batch_data: List[dict]) -> dict:

From c73bb9cdaee9370908e1a116cf5f1126ab480f02 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 15 Mar 2023 19:25:59 +0000
Subject: [PATCH 5/6] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 deepmd/utils/data_system.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/deepmd/utils/data_system.py b/deepmd/utils/data_system.py
index f55a4d3298..9de328cb3a 100644
--- a/deepmd/utils/data_system.py
+++ b/deepmd/utils/data_system.py
@@ -417,9 +417,7 @@ def get_batch_standard(self, sys_idx: Optional[int] = None) -> dict:
             self.pick_idx = sys_idx
         else:
             # prob = self._get_sys_probs(sys_probs, auto_prob_style)
-            self.pick_idx = dp_random.choice(
-                np.arange(self.nsystems), p=self.sys_probs
-            )
+            self.pick_idx = dp_random.choice(np.arange(self.nsystems), p=self.sys_probs)
         b_data = self.data_systems[self.pick_idx].get_batch(
             self.batch_size[self.pick_idx]
         )
@@ -439,9 +437,7 @@ def get_batch_mixed(self) -> dict:
         batch_size = self.batch_size[0]
         batch_data = []
         for _ in range(batch_size):
-            self.pick_idx = dp_random.choice(
-                np.arange(self.nsystems), p=self.sys_probs
-            )
+            self.pick_idx = dp_random.choice(np.arange(self.nsystems), p=self.sys_probs)
             bb_data = self.data_systems[self.pick_idx].get_batch(1)
             bb_data["natoms_vec"] = self.natoms_vec[self.pick_idx]
             bb_data["default_mesh"] = self.default_mesh[self.pick_idx]

From 2355f6f7ab9acbf381d930842996b22a940060c7 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Wed, 15 Mar 2023 15:51:14 -0400
Subject: [PATCH 6/6] fix b_data not defined

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 deepmd/utils/data_system.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/deepmd/utils/data_system.py b/deepmd/utils/data_system.py
index 9de328cb3a..d87219fcc9 100644
--- a/deepmd/utils/data_system.py
+++ b/deepmd/utils/data_system.py
@@ -393,9 +393,9 @@ def get_batch(self, sys_idx: Optional[int] = None) -> dict:
         if not hasattr(self, "default_mesh"):
             self._make_default_mesh()
         if not self.mixed_systems:
-            self.get_batch_standard(sys_idx)
+            b_data = self.get_batch_standard(sys_idx)
         else:
-            self.get_batch_mixed()
+            b_data = self.get_batch_mixed()
         return b_data
 
     def get_batch_standard(self, sys_idx: Optional[int] = None) -> dict: