From e3dc0ddcc9301e326a260267e416412b1d0e8347 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 7 Mar 2023 19:35:30 -0500 Subject: [PATCH 1/6] mixed system training --- deepmd/utils/data_system.py | 99 ++++++++++++++++++++++++++++++++----- 1 file changed, 88 insertions(+), 11 deletions(-) diff --git a/deepmd/utils/data_system.py b/deepmd/utils/data_system.py index 28dc799bf8..a0e5fd0dc0 100644 --- a/deepmd/utils/data_system.py +++ b/deepmd/utils/data_system.py @@ -112,6 +112,7 @@ def __init__( # batch size self.batch_size = batch_size is_auto_bs = False + self.mixed_systems = False if isinstance(self.batch_size, int): self.batch_size = self.batch_size * np.ones(self.nsystems, dtype=int) elif isinstance(self.batch_size, str): @@ -121,9 +122,16 @@ def __init__( rule = 32 if len(words) == 2: rule = int(words[1]) + self.batch_size = self._make_auto_bs(rule) + elif "mixed" == words[0]: + self.mixed_systems = True + if len(words) == 2: + rule = int(words[1]) + else: + raise RuntimeError("batch size must be specified for mixed systems") + self.batch_size = rule * np.ones(self.nsystems, dtype=int) else: raise RuntimeError("unknown batch_size rule " + words[0]) - self.batch_size = self._make_auto_bs(rule) elif isinstance(self.batch_size, list): pass else: @@ -361,7 +369,7 @@ def _get_sys_probs(self, sys_probs, auto_prob_style): # depreciated prob = self._process_sys_probs(sys_probs) return prob - def get_batch(self, sys_idx: Optional[int] = None): + def get_batch(self, sys_idx: Optional[int] = None) -> dict: # batch generation style altered by Ziyao Li: # one should specify the "sys_prob" and "auto_prob_style" params # via set_sys_prob() function. The sys_probs this function uses is @@ -375,19 +383,88 @@ def get_batch(self, sys_idx: Optional[int] = None): The index of system from which the batch is get. If sys_idx is not None, `sys_probs` and `auto_prob_style` are ignored If sys_idx is None, automatically determine the system according to `sys_probs` or `auto_prob_style`, see the following. + This option does not work for mixed systems. + + Returns + ------- + dict + The batch data """ if not hasattr(self, "default_mesh"): self._make_default_mesh() - if sys_idx is not None: - self.pick_idx = sys_idx + if not self.mixed_systems: + if sys_idx is not None: + self.pick_idx = sys_idx + else: + # prob = self._get_sys_probs(sys_probs, auto_prob_style) + self.pick_idx = dp_random.choice( + np.arange(self.nsystems), p=self.sys_probs + ) + b_data = self.data_systems[self.pick_idx].get_batch( + self.batch_size[self.pick_idx] + ) + b_data["natoms_vec"] = self.natoms_vec[self.pick_idx] + b_data["default_mesh"] = self.default_mesh[self.pick_idx] else: - # prob = self._get_sys_probs(sys_probs, auto_prob_style) - self.pick_idx = dp_random.choice(np.arange(self.nsystems), p=self.sys_probs) - b_data = self.data_systems[self.pick_idx].get_batch( - self.batch_size[self.pick_idx] - ) - b_data["natoms_vec"] = self.natoms_vec[self.pick_idx] - b_data["default_mesh"] = self.default_mesh[self.pick_idx] + # mixed systems have a global batch size + batch_size = self.batch_size[0] + batch_data = [] + for _ in range(batch_size): + self.pick_idx = dp_random.choice( + np.arange(self.nsystems), p=self.sys_probs + ) + bb_data = self.data_systems[self.pick_idx].get_batch(1) + bb_data["natoms_vec"] = self.natoms_vec[self.pick_idx] + bb_data["default_mesh"] = self.default_mesh[self.pick_idx] + batch_data.append(bb_data) + b_data = self._merge_batch_data(batch_data) + return b_data + + def _merge_batch_data(self, batch_data: List[dict]) -> dict: + """Merge batch data from different systems. + + Parameters + ---------- + batch_data : list of dict + A list of batch data from different systems. + + Returns + ------- + dict + The merged batch data. + """ + b_data = {} + max_natoms = max(bb["natoms_vec"][0] for bb in batch_data) + # natoms_vec + natoms_vec = np.zeros(2 + self.get_ntypes(), dtype=int) + natoms_vec[0:3] = max_natoms + b_data["natoms_vec"] = natoms_vec + # real_natoms_vec + real_natoms_vec = np.vstack([bb["natoms_vec"] for bb in batch_data]) + b_data["real_natoms_vec"] = real_natoms_vec + # type + type_vec = np.full((len(batch_data), max_natoms), -1, dtype=int) + for ii, bb in enumerate(batch_data): + type_vec[ii, : bb["type"].shape[1]] = bb["type"][0] + b_data["type"] = type_vec + # default_mesh + default_mesh = np.mean([bb["default_mesh"] for bb in batch_data], axis=0) + b_data["default_mesh"] = default_mesh + # other data + data_dict = self.get_data_dict(0) + for kk, vv in data_dict.items(): + if kk not in batch_data[0]: + continue + b_data["find_" + kk] = batch_data[0]["find_" + kk] + if not vv["atomic"]: + b_data[kk] = np.concatenate([bb[kk] for bb in batch_data], axis=0) + else: + b_data[kk] = np.zeros( + (len(batch_data), max_natoms * vv["ndof"] * vv["repeat"]), + dtype=batch_data[0][kk].dtype, + ) + for ii, bb in enumerate(batch_data): + b_data[kk][ii, : bb[kk].shape[1]] = bb[kk][0] return b_data # ! altered by Marián Rynik From f86f61e28c060ea22ec7b9187b2085c72362118a Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 7 Mar 2023 21:38:01 -0500 Subject: [PATCH 2/6] improve doc Signed-off-by: Jinzhe Zeng --- deepmd/utils/argcheck.py | 3 +- examples/nopbc/mixed/input.json | 71 +++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 examples/nopbc/mixed/input.json diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 09c43e14a3..933223cc66 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -972,7 +972,8 @@ def training_data_args(): # ! added by Ziyao: new specification style for data - list: the length of which is the same as the {link_sys}. The batch size of each system is given by the elements of the list.\n\n\ - int: all {link_sys} use the same batch size.\n\n\ - string "auto": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than 32.\n\n\ -- string "auto:N": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than N.' +- string "auto:N": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than N.\n\n\ +- string "mixed:N": the batch data will be sampled from all systems and merged into a mixed system with the batch size N. Only support the se_atten descriptor.' doc_auto_prob_style = 'Determine the probability of systems automatically. The method is assigned by this key and can be\n\n\ - "prob_uniform" : the probability all the systems are equal, namely 1.0/self.get_nsystems()\n\n\ - "prob_sys_size" : the probability of a system is proportional to the number of batches in the system\n\n\ diff --git a/examples/nopbc/mixed/input.json b/examples/nopbc/mixed/input.json new file mode 100644 index 0000000000..ab49b4e442 --- /dev/null +++ b/examples/nopbc/mixed/input.json @@ -0,0 +1,71 @@ +{ + "_comment": " model parameters", + "model": { + "type_map": [ + "C", + "H", + "O" + ], + "descriptor": { + "type": "se_atten", + "sel": 120, + "rcut_smth": 1.00, + "rcut": 6.00, + "neuron": [ + 25, + 50, + 100 + ], + "resnet_dt": false, + "axis_neuron": 12, + "seed": 1, + "_comment": " that's all" + }, + "fitting_net": { + "neuron": [ + 240, + 240, + 240 + ], + "resnet_dt": true, + "seed": 1, + "_comment": " that's all" + }, + "_comment": " that's all" + }, + + "learning_rate": { + "type": "exp", + "decay_steps": 4000, + "start_lr": 0.001, + "stop_lr": 3.51e-8, + "_comment": "that's all" + }, + + "loss": { + "type": "ener", + "start_pref_e": 0.02, + "limit_pref_e": 1, + "start_pref_f": 1000, + "limit_pref_f": 1, + "start_pref_v": 0, + "limit_pref_v": 0, + "_comment": " that's all" + }, + + "training": { + "training_data": { + "systems": "../data/", + "batch_size": "mixed:4", + "_comment": "that's all" + }, + "numb_steps": 4000000, + "seed": 10, + "disp_file": "lcurve.out", + "disp_freq": 100, + "save_freq": 1000, + "_comment": "that's all" + }, + + "_comment": "that's all" +} From 0d4e0201eb9c1eb497484093f3b9a1379ed29ae9 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 7 Mar 2023 22:03:53 -0500 Subject: [PATCH 3/6] add tests Signed-off-by: Jinzhe Zeng --- source/tests/test_deepmd_data_sys.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/source/tests/test_deepmd_data_sys.py b/source/tests/test_deepmd_data_sys.py index d24291114d..9df48c3f57 100644 --- a/source/tests/test_deepmd_data_sys.py +++ b/source/tests/test_deepmd_data_sys.py @@ -7,6 +7,9 @@ from deepmd.env import ( GLOBAL_NP_FLOAT_PRECISION, ) +from deepmd.utils import ( + random, +) from deepmd.utils.data_system import ( DeepmdDataSystem, ) @@ -398,3 +401,24 @@ def test_sys_prob_floating_point_error(self): ] ds = DeepmdDataSystem(self.sys_name, 3, 2, 2.0, sys_probs=sys_probs) self.assertEqual(ds.sys_probs.size, len(sys_probs)) + + def test_get_mixed_batch(self): + """test get_batch with mixed system.""" + batch_size = "mixed:3" + test_size = 2 + + ds = DeepmdDataSystem(self.sys_name, batch_size, test_size, 2.0) + ds.add("test", self.test_ndof, atomic=True, must=True) + ds.add("null", self.test_ndof, atomic=True, must=False) + random.seed(114514) + # with this seed, the batch is fixed, with natoms 3, 6, 6 + data = ds.get_batch() + np.testing.assert_equal(data["natoms_vec"], np.array([6, 6, 6, 0, 0])) + np.testing.assert_equal(data["real_natoms_vec"][:, 0], np.array([3, 6, 6])) + np.testing.assert_equal(data["type"][0, 3:6], np.array([-1, -1, -1])) + np.testing.assert_equal(data["coord"][0, 9:18], np.zeros(9)) + for kk in ("test", "null"): + np.testing.assert_equal( + data[kk][0, 3 * self.test_ndof : 6 * self.test_ndof], + np.zeros(3 * self.test_ndof), + ) From 9bd3575b05a1d23538b7daba072c6029e9ac0918 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Wed, 15 Mar 2023 15:25:05 -0400 Subject: [PATCH 4/6] split the method Signed-off-by: Jinzhe Zeng --- deepmd/utils/data_system.py | 77 +++++++++++++++++++++++++------------ 1 file changed, 53 insertions(+), 24 deletions(-) diff --git a/deepmd/utils/data_system.py b/deepmd/utils/data_system.py index a0e5fd0dc0..f55a4d3298 100644 --- a/deepmd/utils/data_system.py +++ b/deepmd/utils/data_system.py @@ -393,31 +393,60 @@ def get_batch(self, sys_idx: Optional[int] = None) -> dict: if not hasattr(self, "default_mesh"): self._make_default_mesh() if not self.mixed_systems: - if sys_idx is not None: - self.pick_idx = sys_idx - else: - # prob = self._get_sys_probs(sys_probs, auto_prob_style) - self.pick_idx = dp_random.choice( - np.arange(self.nsystems), p=self.sys_probs - ) - b_data = self.data_systems[self.pick_idx].get_batch( - self.batch_size[self.pick_idx] - ) - b_data["natoms_vec"] = self.natoms_vec[self.pick_idx] - b_data["default_mesh"] = self.default_mesh[self.pick_idx] + self.get_batch_standard(sys_idx) else: - # mixed systems have a global batch size - batch_size = self.batch_size[0] - batch_data = [] - for _ in range(batch_size): - self.pick_idx = dp_random.choice( - np.arange(self.nsystems), p=self.sys_probs - ) - bb_data = self.data_systems[self.pick_idx].get_batch(1) - bb_data["natoms_vec"] = self.natoms_vec[self.pick_idx] - bb_data["default_mesh"] = self.default_mesh[self.pick_idx] - batch_data.append(bb_data) - b_data = self._merge_batch_data(batch_data) + self.get_batch_mixed() + return b_data + + def get_batch_standard(self, sys_idx: Optional[int] = None) -> dict: + """Get a batch of data from the data systems in the standard way. + + Parameters + ---------- + sys_idx : int + The index of system from which the batch is get. + If sys_idx is not None, `sys_probs` and `auto_prob_style` are ignored + If sys_idx is None, automatically determine the system according to `sys_probs` or `auto_prob_style`, see the following. + + Returns + ------- + dict + The batch data + """ + if sys_idx is not None: + self.pick_idx = sys_idx + else: + # prob = self._get_sys_probs(sys_probs, auto_prob_style) + self.pick_idx = dp_random.choice( + np.arange(self.nsystems), p=self.sys_probs + ) + b_data = self.data_systems[self.pick_idx].get_batch( + self.batch_size[self.pick_idx] + ) + b_data["natoms_vec"] = self.natoms_vec[self.pick_idx] + b_data["default_mesh"] = self.default_mesh[self.pick_idx] + return b_data + + def get_batch_mixed(self) -> dict: + """Get a batch of data from the data systems in the mixed way. + + Returns + ------- + dict + The batch data + """ + # mixed systems have a global batch size + batch_size = self.batch_size[0] + batch_data = [] + for _ in range(batch_size): + self.pick_idx = dp_random.choice( + np.arange(self.nsystems), p=self.sys_probs + ) + bb_data = self.data_systems[self.pick_idx].get_batch(1) + bb_data["natoms_vec"] = self.natoms_vec[self.pick_idx] + bb_data["default_mesh"] = self.default_mesh[self.pick_idx] + batch_data.append(bb_data) + b_data = self._merge_batch_data(batch_data) return b_data def _merge_batch_data(self, batch_data: List[dict]) -> dict: From c73bb9cdaee9370908e1a116cf5f1126ab480f02 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 15 Mar 2023 19:25:59 +0000 Subject: [PATCH 5/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/utils/data_system.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/deepmd/utils/data_system.py b/deepmd/utils/data_system.py index f55a4d3298..9de328cb3a 100644 --- a/deepmd/utils/data_system.py +++ b/deepmd/utils/data_system.py @@ -417,9 +417,7 @@ def get_batch_standard(self, sys_idx: Optional[int] = None) -> dict: self.pick_idx = sys_idx else: # prob = self._get_sys_probs(sys_probs, auto_prob_style) - self.pick_idx = dp_random.choice( - np.arange(self.nsystems), p=self.sys_probs - ) + self.pick_idx = dp_random.choice(np.arange(self.nsystems), p=self.sys_probs) b_data = self.data_systems[self.pick_idx].get_batch( self.batch_size[self.pick_idx] ) @@ -439,9 +437,7 @@ def get_batch_mixed(self) -> dict: batch_size = self.batch_size[0] batch_data = [] for _ in range(batch_size): - self.pick_idx = dp_random.choice( - np.arange(self.nsystems), p=self.sys_probs - ) + self.pick_idx = dp_random.choice(np.arange(self.nsystems), p=self.sys_probs) bb_data = self.data_systems[self.pick_idx].get_batch(1) bb_data["natoms_vec"] = self.natoms_vec[self.pick_idx] bb_data["default_mesh"] = self.default_mesh[self.pick_idx] From 2355f6f7ab9acbf381d930842996b22a940060c7 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Wed, 15 Mar 2023 15:51:14 -0400 Subject: [PATCH 6/6] fix b_data not defined Signed-off-by: Jinzhe Zeng --- deepmd/utils/data_system.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepmd/utils/data_system.py b/deepmd/utils/data_system.py index 9de328cb3a..d87219fcc9 100644 --- a/deepmd/utils/data_system.py +++ b/deepmd/utils/data_system.py @@ -393,9 +393,9 @@ def get_batch(self, sys_idx: Optional[int] = None) -> dict: if not hasattr(self, "default_mesh"): self._make_default_mesh() if not self.mixed_systems: - self.get_batch_standard(sys_idx) + b_data = self.get_batch_standard(sys_idx) else: - self.get_batch_mixed() + b_data = self.get_batch_mixed() return b_data def get_batch_standard(self, sys_idx: Optional[int] = None) -> dict: