From 3b38a0e08a7947b0de1fcffc0911352d3a1b00d9 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Wed, 26 Apr 2023 16:07:39 -0400 Subject: [PATCH 01/10] add train_test_split method Split a MultiSystems into training and test sub set Signed-off-by: Jinzhe Zeng --- dpdata/system.py | 49 ++++++++++++++++++++++++++++++++++++- tests/test_split_dataset.py | 19 ++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 tests/test_split_dataset.py diff --git a/dpdata/system.py b/dpdata/system.py index 5e0c96e7d..a5caa50c5 100644 --- a/dpdata/system.py +++ b/dpdata/system.py @@ -3,7 +3,7 @@ import os from copy import deepcopy from enum import Enum, unique -from typing import Any, Tuple, Union +from typing import Any, Dict, Tuple, Union import numpy as np from monty.json import MSONable @@ -1584,6 +1584,53 @@ def correction(self, hl_sys: "MultiSystems"): corrected_sys.append(ll_ss.correction(hl_ss)) return corrected_sys + def train_test_split(self, test_size: Union[float, int]) -> Tuple["MultiSystems", "MultiSystems", Dict[str, np.ndarray]]: + """Split systems into random train and test subsets. + + Parameters + ---------- + test_size : float or int + If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. + If int, represents the absolute number of test samples. + + Returns + ------- + MultiSystems + The training set + MultiSystems + The testing set + Dict[str, np.ndarray] + The indices of training and testing sets for each system + """ + nframes = self.get_nframes() + if isinstance(test_size, float): + assert 0 <= test_size <= 1 + test_size = int(np.floor(test_size * nframes)) + elif isinstance(test_size, int): + assert 0 <= test_size <= nframes + else: + raise RuntimeError("test_size should be float or int") + # get random indices + rng = np.random.default_rng() + test_idx = rng.choice(nframes, test_size, replace=False) + select_test = np.zeros(nframes, dtype=bool) + select_test[test_idx] = True + select_train = np.logical_not(select_test) + # flatten systems dict + system_names, system_sizes = zip(*((kk, len(vv)) for (kk, vv) in self.systems.items())) + system_idx = np.empty(len(system_sizes) + 1, dtype=int) + system_idx[0] = 0 + np.cumsum(system_sizes, out=system_idx[1:]) + # make new systems + train_systems = MultiSystems(type_map=self.atom_names) + test_systems = MultiSystems(type_map=self.atom_names) + test_system_idx = {} + for ii, nn in enumerate(system_names): + train_systems.append(self[nn][select_train[system_idx[ii] : system_idx[ii + 1]]]) + test_systems.append(self[nn][select_test[system_idx[ii] : system_idx[ii + 1]]]) + test_system_idx[nn] = test_idx[system_idx[ii] : system_idx[ii + 1]] + return train_systems, test_systems, test_system_idx + def get_cls_name(cls: object) -> str: """Returns the fully qualified name of a class, such as `np.ndarray`. diff --git a/tests/test_split_dataset.py b/tests/test_split_dataset.py new file mode 100644 index 000000000..3322470f4 --- /dev/null +++ b/tests/test_split_dataset.py @@ -0,0 +1,19 @@ +import unittest + +import numpy as np + +from context import dpdata + + +class TestSplitDataset(unittest.TestCase): + def setUp(self): + self.systems = dpdata.MultiSystems() + sing_sys = dpdata.LabeledSystem("poscars/OUTCAR.h2o.md", fmt="vasp/outcar") + for ii in range(10): + self.systems.append(sing_sys.copy()) + + def test_split_dataset(self): + train, test, test_idx = self.systems.train_test_split(0.2) + self.assertEqual(train.get_nframes(), int(self.systems.get_nframes() * 0.8)) + self.assertEqual(test.get_nframes(), int(self.systems.get_nframes() * 0.2)) + self.assertEqual(sum([np.count_nonzero(x) for x in test_idx.values()]), int(self.systems.get_nframes() * 0.2)) From e5d212844686582176d4a898154bec60c1233644 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 26 Apr 2023 20:08:14 +0000 Subject: [PATCH 02/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dpdata/system.py | 20 ++++++++++++++------ tests/test_split_dataset.py | 6 ++++-- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/dpdata/system.py b/dpdata/system.py index a5caa50c5..d14146722 100644 --- a/dpdata/system.py +++ b/dpdata/system.py @@ -1584,15 +1584,17 @@ def correction(self, hl_sys: "MultiSystems"): corrected_sys.append(ll_ss.correction(hl_ss)) return corrected_sys - def train_test_split(self, test_size: Union[float, int]) -> Tuple["MultiSystems", "MultiSystems", Dict[str, np.ndarray]]: + def train_test_split( + self, test_size: Union[float, int] + ) -> Tuple["MultiSystems", "MultiSystems", Dict[str, np.ndarray]]: """Split systems into random train and test subsets. - + Parameters ---------- test_size : float or int If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the absolute number of test samples. - + Returns ------- MultiSystems @@ -1617,7 +1619,9 @@ def train_test_split(self, test_size: Union[float, int]) -> Tuple["MultiSystems" select_test[test_idx] = True select_train = np.logical_not(select_test) # flatten systems dict - system_names, system_sizes = zip(*((kk, len(vv)) for (kk, vv) in self.systems.items())) + system_names, system_sizes = zip( + *((kk, len(vv)) for (kk, vv) in self.systems.items()) + ) system_idx = np.empty(len(system_sizes) + 1, dtype=int) system_idx[0] = 0 np.cumsum(system_sizes, out=system_idx[1:]) @@ -1626,8 +1630,12 @@ def train_test_split(self, test_size: Union[float, int]) -> Tuple["MultiSystems" test_systems = MultiSystems(type_map=self.atom_names) test_system_idx = {} for ii, nn in enumerate(system_names): - train_systems.append(self[nn][select_train[system_idx[ii] : system_idx[ii + 1]]]) - test_systems.append(self[nn][select_test[system_idx[ii] : system_idx[ii + 1]]]) + train_systems.append( + self[nn][select_train[system_idx[ii] : system_idx[ii + 1]]] + ) + test_systems.append( + self[nn][select_test[system_idx[ii] : system_idx[ii + 1]]] + ) test_system_idx[nn] = test_idx[system_idx[ii] : system_idx[ii + 1]] return train_systems, test_systems, test_system_idx diff --git a/tests/test_split_dataset.py b/tests/test_split_dataset.py index 3322470f4..44ab20fdb 100644 --- a/tests/test_split_dataset.py +++ b/tests/test_split_dataset.py @@ -1,7 +1,6 @@ import unittest import numpy as np - from context import dpdata @@ -16,4 +15,7 @@ def test_split_dataset(self): train, test, test_idx = self.systems.train_test_split(0.2) self.assertEqual(train.get_nframes(), int(self.systems.get_nframes() * 0.8)) self.assertEqual(test.get_nframes(), int(self.systems.get_nframes() * 0.2)) - self.assertEqual(sum([np.count_nonzero(x) for x in test_idx.values()]), int(self.systems.get_nframes() * 0.2)) + self.assertEqual( + sum([np.count_nonzero(x) for x in test_idx.values()]), + int(self.systems.get_nframes() * 0.2), + ) From 45b60a618fc81a4d880bfa99268fa00fb704cdb3 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Wed, 26 Apr 2023 16:10:53 -0400 Subject: [PATCH 03/10] improve docs Signed-off-by: Jinzhe Zeng --- dpdata/system.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpdata/system.py b/dpdata/system.py index d14146722..85ab7ff05 100644 --- a/dpdata/system.py +++ b/dpdata/system.py @@ -1602,7 +1602,7 @@ def train_test_split( MultiSystems The testing set Dict[str, np.ndarray] - The indices of training and testing sets for each system + The bool array of training and testing sets for each system. False for training set and True for testing set. """ nframes = self.get_nframes() if isinstance(test_size, float): From 8cd8e66ce7a14a0848b96a3f93f8cdbb246b8690 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Wed, 26 Apr 2023 16:13:23 -0400 Subject: [PATCH 04/10] allow seed Signed-off-by: Jinzhe Zeng --- dpdata/system.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dpdata/system.py b/dpdata/system.py index 85ab7ff05..74307a15b 100644 --- a/dpdata/system.py +++ b/dpdata/system.py @@ -3,7 +3,7 @@ import os from copy import deepcopy from enum import Enum, unique -from typing import Any, Dict, Tuple, Union +from typing import Any, Dict, Optional, Tuple, Union import numpy as np from monty.json import MSONable @@ -1585,7 +1585,7 @@ def correction(self, hl_sys: "MultiSystems"): return corrected_sys def train_test_split( - self, test_size: Union[float, int] + self, test_size: Union[float, int], seed: Optional[int] = None ) -> Tuple["MultiSystems", "MultiSystems", Dict[str, np.ndarray]]: """Split systems into random train and test subsets. @@ -1613,7 +1613,7 @@ def train_test_split( else: raise RuntimeError("test_size should be float or int") # get random indices - rng = np.random.default_rng() + rng = np.random.default_rng(seed=seed) test_idx = rng.choice(nframes, test_size, replace=False) select_test = np.zeros(nframes, dtype=bool) select_test[test_idx] = True From 936485a0cf13bbd0bab4be50b4c2336c02ed4444 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Wed, 26 Apr 2023 16:13:27 -0400 Subject: [PATCH 05/10] allow seed Signed-off-by: Jinzhe Zeng --- dpdata/system.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dpdata/system.py b/dpdata/system.py index 74307a15b..65422350b 100644 --- a/dpdata/system.py +++ b/dpdata/system.py @@ -1594,6 +1594,8 @@ def train_test_split( test_size : float or int If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the absolute number of test samples. + seed : int, default=None + Random seed Returns ------- From d8c286ef167004b12e903b0210bf9eff554c3320 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Wed, 26 Apr 2023 16:14:43 -0400 Subject: [PATCH 06/10] test with np.floor Signed-off-by: Jinzhe Zeng --- tests/test_split_dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_split_dataset.py b/tests/test_split_dataset.py index 44ab20fdb..b9b5be89c 100644 --- a/tests/test_split_dataset.py +++ b/tests/test_split_dataset.py @@ -13,9 +13,9 @@ def setUp(self): def test_split_dataset(self): train, test, test_idx = self.systems.train_test_split(0.2) - self.assertEqual(train.get_nframes(), int(self.systems.get_nframes() * 0.8)) - self.assertEqual(test.get_nframes(), int(self.systems.get_nframes() * 0.2)) + self.assertEqual(train.get_nframes(), int(np.floor(self.systems.get_nframes() * 0.8))) + self.assertEqual(test.get_nframes(), int(np.floor(self.systems.get_nframes() * 0.2))) self.assertEqual( sum([np.count_nonzero(x) for x in test_idx.values()]), - int(self.systems.get_nframes() * 0.2), + int(np.floor(self.systems.get_nframes() * 0.2)), ) From e1ed6082f7adb49db045c149308e8a07febe0fa2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 26 Apr 2023 20:14:56 +0000 Subject: [PATCH 07/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_split_dataset.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_split_dataset.py b/tests/test_split_dataset.py index b9b5be89c..a5419b7b1 100644 --- a/tests/test_split_dataset.py +++ b/tests/test_split_dataset.py @@ -13,8 +13,12 @@ def setUp(self): def test_split_dataset(self): train, test, test_idx = self.systems.train_test_split(0.2) - self.assertEqual(train.get_nframes(), int(np.floor(self.systems.get_nframes() * 0.8))) - self.assertEqual(test.get_nframes(), int(np.floor(self.systems.get_nframes() * 0.2))) + self.assertEqual( + train.get_nframes(), int(np.floor(self.systems.get_nframes() * 0.8)) + ) + self.assertEqual( + test.get_nframes(), int(np.floor(self.systems.get_nframes() * 0.2)) + ) self.assertEqual( sum([np.count_nonzero(x) for x in test_idx.values()]), int(np.floor(self.systems.get_nframes() * 0.2)), From 4285341f48677330c449248f322a9d6afcb05282 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Wed, 26 Apr 2023 16:24:44 -0400 Subject: [PATCH 08/10] fix typo Signed-off-by: Jinzhe Zeng --- dpdata/system.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpdata/system.py b/dpdata/system.py index 65422350b..a964e82ab 100644 --- a/dpdata/system.py +++ b/dpdata/system.py @@ -1638,7 +1638,7 @@ def train_test_split( test_systems.append( self[nn][select_test[system_idx[ii] : system_idx[ii + 1]]] ) - test_system_idx[nn] = test_idx[system_idx[ii] : system_idx[ii + 1]] + test_system_idx[nn] = select_test[system_idx[ii] : system_idx[ii + 1]] return train_systems, test_systems, test_system_idx From bb2b37718a8124b96d4c3ba8f8213fc29a7782de Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Wed, 26 Apr 2023 17:06:22 -0400 Subject: [PATCH 09/10] do not append when system is empty Signed-off-by: Jinzhe Zeng --- dpdata/system.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/dpdata/system.py b/dpdata/system.py index a964e82ab..a34b092e9 100644 --- a/dpdata/system.py +++ b/dpdata/system.py @@ -1632,12 +1632,16 @@ def train_test_split( test_systems = MultiSystems(type_map=self.atom_names) test_system_idx = {} for ii, nn in enumerate(system_names): - train_systems.append( - self[nn][select_train[system_idx[ii] : system_idx[ii + 1]]] - ) - test_systems.append( - self[nn][select_test[system_idx[ii] : system_idx[ii + 1]]] - ) + sub_train = self[nn][select_train[system_idx[ii] : system_idx[ii + 1]]] + if len(sub_train): + train_systems.append( + sub_train + ) + sub_test = self[nn][select_test[system_idx[ii] : system_idx[ii + 1]]] + if len(sub_test): + test_systems.append( + sub_test + ) test_system_idx[nn] = select_test[system_idx[ii] : system_idx[ii + 1]] return train_systems, test_systems, test_system_idx From 9d56937e85a7ed0ad904ecb2e96094f709f42e24 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 26 Apr 2023 21:06:38 +0000 Subject: [PATCH 10/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dpdata/system.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/dpdata/system.py b/dpdata/system.py index a34b092e9..743f3d057 100644 --- a/dpdata/system.py +++ b/dpdata/system.py @@ -1634,14 +1634,10 @@ def train_test_split( for ii, nn in enumerate(system_names): sub_train = self[nn][select_train[system_idx[ii] : system_idx[ii + 1]]] if len(sub_train): - train_systems.append( - sub_train - ) + train_systems.append(sub_train) sub_test = self[nn][select_test[system_idx[ii] : system_idx[ii + 1]]] if len(sub_test): - test_systems.append( - sub_test - ) + test_systems.append(sub_test) test_system_idx[nn] = select_test[system_idx[ii] : system_idx[ii + 1]] return train_systems, test_systems, test_system_idx