From 7c9a3143381d5299bec88b69563379e016873863 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 3 Mar 2023 16:41:27 -0500 Subject: [PATCH 1/7] add a method to remove outlier frames Signed-off-by: Jinzhe Zeng --- dpdata/system.py | 36 ++++++++++++++++++++++++++++++++++++ tests/test_remove_outlier.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 tests/test_remove_outlier.py diff --git a/dpdata/system.py b/dpdata/system.py index 802b352c5..74ce33237 100644 --- a/dpdata/system.py +++ b/dpdata/system.py @@ -1271,6 +1271,42 @@ def correction(self, hl_sys): ) return corrected_sys + def remove_outlier(self, threshold: float = 8.0) -> "LabeledSystem": + r"""Remove outlier frames from the system. + + Remove the frames whose energies satisfy the condition + + .. math:: + + \frac{\left \| E - \bar{E} \right \|}{\sigma(E)} > \text{threshold} + + where :math:`\bar{E}` and :math:`\sigma(E)` are the mean and standard deviation + of the energies in the system. + + Parameters + ---------- + threshold : float + The threshold of outlier detection. The default value is 8.0. + + Returns + ------- + LabeledSystem + The system without outlier frames. + + References + ---------- + .. [1] Gao, X.; Ramezanghorbani, F.; Isayev, O.; Smith, J. S.; + Roitberg, A. E. TorchANI: A Free and Open Source PyTorch-Based + Deep Learning Implementation of the ANI Neural Network + Potentials. J. Chem. Inf. Model. 2020, 60, 3408-3415. + .. [2] Zeng, J.; Tao, Y.; Giese, T. J.; York, D. M.. QDπ: A Quantum + Deep Potential Interaction Model for Drug Discovery. J. Comput. + Chem. 2023, 19, 1261-1275. + """ + energies = self.data['energies'] + idx = (energies - np.mean(energies)) / np.std(energies) < threshold + return self.sub_system(idx) + class MultiSystems: """A set containing several systems.""" diff --git a/tests/test_remove_outlier.py b/tests/test_remove_outlier.py new file mode 100644 index 000000000..50d662a08 --- /dev/null +++ b/tests/test_remove_outlier.py @@ -0,0 +1,31 @@ +import os +import unittest + +import numpy as np +from context import dpdata +from comp_sys import CompLabeledSys + + +class TestRemoveOutlier(unittest.TestCase, CompLabeledSys): + @classmethod + def setUpClass(cls): + system = dpdata.LabeledSystem( + data={ + "atom_names": ["H"], + "atom_numbs": [1], + "atom_types": np.zeros((1,), dtype=int), + "coords": np.zeros((100, 1, 3), dtype=np.float32), + "cells": np.zeros((100, 3, 3), dtype=np.float32), + "orig": np.zeros(3, dtype=np.float32), + "nopbc": True, + "energies": np.zeros((100,), dtype=np.float32), + "forces": np.zeros((100, 1, 3), dtype=np.float32), + } + ) + system.data['energies'][0] = 100.0 + cls.system_1 = system.remove_outlier() + cls.system_2 = system[1:] + cls.places = 6 + cls.e_places = 6 + cls.f_places = 6 + cls.v_places = 6 From 69c65e2380af41ebc64d4a1385e1ddc5749be984 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 3 Mar 2023 16:42:59 -0500 Subject: [PATCH 2/7] geq Signed-off-by: Jinzhe Zeng --- dpdata/system.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpdata/system.py b/dpdata/system.py index 74ce33237..949628cb8 100644 --- a/dpdata/system.py +++ b/dpdata/system.py @@ -1278,7 +1278,7 @@ def remove_outlier(self, threshold: float = 8.0) -> "LabeledSystem": .. math:: - \frac{\left \| E - \bar{E} \right \|}{\sigma(E)} > \text{threshold} + \frac{\left \| E - \bar{E} \right \|}{\sigma(E)} \geq \text{threshold} where :math:`\bar{E}` and :math:`\sigma(E)` are the mean and standard deviation of the energies in the system. From 917aa7338bb578e58f060b8b10886ca328bf701f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 3 Mar 2023 21:43:21 +0000 Subject: [PATCH 3/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dpdata/system.py | 2 +- tests/test_remove_outlier.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dpdata/system.py b/dpdata/system.py index 949628cb8..454e016b8 100644 --- a/dpdata/system.py +++ b/dpdata/system.py @@ -1303,7 +1303,7 @@ def remove_outlier(self, threshold: float = 8.0) -> "LabeledSystem": Deep Potential Interaction Model for Drug Discovery. J. Comput. Chem. 2023, 19, 1261-1275. """ - energies = self.data['energies'] + energies = self.data["energies"] idx = (energies - np.mean(energies)) / np.std(energies) < threshold return self.sub_system(idx) diff --git a/tests/test_remove_outlier.py b/tests/test_remove_outlier.py index 50d662a08..faef12b38 100644 --- a/tests/test_remove_outlier.py +++ b/tests/test_remove_outlier.py @@ -2,8 +2,8 @@ import unittest import numpy as np -from context import dpdata from comp_sys import CompLabeledSys +from context import dpdata class TestRemoveOutlier(unittest.TestCase, CompLabeledSys): @@ -22,7 +22,7 @@ def setUpClass(cls): "forces": np.zeros((100, 1, 3), dtype=np.float32), } ) - system.data['energies'][0] = 100.0 + system.data["energies"][0] = 100.0 cls.system_1 = system.remove_outlier() cls.system_2 = system[1:] cls.places = 6 From baa2bb8400696002059e97eeb7eee628b2003d71 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 3 Mar 2023 17:35:00 -0500 Subject: [PATCH 4/7] abs Signed-off-by: Jinzhe Zeng --- dpdata/system.py | 2 +- tests/test_remove_outlier.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/dpdata/system.py b/dpdata/system.py index 949628cb8..5a8a9872e 100644 --- a/dpdata/system.py +++ b/dpdata/system.py @@ -1304,7 +1304,7 @@ def remove_outlier(self, threshold: float = 8.0) -> "LabeledSystem": Chem. 2023, 19, 1261-1275. """ energies = self.data['energies'] - idx = (energies - np.mean(energies)) / np.std(energies) < threshold + idx = np.abs(energies - np.mean(energies)) / np.std(energies) < threshold return self.sub_system(idx) diff --git a/tests/test_remove_outlier.py b/tests/test_remove_outlier.py index 50d662a08..d2677438e 100644 --- a/tests/test_remove_outlier.py +++ b/tests/test_remove_outlier.py @@ -23,8 +23,9 @@ def setUpClass(cls): } ) system.data['energies'][0] = 100.0 + system.data['energies'][1] = -100.0 cls.system_1 = system.remove_outlier() - cls.system_2 = system[1:] + cls.system_2 = system[2:] cls.places = 6 cls.e_places = 6 cls.f_places = 6 From 2ad78b0c52f93a2b9d0de4aaa234f3388cf087e9 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 3 Mar 2023 17:44:22 -0500 Subject: [PATCH 5/7] Update test_remove_outlier.py --- tests/test_remove_outlier.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_remove_outlier.py b/tests/test_remove_outlier.py index 8020380bf..faef12b38 100644 --- a/tests/test_remove_outlier.py +++ b/tests/test_remove_outlier.py @@ -23,9 +23,8 @@ def setUpClass(cls): } ) system.data["energies"][0] = 100.0 - system.data["energies"][1] = -100.0 cls.system_1 = system.remove_outlier() - cls.system_2 = system[2:] + cls.system_2 = system[1:] cls.places = 6 cls.e_places = 6 cls.f_places = 6 From 9f77041607e36cff4743ead9cf1044c13d66a534 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Mon, 6 Mar 2023 16:36:03 -0500 Subject: [PATCH 6/7] handle std == 0.0 Signed-off-by: Jinzhe Zeng --- dpdata/system.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dpdata/system.py b/dpdata/system.py index c4df294cf..c05cb0d1e 100644 --- a/dpdata/system.py +++ b/dpdata/system.py @@ -1304,7 +1304,10 @@ def remove_outlier(self, threshold: float = 8.0) -> "LabeledSystem": Chem. 2023, 19, 1261-1275. """ energies = self.data["energies"] - idx = np.abs(energies - np.mean(energies)) / np.std(energies) < threshold + std = np.std(energies) + if np.isclose(std, 0.0): + return self.copy() + idx = np.abs(energies - np.mean(energies)) / std < threshold return self.sub_system(idx) From 010af5495f3f25c7affe7c624f9309b82b961fd4 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 7 Mar 2023 15:11:15 -0500 Subject: [PATCH 7/7] test std=0 Signed-off-by: Jinzhe Zeng --- tests/test_remove_outlier.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/test_remove_outlier.py b/tests/test_remove_outlier.py index faef12b38..192f4d8f0 100644 --- a/tests/test_remove_outlier.py +++ b/tests/test_remove_outlier.py @@ -29,3 +29,27 @@ def setUpClass(cls): cls.e_places = 6 cls.f_places = 6 cls.v_places = 6 + + +class TestRemoveOutlierStdZero(unittest.TestCase, CompLabeledSys): + @classmethod + def setUpClass(cls): + system = dpdata.LabeledSystem( + data={ + "atom_names": ["H"], + "atom_numbs": [1], + "atom_types": np.zeros((1,), dtype=int), + "coords": np.zeros((100, 1, 3), dtype=np.float32), + "cells": np.zeros((100, 3, 3), dtype=np.float32), + "orig": np.zeros(3, dtype=np.float32), + "nopbc": True, + "energies": np.zeros((100,), dtype=np.float32), + "forces": np.zeros((100, 1, 3), dtype=np.float32), + } + ) + cls.system_1 = system.remove_outlier() + cls.system_2 = system + cls.places = 6 + cls.e_places = 6 + cls.f_places = 6 + cls.v_places = 6