From 8a0a0c86431995b06c6464ed3107062c1f0cb135 Mon Sep 17 00:00:00 2001 From: Shaochen Shi Date: Sun, 1 Aug 2021 21:37:37 +0800 Subject: [PATCH 1/2] Add unit tests of `cluster` and `env`. --- deepmd/env.py | 1 + source/tests/test_cluster.py | 109 +++++++++++++++++++++++++++++++++++ source/tests/test_env.py | 42 ++++++++++++++ 3 files changed, 152 insertions(+) create mode 100644 source/tests/test_cluster.py create mode 100644 source/tests/test_env.py diff --git a/deepmd/env.py b/deepmd/env.py index 925976a09b..ace9d3b76e 100644 --- a/deepmd/env.py +++ b/deepmd/env.py @@ -32,6 +32,7 @@ "MODEL_VERSION", "SHARED_LIB_MODULE", "default_tf_session_config", + "reset_default_tf_session_config", "op_module", "op_grads_module", ] diff --git a/source/tests/test_cluster.py b/source/tests/test_cluster.py new file mode 100644 index 0000000000..e9ef366e96 --- /dev/null +++ b/source/tests/test_cluster.py @@ -0,0 +1,109 @@ +import unittest + +from deepmd.cluster import local, slurm +from unittest import mock + + +kHostName = 'org.deepmd.unittest' + + +class FakePopen(object): + def __init__(self, stdout=b'', stderr=b'', returncode=0): + self._stdout = stdout + self._stderr = stderr + self._returncode = returncode + + def communicate(self): + return self._stdout, self._stderr + + @property + def returncode(self): + return self._returncode + + +class TestGPU(unittest.TestCase): + @mock.patch('subprocess.Popen') + def test_none(self, mock_Popen): + mock_Popen.return_value.__enter__.return_value = FakePopen(b'0', b'') + gpus = local.get_gpus() + self.assertIsNone(gpus) + + @mock.patch('subprocess.Popen') + def test_valid(self, mock_Popen): + mock_Popen.return_value.__enter__.return_value = FakePopen(b'2', b'') + gpus = local.get_gpus() + self.assertEqual(gpus, [0, 1]) + + @mock.patch('subprocess.Popen') + def test_error(self, mock_Popen): + mock_Popen.return_value.__enter__.return_value = \ + FakePopen(stderr=b'!', returncode=1) + with self.assertRaises(RuntimeError) as cm: + gpus = local.get_gpus() + self.assertIn('Failed to detect', str(cm.exception)) + + +class TestLocal(unittest.TestCase): + @mock.patch('socket.gethostname') + def test_resource(self, mock_gethostname): + mock_gethostname.return_value = kHostName + nodename, nodelist, _ = local.get_resource() + self.assertEqual(nodename, kHostName) + self.assertEqual(nodelist, [kHostName]) + + +class TestSlurm(unittest.TestCase): + @mock.patch.dict('os.environ', values={ + 'SLURM_JOB_NODELIST': kHostName, + 'SLURMD_NODENAME': kHostName, + 'SLURM_JOB_NUM_NODES': '1' + }) + def test_single(self): + nodename, nodelist, _ = slurm.get_resource() + self.assertEqual(nodename, kHostName) + self.assertEqual(nodelist, [kHostName]) + + @mock.patch.dict('os.environ', values={ + 'SLURM_JOB_NODELIST': 'org.deepmd.host-[3-5],com.github.jack', + 'SLURMD_NODENAME': 'org.deepmd.host-4', + 'SLURM_JOB_NUM_NODES': '4' + }) + def test_multiple(self): + nodename, nodelist, _ = slurm.get_resource() + self.assertEqual(nodename, 'org.deepmd.host-4') + self.assertEqual(nodelist, [ + 'org.deepmd.host-3', + 'org.deepmd.host-4', + 'org.deepmd.host-5', + 'com.github.jack' + ]) + + def test_illegal(self): + environ = { + 'SLURM_JOB_NODELIST': 'org.deepmd.host-[3-5]', + 'SLURMD_NODENAME': 'org.deepmd.host-4' + } + with mock.patch.dict('os.environ', environ): + with self.assertRaises(RuntimeError) as cm: + nodename, nodelist, _ = slurm.get_resource() + self.assertIn('Could not get SLURM number', str(cm.exception)) + + environ = { + 'SLURM_JOB_NODELIST': 'org.deepmd.mike,com.github.jack', + 'SLURMD_NODENAME': 'org.deepmd.mike', + 'SLURM_JOB_NUM_NODES': '4' + } + with mock.patch.dict('os.environ', environ): + with self.assertRaises(ValueError) as cm: + nodename, nodelist, _ = slurm.get_resource() + self.assertIn('Number of slurm nodes 2', str(cm.exception)) + + environ = { + 'SLURM_JOB_NODELIST': 'org.deepmd.bob,com.github.jack', + 'SLURMD_NODENAME': 'org.deepmd.mike', + 'SLURM_JOB_NUM_NODES': '2' + } + with mock.patch.dict('os.environ', environ): + with self.assertRaises(ValueError) as cm: + nodename, nodelist, _ = slurm.get_resource() + self.assertIn('Nodename(org.deepmd.mike', str(cm.exception)) diff --git a/source/tests/test_env.py b/source/tests/test_env.py new file mode 100644 index 0000000000..ea886046f4 --- /dev/null +++ b/source/tests/test_env.py @@ -0,0 +1,42 @@ +import unittest + +from deepmd import env +from unittest import mock + + +class TestTFThreadCount(unittest.TestCase): + @mock.patch.dict('os.environ', values={}) + def test_empty(self): + intra, inter = env.get_tf_default_nthreads() + self.assertEqual(intra, 0) + self.assertEqual(inter, 0) + + @mock.patch.dict('os.environ', values={ + 'TF_INTRA_OP_PARALLELISM_THREADS': '5', + 'TF_INTER_OP_PARALLELISM_THREADS': '3' + }) + def test_given(self): + intra, inter = env.get_tf_default_nthreads() + self.assertEqual(intra, 5) + self.assertEqual(inter, 3) + + +class TestTFSessionConfig(unittest.TestCase): + def test_default(self): + shared = env.default_tf_session_config + new = env.get_tf_session_config() + self.assertNotEqual(id(shared), id(new)) + + @mock.patch('deepmd.env.get_tf_default_nthreads') + def test_get(self, mock_method): + mock_method.return_value = (5, 3) + config = env.get_tf_session_config() + self.assertEqual(config.intra_op_parallelism_threads, 5) + self.assertEqual(config.inter_op_parallelism_threads, 3) + + def test_reset(self): + shared = env.default_tf_session_config + env.reset_default_tf_session_config(True) + self.assertEqual(shared.device_count['GPU'], 0) + env.reset_default_tf_session_config(False) + self.assertEqual(len(shared.device_count), 0) From 79efdb560c715ab23d79d240363874e9b23bb958 Mon Sep 17 00:00:00 2001 From: Shaochen Shi Date: Mon, 2 Aug 2021 09:46:18 +0800 Subject: [PATCH 2/2] Fix the expanding logic of `SLURM_JOB_NODELIST`. --- deepmd/cluster/slurm.py | 37 +++-------------------------- requirements.txt | 1 + source/tests/test_cluster.py | 46 ++++++++++++++++++++---------------- 3 files changed, 30 insertions(+), 54 deletions(-) diff --git a/deepmd/cluster/slurm.py b/deepmd/cluster/slurm.py index 6372d4d83b..feafd84117 100644 --- a/deepmd/cluster/slurm.py +++ b/deepmd/cluster/slurm.py @@ -5,11 +5,11 @@ https://github.com/deepsense-ai/tensorflow_on_slurm #### """ -import re +import hostlist import os from deepmd.cluster import local -from typing import List, Tuple, Optional, Iterable +from typing import List, Tuple, Optional __all__ = ["get_resource"] @@ -31,7 +31,7 @@ def get_resource() -> Tuple[str, List[str], Optional[List[int]]]: ValueError if current nodename is not found in node list """ - nodelist = _expand_nodelist(os.environ["SLURM_JOB_NODELIST"]) + nodelist = hostlist.expand_hostlist(os.environ["SLURM_JOB_NODELIST"]) nodename = os.environ["SLURMD_NODENAME"] num_nodes_env = os.getenv("SLURM_JOB_NUM_NODES") if num_nodes_env: @@ -49,34 +49,3 @@ def get_resource() -> Tuple[str, List[str], Optional[List[int]]]: ) gpus = local.get_gpus() return nodename, nodelist, gpus - - -def _pad_zeros(iterable: Iterable, length: int): - return (str(t).rjust(length, "0") for t in iterable) - - -def _expand_ids(ids: str) -> List[str]: - result = [] - for _id in ids.split(","): - if "-" in _id: - str_end = _id.split("-")[1] - begin, end = [int(token) for token in _id.split("-")] - result.extend(_pad_zeros(range(begin, end + 1), len(str_end))) - else: - result.append(_id) - return result - - -def _expand_nodelist(nodelist: str) -> List[str]: - result = [] - interval_list = nodelist.split(",") - for interval in interval_list: - match = re.search(r"(.*)\[(.*)\]", interval) - if match: - prefix = match.group(1) - ids = match.group(2) - ids_list = _expand_ids(ids) - result.extend([f"{prefix}{_id}" for _id in ids_list]) - else: - result.append(interval) - return result diff --git a/requirements.txt b/requirements.txt index 21befa3722..50b597f2fe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ numpy scipy pyyaml dargs >= 0.2.6 +python-hostlist >= 1.21 typing_extensions; python_version < "3.7" diff --git a/source/tests/test_cluster.py b/source/tests/test_cluster.py index e9ef366e96..01e128b401 100644 --- a/source/tests/test_cluster.py +++ b/source/tests/test_cluster.py @@ -4,7 +4,7 @@ from unittest import mock -kHostName = 'org.deepmd.unittest' +kHostName = 'compute-b24-1' class FakePopen(object): @@ -39,7 +39,7 @@ def test_error(self, mock_Popen): mock_Popen.return_value.__enter__.return_value = \ FakePopen(stderr=b'!', returncode=1) with self.assertRaises(RuntimeError) as cm: - gpus = local.get_gpus() + _ = local.get_gpus() self.assertIn('Failed to detect', str(cm.exception)) @@ -64,46 +64,52 @@ def test_single(self): self.assertEqual(nodelist, [kHostName]) @mock.patch.dict('os.environ', values={ - 'SLURM_JOB_NODELIST': 'org.deepmd.host-[3-5],com.github.jack', - 'SLURMD_NODENAME': 'org.deepmd.host-4', - 'SLURM_JOB_NUM_NODES': '4' + 'SLURM_JOB_NODELIST': 'compute-b24-[1-3,5-9],compute-b25-[4,8]', + 'SLURMD_NODENAME': 'compute-b24-2', + 'SLURM_JOB_NUM_NODES': '10' }) def test_multiple(self): nodename, nodelist, _ = slurm.get_resource() - self.assertEqual(nodename, 'org.deepmd.host-4') + self.assertEqual(nodename, 'compute-b24-2') self.assertEqual(nodelist, [ - 'org.deepmd.host-3', - 'org.deepmd.host-4', - 'org.deepmd.host-5', - 'com.github.jack' + 'compute-b24-1', + 'compute-b24-2', + 'compute-b24-3', + 'compute-b24-5', + 'compute-b24-6', + 'compute-b24-7', + 'compute-b24-8', + 'compute-b24-9', + 'compute-b25-4', + 'compute-b25-8' ]) def test_illegal(self): environ = { - 'SLURM_JOB_NODELIST': 'org.deepmd.host-[3-5]', - 'SLURMD_NODENAME': 'org.deepmd.host-4' + 'SLURM_JOB_NODELIST': 'compute-b24-[3-5]', + 'SLURMD_NODENAME': 'compute-b24-4' } with mock.patch.dict('os.environ', environ): with self.assertRaises(RuntimeError) as cm: - nodename, nodelist, _ = slurm.get_resource() + _ = slurm.get_resource() self.assertIn('Could not get SLURM number', str(cm.exception)) environ = { - 'SLURM_JOB_NODELIST': 'org.deepmd.mike,com.github.jack', - 'SLURMD_NODENAME': 'org.deepmd.mike', + 'SLURM_JOB_NODELIST': 'compute-b24-1,compute-b25-2', + 'SLURMD_NODENAME': 'compute-b25-2', 'SLURM_JOB_NUM_NODES': '4' } with mock.patch.dict('os.environ', environ): with self.assertRaises(ValueError) as cm: - nodename, nodelist, _ = slurm.get_resource() + _ = slurm.get_resource() self.assertIn('Number of slurm nodes 2', str(cm.exception)) environ = { - 'SLURM_JOB_NODELIST': 'org.deepmd.bob,com.github.jack', - 'SLURMD_NODENAME': 'org.deepmd.mike', + 'SLURM_JOB_NODELIST': 'compute-b24-1,compute-b25-3', + 'SLURMD_NODENAME': 'compute-b25-2', 'SLURM_JOB_NUM_NODES': '2' } with mock.patch.dict('os.environ', environ): with self.assertRaises(ValueError) as cm: - nodename, nodelist, _ = slurm.get_resource() - self.assertIn('Nodename(org.deepmd.mike', str(cm.exception)) + _ = slurm.get_resource() + self.assertIn('Nodename(compute-b25-2', str(cm.exception))