From 19f5398833bb31869ba152a1805686330c693587 Mon Sep 17 00:00:00 2001 From: Shaochen Shi Date: Sat, 31 Jul 2021 09:39:28 +0800 Subject: [PATCH 1/6] Find available GPUs in an elegant way. --- deepmd/cluster/local.py | 63 +++++++++++++++++++++++++++++++++++------ deepmd/cluster/slurm.py | 8 ++---- requirements.txt | 1 + 3 files changed, 58 insertions(+), 14 deletions(-) diff --git a/deepmd/cluster/local.py b/deepmd/cluster/local.py index 37a2e99b6e..388077edbd 100644 --- a/deepmd/cluster/local.py +++ b/deepmd/cluster/local.py @@ -1,10 +1,60 @@ -"""Get local GPU resources from `CUDA_VISIBLE_DEVICES` enviroment variable.""" - +"""Get local GPU resources.""" +import logging import os import socket + +import GPUtil + +from deepmd.env import tf from typing import List, Tuple, Optional -__all__ = ["get_resource"] +__all__ = ["get_gpus", "get_resource"] + +log = logging.getLogger(__name__) + + +def get_gpus(): + """Get available IDs of GPU cards at local. + These IDs are valid when used as the TensorFlow device ID. + + Returns: + ------- + Optional[List[int]] + List of available GPU IDs. Otherwise, None. + """ + # TODO: Create a pull request of `GPUtil` to cover ROCM devices. + # Currently, even if None is returned, a ROCM device is still visible in TensorFlow. + available = GPUtil.getGPUs() + num_gpus = len(available) + if num_gpus == 0: + return None + + # Print help messages + gpu_str_list = ["- %d#%s" % (item.id, item.name) for item in available] + log.info("Availalbe GPUs are:\n%s", "\n".join(gpu_str_list)) + + # Ensure TensorFlow is compatible + if num_gpus > 0 and not tf.test.is_built_with_gpu_support(): + log.warning("GPU devices are found while your installed TensorFlow has no GPU support!" + + " Switch to CPU device for calculation.") + return None + + # Warn for better GPU visibility + if "CUDA_VISIBLE_DEVICES" not in os.environ: + if num_gpus > 1: + log.warning("Multiple GPU devices are found while only the first one will be used!" 
+ + " It is recommended to limit GPU visibility by the environment variable" + + " `CUDA_VISIBLE_DEVICES`.") + return list(range(num_gpus)) + + # In case where user set "CUDA_VISIBLE_DEVICES=-1" to disable GPU usage + valid_ids = [] + for item in os.environ["CUDA_VISIBLE_DEVICES"].split(","): + idx = int(item) + if idx >= 0 and idx < num_gpus: + gpu_id = len(valid_ids) + valid_ids.append(gpu_id) + return valid_ids if len(valid_ids) > 0 else None # Always None if no GPU available def get_resource() -> Tuple[str, List[str], Optional[List[int]]]: @@ -17,10 +67,5 @@ def get_resource() -> Tuple[str, List[str], Optional[List[int]]]: """ nodename = socket.gethostname() nodelist = [nodename] - gpus_env = os.getenv("CUDA_VISIBLE_DEVICES", None) - if not gpus_env: - gpus = None - else: - gpus = [gpu for gpu in gpus_env.split(",")] - + gpus = get_gpus() return nodename, nodelist, gpus diff --git a/deepmd/cluster/slurm.py b/deepmd/cluster/slurm.py index df4ac3dbf9..6372d4d83b 100644 --- a/deepmd/cluster/slurm.py +++ b/deepmd/cluster/slurm.py @@ -7,6 +7,8 @@ import re import os + +from deepmd.cluster import local from typing import List, Tuple, Optional, Iterable __all__ = ["get_resource"] @@ -45,11 +47,7 @@ def get_resource() -> Tuple[str, List[str], Optional[List[int]]]: raise ValueError( f"Nodename({nodename}) not in nodelist({nodelist}). This should not happen!" 
) - gpus_env = os.getenv("CUDA_VISIBLE_DEVICES") - if not gpus_env: - gpus = None - else: - gpus = [int(gpu) for gpu in gpus_env.split(",")] + gpus = local.get_gpus() return nodename, nodelist, gpus diff --git a/requirements.txt b/requirements.txt index e3a8f501ab..146f5d9038 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ numpy scipy pyyaml dargs >= 0.2.2 +GPUtil >= 0.14.0 typing_extensions; python_version < "3.7" From 93f48b3bb61f7f252d8110b528630342a57003ea Mon Sep 17 00:00:00 2001 From: Shaochen Shi Date: Sat, 31 Jul 2021 10:33:59 +0800 Subject: [PATCH 2/6] Clean codes of preparing parallel context. --- deepmd/cluster/local.py | 8 ++--- deepmd/train/run_options.py | 68 ++++++------------------------------- 2 files changed, 14 insertions(+), 62 deletions(-) diff --git a/deepmd/cluster/local.py b/deepmd/cluster/local.py index 388077edbd..a9a49416ac 100644 --- a/deepmd/cluster/local.py +++ b/deepmd/cluster/local.py @@ -39,12 +39,8 @@ def get_gpus(): + " Switch to CPU device for calculation.") return None - # Warn for better GPU visibility + # All GPUs are avaiable if "CUDA_VISIBLE_DEVICES" not in os.environ: - if num_gpus > 1: - log.warning("Multiple GPU devices are found while only the first one will be used!" 
- + " It is recommended to limit GPU visibility by the environment variable" - + " `CUDA_VISIBLE_DEVICES`.") return list(range(num_gpus)) # In case where user set "CUDA_VISIBLE_DEVICES=-1" to disable GPU usage @@ -54,6 +50,8 @@ def get_gpus(): if idx >= 0 and idx < num_gpus: gpu_id = len(valid_ids) valid_ids.append(gpu_id) + else: + log.warning("GPU ID %d in `` is out of range and thus ignored!") return valid_ids if len(valid_ids) > 0 else None # Always None if no GPU available diff --git a/deepmd/train/run_options.py b/deepmd/train/run_options.py index 1a1145817a..4ac97e8b78 100644 --- a/deepmd/train/run_options.py +++ b/deepmd/train/run_options.py @@ -52,57 +52,6 @@ ) -def _is_distributed(HVD: "HVD") -> bool: - """Check if there are more than one MPI processes. - - Parameters - ---------- - HVD : HVD - Horovod object - - Returns - ------- - bool - True if we have more than 1 MPI process - """ - return HVD.size() > 1 - - -def _distributed_task_config( - HVD: "HVD", - gpu_list: Optional[List[int]] = None -) -> Tuple[int, int, str]: - """Create configuration for distributed tensorflow session. - - Parameters - ---------- - HVD : horovod.tensorflow - Horovod TensorFlow module - gpu_list : Optional[List[int]], optional - the list of GPUs on each node, by default None - - Returns - ------- - Tuple[int, int, str] - task count, index of this task, the device for this task - """ - my_rank = HVD.rank() - world_size = HVD.size() - - # setup gpu/cpu devices - if gpu_list is not None: - numb_gpu = len(gpu_list) - gpu_idx = HVD.local_rank() - if gpu_idx >= numb_gpu: - my_device = "cpu:0" # "cpu:%d" % node_task_idx - else: - my_device = f"gpu:{gpu_idx:d}" - else: - my_device = "cpu:0" # "cpu:%d" % node_task_idx - - return world_size, my_rank, my_device - - class RunOptions: """Class with inf oon how to run training (cluster, MPI and GPU config). 
@@ -225,7 +174,7 @@ def _try_init_distrib(self): try: import horovod.tensorflow as HVD HVD.init() - self.is_distrib = _is_distributed(HVD) + self.is_distrib = HVD.size() > 1 except ImportError: log.warning("Switch to serial execution due to lack of horovod module.") self.is_distrib = False @@ -250,11 +199,16 @@ def _init_distributed(self, HVD: "HVD"): self.nodename = nodename self.nodelist = nodelist self.gpus = gpus - ( - self.world_size, - self.my_rank, - self.my_device, - ) = _distributed_task_config(HVD, gpus) + self.my_rank = HVD.rank() + self.world_size = HVD.size() + + if gpus is not None: + gpu_idx = HVD.local_rank() + if gpu_idx >= len(gpus): + raise RuntimeError('Count of local processes is larger than that of available GPUs!') + my_device = f"gpu:{gpu_idx:d}" + else: + self.my_device = "cpu:0" def _init_serial(self): """Initialize setting for serial training.""" From 3010b50555b79f74c1454b15d688dd6ecdb846c7 Mon Sep 17 00:00:00 2001 From: Shaochen Shi Date: Sat, 31 Jul 2021 10:47:38 +0800 Subject: [PATCH 3/6] Fix code style and typo. --- deepmd/cluster/local.py | 8 +++++--- deepmd/train/run_options.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/deepmd/cluster/local.py b/deepmd/cluster/local.py index a9a49416ac..2a5619a59e 100644 --- a/deepmd/cluster/local.py +++ b/deepmd/cluster/local.py @@ -1,4 +1,5 @@ """Get local GPU resources.""" + import logging import os import socket @@ -35,8 +36,8 @@ def get_gpus(): # Ensure TensorFlow is compatible if num_gpus > 0 and not tf.test.is_built_with_gpu_support(): - log.warning("GPU devices are found while your installed TensorFlow has no GPU support!" - + " Switch to CPU device for calculation.") + log.warning("GPU devices are found while your installed TensorFlow has no GPU " + "support! 
Switch to CPU device for calculation.") return None # All GPUs are avaiable @@ -51,7 +52,8 @@ def get_gpus(): gpu_id = len(valid_ids) valid_ids.append(gpu_id) else: - log.warning("GPU ID %d in `` is out of range and thus ignored!") + log.warning("GPU ID %d in `CUDA_VISIBLE_DEVICES` is out of range and thus " + "ignored!", idx) return valid_ids if len(valid_ids) > 0 else None # Always None if no GPU available diff --git a/deepmd/train/run_options.py b/deepmd/train/run_options.py index 4ac97e8b78..7f12ed6d2e 100644 --- a/deepmd/train/run_options.py +++ b/deepmd/train/run_options.py @@ -206,7 +206,7 @@ def _init_distributed(self, HVD: "HVD"): gpu_idx = HVD.local_rank() if gpu_idx >= len(gpus): raise RuntimeError('Count of local processes is larger than that of available GPUs!') - my_device = f"gpu:{gpu_idx:d}" + self.my_device = f"gpu:{gpu_idx:d}" else: self.my_device = "cpu:0" From 6af0ef75feeae2c4b1494318993c99ca6d674c9c Mon Sep 17 00:00:00 2001 From: Shaochen Shi Date: Sun, 1 Aug 2021 09:10:07 +0800 Subject: [PATCH 4/6] Use a subprocess to detect GPU. --- deepmd/cluster/local.py | 52 +++++++++++-------------------------- deepmd/train/run_options.py | 13 +++++----- requirements.txt | 1 - 3 files changed, 22 insertions(+), 44 deletions(-) diff --git a/deepmd/cluster/local.py b/deepmd/cluster/local.py index 2a5619a59e..905cd2741d 100644 --- a/deepmd/cluster/local.py +++ b/deepmd/cluster/local.py @@ -1,17 +1,15 @@ """Get local GPU resources.""" -import logging import os import socket - -import GPUtil +import subprocess as sp +import sys from deepmd.env import tf from typing import List, Tuple, Optional -__all__ = ["get_gpus", "get_resource"] -log = logging.getLogger(__name__) +__all__ = ["get_gpus", "get_resource"] def get_gpus(): @@ -23,38 +21,18 @@ def get_gpus(): Optional[List[int]] List of available GPU IDs. Otherwise, None. """ - # TODO: Create a pull request of `GPUtil` to cover ROCM devices. 
- # Currently, even if None is returned, a ROCM device is still visible in TensorFlow. - available = GPUtil.getGPUs() - num_gpus = len(available) - if num_gpus == 0: - return None - - # Print help messages - gpu_str_list = ["- %d#%s" % (item.id, item.name) for item in available] - log.info("Availalbe GPUs are:\n%s", "\n".join(gpu_str_list)) - - # Ensure TensorFlow is compatible - if num_gpus > 0 and not tf.test.is_built_with_gpu_support(): - log.warning("GPU devices are found while your installed TensorFlow has no GPU " - "support! Switch to CPU device for calculation.") - return None - - # All GPUs are avaiable - if "CUDA_VISIBLE_DEVICES" not in os.environ: - return list(range(num_gpus)) - - # In case where user set "CUDA_VISIBLE_DEVICES=-1" to disable GPU usage - valid_ids = [] - for item in os.environ["CUDA_VISIBLE_DEVICES"].split(","): - idx = int(item) - if idx >= 0 and idx < num_gpus: - gpu_id = len(valid_ids) - valid_ids.append(gpu_id) - else: - log.warning("GPU ID %d in `CUDA_VISIBLE_DEVICES` is out of range and thus " - "ignored!", idx) - return valid_ids if len(valid_ids) > 0 else None # Always None if no GPU available + test_cmd = 'from tensorflow.python.client import device_lib; ' \ + 'devices = device_lib.list_local_devices(); ' \ + 'gpus = [d.name for d in devices if d.device_type == "GPU"]; ' \ + 'print(len(gpus))' + p = sp.Popen([sys.executable, "-c", test_cmd], stderr=sp.PIPE, stdout=sp.PIPE) + stdout, stderr = p.communicate() + if p.returncode != 0: + decoded = stderr.decode('UTF-8') + raise RuntimeError('Failed to detect availbe GPUs due to:\n%s' % decoded) + decoded = stdout.decode('UTF-8').strip() + num_gpus = int(decoded) + return list(range(num_gpus)) if num_gpus > 0 else None def get_resource() -> Tuple[str, List[str], Optional[List[int]]]: diff --git a/deepmd/train/run_options.py b/deepmd/train/run_options.py index 7f12ed6d2e..2b1e73a5aa 100644 --- a/deepmd/train/run_options.py +++ b/deepmd/train/run_options.py @@ -123,14 +123,15 @@ def 
print_resource_summary(self): log.info("---Summary of the training---------------------------------------") if self.is_distrib: log.info("distributed") - log.info(f"world size: {self.world_size}") + log.info(f"world size: {self.world_size}") log.info(f"my rank: {self.my_rank}") - log.info(f"node list: {self.nodelist}") + log.info(f"node list: {self.nodelist}") log.info(f"running on: {self.nodename}") - if self.gpus is None: - log.info(f"CUDA_VISIBLE_DEVICES: unset") - else: - log.info(f"CUDA_VISIBLE_DEVICES: {self.gpus}") + log.info(f"computing device: {self.my_device}") + if tf.test.is_built_with_gpu_support(): + env_value = os.environ.get('CUDA_VISIBLE_DEVICES', 'unset') + log.info(f"CUDA_VISIBLE_DEVICES: {env_value}") + log.info(f"Count of visible GPU: {len(self.gpus)}") intra, inter = get_tf_default_nthreads() log.info(f"num_intra_threads: {intra:d}") log.info(f"num_inter_threads: {inter:d}") diff --git a/requirements.txt b/requirements.txt index b7159fe643..21befa3722 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,5 +2,4 @@ numpy scipy pyyaml dargs >= 0.2.6 -GPUtil >= 0.14.0 typing_extensions; python_version < "3.7" From 65a68f66863f13c05e60fbc7096f7a8fc17307c6 Mon Sep 17 00:00:00 2001 From: Shaochen Shi Date: Sun, 1 Aug 2021 09:40:21 +0800 Subject: [PATCH 5/6] Use Popen as a context manager. 
--- deepmd/cluster/local.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/deepmd/cluster/local.py b/deepmd/cluster/local.py index 905cd2741d..0bc9ec7698 100644 --- a/deepmd/cluster/local.py +++ b/deepmd/cluster/local.py @@ -25,14 +25,14 @@ def get_gpus(): 'devices = device_lib.list_local_devices(); ' \ 'gpus = [d.name for d in devices if d.device_type == "GPU"]; ' \ 'print(len(gpus))' - p = sp.Popen([sys.executable, "-c", test_cmd], stderr=sp.PIPE, stdout=sp.PIPE) - stdout, stderr = p.communicate() - if p.returncode != 0: - decoded = stderr.decode('UTF-8') - raise RuntimeError('Failed to detect availbe GPUs due to:\n%s' % decoded) - decoded = stdout.decode('UTF-8').strip() - num_gpus = int(decoded) - return list(range(num_gpus)) if num_gpus > 0 else None + with sp.Popen([sys.executable, "-c", test_cmd], stderr=sp.PIPE, stdout=sp.PIPE) as p: + stdout, stderr = p.communicate() + if p.returncode != 0: + decoded = stderr.decode('UTF-8') + raise RuntimeError('Failed to detect availbe GPUs due to:\n%s' % decoded) + decoded = stdout.decode('UTF-8').strip() + num_gpus = int(decoded) + return list(range(num_gpus)) if num_gpus > 0 else None def get_resource() -> Tuple[str, List[str], Optional[List[int]]]: From 5401ab2755d1a330df782703ba67c862a1d999a4 Mon Sep 17 00:00:00 2001 From: Shaochen Shi Date: Sun, 1 Aug 2021 11:26:34 +0800 Subject: [PATCH 6/6] Do not use `tf.test.is_built_with_gpu_support`. 
--- deepmd/train/run_options.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/deepmd/train/run_options.py b/deepmd/train/run_options.py index 2b1e73a5aa..1ad5fc4574 100644 --- a/deepmd/train/run_options.py +++ b/deepmd/train/run_options.py @@ -128,10 +128,9 @@ def print_resource_summary(self): log.info(f"node list: {self.nodelist}") log.info(f"running on: {self.nodename}") log.info(f"computing device: {self.my_device}") - if tf.test.is_built_with_gpu_support(): - env_value = os.environ.get('CUDA_VISIBLE_DEVICES', 'unset') - log.info(f"CUDA_VISIBLE_DEVICES: {env_value}") - log.info(f"Count of visible GPU: {len(self.gpus)}") + env_value = os.environ.get('CUDA_VISIBLE_DEVICES', 'unset') + log.info(f"CUDA_VISIBLE_DEVICES: {env_value}") + log.info(f"Count of visible GPU: {len(self.gpus or [])}") intra, inter = get_tf_default_nthreads() log.info(f"num_intra_threads: {intra:d}") log.info(f"num_inter_threads: {inter:d}")