From 19f5398833bb31869ba152a1805686330c693587 Mon Sep 17 00:00:00 2001
From: Shaochen Shi <shishaochen_ha@sina.com>
Date: Sat, 31 Jul 2021 09:39:28 +0800
Subject: [PATCH 1/6] Find available GPUs in an elegant way.

---
 deepmd/cluster/local.py | 63 +++++++++++++++++++++++++++++++++++------
 deepmd/cluster/slurm.py |  8 ++----
 requirements.txt        |  1 +
 3 files changed, 58 insertions(+), 14 deletions(-)

diff --git a/deepmd/cluster/local.py b/deepmd/cluster/local.py
index 37a2e99b6e..388077edbd 100644
--- a/deepmd/cluster/local.py
+++ b/deepmd/cluster/local.py
@@ -1,10 +1,60 @@
-"""Get local GPU resources from `CUDA_VISIBLE_DEVICES` enviroment variable."""
-
+"""Get local GPU resources."""
+import logging
 import os
 import socket
+
+import GPUtil
+
+from deepmd.env import tf
 from typing import List, Tuple, Optional
 
-__all__ = ["get_resource"]
+__all__ = ["get_gpus", "get_resource"]
+
+log = logging.getLogger(__name__)
+
+
+def get_gpus():
+    """Get available IDs of GPU cards at local.
+    These IDs are valid when used as the TensorFlow device ID.
+
+    Returns:
+    -------
+    Optional[List[int]]
+        List of available GPU IDs. Otherwise, None.
+    """
+    # TODO: Create a pull request of `GPUtil` to cover ROCM devices.
+    # Currently, even if None is returned, a ROCM device is still visible in TensorFlow.
+    available = GPUtil.getGPUs()
+    num_gpus = len(available)
+    if num_gpus == 0:
+        return None
+
+    # Print help messages
+    gpu_str_list = ["- %d#%s" % (item.id, item.name) for item in available]
+    log.info("Availalbe GPUs are:\n%s", "\n".join(gpu_str_list))
+
+    # Ensure TensorFlow is compatible
+    if num_gpus > 0 and not tf.test.is_built_with_gpu_support():
+        log.warning("GPU devices are found while your installed TensorFlow has no GPU support!"
+            + " Switch to CPU device for calculation.")
+        return None
+
+    # Warn for better GPU visibility
+    if "CUDA_VISIBLE_DEVICES" not in os.environ:
+        if num_gpus > 1:
+            log.warning("Multiple GPU devices are found while only the first one will be used!"
+            + " It is recommended to limit GPU visibility by the environment variable"
+            + " `CUDA_VISIBLE_DEVICES`.")
+        return list(range(num_gpus))
+
+    # In case where user set "CUDA_VISIBLE_DEVICES=-1" to disable GPU usage
+    valid_ids = []
+    for item in os.environ["CUDA_VISIBLE_DEVICES"].split(","):
+        idx = int(item)
+        if idx >= 0 and idx < num_gpus:
+            gpu_id = len(valid_ids)
+            valid_ids.append(gpu_id)
+    return valid_ids if len(valid_ids) > 0 else None  # Always None if no GPU available
 
 
 def get_resource() -> Tuple[str, List[str], Optional[List[int]]]:
@@ -17,10 +67,5 @@ def get_resource() -> Tuple[str, List[str], Optional[List[int]]]:
     """
     nodename = socket.gethostname()
     nodelist = [nodename]
-    gpus_env = os.getenv("CUDA_VISIBLE_DEVICES", None)
-    if not gpus_env:
-        gpus = None
-    else:
-        gpus = [gpu for gpu in gpus_env.split(",")]
-
+    gpus = get_gpus()
     return nodename, nodelist, gpus
diff --git a/deepmd/cluster/slurm.py b/deepmd/cluster/slurm.py
index df4ac3dbf9..6372d4d83b 100644
--- a/deepmd/cluster/slurm.py
+++ b/deepmd/cluster/slurm.py
@@ -7,6 +7,8 @@
 
 import re
 import os
+
+from deepmd.cluster import local
 from typing import List, Tuple, Optional, Iterable
 
 __all__ = ["get_resource"]
@@ -45,11 +47,7 @@ def get_resource() -> Tuple[str, List[str], Optional[List[int]]]:
         raise ValueError(
             f"Nodename({nodename}) not in nodelist({nodelist}). This should not happen!"
         )
-    gpus_env = os.getenv("CUDA_VISIBLE_DEVICES")
-    if not gpus_env:
-        gpus = None
-    else:
-        gpus = [int(gpu) for gpu in gpus_env.split(",")]
+    gpus = local.get_gpus()
     return nodename, nodelist, gpus
 
 
diff --git a/requirements.txt b/requirements.txt
index e3a8f501ab..146f5d9038 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,5 @@ numpy
 scipy
 pyyaml
 dargs >= 0.2.2
+GPUtil >= 0.14.0
 typing_extensions; python_version < "3.7"

From 93f48b3bb61f7f252d8110b528630342a57003ea Mon Sep 17 00:00:00 2001
From: Shaochen Shi <shishaochen_ha@sina.com>
Date: Sat, 31 Jul 2021 10:33:59 +0800
Subject: [PATCH 2/6] Clean up the code that prepares the parallel context.

---
 deepmd/cluster/local.py     |  8 ++---
 deepmd/train/run_options.py | 68 ++++++-------------------------------
 2 files changed, 14 insertions(+), 62 deletions(-)

diff --git a/deepmd/cluster/local.py b/deepmd/cluster/local.py
index 388077edbd..a9a49416ac 100644
--- a/deepmd/cluster/local.py
+++ b/deepmd/cluster/local.py
@@ -39,12 +39,8 @@ def get_gpus():
             + " Switch to CPU device for calculation.")
         return None
 
-    # Warn for better GPU visibility
+    # All GPUs are avaiable
     if "CUDA_VISIBLE_DEVICES" not in os.environ:
-        if num_gpus > 1:
-            log.warning("Multiple GPU devices are found while only the first one will be used!"
-            + " It is recommended to limit GPU visibility by the environment variable"
-            + " `CUDA_VISIBLE_DEVICES`.")
         return list(range(num_gpus))
 
     # In case where user set "CUDA_VISIBLE_DEVICES=-1" to disable GPU usage
@@ -54,6 +50,8 @@ def get_gpus():
         if idx >= 0 and idx < num_gpus:
             gpu_id = len(valid_ids)
             valid_ids.append(gpu_id)
+        else:
+            log.warning("GPU ID %d in `` is out of range and thus ignored!")
     return valid_ids if len(valid_ids) > 0 else None  # Always None if no GPU available
 
 
diff --git a/deepmd/train/run_options.py b/deepmd/train/run_options.py
index 1a1145817a..4ac97e8b78 100644
--- a/deepmd/train/run_options.py
+++ b/deepmd/train/run_options.py
@@ -52,57 +52,6 @@
 )
 
 
-def _is_distributed(HVD: "HVD") -> bool:
-    """Check if there are more than one MPI processes.
-
-    Parameters
-    ----------
-    HVD : HVD
-        Horovod object
-
-    Returns
-    -------
-    bool
-        True if we have more than 1 MPI process
-    """
-    return HVD.size() > 1
-
-
-def _distributed_task_config(
-    HVD: "HVD",
-    gpu_list: Optional[List[int]] = None
-) -> Tuple[int, int, str]:
-    """Create configuration for distributed tensorflow session.
-
-    Parameters
-    ----------
-    HVD : horovod.tensorflow
-        Horovod TensorFlow module
-    gpu_list : Optional[List[int]], optional
-        the list of GPUs on each node, by default None
-
-    Returns
-    -------
-    Tuple[int, int, str]
-        task count, index of this task, the device for this task
-    """
-    my_rank = HVD.rank()
-    world_size = HVD.size()
-
-    # setup gpu/cpu devices
-    if gpu_list is not None:
-        numb_gpu = len(gpu_list)
-        gpu_idx = HVD.local_rank()
-        if gpu_idx >= numb_gpu:
-            my_device = "cpu:0"  # "cpu:%d" % node_task_idx
-        else:
-            my_device = f"gpu:{gpu_idx:d}"
-    else:
-        my_device = "cpu:0"  # "cpu:%d" % node_task_idx
-
-    return world_size, my_rank, my_device
-
-
 class RunOptions:
     """Class with inf oon how to run training (cluster, MPI and GPU config).
 
@@ -225,7 +174,7 @@ def _try_init_distrib(self):
         try:
             import horovod.tensorflow as HVD
             HVD.init()
-            self.is_distrib = _is_distributed(HVD)
+            self.is_distrib = HVD.size() > 1
         except ImportError:
             log.warning("Switch to serial execution due to lack of horovod module.")
             self.is_distrib = False
@@ -250,11 +199,16 @@ def _init_distributed(self, HVD: "HVD"):
         self.nodename = nodename
         self.nodelist = nodelist
         self.gpus = gpus
-        (
-            self.world_size,
-            self.my_rank,
-            self.my_device,
-        ) = _distributed_task_config(HVD, gpus)
+        self.my_rank = HVD.rank()
+        self.world_size = HVD.size()
+
+        if gpus is not None:
+            gpu_idx = HVD.local_rank()
+            if gpu_idx >= len(gpus):
+                raise RuntimeError('Count of local processes is larger than that of available GPUs!')
+            my_device = f"gpu:{gpu_idx:d}"
+        else:
+            self.my_device = "cpu:0"
 
     def _init_serial(self):
         """Initialize setting for serial training."""

From 3010b50555b79f74c1454b15d688dd6ecdb846c7 Mon Sep 17 00:00:00 2001
From: Shaochen Shi <shishaochen_ha@sina.com>
Date: Sat, 31 Jul 2021 10:47:38 +0800
Subject: [PATCH 3/6] Fix code style and typo.

---
 deepmd/cluster/local.py     | 8 +++++---
 deepmd/train/run_options.py | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/deepmd/cluster/local.py b/deepmd/cluster/local.py
index a9a49416ac..2a5619a59e 100644
--- a/deepmd/cluster/local.py
+++ b/deepmd/cluster/local.py
@@ -1,4 +1,5 @@
 """Get local GPU resources."""
+
 import logging
 import os
 import socket
@@ -35,8 +36,8 @@ def get_gpus():
 
     # Ensure TensorFlow is compatible
     if num_gpus > 0 and not tf.test.is_built_with_gpu_support():
-        log.warning("GPU devices are found while your installed TensorFlow has no GPU support!"
-            + " Switch to CPU device for calculation.")
+        log.warning("GPU devices are found while your installed TensorFlow has no GPU "
+                    "support! Switch to CPU device for calculation.")
         return None
 
     # All GPUs are avaiable
@@ -51,7 +52,8 @@ def get_gpus():
             gpu_id = len(valid_ids)
             valid_ids.append(gpu_id)
         else:
-            log.warning("GPU ID %d in `` is out of range and thus ignored!")
+            log.warning("GPU ID %d in `CUDA_VISIBLE_DEVICES` is out of range and thus "
+                        "ignored!", idx)
     return valid_ids if len(valid_ids) > 0 else None  # Always None if no GPU available
 
 
diff --git a/deepmd/train/run_options.py b/deepmd/train/run_options.py
index 4ac97e8b78..7f12ed6d2e 100644
--- a/deepmd/train/run_options.py
+++ b/deepmd/train/run_options.py
@@ -206,7 +206,7 @@ def _init_distributed(self, HVD: "HVD"):
             gpu_idx = HVD.local_rank()
             if gpu_idx >= len(gpus):
                 raise RuntimeError('Count of local processes is larger than that of available GPUs!')
-            my_device = f"gpu:{gpu_idx:d}"
+            self.my_device = f"gpu:{gpu_idx:d}"
         else:
             self.my_device = "cpu:0"
 

From 6af0ef75feeae2c4b1494318993c99ca6d674c9c Mon Sep 17 00:00:00 2001
From: Shaochen Shi <shishaochen_ha@sina.com>
Date: Sun, 1 Aug 2021 09:10:07 +0800
Subject: [PATCH 4/6] Use a subprocess to detect GPU.

---
 deepmd/cluster/local.py     | 52 +++++++++++--------------------------
 deepmd/train/run_options.py | 13 +++++-----
 requirements.txt            |  1 -
 3 files changed, 22 insertions(+), 44 deletions(-)

diff --git a/deepmd/cluster/local.py b/deepmd/cluster/local.py
index 2a5619a59e..905cd2741d 100644
--- a/deepmd/cluster/local.py
+++ b/deepmd/cluster/local.py
@@ -1,17 +1,15 @@
 """Get local GPU resources."""
 
-import logging
 import os
 import socket
-
-import GPUtil
+import subprocess as sp
+import sys
 
 from deepmd.env import tf
 from typing import List, Tuple, Optional
 
-__all__ = ["get_gpus", "get_resource"]
 
-log = logging.getLogger(__name__)
+__all__ = ["get_gpus", "get_resource"]
 
 
 def get_gpus():
@@ -23,38 +21,18 @@ def get_gpus():
     Optional[List[int]]
         List of available GPU IDs. Otherwise, None.
     """
-    # TODO: Create a pull request of `GPUtil` to cover ROCM devices.
-    # Currently, even if None is returned, a ROCM device is still visible in TensorFlow.
-    available = GPUtil.getGPUs()
-    num_gpus = len(available)
-    if num_gpus == 0:
-        return None
-
-    # Print help messages
-    gpu_str_list = ["- %d#%s" % (item.id, item.name) for item in available]
-    log.info("Availalbe GPUs are:\n%s", "\n".join(gpu_str_list))
-
-    # Ensure TensorFlow is compatible
-    if num_gpus > 0 and not tf.test.is_built_with_gpu_support():
-        log.warning("GPU devices are found while your installed TensorFlow has no GPU "
-                    "support! Switch to CPU device for calculation.")
-        return None
-
-    # All GPUs are avaiable
-    if "CUDA_VISIBLE_DEVICES" not in os.environ:
-        return list(range(num_gpus))
-
-    # In case where user set "CUDA_VISIBLE_DEVICES=-1" to disable GPU usage
-    valid_ids = []
-    for item in os.environ["CUDA_VISIBLE_DEVICES"].split(","):
-        idx = int(item)
-        if idx >= 0 and idx < num_gpus:
-            gpu_id = len(valid_ids)
-            valid_ids.append(gpu_id)
-        else:
-            log.warning("GPU ID %d in `CUDA_VISIBLE_DEVICES` is out of range and thus "
-                        "ignored!", idx)
-    return valid_ids if len(valid_ids) > 0 else None  # Always None if no GPU available
+    test_cmd = 'from tensorflow.python.client import device_lib; ' \
+               'devices = device_lib.list_local_devices(); ' \
+               'gpus = [d.name for d in devices if d.device_type == "GPU"]; ' \
+               'print(len(gpus))'
+    p = sp.Popen([sys.executable, "-c", test_cmd], stderr=sp.PIPE, stdout=sp.PIPE)
+    stdout, stderr = p.communicate()
+    if p.returncode != 0:
+        decoded = stderr.decode('UTF-8')
+        raise RuntimeError('Failed to detect availbe GPUs due to:\n%s' % decoded)
+    decoded = stdout.decode('UTF-8').strip()
+    num_gpus = int(decoded)
+    return list(range(num_gpus)) if num_gpus > 0 else None
 
 
 def get_resource() -> Tuple[str, List[str], Optional[List[int]]]:
diff --git a/deepmd/train/run_options.py b/deepmd/train/run_options.py
index 7f12ed6d2e..2b1e73a5aa 100644
--- a/deepmd/train/run_options.py
+++ b/deepmd/train/run_options.py
@@ -123,14 +123,15 @@ def print_resource_summary(self):
         log.info("---Summary of the training---------------------------------------")
         if self.is_distrib:
             log.info("distributed")
-            log.info(f"world size:              {self.world_size}")
+            log.info(f"world size:           {self.world_size}")
             log.info(f"my rank:              {self.my_rank}")
-            log.info(f"node list:          {self.nodelist}")
+            log.info(f"node list:            {self.nodelist}")
         log.info(f"running on:           {self.nodename}")
-        if self.gpus is None:
-            log.info(f"CUDA_VISIBLE_DEVICES: unset")
-        else:
-            log.info(f"CUDA_VISIBLE_DEVICES: {self.gpus}")
+        log.info(f"computing device:     {self.my_device}")
+        if tf.test.is_built_with_gpu_support():
+            env_value = os.environ.get('CUDA_VISIBLE_DEVICES', 'unset')
+            log.info(f"CUDA_VISIBLE_DEVICES: {env_value}")
+            log.info(f"Count of visible GPU: {len(self.gpus)}")
         intra, inter = get_tf_default_nthreads()
         log.info(f"num_intra_threads:    {intra:d}")
         log.info(f"num_inter_threads:    {inter:d}")
diff --git a/requirements.txt b/requirements.txt
index b7159fe643..21befa3722 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,5 +2,4 @@ numpy
 scipy
 pyyaml
 dargs >= 0.2.6
-GPUtil >= 0.14.0
 typing_extensions; python_version < "3.7"

From 65a68f66863f13c05e60fbc7096f7a8fc17307c6 Mon Sep 17 00:00:00 2001
From: Shaochen Shi <shishaochen_ha@sina.com>
Date: Sun, 1 Aug 2021 09:40:21 +0800
Subject: [PATCH 5/6] Use Popen as a context manager.

---
 deepmd/cluster/local.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/deepmd/cluster/local.py b/deepmd/cluster/local.py
index 905cd2741d..0bc9ec7698 100644
--- a/deepmd/cluster/local.py
+++ b/deepmd/cluster/local.py
@@ -25,14 +25,14 @@ def get_gpus():
                'devices = device_lib.list_local_devices(); ' \
                'gpus = [d.name for d in devices if d.device_type == "GPU"]; ' \
                'print(len(gpus))'
-    p = sp.Popen([sys.executable, "-c", test_cmd], stderr=sp.PIPE, stdout=sp.PIPE)
-    stdout, stderr = p.communicate()
-    if p.returncode != 0:
-        decoded = stderr.decode('UTF-8')
-        raise RuntimeError('Failed to detect availbe GPUs due to:\n%s' % decoded)
-    decoded = stdout.decode('UTF-8').strip()
-    num_gpus = int(decoded)
-    return list(range(num_gpus)) if num_gpus > 0 else None
+    with sp.Popen([sys.executable, "-c", test_cmd], stderr=sp.PIPE, stdout=sp.PIPE) as p:
+        stdout, stderr = p.communicate()
+        if p.returncode != 0:
+            decoded = stderr.decode('UTF-8')
+            raise RuntimeError('Failed to detect availbe GPUs due to:\n%s' % decoded)
+        decoded = stdout.decode('UTF-8').strip()
+        num_gpus = int(decoded)
+        return list(range(num_gpus)) if num_gpus > 0 else None
 
 
 def get_resource() -> Tuple[str, List[str], Optional[List[int]]]:

From 5401ab2755d1a330df782703ba67c862a1d999a4 Mon Sep 17 00:00:00 2001
From: Shaochen Shi <shishaochen_ha@sina.com>
Date: Sun, 1 Aug 2021 11:26:34 +0800
Subject: [PATCH 6/6] Do not use `tf.test.is_built_with_gpu_support`.

---
 deepmd/train/run_options.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/deepmd/train/run_options.py b/deepmd/train/run_options.py
index 2b1e73a5aa..1ad5fc4574 100644
--- a/deepmd/train/run_options.py
+++ b/deepmd/train/run_options.py
@@ -128,10 +128,9 @@ def print_resource_summary(self):
             log.info(f"node list:            {self.nodelist}")
         log.info(f"running on:           {self.nodename}")
         log.info(f"computing device:     {self.my_device}")
-        if tf.test.is_built_with_gpu_support():
-            env_value = os.environ.get('CUDA_VISIBLE_DEVICES', 'unset')
-            log.info(f"CUDA_VISIBLE_DEVICES: {env_value}")
-            log.info(f"Count of visible GPU: {len(self.gpus)}")
+        env_value = os.environ.get('CUDA_VISIBLE_DEVICES', 'unset')
+        log.info(f"CUDA_VISIBLE_DEVICES: {env_value}")
+        log.info(f"Count of visible GPU: {len(self.gpus or [])}")
         intra, inter = get_tf_default_nthreads()
         log.info(f"num_intra_threads:    {intra:d}")
         log.info(f"num_inter_threads:    {inter:d}")