From 21a636b7fcf5f1bb404f4d1b35ec75dc7e05155b Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 26 Nov 2022 19:41:41 -0500 Subject: [PATCH 1/4] skip `get_gpus` subprocess when TF is cpu only When I benchmarked deepmd-kit on my machine, I found that `get_gpus` takes about 2s and is quite slow. In #905, a subprocess was added to detect the available GPUs. As benchmarked in #2121, importing tensorflow is quite slow. I don't have a better idea that avoids calling a subprocess, but we can skip the call entirely when TensorFlow is not built with GPU support. The tests on GitHub Actions will also benefit. Attached selected profiling: > ncalls tottime percall cumtime percall filename:lineno(function) > 1 0.000 0.000 2.141 2.141 local.py:15(get_gpus) > 1 0.000 0.000 2.133 2.133 subprocess.py:1090(communicate) --- deepmd/cluster/local.py | 4 ++++ source/tests/test_cluster.py | 18 +++++++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/deepmd/cluster/local.py b/deepmd/cluster/local.py index 6fe454a9a2..69af55040d 100644 --- a/deepmd/cluster/local.py +++ b/deepmd/cluster/local.py @@ -21,6 +21,10 @@ def get_gpus(): Optional[List[int]] List of available GPU IDs. Otherwise, None. 
""" + if (not tf.test.is_built_with_cuda() and + not (hasattr(tf.test, 'is_built_with_rocm') and tf.test.is_built_with_rocm())): + # TF is built with CPU only, skip expensive subprocess call + return None test_cmd = 'from tensorflow.python.client import device_lib; ' \ 'devices = device_lib.list_local_devices(); ' \ 'gpus = [d.name for d in devices if d.device_type == "GPU"]; ' \ diff --git a/source/tests/test_cluster.py b/source/tests/test_cluster.py index 01e128b401..1aa700d1c7 100644 --- a/source/tests/test_cluster.py +++ b/source/tests/test_cluster.py @@ -23,25 +23,37 @@ def returncode(self): class TestGPU(unittest.TestCase): @mock.patch('subprocess.Popen') - def test_none(self, mock_Popen): + @mock.patch('tf.test.is_built_with_cuda') + def test_none(self, mock_Popen, mock_is_built_with_cuda): mock_Popen.return_value.__enter__.return_value = FakePopen(b'0', b'') + mock_is_built_with_cuda.return_value = True gpus = local.get_gpus() self.assertIsNone(gpus) @mock.patch('subprocess.Popen') - def test_valid(self, mock_Popen): + @mock.patch('tf.test.is_built_with_cuda') + def test_valid(self, mock_Popen, mock_is_built_with_cuda): mock_Popen.return_value.__enter__.return_value = FakePopen(b'2', b'') + mock_is_built_with_cuda.return_value = True gpus = local.get_gpus() self.assertEqual(gpus, [0, 1]) @mock.patch('subprocess.Popen') - def test_error(self, mock_Popen): + @mock.patch('tf.test.is_built_with_cuda') + def test_error(self, mock_Popen, mock_is_built_with_cuda): mock_Popen.return_value.__enter__.return_value = \ FakePopen(stderr=b'!', returncode=1) + mock_is_built_with_cuda.return_value = True with self.assertRaises(RuntimeError) as cm: _ = local.get_gpus() self.assertIn('Failed to detect', str(cm.exception)) + @mock.patch('tf.test.is_built_with_cuda') + def test_cpu(self, mock_is_built_with_cuda): + mock_is_built_with_cuda.return_value = False + gpus = local.get_gpus() + self.assertIsNone(gpus) + class TestLocal(unittest.TestCase): 
@mock.patch('socket.gethostname') From 8e55bcca4ff1ad2e69bd21d8fb24f1a403c4724b Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 26 Nov 2022 19:54:48 -0500 Subject: [PATCH 2/4] fix mock.patch --- source/tests/test_cluster.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/source/tests/test_cluster.py b/source/tests/test_cluster.py index 1aa700d1c7..562fa87ab4 100644 --- a/source/tests/test_cluster.py +++ b/source/tests/test_cluster.py @@ -1,6 +1,7 @@ import unittest from deepmd.cluster import local, slurm +from deepmd.env import tf from unittest import mock @@ -23,7 +24,7 @@ def returncode(self): class TestGPU(unittest.TestCase): @mock.patch('subprocess.Popen') - @mock.patch('tf.test.is_built_with_cuda') + @mock.patch('tensorflow.compat.v1.test.is_built_with_cuda') def test_none(self, mock_Popen, mock_is_built_with_cuda): mock_Popen.return_value.__enter__.return_value = FakePopen(b'0', b'') mock_is_built_with_cuda.return_value = True @@ -31,7 +32,7 @@ def test_none(self, mock_Popen, mock_is_built_with_cuda): self.assertIsNone(gpus) @mock.patch('subprocess.Popen') - @mock.patch('tf.test.is_built_with_cuda') + @mock.patch('tensorflow.compat.v1.test.is_built_with_cuda') def test_valid(self, mock_Popen, mock_is_built_with_cuda): mock_Popen.return_value.__enter__.return_value = FakePopen(b'2', b'') mock_is_built_with_cuda.return_value = True @@ -39,7 +40,7 @@ def test_valid(self, mock_Popen, mock_is_built_with_cuda): self.assertEqual(gpus, [0, 1]) @mock.patch('subprocess.Popen') - @mock.patch('tf.test.is_built_with_cuda') + @mock.patch('tensorflow.compat.v1.test.is_built_with_cuda') def test_error(self, mock_Popen, mock_is_built_with_cuda): mock_Popen.return_value.__enter__.return_value = \ FakePopen(stderr=b'!', returncode=1) @@ -48,7 +49,7 @@ def test_error(self, mock_Popen, mock_is_built_with_cuda): _ = local.get_gpus() self.assertIn('Failed to detect', str(cm.exception)) - @mock.patch('tf.test.is_built_with_cuda') + 
@mock.patch('tensorflow.compat.v1.test.is_built_with_cuda') def test_cpu(self, mock_is_built_with_cuda): mock_is_built_with_cuda.return_value = False gpus = local.get_gpus() From e3aed0132685e79ec9b5aab106e73115c4cb3b09 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 26 Nov 2022 20:07:51 -0500 Subject: [PATCH 3/4] fix patch --- source/tests/test_cluster.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/source/tests/test_cluster.py b/source/tests/test_cluster.py index 562fa87ab4..f096914a36 100644 --- a/source/tests/test_cluster.py +++ b/source/tests/test_cluster.py @@ -23,24 +23,24 @@ def returncode(self): class TestGPU(unittest.TestCase): - @mock.patch('subprocess.Popen') @mock.patch('tensorflow.compat.v1.test.is_built_with_cuda') + @mock.patch('subprocess.Popen') def test_none(self, mock_Popen, mock_is_built_with_cuda): mock_Popen.return_value.__enter__.return_value = FakePopen(b'0', b'') mock_is_built_with_cuda.return_value = True gpus = local.get_gpus() self.assertIsNone(gpus) - @mock.patch('subprocess.Popen') @mock.patch('tensorflow.compat.v1.test.is_built_with_cuda') + @mock.patch('subprocess.Popen') def test_valid(self, mock_Popen, mock_is_built_with_cuda): mock_Popen.return_value.__enter__.return_value = FakePopen(b'2', b'') mock_is_built_with_cuda.return_value = True gpus = local.get_gpus() self.assertEqual(gpus, [0, 1]) - @mock.patch('subprocess.Popen') @mock.patch('tensorflow.compat.v1.test.is_built_with_cuda') + @mock.patch('subprocess.Popen') def test_error(self, mock_Popen, mock_is_built_with_cuda): mock_Popen.return_value.__enter__.return_value = \ FakePopen(stderr=b'!', returncode=1) From d5451508a4dd4cd6407947071b51a382c841067e Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 26 Nov 2022 20:56:05 -0500 Subject: [PATCH 4/4] patch rocm Signed-off-by: Jinzhe Zeng --- source/tests/test_cluster.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/source/tests/test_cluster.py 
b/source/tests/test_cluster.py index f096914a36..d67b572bb1 100644 --- a/source/tests/test_cluster.py +++ b/source/tests/test_cluster.py @@ -49,9 +49,11 @@ def test_error(self, mock_Popen, mock_is_built_with_cuda): _ = local.get_gpus() self.assertIn('Failed to detect', str(cm.exception)) + @mock.patch('tensorflow.compat.v1.test.is_built_with_rocm', create=True) @mock.patch('tensorflow.compat.v1.test.is_built_with_cuda') - def test_cpu(self, mock_is_built_with_cuda): + def test_cpu(self, mock_is_built_with_cuda, mock_is_built_with_rocm): mock_is_built_with_cuda.return_value = False + mock_is_built_with_rocm.return_value = False gpus = local.get_gpus() self.assertIsNone(gpus)