From 3fbc4faae1b109e1918577b9ebbf0e330e032b6e Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 22 Dec 2021 10:28:03 +0000 Subject: [PATCH 1/8] adds min. memory testing utils Signed-off-by: Wenqi Li --- tests/test_densenet.py | 3 ++- tests/utils.py | 48 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/tests/test_densenet.py b/tests/test_densenet.py index 47f584297e..2f7cc7cf41 100644 --- a/tests/test_densenet.py +++ b/tests/test_densenet.py @@ -19,7 +19,7 @@ from monai.networks import eval_mode from monai.networks.nets import DenseNet121, Densenet169, DenseNet264, densenet201 from monai.utils import optional_import -from tests.utils import skip_if_quick, test_script_save +from tests.utils import SkipIfGPUMemoryLessThan, skip_if_quick, test_script_save if TYPE_CHECKING: import torchvision @@ -90,6 +90,7 @@ def test_121_2d_shape_pretrain(self, model, input_param, input_shape, expected_s @parameterized.expand([TEST_PRETRAINED_2D_CASE_3]) @skipUnless(has_torchvision, "Requires `torchvision` package.") + @SkipIfGPUMemoryLessThan(1024) def test_pretrain_consistency(self, model, input_param, input_shape): example = torch.randn(input_shape).to(device) net = model(**input_param).to(device) diff --git a/tests/utils.py b/tests/utils.py index af4b7ca4ef..1cd7f7830e 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -186,6 +186,28 @@ def skip_if_windows(obj): return unittest.skipIf(sys.platform == "win32", "Skipping tests on Windows")(obj) +class SkipIfGPUMemoryLessThan: + """ + Skip the unit tests if the GPU memory is less than a given amount. + """ + + def __init__(self, required_mb=1000, idx=0): + """ + Args: + required_mb: minimum GPU memory size in MB + idx: device index + + """ + self.required_mb = required_mb + self.idx = idx + + def __call__(self, obj): + return unittest.skipIf( + get_gpu_memory(self.idx) < self.required_mb, + f"Skipping because GPU has less than {self.required_mb} MB of memory on device {self.idx}.", + )(obj) + + class SkipIfBeforePyTorchVersion: """Decorator to be used if test should be skipped with PyTorch versions older than that given.""" @@ -292,6 +314,7 @@ def __init__( backend: Optional[str] = None, daemon: Optional[bool] = None, method: Optional[str] = "spawn", + min_gpu_memory=2048, verbose: bool = False, ): """ @@ -311,6 +334,7 @@ def __init__( When daemon=None, the initial value is inherited from the creating process. method: set the method which should be used to start a child process. method can be 'fork', 'spawn' or 'forkserver'. + min_gpu_memory: minimum amount of GPU memory per process (in megabytes) required to run the test. verbose: whether to print NCCL debug info. 
""" self.nnodes = int(nnodes) @@ -333,6 +357,7 @@ def __init__( self.timeout = datetime.timedelta(0, timeout) self.daemon = daemon self.method = method + self.min_gpu_memory = min_gpu_memory / self.nproc_per_node self.verbose = verbose def run_process(self, func, local_rank, args, kwargs, results): @@ -387,6 +412,14 @@ def __call__(self, obj): f"Skipping distributed tests because it requires {self.nnodes} devices " f"but got {torch.cuda.device_count()}", )(obj) + for i in range(self.nproc_per_node): # check free memory for the current node + free_mem = get_gpu_memory(i) + if get_gpu_memory(i) < self.min_gpu_memory: + return unittest.skipIf( + True, + f"Skipping distributed tests because it requires at least {self.min_gpu_memory}MB gpu memory " + f"but got {free_mem}MB on gpu {i}", + )(obj) _cache_original_func(obj) @@ -616,6 +649,21 @@ def query_memory(n=2): return ",".join(f"{int(x)}" for x in ids) +def get_gpu_memory(idx=0) -> float: + """ + Return the amount of GPU free memory in MB. + """ + bash_string = f"nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits --id={idx}" + + try: + p1 = Popen(bash_string.split(), stdout=PIPE) + output, error = p1.communicate() + free_memory = [x.split(",") for x in output.decode("utf-8").split("\n")[:-1]] + return np.asarray(free_memory, dtype=float).ravel()[0] + except (TypeError, IndexError, OSError): + return 0.0 + + TEST_NDARRAYS: Tuple[Callable] = (np.array, torch.as_tensor) # type: ignore if torch.cuda.is_available(): gpu_tensor: Callable = partial(torch.as_tensor, device="cuda") From af43e08856695963791f081f635d219af207c194 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 22 Dec 2021 10:32:54 +0000 Subject: [PATCH 2/8] include valueerror for robust outcome Signed-off-by: Wenqi Li --- tests/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utils.py b/tests/utils.py index 1cd7f7830e..c97735e361 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -660,7 +660,7 @@ def get_gpu_memory(idx=0) -> float: output, error = p1.communicate() free_memory = [x.split(",") for x in output.decode("utf-8").split("\n")[:-1]] return np.asarray(free_memory, dtype=float).ravel()[0] - except (TypeError, IndexError, OSError): + except (TypeError, IndexError, OSError, ValueError): return 0.0 From 2e29c742a9f6e09de83a252daa32640983be5697 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 22 Dec 2021 10:39:56 +0000 Subject: [PATCH 3/8] ensure float Signed-off-by: Wenqi Li --- tests/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index c97735e361..edb769bf79 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -659,9 +659,9 @@ def get_gpu_memory(idx=0) -> float: p1 = Popen(bash_string.split(), stdout=PIPE) output, error = p1.communicate() free_memory = [x.split(",") for x in output.decode("utf-8").split("\n")[:-1]] - return np.asarray(free_memory, dtype=float).ravel()[0] + return float(np.asarray(free_memory, dtype=float).ravel()[0]) except (TypeError, IndexError, OSError, ValueError): - return 0.0 + return float(0.0) TEST_NDARRAYS: Tuple[Callable] = (np.array, torch.as_tensor) # type: ignore From 87146f6c3c5a95cb6d900ee0078152aa0daabffb Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 22 Dec 2021 10:46:46 +0000 Subject: [PATCH 4/8] msg improvements Signed-off-by: Wenqi Li --- tests/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index edb769bf79..1175bb939c 100644 --- a/tests/utils.py +++ b/tests/utils.py 
@@ -202,9 +202,10 @@ def __init__(self, required_mb=1000, idx=0): self.idx = idx def __call__(self, obj): + _mem = get_gpu_memory(self.idx) return unittest.skipIf( - get_gpu_memory(self.idx) < self.required_mb, - f"Skipping because GPU has less than {self.required_mb} MB of memory on device {self.idx}.", + _mem < self.required_mb, + f"Skipping because GPU has less than {self.required_mb} MB of memory on device {self.idx} (avail. {_mem}).", )(obj) From 4ec8978b69dd81ef1e44f8fb6954f02162228f22 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 22 Dec 2021 10:54:30 +0000 Subject: [PATCH 5/8] update threshold Signed-off-by: Wenqi Li --- tests/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index 1175bb939c..e4bf7e8da5 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -315,7 +315,7 @@ def __init__( backend: Optional[str] = None, daemon: Optional[bool] = None, method: Optional[str] = "spawn", - min_gpu_memory=2048, + min_gpu_memory=9000, verbose: bool = False, ): """ @@ -415,7 +415,7 @@ def __call__(self, obj): )(obj) for i in range(self.nproc_per_node): # check free memory for the current node free_mem = get_gpu_memory(i) - if get_gpu_memory(i) < self.min_gpu_memory: + if free_mem < self.min_gpu_memory: return unittest.skipIf( True, f"Skipping distributed tests because it requires at least {self.min_gpu_memory}MB gpu memory " From 5f44fd161002f1f68df04cd502f8f868d8b7271b Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 22 Dec 2021 21:59:58 +0000 Subject: [PATCH 6/8] remove ref Signed-off-by: Wenqi Li --- tests/utils.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/utils.py b/tests/utils.py index e4bf7e8da5..57462d973a 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -440,6 +440,7 @@ def _wrapper(*args, **kwargs): for p in processes: p.join() assert results.get(), "Distributed call failed." + _del_original_func(obj) return _wrapper @@ -521,6 +522,7 @@ def _wrapper(*args, **kwargs): finally: p.join() + _del_original_func(obj) res = None try: res = results.get(block=False) @@ -546,6 +548,15 @@ def _cache_original_func(obj) -> None: _original_funcs[obj.__name__] = obj +def _del_original_func(obj): + """pop the original function from cache.""" + global _original_funcs + _original_funcs.pop(obj.__name__, None) + if torch.cuda.is_available(): # clean up the cached function + torch.cuda.synchronize() + torch.cuda.empty_cache() + + def _call_original_func(name, module, *args, **kwargs): if name not in _original_funcs: _original_module = importlib.import_module(module) # reimport, refresh _original_funcs From da02476c0048490c0846e06eabd2bddefe1d3be6 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Thu, 23 Dec 2021 00:00:49 +0000 Subject: [PATCH 7/8] separate disttests Signed-off-by: Wenqi Li --- .github/pull_request_template.md | 2 +- .github/workflows/cron.yml | 6 +++--- .github/workflows/integration.yml | 2 +- .github/workflows/pythonapp-gpu.yml | 2 +- .github/workflows/setupapp.yml | 4 ++-- runtests.sh | 8 ++++++-- 6 files changed, 14 insertions(+), 10 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index f7024f1a08..e1eeb92c6b 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -12,6 +12,6 @@ A few sentences describing the changes proposed in this pull request. - [ ] Breaking change (fix or new feature that would cause existing functionality to change). - [ ] New tests added to cover the changes. 
- [ ] Integration tests passed locally by running `./runtests.sh -f -u --net --coverage`. -- [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests`. +- [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests --disttests`. - [ ] In-line docstrings updated. - [ ] Documentation updated, tested `make html` command in the `docs/` folder. diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml index 70585eade0..9cc5f595ac 100644 --- a/.github/workflows/cron.yml +++ b/.github/workflows/cron.yml @@ -48,7 +48,7 @@ jobs: python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))' - BUILD_MONAI=1 ./runtests.sh --coverage --unittests # unit tests with coverage report + BUILD_MONAI=1 ./runtests.sh --coverage --unittests --disttests # unit tests with coverage report BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report coverage xml if pgrep python; then pkill python; fi @@ -91,7 +91,7 @@ jobs: python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))' - BUILD_MONAI=1 ./runtests.sh --coverage --unittests # unit tests with coverage report + BUILD_MONAI=1 ./runtests.sh --coverage --unittests --disttests # unit tests with coverage report BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report coverage xml if pgrep python; then pkill python; fi @@ -190,7 +190,7 @@ jobs: python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))' ngc --version - BUILD_MONAI=1 ./runtests.sh --coverage --pytype --unittests # unit tests with pytype checks, coverage report + BUILD_MONAI=1 ./runtests.sh --coverage --pytype --unittests --disttests # unit tests with pytype checks, coverage report BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report coverage xml if pgrep python; then pkill python; fi diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 8c5517b183..6da018f8cf 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -47,7 +47,7 @@ jobs: python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))' BUILD_MONAI=1 ./runtests.sh --net - BUILD_MONAI=1 ./runtests.sh --unittests + BUILD_MONAI=1 ./runtests.sh --unittests --disttests if pgrep python; then pkill python; fi shell: bash - name: Add reaction diff --git a/.github/workflows/pythonapp-gpu.yml b/.github/workflows/pythonapp-gpu.yml index bae009d0a1..88aca4ae01 100644 --- a/.github/workflows/pythonapp-gpu.yml +++ b/.github/workflows/pythonapp-gpu.yml @@ -123,7 +123,7 @@ jobs: python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))' python -c "import monai; monai.config.print_config()" # build for the current self-hosted CI Tesla V100 - 
BUILD_MONAI=1 TORCH_CUDA_ARCH_LIST="7.0" ./runtests.sh --quick --unittests
+          BUILD_MONAI=1 TORCH_CUDA_ARCH_LIST="7.0" ./runtests.sh --quick --unittests --disttests
           if [ ${{ matrix.environment }} = "PT110+CUDA102" ]; then
             # test the clang-format tool downloading once
             coverage run -m tests.clang_format_utils
diff --git a/.github/workflows/setupapp.yml b/.github/workflows/setupapp.yml
index 419c73fc10..f2bc319644 100644
--- a/.github/workflows/setupapp.yml
+++ b/.github/workflows/setupapp.yml
@@ -59,7 +59,7 @@ jobs:
           python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null &
           python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
           python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))'
-          BUILD_MONAI=1 ./runtests.sh --coverage --unittests # unit tests with coverage report
+          BUILD_MONAI=1 ./runtests.sh --coverage --unittests --disttests # unit tests with coverage report
           BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report
           coverage xml
           if pgrep python; then pkill python; fi
@@ -104,7 +104,7 @@ jobs:
         run: |
           python -m pip list
           python -c 'import torch; print(torch.__version__); print(torch.rand(5,3))'
-          BUILD_MONAI=1 ./runtests.sh --quick --unittests
+          BUILD_MONAI=1 ./runtests.sh --quick --unittests --disttests
           coverage xml
       - name: Upload coverage
         uses: codecov/codecov-action@v1
diff --git a/runtests.sh b/runtests.sh
index e3adabe184..fd84c2f102 100755
--- a/runtests.sh
+++ b/runtests.sh
@@ -567,7 +567,7 @@ if [ $doUnitTests = true ]
 then
     echo "${separator}${blue}unittests${noColor}"
     torch_validate
-    ${cmdPrefix}${cmd} ./tests/runner.py -p "test_((?!integration).)"
+    ${cmdPrefix}${cmd} ./tests/runner.py -p "^(?!test_integration).*(?
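
For context, the distributed test cases that the new `--disttests` invocation runs are the ones written against the `DistCall` decorator in `tests/utils.py`, the same class edited in patches 1 and 5 of this series. A minimal sketch of such a test follows; `DistCall` and `DistTestCase` are the existing helpers from `tests/utils.py`, while the test class name and the all-reduce body are illustrative only and not part of any patch here:

    import torch
    import torch.distributed as dist

    from tests.utils import DistCall, DistTestCase  # existing test helpers in tests/utils.py

    class TestAllReduceSketch(DistTestCase):  # illustrative test case, not part of this series
        @DistCall(nnodes=1, nproc_per_node=2)  # spawn two processes, one per visible GPU
        def test_sum(self):
            rank = dist.get_rank()
            t = torch.tensor([float(rank + 1)], device=f"cuda:{rank}")
            dist.all_reduce(t, op=dist.ReduceOp.SUM)  # 1.0 from rank 0 plus 2.0 from rank 1
            self.assertEqual(t.item(), 3.0)

In this repository such modules are conventionally named `tests/test_*_dist.py`, which is presumably the naming the separated distributed run keys on.
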
From: Wenqi Li
Date: Thu, 23 Dec 2021 14:05:58 +0000
Subject: [PATCH 8/8] update based on comments

Signed-off-by: Wenqi Li
---
 tests/test_densenet.py |  3 +--
 tests/utils.py         | 54 ++----------------------------------------
 2 files changed, 3 insertions(+), 54 deletions(-)

diff --git a/tests/test_densenet.py b/tests/test_densenet.py
index 2f7cc7cf41..47f584297e 100644
--- a/tests/test_densenet.py
+++ b/tests/test_densenet.py
@@ -19,7 +19,7 @@
 from monai.networks import eval_mode
 from monai.networks.nets import DenseNet121, Densenet169, DenseNet264, densenet201
 from monai.utils import optional_import
-from tests.utils import SkipIfGPUMemoryLessThan, skip_if_quick, test_script_save
+from tests.utils import skip_if_quick, test_script_save
 
 if TYPE_CHECKING:
     import torchvision
@@ -90,7 +90,6 @@ def test_121_2d_shape_pretrain(self, model, input_param, input_shape, expected_s
 
     @parameterized.expand([TEST_PRETRAINED_2D_CASE_3])
     @skipUnless(has_torchvision, "Requires `torchvision` package.")
-    @SkipIfGPUMemoryLessThan(1024)
     def test_pretrain_consistency(self, model, input_param, input_shape):
         example = torch.randn(input_shape).to(device)
         net = model(**input_param).to(device)
diff --git a/tests/utils.py b/tests/utils.py
index 57462d973a..4ad0b0ebb6 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -186,29 +186,6 @@ def skip_if_windows(obj):
     return unittest.skipIf(sys.platform == "win32", "Skipping tests on Windows")(obj)
 
 
-class SkipIfGPUMemoryLessThan:
-    """
-    Skip the unit tests if the GPU memory is less than a given amount.
-    """
-
-    def __init__(self, required_mb=1000, idx=0):
-        """
-        Args:
-            required_mb: minimum GPU memory size in MB
-            idx: device index
-
-        """
-        self.required_mb = required_mb
-        self.idx = idx
-
-    def __call__(self, obj):
-        _mem = get_gpu_memory(self.idx)
-        return unittest.skipIf(
-            _mem < self.required_mb,
-            f"Skipping because GPU has less than {self.required_mb} MB of memory on device {self.idx} (avail. {_mem}).",
-        )(obj)
-
-
 class SkipIfBeforePyTorchVersion:
     """Decorator to be used if test should be skipped with PyTorch versions older than that given."""
 
@@ -315,7 +292,6 @@ def __init__(
         backend: Optional[str] = None,
         daemon: Optional[bool] = None,
         method: Optional[str] = "spawn",
-        min_gpu_memory=9000,
         verbose: bool = False,
     ):
         """
@@ -335,7 +311,6 @@ def __init__(
                 When daemon=None, the initial value is inherited from the creating process.
            method: set the method which should be used to start a child process.
                 method can be 'fork', 'spawn' or 'forkserver'.
-            min_gpu_memory: minimum amount of GPU memory per process (in megabytes) required to run the test.
            verbose: whether to print NCCL debug info.
         """
         self.nnodes = int(nnodes)
@@ -358,7 +333,6 @@ def __init__(
        self.timeout = datetime.timedelta(0, timeout)
         self.daemon = daemon
         self.method = method
-        self.min_gpu_memory = min_gpu_memory / self.nproc_per_node
         self.verbose = verbose
 
     def run_process(self, func, local_rank, args, kwargs, results):
@@ -376,8 +350,7 @@ def run_process(self, func, local_rank, args, kwargs, results):
         os.environ["RANK"] = str(self.nproc_per_node * self.node_rank + local_rank)
 
         if torch.cuda.is_available():
-            os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-            torch.cuda.set_device(int(local_rank))
+            torch.cuda.set_device(int(local_rank))  # using device ids from CUDA_VISIBLE_DEVICES
 
         dist.init_process_group(
             backend=self.backend,
@@ -413,14 +386,6 @@ def __call__(self, obj):
                 f"Skipping distributed tests because it requires {self.nnodes} devices "
                 f"but got {torch.cuda.device_count()}",
             )(obj)
-        for i in range(self.nproc_per_node):  # check free memory for the current node
-            free_mem = get_gpu_memory(i)
-            if free_mem < self.min_gpu_memory:
-                return unittest.skipIf(
-                    True,
-                    f"Skipping distributed tests because it requires at least {self.min_gpu_memory}MB gpu memory "
-                    f"but got {free_mem}MB on gpu {i}",
-                )(obj)
 
         _cache_original_func(obj)
 
@@ -645,7 +610,7 @@ def test_script_save(net, *inputs, device=None, rtol=1e-4, atol=0.0):
 
 def query_memory(n=2):
     """
-    Find best n idle devices and return a string of device ids.
+    Find best n idle devices and return a string of device ids using the `nvidia-smi` command.
     """
     bash_string = "nvidia-smi --query-gpu=power.draw,temperature.gpu,memory.used --format=csv,noheader,nounits"
 
@@ -661,21 +626,6 @@ def query_memory(n=2):
         return ",".join(f"{int(x)}" for x in ids)
 
 
-def get_gpu_memory(idx=0) -> float:
-    """
-    Return the amount of GPU free memory in MB.
-    """
-    bash_string = f"nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits --id={idx}"
-
-    try:
-        p1 = Popen(bash_string.split(), stdout=PIPE)
-        output, error = p1.communicate()
-        free_memory = [x.split(",") for x in output.decode("utf-8").split("\n")[:-1]]
-        return float(np.asarray(free_memory, dtype=float).ravel()[0])
-    except (TypeError, IndexError, OSError, ValueError):
-        return float(0.0)
-
-
 TEST_NDARRAYS: Tuple[Callable] = (np.array, torch.as_tensor)  # type: ignore
 if torch.cuda.is_available():
     gpu_tensor: Callable = partial(torch.as_tensor, device="cuda")
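
For reference, the `SkipIfGPUMemoryLessThan` decorator added in PATCH 1 and removed again in PATCH 8 was meant to be applied to individual GPU-heavy test cases, in the same way PATCH 1 applied it in `tests/test_densenet.py`. A usage sketch against the intermediate state of `tests/utils.py` (patches 1 to 6) follows; the test class and the tensor shape are illustrative only:

    import unittest

    import torch

    from tests.utils import SkipIfGPUMemoryLessThan  # only present between PATCH 1 and PATCH 8

    class TestLargeInput(unittest.TestCase):  # illustrative test case, not part of the series
        @SkipIfGPUMemoryLessThan(required_mb=1024, idx=0)  # skip when device 0 has under 1 GB free
        def test_forward(self):
            x = torch.randn(2, 1, 96, 96, 96, device="cuda:0")  # a reasonably large 3D batch
            self.assertEqual(x.shape[0], 2)

Because `get_gpu_memory` returns 0.0 whenever the `nvidia-smi` query fails, decorated tests are also skipped on machines where the tool is unavailable.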