From 3fbc4faae1b109e1918577b9ebbf0e330e032b6e Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 22 Dec 2021 10:28:03 +0000 Subject: [PATCH 1/8] adds min. memory testing utils Signed-off-by: Wenqi Li --- tests/test_densenet.py | 3 ++- tests/utils.py | 48 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/tests/test_densenet.py b/tests/test_densenet.py index 47f584297e..2f7cc7cf41 100644 --- a/tests/test_densenet.py +++ b/tests/test_densenet.py @@ -19,7 +19,7 @@ from monai.networks import eval_mode from monai.networks.nets import DenseNet121, Densenet169, DenseNet264, densenet201 from monai.utils import optional_import -from tests.utils import skip_if_quick, test_script_save +from tests.utils import SkipIfGPUMemoryLessThan, skip_if_quick, test_script_save if TYPE_CHECKING: import torchvision @@ -90,6 +90,7 @@ def test_121_2d_shape_pretrain(self, model, input_param, input_shape, expected_s @parameterized.expand([TEST_PRETRAINED_2D_CASE_3]) @skipUnless(has_torchvision, "Requires `torchvision` package.") + @SkipIfGPUMemoryLessThan(1024) def test_pretrain_consistency(self, model, input_param, input_shape): example = torch.randn(input_shape).to(device) net = model(**input_param).to(device) diff --git a/tests/utils.py b/tests/utils.py index af4b7ca4ef..1cd7f7830e 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -186,6 +186,28 @@ def skip_if_windows(obj): return unittest.skipIf(sys.platform == "win32", "Skipping tests on Windows")(obj) +class SkipIfGPUMemoryLessThan: + """ + Skip the unit tests if the GPU memory is less than a given amount. + """ + + def __init__(self, required_mb=1000, idx=0): + """ + Args: + required_mb: minimum GPU memory size in MB + idx: device index + + """ + self.required_mb = required_mb + self.idx = idx + + def __call__(self, obj): + return unittest.skipIf( + get_gpu_memory(self.idx) < self.required_mb, + f"Skipping because GPU has less than {self.required_mb} MB of memory on device {self.idx}.", + )(obj) + + class SkipIfBeforePyTorchVersion: """Decorator to be used if test should be skipped with PyTorch versions older than that given.""" @@ -292,6 +314,7 @@ def __init__( backend: Optional[str] = None, daemon: Optional[bool] = None, method: Optional[str] = "spawn", + min_gpu_memory=2048, verbose: bool = False, ): """ @@ -311,6 +334,7 @@ def __init__( When daemon=None, the initial value is inherited from the creating process. method: set the method which should be used to start a child process. method can be 'fork', 'spawn' or 'forkserver'. + min_gpu_memory: minimum amount of GPU memory per process (in megabytes) required to run the test. verbose: whether to print NCCL debug info. 
""" self.nnodes = int(nnodes) @@ -333,6 +357,7 @@ def __init__( self.timeout = datetime.timedelta(0, timeout) self.daemon = daemon self.method = method + self.min_gpu_memory = min_gpu_memory / self.nproc_per_node self.verbose = verbose def run_process(self, func, local_rank, args, kwargs, results): @@ -387,6 +412,14 @@ def __call__(self, obj): f"Skipping distributed tests because it requires {self.nnodes} devices " f"but got {torch.cuda.device_count()}", )(obj) + for i in range(self.nproc_per_node): # check free memory for the current node + free_mem = get_gpu_memory(i) + if get_gpu_memory(i) < self.min_gpu_memory: + return unittest.skipIf( + True, + f"Skipping distributed tests because it requires at least {self.min_gpu_memory}MB gpu memory " + f"but got {free_mem}MB on gpu {i}", + )(obj) _cache_original_func(obj) @@ -616,6 +649,21 @@ def query_memory(n=2): return ",".join(f"{int(x)}" for x in ids) +def get_gpu_memory(idx=0) -> float: + """ + Return the amount of GPU free memory in MB. + """ + bash_string = f"nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits --id={idx}" + + try: + p1 = Popen(bash_string.split(), stdout=PIPE) + output, error = p1.communicate() + free_memory = [x.split(",") for x in output.decode("utf-8").split("\n")[:-1]] + return np.asarray(free_memory, dtype=float).ravel()[0] + except (TypeError, IndexError, OSError): + return 0.0 + + TEST_NDARRAYS: Tuple[Callable] = (np.array, torch.as_tensor) # type: ignore if torch.cuda.is_available(): gpu_tensor: Callable = partial(torch.as_tensor, device="cuda") From af43e08856695963791f081f635d219af207c194 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 22 Dec 2021 10:32:54 +0000 Subject: [PATCH 2/8] include valueerror for robust outcome Signed-off-by: Wenqi Li --- tests/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utils.py b/tests/utils.py index 1cd7f7830e..c97735e361 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -660,7 +660,7 @@ def get_gpu_memory(idx=0) -> float: output, error = p1.communicate() free_memory = [x.split(",") for x in output.decode("utf-8").split("\n")[:-1]] return np.asarray(free_memory, dtype=float).ravel()[0] - except (TypeError, IndexError, OSError): + except (TypeError, IndexError, OSError, ValueError): return 0.0 From 2e29c742a9f6e09de83a252daa32640983be5697 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 22 Dec 2021 10:39:56 +0000 Subject: [PATCH 3/8] ensure float Signed-off-by: Wenqi Li --- tests/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index c97735e361..edb769bf79 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -659,9 +659,9 @@ def get_gpu_memory(idx=0) -> float: p1 = Popen(bash_string.split(), stdout=PIPE) output, error = p1.communicate() free_memory = [x.split(",") for x in output.decode("utf-8").split("\n")[:-1]] - return np.asarray(free_memory, dtype=float).ravel()[0] + return float(np.asarray(free_memory, dtype=float).ravel()[0]) except (TypeError, IndexError, OSError, ValueError): - return 0.0 + return float(0.0) TEST_NDARRAYS: Tuple[Callable] = (np.array, torch.as_tensor) # type: ignore From 87146f6c3c5a95cb6d900ee0078152aa0daabffb Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 22 Dec 2021 10:46:46 +0000 Subject: [PATCH 4/8] msg improvements Signed-off-by: Wenqi Li --- tests/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index edb769bf79..1175bb939c 100644 --- a/tests/utils.py +++ b/tests/utils.py 
@@ -202,9 +202,10 @@ def __init__(self, required_mb=1000, idx=0): self.idx = idx def __call__(self, obj): + _mem = get_gpu_memory(self.idx) return unittest.skipIf( - get_gpu_memory(self.idx) < self.required_mb, - f"Skipping because GPU has less than {self.required_mb} MB of memory on device {self.idx}.", + _mem < self.required_mb, + f"Skipping because GPU has less than {self.required_mb} MB of memory on device {self.idx} (avail. {_mem}).", )(obj) From 4ec8978b69dd81ef1e44f8fb6954f02162228f22 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 22 Dec 2021 10:54:30 +0000 Subject: [PATCH 5/8] update threshold Signed-off-by: Wenqi Li --- tests/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index 1175bb939c..e4bf7e8da5 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -315,7 +315,7 @@ def __init__( backend: Optional[str] = None, daemon: Optional[bool] = None, method: Optional[str] = "spawn", - min_gpu_memory=2048, + min_gpu_memory=9000, verbose: bool = False, ): """ @@ -415,7 +415,7 @@ def __call__(self, obj): )(obj) for i in range(self.nproc_per_node): # check free memory for the current node free_mem = get_gpu_memory(i) - if get_gpu_memory(i) < self.min_gpu_memory: + if free_mem < self.min_gpu_memory: return unittest.skipIf( True, f"Skipping distributed tests because it requires at least {self.min_gpu_memory}MB gpu memory " From 5f44fd161002f1f68df04cd502f8f868d8b7271b Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 22 Dec 2021 21:59:58 +0000 Subject: [PATCH 6/8] remove ref Signed-off-by: Wenqi Li --- tests/utils.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/utils.py b/tests/utils.py index e4bf7e8da5..57462d973a 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -440,6 +440,7 @@ def _wrapper(*args, **kwargs): for p in processes: p.join() assert results.get(), "Distributed call failed." + _del_original_func(obj) return _wrapper @@ -521,6 +522,7 @@ def _wrapper(*args, **kwargs): finally: p.join() + _del_original_func(obj) res = None try: res = results.get(block=False) @@ -546,6 +548,15 @@ def _cache_original_func(obj) -> None: _original_funcs[obj.__name__] = obj +def _del_original_func(obj): + """pop the original function from cache.""" + global _original_funcs + _original_funcs.pop(obj.__name__, None) + if torch.cuda.is_available(): # clean up the cached function + torch.cuda.synchronize() + torch.cuda.empty_cache() + + def _call_original_func(name, module, *args, **kwargs): if name not in _original_funcs: _original_module = importlib.import_module(module) # reimport, refresh _original_funcs From da02476c0048490c0846e06eabd2bddefe1d3be6 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Thu, 23 Dec 2021 00:00:49 +0000 Subject: [PATCH 7/8] separate disttests Signed-off-by: Wenqi Li --- .github/pull_request_template.md | 2 +- .github/workflows/cron.yml | 6 +++--- .github/workflows/integration.yml | 2 +- .github/workflows/pythonapp-gpu.yml | 2 +- .github/workflows/setupapp.yml | 4 ++-- runtests.sh | 8 ++++++-- 6 files changed, 14 insertions(+), 10 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index f7024f1a08..e1eeb92c6b 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -12,6 +12,6 @@ A few sentences describing the changes proposed in this pull request. - [ ] Breaking change (fix or new feature that would cause existing functionality to change). - [ ] New tests added to cover the changes. 
- [ ] Integration tests passed locally by running `./runtests.sh -f -u --net --coverage`. -- [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests`. +- [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests --disttests`. - [ ] In-line docstrings updated. - [ ] Documentation updated, tested `make html` command in the `docs/` folder. diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml index 70585eade0..9cc5f595ac 100644 --- a/.github/workflows/cron.yml +++ b/.github/workflows/cron.yml @@ -48,7 +48,7 @@ jobs: python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))' - BUILD_MONAI=1 ./runtests.sh --coverage --unittests # unit tests with coverage report + BUILD_MONAI=1 ./runtests.sh --coverage --unittests --disttests # unit tests with coverage report BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report coverage xml if pgrep python; then pkill python; fi @@ -91,7 +91,7 @@ jobs: python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))' - BUILD_MONAI=1 ./runtests.sh --coverage --unittests # unit tests with coverage report + BUILD_MONAI=1 ./runtests.sh --coverage --unittests --disttests # unit tests with coverage report BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report coverage xml if pgrep python; then pkill python; fi @@ -190,7 +190,7 @@ jobs: python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))' ngc --version - BUILD_MONAI=1 ./runtests.sh --coverage --pytype --unittests # unit tests with pytype checks, coverage report + BUILD_MONAI=1 ./runtests.sh --coverage --pytype --unittests --disttests # unit tests with pytype checks, coverage report BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report coverage xml if pgrep python; then pkill python; fi diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 8c5517b183..6da018f8cf 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -47,7 +47,7 @@ jobs: python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))' BUILD_MONAI=1 ./runtests.sh --net - BUILD_MONAI=1 ./runtests.sh --unittests + BUILD_MONAI=1 ./runtests.sh --unittests --disttests if pgrep python; then pkill python; fi shell: bash - name: Add reaction diff --git a/.github/workflows/pythonapp-gpu.yml b/.github/workflows/pythonapp-gpu.yml index bae009d0a1..88aca4ae01 100644 --- a/.github/workflows/pythonapp-gpu.yml +++ b/.github/workflows/pythonapp-gpu.yml @@ -123,7 +123,7 @@ jobs: python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))' python -c "import monai; monai.config.print_config()" # build for the current self-hosted CI Tesla V100 - 
BUILD_MONAI=1 TORCH_CUDA_ARCH_LIST="7.0" ./runtests.sh --quick --unittests
+          BUILD_MONAI=1 TORCH_CUDA_ARCH_LIST="7.0" ./runtests.sh --quick --unittests --disttests
           if [ ${{ matrix.environment }} = "PT110+CUDA102" ]; then
             # test the clang-format tool downloading once
             coverage run -m tests.clang_format_utils
diff --git a/.github/workflows/setupapp.yml b/.github/workflows/setupapp.yml
index 419c73fc10..f2bc319644 100644
--- a/.github/workflows/setupapp.yml
+++ b/.github/workflows/setupapp.yml
@@ -59,7 +59,7 @@ jobs:
           python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null &
           python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
           python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))'
-          BUILD_MONAI=1 ./runtests.sh --coverage --unittests # unit tests with coverage report
+          BUILD_MONAI=1 ./runtests.sh --coverage --unittests --disttests # unit tests with coverage report
           BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report
           coverage xml
           if pgrep python; then pkill python; fi
@@ -104,7 +104,7 @@ jobs:
         run: |
           python -m pip list
           python -c 'import torch; print(torch.__version__); print(torch.rand(5,3))'
-          BUILD_MONAI=1 ./runtests.sh --quick --unittests
+          BUILD_MONAI=1 ./runtests.sh --quick --unittests --disttests
           coverage xml
       - name: Upload coverage
         uses: codecov/codecov-action@v1
diff --git a/runtests.sh b/runtests.sh
index e3adabe184..fd84c2f102 100755
--- a/runtests.sh
+++ b/runtests.sh
@@ -567,7 +567,7 @@ if [ $doUnitTests = true ]
 then
     echo "${separator}${blue}unittests${noColor}"
     torch_validate
-    ${cmdPrefix}${cmd} ./tests/runner.py -p "test_((?!integration).)"
+    ${cmdPrefix}${cmd} ./tests/runner.py -p "^(?!test_integration).*(?
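
For context, the distributed test cases that the new `--disttests` invocation runs are the ones written against the `DistCall` decorator in `tests/utils.py`, the same class edited in patches 1 and 5 of this series. A minimal sketch of such a test follows; `DistCall` and `DistTestCase` are the existing helpers from `tests/utils.py`, while the test class name and the all-reduce body are illustrative only and not part of any patch here:

    import torch
    import torch.distributed as dist

    from tests.utils import DistCall, DistTestCase  # existing test helpers in tests/utils.py

    class TestAllReduceSketch(DistTestCase):  # illustrative test case, not part of this series
        @DistCall(nnodes=1, nproc_per_node=2)  # spawn two processes, one per visible GPU
        def test_sum(self):
            rank = dist.get_rank()
            t = torch.tensor([float(rank + 1)], device=f"cuda:{rank}")
            dist.all_reduce(t, op=dist.ReduceOp.SUM)  # 1.0 from rank 0 plus 2.0 from rank 1
            self.assertEqual(t.item(), 3.0)

In this repository such modules are conventionally named `tests/test_*_dist.py`, which is presumably the naming the separated distributed run keys on.
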
From: Wenqi Li
Date: Thu, 23 Dec 2021 14:05:58 +0000
Subject: [PATCH 8/8] update based on comments

Signed-off-by: Wenqi Li
---
 tests/test_densenet.py |  3 +--
 tests/utils.py         | 54 ++----------------------------------------
 2 files changed, 3 insertions(+), 54 deletions(-)

diff --git a/tests/test_densenet.py b/tests/test_densenet.py
index 2f7cc7cf41..47f584297e 100644
--- a/tests/test_densenet.py
+++ b/tests/test_densenet.py
@@ -19,7 +19,7 @@
 from monai.networks import eval_mode
 from monai.networks.nets import DenseNet121, Densenet169, DenseNet264, densenet201
 from monai.utils import optional_import
-from tests.utils import SkipIfGPUMemoryLessThan, skip_if_quick, test_script_save
+from tests.utils import skip_if_quick, test_script_save
 
 if TYPE_CHECKING:
     import torchvision
@@ -90,7 +90,6 @@ def test_121_2d_shape_pretrain(self, model, input_param, input_shape, expected_s
 
     @parameterized.expand([TEST_PRETRAINED_2D_CASE_3])
     @skipUnless(has_torchvision, "Requires `torchvision` package.")
-    @SkipIfGPUMemoryLessThan(1024)
     def test_pretrain_consistency(self, model, input_param, input_shape):
         example = torch.randn(input_shape).to(device)
         net = model(**input_param).to(device)
diff --git a/tests/utils.py b/tests/utils.py
index 57462d973a..4ad0b0ebb6 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -186,29 +186,6 @@ def skip_if_windows(obj):
     return unittest.skipIf(sys.platform == "win32", "Skipping tests on Windows")(obj)
 
 
-class SkipIfGPUMemoryLessThan:
-    """
-    Skip the unit tests if the GPU memory is less than a given amount.
-    """
-
-    def __init__(self, required_mb=1000, idx=0):
-        """
-        Args:
-            required_mb: minimum GPU memory size in MB
-            idx: device index
-
-        """
-        self.required_mb = required_mb
-        self.idx = idx
-
-    def __call__(self, obj):
-        _mem = get_gpu_memory(self.idx)
-        return unittest.skipIf(
-            _mem < self.required_mb,
-            f"Skipping because GPU has less than {self.required_mb} MB of memory on device {self.idx} (avail. {_mem}).",
-        )(obj)
-
-
 class SkipIfBeforePyTorchVersion:
     """Decorator to be used if test should be skipped with PyTorch versions older than that given."""
 
@@ -315,7 +292,6 @@ def __init__(
         backend: Optional[str] = None,
         daemon: Optional[bool] = None,
         method: Optional[str] = "spawn",
-        min_gpu_memory=9000,
         verbose: bool = False,
     ):
         """
@@ -335,7 +311,6 @@ def __init__(
                 When daemon=None, the initial value is inherited from the creating process.
            method: set the method which should be used to start a child process.
                 method can be 'fork', 'spawn' or 'forkserver'.
-            min_gpu_memory: minimum amount of GPU memory per process (in megabytes) required to run the test.
            verbose: whether to print NCCL debug info.
         """
         self.nnodes = int(nnodes)
@@ -358,7 +333,6 @@ def __init__(
        self.timeout = datetime.timedelta(0, timeout)
         self.daemon = daemon
         self.method = method
-        self.min_gpu_memory = min_gpu_memory / self.nproc_per_node
         self.verbose = verbose
 
     def run_process(self, func, local_rank, args, kwargs, results):
@@ -376,8 +350,7 @@ def run_process(self, func, local_rank, args, kwargs, results):
         os.environ["RANK"] = str(self.nproc_per_node * self.node_rank + local_rank)
 
         if torch.cuda.is_available():
-            os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-            torch.cuda.set_device(int(local_rank))
+            torch.cuda.set_device(int(local_rank))  # using device ids from CUDA_VISIBLE_DEVICES
 
         dist.init_process_group(
             backend=self.backend,
@@ -413,14 +386,6 @@ def __call__(self, obj):
                 f"Skipping distributed tests because it requires {self.nnodes} devices "
                 f"but got {torch.cuda.device_count()}",
             )(obj)
-        for i in range(self.nproc_per_node):  # check free memory for the current node
-            free_mem = get_gpu_memory(i)
-            if free_mem < self.min_gpu_memory:
-                return unittest.skipIf(
-                    True,
-                    f"Skipping distributed tests because it requires at least {self.min_gpu_memory}MB gpu memory "
-                    f"but got {free_mem}MB on gpu {i}",
-                )(obj)
 
         _cache_original_func(obj)
 
@@ -645,7 +610,7 @@ def test_script_save(net, *inputs, device=None, rtol=1e-4, atol=0.0):
 
 def query_memory(n=2):
     """
-    Find best n idle devices and return a string of device ids.
+    Find best n idle devices and return a string of device ids using the `nvidia-smi` command.
     """
     bash_string = "nvidia-smi --query-gpu=power.draw,temperature.gpu,memory.used --format=csv,noheader,nounits"
 
@@ -661,21 +626,6 @@ def query_memory(n=2):
         return ",".join(f"{int(x)}" for x in ids)
 
 
-def get_gpu_memory(idx=0) -> float:
-    """
-    Return the amount of GPU free memory in MB.
-    """
-    bash_string = f"nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits --id={idx}"
-
-    try:
-        p1 = Popen(bash_string.split(), stdout=PIPE)
-        output, error = p1.communicate()
-        free_memory = [x.split(",") for x in output.decode("utf-8").split("\n")[:-1]]
-        return float(np.asarray(free_memory, dtype=float).ravel()[0])
-    except (TypeError, IndexError, OSError, ValueError):
-        return float(0.0)
-
-
 TEST_NDARRAYS: Tuple[Callable] = (np.array, torch.as_tensor)  # type: ignore
 if torch.cuda.is_available():
     gpu_tensor: Callable = partial(torch.as_tensor, device="cuda")
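
For reference, the `SkipIfGPUMemoryLessThan` decorator added in PATCH 1 and removed again in PATCH 8 was meant to be applied to individual GPU-heavy test cases, in the same way PATCH 1 applied it in `tests/test_densenet.py`. A usage sketch against the intermediate state of `tests/utils.py` (patches 1 to 6) follows; the test class and the tensor shape are illustrative only:

    import unittest

    import torch

    from tests.utils import SkipIfGPUMemoryLessThan  # only present between PATCH 1 and PATCH 8

    class TestLargeInput(unittest.TestCase):  # illustrative test case, not part of the series
        @SkipIfGPUMemoryLessThan(required_mb=1024, idx=0)  # skip when device 0 has under 1 GB free
        def test_forward(self):
            x = torch.randn(2, 1, 96, 96, 96, device="cuda:0")  # a reasonably large 3D batch
            self.assertEqual(x.shape[0], 2)

Because `get_gpu_memory` returns 0.0 whenever the `nvidia-smi` query fails, decorated tests are also skipped on machines where the tool is unavailable.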