From 41a52702ef4a5eee29428e99454419ba1d0fa703 Mon Sep 17 00:00:00 2001 From: Wenqi Li <831580+wyli@users.noreply.github.com> Date: Wed, 5 Apr 2023 12:19:46 +0100 Subject: [PATCH 01/16] Revert "Auto3DSeg skip trained algos (#6290)" This reverts commit fa0560988f978642c4d8b2096039f2c11dc1ee54. --- monai/apps/auto3dseg/auto_runner.py | 20 +++++++------------- monai/apps/auto3dseg/utils.py | 9 +++------ 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/monai/apps/auto3dseg/auto_runner.py b/monai/apps/auto3dseg/auto_runner.py index fcd76e02a0..57d3b6c014 100644 --- a/monai/apps/auto3dseg/auto_runner.py +++ b/monai/apps/auto3dseg/auto_runner.py @@ -281,7 +281,7 @@ def __init__( # determine if we need to analyze, algo_gen or train from cache, unless manually provided self.analyze = not self.cache["analyze"] if analyze is None else analyze self.algo_gen = not self.cache["algo_gen"] if algo_gen is None else algo_gen - self.train = train + self.train = not self.cache["train"] if train is None else train self.ensemble = ensemble # last step, no need to check self.set_training_params() @@ -758,8 +758,7 @@ def run(self): logger.info("Skipping algorithm generation...") # step 3: algo training - auto_train_choice = self.train is None - if self.train or (auto_train_choice and not self.cache["train"]): + if self.train: history = import_bundle_algo_history(self.work_dir, only_trained=False) if len(history) == 0: @@ -768,15 +767,10 @@ def run(self): "Possibly the required algorithms generation step was not completed." 
) - if auto_train_choice: - history = [h for h in history if not h["is_trained"]] # skip trained - - if len(history) > 0: - if not self.hpo: - self._train_algo_in_sequence(history) - else: - self._train_algo_in_nni(history) - + if not self.hpo: + self._train_algo_in_sequence(history) + else: + self._train_algo_in_nni(history) self.export_cache(train=True) else: logger.info("Skipping algorithm training...") @@ -804,4 +798,4 @@ def run(self): self.save_image(pred) logger.info(f"Auto3Dseg ensemble prediction outputs are saved in {self.output_dir}.") - logger.info("Auto3Dseg pipeline is completed successfully.") + logger.info("Auto3Dseg pipeline is complete successfully.") diff --git a/monai/apps/auto3dseg/utils.py b/monai/apps/auto3dseg/utils.py index feadc08808..67cde64a2c 100644 --- a/monai/apps/auto3dseg/utils.py +++ b/monai/apps/auto3dseg/utils.py @@ -47,14 +47,11 @@ def import_bundle_algo_history( if isinstance(algo, BundleAlgo): # algo's template path needs override algo.template_path = algo_meta_data["template_path"] - best_metrics = "best_metrics" - is_trained = best_metrics in algo_meta_data - if only_trained: - if is_trained: - history.append({name: algo, "is_trained": is_trained, best_metrics: algo_meta_data[best_metrics]}) + if "best_metrics" in algo_meta_data: + history.append({name: algo}) else: - history.append({name: algo, "is_trained": is_trained, best_metrics: algo_meta_data.get(best_metrics, None)}) + history.append({name: algo}) return history From 77f5e62d25d45f407cd11e298c5718f984242b2e Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 5 Apr 2023 14:10:45 +0100 Subject: [PATCH 02/16] workaround Signed-off-by: Wenqi Li --- tests/test_retinanet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_retinanet.py b/tests/test_retinanet.py index b43ad49dfd..17df14929f 100644 --- a/tests/test_retinanet.py +++ b/tests/test_retinanet.py @@ -97,7 +97,7 @@ TEST_CASES_TS.append([model, *case]) -@SkipIfBeforePyTorchVersion((1, 9)) 
+@SkipIfBeforePyTorchVersion((1, 10)) @unittest.skipUnless(has_torchvision, "Requires torchvision") class TestRetinaNet(unittest.TestCase): @parameterized.expand(TEST_CASES) From 8179e676f979b79c9b06631c29638c497eb6546f Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 5 Apr 2023 14:23:00 +0100 Subject: [PATCH 03/16] update temp tests Signed-off-by: Wenqi Li --- .github/workflows/integration.yml | 33 +++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 9894848a53..f28a36dec2 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -2,6 +2,9 @@ name: integration on: + push: + branches: + - temp-test repository_dispatch: type: [integration-test-command] @@ -9,8 +12,8 @@ jobs: integration-py3: container: image: nvcr.io/nvidia/pytorch:22.04-py3 # CUDA 11.6 py38 - options: --gpus all # shm-size 4g works fine - runs-on: [self-hosted, linux, x64, integration] + options: --gpus all --ipc host # shm-size 4g works fine + runs-on: [self-hosted, linux, x64, command] steps: # checkout the pull request branch - uses: actions/checkout@v3 @@ -34,7 +37,7 @@ jobs: run: | which python python -m pip install --upgrade pip wheel - python -m pip install --upgrade torch torchvision + python -m pip install --upgrade torch torchvision torchaudio python -m pip install -r requirements-dev.txt rm -rf /github/home/.cache/torch/hub/mmars/ - name: Run integration tests @@ -49,8 +52,30 @@ jobs: python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))' + + # test auto3dseg + python -m tests.test_auto3dseg_ensemble + python -m tests.test_auto3dseg_hpo + python -m 
tests.test_integration_autorunner + python -m tests.test_integration_gpu_customization + + # test latest template + cd ../ + git clone --depth 1 --branch main --single-branch https://github.com/Project-MONAI/research-contributions.git + ls research-contributions/ + cp -r research-contributions/auto3dseg/algorithm_templates ../MONAI/ + cd research-contributions && git log -1 && cd .. + export OMP_NUM_THREADS=4 + export MKL_NUM_THREADS=4 + export MONAI_TESTING_ALGO_TEMPLATE=algorithm_templates + python -m tests.test_auto3dseg_ensemble + python -m tests.test_auto3dseg_hpo + python -m tests.test_integration_autorunner + python -m tests.test_integration_gpu_customization + + # the other tests BUILD_MONAI=1 ./runtests.sh --build --net - BUILD_MONAI=1 ./runtests.sh --build --unittests --disttests + BUILD_MONAI=1 ./runtests.sh --build --unittests if pgrep python; then pkill python; fi shell: bash - name: Add reaction From 4770a8abf0108f70e3d6618aa8b35a3e84b821a9 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 5 Apr 2023 14:39:19 +0100 Subject: [PATCH 04/16] single gpu test Signed-off-by: Wenqi Li --- .github/workflows/integration.yml | 2 +- tests/utils.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index f28a36dec2..7c27db2d74 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -46,7 +46,7 @@ jobs: git config --global --add safe.directory /__w/MONAI/MONAI git clean -ffdx nvidia-smi - export CUDA_VISIBLE_DEVICES=$(python -m tests.utils | tail -n 1) + export CUDA_VISIBLE_DEVICES=$(python -m tests.utils 1 | tail -n 1) echo $CUDA_VISIBLE_DEVICES trap 'if pgrep python; then pkill python; fi;' ERR python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & diff --git a/tests/utils.py b/tests/utils.py index d195bd2ac5..2576997031 100644 --- a/tests/utils.py +++ 
b/tests/utils.py @@ -11,6 +11,7 @@ from __future__ import annotations +import argparse import copy import datetime import functools @@ -842,5 +843,8 @@ def command_line_tests(cmd, copy_env=True): TEST_DEVICES.append([torch.device("cuda")]) if __name__ == "__main__": - print("\n", query_memory(), sep="\n") # print to stdout + parser = argparse.ArgumentParser(prog="util") + parser.add_argument("-c", "--count", default=2, help="max number of gpus") + args = parser.parse_args() + print("\n", query_memory(args.count), sep="\n") # print to stdout sys.exit(0) From cfc7f20f34cedafced4bec02c3f0c8b703ee569c Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 5 Apr 2023 14:44:08 +0100 Subject: [PATCH 05/16] update Signed-off-by: Wenqi Li --- .github/workflows/integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 7c27db2d74..41eb6ac632 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -46,7 +46,7 @@ jobs: git config --global --add safe.directory /__w/MONAI/MONAI git clean -ffdx nvidia-smi - export CUDA_VISIBLE_DEVICES=$(python -m tests.utils 1 | tail -n 1) + export CUDA_VISIBLE_DEVICES=$(python -m tests.utils -c 1 | tail -n 1) echo $CUDA_VISIBLE_DEVICES trap 'if pgrep python; then pkill python; fi;' ERR python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & From 8b1d38defe0d15199de7837bf86921844b5e612f Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 5 Apr 2023 14:55:44 +0100 Subject: [PATCH 06/16] update Signed-off-by: Wenqi Li --- .github/workflows/integration.yml | 1 + monai/data/utils.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 41eb6ac632..314803c39c 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -46,6 +46,7 @@ 
jobs: git config --global --add safe.directory /__w/MONAI/MONAI git clean -ffdx nvidia-smi + python -m tests.utils -c 1 export CUDA_VISIBLE_DEVICES=$(python -m tests.utils -c 1 | tail -n 1) echo $CUDA_VISIBLE_DEVICES trap 'if pgrep python; then pkill python; fi;' ERR diff --git a/monai/data/utils.py b/monai/data/utils.py index 2c035afb3f..ba9e44fa00 100644 --- a/monai/data/utils.py +++ b/monai/data/utils.py @@ -786,7 +786,6 @@ def rectify_header_sform_qform(img_nii): return img_nii norm = affine_to_spacing(img_nii.affine, r=d) - warnings.warn(f"Modifying image pixdim from {pixdim} to {norm}") img_nii.header.set_zooms(norm) return img_nii From d3cb6edc5d4065ae81638b06f5f9f020b0552b0c Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 5 Apr 2023 15:03:10 +0100 Subject: [PATCH 07/16] update Signed-off-by: Wenqi Li --- tests/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/utils.py b/tests/utils.py index 2576997031..b0fde7bfea 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -785,6 +785,7 @@ def query_memory(n=2): bash_string = "nvidia-smi --query-gpu=power.draw,temperature.gpu,memory.used --format=csv,noheader,nounits" try: + print(f"query memory with {n}") p1 = Popen(bash_string.split(), stdout=PIPE) output, error = p1.communicate() free_memory = [x.split(",") for x in output.decode("utf-8").split("\n")[:-1]] From f341655cd87d5b80711057a18de09ce1a7a090b0 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 5 Apr 2023 15:07:52 +0100 Subject: [PATCH 08/16] update Signed-off-by: Wenqi Li --- tests/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/utils.py b/tests/utils.py index b0fde7bfea..4c8cf89cc1 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -785,7 +785,7 @@ def query_memory(n=2): bash_string = "nvidia-smi --query-gpu=power.draw,temperature.gpu,memory.used --format=csv,noheader,nounits" try: - print(f"query memory with {n}") + print(f"query memory with n={n} {bash_string}") p1 = Popen(bash_string.split(), 
stdout=PIPE) output, error = p1.communicate() free_memory = [x.split(",") for x in output.decode("utf-8").split("\n")[:-1]] @@ -794,6 +794,7 @@ def query_memory(n=2): ids = np.lexsort(free_memory)[:n] except (TypeError, ValueError, IndexError, OSError): ids = range(n) if isinstance(n, int) else [] + print("query_memory", ids) return ",".join(f"{int(x)}" for x in ids) From 440febd750e8415e7366cc8bfaa66963998adac6 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 5 Apr 2023 15:12:17 +0100 Subject: [PATCH 09/16] update Signed-off-by: Wenqi Li --- tests/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/utils.py b/tests/utils.py index 4c8cf89cc1..194b4ce8b0 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -785,9 +785,11 @@ def query_memory(n=2): bash_string = "nvidia-smi --query-gpu=power.draw,temperature.gpu,memory.used --format=csv,noheader,nounits" try: - print(f"query memory with n={n} {bash_string}") + print(f"query memory with n={n}") p1 = Popen(bash_string.split(), stdout=PIPE) output, error = p1.communicate() + print(output) + print("error", error) free_memory = [x.split(",") for x in output.decode("utf-8").split("\n")[:-1]] free_memory = np.asarray(free_memory, dtype=float).T free_memory[1] += free_memory[0] # combine 0/1 column measures From 6fdbd927b6c825965edeacfc6f28916263c0c3eb Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 5 Apr 2023 15:19:37 +0100 Subject: [PATCH 10/16] update Signed-off-by: Wenqi Li --- tests/utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index 194b4ce8b0..99ab876244 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -788,15 +788,12 @@ def query_memory(n=2): print(f"query memory with n={n}") p1 = Popen(bash_string.split(), stdout=PIPE) output, error = p1.communicate() - print(output) - print("error", error) free_memory = [x.split(",") for x in output.decode("utf-8").split("\n")[:-1]] free_memory = np.asarray(free_memory, 
dtype=float).T free_memory[1] += free_memory[0] # combine 0/1 column measures ids = np.lexsort(free_memory)[:n] except (TypeError, ValueError, IndexError, OSError): ids = range(n) if isinstance(n, int) else [] - print("query_memory", ids) return ",".join(f"{int(x)}" for x in ids) @@ -850,5 +847,5 @@ def command_line_tests(cmd, copy_env=True): parser = argparse.ArgumentParser(prog="util") parser.add_argument("-c", "--count", default=2, help="max number of gpus") args = parser.parse_args() - print("\n", query_memory(args.count), sep="\n") # print to stdout + print("\n", query_memory(int(args.count)), sep="\n") # print to stdout sys.exit(0) From c4a2158a5d1d21dfaa13b533237c1545858c3828 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 5 Apr 2023 15:24:36 +0100 Subject: [PATCH 11/16] update Signed-off-by: Wenqi Li --- .github/workflows/integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 314803c39c..76ee9e4c70 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -46,7 +46,6 @@ jobs: git config --global --add safe.directory /__w/MONAI/MONAI git clean -ffdx nvidia-smi - python -m tests.utils -c 1 export CUDA_VISIBLE_DEVICES=$(python -m tests.utils -c 1 | tail -n 1) echo $CUDA_VISIBLE_DEVICES trap 'if pgrep python; then pkill python; fi;' ERR @@ -55,6 +54,7 @@ jobs: python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))' # test auto3dseg + BUILD_MONAI=0 ./runtests.sh --build python -m tests.test_auto3dseg_ensemble python -m tests.test_auto3dseg_hpo python -m tests.test_integration_autorunner From 04c3d77667602c00aba5a9dd4cd040a55912dcd5 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 5 Apr 2023 15:34:33 +0100 Subject: [PATCH 12/16] fixes Signed-off-by: Wenqi Li --- .github/workflows/integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration.yml 
b/.github/workflows/integration.yml index 76ee9e4c70..89111cbc47 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -49,7 +49,7 @@ jobs: export CUDA_VISIBLE_DEVICES=$(python -m tests.utils -c 1 | tail -n 1) echo $CUDA_VISIBLE_DEVICES trap 'if pgrep python; then pkill python; fi;' ERR - python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & + python -c $'import torch\na=[torch.zeros(1,device=f"cuda:{i}") for i in range(torch.cuda.device_count())];\nwhile True:print(a)' > /dev/null & python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))' From fa48db0e4afb36ed62880e56f69a9c63a6aea6e4 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 5 Apr 2023 15:38:22 +0100 Subject: [PATCH 13/16] remove temp Signed-off-by: Wenqi Li --- .github/workflows/integration.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 89111cbc47..6155e8ec38 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -2,9 +2,6 @@ name: integration on: - push: - branches: - - temp-test repository_dispatch: type: [integration-test-command] From 5fb53334665382ec9a7f3ec32562b693ebe07ecd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 5 Apr 2023 14:41:28 +0000 Subject: [PATCH 14/16] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- monai/data/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/monai/data/utils.py b/monai/data/utils.py index ba9e44fa00..d5dddb5d55 100644 --- a/monai/data/utils.py +++ b/monai/data/utils.py @@ -17,7 +17,6 @@ import math import os import pickle -import warnings from collections import abc, 
defaultdict from collections.abc import Generator, Iterable, Mapping, Sequence, Sized from copy import deepcopy From f8bdf0f157661ca9d123b7f1dbff10f96a6c7822 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 5 Apr 2023 19:01:26 +0100 Subject: [PATCH 15/16] update Signed-off-by: Wenqi Li --- .github/workflows/integration.yml | 2 +- tests/test_retinanet.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 6155e8ec38..456fa10c41 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -9,7 +9,7 @@ jobs: integration-py3: container: image: nvcr.io/nvidia/pytorch:22.04-py3 # CUDA 11.6 py38 - options: --gpus all --ipc host # shm-size 4g works fine + options: --gpus "device=0" --ipc host # shm-size 4g works fine runs-on: [self-hosted, linux, x64, command] steps: # checkout the pull request branch diff --git a/tests/test_retinanet.py b/tests/test_retinanet.py index 17df14929f..f143550b91 100644 --- a/tests/test_retinanet.py +++ b/tests/test_retinanet.py @@ -97,7 +97,7 @@ TEST_CASES_TS.append([model, *case]) -@SkipIfBeforePyTorchVersion((1, 10)) +@SkipIfBeforePyTorchVersion((1, 12)) @unittest.skipUnless(has_torchvision, "Requires torchvision") class TestRetinaNet(unittest.TestCase): @parameterized.expand(TEST_CASES) From c7952c2e5292a4e57cea367534adec799681aee2 Mon Sep 17 00:00:00 2001 From: Mingxin Zheng <18563433+mingxin-zheng@users.noreply.github.com> Date: Wed, 5 Apr 2023 11:30:00 -0400 Subject: [PATCH 16/16] Update ALGO_HASH (#6301) Fixes set_device issue and disclaimer - https://github.com/Project-MONAI/research-contributions/pull/216 - https://github.com/Project-MONAI/research-contributions/pull/212 ### Description Bumps the default `ALGO_HASH` (the value used when the `MONAI_ALGO_HASH` environment variable is not set) in `monai/apps/auto3dseg/bundle_gen.py` from `4af80e1` to `7758ad1`, to pick up the fixes from the pull requests linked above. ### Types of changes - [x] Non-breaking change (fix or new feature that would not break existing functionality). 
Signed-off-by: Mingxin Zheng <18563433+mingxin-zheng@users.noreply.github.com> --- monai/apps/auto3dseg/bundle_gen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monai/apps/auto3dseg/bundle_gen.py b/monai/apps/auto3dseg/bundle_gen.py index 8104a79d15..33a3afd07c 100644 --- a/monai/apps/auto3dseg/bundle_gen.py +++ b/monai/apps/auto3dseg/bundle_gen.py @@ -35,7 +35,7 @@ from monai.utils import ensure_tuple logger = get_logger(module_name=__name__) -ALGO_HASH = os.environ.get("MONAI_ALGO_HASH", "4af80e1") +ALGO_HASH = os.environ.get("MONAI_ALGO_HASH", "7758ad1") __all__ = ["BundleAlgo", "BundleGen"]