From fb296b6c28afb704e4346c9c61fb731e79b835b5 Mon Sep 17 00:00:00 2001 From: Yiheng Wang Date: Sat, 7 May 2022 14:49:22 +0800 Subject: [PATCH 01/12] reproduce issue Signed-off-by: Yiheng Wang --- .github/workflows/cron.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml index 08065147e5..7e755dbe47 100644 --- a/.github/workflows/cron.yml +++ b/.github/workflows/cron.yml @@ -5,6 +5,9 @@ on: # - cron: "0 2 * * *" # at 02:00 UTC # Allows you to run this workflow manually from the Actions tab workflow_dispatch: + push: + branches: + - 4234-fix-2204-nvfuser-issue jobs: cron-gpu: @@ -103,7 +106,6 @@ jobs: cron-pip: # pip install monai[all] and use it to run unit tests - if: github.repository == 'Project-MONAI/MONAI' strategy: matrix: container: ["pytorch:21.02", "pytorch:21.10", "pytorch:22.04"] # 21.02, 21.10 for backward comp. @@ -167,7 +169,7 @@ jobs: python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" python -m pip install -r requirements-dev.txt - PYTHONPATH="$tmp_dir":$PYTHONPATH BUILD_MONAI=1 python ./tests/runner.py -p 'test_((?!integration).)' # unit tests + PYTHONPATH="$tmp_dir":$PYTHONPATH BUILD_MONAI=1 python ./tests/test_dynunet.py # unit tests if pgrep python; then pkill python; fi cron-docker: From ce28bd8ad397ae98618675c6a3c5fab541c40675 Mon Sep 17 00:00:00 2001 From: Yiheng Wang Date: Sat, 7 May 2022 14:52:21 +0800 Subject: [PATCH 02/12] remove 22.01 02 Signed-off-by: Yiheng Wang --- .github/workflows/cron.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml index 7e755dbe47..4823259597 100644 --- a/.github/workflows/cron.yml +++ b/.github/workflows/cron.yml @@ -108,7 +108,7 @@ jobs: # pip install monai[all] and use it to run unit tests strategy: matrix: - container: ["pytorch:21.02", "pytorch:21.10", "pytorch:22.04"] # 21.02, 21.10 for backward comp. + container: ["pytorch:22.04"] # 21.02, 21.10 for backward comp. container: image: nvcr.io/nvidia/${{ matrix.container }}-py3 # testing with the latest pytorch base image options: "--gpus all" From d15d6c03b3fb106c9f3094a59402ec74e412bda6 Mon Sep 17 00:00:00 2001 From: Yiheng Wang Date: Sat, 7 May 2022 15:05:38 +0800 Subject: [PATCH 03/12] remove other workflows Signed-off-by: Yiheng Wang --- .github/workflows/cron.yml | 319 +++++++++++++++++++------------------ 1 file changed, 160 insertions(+), 159 deletions(-) diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml index 4823259597..f6c27565b6 100644 --- a/.github/workflows/cron.yml +++ b/.github/workflows/cron.yml @@ -10,102 +10,103 @@ on: - 4234-fix-2204-nvfuser-issue jobs: - cron-gpu: - if: github.repository == 'Project-MONAI/MONAI' - container: - image: nvcr.io/nvidia/pytorch:21.06-py3 # CUDA 11.3 - options: "--gpus all" - runs-on: [self-hosted, linux, x64, common] - strategy: - matrix: - pytorch-version: [1.7.1, 1.8.1, 1.9.1, 1.10.2, latest] - steps: - - uses: actions/checkout@v2 - - name: Install the dependencies - run: | - which python - python -m pip install --upgrade pip wheel - python -m pip uninstall -y torch torchvision - if [ ${{ matrix.pytorch-version }} == "latest" ]; then - python -m pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu113 - elif [ ${{ matrix.pytorch-version }} == "1.7.1" ]; then - python -m pip install torch==1.7.1 torchvision==0.8.2 --extra-index-url https://download.pytorch.org/whl/cu113 - elif [ ${{ matrix.pytorch-version }} == "1.8.1" ]; then - python -m pip install torch==1.8.1 torchvision==0.9.1 --extra-index-url https://download.pytorch.org/whl/cu113 - elif [ ${{ matrix.pytorch-version }} == "1.9.1" ]; then - python -m pip install torch==1.9.1 torchvision==0.10.1 --extra-index-url https://download.pytorch.org/whl/cu113 - elif [ ${{ matrix.pytorch-version }} == "1.10.2" ]; then - python -m pip install torch==1.10.2 torchvision==0.11.3 --extra-index-url https://download.pytorch.org/whl/cu113 - fi - python -m pip install -r requirements-dev.txt - python -m pip list - - name: Run tests report coverage - run: | - export LAUNCH_DELAY=$[ $RANDOM % 16 * 60 ] - echo "Sleep $LAUNCH_DELAY" - sleep $LAUNCH_DELAY - nvidia-smi - export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) - echo $CUDA_VISIBLE_DEVICES - trap 'if pgrep python; then pkill python; fi;' ERR - python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & - python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" - python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))' - BUILD_MONAI=1 ./runtests.sh --build --coverage --unittests --disttests # unit tests with coverage report - BUILD_MONAI=1 ./runtests.sh --build --coverage --net # integration tests with coverage report - coverage xml - if pgrep python; then pkill python; fi - - name: Upload coverage - uses: codecov/codecov-action@v1 - with: - fail_ci_if_error: false - file: ./coverage.xml + # cron-gpu: + # if: github.repository == 'Project-MONAI/MONAI' + # container: + # image: nvcr.io/nvidia/pytorch:21.06-py3 # CUDA 11.3 + # options: "--gpus all" + # runs-on: [self-hosted, linux, x64, common] + # strategy: + # matrix: + # pytorch-version: [1.7.1, 1.8.1, 1.9.1, 1.10.2, latest] + # steps: + # - uses: actions/checkout@v2 + # - name: Install the dependencies + # run: | + # which python + # python -m pip install --upgrade pip wheel + # python -m pip uninstall -y torch torchvision + # if [ ${{ matrix.pytorch-version }} == "latest" ]; then + # python -m pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu113 + # elif [ ${{ matrix.pytorch-version }} == "1.7.1" ]; then + # python -m pip install torch==1.7.1 torchvision==0.8.2 --extra-index-url https://download.pytorch.org/whl/cu113 + # elif [ ${{ matrix.pytorch-version }} == "1.8.1" ]; then + # python -m pip install torch==1.8.1 torchvision==0.9.1 --extra-index-url https://download.pytorch.org/whl/cu113 + # elif [ ${{ matrix.pytorch-version }} == "1.9.1" ]; then + # python -m pip install torch==1.9.1 torchvision==0.10.1 --extra-index-url https://download.pytorch.org/whl/cu113 + # elif [ ${{ matrix.pytorch-version }} == "1.10.2" ]; then + # python -m pip install torch==1.10.2 torchvision==0.11.3 --extra-index-url https://download.pytorch.org/whl/cu113 + # fi + # python -m pip install -r requirements-dev.txt + # python -m pip list + # - name: Run tests report coverage + # run: | + # export LAUNCH_DELAY=$[ $RANDOM % 16 * 60 ] + # echo "Sleep $LAUNCH_DELAY" + # sleep $LAUNCH_DELAY + # nvidia-smi + # export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) + # echo $CUDA_VISIBLE_DEVICES + # trap 'if pgrep python; then pkill python; fi;' ERR + # python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & + # python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" + # python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))' + # BUILD_MONAI=1 ./runtests.sh --build --coverage --unittests --disttests # unit tests with coverage report + # BUILD_MONAI=1 ./runtests.sh --build --coverage --net # integration tests with coverage report + # coverage xml + # if pgrep python; then pkill python; fi + # - name: Upload coverage + # uses: codecov/codecov-action@v1 + # with: + # fail_ci_if_error: false + # file: ./coverage.xml - cron-pt-image: - if: github.repository == 'Project-MONAI/MONAI' - strategy: - matrix: - container: ["pytorch:21.02", "pytorch:21.10", "pytorch:22.04"] # 21.02, 21.10 for backward comp. - container: - image: nvcr.io/nvidia/${{ matrix.container }}-py3 # testing with the latest pytorch base image - options: "--gpus all" - runs-on: [self-hosted, linux, x64, common] - steps: - - uses: actions/checkout@v2 - - name: Install APT dependencies - run: | - apt-get update - DEBIAN_FRONTEND="noninteractive" apt-get install -y libopenslide0 - - name: Install Python dependencies - run: | - which python - python -m pip install --upgrade pip wheel - python -m pip install -r requirements-dev.txt - python -m pip list - - name: Run tests report coverage - run: | - export LAUNCH_DELAY=$[ $RANDOM % 16 * 60 ] - echo "Sleep $LAUNCH_DELAY" - sleep $LAUNCH_DELAY - nvidia-smi - export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) - echo $CUDA_VISIBLE_DEVICES - trap 'if pgrep python; then pkill python; fi;' ERR - python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & - python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" - python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))' - BUILD_MONAI=1 ./runtests.sh --build --coverage --unittests --disttests # unit tests with coverage report - BUILD_MONAI=1 ./runtests.sh --build --coverage --net # integration tests with coverage report - coverage xml - if pgrep python; then pkill python; fi - - name: Upload coverage - uses: codecov/codecov-action@v1 - with: - fail_ci_if_error: false - file: ./coverage.xml + # cron-pt-image: + # if: github.repository == 'Project-MONAI/MONAI' + # strategy: + # matrix: + # container: ["pytorch:21.02", "pytorch:21.10", "pytorch:22.04"] # 21.02, 21.10 for backward comp. + # container: + # image: nvcr.io/nvidia/${{ matrix.container }}-py3 # testing with the latest pytorch base image + # options: "--gpus all" + # runs-on: [self-hosted, linux, x64, common] + # steps: + # - uses: actions/checkout@v2 + # - name: Install APT dependencies + # run: | + # apt-get update + # DEBIAN_FRONTEND="noninteractive" apt-get install -y libopenslide0 + # - name: Install Python dependencies + # run: | + # which python + # python -m pip install --upgrade pip wheel + # python -m pip install -r requirements-dev.txt + # python -m pip list + # - name: Run tests report coverage + # run: | + # export LAUNCH_DELAY=$[ $RANDOM % 16 * 60 ] + # echo "Sleep $LAUNCH_DELAY" + # sleep $LAUNCH_DELAY + # nvidia-smi + # export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) + # echo $CUDA_VISIBLE_DEVICES + # trap 'if pgrep python; then pkill python; fi;' ERR + # python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & + # python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" + # python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))' + # BUILD_MONAI=1 ./runtests.sh --build --coverage --unittests --disttests # unit tests with coverage report + # BUILD_MONAI=1 ./runtests.sh --build --coverage --net # integration tests with coverage report + # coverage xml + # if pgrep python; then pkill python; fi + # - name: Upload coverage + # uses: codecov/codecov-action@v1 + # with: + # fail_ci_if_error: false + # file: ./coverage.xml cron-pip: # pip install monai[all] and use it to run unit tests + if: github.repository == 'Project-MONAI/MONAI' strategy: matrix: container: ["pytorch:22.04"] # 21.02, 21.10 for backward comp. @@ -172,71 +173,71 @@ jobs: PYTHONPATH="$tmp_dir":$PYTHONPATH BUILD_MONAI=1 python ./tests/test_dynunet.py # unit tests if pgrep python; then pkill python; fi - cron-docker: - if: github.repository == 'Project-MONAI/MONAI' - container: - image: docker://projectmonai/monai:latest # this might be slow and has the pull count limitations - options: "--gpus all" - runs-on: [self-hosted, linux, x64, common] - steps: - - name: Run tests report coverage - # The docker image process has done the compilation. - # BUILD_MONAI=1 is necessary for triggering the USE_COMPILED flag. - run: | - cd /opt/monai - nvidia-smi - export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) - echo $CUDA_VISIBLE_DEVICES - trap 'if pgrep python; then pkill python; fi;' ERR - python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & - python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" - python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))' - ngc --version - BUILD_MONAI=1 ./runtests.sh --build --coverage --pytype --unittests --disttests # unit tests with pytype checks, coverage report - BUILD_MONAI=1 ./runtests.sh --build --coverage --net # integration tests with coverage report - coverage xml - if pgrep python; then pkill python; fi - - name: Upload coverage - uses: codecov/codecov-action@v1 - with: - fail_ci_if_error: false - file: ./coverage.xml + # cron-docker: + # if: github.repository == 'Project-MONAI/MONAI' + # container: + # image: docker://projectmonai/monai:latest # this might be slow and has the pull count limitations + # options: "--gpus all" + # runs-on: [self-hosted, linux, x64, common] + # steps: + # - name: Run tests report coverage + # # The docker image process has done the compilation. + # # BUILD_MONAI=1 is necessary for triggering the USE_COMPILED flag. + # run: | + # cd /opt/monai + # nvidia-smi + # export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) + # echo $CUDA_VISIBLE_DEVICES + # trap 'if pgrep python; then pkill python; fi;' ERR + # python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & + # python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" + # python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))' + # ngc --version + # BUILD_MONAI=1 ./runtests.sh --build --coverage --pytype --unittests --disttests # unit tests with pytype checks, coverage report + # BUILD_MONAI=1 ./runtests.sh --build --coverage --net # integration tests with coverage report + # coverage xml + # if pgrep python; then pkill python; fi + # - name: Upload coverage + # uses: codecov/codecov-action@v1 + # with: + # fail_ci_if_error: false + # file: ./coverage.xml - cron-tutorial-notebooks: - if: github.repository == 'Project-MONAI/MONAI' - needs: cron-gpu # so that monai itself is verified first - container: - image: nvcr.io/nvidia/pytorch:22.04-py3 # testing with the latest pytorch base image - options: "--gpus all --ipc=host" - runs-on: [self-hosted, linux, x64, common] - steps: - - uses: actions/checkout@v2 - - name: Install MONAI - id: monai-install - run: | - which python - python -m pip install --upgrade pip wheel - python -m pip install -r requirements-dev.txt - BUILD_MONAI=1 python setup.py develop # install monai - nvidia-smi - export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) - echo $CUDA_VISIBLE_DEVICES - echo "::set-output name=devices::$CUDA_VISIBLE_DEVICES" - - name: Checkout tutorials and install their requirements - run: | - cd /opt - git clone --depth 1 --branch master --single-branch https://github.com/Project-MONAI/tutorials.git # latest commit of master branch - cd tutorials - python -m pip install -r requirements.txt - - name: Run tutorial notebooks - timeout-minutes: 150 - run: | - export CUDA_VISIBLE_DEVICES=${{ steps.monai-install.outputs.devices }} - echo $CUDA_VISIBLE_DEVICES - trap 'if pgrep python; then pkill python; fi;' ERR - python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & - cd /opt/tutorials - python -c 'import monai; monai.config.print_debug_info()' - $(pwd)/runner.sh - python -c 'import monai; monai.config.print_debug_info()' - if pgrep python; then pkill python; fi + # cron-tutorial-notebooks: + # if: github.repository == 'Project-MONAI/MONAI' + # needs: cron-gpu # so that monai itself is verified first + # container: + # image: nvcr.io/nvidia/pytorch:22.04-py3 # testing with the latest pytorch base image + # options: "--gpus all --ipc=host" + # runs-on: [self-hosted, linux, x64, common] + # steps: + # - uses: actions/checkout@v2 + # - name: Install MONAI + # id: monai-install + # run: | + # which python + # python -m pip install --upgrade pip wheel + # python -m pip install -r requirements-dev.txt + # BUILD_MONAI=1 python setup.py develop # install monai + # nvidia-smi + # export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) + # echo $CUDA_VISIBLE_DEVICES + # echo "::set-output name=devices::$CUDA_VISIBLE_DEVICES" + # - name: Checkout tutorials and install their requirements + # run: | + # cd /opt + # git clone --depth 1 --branch master --single-branch https://github.com/Project-MONAI/tutorials.git # latest commit of master branch + # cd tutorials + # python -m pip install -r requirements.txt + # - name: Run tutorial notebooks + # timeout-minutes: 150 + # run: | + # export CUDA_VISIBLE_DEVICES=${{ steps.monai-install.outputs.devices }} + # echo $CUDA_VISIBLE_DEVICES + # trap 'if pgrep python; then pkill python; fi;' ERR + # python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & + # cd /opt/tutorials + # python -c 'import monai; monai.config.print_debug_info()' + # $(pwd)/runner.sh + # python -c 'import monai; monai.config.print_debug_info()' + # if pgrep python; then pkill python; fi From 72e9a2f3c463c123b0d02ead3be641a29c380b3a Mon Sep 17 00:00:00 2001 From: Yiheng Wang Date: Sat, 7 May 2022 15:39:18 +0800 Subject: [PATCH 04/12] run on pull request Signed-off-by: Yiheng Wang --- .github/workflows/cron.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml index f6c27565b6..409b1ac5f7 100644 --- a/.github/workflows/cron.yml +++ b/.github/workflows/cron.yml @@ -5,9 +5,7 @@ on: # - cron: "0 2 * * *" # at 02:00 UTC # Allows you to run this workflow manually from the Actions tab workflow_dispatch: - push: - branches: - - 4234-fix-2204-nvfuser-issue + pull_request: jobs: # cron-gpu: From 4a04ab4e69ce492a223b7b7e6535c25884bfa6c0 Mon Sep 17 00:00:00 2001 From: Yiheng Wang Date: Sat, 7 May 2022 15:50:11 +0800 Subject: [PATCH 05/12] remove sleep Signed-off-by: Yiheng Wang --- .github/workflows/cron.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml index 409b1ac5f7..8c7386b6d7 100644 --- a/.github/workflows/cron.yml +++ b/.github/workflows/cron.yml @@ -158,8 +158,8 @@ jobs: ls -al export LAUNCH_DELAY=$[ $RANDOM % 16 * 60 ] - echo "Sleep $LAUNCH_DELAY" - sleep $LAUNCH_DELAY + # echo "Sleep $LAUNCH_DELAY" + # sleep $LAUNCH_DELAY nvidia-smi export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) echo $CUDA_VISIBLE_DEVICES From 281a9eab71d725f4decb3ff01b36c6dad68f6ab5 Mon Sep 17 00:00:00 2001 From: Yiheng Wang Date: Tue, 10 May 2022 22:29:20 +0800 Subject: [PATCH 06/12] test single layer forward Signed-off-by: Yiheng Wang --- tests/test_dynunet.py | 43 +++++++++++++++++++++++-------------------- tests/utils.py | 22 ++++++++++++++++++++++ 2 files changed, 45 insertions(+), 20 deletions(-) diff --git a/tests/test_dynunet.py b/tests/test_dynunet.py index 14006b96e6..9ad8cf6217 100644 --- a/tests/test_dynunet.py +++ b/tests/test_dynunet.py @@ -20,7 +20,7 @@ from monai.utils import optional_import from tests.utils import skip_if_no_cuda, skip_if_windows, test_script_save -_, has_nvfuser = optional_import("apex.normalization", name="InstanceNorm3dNVFuser") +InstanceNorm3dNVFuser, has_nvfuser = optional_import("apex.normalization", name="InstanceNorm3dNVFuser") device = "cuda" if torch.cuda.is_available() else "cpu" @@ -127,25 +127,28 @@ def test_script(self): class TestDynUNetWithInstanceNorm3dNVFuser(unittest.TestCase): @parameterized.expand([TEST_CASE_DYNUNET_3D[0]]) def test_consistency(self, input_param, input_shape, _): - for eps in [1e-4, 1e-5]: - for momentum in [0.1, 0.01]: - for affine in [True, False]: - norm_param = {"eps": eps, "momentum": momentum, "affine": affine} - input_param["norm_name"] = ("instance", norm_param) - input_param_fuser = input_param.copy() - input_param_fuser["norm_name"] = ("instance_nvfuser", norm_param) - for memory_format in [torch.contiguous_format, torch.channels_last_3d]: - net = DynUNet(**input_param).to("cuda:0", memory_format=memory_format) - net_fuser = DynUNet(**input_param_fuser).to("cuda:0", memory_format=memory_format) - net_fuser.load_state_dict(net.state_dict()) - - input_tensor = torch.randn(input_shape).to("cuda:0", memory_format=memory_format) - with eval_mode(net): - result = net(input_tensor) - with eval_mode(net_fuser): - result_fuser = net_fuser(input_tensor) - - torch.testing.assert_close(result, result_fuser) + layer = InstanceNorm3dNVFuser(num_features=1, affine=True).to("cuda:0") + inp = torch.randn([1, 1, 1, 1, 1]).to("cuda:0") + out = layer(inp) + # for eps in [1e-4, 1e-5]: + # for momentum in [0.1, 0.01]: + # for affine in [True, False]: + # norm_param = {"eps": eps, "momentum": momentum, "affine": affine} + # input_param["norm_name"] = ("instance", norm_param) + # input_param_fuser = input_param.copy() + # input_param_fuser["norm_name"] = ("instance_nvfuser", norm_param) + # for memory_format in [torch.contiguous_format, torch.channels_last_3d]: + # net = DynUNet(**input_param).to("cuda:0", memory_format=memory_format) + # net_fuser = DynUNet(**input_param_fuser).to("cuda:0", memory_format=memory_format) + # net_fuser.load_state_dict(net.state_dict()) + + # input_tensor = torch.randn(input_shape).to("cuda:0", memory_format=memory_format) + # with eval_mode(net): + # result = net(input_tensor) + # with eval_mode(net_fuser): + # result_fuser = net_fuser(input_tensor) + + # torch.testing.assert_close(result, result_fuser) class TestDynUNetDeepSupervision(unittest.TestCase): diff --git a/tests/utils.py b/tests/utils.py index 1a547fc2d2..f0a3f7d716 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -278,6 +278,28 @@ def has_cupy(): HAS_CUPY = has_cupy() +# def has_nvfuser(): +# """ +# Returns True if the user has installed a proper version of apex that contains `normalization.InstanceNorm3dNVFuser`. +# """ +# InstanceNorm3dNVFuser, has_nvfuser = optional_import("apex.normalization", name="InstanceNorm3dNVFuser") +# if not has_nvfuser: +# return False +# if not torch.cuda.is_available(): +# return False +# try: # test nvfuser installation with a basic example +# layer = InstanceNorm3dNVFuser(num_features=1, affine=True).to("cuda:0") +# inp = torch.randn([1, 1, 1, 1, 1]).to("cuda:0") +# out = layer(inp) +# del inp, out +# return True +# except Exception: +# return False + + +# HAS_NVFUSER = has_nvfuser() + + def make_nifti_image(array: NdarrayOrTensor, affine=None, dir=None, fname=None, suffix=".nii.gz", verbose=False): """ Create a temporary nifti image on the disk and return the image name. From 7db3530c70d414ea0a3a32dfa65e47b4e6ee3a8b Mon Sep 17 00:00:00 2001 From: Yiheng Wang Date: Tue, 10 May 2022 22:51:50 +0800 Subject: [PATCH 07/12] add has_nvfuser Signed-off-by: Yiheng Wang --- tests/test_dynunet.py | 48 +++++++++++++++++++------------------------ tests/utils.py | 40 ++++++++++++++++++------------------ 2 files changed, 41 insertions(+), 47 deletions(-) diff --git a/tests/test_dynunet.py b/tests/test_dynunet.py index 9ad8cf6217..d06892602f 100644 --- a/tests/test_dynunet.py +++ b/tests/test_dynunet.py @@ -17,10 +17,7 @@ from monai.networks import eval_mode from monai.networks.nets import DynUNet -from monai.utils import optional_import -from tests.utils import skip_if_no_cuda, skip_if_windows, test_script_save - -InstanceNorm3dNVFuser, has_nvfuser = optional_import("apex.normalization", name="InstanceNorm3dNVFuser") +from tests.utils import HAS_NVFUSER, skip_if_no_cuda, skip_if_windows, test_script_save device = "cuda" if torch.cuda.is_available() else "cpu" @@ -123,32 +120,29 @@ def test_script(self): @skip_if_no_cuda @skip_if_windows -@unittest.skipUnless(has_nvfuser, "To use `instance_nvfuser`, `apex.normalization.InstanceNorm3dNVFuser` is needed.") +@unittest.skipUnless(HAS_NVFUSER, "To use `instance_nvfuser`, `apex.normalization.InstanceNorm3dNVFuser` is needed.") class TestDynUNetWithInstanceNorm3dNVFuser(unittest.TestCase): @parameterized.expand([TEST_CASE_DYNUNET_3D[0]]) def test_consistency(self, input_param, input_shape, _): - layer = InstanceNorm3dNVFuser(num_features=1, affine=True).to("cuda:0") - inp = torch.randn([1, 1, 1, 1, 1]).to("cuda:0") - out = layer(inp) - # for eps in [1e-4, 1e-5]: - # for momentum in [0.1, 0.01]: - # for affine in [True, False]: - # norm_param = {"eps": eps, "momentum": momentum, "affine": affine} - # input_param["norm_name"] = ("instance", norm_param) - # input_param_fuser = input_param.copy() - # input_param_fuser["norm_name"] = ("instance_nvfuser", norm_param) - # for memory_format in [torch.contiguous_format, torch.channels_last_3d]: - # net = DynUNet(**input_param).to("cuda:0", memory_format=memory_format) - # net_fuser = DynUNet(**input_param_fuser).to("cuda:0", memory_format=memory_format) - # net_fuser.load_state_dict(net.state_dict()) - - # input_tensor = torch.randn(input_shape).to("cuda:0", memory_format=memory_format) - # with eval_mode(net): - # result = net(input_tensor) - # with eval_mode(net_fuser): - # result_fuser = net_fuser(input_tensor) - - # torch.testing.assert_close(result, result_fuser) + for eps in [1e-4, 1e-5]: + for momentum in [0.1, 0.01]: + for affine in [True, False]: + norm_param = {"eps": eps, "momentum": momentum, "affine": affine} + input_param["norm_name"] = ("instance", norm_param) + input_param_fuser = input_param.copy() + input_param_fuser["norm_name"] = ("instance_nvfuser", norm_param) + for memory_format in [torch.contiguous_format, torch.channels_last_3d]: + net = DynUNet(**input_param).to("cuda:0", memory_format=memory_format) + net_fuser = DynUNet(**input_param_fuser).to("cuda:0", memory_format=memory_format) + net_fuser.load_state_dict(net.state_dict()) + + input_tensor = torch.randn(input_shape).to("cuda:0", memory_format=memory_format) + with eval_mode(net): + result = net(input_tensor) + with eval_mode(net_fuser): + result_fuser = net_fuser(input_tensor) + + torch.testing.assert_close(result, result_fuser) class TestDynUNetDeepSupervision(unittest.TestCase): diff --git a/tests/utils.py b/tests/utils.py index f0a3f7d716..6e5fa77c02 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -278,26 +278,26 @@ def has_cupy(): HAS_CUPY = has_cupy() -# def has_nvfuser(): -# """ -# Returns True if the user has installed a proper version of apex that contains `normalization.InstanceNorm3dNVFuser`. -# """ -# InstanceNorm3dNVFuser, has_nvfuser = optional_import("apex.normalization", name="InstanceNorm3dNVFuser") -# if not has_nvfuser: -# return False -# if not torch.cuda.is_available(): -# return False -# try: # test nvfuser installation with a basic example -# layer = InstanceNorm3dNVFuser(num_features=1, affine=True).to("cuda:0") -# inp = torch.randn([1, 1, 1, 1, 1]).to("cuda:0") -# out = layer(inp) -# del inp, out -# return True -# except Exception: -# return False - - -# HAS_NVFUSER = has_nvfuser() +def has_nvfuser(): + """ + Returns True if the user has installed a proper version of apex that contains `normalization.InstanceNorm3dNVFuser`. + """ + instancenorm_3dnvfuser, has_nvfuser = optional_import("apex.normalization", name="InstanceNorm3dNVFuser") + if not has_nvfuser: + return False + if not torch.cuda.is_available(): + return False + try: # test nvfuser installation with a basic example + layer = instancenorm_3dnvfuser(num_features=1, affine=True).to("cuda:0") + inp = torch.randn([1, 1, 1, 1, 1]).to("cuda:0") + out = layer(inp) + del inp, out + return True + except Exception: + return False + + +HAS_NVFUSER = has_nvfuser() def make_nifti_image(array: NdarrayOrTensor, affine=None, dir=None, fname=None, suffix=".nii.gz", verbose=False): From 4e84f691d7b65a857f7c8eab9162f0585b6061fa Mon Sep 17 00:00:00 2001 From: Yiheng Wang Date: Tue, 10 May 2022 23:10:25 +0800 Subject: [PATCH 08/12] add check within factory Signed-off-by: Yiheng Wang --- monai/networks/layers/factories.py | 19 +++++++++++++++++-- tests/test_dynunet.py | 3 +-- tests/utils.py | 22 ---------------------- 3 files changed, 18 insertions(+), 26 deletions(-) diff --git a/monai/networks/layers/factories.py b/monai/networks/layers/factories.py index b808c24de0..58a04b122b 100644 --- a/monai/networks/layers/factories.py +++ b/monai/networks/layers/factories.py @@ -63,12 +63,14 @@ def use_factory(fact_args): import warnings from typing import Any, Callable, Dict, Tuple, Type, Union +import torch import torch.nn as nn from monai.utils import look_up_option, optional_import InstanceNorm3dNVFuser, has_nvfuser = optional_import("apex.normalization", name="InstanceNorm3dNVFuser") + __all__ = ["LayerFactory", "Dropout", "Norm", "Act", "Conv", "Pool", "Pad", "split_args"] @@ -263,8 +265,21 @@ def instance_nvfuser_factory(dim): if dim != 3: warnings.warn(f"`InstanceNorm3dNVFuser` only supports 3d cases, use {types[dim - 1]} instead.") return types[dim - 1] - if not has_nvfuser: - warnings.warn("`apex.normalization.InstanceNorm3dNVFuser` is not found, use nn.InstanceNorm3d instead.") + # test InstanceNorm3dNVFuser installation with a basic example + has_nvfuser_flag = has_nvfuser + if not torch.cuda.is_available(): + has_nvfuser_flag = False + try: + layer = InstanceNorm3dNVFuser(num_features=1, affine=True).to("cuda:0") + inp = torch.randn([1, 1, 1, 1, 1]).to("cuda:0") + out = layer(inp) + del inp, out, layer + except Exception: + has_nvfuser_flag = False + if not has_nvfuser_flag: + warnings.warn( + "`apex.normalization.InstanceNorm3dNVFuser` is not installed properly, use nn.InstanceNorm3d instead." + ) return nn.InstanceNorm3d return InstanceNorm3dNVFuser diff --git a/tests/test_dynunet.py b/tests/test_dynunet.py index d06892602f..a35ee36c95 100644 --- a/tests/test_dynunet.py +++ b/tests/test_dynunet.py @@ -17,7 +17,7 @@ from monai.networks import eval_mode from monai.networks.nets import DynUNet -from tests.utils import HAS_NVFUSER, skip_if_no_cuda, skip_if_windows, test_script_save +from tests.utils import skip_if_no_cuda, skip_if_windows, test_script_save device = "cuda" if torch.cuda.is_available() else "cpu" @@ -120,7 +120,6 @@ def test_script(self): @skip_if_no_cuda @skip_if_windows -@unittest.skipUnless(HAS_NVFUSER, "To use `instance_nvfuser`, `apex.normalization.InstanceNorm3dNVFuser` is needed.") class TestDynUNetWithInstanceNorm3dNVFuser(unittest.TestCase): @parameterized.expand([TEST_CASE_DYNUNET_3D[0]]) def test_consistency(self, input_param, input_shape, _): diff --git a/tests/utils.py b/tests/utils.py index 6e5fa77c02..1a547fc2d2 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -278,28 +278,6 @@ def has_cupy(): HAS_CUPY = has_cupy() -def has_nvfuser(): - """ - Returns True if the user has installed a proper version of apex that contains `normalization.InstanceNorm3dNVFuser`. - """ - instancenorm_3dnvfuser, has_nvfuser = optional_import("apex.normalization", name="InstanceNorm3dNVFuser") - if not has_nvfuser: - return False - if not torch.cuda.is_available(): - return False - try: # test nvfuser installation with a basic example - layer = instancenorm_3dnvfuser(num_features=1, affine=True).to("cuda:0") - inp = torch.randn([1, 1, 1, 1, 1]).to("cuda:0") - out = layer(inp) - del inp, out - return True - except Exception: - return False - - -HAS_NVFUSER = has_nvfuser() - - def make_nifti_image(array: NdarrayOrTensor, affine=None, dir=None, fname=None, suffix=".nii.gz", verbose=False): """ Create a temporary nifti image on the disk and return the image name. From 0a5be1a7fcd7ec709ec87c453519f4f5501f0850 Mon Sep 17 00:00:00 2001 From: Yiheng Wang Date: Wed, 11 May 2022 11:46:49 +0800 Subject: [PATCH 09/12] revert to original cron.yml Signed-off-by: Yiheng Wang --- .github/workflows/cron.yml | 327 ++++++++++++++++++------------------- 1 file changed, 163 insertions(+), 164 deletions(-) diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml index 8c7386b6d7..4ef8652e97 100644 --- a/.github/workflows/cron.yml +++ b/.github/workflows/cron.yml @@ -5,109 +5,108 @@ on: # - cron: "0 2 * * *" # at 02:00 UTC # Allows you to run this workflow manually from the Actions tab workflow_dispatch: - pull_request: jobs: - # cron-gpu: - # if: github.repository == 'Project-MONAI/MONAI' - # container: - # image: nvcr.io/nvidia/pytorch:21.06-py3 # CUDA 11.3 - # options: "--gpus all" - # runs-on: [self-hosted, linux, x64, common] - # strategy: - # matrix: - # pytorch-version: [1.7.1, 1.8.1, 1.9.1, 1.10.2, latest] - # steps: - # - uses: actions/checkout@v2 - # - name: Install the dependencies - # run: | - # which python - # python -m pip install --upgrade pip wheel - # python -m pip uninstall -y torch torchvision - # if [ ${{ matrix.pytorch-version }} == "latest" ]; then - # python -m pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu113 - # elif [ ${{ matrix.pytorch-version }} == "1.7.1" ]; then - # python -m pip install torch==1.7.1 torchvision==0.8.2 --extra-index-url https://download.pytorch.org/whl/cu113 - # elif [ ${{ matrix.pytorch-version }} == "1.8.1" ]; then - # python -m pip install torch==1.8.1 torchvision==0.9.1 --extra-index-url https://download.pytorch.org/whl/cu113 - # elif [ ${{ matrix.pytorch-version }} == "1.9.1" ]; then - # python -m pip install torch==1.9.1 torchvision==0.10.1 --extra-index-url https://download.pytorch.org/whl/cu113 - # elif [ ${{ matrix.pytorch-version }} == "1.10.2" ]; then - # python -m pip install torch==1.10.2 torchvision==0.11.3 --extra-index-url https://download.pytorch.org/whl/cu113 - # fi - # python -m pip install -r requirements-dev.txt - # python -m pip list - # - name: Run tests report coverage - # run: | - # export LAUNCH_DELAY=$[ $RANDOM % 16 * 60 ] - # echo "Sleep $LAUNCH_DELAY" - # sleep $LAUNCH_DELAY - # nvidia-smi - # export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) - # echo $CUDA_VISIBLE_DEVICES - # trap 'if pgrep python; then pkill python; fi;' ERR - # python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & - # python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" - # python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))' - # BUILD_MONAI=1 ./runtests.sh --build --coverage --unittests --disttests # unit tests with coverage report - # BUILD_MONAI=1 ./runtests.sh --build --coverage --net # integration tests with coverage report - # coverage xml - # if pgrep python; then pkill python; fi - # - name: Upload coverage - # uses: codecov/codecov-action@v1 - # with: - # fail_ci_if_error: false - # file: ./coverage.xml + cron-gpu: + if: github.repository == 'Project-MONAI/MONAI' + container: + image: nvcr.io/nvidia/pytorch:21.06-py3 # CUDA 11.3 + options: "--gpus all" + runs-on: [self-hosted, linux, x64, common] + strategy: + matrix: + pytorch-version: [1.7.1, 1.8.1, 1.9.1, 1.10.2, latest] + steps: + - uses: actions/checkout@v2 + - name: Install the dependencies + run: | + which python + python -m pip install --upgrade pip wheel + python -m pip uninstall -y torch torchvision + if [ ${{ matrix.pytorch-version }} == "latest" ]; then + python -m pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu113 + elif [ ${{ matrix.pytorch-version }} == "1.7.1" ]; then + python -m pip install torch==1.7.1 torchvision==0.8.2 --extra-index-url https://download.pytorch.org/whl/cu113 + elif [ ${{ matrix.pytorch-version }} == "1.8.1" ]; then + python -m pip install torch==1.8.1 torchvision==0.9.1 --extra-index-url https://download.pytorch.org/whl/cu113 + elif [ ${{ matrix.pytorch-version }} == "1.9.1" ]; then + python -m pip install torch==1.9.1 torchvision==0.10.1 --extra-index-url https://download.pytorch.org/whl/cu113 + elif [ ${{ matrix.pytorch-version }} == "1.10.2" ]; then + python -m pip install torch==1.10.2 torchvision==0.11.3 --extra-index-url https://download.pytorch.org/whl/cu113 + fi + python -m pip install -r requirements-dev.txt + python -m pip list + - name: Run tests report coverage + run: | + export LAUNCH_DELAY=$[ $RANDOM % 16 * 60 ] + echo "Sleep $LAUNCH_DELAY" + sleep $LAUNCH_DELAY + nvidia-smi + export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) + echo $CUDA_VISIBLE_DEVICES + trap 'if pgrep python; then pkill python; fi;' ERR + python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & + python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" + python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))' + BUILD_MONAI=1 ./runtests.sh --build --coverage --unittests --disttests # unit tests with coverage report + BUILD_MONAI=1 ./runtests.sh --build --coverage --net # integration tests with coverage report + coverage xml + if pgrep python; then pkill python; fi + - name: Upload coverage + uses: codecov/codecov-action@v1 + with: + fail_ci_if_error: false + file: ./coverage.xml - # cron-pt-image: - # if: github.repository == 'Project-MONAI/MONAI' - # strategy: - # matrix: - # container: ["pytorch:21.02", "pytorch:21.10", "pytorch:22.04"] # 21.02, 21.10 for backward comp. - # container: - # image: nvcr.io/nvidia/${{ matrix.container }}-py3 # testing with the latest pytorch base image - # options: "--gpus all" - # runs-on: [self-hosted, linux, x64, common] - # steps: - # - uses: actions/checkout@v2 - # - name: Install APT dependencies - # run: | - # apt-get update - # DEBIAN_FRONTEND="noninteractive" apt-get install -y libopenslide0 - # - name: Install Python dependencies - # run: | - # which python - # python -m pip install --upgrade pip wheel - # python -m pip install -r requirements-dev.txt - # python -m pip list - # - name: Run tests report coverage - # run: | - # export LAUNCH_DELAY=$[ $RANDOM % 16 * 60 ] - # echo "Sleep $LAUNCH_DELAY" - # sleep $LAUNCH_DELAY - # nvidia-smi - # export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) - # echo $CUDA_VISIBLE_DEVICES - # trap 'if pgrep python; then pkill python; fi;' ERR - # python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & - # python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" - # python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))' - # BUILD_MONAI=1 ./runtests.sh --build --coverage --unittests --disttests # unit tests with coverage report - # BUILD_MONAI=1 ./runtests.sh --build --coverage --net # integration tests with coverage report - # coverage xml - # if pgrep python; then pkill python; fi - # - name: Upload coverage - # uses: codecov/codecov-action@v1 - # with: - # fail_ci_if_error: false - # file: ./coverage.xml + cron-pt-image: + if: github.repository == 'Project-MONAI/MONAI' + strategy: + matrix: + container: ["pytorch:21.02", "pytorch:21.10", "pytorch:22.04"] # 21.02, 21.10 for backward comp. + container: + image: nvcr.io/nvidia/${{ matrix.container }}-py3 # testing with the latest pytorch base image + options: "--gpus all" + runs-on: [self-hosted, linux, x64, common] + steps: + - uses: actions/checkout@v2 + - name: Install APT dependencies + run: | + apt-get update + DEBIAN_FRONTEND="noninteractive" apt-get install -y libopenslide0 + - name: Install Python dependencies + run: | + which python + python -m pip install --upgrade pip wheel + python -m pip install -r requirements-dev.txt + python -m pip list + - name: Run tests report coverage + run: | + export LAUNCH_DELAY=$[ $RANDOM % 16 * 60 ] + echo "Sleep $LAUNCH_DELAY" + sleep $LAUNCH_DELAY + nvidia-smi + export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) + echo $CUDA_VISIBLE_DEVICES + trap 'if pgrep python; then pkill python; fi;' ERR + python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & + python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" + python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))' + BUILD_MONAI=1 ./runtests.sh --build --coverage --unittests --disttests # unit tests with coverage report + BUILD_MONAI=1 ./runtests.sh --build --coverage --net # integration tests with coverage report + coverage xml + if pgrep python; then pkill python; fi + - name: Upload coverage + uses: codecov/codecov-action@v1 + with: + fail_ci_if_error: false + file: ./coverage.xml cron-pip: # pip install monai[all] and use it to run unit tests if: github.repository == 'Project-MONAI/MONAI' strategy: matrix: - container: ["pytorch:22.04"] # 21.02, 21.10 for backward comp. + container: ["pytorch:21.02", "pytorch:21.10", "pytorch:22.04"] # 21.02, 21.10 for backward comp. container: image: nvcr.io/nvidia/${{ matrix.container }}-py3 # testing with the latest pytorch base image options: "--gpus all" @@ -158,8 +157,8 @@ jobs: ls -al export LAUNCH_DELAY=$[ $RANDOM % 16 * 60 ] - # echo "Sleep $LAUNCH_DELAY" - # sleep $LAUNCH_DELAY + echo "Sleep $LAUNCH_DELAY" + sleep $LAUNCH_DELAY nvidia-smi export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) echo $CUDA_VISIBLE_DEVICES @@ -168,74 +167,74 @@ jobs: python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" python -m pip install -r requirements-dev.txt - PYTHONPATH="$tmp_dir":$PYTHONPATH BUILD_MONAI=1 python ./tests/test_dynunet.py # unit tests + PYTHONPATH="$tmp_dir":$PYTHONPATH BUILD_MONAI=1 python ./tests/runner.py -p 'test_((?!integration).)' # unit tests if pgrep python; then pkill python; fi - # cron-docker: - # if: github.repository == 'Project-MONAI/MONAI' - # container: - # image: docker://projectmonai/monai:latest # this might be slow and has the pull count limitations - # options: "--gpus all" - # runs-on: [self-hosted, linux, x64, common] - # steps: - # - name: Run tests report coverage - # # The docker image process has done the compilation. - # # BUILD_MONAI=1 is necessary for triggering the USE_COMPILED flag. - # run: | - # cd /opt/monai - # nvidia-smi - # export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) - # echo $CUDA_VISIBLE_DEVICES - # trap 'if pgrep python; then pkill python; fi;' ERR - # python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & - # python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" - # python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))' - # ngc --version - # BUILD_MONAI=1 ./runtests.sh --build --coverage --pytype --unittests --disttests # unit tests with pytype checks, coverage report - # BUILD_MONAI=1 ./runtests.sh --build --coverage --net # integration tests with coverage report - # coverage xml - # if pgrep python; then pkill python; fi - # - name: Upload coverage - # uses: codecov/codecov-action@v1 - # with: - # fail_ci_if_error: false - # file: ./coverage.xml + cron-docker: + if: github.repository == 'Project-MONAI/MONAI' + container: + image: docker://projectmonai/monai:latest # this might be slow and has the pull count limitations + options: "--gpus all" + runs-on: [self-hosted, linux, x64, common] + steps: + - name: Run tests report coverage + # The docker image process has done the compilation. + # BUILD_MONAI=1 is necessary for triggering the USE_COMPILED flag. + run: | + cd /opt/monai + nvidia-smi + export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) + echo $CUDA_VISIBLE_DEVICES + trap 'if pgrep python; then pkill python; fi;' ERR + python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & + python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" + python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))' + ngc --version + BUILD_MONAI=1 ./runtests.sh --build --coverage --pytype --unittests --disttests # unit tests with pytype checks, coverage report + BUILD_MONAI=1 ./runtests.sh --build --coverage --net # integration tests with coverage report + coverage xml + if pgrep python; then pkill python; fi + - name: Upload coverage + uses: codecov/codecov-action@v1 + with: + fail_ci_if_error: false + file: ./coverage.xml - # cron-tutorial-notebooks: - # if: github.repository == 'Project-MONAI/MONAI' - # needs: cron-gpu # so that monai itself is verified first - # container: - # image: nvcr.io/nvidia/pytorch:22.04-py3 # testing with the latest pytorch base image - # options: "--gpus all --ipc=host" - # runs-on: [self-hosted, linux, x64, common] - # steps: - # - uses: actions/checkout@v2 - # - name: Install MONAI - # id: monai-install - # run: | - # which python - # python -m pip install --upgrade pip wheel - # python -m pip install -r requirements-dev.txt - # BUILD_MONAI=1 python setup.py develop # install monai - # nvidia-smi - # export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) - # echo $CUDA_VISIBLE_DEVICES - # echo "::set-output name=devices::$CUDA_VISIBLE_DEVICES" - # - name: Checkout tutorials and install their requirements - # run: | - # cd /opt - # git clone --depth 1 --branch master --single-branch https://github.com/Project-MONAI/tutorials.git # latest commit of master branch - # cd tutorials - # python -m pip install -r requirements.txt - # - name: Run tutorial notebooks - # timeout-minutes: 150 - # run: | - # export CUDA_VISIBLE_DEVICES=${{ steps.monai-install.outputs.devices }} - # echo $CUDA_VISIBLE_DEVICES - # trap 'if pgrep python; then pkill python; fi;' ERR - # python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & - # cd /opt/tutorials - # python -c 'import monai; monai.config.print_debug_info()' - # $(pwd)/runner.sh - # python -c 'import monai; monai.config.print_debug_info()' - # if pgrep python; then pkill python; fi + cron-tutorial-notebooks: + if: github.repository == 'Project-MONAI/MONAI' + needs: cron-gpu # so that monai itself is verified first + container: + image: nvcr.io/nvidia/pytorch:22.04-py3 # testing with the latest pytorch base image + options: "--gpus all --ipc=host" + runs-on: [self-hosted, linux, x64, common] + steps: + - uses: actions/checkout@v2 + - name: Install MONAI + id: monai-install + run: | + which python + python -m pip install --upgrade pip wheel + python -m pip install -r requirements-dev.txt + BUILD_MONAI=1 python setup.py develop # install monai + nvidia-smi + export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) + echo $CUDA_VISIBLE_DEVICES + echo "::set-output name=devices::$CUDA_VISIBLE_DEVICES" + - name: Checkout tutorials and install their requirements + run: | + cd /opt + git clone --depth 1 --branch master --single-branch https://github.com/Project-MONAI/tutorials.git # latest commit of master branch + cd tutorials + python -m pip install -r requirements.txt + - name: Run tutorial notebooks + timeout-minutes: 150 + run: | + export CUDA_VISIBLE_DEVICES=${{ steps.monai-install.outputs.devices }} + echo $CUDA_VISIBLE_DEVICES + trap 'if pgrep python; then pkill python; fi;' ERR + python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & + cd /opt/tutorials + python -c 'import monai; monai.config.print_debug_info()' + $(pwd)/runner.sh + python -c 'import monai; monai.config.print_debug_info()' + if pgrep python; then pkill python; fi \ No newline at end of file From e8227bfd6814573498d8bf2f3d30e6d1c45b2b40 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 11 May 2022 03:47:33 +0000 Subject: [PATCH 10/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .github/workflows/cron.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml index 4ef8652e97..08065147e5 100644 --- a/.github/workflows/cron.yml +++ b/.github/workflows/cron.yml @@ -237,4 +237,4 @@ jobs: python -c 'import monai; monai.config.print_debug_info()' $(pwd)/runner.sh python -c 'import monai; monai.config.print_debug_info()' - if pgrep python; then pkill python; fi \ No newline at end of file + if pgrep python; then pkill python; fi From c09a97d16ff5633fa4cdab19ab9250c8e7cb2e54 Mon Sep 17 00:00:00 2001 From: Yiheng Wang Date: Wed, 11 May 2022 14:28:05 +0800 Subject: [PATCH 11/12] fix old pt issue Signed-off-by: Yiheng Wang --- tests/test_dynunet.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test_dynunet.py b/tests/test_dynunet.py index a35ee36c95..ff5d5efbef 100644 --- a/tests/test_dynunet.py +++ b/tests/test_dynunet.py @@ -17,6 +17,7 @@ from monai.networks import eval_mode from monai.networks.nets import DynUNet +from monai.utils.module import pytorch_after from tests.utils import skip_if_no_cuda, skip_if_windows, test_script_save device = "cuda" if torch.cuda.is_available() else "cpu" @@ -141,7 +142,11 @@ def test_consistency(self, input_param, input_shape, _): with eval_mode(net_fuser): result_fuser = net_fuser(input_tensor) - torch.testing.assert_close(result, result_fuser) + # torch.testing.assert_allclose() is deprecated since 1.12 and will be removed in 1.14 + if pytorch_after(1, 12): + torch.testing.assert_close(result, result_fuser) + else: + torch.testing.assert_allclose(result, result_fuser) class TestDynUNetDeepSupervision(unittest.TestCase): From 2f559907620a6f854ce1dc0e47dbd61ddb6ec265 Mon Sep 17 00:00:00 2001 From: Yiheng Wang Date: Wed, 11 May 2022 16:08:48 +0800 Subject: [PATCH 12/12] change to return directly if no cuda Signed-off-by: Yiheng Wang --- monai/networks/layers/factories.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monai/networks/layers/factories.py b/monai/networks/layers/factories.py index 58a04b122b..89fe1912a5 100644 --- a/monai/networks/layers/factories.py +++ b/monai/networks/layers/factories.py @@ -268,7 +268,7 @@ def instance_nvfuser_factory(dim): # test InstanceNorm3dNVFuser installation with a basic example has_nvfuser_flag = has_nvfuser if not torch.cuda.is_available(): - has_nvfuser_flag = False + return nn.InstanceNorm3d try: layer = InstanceNorm3dNVFuser(num_features=1, affine=True).to("cuda:0") inp = torch.randn([1, 1, 1, 1, 1]).to("cuda:0")