From 0d0691da91455432ac47bd5e7675b415cc97a791 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 3 Mar 2021 16:56:20 +0000 Subject: [PATCH 1/6] fixes #1685 Signed-off-by: Wenqi Li --- .github/workflows/cron.yml | 2 +- Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml index e568ba9e15..98834dbc8f 100644 --- a/.github/workflows/cron.yml +++ b/.github/workflows/cron.yml @@ -56,7 +56,7 @@ jobs: cron-pt-image: if: github.repository == 'Project-MONAI/MONAI' container: - image: nvcr.io/nvidia/pytorch:20.12-py3 # testing with the latest pytorch base image + image: nvcr.io/nvidia/pytorch:21.02-py3 # testing with the latest pytorch base image options: "--gpus all" runs-on: [self-hosted, linux, x64, common] steps: diff --git a/Dockerfile b/Dockerfile index 47976b97b1..982dc64bf0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,7 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:20.12-py3 +ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:21.02-py3 FROM ${PYTORCH_IMAGE} From b28c0b1f763e8b3a1c95e7b52bb2c0631a804e71 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 3 Mar 2021 16:57:58 +0000 Subject: [PATCH 2/6] add temp test Signed-off-by: Wenqi Li --- .github/workflows/cron.yml | 151 +++++++++++++++++++------------------ 1 file changed, 77 insertions(+), 74 deletions(-) diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml index 98834dbc8f..fb6e5c7c73 100644 --- a/.github/workflows/cron.yml +++ b/.github/workflows/cron.yml @@ -1,57 +1,60 @@ name: crons on: - schedule: - - cron: "0 2 * * *" # at 02:00 UTC +# schedule: +# - cron: "0 2 * * *" # at 02:00 UTC + push: + branches: + - test-21-02 jobs: - cron-gpu: - if: github.repository == 'Project-MONAI/MONAI' - container: - image: nvcr.io/nvidia/pytorch:20.03-py3 # CUDA 10.2 - options: "--gpus all" - runs-on: [self-hosted, linux, x64, common] - strategy: - matrix: - pytorch-version: [1.5.0, 1.5.1, 1.6.0, latest] - steps: - - uses: actions/checkout@v2 - - name: Install the dependencies - run: | - which python - python -m pip install --upgrade pip wheel - python -m pip uninstall -y torch torchvision - if [ ${{ matrix.pytorch-version }} == "latest" ]; then - python -m pip install torch torchvision - elif [ ${{ matrix.pytorch-version }} == "1.5.0" ]; then - python -m pip install torch==1.5.0 - python -m pip install torchvision==0.6.0 - elif [ ${{ matrix.pytorch-version }} == "1.5.1" ]; then - python -m pip install torch==1.5.1 - python -m pip install torchvision==0.6.1 - elif [ ${{ matrix.pytorch-version }} == "1.6.0" ]; then - python -m pip install torch==1.6.0 - python -m pip install torchvision==0.7.0 - fi - python -m pip install -r requirements-dev.txt - python -m pip list - - name: Run tests report coverage - run: | - export LAUNCH_DELAY=$[ $RANDOM % 16 * 60 ] - echo "Sleep $LAUNCH_DELAY" - sleep $LAUNCH_DELAY - nvidia-smi - export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) - echo $CUDA_VISIBLE_DEVICES - python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" - python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))' - BUILD_MONAI=1 ./runtests.sh --coverage - coverage xml - - name: Upload coverage - uses: codecov/codecov-action@v1 - with: - fail_ci_if_error: false - file: ./coverage.xml +# cron-gpu: +# if: github.repository == 'Project-MONAI/MONAI' +# container: +# image: nvcr.io/nvidia/pytorch:20.03-py3 # CUDA 10.2 +# options: "--gpus all" +# runs-on: [self-hosted, linux, x64, common] +# strategy: +# matrix: +# pytorch-version: [1.5.0, 1.5.1, 1.6.0, latest] +# steps: +# - uses: actions/checkout@v2 +# - name: Install the dependencies +# run: | +# which python +# python -m pip install --upgrade pip wheel +# python -m pip uninstall -y torch torchvision +# if [ ${{ matrix.pytorch-version }} == "latest" ]; then +# python -m pip install torch torchvision +# elif [ ${{ matrix.pytorch-version }} == "1.5.0" ]; then +# python -m pip install torch==1.5.0 +# python -m pip install torchvision==0.6.0 +# elif [ ${{ matrix.pytorch-version }} == "1.5.1" ]; then +# python -m pip install torch==1.5.1 +# python -m pip install torchvision==0.6.1 +# elif [ ${{ matrix.pytorch-version }} == "1.6.0" ]; then +# python -m pip install torch==1.6.0 +# python -m pip install torchvision==0.7.0 +# fi +# python -m pip install -r requirements-dev.txt +# python -m pip list +# - name: Run tests report coverage +# run: | +# export LAUNCH_DELAY=$[ $RANDOM % 16 * 60 ] +# echo "Sleep $LAUNCH_DELAY" +# sleep $LAUNCH_DELAY +# nvidia-smi +# export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) +# echo $CUDA_VISIBLE_DEVICES +# python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" +# python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))' +# BUILD_MONAI=1 ./runtests.sh --coverage +# coverage xml +# - name: Upload coverage +# uses: codecov/codecov-action@v1 +# with: +# fail_ci_if_error: false +# file: ./coverage.xml cron-pt-image: if: github.repository == 'Project-MONAI/MONAI' @@ -85,28 +88,28 @@ jobs: fail_ci_if_error: false file: ./coverage.xml - cron-docker: - if: github.repository == 'Project-MONAI/MONAI' - container: - image: localhost:5000/local_monai:dockerhub # use currently latest, locally available dockerhub image - options: "--gpus all" - runs-on: [self-hosted, linux, x64, common] - steps: - - name: Run tests report coverage - # The docker image process has done the compilation. - # BUILD_MONAI=1 is necessary for triggering the USE_COMPILED flag. - run: | - cd /opt/monai - nvidia-smi - export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) - echo $CUDA_VISIBLE_DEVICES - python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" - python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))' - ngc --version - BUILD_MONAI=1 ./runtests.sh --coverage --pytype - coverage xml - - name: Upload coverage - uses: codecov/codecov-action@v1 - with: - fail_ci_if_error: false - file: ./coverage.xml +# cron-docker: +# if: github.repository == 'Project-MONAI/MONAI' +# container: +# image: localhost:5000/local_monai:dockerhub # use currently latest, locally available dockerhub image +# options: "--gpus all" +# runs-on: [self-hosted, linux, x64, common] +# steps: +# - name: Run tests report coverage +# # The docker image process has done the compilation. +# # BUILD_MONAI=1 is necessary for triggering the USE_COMPILED flag. +# run: | +# cd /opt/monai +# nvidia-smi +# export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) +# echo $CUDA_VISIBLE_DEVICES +# python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" +# python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))' +# ngc --version +# BUILD_MONAI=1 ./runtests.sh --coverage --pytype +# coverage xml +# - name: Upload coverage +# uses: codecov/codecov-action@v1 +# with: +# fail_ci_if_error: false +# file: ./coverage.xml From 6ab6a6693be9af2818c2744ba88ba0d5ade461c6 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 3 Mar 2021 20:22:34 +0000 Subject: [PATCH 3/6] adds docstring Signed-off-by: Wenqi Li --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 982dc64bf0..c90558c970 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,8 +9,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +# To build with a different base image +# please run `docker build` using the `--build-arg PYTORCH_IMAGE=...` flag. ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:21.02-py3 - FROM ${PYTORCH_IMAGE} LABEL maintainer="monai.contact@gmail.com" From 3ad9f698d46e9cb0ffdcb15d46cca50001a7e0d1 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Wed, 3 Mar 2021 20:44:27 -0500 Subject: [PATCH 4/6] fixes dist sampler Signed-off-by: Wenqi Li --- monai/data/utils.py | 4 ---- tests/test_distributed_sampler.py | 2 ++ tests/utils.py | 7 +++++++ 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/monai/data/utils.py b/monai/data/utils.py index 7717ddf3aa..60250af441 100644 --- a/monai/data/utils.py +++ b/monai/data/utils.py @@ -922,10 +922,6 @@ class DistributedSampler(_TorchDistributedSampler): """ def __init__(self, even_divisible: bool = True, *args, **kwargs): - self.total_size: int = 0 - self.rank: int = 0 - self.num_samples: int = 0 - self.num_replicas: int = 0 super().__init__(*args, **kwargs) if not even_divisible: diff --git a/tests/test_distributed_sampler.py b/tests/test_distributed_sampler.py index d0054885eb..0a439874bd 100644 --- a/tests/test_distributed_sampler.py +++ b/tests/test_distributed_sampler.py @@ -24,6 +24,7 @@ def test_even(self): data = [1, 2, 3, 4, 5] sampler = DistributedSampler(dataset=data, shuffle=False) samples = np.array([data[i] for i in list(sampler)]) + self.assertEqual(dist.get_rank(), sampler.rank) if dist.get_rank() == 0: np.testing.assert_allclose(samples, np.array([1, 3, 5])) @@ -35,6 +36,7 @@ def test_uneven(self): data = [1, 2, 3, 4, 5] sampler = DistributedSampler(dataset=data, shuffle=False, even_divisible=False) samples = np.array([data[i] for i in list(sampler)]) + self.assertEqual(dist.get_rank(), sampler.rank) if dist.get_rank() == 0: np.testing.assert_allclose(samples, np.array([1, 3, 5])) diff --git a/tests/utils.py b/tests/utils.py index 8b367158b2..3636cbe974 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -16,6 +16,7 @@ import queue import sys import tempfile +import time import traceback import unittest import warnings @@ -273,6 +274,7 @@ def run_process(self, func, local_rank, args, kwargs, results): os.environ["RANK"] = str(self.nproc_per_node * self.node_rank + local_rank) if torch.cuda.is_available(): + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" torch.cuda.set_device(int(local_rank)) dist.init_process_group( @@ -283,6 +285,11 @@ def run_process(self, func, local_rank, args, kwargs, results): rank=int(os.environ["RANK"]), ) func(*args, **kwargs) + # the primary node lives longer to + # avoid _store_based_barrier, RuntimeError: Broken pipe + # as the TCP store daemon is on the rank 0 + if int(os.environ["RANK"]) == 0: + time.sleep(0.1) results.put(True) except Exception as e: results.put(False) From 6a4f68f19d5ede38b82405b4dfd1dae97a1e7b22 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Thu, 4 Mar 2021 08:38:09 +0000 Subject: [PATCH 5/6] remove temp tests Signed-off-by: Wenqi Li --- .github/workflows/cron.yml | 151 ++++++++++++++++++------------------- 1 file changed, 74 insertions(+), 77 deletions(-) diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml index fb6e5c7c73..98834dbc8f 100644 --- a/.github/workflows/cron.yml +++ b/.github/workflows/cron.yml @@ -1,60 +1,57 @@ name: crons on: -# schedule: -# - cron: "0 2 * * *" # at 02:00 UTC - push: - branches: - - test-21-02 + schedule: + - cron: "0 2 * * *" # at 02:00 UTC jobs: -# cron-gpu: -# if: github.repository == 'Project-MONAI/MONAI' -# container: -# image: nvcr.io/nvidia/pytorch:20.03-py3 # CUDA 10.2 -# options: "--gpus all" -# runs-on: [self-hosted, linux, x64, common] -# strategy: -# matrix: -# pytorch-version: [1.5.0, 1.5.1, 1.6.0, latest] -# steps: -# - uses: actions/checkout@v2 -# - name: Install the dependencies -# run: | -# which python -# python -m pip install --upgrade pip wheel -# python -m pip uninstall -y torch torchvision -# if [ ${{ matrix.pytorch-version }} == "latest" ]; then -# python -m pip install torch torchvision -# elif [ ${{ matrix.pytorch-version }} == "1.5.0" ]; then -# python -m pip install torch==1.5.0 -# python -m pip install torchvision==0.6.0 -# elif [ ${{ matrix.pytorch-version }} == "1.5.1" ]; then -# python -m pip install torch==1.5.1 -# python -m pip install torchvision==0.6.1 -# elif [ ${{ matrix.pytorch-version }} == "1.6.0" ]; then -# python -m pip install torch==1.6.0 -# python -m pip install torchvision==0.7.0 -# fi -# python -m pip install -r requirements-dev.txt -# python -m pip list -# - name: Run tests report coverage -# run: | -# export LAUNCH_DELAY=$[ $RANDOM % 16 * 60 ] -# echo "Sleep $LAUNCH_DELAY" -# sleep $LAUNCH_DELAY -# nvidia-smi -# export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) -# echo $CUDA_VISIBLE_DEVICES -# python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" -# python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))' -# BUILD_MONAI=1 ./runtests.sh --coverage -# coverage xml -# - name: Upload coverage -# uses: codecov/codecov-action@v1 -# with: -# fail_ci_if_error: false -# file: ./coverage.xml + cron-gpu: + if: github.repository == 'Project-MONAI/MONAI' + container: + image: nvcr.io/nvidia/pytorch:20.03-py3 # CUDA 10.2 + options: "--gpus all" + runs-on: [self-hosted, linux, x64, common] + strategy: + matrix: + pytorch-version: [1.5.0, 1.5.1, 1.6.0, latest] + steps: + - uses: actions/checkout@v2 + - name: Install the dependencies + run: | + which python + python -m pip install --upgrade pip wheel + python -m pip uninstall -y torch torchvision + if [ ${{ matrix.pytorch-version }} == "latest" ]; then + python -m pip install torch torchvision + elif [ ${{ matrix.pytorch-version }} == "1.5.0" ]; then + python -m pip install torch==1.5.0 + python -m pip install torchvision==0.6.0 + elif [ ${{ matrix.pytorch-version }} == "1.5.1" ]; then + python -m pip install torch==1.5.1 + python -m pip install torchvision==0.6.1 + elif [ ${{ matrix.pytorch-version }} == "1.6.0" ]; then + python -m pip install torch==1.6.0 + python -m pip install torchvision==0.7.0 + fi + python -m pip install -r requirements-dev.txt + python -m pip list + - name: Run tests report coverage + run: | + export LAUNCH_DELAY=$[ $RANDOM % 16 * 60 ] + echo "Sleep $LAUNCH_DELAY" + sleep $LAUNCH_DELAY + nvidia-smi + export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) + echo $CUDA_VISIBLE_DEVICES + python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" + python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))' + BUILD_MONAI=1 ./runtests.sh --coverage + coverage xml + - name: Upload coverage + uses: codecov/codecov-action@v1 + with: + fail_ci_if_error: false + file: ./coverage.xml cron-pt-image: if: github.repository == 'Project-MONAI/MONAI' @@ -88,28 +85,28 @@ jobs: fail_ci_if_error: false file: ./coverage.xml -# cron-docker: -# if: github.repository == 'Project-MONAI/MONAI' -# container: -# image: localhost:5000/local_monai:dockerhub # use currently latest, locally available dockerhub image -# options: "--gpus all" -# runs-on: [self-hosted, linux, x64, common] -# steps: -# - name: Run tests report coverage -# # The docker image process has done the compilation. -# # BUILD_MONAI=1 is necessary for triggering the USE_COMPILED flag. -# run: | -# cd /opt/monai -# nvidia-smi -# export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) -# echo $CUDA_VISIBLE_DEVICES -# python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" -# python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))' -# ngc --version -# BUILD_MONAI=1 ./runtests.sh --coverage --pytype -# coverage xml -# - name: Upload coverage -# uses: codecov/codecov-action@v1 -# with: -# fail_ci_if_error: false -# file: ./coverage.xml + cron-docker: + if: github.repository == 'Project-MONAI/MONAI' + container: + image: localhost:5000/local_monai:dockerhub # use currently latest, locally available dockerhub image + options: "--gpus all" + runs-on: [self-hosted, linux, x64, common] + steps: + - name: Run tests report coverage + # The docker image process has done the compilation. + # BUILD_MONAI=1 is necessary for triggering the USE_COMPILED flag. + run: | + cd /opt/monai + nvidia-smi + export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) + echo $CUDA_VISIBLE_DEVICES + python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" + python -c 'import torch; print(torch.rand(5,3, device=torch.device("cuda:0")))' + ngc --version + BUILD_MONAI=1 ./runtests.sh --coverage --pytype + coverage xml + - name: Upload coverage + uses: codecov/codecov-action@v1 + with: + fail_ci_if_error: false + file: ./coverage.xml From 67b2b1d77bb7f2209f633a8a2be04e1d79401eae Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Thu, 4 Mar 2021 22:21:42 +0000 Subject: [PATCH 6/6] fixes type hint issue Signed-off-by: Wenqi Li --- monai/networks/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/monai/networks/utils.py b/monai/networks/utils.py index 48efe3934e..bd25e358f6 100644 --- a/monai/networks/utils.py +++ b/monai/networks/utils.py @@ -14,7 +14,7 @@ import warnings from contextlib import contextmanager -from typing import Any, Callable, Optional, Sequence, cast +from typing import Any, Callable, Optional, Sequence import torch import torch.nn as nn @@ -86,10 +86,10 @@ def predict_segmentation( threshold: thresholding the prediction values if multi-labels task. """ if not mutually_exclusive: - return (cast(torch.Tensor, logits >= threshold)).int() + return (logits >= threshold).int() if logits.shape[1] == 1: warnings.warn("single channel prediction, `mutually_exclusive=True` ignored, use threshold instead.") - return (cast(torch.Tensor, logits >= threshold)).int() + return (logits >= threshold).int() return logits.argmax(1, keepdim=True)