diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml index e568ba9e15..98834dbc8f 100644 --- a/.github/workflows/cron.yml +++ b/.github/workflows/cron.yml @@ -56,7 +56,7 @@ jobs: cron-pt-image: if: github.repository == 'Project-MONAI/MONAI' container: - image: nvcr.io/nvidia/pytorch:20.12-py3 # testing with the latest pytorch base image + image: nvcr.io/nvidia/pytorch:21.02-py3 # testing with the latest pytorch base image options: "--gpus all" runs-on: [self-hosted, linux, x64, common] steps: diff --git a/Dockerfile b/Dockerfile index 47976b97b1..c90558c970 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,8 +9,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:20.12-py3 - +# To build with a different base image +# please run `docker build` using the `--build-arg PYTORCH_IMAGE=...` flag. +ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:21.02-py3 FROM ${PYTORCH_IMAGE} LABEL maintainer="monai.contact@gmail.com" diff --git a/monai/data/utils.py b/monai/data/utils.py index 7717ddf3aa..60250af441 100644 --- a/monai/data/utils.py +++ b/monai/data/utils.py @@ -922,10 +922,6 @@ class DistributedSampler(_TorchDistributedSampler): """ def __init__(self, even_divisible: bool = True, *args, **kwargs): - self.total_size: int = 0 - self.rank: int = 0 - self.num_samples: int = 0 - self.num_replicas: int = 0 super().__init__(*args, **kwargs) if not even_divisible: diff --git a/monai/networks/utils.py b/monai/networks/utils.py index 48efe3934e..bd25e358f6 100644 --- a/monai/networks/utils.py +++ b/monai/networks/utils.py @@ -14,7 +14,7 @@ import warnings from contextlib import contextmanager -from typing import Any, Callable, Optional, Sequence, cast +from typing import Any, Callable, Optional, Sequence import torch import torch.nn as nn @@ -86,10 +86,10 @@ def predict_segmentation( threshold: thresholding the prediction values if multi-labels task. """ if not mutually_exclusive: - return (cast(torch.Tensor, logits >= threshold)).int() + return (logits >= threshold).int() if logits.shape[1] == 1: warnings.warn("single channel prediction, `mutually_exclusive=True` ignored, use threshold instead.") - return (cast(torch.Tensor, logits >= threshold)).int() + return (logits >= threshold).int() return logits.argmax(1, keepdim=True) diff --git a/tests/test_distributed_sampler.py b/tests/test_distributed_sampler.py index d0054885eb..0a439874bd 100644 --- a/tests/test_distributed_sampler.py +++ b/tests/test_distributed_sampler.py @@ -24,6 +24,7 @@ def test_even(self): data = [1, 2, 3, 4, 5] sampler = DistributedSampler(dataset=data, shuffle=False) samples = np.array([data[i] for i in list(sampler)]) + self.assertEqual(dist.get_rank(), sampler.rank) if dist.get_rank() == 0: np.testing.assert_allclose(samples, np.array([1, 3, 5])) @@ -35,6 +36,7 @@ def test_uneven(self): data = [1, 2, 3, 4, 5] sampler = DistributedSampler(dataset=data, shuffle=False, even_divisible=False) samples = np.array([data[i] for i in list(sampler)]) + self.assertEqual(dist.get_rank(), sampler.rank) if dist.get_rank() == 0: np.testing.assert_allclose(samples, np.array([1, 3, 5])) diff --git a/tests/utils.py b/tests/utils.py index 8b367158b2..3636cbe974 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -16,6 +16,7 @@ import queue import sys import tempfile +import time import traceback import unittest import warnings @@ -273,6 +274,7 @@ def run_process(self, func, local_rank, args, kwargs, results): os.environ["RANK"] = str(self.nproc_per_node * self.node_rank + local_rank) if torch.cuda.is_available(): + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" torch.cuda.set_device(int(local_rank)) dist.init_process_group( @@ -283,6 +285,11 @@ def run_process(self, func, local_rank, args, kwargs, results): rank=int(os.environ["RANK"]), ) func(*args, **kwargs) + # the primary node lives longer to + # avoid _store_based_barrier, RuntimeError: Broken pipe + # as the TCP store daemon is on the rank 0 + if int(os.environ["RANK"]) == 0: + time.sleep(0.1) results.put(True) except Exception as e: results.put(False)