diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml
index e568ba9e15..98834dbc8f 100644
--- a/.github/workflows/cron.yml
+++ b/.github/workflows/cron.yml
@@ -56,7 +56,7 @@ jobs:
   cron-pt-image:
     if: github.repository == 'Project-MONAI/MONAI'
     container:
-      image: nvcr.io/nvidia/pytorch:20.12-py3  # testing with the latest pytorch base image
+      image: nvcr.io/nvidia/pytorch:21.02-py3  # testing with the latest pytorch base image
       options: "--gpus all"
     runs-on: [self-hosted, linux, x64, common]
     steps:
diff --git a/Dockerfile b/Dockerfile
index 47976b97b1..c90558c970 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -9,8 +9,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:20.12-py3
-
+# To build with a different base image,
+# run `docker build` with the `--build-arg PYTORCH_IMAGE=...` flag.
+ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:21.02-py3
 FROM ${PYTORCH_IMAGE}
 
 LABEL maintainer="monai.contact@gmail.com"
diff --git a/monai/data/utils.py b/monai/data/utils.py
index 7717ddf3aa..60250af441 100644
--- a/monai/data/utils.py
+++ b/monai/data/utils.py
@@ -922,10 +922,6 @@ class DistributedSampler(_TorchDistributedSampler):
     """
 
     def __init__(self, even_divisible: bool = True, *args, **kwargs):
-        self.total_size: int = 0
-        self.rank: int = 0
-        self.num_samples: int = 0
-        self.num_replicas: int = 0
         super().__init__(*args, **kwargs)
 
         if not even_divisible:
diff --git a/monai/networks/utils.py b/monai/networks/utils.py
index 48efe3934e..bd25e358f6 100644
--- a/monai/networks/utils.py
+++ b/monai/networks/utils.py
@@ -14,7 +14,7 @@
 
 import warnings
 from contextlib import contextmanager
-from typing import Any, Callable, Optional, Sequence, cast
+from typing import Any, Callable, Optional, Sequence
 
 import torch
 import torch.nn as nn
@@ -86,10 +86,10 @@ def predict_segmentation(
         threshold: thresholding the prediction values if multi-labels task.
     """
     if not mutually_exclusive:
-        return (cast(torch.Tensor, logits >= threshold)).int()
+        return (logits >= threshold).int()
     if logits.shape[1] == 1:
         warnings.warn("single channel prediction, `mutually_exclusive=True` ignored, use threshold instead.")
-        return (cast(torch.Tensor, logits >= threshold)).int()
+        return (logits >= threshold).int()
     return logits.argmax(1, keepdim=True)
 
 
diff --git a/tests/test_distributed_sampler.py b/tests/test_distributed_sampler.py
index d0054885eb..0a439874bd 100644
--- a/tests/test_distributed_sampler.py
+++ b/tests/test_distributed_sampler.py
@@ -24,6 +24,7 @@ def test_even(self):
         data = [1, 2, 3, 4, 5]
         sampler = DistributedSampler(dataset=data, shuffle=False)
         samples = np.array([data[i] for i in list(sampler)])
+        self.assertEqual(dist.get_rank(), sampler.rank)
         if dist.get_rank() == 0:
             np.testing.assert_allclose(samples, np.array([1, 3, 5]))
 
@@ -35,6 +36,7 @@ def test_uneven(self):
         data = [1, 2, 3, 4, 5]
         sampler = DistributedSampler(dataset=data, shuffle=False, even_divisible=False)
         samples = np.array([data[i] for i in list(sampler)])
+        self.assertEqual(dist.get_rank(), sampler.rank)
         if dist.get_rank() == 0:
             np.testing.assert_allclose(samples, np.array([1, 3, 5]))
 
diff --git a/tests/utils.py b/tests/utils.py
index 8b367158b2..3636cbe974 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -16,6 +16,7 @@
 import queue
 import sys
 import tempfile
+import time
 import traceback
 import unittest
 import warnings
@@ -273,6 +274,7 @@ def run_process(self, func, local_rank, args, kwargs, results):
             os.environ["RANK"] = str(self.nproc_per_node * self.node_rank + local_rank)
 
             if torch.cuda.is_available():
+                os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
                 torch.cuda.set_device(int(local_rank))
 
             dist.init_process_group(
@@ -283,6 +285,11 @@ def run_process(self, func, local_rank, args, kwargs, results):
                 rank=int(os.environ["RANK"]),
             )
             func(*args, **kwargs)
+            # keep the primary process (rank 0) alive slightly longer than the others:
+            # the TCP store daemon lives on rank 0, and this avoids
+            # `_store_based_barrier` failing with "RuntimeError: Broken pipe"
+            if int(os.environ["RANK"]) == 0:
+                time.sleep(0.1)
             results.put(True)
         except Exception as e:
             results.put(False)