From 126ef2b072b739a0d773616dc81ecf169de495ad Mon Sep 17 00:00:00 2001 From: ahatamizadeh Date: Tue, 29 Jun 2021 02:19:46 -0700 Subject: [PATCH 1/7] add UNETR, ViT Signed-off-by: ahatamizadeh --- docs/requirements.txt | 1 + docs/source/installation.md | 4 +- docs/source/networks.rst | 39 ++++ monai/config/deviceconfig.py | 1 + monai/networks/blocks/__init__.py | 5 + monai/networks/blocks/mlp.py | 51 +++++ monai/networks/blocks/patchembedding.py | 140 ++++++++++++ monai/networks/blocks/selfattention.py | 68 ++++++ monai/networks/blocks/transformerblock.py | 56 +++++ monai/networks/blocks/unetr_block.py | 261 ++++++++++++++++++++++ monai/networks/nets/__init__.py | 2 + monai/networks/nets/unetr.py | 198 ++++++++++++++++ monai/networks/nets/vit.py | 91 ++++++++ requirements-dev.txt | 3 +- setup.cfg | 4 +- tests/min_tests.py | 7 + tests/test_mlp.py | 52 +++++ tests/test_patchembedding.py | 123 ++++++++++ tests/test_selfattention.py | 60 +++++ tests/test_transformerblock.py | 57 +++++ tests/test_unetr.py | 121 ++++++++++ tests/test_unetr_block.py | 157 +++++++++++++ tests/test_vit.py | 133 +++++++++++ 23 files changed, 1630 insertions(+), 4 deletions(-) create mode 100644 monai/networks/blocks/mlp.py create mode 100644 monai/networks/blocks/patchembedding.py create mode 100644 monai/networks/blocks/selfattention.py create mode 100644 monai/networks/blocks/transformerblock.py create mode 100644 monai/networks/blocks/unetr_block.py create mode 100644 monai/networks/nets/unetr.py create mode 100644 monai/networks/nets/vit.py create mode 100644 tests/test_mlp.py create mode 100644 tests/test_patchembedding.py create mode 100644 tests/test_selfattention.py create mode 100644 tests/test_transformerblock.py create mode 100644 tests/test_unetr.py create mode 100644 tests/test_unetr_block.py create mode 100644 tests/test_vit.py diff --git a/docs/requirements.txt b/docs/requirements.txt index 3622fd599c..eac5d0f734 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -19,3 +19,4 @@ sphinxcontrib-qthelp sphinxcontrib-serializinghtml sphinx-autodoc-typehints==1.11.1 pandas +einops diff --git a/docs/source/installation.md b/docs/source/installation.md index efa5cf08a8..d8dddff205 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -174,9 +174,9 @@ Since MONAI v0.2.0, the extras syntax such as `pip install 'monai[nibabel]'` is - The options are ``` -[nibabel, skimage, pillow, tensorboard, gdown, ignite, torchvision, itk, tqdm, lmdb, psutil, cucim, openslide, pandas] +[nibabel, skimage, pillow, tensorboard, gdown, ignite, torchvision, itk, tqdm, lmdb, psutil, cucim, openslide, pandas, einops] ``` which correspond to `nibabel`, `scikit-image`, `pillow`, `tensorboard`, -`gdown`, `pytorch-ignite`, `torchvision`, `itk`, `tqdm`, `lmdb`, `psutil`, `cucim` `openslide-python` and `pandas`, respectively. +`gdown`, `pytorch-ignite`, `torchvision`, `itk`, `tqdm`, `lmdb`, `psutil`, `cucim`, `openslide-python`, `pandas` and `einops`, respectively. - `pip install 'monai[all]'` installs all the optional dependencies. diff --git a/docs/source/networks.rst b/docs/source/networks.rst index 4177696aa4..f70fc4f024 100644 --- a/docs/source/networks.rst +++ b/docs/source/networks.rst @@ -79,11 +79,30 @@ Blocks .. autoclass:: ResBlock :members: +`SABlock Block` +~~~~~~~~~~~~~~~~~ +.. autoclass:: SABlock + :members: + `Squeeze-and-Excitation` ~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: ChannelSELayer :members: +`Transformer Block` +~~~~~~~~~~~~~~~~~~~~~~~~ +.. 
autoclass:: TransformerBlock + :members: + +`UNETR Block` +~~~~~~~~~~~~~~~~~~~~ +.. autoclass:: UnetrBasicBlock + :members: +.. autoclass:: UnetrUpBlock + :members: +.. autoclass:: UnetrPrUpBlock + :members: + `Residual Squeeze-and-Excitation` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: ResidualSELayer @@ -159,6 +178,16 @@ Blocks .. autoclass:: LocalNetFeatureExtractorBlock :members: +`MLP Block` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autoclass:: MLPBlock + :members: + +`Patch Embedding Block` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autoclass:: PatchEmbeddingBlock + :members: + `Warp` ~~~~~~ .. autoclass:: Warp @@ -389,6 +418,11 @@ Nets .. autoclass:: Unet .. autoclass:: unet +`UNETR` +~~~~~~~~~~~~~~~~~ +.. autoclass:: UNETR + :members: + `BasicUNet` ~~~~~~~~~~~ .. autoclass:: BasicUNet @@ -426,6 +460,11 @@ Nets .. autoclass:: VarAutoEncoder :members: +`ViT` +~~~~~~ +.. autoclass:: ViT + :members: + `FullyConnectedNet` ~~~~~~~~~~~~~~~~~~~ .. autoclass:: FullyConnectedNet diff --git a/monai/config/deviceconfig.py b/monai/config/deviceconfig.py index c790a85277..b8ee3a81fc 100644 --- a/monai/config/deviceconfig.py +++ b/monai/config/deviceconfig.py @@ -80,6 +80,7 @@ def get_optional_config_values(): output["lmdb"] = get_package_version("lmdb") output["psutil"] = psutil_version output["pandas"] = get_package_version("pandas") + output["einops"] = get_package_version("einops") return output diff --git a/monai/networks/blocks/__init__.py b/monai/networks/blocks/__init__.py index ed6ac12430..db723f622d 100644 --- a/monai/networks/blocks/__init__.py +++ b/monai/networks/blocks/__init__.py @@ -18,8 +18,11 @@ from .dynunet_block import UnetBasicBlock, UnetOutBlock, UnetResBlock, UnetUpBlock, get_output_padding, get_padding from .fcn import FCN, GCN, MCFCN, Refine from .localnet_block import LocalNetDownSampleBlock, LocalNetFeatureExtractorBlock, LocalNetUpSampleBlock +from .mlp import MLPBlock +from .patchembedding import PatchEmbeddingBlock from .regunet_block import RegistrationDownSampleBlock, RegistrationExtractionBlock, RegistrationResidualConvBlock from .segresnet_block import ResBlock +from .selfattention import SABlock from .squeeze_and_excitation import ( ChannelSELayer, ResidualSELayer, @@ -28,5 +31,7 @@ SEResNetBottleneck, SEResNeXtBottleneck, ) +from .transformerblock import TransformerBlock +from .unetr_block import UnetrBasicBlock, UnetrPrUpBlock, UnetrUpBlock from .upsample import SubpixelUpsample, Subpixelupsample, SubpixelUpSample, Upsample, UpSample from .warp import DVF2DDF, Warp diff --git a/monai/networks/blocks/mlp.py b/monai/networks/blocks/mlp.py new file mode 100644 index 0000000000..b108188605 --- /dev/null +++ b/monai/networks/blocks/mlp.py @@ -0,0 +1,51 @@ +# Copyright 2020 - 2021 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
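As orientation (an editorial sketch, not part of the diff): the blocks/__init__.py hunk above re-exports the new transformer components, so once this patch is applied they should be importable directly from the package as below; the attention and patch-embedding blocks also expect the optional einops dependency added in docs/requirements.txt, requirements-dev.txt and setup.cfg.

from monai.networks.blocks import (
    MLPBlock,
    PatchEmbeddingBlock,
    SABlock,
    TransformerBlock,
    UnetrBasicBlock,
    UnetrPrUpBlock,
    UnetrUpBlock,
)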
+
+import torch.nn as nn
+
+
+class MLPBlock(nn.Module):
+    """
+    A multi-layer perceptron block, based on: "Dosovitskiy et al.,
+    An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale "
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        mlp_dim: int,
+        dropout_rate: float = 0.0,
+    ) -> None:
+        """
+        Args:
+            hidden_size: dimension of hidden layer.
+            mlp_dim: dimension of feedforward layer.
+            dropout_rate: fraction of the input units to drop.
+
+        """
+
+        super().__init__()
+
+        if not (0 <= dropout_rate <= 1):
+            raise AssertionError("dropout_rate should be between 0 and 1.")
+
+        self.linear1 = nn.Linear(hidden_size, mlp_dim)
+        self.linear2 = nn.Linear(mlp_dim, hidden_size)
+        self.fn = nn.GELU()
+        self.drop1 = nn.Dropout(dropout_rate)
+        self.drop2 = nn.Dropout(dropout_rate)
+
+    def forward(self, x):
+        x = self.fn(self.linear1(x))
+        x = self.drop1(x)
+        x = self.linear2(x)
+        x = self.drop2(x)
+        return x
diff --git a/monai/networks/blocks/patchembedding.py b/monai/networks/blocks/patchembedding.py
new file mode 100644
index 0000000000..7d74c02493
--- /dev/null
+++ b/monai/networks/blocks/patchembedding.py
@@ -0,0 +1,140 @@
+# Copyright 2020 - 2021 MONAI Consortium
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import math
+from typing import Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from monai.utils import optional_import
+
+einops, has_einops = optional_import("einops")
+
+
+class PatchEmbeddingBlock(nn.Module):
+    """
+    A patch embedding block, based on: "Dosovitskiy et al.,
+    An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale "
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        img_size: Tuple,
+        patch_size: Tuple,
+        hidden_size: int,
+        num_heads: int,
+        pos_embed: Union[Tuple, str],
+        classification: bool,
+        dropout_rate: float = 0.0,
+    ) -> None:
+        """
+        Args:
+            in_channels: dimension of input channels.
+            img_size: dimension of input image.
+            patch_size: dimension of patch size.
+            hidden_size: dimension of hidden layer.
+            num_heads: number of attention heads.
+            pos_embed: position embedding layer type.
+            classification: bool argument to determine if classification is used.
+            dropout_rate: fraction of the input units to drop.
+ + """ + + super().__init__() + + if not (0 <= dropout_rate <= 1): + raise AssertionError("dropout_rate should be between 0 and 1.") + + if hidden_size % num_heads != 0: + raise AssertionError("hidden size should be divisible by num_heads.") + + if img_size < patch_size: + raise AssertionError("patch_size should be smaller than img_size.") + + if pos_embed not in ["conv", "perceptron"]: + raise KeyError(f"Position embedding layer of type {pos_embed} is not supported.") + + if pos_embed == "perceptron": + if img_size[0] % patch_size[0] != 0: + raise AssertionError("img_size should be divisible by patch_size for perceptron patch embedding.") + + if has_einops: + from einops.layers.torch import Rearrange + + self.Rearrange = Rearrange + else: + raise ValueError('"Requires einops.') + + self.n_patches = ( + (img_size[0] // patch_size[0]) * (img_size[1] // patch_size[1]) * (img_size[2] // patch_size[2]) + ) + self.patch_dim = in_channels * patch_size[0] * patch_size[1] * patch_size[2] + self.pos_embed = pos_embed + if self.pos_embed == "conv": + self.patch_embeddings = nn.Conv3d( + in_channels=in_channels, out_channels=hidden_size, kernel_size=patch_size, stride=patch_size + ) + elif self.pos_embed == "perceptron": + self.patch_embeddings = nn.Sequential( + self.Rearrange( + "b c (h p1) (w p2) (d p3)-> b (h w d) (p1 p2 p3 c)", + p1=patch_size[0], + p2=patch_size[1], + p3=patch_size[2], + ), + nn.Linear(self.patch_dim, hidden_size), + ) + if classification: + self.position_embeddings_cls = nn.Parameter(torch.zeros(1, self.n_patches + 1, hidden_size)) + self.position_embeddings = nn.Parameter(torch.zeros(1, self.n_patches, hidden_size)) + self.cls_token = nn.Parameter(torch.zeros(1, 1, hidden_size)) + self.dropout = nn.Dropout(dropout_rate) + self.trunc_normal_(self.position_embeddings, mean=0.0, std=0.02, a=-2.0, b=2.0) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + self.trunc_normal_(m.weight, mean=0.0, std=0.02, a=-2.0, b=2.0) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def trunc_normal_(self, tensor, mean, std, a, b): + # From PyTorch official master until it's in a few official releases - RW + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 + + with torch.no_grad(): + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + tensor.uniform_(2 * l - 1, 2 * u - 1) + tensor.erfinv_() + tensor.mul_(std * math.sqrt(2.0)) + tensor.add_(mean) + tensor.clamp_(min=a, max=b) + return tensor + + def forward(self, x): + if self.pos_embed == "conv": + x = self.patch_embeddings(x) + x = x.flatten(2) + x = x.transpose(-1, -2) + elif self.pos_embed == "perceptron": + x = self.patch_embeddings(x) + embeddings = x + self.position_embeddings + embeddings = self.dropout(embeddings) + return embeddings diff --git a/monai/networks/blocks/selfattention.py b/monai/networks/blocks/selfattention.py new file mode 100644 index 0000000000..bd5bbfa072 --- /dev/null +++ b/monai/networks/blocks/selfattention.py @@ -0,0 +1,68 @@ +# Copyright 2020 - 2021 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+
+from monai.utils import optional_import
+
+einops, has_einops = optional_import("einops")
+
+
+class SABlock(nn.Module):
+    """
+    A self-attention block, based on: "Dosovitskiy et al.,
+    An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale "
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        dropout_rate: float = 0.0,
+    ) -> None:
+        """
+        Args:
+            hidden_size: dimension of hidden layer.
+            num_heads: number of attention heads.
+            dropout_rate: fraction of the input units to drop.
+
+        """
+
+        super().__init__()
+
+        if not (0 <= dropout_rate <= 1):
+            raise AssertionError("dropout_rate should be between 0 and 1.")
+
+        if hidden_size % num_heads != 0:
+            raise AssertionError("hidden size should be divisible by num_heads.")
+
+        self.num_heads = num_heads
+        self.out_proj = nn.Linear(hidden_size, hidden_size)
+        self.qkv = nn.Linear(hidden_size, hidden_size * 3, bias=False)
+        self.drop_output = nn.Dropout(dropout_rate)
+        self.drop_weights = nn.Dropout(dropout_rate)
+        self.head_dim = hidden_size // num_heads
+        self.scale = self.head_dim ** -0.5
+        if has_einops:
+            self.rearrange = einops.rearrange
+        else:
+            raise ValueError("Requires einops.")
+
+    def forward(self, x):
+        q, k, v = self.rearrange(self.qkv(x), "b h (qkv l d) -> qkv b l h d", qkv=3, l=self.num_heads)
+        att_mat = (torch.einsum("blxd,blyd->blxy", q, k) * self.scale).softmax(dim=-1)
+        att_mat = self.drop_weights(att_mat)
+        x = torch.einsum("bhxy,bhyd->bhxd", att_mat, v)
+        x = self.rearrange(x, "b h l d -> b l (h d)")
+        x = self.out_proj(x)
+        x = self.drop_output(x)
+        return x
diff --git a/monai/networks/blocks/transformerblock.py b/monai/networks/blocks/transformerblock.py
new file mode 100644
index 0000000000..3dd80f58ad
--- /dev/null
+++ b/monai/networks/blocks/transformerblock.py
@@ -0,0 +1,56 @@
+# Copyright 2020 - 2021 MONAI Consortium
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch.nn as nn
+
+from monai.networks.blocks.mlp import MLPBlock
+from monai.networks.blocks.selfattention import SABlock
+
+
+class TransformerBlock(nn.Module):
+    """
+    A transformer block, based on: "Dosovitskiy et al.,
+    An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale "
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        mlp_dim: int,
+        num_heads: int,
+        dropout_rate: float = 0.0,
+    ) -> None:
+        """
+        Args:
+            hidden_size: dimension of hidden layer.
+            mlp_dim: dimension of feedforward layer.
+            num_heads: number of attention heads.
+            dropout_rate: fraction of the input units to drop.
+ + """ + + super().__init__() + + if not (0 <= dropout_rate <= 1): + raise AssertionError("dropout_rate should be between 0 and 1.") + + if hidden_size % num_heads != 0: + raise AssertionError("hidden size should be divisible by num_heads.") + + self.mlp = MLPBlock(hidden_size, mlp_dim, dropout_rate) + self.norm1 = nn.LayerNorm(hidden_size) + self.attn = SABlock(hidden_size, num_heads, dropout_rate) + self.norm2 = nn.LayerNorm(hidden_size) + + def forward(self, x): + x = x + self.attn(self.norm1(x)) + x = x + self.mlp(self.norm2(x)) + return x diff --git a/monai/networks/blocks/unetr_block.py b/monai/networks/blocks/unetr_block.py new file mode 100644 index 0000000000..24cefdcf1a --- /dev/null +++ b/monai/networks/blocks/unetr_block.py @@ -0,0 +1,261 @@ +# Copyright 2020 - 2021 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Sequence, Tuple, Union + +import torch +import torch.nn as nn + +from monai.networks.blocks.dynunet_block import UnetBasicBlock, UnetResBlock, get_conv_layer + + +class UnetrUpBlock(nn.Module): + """ + An upsampling module that can be used for UNETR: "Hatamizadeh et al., + UNETR: Transformers for 3D Medical Image Segmentation " + """ + + def __init__( + self, + spatial_dims: int, + in_channels: int, + out_channels: int, + kernel_size: Union[Sequence[int], int], + stride: Union[Sequence[int], int], + upsample_kernel_size: Union[Sequence[int], int], + norm_name: Union[Tuple, str], + res_block: bool = False, + ) -> None: + """ + Args: + spatial_dims: number of spatial dimensions. + in_channels: number of input channels. + out_channels: number of output channels. + kernel_size: convolution kernel size. + stride: convolution stride. + upsample_kernel_size: convolution kernel size for transposed convolution layers. + norm_name: feature normalization type and arguments. + res_block: bool argument to determine if residual block is used. 
+ + """ + + super(UnetrUpBlock, self).__init__() + upsample_stride = upsample_kernel_size + self.transp_conv = get_conv_layer( + spatial_dims, + in_channels, + out_channels, + kernel_size=upsample_kernel_size, + stride=upsample_stride, + conv_only=True, + is_transposed=True, + ) + + if res_block: + self.conv_block = UnetResBlock( + spatial_dims, + out_channels + out_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + norm_name=norm_name, + ) + else: + self.conv_block = UnetBasicBlock( + spatial_dims, + out_channels + out_channels, + out_channels, + kernel_size=kernel_size, + stride=1, + norm_name=norm_name, + ) + + def forward(self, inp, skip): + # number of channels for skip should equals to out_channels + out = self.transp_conv(inp) + out = torch.cat((out, skip), dim=1) + out = self.conv_block(out) + return out + + +class UnetrPrUpBlock(nn.Module): + """ + A projection upsampling module that can be used for UNETR: "Hatamizadeh et al., + UNETR: Transformers for 3D Medical Image Segmentation " + """ + + def __init__( + self, + spatial_dims: int, + in_channels: int, + out_channels: int, + num_layer: int, + kernel_size: Union[Sequence[int], int], + stride: Union[Sequence[int], int], + upsample_kernel_size: Union[Sequence[int], int], + norm_name: Union[Tuple, str], + conv_block: bool = False, + res_block: bool = False, + ) -> None: + """ + Args: + spatial_dims: number of spatial dimensions. + in_channels: number of input channels. + out_channels: number of output channels. + num_layer: number of upsampling blocks. + kernel_size: convolution kernel size. + stride: convolution stride. + upsample_kernel_size: convolution kernel size for transposed convolution layers. + norm_name: feature normalization type and arguments. + conv_block: bool argument to determine if convolutional block is used. + res_block: bool argument to determine if residual block is used. 
+ + """ + + super().__init__() + + upsample_stride = upsample_kernel_size + self.transp_conv_init = get_conv_layer( + spatial_dims, + in_channels, + out_channels, + kernel_size=upsample_kernel_size, + stride=upsample_stride, + conv_only=True, + is_transposed=True, + ) + if conv_block: + if res_block: + self.blocks = nn.ModuleList( + [ + nn.Sequential( + get_conv_layer( + spatial_dims, + out_channels, + out_channels, + kernel_size=upsample_kernel_size, + stride=upsample_stride, + conv_only=True, + is_transposed=True, + ), + UnetResBlock( + spatial_dims=3, + in_channels=out_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + norm_name=norm_name, + ), + ) + for i in range(num_layer) + ] + ) + else: + self.blocks = nn.ModuleList( + [ + nn.Sequential( + get_conv_layer( + spatial_dims, + out_channels, + out_channels, + kernel_size=upsample_kernel_size, + stride=upsample_stride, + conv_only=True, + is_transposed=True, + ), + UnetBasicBlock( + spatial_dims=3, + in_channels=out_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + norm_name=norm_name, + ), + ) + for i in range(num_layer) + ] + ) + else: + self.blocks = nn.ModuleList( + [ + get_conv_layer( + spatial_dims, + out_channels, + out_channels, + kernel_size=upsample_kernel_size, + stride=upsample_stride, + conv_only=True, + is_transposed=True, + ) + for i in range(num_layer) + ] + ) + + def forward(self, x): + x = self.transp_conv_init(x) + for blk in self.blocks: + x = blk(x) + return x + + +class UnetrBasicBlock(nn.Module): + """ + A CNN module that can be used for UNETR, based on: "Hatamizadeh et al., + UNETR: Transformers for 3D Medical Image Segmentation " + """ + + def __init__( + self, + spatial_dims: int, + in_channels: int, + out_channels: int, + kernel_size: Union[Sequence[int], int], + stride: Union[Sequence[int], int], + norm_name: Union[Tuple, str], + res_block: bool = False, + ) -> None: + """ + Args: + spatial_dims: number of spatial dimensions. + in_channels: number of input channels. + out_channels: number of output channels. + kernel_size: convolution kernel size. + stride: convolution stride. + norm_name: feature normalization type and arguments. + res_block: bool argument to determine if residual block is used. 
+ + """ + + super().__init__() + + if res_block: + self.layer = UnetResBlock( + spatial_dims=spatial_dims, + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + norm_name=norm_name, + ) + else: + self.layer = UnetBasicBlock( + spatial_dims=spatial_dims, + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + norm_name=norm_name, + ) + + def forward(self, inp): + out = self.layer(inp) + return out diff --git a/monai/networks/nets/__init__.py b/monai/networks/nets/__init__.py index 82a68aeea8..9cf6c5e07f 100644 --- a/monai/networks/nets/__init__.py +++ b/monai/networks/nets/__init__.py @@ -74,5 +74,7 @@ ) from .torchvision_fc import TorchVisionFCModel, TorchVisionFullyConvModel from .unet import UNet, Unet, unet +from .unetr import UNETR from .varautoencoder import VarAutoEncoder +from .vit import ViT from .vnet import VNet diff --git a/monai/networks/nets/unetr.py b/monai/networks/nets/unetr.py new file mode 100644 index 0000000000..14230e7a88 --- /dev/null +++ b/monai/networks/nets/unetr.py @@ -0,0 +1,198 @@ +# Copyright 2020 - 2021 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Tuple, Union + +import torch.nn as nn + +from monai.networks.blocks.dynunet_block import UnetOutBlock +from monai.networks.blocks.unetr_block import UnetrBasicBlock, UnetrPrUpBlock, UnetrUpBlock +from monai.networks.nets.vit import ViT + + +class UNETR(nn.Module): + """ + UNETR based on: "Hatamizadeh et al., + UNETR: Transformers for 3D Medical Image Segmentation " + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + img_size: Tuple, + feature_size: int, + hidden_size: int, + mlp_dim: int, + num_heads: int, + pos_embed: Union[Tuple, str], + norm_name: Union[Tuple, str], + conv_block: bool = False, + res_block: bool = False, + dropout_rate: float = 0.0, + ) -> None: + """ + Args: + in_channels: dimension of input channels. + out_channels: dimension of output channels. + img_size: dimension of input image. + feature_size: dimension of network feature size. + hidden_size: dimension of hidden layer. + mlp_dim: dimension of feedforward layer. + num_heads: number of attention heads. + pos_embed: position embedding layer type. + norm_name: feature normalization type and arguments. + conv_block: bool argument to determine if convolutional block is used. + res_block: bool argument to determine if residual block is used. + dropout_rate: faction of the input units to drop. 
+ + """ + + super().__init__() + + if not (0 <= dropout_rate <= 1): + raise AssertionError("dropout_rate should be between 0 and 1.") + + if hidden_size % num_heads != 0: + raise AssertionError("hidden size should be divisible by num_heads.") + + if pos_embed not in ["conv", "perceptron"]: + raise KeyError(f"Position embedding layer of type {pos_embed} is not supported.") + + self.num_layers = 12 + self.patch_size = (16, 16, 16) + self.feat_size = ( + img_size[0] // self.patch_size[0], + img_size[1] // self.patch_size[1], + img_size[2] // self.patch_size[2], + ) + self.hidden_size = hidden_size + self.classification = False + self.vit = ViT( + in_channels=in_channels, + img_size=img_size, + patch_size=self.patch_size, + hidden_size=hidden_size, + mlp_dim=mlp_dim, + num_layers=self.num_layers, + num_heads=num_heads, + pos_embed=pos_embed, + classification=self.classification, + dropout_rate=dropout_rate, + ) + self.encoder1 = UnetrBasicBlock( + spatial_dims=3, + in_channels=in_channels, + out_channels=feature_size, + kernel_size=3, + stride=1, + norm_name=norm_name, + res_block=res_block, + ) + self.encoder2 = UnetrPrUpBlock( + spatial_dims=3, + in_channels=hidden_size, + out_channels=feature_size * 2, + num_layer=2, + kernel_size=3, + stride=1, + upsample_kernel_size=2, + norm_name=norm_name, + conv_block=conv_block, + res_block=res_block, + ) + self.encoder3 = UnetrPrUpBlock( + spatial_dims=3, + in_channels=hidden_size, + out_channels=feature_size * 4, + num_layer=1, + kernel_size=3, + stride=1, + upsample_kernel_size=2, + norm_name=norm_name, + conv_block=conv_block, + res_block=res_block, + ) + self.encoder4 = UnetrPrUpBlock( + spatial_dims=3, + in_channels=hidden_size, + out_channels=feature_size * 8, + num_layer=0, + kernel_size=3, + stride=1, + upsample_kernel_size=2, + norm_name=norm_name, + conv_block=conv_block, + res_block=res_block, + ) + self.decoder5 = UnetrUpBlock( + spatial_dims=3, + in_channels=hidden_size, + out_channels=feature_size * 8, + stride=1, + kernel_size=3, + upsample_kernel_size=2, + norm_name=norm_name, + res_block=res_block, + ) + self.decoder4 = UnetrUpBlock( + spatial_dims=3, + in_channels=feature_size * 8, + out_channels=feature_size * 4, + stride=1, + kernel_size=3, + upsample_kernel_size=2, + norm_name=norm_name, + res_block=res_block, + ) + self.decoder3 = UnetrUpBlock( + spatial_dims=3, + in_channels=feature_size * 4, + out_channels=feature_size * 2, + stride=1, + kernel_size=3, + upsample_kernel_size=2, + norm_name=norm_name, + res_block=res_block, + ) + self.decoder2 = UnetrUpBlock( + spatial_dims=3, + in_channels=feature_size * 2, + out_channels=feature_size, + stride=1, + kernel_size=3, + upsample_kernel_size=2, + norm_name=norm_name, + res_block=res_block, + ) + self.out = UnetOutBlock(spatial_dims=3, in_channels=feature_size, out_channels=out_channels) + + def proj_feat(self, x, hidden_size, feat_size): + x = x.view(x.size(0), feat_size[0], feat_size[1], feat_size[2], hidden_size) + x = x.permute(0, 4, 1, 2, 3).contiguous() + return x + + def forward(self, x_in): + x, hidden_states_out = self.vit(x_in) + enc1 = self.encoder1(x_in) + x2 = hidden_states_out[3] + enc2 = self.encoder2(self.proj_feat(x2, self.hidden_size, self.feat_size)) + x3 = hidden_states_out[6] + enc3 = self.encoder3(self.proj_feat(x3, self.hidden_size, self.feat_size)) + x4 = hidden_states_out[9] + enc4 = self.encoder4(self.proj_feat(x4, self.hidden_size, self.feat_size)) + dec4 = self.proj_feat(x, self.hidden_size, self.feat_size) + dec3 = self.decoder5(dec4, enc4) + dec2 = 
self.decoder4(dec3, enc3) + dec1 = self.decoder3(dec2, enc2) + out = self.decoder2(dec1, enc1) + logits = self.out(out) + return logits diff --git a/monai/networks/nets/vit.py b/monai/networks/nets/vit.py new file mode 100644 index 0000000000..ec7e27f3ec --- /dev/null +++ b/monai/networks/nets/vit.py @@ -0,0 +1,91 @@ +# Copyright 2020 - 2021 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Tuple, Union + +import torch.nn as nn + +from monai.networks.blocks.patchembedding import PatchEmbeddingBlock +from monai.networks.blocks.transformerblock import TransformerBlock + + +class ViT(nn.Module): + """ + Vision Transformer (ViT), based on: "Dosovitskiy et al., + An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale " + """ + + def __init__( + self, + in_channels: int, + img_size: Tuple, + patch_size: Tuple, + hidden_size: int, + mlp_dim: int, + num_layers: int, + num_heads: int, + pos_embed: Union[Tuple, str], + classification: bool, + num_classes: int = 2, + dropout_rate: float = 0.0, + ) -> None: + """ + Args: + in_channels: dimension of input channels. + img_size: dimension of input image. + patch_size: dimension of patch size. + hidden_size: dimension of hidden layer. + mlp_dim: dimension of feedforward layer. + num_layers: number of transformer blocks. + num_heads: number of attention heads. + pos_embed: position embedding layer type. + classification: bool argument to determine if classification is used. + num_classes: number of classes if classification is used. + dropout_rate: faction of the input units to drop. 
+ + """ + + super().__init__() + + if not (0 <= dropout_rate <= 1): + raise AssertionError("dropout_rate should be between 0 and 1.") + + if hidden_size % num_heads != 0: + raise AssertionError("hidden size should be divisible by num_heads.") + + if img_size < patch_size: + raise AssertionError("patch_size should be smaller than img_size.") + + if pos_embed not in ["conv", "perceptron"]: + raise KeyError(f"Position embedding layer of type {pos_embed} is not supported.") + + self.classification = classification + self.patch_embedding = PatchEmbeddingBlock( + in_channels, img_size, patch_size, hidden_size, num_heads, pos_embed, classification, dropout_rate + ) + self.blocks = nn.ModuleList( + [TransformerBlock(hidden_size, mlp_dim, num_heads, dropout_rate) for i in range(num_layers)] + ) + self.norm = nn.LayerNorm(hidden_size) + if self.classification: + self.classification_head = nn.Linear(hidden_size, num_classes) + + def forward(self, x): + x = self.patch_embedding(x) + hidden_states_out = [] + for blk in self.blocks: + x = blk(x) + hidden_states_out.append(x) + x = self.norm(x) + if self.classification: + x = self.classification_head(x[:, 0]) + return x, hidden_states_out diff --git a/requirements-dev.txt b/requirements-dev.txt index f9aa56d801..0bbb5c75b5 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -34,4 +34,5 @@ sphinx-rtd-theme==0.5.2 cucim~=0.19.0; platform_system == "Linux" openslide-python==1.1.2 pandas -requests \ No newline at end of file +requests +einops diff --git a/setup.cfg b/setup.cfg index 1d26953adf..57118a9260 100644 --- a/setup.cfg +++ b/setup.cfg @@ -43,6 +43,7 @@ all = cucim~=0.19.0 openslide-python==1.1.2 pandas + einops nibabel = nibabel skimage = @@ -71,7 +72,8 @@ openslide = openslide-python==1.1.2 pandas = pandas - +einops = + einops [flake8] select = B,C,E,F,N,P,T4,W,B9 max_line_length = 120 diff --git a/tests/min_tests.py b/tests/min_tests.py index 046f9b4a40..a3f140b856 100644 --- a/tests/min_tests.py +++ b/tests/min_tests.py @@ -127,6 +127,13 @@ def run_testsuit(): "test_write_metrics_reports", "test_csv_dataset", "test_csv_iterable_dataset", + "test_mlp", + "test_patchembedding", + "test_selfattention", + "test_transformerblock", + "test_unetr", + "test_unetr_block", + "test_vit", ] assert sorted(exclude_cases) == sorted(set(exclude_cases)), f"Duplicated items in {exclude_cases}" diff --git a/tests/test_mlp.py b/tests/test_mlp.py new file mode 100644 index 0000000000..efc8db74c2 --- /dev/null +++ b/tests/test_mlp.py @@ -0,0 +1,52 @@ +# Copyright 2020 - 2021 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
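Before the MLPBlock test cases below, a minimal usage sketch (editorial illustration with arbitrary sizes, mirroring the shapes the tests exercise): the block maps a (batch, tokens, hidden_size) tensor through Linear, GELU, Dropout, Linear, Dropout and preserves the input shape.

import torch
from monai.networks.blocks import MLPBlock

block = MLPBlock(hidden_size=768, mlp_dim=3072, dropout_rate=0.1)
x = torch.randn(2, 512, 768)  # (batch, tokens, hidden_size)
print(block(x).shape)         # torch.Size([2, 512, 768]); the token shape is preserved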
+ +import unittest + +import numpy as np +import torch +from parameterized import parameterized + +from monai.networks import eval_mode +from monai.networks.blocks.mlp import MLPBlock + +TEST_CASE_MLP = [] +for dropout_rate in np.linspace(0, 1, 4): + for hidden_size in [128, 256, 512, 768]: + for mlp_dim in [512, 1028, 2048, 3072]: + + test_case = [ + { + "hidden_size": hidden_size, + "mlp_dim": mlp_dim, + "dropout_rate": dropout_rate, + }, + (2, 512, hidden_size), + (2, 512, hidden_size), + ] + TEST_CASE_MLP.append(test_case) + + +class TestMLPBlock(unittest.TestCase): + @parameterized.expand(TEST_CASE_MLP) + def test_shape(self, input_param, input_shape, expected_shape): + net = MLPBlock(**input_param) + with eval_mode(net): + result = net(torch.randn(input_shape)) + self.assertEqual(result.shape, expected_shape) + + def test_ill_arg(self): + with self.assertRaises(AssertionError): + MLPBlock(hidden_size=128, mlp_dim=512, dropout_rate=5.0) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_patchembedding.py b/tests/test_patchembedding.py new file mode 100644 index 0000000000..f3ca277a3d --- /dev/null +++ b/tests/test_patchembedding.py @@ -0,0 +1,123 @@ +# Copyright 2020 - 2021 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
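As context for the PatchEmbeddingBlock tests below, a small shape sketch (illustrative values; einops must be installed): a (batch, channels, H, W, D) volume is split into non-overlapping patches, each projected to hidden_size, and a learned positional embedding is added, giving one token per patch.

import torch
from monai.networks.blocks import PatchEmbeddingBlock

embed = PatchEmbeddingBlock(
    in_channels=1,
    img_size=(96, 96, 96),
    patch_size=(16, 16, 16),
    hidden_size=768,
    num_heads=12,
    pos_embed="conv",  # "perceptron" uses an einops Rearrange plus Linear instead of a strided Conv3d
    classification=False,
)
x = torch.randn(2, 1, 96, 96, 96)
# 96 / 16 = 6 patches per axis, so 6 ** 3 = 216 tokens of width 768
print(embed(x).shape)  # torch.Size([2, 216, 768])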
+ +import unittest +from unittest import skipUnless + +import numpy as np +import torch +from parameterized import parameterized + +from monai.networks import eval_mode +from monai.networks.blocks.patchembedding import PatchEmbeddingBlock +from monai.utils import optional_import + +einops, has_einops = optional_import("einops") + +TEST_CASE_PATCHEMBEDDINGBLOCK = [] +for dropout_rate in np.linspace(0, 1, 2): + for in_channels in [1, 4]: + for hidden_size in [360, 768]: + for img_size in [96, 128]: + for patch_size in [8, 16]: + for num_heads in [8, 12]: + for pos_embed in ["conv", "perceptron"]: + for classification in ["False", "True"]: + test_case = [ + { + "in_channels": in_channels, + "img_size": (img_size, img_size, img_size), + "patch_size": (patch_size, patch_size, patch_size), + "hidden_size": hidden_size, + "num_heads": num_heads, + "pos_embed": pos_embed, + "classification": False, + "dropout_rate": dropout_rate, + }, + (2, in_channels, img_size, *([img_size] * 2)), + (2, (img_size // patch_size) ** 3, hidden_size), + ] + TEST_CASE_PATCHEMBEDDINGBLOCK.append(test_case) + + +class TestPatchEmbeddingBlock(unittest.TestCase): + @parameterized.expand(TEST_CASE_PATCHEMBEDDINGBLOCK) + @skipUnless(has_einops, "Requires einops") + def test_shape(self, input_param, input_shape, expected_shape): + net = PatchEmbeddingBlock(**input_param) + with eval_mode(net): + result = net(torch.randn(input_shape)) + self.assertEqual(result.shape, expected_shape) + + def test_ill_arg(self): + with self.assertRaises(AssertionError): + PatchEmbeddingBlock( + in_channels=1, + img_size=(128, 128, 128), + patch_size=(16, 16, 16), + hidden_size=128, + num_heads=12, + pos_embed="conv", + classification=False, + dropout_rate=5.0, + ) + + with self.assertRaises(AssertionError): + PatchEmbeddingBlock( + in_channels=1, + img_size=(32, 32, 32), + patch_size=(64, 64, 64), + hidden_size=512, + num_heads=8, + pos_embed="perceptron", + classification=False, + dropout_rate=0.3, + ) + + with self.assertRaises(AssertionError): + PatchEmbeddingBlock( + in_channels=1, + img_size=(96, 96, 96), + patch_size=(8, 8, 8), + hidden_size=512, + num_heads=14, + pos_embed="conv", + classification=False, + dropout_rate=0.3, + ) + + with self.assertRaises(AssertionError): + PatchEmbeddingBlock( + in_channels=1, + img_size=(97, 97, 97), + patch_size=(4, 4, 4), + hidden_size=768, + num_heads=8, + pos_embed="perceptron", + classification=False, + dropout_rate=0.3, + ) + + with self.assertRaises(KeyError): + PatchEmbeddingBlock( + in_channels=4, + img_size=(96, 96, 96), + patch_size=(16, 16, 16), + hidden_size=768, + num_heads=12, + pos_embed="perc", + classification=False, + dropout_rate=0.3, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_selfattention.py b/tests/test_selfattention.py new file mode 100644 index 0000000000..2430b82c9b --- /dev/null +++ b/tests/test_selfattention.py @@ -0,0 +1,60 @@ +# Copyright 2020 - 2021 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
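For the SABlock tests below, a brief behavioural sketch (illustrative sizes; hidden_size must be divisible by num_heads, and einops is required): the block applies a joint qkv projection, multi-head scaled dot-product attention, and an output projection, leaving the token shape unchanged.

import torch
from monai.networks.blocks import SABlock

attn = SABlock(hidden_size=768, num_heads=12, dropout_rate=0.1)
x = torch.randn(2, 512, 768)
print(attn(x).shape)  # torch.Size([2, 512, 768])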
+ +import unittest +from unittest import skipUnless + +import numpy as np +import torch +from parameterized import parameterized + +from monai.networks import eval_mode +from monai.networks.blocks.selfattention import SABlock +from monai.utils import optional_import + +einops, has_einops = optional_import("einops") + +TEST_CASE_SABLOCK = [] +for dropout_rate in np.linspace(0, 1, 4): + for hidden_size in [360, 480, 600, 768]: + for num_heads in [4, 6, 8, 12]: + + test_case = [ + { + "hidden_size": hidden_size, + "num_heads": num_heads, + "dropout_rate": dropout_rate, + }, + (2, 512, hidden_size), + (2, 512, hidden_size), + ] + TEST_CASE_SABLOCK.append(test_case) + + +class TestResBlock(unittest.TestCase): + @parameterized.expand(TEST_CASE_SABLOCK) + @skipUnless(has_einops, "Requires einops") + def test_shape(self, input_param, input_shape, expected_shape): + net = SABlock(**input_param) + with eval_mode(net): + result = net(torch.randn(input_shape)) + self.assertEqual(result.shape, expected_shape) + + def test_ill_arg(self): + with self.assertRaises(AssertionError): + SABlock(hidden_size=128, num_heads=12, dropout_rate=6.0) + + with self.assertRaises(AssertionError): + SABlock(hidden_size=620, num_heads=8, dropout_rate=0.4) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_transformerblock.py b/tests/test_transformerblock.py new file mode 100644 index 0000000000..24d16c77aa --- /dev/null +++ b/tests/test_transformerblock.py @@ -0,0 +1,57 @@ +# Copyright 2020 - 2021 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
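The TransformerBlock tests below exercise the pre-norm residual composition of the two blocks above; as a sketch (illustrative sizes), the output is x + SABlock(LayerNorm(x)) followed by a second residual step through MLPBlock, so the shape is again preserved.

import torch
from monai.networks.blocks import TransformerBlock

blk = TransformerBlock(hidden_size=768, mlp_dim=3072, num_heads=12, dropout_rate=0.1)
x = torch.randn(2, 512, 768)
print(blk(x).shape)  # torch.Size([2, 512, 768])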
+ +import unittest + +import numpy as np +import torch +from parameterized import parameterized + +from monai.networks import eval_mode +from monai.networks.blocks.transformerblock import TransformerBlock + +TEST_CASE_TRANSFORMERBLOCK = [] +for dropout_rate in np.linspace(0, 1, 4): + for hidden_size in [360, 480, 600, 768]: + for num_heads in [4, 8, 12]: + for mlp_dim in [1024, 3072]: + + test_case = [ + { + "hidden_size": hidden_size, + "num_heads": num_heads, + "mlp_dim": mlp_dim, + "dropout_rate": dropout_rate, + }, + (2, 512, hidden_size), + (2, 512, hidden_size), + ] + TEST_CASE_TRANSFORMERBLOCK.append(test_case) + + +class TestTransformerBlock(unittest.TestCase): + @parameterized.expand(TEST_CASE_TRANSFORMERBLOCK) + def test_shape(self, input_param, input_shape, expected_shape): + net = TransformerBlock(**input_param) + with eval_mode(net): + result = net(torch.randn(input_shape)) + self.assertEqual(result.shape, expected_shape) + + def test_ill_arg(self): + with self.assertRaises(AssertionError): + TransformerBlock(hidden_size=128, num_heads=12, mlp_dim=2048, dropout_rate=4.0) + + with self.assertRaises(AssertionError): + TransformerBlock(hidden_size=622, num_heads=8, mlp_dim=3072, dropout_rate=0.4) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_unetr.py b/tests/test_unetr.py new file mode 100644 index 0000000000..3193c12465 --- /dev/null +++ b/tests/test_unetr.py @@ -0,0 +1,121 @@ +# Copyright 2020 - 2021 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
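Ahead of the UNETR test cases below, an end-to-end usage sketch (parameter values are illustrative and match those used by the tests; not part of the patch): the network wraps the ViT added above as its encoder with a fixed 16 x 16 x 16 patch size, so img_size should be divisible by 16, and the decoder returns a segmentation map at the input resolution.

import torch
from monai.networks.nets import UNETR

net = UNETR(
    in_channels=1,
    out_channels=2,
    img_size=(96, 96, 96),
    feature_size=16,
    hidden_size=768,
    mlp_dim=3072,
    num_heads=12,
    pos_embed="conv",
    norm_name="instance",
    conv_block=True,
    res_block=True,
)
x = torch.randn(1, 1, 96, 96, 96)
print(net(x).shape)  # torch.Size([1, 2, 96, 96, 96])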
+ +import unittest + +import torch +from parameterized import parameterized + +from monai.networks import eval_mode +from monai.networks.nets.unetr import UNETR + +TEST_CASE_UNETR = [] +for dropout_rate in [0.4]: + for in_channels in [1, 4]: + for out_channels in [2, 3]: + for hidden_size in [768]: + for img_size in [96, 128]: + for feature_size in [16]: + for num_heads in [8]: + for mlp_dim in [3072]: + for norm_name in ["instance"]: + for pos_embed in ["conv", "perceptron"]: + for conv_block in [True]: + for res_block in [False]: + test_case = [ + { + "in_channels": in_channels, + "out_channels": out_channels, + "img_size": (img_size, img_size, img_size), + "hidden_size": hidden_size, + "feature_size": feature_size, + "norm_name": norm_name, + "mlp_dim": mlp_dim, + "num_heads": num_heads, + "pos_embed": pos_embed, + "dropout_rate": dropout_rate, + "conv_block": conv_block, + "res_block": conv_block, + }, + (2, in_channels, img_size, *([img_size] * 2)), + (2, out_channels, img_size, *([img_size] * 2)), + ] + TEST_CASE_UNETR.append(test_case) + + +class TestPatchEmbeddingBlock(unittest.TestCase): + @parameterized.expand(TEST_CASE_UNETR) + def test_shape(self, input_param, input_shape, expected_shape): + net = UNETR(**input_param) + with eval_mode(net): + result = net(torch.randn(input_shape)) + self.assertEqual(result.shape, expected_shape) + + def test_ill_arg(self): + with self.assertRaises(AssertionError): + UNETR( + in_channels=1, + out_channels=3, + img_size=(128, 128, 128), + feature_size=16, + hidden_size=128, + mlp_dim=3072, + num_heads=12, + pos_embed="conv", + norm_name="instance", + dropout_rate=5.0, + ) + + with self.assertRaises(AssertionError): + UNETR( + in_channels=1, + out_channels=4, + img_size=(32, 32, 32), + feature_size=32, + hidden_size=512, + mlp_dim=3072, + num_heads=12, + pos_embed="conv", + norm_name="instance", + dropout_rate=0.5, + ) + + with self.assertRaises(AssertionError): + UNETR( + in_channels=1, + out_channels=3, + img_size=(96, 96, 96), + feature_size=16, + hidden_size=512, + mlp_dim=3072, + num_heads=14, + pos_embed="conv", + norm_name="batch", + dropout_rate=0.4, + ) + + with self.assertRaises(KeyError): + UNETR( + in_channels=1, + out_channels=4, + img_size=(96, 96, 96), + feature_size=8, + hidden_size=768, + mlp_dim=3072, + num_heads=12, + pos_embed="perc", + norm_name="instance", + dropout_rate=0.2, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_unetr_block.py b/tests/test_unetr_block.py new file mode 100644 index 0000000000..d2484558e8 --- /dev/null +++ b/tests/test_unetr_block.py @@ -0,0 +1,157 @@ +# Copyright 2020 - 2021 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
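The UNETR block tests below cover the encoder and decoder building blocks; as a quick sketch (illustrative shapes), UnetrBasicBlock is a shape-preserving convolutional block, while UnetrUpBlock upsamples by upsample_kernel_size with a transposed convolution, concatenates the skip tensor along the channel axis, and applies a convolutional block.

import torch
from monai.networks.blocks import UnetrBasicBlock, UnetrUpBlock

enc = UnetrBasicBlock(spatial_dims=3, in_channels=1, out_channels=16, kernel_size=3, stride=1, norm_name="instance")
print(enc(torch.randn(1, 1, 32, 32, 32)).shape)  # torch.Size([1, 16, 32, 32, 32])

dec = UnetrUpBlock(
    spatial_dims=3,
    in_channels=32,
    out_channels=16,
    kernel_size=3,
    stride=1,
    upsample_kernel_size=2,
    norm_name="instance",
)
x = torch.randn(1, 32, 8, 8, 8)          # low-resolution features
skip = torch.randn(1, 16, 16, 16, 16)    # skip connection with out_channels channels at 2x resolution
print(dec(x, skip).shape)                # torch.Size([1, 16, 16, 16, 16])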
+ +import unittest + +import torch +from parameterized import parameterized + +from monai.networks import eval_mode +from monai.networks.blocks.dynunet_block import get_padding +from monai.networks.blocks.unetr_block import UnetrBasicBlock, UnetrPrUpBlock, UnetrUpBlock +from tests.utils import test_script_save + +TEST_CASE_UNETR_BASIC_BLOCK = [] +for spatial_dims in range(2, 4): + for kernel_size in [1, 3]: + for stride in [2]: + for norm_name in [("GROUP", {"num_groups": 16}), ("batch", {"track_running_stats": False}), "instance"]: + for in_size in [15, 16]: + padding = get_padding(kernel_size, stride) + if not isinstance(padding, int): + padding = padding[0] + out_size = int((in_size + 2 * padding - kernel_size) / stride) + 1 + test_case = [ + { + "spatial_dims": spatial_dims, + "in_channels": 16, + "out_channels": 16, + "kernel_size": kernel_size, + "norm_name": norm_name, + "stride": stride, + }, + (1, 16, *([in_size] * spatial_dims)), + (1, 16, *([out_size] * spatial_dims)), + ] + TEST_CASE_UNETR_BASIC_BLOCK.append(test_case) + +TEST_UP_BLOCK = [] +in_channels, out_channels = 4, 2 +for spatial_dims in range(2, 4): + for kernel_size in [1, 3]: + for stride in [1, 2]: + for res_block in [False, True]: + for norm_name in ["batch", "instance"]: + for in_size in [15, 16]: + out_size = in_size * stride + test_case = [ + { + "spatial_dims": spatial_dims, + "in_channels": in_channels, + "out_channels": out_channels, + "kernel_size": kernel_size, + "norm_name": norm_name, + "stride": stride, + "upsample_kernel_size": stride, + }, + (1, in_channels, *([in_size] * spatial_dims)), + (1, out_channels, *([out_size] * spatial_dims)), + (1, out_channels, *([in_size * stride] * spatial_dims)), + ] + TEST_UP_BLOCK.append(test_case) + + +TEST_PRUP_BLOCK = [] +in_channels, out_channels = 4, 2 +for spatial_dims in range(2, 4): + for kernel_size in [1, 3]: + for upsample_kernel_size in [2, 3]: + for stride in [1, 2]: + for res_block in [False, True]: + for norm_name in ["batch", "instance"]: + for in_size in [15, 16]: + for num_layer in [0, 2]: + in_size_tmp = in_size + for num in range(num_layer + 1): + out_size = in_size_tmp * upsample_kernel_size + in_size_tmp = out_size + test_case = [ + { + "spatial_dims": spatial_dims, + "in_channels": in_channels, + "out_channels": out_channels, + "num_layer": num_layer, + "kernel_size": kernel_size, + "norm_name": norm_name, + "stride": stride, + "upsample_kernel_size": upsample_kernel_size, + }, + (1, in_channels, *([in_size] * spatial_dims)), + (1, out_channels, *([out_size] * spatial_dims)), + ] + TEST_PRUP_BLOCK.append(test_case) + + +class TestResBasicBlock(unittest.TestCase): + @parameterized.expand(TEST_CASE_UNETR_BASIC_BLOCK) + def test_shape(self, input_param, input_shape, expected_shape): + for net in [UnetrBasicBlock(**input_param)]: + with eval_mode(net): + result = net(torch.randn(input_shape)) + self.assertEqual(result.shape, expected_shape) + + def test_ill_arg(self): + with self.assertRaises(KeyError): + UnetrBasicBlock(3, 4, 2, kernel_size=3, stride=1, norm_name="norm") + with self.assertRaises(AssertionError): + UnetrBasicBlock(3, 4, 2, kernel_size=1, stride=4, norm_name="batch") + + def test_script(self): + input_param, input_shape, _ = TEST_CASE_UNETR_BASIC_BLOCK[0] + net = UnetrBasicBlock(**input_param) + with eval_mode(net): + test_data = torch.randn(input_shape) + test_script_save(net, test_data) + + +class TestUpBlock(unittest.TestCase): + @parameterized.expand(TEST_UP_BLOCK) + def test_shape(self, input_param, input_shape, expected_shape, 
skip_shape): + net = UnetrUpBlock(**input_param) + with eval_mode(net): + result = net(torch.randn(input_shape), torch.randn(skip_shape)) + self.assertEqual(result.shape, expected_shape) + + def test_script(self): + input_param, input_shape, _, skip_shape = TEST_UP_BLOCK[0] + net = UnetrUpBlock(**input_param) + test_data = torch.randn(input_shape) + skip_data = torch.randn(skip_shape) + test_script_save(net, test_data, skip_data) + + +class TestPrUpBlock(unittest.TestCase): + @parameterized.expand(TEST_PRUP_BLOCK) + def test_shape(self, input_param, input_shape, expected_shape): + net = UnetrPrUpBlock(**input_param) + with eval_mode(net): + result = net(torch.randn(input_shape)) + self.assertEqual(result.shape, expected_shape) + + def test_script(self): + input_param, input_shape, _ = TEST_PRUP_BLOCK[0] + net = UnetrPrUpBlock(**input_param) + test_data = torch.randn(input_shape) + test_script_save(net, test_data) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_vit.py b/tests/test_vit.py new file mode 100644 index 0000000000..cd34141d87 --- /dev/null +++ b/tests/test_vit.py @@ -0,0 +1,133 @@ +# Copyright 2020 - 2021 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import torch +from parameterized import parameterized + +from monai.networks import eval_mode +from monai.networks.nets.vit import ViT + +TEST_CASE_Vit = [] +for dropout_rate in [0.6]: + for in_channels in [1, 4]: + for hidden_size in [360, 768]: + for img_size in [96, 128]: + for patch_size in [8, 16]: + for num_heads in [8, 12]: + for mlp_dim in [3072]: + for num_layers in [4]: + for num_classes in [2]: + for pos_embed in ["conv", "perceptron"]: + for classification in ["False", "True"]: + test_case = [ + { + "in_channels": in_channels, + "img_size": (img_size, img_size, img_size), + "patch_size": (patch_size, patch_size, patch_size), + "hidden_size": hidden_size, + "mlp_dim": mlp_dim, + "num_layers": num_layers, + "num_heads": num_heads, + "pos_embed": pos_embed, + "classification": False, + "num_classes": num_classes, + "dropout_rate": dropout_rate, + }, + (2, in_channels, img_size, *([img_size] * 2)), + (2, (img_size // patch_size) ** 3, hidden_size), + ] + TEST_CASE_Vit.append(test_case) + + +class TestPatchEmbeddingBlock(unittest.TestCase): + @parameterized.expand(TEST_CASE_Vit) + def test_shape(self, input_param, input_shape, expected_shape): + net = ViT(**input_param) + with eval_mode(net): + result, _ = net(torch.randn(input_shape)) + self.assertEqual(result.shape, expected_shape) + + def test_ill_arg(self): + with self.assertRaises(AssertionError): + ViT( + in_channels=1, + img_size=(128, 128, 128), + patch_size=(16, 16, 16), + hidden_size=128, + mlp_dim=3072, + num_layers=12, + num_heads=12, + pos_embed="conv", + classification=False, + dropout_rate=5.0, + ) + + with self.assertRaises(AssertionError): + ViT( + in_channels=1, + img_size=(32, 32, 32), + patch_size=(64, 64, 64), + hidden_size=512, + mlp_dim=3072, + num_layers=12, + num_heads=8, + pos_embed="perceptron", + 
classification=False, + dropout_rate=0.3, + ) + + with self.assertRaises(AssertionError): + ViT( + in_channels=1, + img_size=(96, 96, 96), + patch_size=(8, 8, 8), + hidden_size=512, + mlp_dim=3072, + num_layers=12, + num_heads=14, + pos_embed="conv", + classification=False, + dropout_rate=0.3, + ) + + with self.assertRaises(AssertionError): + ViT( + in_channels=1, + img_size=(97, 97, 97), + patch_size=(4, 4, 4), + hidden_size=768, + mlp_dim=3072, + num_layers=12, + num_heads=8, + pos_embed="perceptron", + classification=False, + dropout_rate=0.3, + ) + + with self.assertRaises(KeyError): + ViT( + in_channels=4, + img_size=(96, 96, 96), + patch_size=(16, 16, 16), + hidden_size=768, + mlp_dim=3072, + num_layers=12, + num_heads=12, + pos_embed="perc", + classification=False, + dropout_rate=0.3, + ) + + +if __name__ == "__main__": + unittest.main() From d268f96a5436038624769d09f759ef3d7405834d Mon Sep 17 00:00:00 2001 From: ahatamizadeh Date: Tue, 29 Jun 2021 03:34:29 -0700 Subject: [PATCH 2/7] add UNETR, ViT Signed-off-by: ahatamizadeh --- monai/networks/blocks/unetr_block.py | 2 +- tests/test_patchembedding.py | 6 +++++- tests/test_unetr.py | 4 ++-- tests/test_unetr_block.py | 4 +++- tests/test_vit.py | 8 ++++++-- 5 files changed, 17 insertions(+), 7 deletions(-) diff --git a/monai/networks/blocks/unetr_block.py b/monai/networks/blocks/unetr_block.py index 24cefdcf1a..87f640347b 100644 --- a/monai/networks/blocks/unetr_block.py +++ b/monai/networks/blocks/unetr_block.py @@ -66,7 +66,7 @@ def __init__( out_channels + out_channels, out_channels, kernel_size=kernel_size, - stride=stride, + stride=1, norm_name=norm_name, ) else: diff --git a/tests/test_patchembedding.py b/tests/test_patchembedding.py index f3ca277a3d..c19c970c2b 100644 --- a/tests/test_patchembedding.py +++ b/tests/test_patchembedding.py @@ -31,6 +31,10 @@ for num_heads in [8, 12]: for pos_embed in ["conv", "perceptron"]: for classification in ["False", "True"]: + if classification: + out = (2, (img_size // patch_size) ** 3 + 1, hidden_size) + else: + out = (2, (img_size // patch_size) ** 3, hidden_size) test_case = [ { "in_channels": in_channels, @@ -39,7 +43,7 @@ "hidden_size": hidden_size, "num_heads": num_heads, "pos_embed": pos_embed, - "classification": False, + "classification": classification, "dropout_rate": dropout_rate, }, (2, in_channels, img_size, *([img_size] * 2)), diff --git a/tests/test_unetr.py b/tests/test_unetr.py index 3193c12465..2384070901 100644 --- a/tests/test_unetr.py +++ b/tests/test_unetr.py @@ -29,7 +29,7 @@ for norm_name in ["instance"]: for pos_embed in ["conv", "perceptron"]: for conv_block in [True]: - for res_block in [False]: + for res_block in [True, False]: test_case = [ { "in_channels": in_channels, @@ -43,7 +43,7 @@ "pos_embed": pos_embed, "dropout_rate": dropout_rate, "conv_block": conv_block, - "res_block": conv_block, + "res_block": res_block, }, (2, in_channels, img_size, *([img_size] * 2)), (2, out_channels, img_size, *([img_size] * 2)), diff --git a/tests/test_unetr_block.py b/tests/test_unetr_block.py index d2484558e8..93060c0e95 100644 --- a/tests/test_unetr_block.py +++ b/tests/test_unetr_block.py @@ -60,6 +60,7 @@ "kernel_size": kernel_size, "norm_name": norm_name, "stride": stride, + "res_block": res_block, "upsample_kernel_size": stride, }, (1, in_channels, *([in_size] * spatial_dims)), @@ -80,7 +81,7 @@ for in_size in [15, 16]: for num_layer in [0, 2]: in_size_tmp = in_size - for num in range(num_layer + 1): + for _num in range(num_layer + 1): out_size = in_size_tmp * 
upsample_kernel_size in_size_tmp = out_size test_case = [ @@ -92,6 +93,7 @@ "kernel_size": kernel_size, "norm_name": norm_name, "stride": stride, + "res_block": res_block, "upsample_kernel_size": upsample_kernel_size, }, (1, in_channels, *([in_size] * spatial_dims)), diff --git a/tests/test_vit.py b/tests/test_vit.py index cd34141d87..82980fe967 100644 --- a/tests/test_vit.py +++ b/tests/test_vit.py @@ -29,6 +29,10 @@ for num_classes in [2]: for pos_embed in ["conv", "perceptron"]: for classification in ["False", "True"]: + if classification: + out = (2, num_classes) + else: + out = (2, (img_size // patch_size) ** 3, hidden_size) test_case = [ { "in_channels": in_channels, @@ -39,12 +43,12 @@ "num_layers": num_layers, "num_heads": num_heads, "pos_embed": pos_embed, - "classification": False, + "classification": classification, "num_classes": num_classes, "dropout_rate": dropout_rate, }, (2, in_channels, img_size, *([img_size] * 2)), - (2, (img_size // patch_size) ** 3, hidden_size), + out, ] TEST_CASE_Vit.append(test_case) From 70674f7a8cf2609e343d2dba50aee1e2ba53f35d Mon Sep 17 00:00:00 2001 From: ahatamizadeh Date: Wed, 30 Jun 2021 00:32:21 -0700 Subject: [PATCH 3/7] add UNETR, ViT Signed-off-by: ahatamizadeh --- monai/networks/blocks/patchembedding.py | 14 ++++++-------- monai/networks/blocks/unetr_block.py | 6 +++--- tests/test_vit.py | 2 +- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/monai/networks/blocks/patchembedding.py b/monai/networks/blocks/patchembedding.py index 7d74c02493..8bd9911b12 100644 --- a/monai/networks/blocks/patchembedding.py +++ b/monai/networks/blocks/patchembedding.py @@ -30,11 +30,11 @@ class PatchEmbeddingBlock(nn.Module): def __init__( self, in_channels: int, - img_size: Tuple, - patch_size: Tuple, + img_size: Union[int, Tuple[int, int, int]], + patch_size: Union[int, Tuple[int, int, int]], hidden_size: int, num_heads: int, - pos_embed: Union[Tuple, str], + pos_embed: Union[Tuple, str], # type: ignore classification: bool, dropout_rate: float = 0.0, ) -> None: @@ -77,16 +77,16 @@ def __init__( raise ValueError('"Requires einops.') self.n_patches = ( - (img_size[0] // patch_size[0]) * (img_size[1] // patch_size[1]) * (img_size[2] // patch_size[2]) + (img_size[0] // patch_size[0]) * (img_size[1] // patch_size[1]) * (img_size[2] // patch_size[2]) # type: ignore ) self.patch_dim = in_channels * patch_size[0] * patch_size[1] * patch_size[2] self.pos_embed = pos_embed if self.pos_embed == "conv": self.patch_embeddings = nn.Conv3d( - in_channels=in_channels, out_channels=hidden_size, kernel_size=patch_size, stride=patch_size + in_channels=in_channels, out_channels=hidden_size, kernel_size=patch_size, stride=patch_size # type: ignore ) elif self.pos_embed == "perceptron": - self.patch_embeddings = nn.Sequential( + self.patch_embeddings = nn.Sequential( # type: ignore self.Rearrange( "b c (h p1) (w p2) (d p3)-> b (h w d) (p1 p2 p3 c)", p1=patch_size[0], @@ -95,8 +95,6 @@ def __init__( ), nn.Linear(self.patch_dim, hidden_size), ) - if classification: - self.position_embeddings_cls = nn.Parameter(torch.zeros(1, self.n_patches + 1, hidden_size)) self.position_embeddings = nn.Parameter(torch.zeros(1, self.n_patches, hidden_size)) self.cls_token = nn.Parameter(torch.zeros(1, 1, hidden_size)) self.dropout = nn.Dropout(dropout_rate) diff --git a/monai/networks/blocks/unetr_block.py b/monai/networks/blocks/unetr_block.py index 87f640347b..bf3211a9df 100644 --- a/monai/networks/blocks/unetr_block.py +++ b/monai/networks/blocks/unetr_block.py @@ -28,7 +28,7 
@@ def __init__( self, spatial_dims: int, in_channels: int, - out_channels: int, + out_channels: int, # type: ignore kernel_size: Union[Sequence[int], int], stride: Union[Sequence[int], int], upsample_kernel_size: Union[Sequence[int], int], @@ -70,7 +70,7 @@ def __init__( norm_name=norm_name, ) else: - self.conv_block = UnetBasicBlock( + self.conv_block = UnetBasicBlock( # type: ignore spatial_dims, out_channels + out_channels, out_channels, @@ -247,7 +247,7 @@ def __init__( norm_name=norm_name, ) else: - self.layer = UnetBasicBlock( + self.layer = UnetBasicBlock( # type: ignore spatial_dims=spatial_dims, in_channels=in_channels, out_channels=out_channels, diff --git a/tests/test_vit.py b/tests/test_vit.py index 82980fe967..03cd7ce64a 100644 --- a/tests/test_vit.py +++ b/tests/test_vit.py @@ -32,7 +32,7 @@ if classification: out = (2, num_classes) else: - out = (2, (img_size // patch_size) ** 3, hidden_size) + out = (2, (img_size // patch_size) ** 3, hidden_size) # type: ignore test_case = [ { "in_channels": in_channels, From c49d9f54f42df7992776f8c873d115fc21889273 Mon Sep 17 00:00:00 2001 From: ahatamizadeh Date: Wed, 30 Jun 2021 00:38:22 -0700 Subject: [PATCH 4/7] add UNETR, ViT Signed-off-by: ahatamizadeh --- monai/networks/blocks/patchembedding.py | 2 +- monai/networks/blocks/unetr_block.py | 4 ++-- tests/test_vit.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/monai/networks/blocks/patchembedding.py b/monai/networks/blocks/patchembedding.py index 8bd9911b12..4ce530b590 100644 --- a/monai/networks/blocks/patchembedding.py +++ b/monai/networks/blocks/patchembedding.py @@ -34,7 +34,7 @@ def __init__( patch_size: Union[int, Tuple[int, int, int]], hidden_size: int, num_heads: int, - pos_embed: Union[Tuple, str], # type: ignore + pos_embed: Union[Tuple, str], # type: ignore classification: bool, dropout_rate: float = 0.0, ) -> None: diff --git a/monai/networks/blocks/unetr_block.py b/monai/networks/blocks/unetr_block.py index bf3211a9df..20c39f6240 100644 --- a/monai/networks/blocks/unetr_block.py +++ b/monai/networks/blocks/unetr_block.py @@ -28,7 +28,7 @@ def __init__( self, spatial_dims: int, in_channels: int, - out_channels: int, # type: ignore + out_channels: int, # type: ignore kernel_size: Union[Sequence[int], int], stride: Union[Sequence[int], int], upsample_kernel_size: Union[Sequence[int], int], @@ -247,7 +247,7 @@ def __init__( norm_name=norm_name, ) else: - self.layer = UnetBasicBlock( # type: ignore + self.layer = UnetBasicBlock( # type: ignore spatial_dims=spatial_dims, in_channels=in_channels, out_channels=out_channels, diff --git a/tests/test_vit.py b/tests/test_vit.py index 03cd7ce64a..1df2f33f0f 100644 --- a/tests/test_vit.py +++ b/tests/test_vit.py @@ -32,7 +32,7 @@ if classification: out = (2, num_classes) else: - out = (2, (img_size // patch_size) ** 3, hidden_size) # type: ignore + out = (2, (img_size // patch_size) ** 3, hidden_size) # type: ignore test_case = [ { "in_channels": in_channels, From 70c2a1926abe679dfe8013e600a8f8e481830cdb Mon Sep 17 00:00:00 2001 From: ahatamizadeh Date: Wed, 30 Jun 2021 00:54:19 -0700 Subject: [PATCH 5/7] add UNETR, ViT Signed-off-by: ahatamizadeh --- monai/networks/blocks/patchembedding.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/monai/networks/blocks/patchembedding.py b/monai/networks/blocks/patchembedding.py index 4ce530b590..6b80fdcc40 100644 --- a/monai/networks/blocks/patchembedding.py +++ b/monai/networks/blocks/patchembedding.py @@ -59,27 +59,27 @@ def 
__init__( if hidden_size % num_heads != 0: raise AssertionError("hidden size should be divisible by num_heads.") - if img_size < patch_size: + if img_size < patch_size: # type: ignore raise AssertionError("patch_size should be smaller than img_size.") if pos_embed not in ["conv", "perceptron"]: raise KeyError(f"Position embedding layer of type {pos_embed} is not supported.") if pos_embed == "perceptron": - if img_size[0] % patch_size[0] != 0: + if img_size[0] % patch_size[0] != 0: # type: ignore raise AssertionError("img_size should be divisible by patch_size for perceptron patch embedding.") - if has_einops: - from einops.layers.torch import Rearrange + if has_einops: # type: ignore + from einops.layers.torch import Rearrange # type: ignore - self.Rearrange = Rearrange + self.Rearrange = Rearrange # type: ignore else: raise ValueError('"Requires einops.') self.n_patches = ( (img_size[0] // patch_size[0]) * (img_size[1] // patch_size[1]) * (img_size[2] // patch_size[2]) # type: ignore ) - self.patch_dim = in_channels * patch_size[0] * patch_size[1] * patch_size[2] + self.patch_dim = in_channels * patch_size[0] * patch_size[1] * patch_size[2] # type: ignore self.pos_embed = pos_embed if self.pos_embed == "conv": self.patch_embeddings = nn.Conv3d( @@ -89,9 +89,9 @@ def __init__( self.patch_embeddings = nn.Sequential( # type: ignore self.Rearrange( "b c (h p1) (w p2) (d p3)-> b (h w d) (p1 p2 p3 c)", - p1=patch_size[0], - p2=patch_size[1], - p3=patch_size[2], + p1=patch_size[0], # type: ignore + p2=patch_size[1], # type: ignore + p3=patch_size[2], # type: ignore ), nn.Linear(self.patch_dim, hidden_size), ) From 4f864a43a13f11a3843b432889e2905eb5370d05 Mon Sep 17 00:00:00 2001 From: ahatamizadeh Date: Wed, 30 Jun 2021 01:20:04 -0700 Subject: [PATCH 6/7] add UNETR, ViT Signed-off-by: ahatamizadeh --- monai/networks/nets/unetr.py | 10 +++++----- monai/networks/nets/vit.py | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/monai/networks/nets/unetr.py b/monai/networks/nets/unetr.py index 14230e7a88..4e8b68b43e 100644 --- a/monai/networks/nets/unetr.py +++ b/monai/networks/nets/unetr.py @@ -28,7 +28,7 @@ def __init__( self, in_channels: int, out_channels: int, - img_size: Tuple, + img_size: Tuple, # type: ignore feature_size: int, hidden_size: int, mlp_dim: int, @@ -70,9 +70,9 @@ def __init__( self.num_layers = 12 self.patch_size = (16, 16, 16) self.feat_size = ( - img_size[0] // self.patch_size[0], - img_size[1] // self.patch_size[1], - img_size[2] // self.patch_size[2], + img_size[0] // self.patch_size[0], # type: ignore + img_size[1] // self.patch_size[1], # type: ignore + img_size[2] // self.patch_size[2], # type: ignore ) self.hidden_size = hidden_size self.classification = False @@ -173,7 +173,7 @@ def __init__( norm_name=norm_name, res_block=res_block, ) - self.out = UnetOutBlock(spatial_dims=3, in_channels=feature_size, out_channels=out_channels) + self.out = UnetOutBlock(spatial_dims=3, in_channels=feature_size, out_channels=out_channels) # type: ignore def proj_feat(self, x, hidden_size, feat_size): x = x.view(x.size(0), feat_size[0], feat_size[1], feat_size[2], hidden_size) diff --git a/monai/networks/nets/vit.py b/monai/networks/nets/vit.py index ec7e27f3ec..1a43414aca 100644 --- a/monai/networks/nets/vit.py +++ b/monai/networks/nets/vit.py @@ -27,8 +27,8 @@ class ViT(nn.Module): def __init__( self, in_channels: int, - img_size: Tuple, - patch_size: Tuple, + img_size: Tuple, # type: ignore + patch_size: Tuple, # type: ignore hidden_size: int, mlp_dim: int, 
num_layers: int, @@ -70,10 +70,10 @@ def __init__( self.classification = classification self.patch_embedding = PatchEmbeddingBlock( - in_channels, img_size, patch_size, hidden_size, num_heads, pos_embed, classification, dropout_rate + in_channels, img_size, patch_size, hidden_size, num_heads, pos_embed, classification, dropout_rate # type: ignore ) self.blocks = nn.ModuleList( - [TransformerBlock(hidden_size, mlp_dim, num_heads, dropout_rate) for i in range(num_layers)] + [TransformerBlock(hidden_size, mlp_dim, num_heads, dropout_rate) for i in range(num_layers)] # type: ignore ) self.norm = nn.LayerNorm(hidden_size) if self.classification: From cc408d912f16697fe99bf08d7127986fb3bf250a Mon Sep 17 00:00:00 2001 From: ahatamizadeh Date: Wed, 30 Jun 2021 21:32:22 -0700 Subject: [PATCH 7/7] add UNETR, ViT Signed-off-by: ahatamizadeh --- tests/test_unetr.py | 8 ++++---- tests/test_unetr_block.py | 4 ++-- tests/test_vit.py | 12 ++++++------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/test_unetr.py b/tests/test_unetr.py index 2384070901..cd50cb487c 100644 --- a/tests/test_unetr.py +++ b/tests/test_unetr.py @@ -19,17 +19,17 @@ TEST_CASE_UNETR = [] for dropout_rate in [0.4]: - for in_channels in [1, 4]: - for out_channels in [2, 3]: + for in_channels in [1]: + for out_channels in [2]: for hidden_size in [768]: for img_size in [96, 128]: for feature_size in [16]: for num_heads in [8]: for mlp_dim in [3072]: for norm_name in ["instance"]: - for pos_embed in ["conv", "perceptron"]: + for pos_embed in ["perceptron"]: for conv_block in [True]: - for res_block in [True, False]: + for res_block in [False]: test_case = [ { "in_channels": in_channels, diff --git a/tests/test_unetr_block.py b/tests/test_unetr_block.py index 93060c0e95..ba988d07e6 100644 --- a/tests/test_unetr_block.py +++ b/tests/test_unetr_block.py @@ -49,7 +49,7 @@ for kernel_size in [1, 3]: for stride in [1, 2]: for res_block in [False, True]: - for norm_name in ["batch", "instance"]: + for norm_name in ["instance"]: for in_size in [15, 16]: out_size = in_size * stride test_case = [ @@ -77,7 +77,7 @@ for upsample_kernel_size in [2, 3]: for stride in [1, 2]: for res_block in [False, True]: - for norm_name in ["batch", "instance"]: + for norm_name in ["instance"]: for in_size in [15, 16]: for num_layer in [0, 2]: in_size_tmp = in_size diff --git a/tests/test_vit.py b/tests/test_vit.py index 1df2f33f0f..0d0d58093b 100644 --- a/tests/test_vit.py +++ b/tests/test_vit.py @@ -19,16 +19,16 @@ TEST_CASE_Vit = [] for dropout_rate in [0.6]: - for in_channels in [1, 4]: - for hidden_size in [360, 768]: + for in_channels in [4]: + for hidden_size in [768]: for img_size in [96, 128]: - for patch_size in [8, 16]: - for num_heads in [8, 12]: + for patch_size in [16]: + for num_heads in [12]: for mlp_dim in [3072]: for num_layers in [4]: for num_classes in [2]: - for pos_embed in ["conv", "perceptron"]: - for classification in ["False", "True"]: + for pos_embed in ["conv"]: + for classification in ["False"]: if classification: out = (2, num_classes) else: