diff --git a/monai/apps/generation/maisi/networks/diffusion_model_unet_maisi.py b/monai/apps/generation/maisi/networks/diffusion_model_unet_maisi.py
new file mode 100644
index 0000000000..d5f5f6136b
--- /dev/null
+++ b/monai/apps/generation/maisi/networks/diffusion_model_unet_maisi.py
@@ -0,0 +1,410 @@
+# Copyright (c) MONAI Consortium
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# =========================================================================
+# Adapted from https://github.com/huggingface/diffusers
+# which has the following license:
+# https://github.com/huggingface/diffusers/blob/main/LICENSE
+#
+# Copyright 2022 UC Berkeley Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =========================================================================
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+import torch
+from torch import nn
+
+from monai.networks.blocks import Convolution
+from monai.utils import ensure_tuple_rep, optional_import
+from monai.utils.type_conversion import convert_to_tensor
+
+get_down_block, has_get_down_block = optional_import(
+    "generative.networks.nets.diffusion_model_unet", name="get_down_block"
+)
+get_mid_block, has_get_mid_block = optional_import(
+    "generative.networks.nets.diffusion_model_unet", name="get_mid_block"
+)
+get_timestep_embedding, has_get_timestep_embedding = optional_import(
+    "generative.networks.nets.diffusion_model_unet", name="get_timestep_embedding"
+)
+get_up_block, has_get_up_block = optional_import("generative.networks.nets.diffusion_model_unet", name="get_up_block")
+xformers, has_xformers = optional_import("xformers")
+zero_module, has_zero_module = optional_import("generative.networks.nets.diffusion_model_unet", name="zero_module")
+
+__all__ = ["DiffusionModelUNetMaisi"]
+
+
+class DiffusionModelUNetMaisi(nn.Module):
+    """
+    U-Net network with timestep embedding and attention mechanisms for conditioning based on
+    Rombach et al. "High-Resolution Image Synthesis with Latent Diffusion Models" https://arxiv.org/abs/2112.10752
+    and Pinaya et al. "Brain Imaging Generation with Latent Diffusion Models" https://arxiv.org/abs/2209.07162
+
+    Args:
+        spatial_dims: Number of spatial dimensions.
+        in_channels: Number of input channels.
+        out_channels: Number of output channels.
+        num_res_blocks: Number of residual blocks (see ResnetBlock) per level. Can be a single integer or a sequence of integers.
+        num_channels: Tuple of block output channels.
+        attention_levels: List of levels to add attention.
+        norm_num_groups: Number of groups for the normalization.
+        norm_eps: Epsilon for the normalization.
+        resblock_updown: If True, use residual blocks for up/downsampling.
+        num_head_channels: Number of channels in each attention head. Can be a single integer or a sequence of integers.
+        with_conditioning: If True, add spatial transformers to perform conditioning.
+        transformer_num_layers: Number of layers of Transformer blocks to use.
+        cross_attention_dim: Number of context dimensions to use.
+        num_class_embeds: If specified (as an int), then this model will be class-conditional with `num_class_embeds` classes.
+        upcast_attention: If True, upcast attention operations to full precision.
+        use_flash_attention: If True, use flash attention for a memory efficient attention mechanism.
+        dropout_cattn: If different from zero, this will be the dropout value for the cross-attention layers.
+        include_top_region_index_input: If True, use top region index input.
+        include_bottom_region_index_input: If True, use bottom region index input.
+        include_spacing_input: If True, use spacing input.
+    """
+
+    def __init__(
+        self,
+        spatial_dims: int,
+        in_channels: int,
+        out_channels: int,
+        num_res_blocks: Sequence[int] | int = (2, 2, 2, 2),
+        num_channels: Sequence[int] = (32, 64, 64, 64),
+        attention_levels: Sequence[bool] = (False, False, True, True),
+        norm_num_groups: int = 32,
+        norm_eps: float = 1e-6,
+        resblock_updown: bool = False,
+        num_head_channels: int | Sequence[int] = 8,
+        with_conditioning: bool = False,
+        transformer_num_layers: int = 1,
+        cross_attention_dim: int | None = None,
+        num_class_embeds: int | None = None,
+        upcast_attention: bool = False,
+        use_flash_attention: bool = False,
+        dropout_cattn: float = 0.0,
+        include_top_region_index_input: bool = False,
+        include_bottom_region_index_input: bool = False,
+        include_spacing_input: bool = False,
+    ) -> None:
+        super().__init__()
+        if with_conditioning is True and cross_attention_dim is None:
+            raise ValueError(
+                "DiffusionModelUNetMaisi expects the dimension of the cross-attention conditioning "
+                "(cross_attention_dim) to be specified when using with_conditioning."
+            )
+        if cross_attention_dim is not None and with_conditioning is False:
+            raise ValueError(
+                "DiffusionModelUNetMaisi expects with_conditioning=True when specifying the cross_attention_dim."
+            )
+        if dropout_cattn > 1.0 or dropout_cattn < 0.0:
+            raise ValueError("Dropout cannot be negative or >1.0!")
+
+        # All numbers of channels should be multiples of norm_num_groups
+        if any((out_channel % norm_num_groups) != 0 for out_channel in num_channels):
+            raise ValueError(
+                f"DiffusionModelUNetMaisi expects all num_channels to be multiples of norm_num_groups, "
+                f"but got num_channels: {num_channels} and norm_num_groups: {norm_num_groups}"
+            )
+
+        if len(num_channels) != len(attention_levels):
+            raise ValueError(
+                f"DiffusionModelUNetMaisi expects num_channels to have the same length as attention_levels, "
+                f"but got num_channels: {len(num_channels)} and attention_levels: {len(attention_levels)}"
+            )
+
+        if isinstance(num_head_channels, int):
+            num_head_channels = ensure_tuple_rep(num_head_channels, len(attention_levels))
+
+        if len(num_head_channels) != len(attention_levels):
+            raise ValueError(
+                "num_head_channels should have the same length as attention_levels. For levels without attention,"
+                " i.e. `attention_levels[i]=False`, num_head_channels[i] will be ignored."
+            )
+
+        if isinstance(num_res_blocks, int):
+            num_res_blocks = ensure_tuple_rep(num_res_blocks, len(num_channels))
+
+        if len(num_res_blocks) != len(num_channels):
+            raise ValueError(
+                "`num_res_blocks` should be a single integer or a tuple of integers with the same length as "
+                "`num_channels`."
+            )
+
+        if use_flash_attention and not has_xformers:
+            raise ValueError("use_flash_attention is True but xformers is not installed.")
+
+        if use_flash_attention is True and not torch.cuda.is_available():
+            raise ValueError(
+                "torch.cuda.is_available() should be True but is False. Flash attention is only available for GPU."
+            )
+
+        self.in_channels = in_channels
+        self.block_out_channels = num_channels
+        self.out_channels = out_channels
+        self.num_res_blocks = num_res_blocks
+        self.attention_levels = attention_levels
+        self.num_head_channels = num_head_channels
+        self.with_conditioning = with_conditioning
+
+        # input
+        self.conv_in = Convolution(
+            spatial_dims=spatial_dims,
+            in_channels=in_channels,
+            out_channels=num_channels[0],
+            strides=1,
+            kernel_size=3,
+            padding=1,
+            conv_only=True,
+        )
+
+        # time
+        time_embed_dim = num_channels[0] * 4
+        self.time_embed = self._create_embedding_module(num_channels[0], time_embed_dim)
+
+        # class embedding
+        self.num_class_embeds = num_class_embeds
+        if num_class_embeds is not None:
+            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
+
+        self.include_top_region_index_input = include_top_region_index_input
+        self.include_bottom_region_index_input = include_bottom_region_index_input
+        self.include_spacing_input = include_spacing_input
+
+        new_time_embed_dim = time_embed_dim
+        if self.include_top_region_index_input:
+            self.top_region_index_layer = self._create_embedding_module(4, time_embed_dim)
+            new_time_embed_dim += time_embed_dim
+        if self.include_bottom_region_index_input:
+            self.bottom_region_index_layer = self._create_embedding_module(4, time_embed_dim)
+            new_time_embed_dim += time_embed_dim
+        if self.include_spacing_input:
+            self.spacing_layer = self._create_embedding_module(3, time_embed_dim)
+            new_time_embed_dim += time_embed_dim
+
+        # down
+        self.down_blocks = nn.ModuleList([])
+        output_channel = num_channels[0]
+        for i in range(len(num_channels)):
+            input_channel = output_channel
+            output_channel = num_channels[i]
+            is_final_block = i == len(num_channels) - 1
+
+            down_block = get_down_block(
+                spatial_dims=spatial_dims,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=new_time_embed_dim,
+                num_res_blocks=num_res_blocks[i],
+                norm_num_groups=norm_num_groups,
+                norm_eps=norm_eps,
+                add_downsample=not is_final_block,
+                resblock_updown=resblock_updown,
+                with_attn=(attention_levels[i] and not with_conditioning),
+                with_cross_attn=(attention_levels[i] and with_conditioning),
+                num_head_channels=num_head_channels[i],
+                transformer_num_layers=transformer_num_layers,
+                cross_attention_dim=cross_attention_dim,
+                upcast_attention=upcast_attention,
+                use_flash_attention=use_flash_attention,
+                dropout_cattn=dropout_cattn,
+            )
+
+            self.down_blocks.append(down_block)
+
+        # mid
+        self.middle_block = get_mid_block(
+            spatial_dims=spatial_dims,
+            in_channels=num_channels[-1],
+            temb_channels=new_time_embed_dim,
+            norm_num_groups=norm_num_groups,
+            norm_eps=norm_eps,
+            with_conditioning=with_conditioning,
+            num_head_channels=num_head_channels[-1],
+            transformer_num_layers=transformer_num_layers,
+            cross_attention_dim=cross_attention_dim,
+            upcast_attention=upcast_attention,
+            use_flash_attention=use_flash_attention,
+            dropout_cattn=dropout_cattn,
+        )
+
+        # up
+        self.up_blocks = nn.ModuleList([])
+        reversed_block_out_channels = list(reversed(num_channels))
+        reversed_num_res_blocks = list(reversed(num_res_blocks))
+        reversed_attention_levels = list(reversed(attention_levels))
+        reversed_num_head_channels = list(reversed(num_head_channels))
+        output_channel = reversed_block_out_channels[0]
+        for i in range(len(reversed_block_out_channels)):
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[min(i + 1, len(num_channels) - 1)]
+
+            is_final_block = i == len(num_channels) - 1
+
+            up_block = get_up_block(
+                spatial_dims=spatial_dims,
+                in_channels=input_channel,
+                prev_output_channel=prev_output_channel,
+                out_channels=output_channel,
+                temb_channels=new_time_embed_dim,
+                num_res_blocks=reversed_num_res_blocks[i] + 1,
+                norm_num_groups=norm_num_groups,
+                norm_eps=norm_eps,
+                add_upsample=not is_final_block,
+                resblock_updown=resblock_updown,
+                with_attn=(reversed_attention_levels[i] and not with_conditioning),
+                with_cross_attn=(reversed_attention_levels[i] and with_conditioning),
+                num_head_channels=reversed_num_head_channels[i],
+                transformer_num_layers=transformer_num_layers,
+                cross_attention_dim=cross_attention_dim,
+                upcast_attention=upcast_attention,
+                use_flash_attention=use_flash_attention,
+                dropout_cattn=dropout_cattn,
+            )
+
+            self.up_blocks.append(up_block)
+
+        # out
+        self.out = nn.Sequential(
+            nn.GroupNorm(num_groups=norm_num_groups, num_channels=num_channels[0], eps=norm_eps, affine=True),
+            nn.SiLU(),
+            zero_module(
+                Convolution(
+                    spatial_dims=spatial_dims,
+                    in_channels=num_channels[0],
+                    out_channels=out_channels,
+                    strides=1,
+                    kernel_size=3,
+                    padding=1,
+                    conv_only=True,
+                )
+            ),
+        )
+
+    def _create_embedding_module(self, input_dim, embed_dim):
+        model = nn.Sequential(nn.Linear(input_dim, embed_dim), nn.SiLU(), nn.Linear(embed_dim, embed_dim))
+        return model
+
+    def _get_time_and_class_embedding(self, x, timesteps, class_labels):
+        t_emb = get_timestep_embedding(timesteps, self.block_out_channels[0])
+
+        # `get_timestep_embedding` has no weights and always returns float32 tensors,
+        # but the time embedding may be running in fp16, so we need to cast here.
+        # There might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=x.dtype)
+        emb = self.time_embed(t_emb)
+
+        if self.num_class_embeds is not None:
+            if class_labels is None:
+                raise ValueError("class_labels should be provided when num_class_embeds > 0")
+            class_emb = self.class_embedding(class_labels)
+            class_emb = class_emb.to(dtype=x.dtype)
+            emb += class_emb
+        return emb
+
+    def _get_input_embeddings(self, emb, top_index, bottom_index, spacing):
+        if self.include_top_region_index_input:
+            _emb = self.top_region_index_layer(top_index)
+            emb = torch.cat((emb, _emb), dim=1)
+        if self.include_bottom_region_index_input:
+            _emb = self.bottom_region_index_layer(bottom_index)
+            emb = torch.cat((emb, _emb), dim=1)
+        if self.include_spacing_input:
+            _emb = self.spacing_layer(spacing)
+            emb = torch.cat((emb, _emb), dim=1)
+        return emb
+
+    def _apply_down_blocks(self, h, emb, context, down_block_additional_residuals):
+        if context is not None and self.with_conditioning is False:
+            raise ValueError("model should have with_conditioning = True if context is provided")
+        down_block_res_samples: list[torch.Tensor] = [h]
+        for downsample_block in self.down_blocks:
+            h, res_samples = downsample_block(hidden_states=h, temb=emb, context=context)
+            down_block_res_samples.extend(res_samples)
+
+        # Additional residual connections for ControlNets
+        if down_block_additional_residuals is not None:
+            new_down_block_res_samples: list[torch.Tensor] = []
+            for down_block_res_sample, down_block_additional_residual in zip(
+                down_block_res_samples, down_block_additional_residuals
+            ):
+                down_block_res_sample += down_block_additional_residual
+                new_down_block_res_samples.append(down_block_res_sample)
+
+            down_block_res_samples = new_down_block_res_samples
+        return h, down_block_res_samples
+
+    def _apply_up_blocks(self, h, emb, context, down_block_res_samples):
+        for upsample_block in self.up_blocks:
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+            h = upsample_block(hidden_states=h, res_hidden_states_list=res_samples, temb=emb, context=context)
+
+        return h
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        timesteps: torch.Tensor,
+        context: torch.Tensor | None = None,
+        class_labels: torch.Tensor | None = None,
+        down_block_additional_residuals: tuple[torch.Tensor] | None = None,
+        mid_block_additional_residual: torch.Tensor | None = None,
+        top_region_index_tensor: torch.Tensor | None = None,
+        bottom_region_index_tensor: torch.Tensor | None = None,
+        spacing_tensor: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """
+        Forward pass through the UNet model.
+
+        Args:
+            x: Input tensor of shape (N, C, SpatialDims).
+            timesteps: Timestep tensor of shape (N,).
+            context: Context tensor of shape (N, 1, ContextDim).
+            class_labels: Class labels tensor of shape (N,).
+            down_block_additional_residuals: Additional residual tensors for down blocks of shape (N, C, FeatureMapsDims).
+            mid_block_additional_residual: Additional residual tensor for mid block of shape (N, C, FeatureMapsDims).
+            top_region_index_tensor: Tensor representing top region index of shape (N, 4).
+            bottom_region_index_tensor: Tensor representing bottom region index of shape (N, 4).
+            spacing_tensor: Tensor representing spacing of shape (N, 3).
+
+        Returns:
+            A tensor representing the output of the UNet model.
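+
+        Example:
+
+            .. code-block:: python
+
+                # A minimal, illustrative sketch; the values mirror the unit tests
+                # in tests/test_diffusion_model_unet_maisi.py and are not tuned defaults.
+                net = DiffusionModelUNetMaisi(
+                    spatial_dims=2,
+                    in_channels=1,
+                    out_channels=1,
+                    num_res_blocks=1,
+                    num_channels=(8, 8, 8),
+                    attention_levels=(False, False, False),
+                    norm_num_groups=8,
+                    include_top_region_index_input=True,
+                    include_bottom_region_index_input=True,
+                    include_spacing_input=True,
+                )
+                out = net(
+                    x=torch.rand(1, 1, 16, 16),
+                    timesteps=torch.randint(0, 1000, (1,)).long(),
+                    top_region_index_tensor=torch.rand(1, 4),
+                    bottom_region_index_tensor=torch.rand(1, 4),
+                    spacing_tensor=torch.rand(1, 3),
+                )
+                # out.shape == (1, 1, 16, 16)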
+ """ + + emb = self._get_time_and_class_embedding(x, timesteps, class_labels) + emb = self._get_input_embeddings(emb, top_region_index_tensor, bottom_region_index_tensor, spacing_tensor) + h = self.conv_in(x) + h, _updated_down_block_res_samples = self._apply_down_blocks(h, emb, context, down_block_additional_residuals) + h = self.middle_block(h, emb, context) + + # Additional residual conections for Controlnets + if mid_block_additional_residual is not None: + h += mid_block_additional_residual + + h = self._apply_up_blocks(h, emb, context, _updated_down_block_res_samples) + h = self.out(h) + h_tensor: torch.Tensor = convert_to_tensor(h) + return h_tensor diff --git a/requirements-dev.txt b/requirements-dev.txt index b598f301f6..1bba930273 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -58,4 +58,4 @@ lpips==0.1.4 nvidia-ml-py huggingface_hub pyamg>=5.0.0 -git+https://github.com/KumoLiu/GenerativeModels.git@cuda#egg=monai-generative +git+https://github.com/Project-MONAI/GenerativeModels.git@7428fce193771e9564f29b91d29e523dd1b6b4cd diff --git a/tests/test_diffusion_model_unet_maisi.py b/tests/test_diffusion_model_unet_maisi.py new file mode 100644 index 0000000000..b5c14192d9 --- /dev/null +++ b/tests/test_diffusion_model_unet_maisi.py @@ -0,0 +1,592 @@ +# Copyright (c) MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from __future__ import annotations
+
+import unittest
+from unittest import skipUnless
+
+import torch
+from parameterized import parameterized
+
+from monai.networks import eval_mode
+from monai.utils import optional_import
+
+_, has_einops = optional_import("einops")
+_, has_generative = optional_import("generative")
+
+if has_generative:
+    from monai.apps.generation.maisi.networks.diffusion_model_unet_maisi import DiffusionModelUNetMaisi
+
+
+UNCOND_CASES_2D = [
+    [
+        {
+            "spatial_dims": 2,
+            "in_channels": 1,
+            "out_channels": 1,
+            "num_res_blocks": 1,
+            "num_channels": (8, 8, 8),
+            "attention_levels": (False, False, False),
+            "norm_num_groups": 8,
+        }
+    ],
+    [
+        {
+            "spatial_dims": 2,
+            "in_channels": 1,
+            "out_channels": 1,
+            "num_res_blocks": (1, 1, 2),
+            "num_channels": (8, 8, 8),
+            "attention_levels": (False, False, False),
+            "norm_num_groups": 8,
+        }
+    ],
+    [
+        {
+            "spatial_dims": 2,
+            "in_channels": 1,
+            "out_channels": 1,
+            "num_res_blocks": 1,
+            "num_channels": (8, 8, 8),
+            "attention_levels": (False, False, False),
+            "norm_num_groups": 8,
+            "resblock_updown": True,
+        }
+    ],
+    [
+        {
+            "spatial_dims": 2,
+            "in_channels": 1,
+            "out_channels": 1,
+            "num_res_blocks": 1,
+            "num_channels": (8, 8, 8),
+            "attention_levels": (False, False, True),
+            "num_head_channels": 8,
+            "norm_num_groups": 8,
+        }
+    ],
+    [
+        {
+            "spatial_dims": 2,
+            "in_channels": 1,
+            "out_channels": 1,
+            "num_res_blocks": 1,
+            "num_channels": (8, 8, 8),
+            "attention_levels": (False, False, True),
+            "num_head_channels": 8,
+            "norm_num_groups": 8,
+            "resblock_updown": True,
+        }
+    ],
+    [
+        {
+            "spatial_dims": 2,
+            "in_channels": 1,
+            "out_channels": 1,
+            "num_res_blocks": 1,
+            "num_channels": (8, 8, 8),
+            "attention_levels": (False, False, True),
+            "num_head_channels": 4,
+            "norm_num_groups": 8,
+        }
+    ],
+    [
+        {
+            "spatial_dims": 2,
+            "in_channels": 1,
+            "out_channels": 1,
+            "num_res_blocks": 1,
+            "num_channels": (8, 8, 8),
+            "attention_levels": (False, True, True),
+            "num_head_channels": (0, 2, 4),
+            "norm_num_groups": 8,
+        }
+    ],
+]
+
+UNCOND_CASES_3D = [
+    [
+        {
+            "spatial_dims": 3,
+            "in_channels": 1,
+            "out_channels": 1,
+            "num_res_blocks": 1,
+            "num_channels": (8, 8, 8),
+            "attention_levels": (False, False, False),
+            "norm_num_groups": 8,
+        }
+    ],
+    [
+        {
+            "spatial_dims": 3,
+            "in_channels": 1,
+            "out_channels": 1,
+            "num_res_blocks": 1,
+            "num_channels": (8, 8, 8),
+            "attention_levels": (False, False, False),
+            "norm_num_groups": 8,
+            "resblock_updown": True,
+        }
+    ],
+    [
+        {
+            "spatial_dims": 3,
+            "in_channels": 1,
+            "out_channels": 1,
+            "num_res_blocks": 1,
+            "num_channels": (8, 8, 8),
+            "attention_levels": (False, False, True),
+            "num_head_channels": 8,
+            "norm_num_groups": 8,
+        }
+    ],
+    [
+        {
+            "spatial_dims": 3,
+            "in_channels": 1,
+            "out_channels": 1,
+            "num_res_blocks": 1,
+            "num_channels": (8, 8, 8),
+            "attention_levels": (False, False, True),
+            "num_head_channels": 8,
+            "norm_num_groups": 8,
+            "resblock_updown": True,
+        }
+    ],
+    [
+        {
+            "spatial_dims": 3,
+            "in_channels": 1,
+            "out_channels": 1,
+            "num_res_blocks": 1,
+            "num_channels": (8, 8, 8),
+            "attention_levels": (False, False, True),
+            "num_head_channels": 4,
+            "norm_num_groups": 8,
+        }
+    ],
+    [
+        {
+            "spatial_dims": 3,
+            "in_channels": 1,
+            "out_channels": 1,
+            "num_res_blocks": 1,
+            "num_channels": (8, 8, 8),
+            "attention_levels": (False, False, True),
+            "num_head_channels": (0, 0, 4),
+            "norm_num_groups": 8,
+        }
+    ],
+]
+
+COND_CASES_2D = [
+    [
+        {
+            "spatial_dims": 2,
+            "in_channels": 1,
+            "out_channels": 1,
+            "num_res_blocks": 1,
"num_channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + "with_conditioning": True, + "transformer_num_layers": 1, + "cross_attention_dim": 3, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + "with_conditioning": True, + "transformer_num_layers": 1, + "cross_attention_dim": 3, + "resblock_updown": True, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + "with_conditioning": True, + "transformer_num_layers": 1, + "cross_attention_dim": 3, + "upcast_attention": True, + } + ], +] + +DROPOUT_OK = [ + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + "with_conditioning": True, + "transformer_num_layers": 1, + "cross_attention_dim": 3, + "dropout_cattn": 0.25, + } + ], + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + "with_conditioning": True, + "transformer_num_layers": 1, + "cross_attention_dim": 3, + } + ], +] + +DROPOUT_WRONG = [ + [ + { + "spatial_dims": 2, + "in_channels": 1, + "out_channels": 1, + "num_res_blocks": 1, + "num_channels": (8, 8, 8), + "attention_levels": (False, False, True), + "num_head_channels": 4, + "norm_num_groups": 8, + "with_conditioning": True, + "transformer_num_layers": 1, + "cross_attention_dim": 3, + "dropout_cattn": 3.0, + } + ] +] + + +@skipUnless(has_generative, "monai-generative required") +class TestDiffusionModelUNetMaisi2D(unittest.TestCase): + @parameterized.expand(UNCOND_CASES_2D) + @skipUnless(has_einops, "Requires einops") + def test_shape_unconditioned_models(self, input_param): + net = DiffusionModelUNetMaisi(**input_param) + with eval_mode(net): + result = net.forward(torch.rand((1, 1, 16, 16)), torch.randint(0, 1000, (1,)).long()) + self.assertEqual(result.shape, (1, 1, 16, 16)) + + @skipUnless(has_einops, "Requires einops") + def test_timestep_with_wrong_shape(self): + net = DiffusionModelUNetMaisi( + spatial_dims=2, + in_channels=1, + out_channels=1, + num_res_blocks=1, + num_channels=(8, 8, 8), + attention_levels=(False, False, False), + norm_num_groups=8, + ) + with self.assertRaises(ValueError): + with eval_mode(net): + net.forward(torch.rand((1, 1, 16, 16)), torch.randint(0, 1000, (1, 1)).long()) + + @skipUnless(has_einops, "Requires einops") + def test_shape_with_different_in_channel_out_channel(self): + in_channels = 6 + out_channels = 3 + net = DiffusionModelUNetMaisi( + spatial_dims=2, + in_channels=in_channels, + out_channels=out_channels, + num_res_blocks=1, + num_channels=(8, 8, 8), + attention_levels=(False, False, False), + norm_num_groups=8, + ) + with eval_mode(net): + result = net.forward(torch.rand((1, in_channels, 16, 16)), torch.randint(0, 1000, (1,)).long()) + self.assertEqual(result.shape, (1, out_channels, 16, 16)) + + def test_model_channels_not_multiple_of_norm_num_group(self): + with self.assertRaises(ValueError): + DiffusionModelUNetMaisi( + spatial_dims=2, + in_channels=1, + 
+                out_channels=1,
+                num_res_blocks=1,
+                num_channels=(8, 8, 12),
+                attention_levels=(False, False, False),
+                norm_num_groups=8,
+            )
+
+    def test_attention_levels_with_different_length_num_head_channels(self):
+        with self.assertRaises(ValueError):
+            DiffusionModelUNetMaisi(
+                spatial_dims=2,
+                in_channels=1,
+                out_channels=1,
+                num_res_blocks=1,
+                num_channels=(8, 8, 8),
+                attention_levels=(False, False, False),
+                num_head_channels=(0, 2),
+                norm_num_groups=8,
+            )
+
+    def test_num_res_blocks_with_different_length_channels(self):
+        with self.assertRaises(ValueError):
+            DiffusionModelUNetMaisi(
+                spatial_dims=2,
+                in_channels=1,
+                out_channels=1,
+                num_res_blocks=(1, 1),
+                num_channels=(8, 8, 8),
+                attention_levels=(False, False, False),
+                norm_num_groups=8,
+            )
+
+    @skipUnless(has_einops, "Requires einops")
+    def test_shape_conditioned_models(self):
+        net = DiffusionModelUNetMaisi(
+            spatial_dims=2,
+            in_channels=1,
+            out_channels=1,
+            num_res_blocks=1,
+            num_channels=(8, 8, 8),
+            attention_levels=(False, False, True),
+            with_conditioning=True,
+            transformer_num_layers=1,
+            cross_attention_dim=3,
+            norm_num_groups=8,
+            num_head_channels=8,
+        )
+        with eval_mode(net):
+            result = net.forward(
+                x=torch.rand((1, 1, 16, 32)),
+                timesteps=torch.randint(0, 1000, (1,)).long(),
+                context=torch.rand((1, 1, 3)),
+            )
+            self.assertEqual(result.shape, (1, 1, 16, 32))
+
+    def test_with_conditioning_cross_attention_dim_none(self):
+        with self.assertRaises(ValueError):
+            DiffusionModelUNetMaisi(
+                spatial_dims=2,
+                in_channels=1,
+                out_channels=1,
+                num_res_blocks=1,
+                num_channels=(8, 8, 8),
+                attention_levels=(False, False, True),
+                with_conditioning=True,
+                transformer_num_layers=1,
+                cross_attention_dim=None,
+                norm_num_groups=8,
+            )
+
+    @skipUnless(has_einops, "Requires einops")
+    def test_context_with_conditioning_none(self):
+        net = DiffusionModelUNetMaisi(
+            spatial_dims=2,
+            in_channels=1,
+            out_channels=1,
+            num_res_blocks=1,
+            num_channels=(8, 8, 8),
+            attention_levels=(False, False, True),
+            with_conditioning=False,
+            transformer_num_layers=1,
+            norm_num_groups=8,
+        )
+
+        with self.assertRaises(ValueError):
+            with eval_mode(net):
+                net.forward(
+                    x=torch.rand((1, 1, 16, 32)),
+                    timesteps=torch.randint(0, 1000, (1,)).long(),
+                    context=torch.rand((1, 1, 3)),
+                )
+
+    @skipUnless(has_einops, "Requires einops")
+    def test_shape_conditioned_models_class_conditioning(self):
+        net = DiffusionModelUNetMaisi(
+            spatial_dims=2,
+            in_channels=1,
+            out_channels=1,
+            num_res_blocks=1,
+            num_channels=(8, 8, 8),
+            attention_levels=(False, False, True),
+            norm_num_groups=8,
+            num_head_channels=8,
+            num_class_embeds=2,
+        )
+        with eval_mode(net):
+            result = net.forward(
+                x=torch.rand((1, 1, 16, 32)),
+                timesteps=torch.randint(0, 1000, (1,)).long(),
+                class_labels=torch.randint(0, 2, (1,)).long(),
+            )
+            self.assertEqual(result.shape, (1, 1, 16, 32))
+
+    @skipUnless(has_einops, "Requires einops")
+    def test_conditioned_models_no_class_labels(self):
+        net = DiffusionModelUNetMaisi(
+            spatial_dims=2,
+            in_channels=1,
+            out_channels=1,
+            num_res_blocks=1,
+            num_channels=(8, 8, 8),
+            attention_levels=(False, False, True),
+            norm_num_groups=8,
+            num_head_channels=8,
+            num_class_embeds=2,
+        )
+
+        with self.assertRaises(ValueError):
+            net.forward(x=torch.rand((1, 1, 16, 32)), timesteps=torch.randint(0, 1000, (1,)).long())
+
+    @skipUnless(has_einops, "Requires einops")
+    def test_model_channels_not_same_size_of_attention_levels(self):
+        with self.assertRaises(ValueError):
+            DiffusionModelUNetMaisi(
+                spatial_dims=2,
+                in_channels=1,
+                out_channels=1,
+                num_res_blocks=1,
+                num_channels=(8, 8, 8),
+                attention_levels=(False, False),
+                norm_num_groups=8,
+                num_head_channels=8,
+                num_class_embeds=2,
+            )
+
+    @parameterized.expand(COND_CASES_2D)
+    @skipUnless(has_einops, "Requires einops")
+    def test_conditioned_2d_models_shape(self, input_param):
+        net = DiffusionModelUNetMaisi(**input_param)
+        with eval_mode(net):
+            result = net.forward(torch.rand((1, 1, 16, 16)), torch.randint(0, 1000, (1,)).long(), torch.rand((1, 1, 3)))
+            self.assertEqual(result.shape, (1, 1, 16, 16))
+
+    @parameterized.expand(UNCOND_CASES_2D)
+    @skipUnless(has_einops, "Requires einops")
+    def test_shape_with_additional_inputs(self, input_param):
+        input_param["include_top_region_index_input"] = True
+        input_param["include_bottom_region_index_input"] = True
+        input_param["include_spacing_input"] = True
+        net = DiffusionModelUNetMaisi(**input_param)
+        with eval_mode(net):
+            result = net.forward(
+                x=torch.rand((1, 1, 16, 16)),
+                timesteps=torch.randint(0, 1000, (1,)).long(),
+                top_region_index_tensor=torch.rand((1, 4)),
+                bottom_region_index_tensor=torch.rand((1, 4)),
+                spacing_tensor=torch.rand((1, 3)),
+            )
+            self.assertEqual(result.shape, (1, 1, 16, 16))
+
+
+@skipUnless(has_generative, "monai-generative required")
+class TestDiffusionModelUNetMaisi3D(unittest.TestCase):
+    @parameterized.expand(UNCOND_CASES_3D)
+    @skipUnless(has_einops, "Requires einops")
+    def test_shape_unconditioned_models(self, input_param):
+        net = DiffusionModelUNetMaisi(**input_param)
+        with eval_mode(net):
+            result = net.forward(torch.rand((1, 1, 16, 16, 16)), torch.randint(0, 1000, (1,)).long())
+            self.assertEqual(result.shape, (1, 1, 16, 16, 16))
+
+    @skipUnless(has_einops, "Requires einops")
+    def test_shape_with_different_in_channel_out_channel(self):
+        in_channels = 6
+        out_channels = 3
+        net = DiffusionModelUNetMaisi(
+            spatial_dims=3,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            num_res_blocks=1,
+            num_channels=(8, 8, 8),
+            attention_levels=(False, False, True),
+            norm_num_groups=4,
+        )
+        with eval_mode(net):
+            result = net.forward(torch.rand((1, in_channels, 16, 16, 16)), torch.randint(0, 1000, (1,)).long())
+            self.assertEqual(result.shape, (1, out_channels, 16, 16, 16))
+
+    @skipUnless(has_einops, "Requires einops")
+    def test_shape_conditioned_models(self):
+        net = DiffusionModelUNetMaisi(
+            spatial_dims=3,
+            in_channels=1,
+            out_channels=1,
+            num_res_blocks=1,
+            num_channels=(16, 16, 16),
+            attention_levels=(False, False, True),
+            norm_num_groups=16,
+            with_conditioning=True,
+            transformer_num_layers=1,
+            cross_attention_dim=3,
+        )
+        with eval_mode(net):
+            result = net.forward(
+                x=torch.rand((1, 1, 16, 16, 16)),
+                timesteps=torch.randint(0, 1000, (1,)).long(),
+                context=torch.rand((1, 1, 3)),
+            )
+            self.assertEqual(result.shape, (1, 1, 16, 16, 16))
+
+    # Test dropout specification for cross-attention blocks
+    @parameterized.expand(DROPOUT_WRONG)
+    def test_wrong_dropout(self, input_param):
+        with self.assertRaises(ValueError):
+            _ = DiffusionModelUNetMaisi(**input_param)
+
+    @parameterized.expand(DROPOUT_OK)
+    @skipUnless(has_einops, "Requires einops")
+    def test_right_dropout(self, input_param):
+        _ = DiffusionModelUNetMaisi(**input_param)
+
+    @parameterized.expand(UNCOND_CASES_3D)
+    @skipUnless(has_einops, "Requires einops")
+    def test_shape_with_additional_inputs(self, input_param):
+        input_param["include_top_region_index_input"] = True
+        input_param["include_bottom_region_index_input"] = True
+        input_param["include_spacing_input"] = True
+        net = DiffusionModelUNetMaisi(**input_param)
+        with eval_mode(net):
+            result = net.forward(
+                x=torch.rand((1, 1, 16, 16, 16)),
+                timesteps=torch.randint(0, 1000, (1,)).long(),
+                top_region_index_tensor=torch.rand((1, 4)),
+                bottom_region_index_tensor=torch.rand((1, 4)),
+                spacing_tensor=torch.rand((1, 3)),
+            )
+            self.assertEqual(result.shape, (1, 1, 16, 16, 16))
+
+
+if __name__ == "__main__":
+    unittest.main()
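
For a quick end-to-end check of the cross-attention conditioning path, a minimal sketch (it mirrors `test_shape_conditioned_models` above; the parameter values are illustrative only, and `monai-generative` plus `einops` are assumed to be installed):

    import torch

    from monai.apps.generation.maisi.networks.diffusion_model_unet_maisi import DiffusionModelUNetMaisi

    # with_conditioning=True requires cross_attention_dim; the context tensor
    # has shape (N, 1, cross_attention_dim).
    net = DiffusionModelUNetMaisi(
        spatial_dims=2,
        in_channels=1,
        out_channels=1,
        num_res_blocks=1,
        num_channels=(8, 8, 8),
        attention_levels=(False, False, True),
        num_head_channels=8,
        norm_num_groups=8,
        with_conditioning=True,
        transformer_num_layers=1,
        cross_attention_dim=3,
    )
    out = net(
        x=torch.rand(1, 1, 16, 32),
        timesteps=torch.randint(0, 1000, (1,)).long(),
        context=torch.rand(1, 1, 3),
    )
    assert out.shape == (1, 1, 16, 32)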