From 90c1f7a05279bf04227c8bf19270ef004ce6c417 Mon Sep 17 00:00:00 2001
From: yiyixuxu <yixu310@gmail.com>
Date: Tue, 9 Dec 2025 03:28:46 +0100
Subject: [PATCH 1/7] initial

---
 .../modular_pipelines/z_image/__init__.py     |  57 ++
 .../z_image/before_denoise.py                 | 616 ++++++++++++++++++
 .../modular_pipelines/z_image/decoders.py     |  89 +++
 .../modular_pipelines/z_image/denoise.py      | 306 +++++++++
 .../modular_pipelines/z_image/encoders.py     | 347 ++++++++++
 .../z_image/modular_blocks.py                 | 192 ++++++
 .../z_image/modular_pipeline.py               |  74 +++
 7 files changed, 1681 insertions(+)
 create mode 100644 src/diffusers/modular_pipelines/z_image/__init__.py
 create mode 100644 src/diffusers/modular_pipelines/z_image/before_denoise.py
 create mode 100644 src/diffusers/modular_pipelines/z_image/decoders.py
 create mode 100644 src/diffusers/modular_pipelines/z_image/denoise.py
 create mode 100644 src/diffusers/modular_pipelines/z_image/encoders.py
 create mode 100644 src/diffusers/modular_pipelines/z_image/modular_blocks.py
 create mode 100644 src/diffusers/modular_pipelines/z_image/modular_pipeline.py

diff --git a/src/diffusers/modular_pipelines/z_image/__init__.py b/src/diffusers/modular_pipelines/z_image/__init__.py
new file mode 100644
index 000000000000..c8a8c14396c0
--- /dev/null
+++ b/src/diffusers/modular_pipelines/z_image/__init__.py
@@ -0,0 +1,57 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}  # filled with dummy placeholders when torch/transformers are unavailable
+_import_structure = {}  # submodule name -> exported names, handed to `_LazyModule` below
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["decoders"] = ["ZImageVaeDecoderStep"]
+    _import_structure["encoders"] = ["ZImageTextEncoderStep", "ZImageVaeImageEncoderStep"]
+    _import_structure["modular_blocks"] = [
+        "ALL_BLOCKS",
+        "ZImageAutoBlocks",
+    ]
+    _import_structure["modular_pipeline"] = ["ZImageModularPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .decoders import ZImageVaeDecoderStep
+        from .encoders import ZImageTextEncoderStep, ZImageVaeImageEncoderStep  # same names as _import_structure
+        from .modular_blocks import (
+            ALL_BLOCKS,
+            ZImageAutoBlocks,
+        )
+        from .modular_pipeline import ZImageModularPipeline
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/modular_pipelines/z_image/before_denoise.py b/src/diffusers/modular_pipelines/z_image/before_denoise.py
new file mode 100644
index 000000000000..eac00482b7ad
--- /dev/null
+++ b/src/diffusers/modular_pipelines/z_image/before_denoise.py
@@ -0,0 +1,616 @@
+# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import List, Optional, Tuple, Union
+
+import torch
+
+from ...models import ZImageTransformer2DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import logging
+from ...utils.torch_utils import randn_tensor
+from ..modular_pipeline import ModularPipelineBlocks, PipelineState
+from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
+from .modular_pipeline import ZImageModularPipeline
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# TODO(yiyi, aryan): We need another step before text encoder to set the `num_inference_steps` attribute for guider so that
+# things like when to do guidance and how many conditions to be prepared can be determined. Currently, this is done by
+# always assuming you want to do guidance in the Guiders. So, negative embeddings are prepared regardless of what the
+# configuration of guider is.
+
+
+def repeat_tensor_to_batch_size(
+    input_name: str,
+    input_tensor: torch.Tensor,
+    batch_size: int,
+    num_images_per_prompt: int = 1,
+) -> torch.Tensor:
+    """Repeat tensor elements to match the final batch size.
+
+    This function expands a tensor's batch dimension to match the final batch size (batch_size * num_images_per_prompt)
+    by repeating each element along dimension 0.
+
+    The input tensor must have batch size 1 or batch_size. The function will:
+    - If batch size is 1: repeat each element (batch_size * num_images_per_prompt) times
+    - If batch size equals batch_size: repeat each element num_images_per_prompt times
+
+    Args:
+        input_name (str): Name of the input tensor (used for error messages)
+        input_tensor (torch.Tensor): The tensor to repeat. Must have batch size 1 or batch_size.
+        batch_size (int): The base batch size (number of prompts)
+        num_images_per_prompt (int, optional): Number of images to generate per prompt. Defaults to 1.
+
+    Returns:
+        torch.Tensor: The repeated tensor with final batch size (batch_size * num_images_per_prompt)
+
+    Raises:
+        ValueError: If input_tensor is not a torch.Tensor or has invalid batch size
+
+    Examples:
+        >>> tensor = torch.tensor([[1, 2, 3]])  # shape: [1, 3]
+        >>> repeat_tensor_to_batch_size("image", tensor, batch_size=2, num_images_per_prompt=2).tolist()
+        [[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]]
+
+        >>> tensor = torch.tensor([[1, 2, 3], [4, 5, 6]])  # shape: [2, 3]
+        >>> repeat_tensor_to_batch_size("image", tensor, batch_size=2, num_images_per_prompt=2).tolist()
+        [[1, 2, 3], [1, 2, 3], [4, 5, 6], [4, 5, 6]]
+    """
+    # make sure input is a tensor
+    if not isinstance(input_tensor, torch.Tensor):
+        raise ValueError(f"`{input_name}` must be a tensor")
+
+    # make sure input tensor e.g. image_latents has batch size 1 or batch_size same as prompts
+    if input_tensor.shape[0] == 1:
+        repeat_by = batch_size * num_images_per_prompt
+    elif input_tensor.shape[0] == batch_size:
+        repeat_by = num_images_per_prompt
+    else:
+        raise ValueError(
+            f"`{input_name}` must have batch size 1 or {batch_size}, but got {input_tensor.shape[0]}"
+        )
+
+    # each row is repeated consecutively, so copies of the same prompt's tensor stay adjacent
+    input_tensor = input_tensor.repeat_interleave(repeat_by, dim=0)
+
+    return input_tensor
+
+
+def calculate_dimension_from_latents(
+    latents: torch.Tensor, vae_scale_factor_spatial: int
+) -> Tuple[int, int]:
+    """Calculate image dimensions from latent tensor dimensions.
+
+    This function converts latent spatial dimensions to image spatial dimensions by
+    multiplying the latent height/width by `vae_scale_factor_spatial`.
+
+    Args:
+        latents (torch.Tensor): The latent tensor. Must have 4 dimensions.
+            Expected shape: [batch, channels, height, width]
+        vae_scale_factor_spatial (int): The factor each latent spatial dimension is multiplied by.
+
+    Returns:
+        Tuple[int, int]: The calculated image dimensions as (height, width)
+    """
+    latent_height, latent_width = latents.shape[2:]
+    height = latent_height * vae_scale_factor_spatial
+    width = latent_width * vae_scale_factor_spatial
+
+    return height, width
+
+# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
+def calculate_shift(
+    image_seq_len,
+    base_seq_len: int = 256,
+    max_seq_len: int = 4096,
+    base_shift: float = 0.5,
+    max_shift: float = 1.15,
+):
+    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+    b = base_shift - m * base_seq_len
+    mu = image_seq_len * m + b
+    return mu
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    r"""
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class ZImageTextInputStep(ModularPipelineBlocks):
+    """Derives `batch_size`/`dtype` from `prompt_embeds` and repeats the embeddings per requested image."""
+
+    model_name = "z_image"
+
+    @property
+    def description(self) -> str:
+        return (
+            "Input processing step that:\n"
+            "  1. Determines `batch_size` and `dtype` based on `prompt_embeds`\n"
+            "  2. Adjusts input tensor shapes based on `batch_size` (number of prompts) and `num_images_per_prompt`\n\n"
+            "All input tensors are expected to have either batch_size=1 or match the batch_size\n"
+            "of prompt_embeds. The tensors will be duplicated across the batch dimension to\n"
+            "have a final batch_size of batch_size * num_images_per_prompt."
+        )
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [ComponentSpec("transformer", ZImageTransformer2DModel)]
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam("num_images_per_prompt", default=1),
+            InputParam(
+                "prompt_embeds",
+                required=True,
+                type_hint=List[torch.Tensor],
+                description="Pre-generated text embeddings. Can be generated from text_encoder step.",
+            ),
+            InputParam(
+                "negative_prompt_embeds",
+                type_hint=List[torch.Tensor],
+                description="Pre-generated negative text embeddings. Can be generated from text_encoder step.",
+            ),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(
+                "batch_size",
+                type_hint=int,
+                description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt",
+            ),
+            OutputParam(
+                "dtype",
+                type_hint=torch.dtype,
+                description="Data type of model tensor inputs (determined by `prompt_embeds`)",
+            ),
+        ]
+
+    def check_inputs(self, components, block_state):
+        if block_state.prompt_embeds is not None and block_state.negative_prompt_embeds is not None:
+            if not isinstance(block_state.prompt_embeds, list):
+                raise ValueError(
+                    f"`prompt_embeds` must be a list when passed directly, but got {type(block_state.prompt_embeds)}."
+                )
+            if not isinstance(block_state.negative_prompt_embeds, list):
+                raise ValueError(
+                    f"`negative_prompt_embeds` must be a list when passed directly, but got {type(block_state.negative_prompt_embeds)}."
+                )
+            if len(block_state.prompt_embeds) != len(block_state.negative_prompt_embeds):
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same length when passed directly, but"
+                    f" got: `prompt_embeds` {len(block_state.prompt_embeds)} != `negative_prompt_embeds`"
+                    f" {len(block_state.negative_prompt_embeds)}."
+                )
+
+    @torch.no_grad()
+    def __call__(self, components: ZImageModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+        self.check_inputs(components, block_state)
+
+        block_state.batch_size = len(block_state.prompt_embeds)
+        block_state.dtype = block_state.prompt_embeds[0].dtype
+
+        # Repeat every per-prompt embedding consecutively, once per requested image.
+        repeats = block_state.num_images_per_prompt
+        if repeats > 1:
+            block_state.prompt_embeds = [pe for pe in block_state.prompt_embeds for _ in range(repeats)]
+            if block_state.negative_prompt_embeds is not None:
+                negative_prompt_embeds = block_state.negative_prompt_embeds
+                block_state.negative_prompt_embeds = [npe for npe in negative_prompt_embeds for _ in range(repeats)]
+
+        self.set_block_state(state, block_state)
+
+        return components, state
+
+
+class ZImageAdditionalInputsStep(ModularPipelineBlocks):
+    model_name = "z_image"
+
+    def __init__(
+        self,
+        image_latent_inputs: List[str] = ["image_latents"],
+        additional_batch_inputs: List[str] = [],
+    ):
+        """Initialize a configurable step that standardizes the inputs for the denoising step.
+
+        This step handles multiple common tasks to prepare inputs for the denoising step:
+        1. For encoded image latents: uses them to update height/width if None, and expands batch size
+        2. For additional_batch_inputs: Only expands batch dimensions to match final batch size
+
+        This is a dynamic block that allows you to configure which inputs to process.
+
+        Args:
+            image_latent_inputs (List[str], optional): Names of image latent tensors to process.
+                In addition to adjusting the batch size of these inputs, they will be used to determine height/width.
+                Can be a single string or list of strings. Defaults to ["image_latents"].
+            additional_batch_inputs (List[str], optional):
+                Names of additional conditional input tensors to expand batch size. These tensors will only have their
+                batch dimensions adjusted to match the final batch size. Can be a single string or list of strings.
+                Defaults to [].
+
+        Examples:
+            # Configure to process image_latents (default behavior)
+            ZImageAdditionalInputsStep()
+
+            # Configure to process multiple image latent inputs
+            ZImageAdditionalInputsStep(image_latent_inputs=["image_latents", "control_image_latents"])
+
+            # Configure to process image latents and additional batch inputs
+            ZImageAdditionalInputsStep(image_latent_inputs=["image_latents"], additional_batch_inputs=["image_embeds"])
+        """
+        if not isinstance(image_latent_inputs, list):
+            image_latent_inputs = [image_latent_inputs]
+        if not isinstance(additional_batch_inputs, list):
+            additional_batch_inputs = [additional_batch_inputs]
+
+        self._image_latent_inputs = list(image_latent_inputs)  # copy: the signature defaults are shared lists
+        self._additional_batch_inputs = list(additional_batch_inputs)
+        super().__init__()
+
+    @property
+    def description(self) -> str:
+        # Functionality section
+        summary_section = (
+            "Input processing step that:\n"
+            "  1. For image latent inputs: Updates height/width if None, and expands batch size\n"
+            "  2. For additional batch inputs: Expands batch dimensions to match final batch size"
+        )
+
+        # Inputs info
+        inputs_info = ""
+        if self._image_latent_inputs or self._additional_batch_inputs:
+            inputs_info = "\n\nConfigured inputs:"
+            if self._image_latent_inputs:
+                inputs_info += f"\n  - Image latent inputs: {self._image_latent_inputs}"
+            if self._additional_batch_inputs:
+                inputs_info += f"\n  - Additional batch inputs: {self._additional_batch_inputs}"
+
+        # Placement guidance
+        placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."
+
+        return summary_section + inputs_info + placement_section
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        inputs = [
+            InputParam(name="num_images_per_prompt", default=1),
+            InputParam(name="batch_size", required=True),
+            InputParam(name="height"),
+            InputParam(name="width"),
+        ]
+
+        # Add image latent inputs
+        for image_latent_input_name in self._image_latent_inputs:
+            inputs.append(InputParam(name=image_latent_input_name))
+
+        # Add additional batch inputs
+        for input_name in self._additional_batch_inputs:
+            inputs.append(InputParam(name=input_name))
+
+        return inputs
+
+    def __call__(self, components: ZImageModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+
+        # Image latent inputs: derive height/width from the latent shape when the caller left them unset.
+        for image_latent_input_name in self._image_latent_inputs:
+            image_latent_tensor = getattr(block_state, image_latent_input_name)
+            if image_latent_tensor is None:
+                continue
+
+            # NOTE(review): confirm `vae_scale_factor_spatial` matches the factor `prepare_latents` divides by.
+            height, width = calculate_dimension_from_latents(
+                image_latent_tensor, components.vae_scale_factor_spatial
+            )
+            block_state.height = block_state.height or height
+            block_state.width = block_state.width or width
+
+        # Expand the batch dimension of every configured tensor input. Image latents are included so that
+        # they reach the final batch size as `description` states; `dict.fromkeys` drops duplicate names.
+        for input_name in dict.fromkeys(self._image_latent_inputs + self._additional_batch_inputs):
+            input_tensor = getattr(block_state, input_name)
+            if input_tensor is None:
+                continue
+
+            # Repeat to batch_size * num_images_per_prompt
+            input_tensor = repeat_tensor_to_batch_size(
+                input_name=input_name,
+                input_tensor=input_tensor,
+                num_images_per_prompt=block_state.num_images_per_prompt,
+                batch_size=block_state.batch_size,
+            )
+
+            setattr(block_state, input_name, input_tensor)
+
+        self.set_block_state(state, block_state)
+        return components, state
+
+
+class ZImagePrepareLatentsStep(ModularPipelineBlocks):
+    model_name = "z_image"
+
+    @property
+    def description(self) -> str:
+        return "Prepare latents step that prepares the latents for the text-to-image generation process"
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam("height", type_hint=int),
+            InputParam("width", type_hint=int),
+            InputParam("latents", type_hint=Optional[torch.Tensor]),
+            InputParam("num_images_per_prompt", type_hint=int, default=1),
+            InputParam("generator"),
+            InputParam(
+                "batch_size",
+                required=True,
+                type_hint=int,
+                description="Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`. Can be generated in input step.",
+            ),
+            InputParam("dtype", type_hint=torch.dtype, description="The dtype of the model inputs"),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(
+                "latents", type_hint=torch.Tensor, description="The initial latents to use for the denoising process"
+            )
+        ]
+
+    def check_inputs(self, components, block_state):
+        if (block_state.height is not None and block_state.height % components.vae_scale_factor_spatial != 0) or (
+            block_state.width is not None and block_state.width % components.vae_scale_factor_spatial != 0
+        ):
+            raise ValueError(
+                f"`height` and `width` have to be divisible by {components.vae_scale_factor_spatial} but are {block_state.height} and {block_state.width}."
+            )
+
+    @staticmethod
+    # Copied from diffusers.pipelines.z_image.pipeline_z_image.ZImagePipeline.prepare_latents with self->comp
+    def prepare_latents(
+        components,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        height = 2 * (int(height) // (components.vae_scale_factor * 2))
+        width = 2 * (int(width) // (components.vae_scale_factor * 2))
+
+        shape = (batch_size, num_channels_latents, height, width)
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+        return latents
+
+    @torch.no_grad()
+    def __call__(self, components: ZImageModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+        self.check_inputs(components, block_state)
+
+        device = components._execution_device
+        dtype = torch.float32  # Z-Image latents are created in float32; the `dtype` input is not consulted here
+
+        block_state.height = block_state.height or components.default_height
+        block_state.width = block_state.width or components.default_width
+
+        block_state.latents = self.prepare_latents(
+            components,
+            batch_size=block_state.batch_size * block_state.num_images_per_prompt,
+            num_channels_latents=components.num_channels_latents,
+            height=block_state.height,
+            width=block_state.width,
+            dtype=dtype,
+            device=device,
+            generator=block_state.generator,
+            latents=block_state.latents,
+        )
+
+        self.set_block_state(state, block_state)
+
+        return components, state
+
+
+class ZImageSetTimestepsStep(ModularPipelineBlocks):
+    model_name = "z_image"
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)]
+
+    @property
+    def description(self) -> str:
+        return "Step that sets the scheduler's timesteps for inference. Need to run after prepare latents step."
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam("latents", required=True),
+            InputParam("num_inference_steps", default=50),
+            InputParam("sigmas"),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        # `timesteps` is produced here and is a required input of the steps that run after this one.
+        return [OutputParam("timesteps", type_hint=torch.Tensor, description="Timesteps for the denoising loop")]
+
+    @torch.no_grad()
+    def __call__(self, components: ZImageModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+        device = components._execution_device
+
+        # Sequence length after patchify (each latent spatial dim is halved); it selects the shift `mu`.
+        latent_height, latent_width = block_state.latents.shape[2], block_state.latents.shape[3]
+        image_seq_len = (latent_height // 2) * (latent_width // 2)
+
+        mu = calculate_shift(
+            image_seq_len,
+            base_seq_len=components.scheduler.config.get("base_image_seq_len", 256),
+            max_seq_len=components.scheduler.config.get("max_image_seq_len", 4096),
+            base_shift=components.scheduler.config.get("base_shift", 0.5),
+            max_shift=components.scheduler.config.get("max_shift", 1.15),
+        )
+        components.scheduler.sigma_min = 0.0
+
+        block_state.timesteps, block_state.num_inference_steps = retrieve_timesteps(
+            components.scheduler, block_state.num_inference_steps, device, sigmas=block_state.sigmas, mu=mu
+        )
+
+        self.set_block_state(state, block_state)
+        return components, state
+
+
+class ZImageSetTimestepsWithStrengthStep(ModularPipelineBlocks):
+    """Keeps only the tail of the scheduler's timestep schedule, as selected by `strength`."""
+
+    model_name = "z_image"
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)]
+
+    @property
+    def description(self) -> str:
+        return "Step that sets the scheduler's timesteps for inference with strength. Need to run after set timesteps step."
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam("timesteps", required=True),
+            InputParam("num_inference_steps", required=True),
+            InputParam("strength", default=0.6),
+        ]
+
+    def check_inputs(self, components, block_state):
+        if block_state.strength < 0.0 or block_state.strength > 1.0:
+            raise ValueError(f"Strength must be between 0.0 and 1.0, but got {block_state.strength}")
+
+    @torch.no_grad()
+    def __call__(self, components: ZImageModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+        self.check_inputs(components, block_state)
+
+        # `strength` picks how many of the scheduled steps are kept: 1.0 keeps all of them, 0.0 keeps none.
+        init_timestep = min(block_state.num_inference_steps * block_state.strength, block_state.num_inference_steps)
+
+        t_start = int(max(block_state.num_inference_steps - init_timestep, 0))
+
+        # Drop the first `t_start` steps and, when supported, tell the scheduler where the schedule now begins.
+        timesteps = components.scheduler.timesteps[t_start * components.scheduler.order :]
+        if hasattr(components.scheduler, "set_begin_index"):
+            components.scheduler.set_begin_index(t_start * components.scheduler.order)
+
+        block_state.timesteps = timesteps
+        block_state.num_inference_steps = block_state.num_inference_steps - t_start
+
+        self.set_block_state(state, block_state)
+        return components, state
+
+
+class ZImagePrepareLatentswithImageStep(ModularPipelineBlocks):
+    """Replaces `latents` with `scheduler.scale_noise(image_latents, first timestep, latents)`."""
+
+    model_name = "z_image"
+
+    @property
+    def description(self) -> str:
+        return "step that prepares the latents with image condition, need to run after set timesteps and prepare latents step."
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [InputParam(name, required=True) for name in ("latents", "image_latents", "timesteps")]
+
+    def __call__(self, components: ZImageModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+
+        # The first entry of `timesteps` is used for every sample in the latent batch.
+        first_timestep = block_state.timesteps[:1].repeat(block_state.latents.shape[0])
+        noised = components.scheduler.scale_noise(block_state.image_latents, first_timestep, block_state.latents)
+        block_state.latents = noised
+
+        self.set_block_state(state, block_state)
+        return components, state
\ No newline at end of file
diff --git a/src/diffusers/modular_pipelines/z_image/decoders.py b/src/diffusers/modular_pipelines/z_image/decoders.py
new file mode 100644
index 000000000000..70ad1c645e66
--- /dev/null
+++ b/src/diffusers/modular_pipelines/z_image/decoders.py
@@ -0,0 +1,89 @@
+# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, List, Tuple, Union
+
+import numpy as np
+import PIL
+import torch
+
+from ...configuration_utils import FrozenDict
+from ...models import AutoencoderKL
+from ...utils import logging
+from ...image_processor import VaeImageProcessor
+from ..modular_pipeline import ModularPipelineBlocks, PipelineState
+from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class ZImageVaeDecoderStep(ModularPipelineBlocks):
+    """Decode denoised latents with the VAE and post-process them into `output_type` images."""
+
+    model_name = "z_image"
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [
+            ComponentSpec("vae", AutoencoderKL),
+            ComponentSpec(
+                "image_processor",
+                VaeImageProcessor,
+                config=FrozenDict({"vae_scale_factor": 8 * 2}),
+                default_creation_method="from_config",
+            ),
+        ]
+
+    @property
+    def description(self) -> str:
+        return "Step that decodes the denoised latents into images"
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam("latents", required=True),
+            InputParam(
+                name="output_type",
+                default="pil",
+                type_hint=str,
+                description="The type of the output images, can be 'pil', 'np', 'pt'",
+            ),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(
+                "images",
+                type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray]],
+                description="The generated images, can be a PIL.Image.Image, torch.Tensor or a numpy array",
+            )
+        ]
+
+    @torch.no_grad()
+    def __call__(self, components, state: PipelineState) -> Tuple[Any, PipelineState]:
+        """Read `latents`/`output_type` from `state`, write `images`, and return `(components, state)`."""
+        block_state = self.get_block_state(state)
+
+        # Cast to the VAE dtype, then undo the `(x - shift_factor) * scaling_factor` latent normalization.
+        latents = block_state.latents.to(components.vae.dtype)
+        latents = latents / components.vae.config.scaling_factor + components.vae.config.shift_factor
+
+        decoded = components.vae.decode(latents, return_dict=False)[0]
+        block_state.images = components.image_processor.postprocess(decoded, output_type=block_state.output_type)
+
+        self.set_block_state(state, block_state)
+
+        return components, state
diff --git a/src/diffusers/modular_pipelines/z_image/denoise.py b/src/diffusers/modular_pipelines/z_image/denoise.py
new file mode 100644
index 000000000000..657c8c386288
--- /dev/null
+++ b/src/diffusers/modular_pipelines/z_image/denoise.py
@@ -0,0 +1,306 @@
+# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List, Tuple
+
+import torch
+
+from ...configuration_utils import FrozenDict
+from ...guiders import ClassifierFreeGuidance
+from ...models import ZImageTransformer2DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import logging
+from ..modular_pipeline import (
+    BlockState,
+    LoopSequentialPipelineBlocks,
+    ModularPipelineBlocks,
+    PipelineState,
+)
+from ..modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam
+from .modular_pipeline import ZImageModularPipeline
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class ZImageLoopBeforeDenoiser(ModularPipelineBlocks):
+    model_name = "z_image"
+
+    @property
+    def description(self) -> str:
+        return (
+            "step within the denoising loop that prepares the latent input for the denoiser. "
+            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
+            "object (e.g. `ZImageDenoiseLoopWrapper`)"
+        )
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam(
+                "latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.",
+            ),
+            InputParam(
+                "dtype",
+                required=True,
+                type_hint=torch.dtype,
+                description="The dtype of the model inputs. Can be generated in input step.",
+            ),
+        ]
+
+    @torch.no_grad()
+    def __call__(self, components: ZImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
+        """Set `block_state.latent_model_input` and `block_state.timestep` for loop step `i` at timestep `t`."""
+        latents = block_state.latents.unsqueeze(2).to(block_state.dtype)  # [batch_size, num_channels, 1, height, width]
+        block_state.latent_model_input = list(latents.unbind(dim=0))  # list of [num_channels, 1, height, width]
+
+        # Map the scheduler timestep `t` to `(1000 - t) / 1000`, one entry per sample in the batch.
+        timestep = t.expand(latents.shape[0]).to(block_state.dtype)
+        block_state.timestep = (1000 - timestep) / 1000
+        return components, block_state
+
+
+class ZImageLoopDenoiser(ModularPipelineBlocks):
+    model_name = "z_image"
+
+    def __init__(
+        self,
+        guider_input_fields: Dict[str, Any] = {"encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds")},
+    ):
+        """Initialize a denoiser block that calls the denoiser model. This block is used in Z-Image.
+
+        Args:
+            guider_input_fields: A dictionary that maps each argument expected by the denoiser model
+                (for example, "encoder_hidden_states") to data stored on 'block_state'. The value can be either:
+
+                - A tuple of strings. For instance, {"encoder_hidden_states": ("prompt_embeds",
+                  "negative_prompt_embeds")} tells the guider to read `block_state.prompt_embeds` and
+                  `block_state.negative_prompt_embeds` and pass them as the conditional and unconditional batches of
+                  'encoder_hidden_states'.
+                - A string. For example, {"encoder_hidden_image": "image_embeds"} makes the guider forward
+                  `block_state.image_embeds` for both conditional and unconditional batches.
+        """
+        if not isinstance(guider_input_fields, dict):
+            raise ValueError(f"guider_input_fields must be a dictionary but is {type(guider_input_fields)}")
+        self._guider_input_fields = dict(guider_input_fields)  # copy: never alias the shared default dict
+        super().__init__()
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [
+            ComponentSpec(
+                "guider",
+                ClassifierFreeGuidance,
+                config=FrozenDict({"guidance_scale": 5.0}),
+                default_creation_method="from_config",
+            ),
+            ComponentSpec("transformer", ZImageTransformer2DModel),
+        ]
+
+    @property
+    def description(self) -> str:
+        return (
+            "Step within the denoising loop that denoise the latents with guidance. "
+            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
+            "object (e.g. `ZImageDenoiseLoopWrapper`)"
+        )
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        inputs = [
+            InputParam("attention_kwargs"),
+            InputParam(
+                "num_inference_steps",
+                required=True,
+                type_hint=int,
+                description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
+            ),
+        ]
+        guider_input_names = []
+        for value in self._guider_input_fields.values():
+            if isinstance(value, tuple):
+                guider_input_names.extend(value)
+            else:
+                guider_input_names.append(value)
+
+        for name in guider_input_names:
+            inputs.append(InputParam(name=name, required=True, type_hint=torch.Tensor))
+        return inputs
+
+    @torch.no_grad()
+    def __call__(
+        self, components: ZImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor
+    ) -> Tuple[Any, BlockState]:
+        components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t)
+
+        # The guider splits model inputs into separate batches for conditional/unconditional predictions.
+        # For CFG with guider_inputs = {"encoder_hidden_states": (prompt_embeds, negative_prompt_embeds)}:
+        # you will get a guider_state with two batches:
+        #   guider_state = [
+        #       {"encoder_hidden_states": prompt_embeds, "__guidance_identifier__": "pred_cond"},      # conditional batch
+        #       {"encoder_hidden_states": negative_prompt_embeds, "__guidance_identifier__": "pred_uncond"},  # unconditional batch
+        #   ]
+        # Other guidance methods may return 1 batch (no guidance) or 3+ batches (e.g., PAG, APG).
+        guider_state = components.guider.prepare_inputs_from_block_state(block_state, self._guider_input_fields)
+
+        # Defined once per call rather than once per guidance batch: cast tensors, recursing into lists.
+        def _convert_dtype(v, dtype):
+            if isinstance(v, torch.Tensor):
+                return v.to(dtype)
+            elif isinstance(v, list):
+                return [_convert_dtype(item, dtype) for item in v]
+            return v
+
+        # run the denoiser for each guidance batch
+        for guider_state_batch in guider_state:
+            components.guider.prepare_models(components.transformer)
+            cond_kwargs = {
+                k: _convert_dtype(v, block_state.dtype)
+                for k, v in guider_state_batch.as_dict().items()
+                if k in self._guider_input_fields
+            }
+
+            # Predict the noise residual
+            # store the noise_pred in guider_state_batch so that we can apply guidance across all batches
+            # NOTE(review): confirm `ZImageTransformer2DModel.forward` accepts these keyword names.
+            model_out_list = components.transformer(
+                hidden_states=block_state.latent_model_input,
+                timestep=block_state.timestep,
+                attention_kwargs=block_state.attention_kwargs,
+                return_dict=False,
+                **cond_kwargs,
+            )[0]
+            noise_pred = torch.stack(model_out_list, dim=0).squeeze(2)
+            guider_state_batch.noise_pred = -noise_pred
+            components.guider.cleanup_models(components.transformer)
+
+        # Perform guidance
+        block_state.noise_pred = components.guider(guider_state)[0]
+        return components, block_state
+
+
+class ZImageLoopAfterDenoiser(ModularPipelineBlocks):
+    """Loop sub-block that advances `latents` by one scheduler step using `noise_pred`."""
+
+    model_name = "z_image"
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)]
+
+    @property
+    def description(self) -> str:
+        return (
+            "step within the denoising loop that update the latents. "
+            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
+            "object (e.g. `ZImageDenoiseLoopWrapper`)"
+        )
+
+    @torch.no_grad()
+    def __call__(self, components: ZImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
+        original_dtype = block_state.latents.dtype
+
+        # The scheduler step runs on float32 copies of the prediction and the current latents.
+        step_output = components.scheduler.step(
+            block_state.noise_pred.float(), t, block_state.latents.float(), return_dict=False
+        )
+        stepped_latents = step_output[0]
+
+        # Restore the dtype the latents had on entry if the scheduler returned a different one.
+        if stepped_latents.dtype != original_dtype:
+            stepped_latents = stepped_latents.to(original_dtype)
+        block_state.latents = stepped_latents
+        return components, block_state
+
+
+class ZImageDenoiseLoopWrapper(LoopSequentialPipelineBlocks):
+    """Loop driver that runs the composed sub-blocks once per entry of `timesteps` and updates a progress bar."""
+
+    model_name = "z_image"
+
+    @property
+    def description(self) -> str:
+        return (
+            "Pipeline block that iteratively denoise the latents over `timesteps`. "
+            "The specific steps with each iteration can be customized with `sub_blocks` attributes"
+        )
+
+    @property
+    def loop_expected_components(self) -> List[ComponentSpec]:
+        return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)]
+
+    @property
+    def loop_inputs(self) -> List[InputParam]:
+        timesteps_param = InputParam(
+            "timesteps",
+            required=True,
+            type_hint=torch.Tensor,
+            description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
+        )
+        num_steps_param = InputParam(
+            "num_inference_steps",
+            required=True,
+            type_hint=int,
+            description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
+        )
+        return [timesteps_param, num_steps_param]
+
+    @torch.no_grad()
+    def __call__(self, components: ZImageModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+
+        # Timesteps in excess of `num_inference_steps * scheduler.order` count as warm-up steps.
+        surplus = len(block_state.timesteps) - block_state.num_inference_steps * components.scheduler.order
+        block_state.num_warmup_steps = max(surplus, 0)
+
+        with self.progress_bar(total=block_state.num_inference_steps) as progress_bar:
+            for i, t in enumerate(block_state.timesteps):
+                components, block_state = self.loop_step(components, block_state, i=i, t=t)
+                completed = i + 1
+                is_last = i == len(block_state.timesteps) - 1
+                past_warmup = completed > block_state.num_warmup_steps
+                # Tick on the final iteration, or after warm-up whenever a full scheduler order has completed.
+                if is_last or (past_warmup and completed % components.scheduler.order == 0):
+                    progress_bar.update()
+
+        self.set_block_state(state, block_state)
+        return components, state
+
+
+class ZImageDenoiseStep(ZImageDenoiseLoopWrapper):
+    """Z-Image denoising loop: before-denoiser, guided denoiser, then scheduler-update sub-blocks."""
+
+    block_classes = [
+        ZImageLoopBeforeDenoiser,
+        ZImageLoopDenoiser(
+            guider_input_fields={"encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds")}
+        ),
+        ZImageLoopAfterDenoiser,
+    ]
+    block_names = ["before_denoiser", "denoiser", "after_denoiser"]
+
+    @property
+    def description(self) -> str:
+        return (
+            "Denoise step that iteratively denoise the latents. \n"
+            "Its loop logic is defined in `ZImageDenoiseLoopWrapper.__call__` method \n"
+            "At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n"
+            " - `ZImageLoopBeforeDenoiser`\n"
+            " - `ZImageLoopDenoiser`\n"
+            " - `ZImageLoopAfterDenoiser`\n"
+            "This block supports text-to-image and image-to-image tasks for Z-Image."
+        )
\ No newline at end of file
diff --git a/src/diffusers/modular_pipelines/z_image/encoders.py b/src/diffusers/modular_pipelines/z_image/encoders.py
new file mode 100644
index 000000000000..581473923cf2
--- /dev/null
+++ b/src/diffusers/modular_pipelines/z_image/encoders.py
@@ -0,0 +1,347 @@
+# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import html
+from typing import List, Optional, Union
+
+import numpy as np
+import PIL
+import regex as re
+import torch
+from transformers import  Qwen3Model, Qwen2Tokenizer
+
+from ...configuration_utils import FrozenDict
+from ...guiders import ClassifierFreeGuidance
+from ...models import AutoencoderKL
+from ...utils import is_ftfy_available, logging
+from ...image_processor import VaeImageProcessor
+from ..modular_pipeline import ModularPipelineBlocks, PipelineState
+from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
+from .modular_pipeline import WanModularPipeline
+
+
+if is_ftfy_available():
+    import ftfy
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def get_qwen_prompt_embeds(
+    text_encoder: Qwen3Model,
+    tokenizer: Qwen2Tokenizer,
+    prompt: Union[str, List[str]],
+    device: torch.device,
+    max_sequence_length: int = 512,
+) -> List[torch.Tensor]:
+    """Encode prompts with the chat template and return one variable-length embedding per prompt.
+
+    Args:
+        text_encoder: Text encoder; its second-to-last hidden state is used as the embedding.
+        tokenizer: Tokenizer providing `apply_chat_template` and padded tokenization.
+        prompt: A prompt or list of prompts. The caller's list is not modified.
+        device: Device the token ids and attention mask are moved to.
+        max_sequence_length: Padding/truncation length in tokens.
+
+    Returns:
+        A list with one tensor per prompt holding only the non-padding token embeddings.
+    """
+    prompts = [prompt] if isinstance(prompt, str) else prompt
+
+    # Build a new list instead of assigning into `prompts`, which may be the caller's own list.
+    templated_prompts = [
+        tokenizer.apply_chat_template(
+            [{"role": "user", "content": prompt_item}],
+            tokenize=False,
+            add_generation_prompt=True,
+            enable_thinking=True,
+        )
+        for prompt_item in prompts
+    ]
+
+    text_inputs = tokenizer(
+        templated_prompts, padding="max_length", max_length=max_sequence_length, truncation=True, return_tensors="pt"
+    )
+
+    text_input_ids = text_inputs.input_ids.to(device)
+    prompt_masks = text_inputs.attention_mask.to(device).bool()
+
+    hidden_states = text_encoder(
+        input_ids=text_input_ids, attention_mask=prompt_masks, output_hidden_states=True
+    ).hidden_states[-2]
+
+    # Drop padding positions per sample; the encoder output must not be shadowed by the result list.
+    return [sample_embeds[sample_mask] for sample_embeds, sample_mask in zip(hidden_states, prompt_masks)]
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)  # draw from the latent distribution
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()  # use the distribution's mode instead of sampling
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents  # output already carries latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")  # no branch above matched
+
+
+def encode_vae_image(
+    image_tensor: torch.Tensor,
+    vae: AutoencoderKL,
+    generator: torch.Generator,
+    device: torch.device,
+    dtype: torch.dtype,
+    latent_channels: int = 16,
+):
+    """VAE-encode `image_tensor`; returns `(latents - shift_factor) * scaling_factor`. `latent_channels` is unused."""
+    if not isinstance(image_tensor, torch.Tensor):
+        raise ValueError(f"Expected image_tensor to be a tensor, got {type(image_tensor)}.")
+
+    per_image_generators = isinstance(generator, list)
+    num_images = image_tensor.shape[0]
+    if per_image_generators and len(generator) != num_images:
+        raise ValueError(
+            f"You have passed a list of generators of length {len(generator)}, but it is not same as number of images {image_tensor.shape[0]}."
+        )
+
+    image_tensor = image_tensor.to(device=device, dtype=dtype)
+    if not per_image_generators:
+        latents = retrieve_latents(vae.encode(image_tensor), generator=generator)
+    else:
+        chunks = [
+            retrieve_latents(vae.encode(image_tensor[i : i + 1]), generator=generator[i])
+            for i in range(num_images)
+        ]
+        latents = torch.cat(chunks, dim=0)
+
+    return (latents - vae.config.shift_factor) * vae.config.scaling_factor
+
+
+class ZImageTextEncoderStep(ModularPipelineBlocks):
+    """Encode `prompt` (and optionally `negative_prompt`) into per-prompt text embeddings for the denoiser."""
+
+    model_name = "z_image"
+
+    @property
+    def description(self) -> str:
+        return "Text Encoder step that generate text_embeddings to guide the image generation"
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [
+            ComponentSpec("text_encoder", Qwen3Model),
+            ComponentSpec("tokenizer", Qwen2Tokenizer),
+            ComponentSpec(
+                "guider",
+                ClassifierFreeGuidance,
+                config=FrozenDict({"guidance_scale": 5.0}),
+                default_creation_method="from_config",
+            ),
+        ]
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam("prompt"),
+            InputParam("negative_prompt"),
+            InputParam("max_sequence_length", default=512),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(
+                "prompt_embeds",
+                type_hint=List[torch.Tensor],
+                kwargs_type="denoiser_input_fields",
+                description="text embeddings used to guide the image generation",
+            ),
+            OutputParam(
+                "negative_prompt_embeds",
+                type_hint=List[torch.Tensor],
+                kwargs_type="denoiser_input_fields",
+                description="negative text embeddings used to guide the image generation",
+            ),
+        ]
+
+    @staticmethod
+    def check_inputs(block_state):
+        if block_state.prompt is not None and (
+            not isinstance(block_state.prompt, str) and not isinstance(block_state.prompt, list)
+        ):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(block_state.prompt)}")
+
+    @staticmethod
+    def encode_prompt(
+        components,
+        prompt: str,
+        device: Optional[torch.device] = None,
+        prepare_unconditional_embeds: bool = True,
+        negative_prompt: Optional[str] = None,
+        max_sequence_length: int = 512,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device; defaults to `components._execution_device`
+            prepare_unconditional_embeds (`bool`):
+                whether to also encode `negative_prompt` into unconditional embeddings
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation; defaults to `""`. Only encoded when
+                `prepare_unconditional_embeds` is `True`.
+            max_sequence_length (`int`, defaults to `512`):
+                The maximum number of text tokens to be used for the generation process.
+
+        Returns:
+            `(prompt_embeds, negative_prompt_embeds)`; the second item is `None` when
+            `prepare_unconditional_embeds` is `False`.
+        """
+        device = device or components._execution_device
+        if not isinstance(prompt, list):
+            prompt = [prompt]
+        batch_size = len(prompt)
+        negative_prompt_embeds = None  # returned as-is when `prepare_unconditional_embeds` is False
+        prompt_embeds = get_qwen_prompt_embeds(
+            text_encoder=components.text_encoder,
+            tokenizer=components.tokenizer,
+            prompt=prompt,
+            max_sequence_length=max_sequence_length,
+            device=device,
+        )
+
+        if prepare_unconditional_embeds:
+            negative_prompt = negative_prompt or ""
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+
+            negative_prompt_embeds = get_qwen_prompt_embeds(
+                text_encoder=components.text_encoder,
+                tokenizer=components.tokenizer,
+                prompt=negative_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+            )
+
+        return prompt_embeds, negative_prompt_embeds
+
+    @torch.no_grad()
+    def __call__(self, components: WanModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+        self.check_inputs(block_state)
+        block_state.device = components._execution_device
+
+        # Encode input prompt
+        block_state.prompt_embeds, block_state.negative_prompt_embeds = self.encode_prompt(
+            components=components,
+            prompt=block_state.prompt,
+            device=block_state.device,
+            prepare_unconditional_embeds=components.requires_unconditional_embeds,
+            negative_prompt=block_state.negative_prompt,
+            max_sequence_length=block_state.max_sequence_length,
+        )
+
+        # Add outputs
+        self.set_block_state(state, block_state)
+        return components, state
+
+
+class ZImageVaeImageEncoderStep(ModularPipelineBlocks):
+    model_name = "z_image"
+
+    @property
+    def description(self) -> str:
+        return "Vae Image Encoder step that generate condition_latents based on image to guide the image generation"
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [
+            ComponentSpec("vae", AutoencoderKL),
+            ComponentSpec(
+                "image_processor",
+                VaeImageProcessor,
+                config=FrozenDict({"vae_scale_factor": 8 * 2}),
+                default_creation_method="from_config",
+            ),
+        ]
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam("image", type_hint=PIL.Image.Image, required=True),
+            InputParam("height"),
+            InputParam("width"),
+            InputParam("generator"),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(
+                "image_latents",
+                type_hint=torch.Tensor,
+                description="image latent representation used to condition the image generation",
+            ),
+        ]
+
+    @staticmethod
+    def check_inputs(components, block_state):
+        if (block_state.height is not None and block_state.height % components.vae_scale_factor_spatial != 0) or (
+            block_state.width is not None and block_state.width % components.vae_scale_factor_spatial != 0
+        ):
+            raise ValueError(
+                f"`height` and `width` have to be divisible by {components.vae_scale_factor_spatial} but are {block_state.height} and {block_state.width}."
+            )
+
+    @torch.no_grad()
+    def __call__(self, components: WanModularPipeline, state: PipelineState) -> PipelineState:
+        """Preprocess `image`, encode it with the VAE and store the result as `image_latents`."""
+        block_state = self.get_block_state(state)
+        self.check_inputs(components, block_state)
+
+        device = components._execution_device
+        # Preprocess in float32; `encode_vae_image` then casts to the VAE's own dtype so that a VAE loaded in
+        # reduced precision is not fed float32 inputs.
+        image_tensor = components.image_processor.preprocess(
+            block_state.image, height=block_state.height, width=block_state.width
+        ).to(device=device, dtype=torch.float32)
+
+        block_state.image_latents = encode_vae_image(
+            image_tensor=image_tensor,
+            vae=components.vae,
+            generator=block_state.generator,
+            device=device,
+            dtype=components.vae.dtype,
+            latent_channels=components.num_channels_latents,
+        )
+
+        self.set_block_state(state, block_state)
+        return components, state
+
diff --git a/src/diffusers/modular_pipelines/z_image/modular_blocks.py b/src/diffusers/modular_pipelines/z_image/modular_blocks.py
new file mode 100644
index 000000000000..c269a5b8f0b9
--- /dev/null
+++ b/src/diffusers/modular_pipelines/z_image/modular_blocks.py
@@ -0,0 +1,192 @@
+# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...utils import logging
+from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
+from ..modular_pipeline_utils import InsertableDict
+from .before_denoise import (
+    ZImageTextInputStep,
+    ZImageSetTimestepsWithStrengthStep,
+    ZImagePrepareLatentsStep,
+    ZImageAdditionalInputsStep,
+    ZImagePrepareLatentswithImageStep,
+    ZImageSetTimestepsStep,
+)
+from .decoders import ZImageVaeDecoderStep
+from .denoise import (
+    ZImageDenoiseStep,
+)
+from .encoders import (
+    ZImageTextEncoderStep,
+    ZImageVaeImageEncoderStep,
+)
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# z-image
+# text2image
+class ZImageCoreDenoiseStep(SequentialPipelineBlocks):
+    block_classes = [
+        ZImageTextInputStep,
+        ZImagePrepareLatentsStep,
+        ZImageSetTimestepsStep,
+        ZImageDenoiseStep,
+    ]
+    block_names = ["input", "prepare_latents", "set_timesteps", "denoise"]
+
+    @property
+    def description(self):
+        return (
+            "denoise block that takes encoded conditions and runs the denoising process.\n"
+            + "This is a sequential pipeline blocks:\n"
+            + " - `ZImageTextInputStep` is used to adjust the batch size of the model inputs\n"
+            + " - `ZImagePrepareLatentsStep` is used to prepare the latents\n"
+            + " - `ZImageSetTimestepsStep` is used to set the timesteps\n"
+            + " - `ZImageDenoiseStep` is used to denoise the latents\n"
+        )
+
+
+# z-image: image2image
+## denoise
+class ZImageImage2ImageCoreDenoiseStep(SequentialPipelineBlocks):
+    block_classes = [
+        ZImageTextInputStep,
+        ZImageAdditionalInputsStep(image_latent_inputs=["image_latents"]),
+        ZImagePrepareLatentsStep,
+        ZImageSetTimestepsStep,
+        ZImageSetTimestepsWithStrengthStep,
+        ZImagePrepareLatentswithImageStep,
+        ZImageDenoiseStep,
+    ]
+    block_names = [
+        "input",
+        "additional_inputs",
+        "prepare_latents",
+        "set_timesteps",
+        "set_timesteps_with_strength",
+        "prepare_latents_with_image",
+        "denoise",
+    ]
+
+    @property
+    def description(self):
+        return (
+            "denoise block that takes encoded text and image latent conditions and runs the denoising process.\n"
+            + "This is a sequential pipeline blocks:\n"
+            + " - `ZImageTextInputStep` is used to adjust the batch size of the model inputs\n"
+            + " - `ZImageAdditionalInputsStep` is used to adjust the batch size of the latent conditions\n"
+            + " - `ZImagePrepareLatentsStep` is used to prepare the latents\n"
+            + " - `ZImageSetTimestepsStep` is used to set the timesteps\n"
+            + " - `ZImageSetTimestepsWithStrengthStep` is used to set the timesteps with strength\n"
+            + " - `ZImagePrepareLatentswithImageStep` is used to prepare the latents with image\n"
+            + " - `ZImageDenoiseStep` is used to denoise the latents\n"
+        )
+
+
+## auto blocks
+class ZImageAutoDenoiseStep(AutoPipelineBlocks):
+    block_classes = [
+        ZImageImage2ImageCoreDenoiseStep,
+        ZImageCoreDenoiseStep,
+    ]
+    block_names = ["image2image", "text2image"]
+    block_trigger_inputs = ["image_latents", None]  # None: used when `image_latents` is not provided
+
+    @property
+    def description(self) -> str:
+        return (
+            "Denoise step that iteratively denoise the latents.\n"
+            + "This is an auto pipeline block that works for text2image and image2image tasks.\n"
+            + " - `ZImageCoreDenoiseStep` (text2image) for text2image tasks.\n"
+            + " - `ZImageImage2ImageCoreDenoiseStep` (image2image) for image2image tasks.\n"
+            + " - if `image_latents` is provided, `ZImageImage2ImageCoreDenoiseStep` will be used.\n"
+            + " - if `image_latents` is not provided, `ZImageCoreDenoiseStep` will be used.\n"
+        )
+
+
+class ZImageAutoVaeImageEncoderStep(AutoPipelineBlocks):
+    block_classes = [ZImageVaeImageEncoderStep]
+    block_names = ["vae_image_encoder"]
+    block_trigger_inputs = ["image"]  # the block only runs when `image` is provided
+
+    @property
+    def description(self) -> str:  # explicit continuations keep all four lines in the returned string
+        return "Vae Image Encoder step that encode the image to generate the image latents.\n" \
+            + "This is an auto pipeline block that works for image2image tasks.\n" \
+            + " - `ZImageVaeImageEncoderStep` is used when `image` is provided.\n" \
+            + " - if `image` is not provided, step will be skipped."
+
+
+class ZImageAutoBlocks(SequentialPipelineBlocks):
+    block_classes = [
+        ZImageTextEncoderStep,
+        ZImageAutoVaeImageEncoderStep,
+        ZImageAutoDenoiseStep,
+        ZImageVaeDecoderStep,
+    ]
+    block_names = ["text_encoder", "vae_image_encoder", "denoise", "decode"]
+
+
+    @property
+    def description(self) -> str:  # explicit continuations keep all four lines in the returned string
+        return "Auto Modular pipeline for text-to-image and image-to-image using ZImage.\n" \
+            + " - for text-to-image generation, all you need to provide is `prompt`\n" \
+            + " - for image-to-image generation, you need to provide `image`\n" \
+            + " - if `image` is not provided, the `vae_image_encoder` step will be skipped."
+
+
+# presets 
+TEXT2IMAGE_BLOCKS = InsertableDict(
+    [
+        ("text_encoder", ZImageTextEncoderStep),
+        ("input", ZImageTextInputStep),
+        ("prepare_latents", ZImagePrepareLatentsStep),
+        ("set_timesteps", ZImageSetTimestepsStep),
+        ("denoise", ZImageDenoiseStep),
+        ("decode", ZImageVaeDecoderStep),
+    ]
+)
+
+IMAGE2IMAGE_BLOCKS = InsertableDict(
+    [
+        ("text_encoder", ZImageTextEncoderStep),
+        ("vae_image_encoder", ZImageVaeImageEncoderStep),
+        ("input", ZImageTextInputStep),
+        ("additional_inputs", ZImageAdditionalInputsStep(image_latent_inputs=["image_latents"])),
+        ("prepare_latents", ZImagePrepareLatentsStep),
+        ("set_timesteps", ZImageSetTimestepsStep),
+        ("set_timesteps_with_strength", ZImageSetTimestepsWithStrengthStep),
+        ("prepare_latents_with_image", ZImagePrepareLatentswithImageStep),
+        ("denoise", ZImageDenoiseStep),
+        ("decode", ZImageVaeDecoderStep),
+    ]
+)
+
+
+AUTO_BLOCKS = InsertableDict(
+    [
+        ("text_encoder", ZImageTextEncoderStep),
+        ("vae_image_encoder", ZImageAutoVaeImageEncoderStep),
+        ("denoise", ZImageAutoDenoiseStep),
+        ("decode", ZImageVaeDecoderStep),
+    ]
+)
+
+ALL_BLOCKS = {
+    "text2image": TEXT2IMAGE_BLOCKS,
+    "image2image": IMAGE2IMAGE_BLOCKS,
+    "auto": AUTO_BLOCKS,
+}
diff --git a/src/diffusers/modular_pipelines/z_image/modular_pipeline.py b/src/diffusers/modular_pipelines/z_image/modular_pipeline.py
new file mode 100644
index 000000000000..ddef79fbfeb8
--- /dev/null
+++ b/src/diffusers/modular_pipelines/z_image/modular_pipeline.py
@@ -0,0 +1,74 @@
+# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Any, Dict, Optional
+
+from ...loaders import ZImageLoraLoaderMixin
+from ...utils import logging
+from ..modular_pipeline import ModularPipeline
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class ZImageModularPipeline(
+    ModularPipeline,
+    ZImageLoraLoaderMixin,
+):
+    """
+    A ModularPipeline for Z-Image.
+
+    > [!WARNING] > This is an experimental feature and is likely to change in the future.
+    """
+
+    default_blocks_name = "ZImageAutoBlocks"
+
+    @property
+    def default_height(self):
+        return 1024
+
+    @property
+    def default_width(self):
+        return 1024
+
+    @property
+    def vae_scale_factor_spatial(self):
+        vae_scale_factor_spatial = 16
+        if hasattr(self, "image_processor") and self.image_processor is not None:
+            vae_scale_factor_spatial = self.image_processor.vae_scale_factor
+        return vae_scale_factor_spatial
+
+    @property
+    def vae_scale_factor(self):
+        vae_scale_factor = 8
+        if hasattr(self, "vae") and self.vae is not None:
+            vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        return vae_scale_factor
+
+    @property
+    def num_channels_latents(self):
+        num_channels_latents = 16
+        if hasattr(self, "transformer") and self.transformer is not None:
+            num_channels_latents = self.transformer.config.in_channels
+        return num_channels_latents
+
+    @property
+    def requires_unconditional_embeds(self):
+        requires_unconditional_embeds = False
+
+        if hasattr(self, "guider") and self.guider is not None:
+            requires_unconditional_embeds = self.guider._enabled and self.guider.num_conditions > 1
+
+        return requires_unconditional_embeds

From 0bb53baa0ea3a9cc57602c0ed46dc5cd098b0fd6 Mon Sep 17 00:00:00 2001
From: yiyixuxu <yixu310@gmail.com>
Date: Tue, 9 Dec 2025 12:59:46 +0100
Subject: [PATCH 2/7] up up

---
 src/diffusers/__init__.py                     |  4 ++++
 src/diffusers/modular_pipelines/__init__.py   |  5 +++++
 .../modular_pipelines/modular_pipeline.py     |  1 +
 .../modular_pipelines/wan/encoders.py         |  6 ++++--
 .../z_image/before_denoise.py                 | 12 ++++++++---
 .../modular_pipelines/z_image/denoise.py      | 20 ++++++++++---------
 .../modular_pipelines/z_image/encoders.py     | 18 +++++++++--------
 7 files changed, 44 insertions(+), 22 deletions(-)

diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index d80363349d72..8f872a2be203 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -419,6 +419,8 @@
             "Wan22AutoBlocks",
             "WanAutoBlocks",
             "WanModularPipeline",
+            "ZImageAutoBlocks",
+            "ZImageModularPipeline",
         ]
     )
     _import_structure["pipelines"].extend(
@@ -1123,6 +1125,8 @@
             Wan22AutoBlocks,
             WanAutoBlocks,
             WanModularPipeline,
+            ZImageAutoBlocks,
+            ZImageModularPipeline,
         )
         from .pipelines import (
             AllegroPipeline,
diff --git a/src/diffusers/modular_pipelines/__init__.py b/src/diffusers/modular_pipelines/__init__.py
index 252b9f33dfe8..dea9da0269b4 100644
--- a/src/diffusers/modular_pipelines/__init__.py
+++ b/src/diffusers/modular_pipelines/__init__.py
@@ -60,6 +60,10 @@
         "QwenImageEditPlusModularPipeline",
         "QwenImageEditPlusAutoBlocks",
     ]
+    _import_structure["z_image"] = [
+        "ZImageAutoBlocks",
+        "ZImageModularPipeline",
+    ]
     _import_structure["components_manager"] = ["ComponentsManager"]
 
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
@@ -91,6 +95,7 @@
         )
         from .stable_diffusion_xl import StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline
         from .wan import Wan22AutoBlocks, WanAutoBlocks, WanModularPipeline
+        from .z_image import ZImageAutoBlocks, ZImageModularPipeline
 else:
     import sys
 
diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py
index a6336de71a52..d4dcf9feb174 100644
--- a/src/diffusers/modular_pipelines/modular_pipeline.py
+++ b/src/diffusers/modular_pipelines/modular_pipeline.py
@@ -61,6 +61,7 @@
         ("qwenimage", "QwenImageModularPipeline"),
         ("qwenimage-edit", "QwenImageEditModularPipeline"),
         ("qwenimage-edit-plus", "QwenImageEditPlusModularPipeline"),
+        ("z_image", "ZImageModularPipeline"),
     ]
 )
 
diff --git a/src/diffusers/modular_pipelines/wan/encoders.py b/src/diffusers/modular_pipelines/wan/encoders.py
index dc49df8eab8c..4fd69c6ca6ab 100644
--- a/src/diffusers/modular_pipelines/wan/encoders.py
+++ b/src/diffusers/modular_pipelines/wan/encoders.py
@@ -530,6 +530,7 @@ def __call__(self, components: WanModularPipeline, state: PipelineState) -> Pipe
 
         device = components._execution_device
         dtype = torch.float32
+        vae_dtype = components.vae.dtype
 
         height = block_state.height or components.default_height
         width = block_state.width or components.default_width
@@ -555,7 +556,7 @@ def __call__(self, components: WanModularPipeline, state: PipelineState) -> Pipe
             vae=components.vae,
             generator=block_state.generator,
             device=device,
-            dtype=dtype,
+            dtype=vae_dtype,
             latent_channels=components.num_channels_latents,
         )
 
@@ -627,6 +628,7 @@ def __call__(self, components: WanModularPipeline, state: PipelineState) -> Pipe
 
         device = components._execution_device
         dtype = torch.float32
+        vae_dtype = components.vae.dtype
 
         height = block_state.height or components.default_height
         width = block_state.width or components.default_width
@@ -659,7 +661,7 @@ def __call__(self, components: WanModularPipeline, state: PipelineState) -> Pipe
             vae=components.vae,
             generator=block_state.generator,
             device=device,
-            dtype=dtype,
+            dtype=vae_dtype,
             latent_channels=components.num_channels_latents,
         )
 
diff --git a/src/diffusers/modular_pipelines/z_image/before_denoise.py b/src/diffusers/modular_pipelines/z_image/before_denoise.py
index eac00482b7ad..c8549db4b8bc 100644
--- a/src/diffusers/modular_pipelines/z_image/before_denoise.py
+++ b/src/diffusers/modular_pipelines/z_image/before_denoise.py
@@ -108,8 +108,8 @@ def calculate_dimension_from_latents(
         Tuple[int, int]: The calculated image dimensions as (height, width)
     """
     latent_height, latent_width = latents.shape[2:]
-    height = latent_height * vae_scale_factor_spatial
-    width = latent_width * vae_scale_factor_spatial
+    height = latent_height * vae_scale_factor_spatial // 2
+    width = latent_width * vae_scale_factor_spatial // 2
 
     return height, width
 
@@ -511,10 +511,16 @@ def description(self) -> str:
     def inputs(self) -> List[InputParam]:
         return [
             InputParam("latents", required=True),
-            InputParam("num_inference_steps", default=50),
+            InputParam("num_inference_steps", default=9),
             InputParam("sigmas"),
         ]
 
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam("timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process"),
+        ]
+
     @torch.no_grad()
     def __call__(self, components: ZImageModularPipeline, state: PipelineState) -> PipelineState:
         block_state = self.get_block_state(state)
diff --git a/src/diffusers/modular_pipelines/z_image/denoise.py b/src/diffusers/modular_pipelines/z_image/denoise.py
index 657c8c386288..c033e24fce29 100644
--- a/src/diffusers/modular_pipelines/z_image/denoise.py
+++ b/src/diffusers/modular_pipelines/z_image/denoise.py
@@ -79,7 +79,7 @@ class ZImageLoopDenoiser(ModularPipelineBlocks):
 
     def __init__(
         self,
-        guider_input_fields: Dict[str, Any] = {"encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds")},
+        guider_input_fields: Dict[str, Any] = {"cap_feats": ("prompt_embeds", "negative_prompt_embeds")},
     ):
         """Initialize a denoiser block that calls the denoiser model. This block is used in Z-Image.
 
@@ -105,7 +105,7 @@ def expected_components(self) -> List[ComponentSpec]:
             ComponentSpec(
                 "guider",
                 ClassifierFreeGuidance,
-                config=FrozenDict({"guidance_scale": 5.0}),
+                config=FrozenDict({"guidance_scale": 5.0, "enabled": False}),
                 default_creation_method="from_config",
             ),
             ComponentSpec("transformer", ZImageTransformer2DModel),
@@ -122,7 +122,6 @@ def description(self) -> str:
     @property
     def inputs(self) -> List[Tuple[str, Any]]:
         inputs = [
-            InputParam("attention_kwargs"),
             InputParam(
                 "num_inference_steps",
                 required=True,
@@ -131,14 +130,18 @@ def inputs(self) -> List[Tuple[str, Any]]:
             ),
         ]
         guider_input_names = []
+        uncond_guider_input_names =  []
         for value in self._guider_input_fields.values():
             if isinstance(value, tuple):
-                guider_input_names.extend(value)
+                guider_input_names.append(value[0])
+                uncond_guider_input_names.append(value[1])
             else:
                 guider_input_names.append(value)
 
         for name in guider_input_names:
-            inputs.append(InputParam(name=name, required=True, type_hint=torch.Tensor))
+            inputs.append(InputParam(name=name, required=True))
+        for name in uncond_guider_input_names:
+            inputs.append(InputParam(name=name))
         return inputs
 
     @torch.no_grad()
@@ -177,9 +180,8 @@ def _convert_dtype(v, dtype):
             # Predict the noise residual
             # store the noise_pred in guider_state_batch so that we can apply guidance across all batches
             model_out_list = components.transformer(
-                hidden_states=block_state.latent_model_input,
-                timestep=block_state.timestep,
-                attention_kwargs=block_state.attention_kwargs,
+                x=block_state.latent_model_input,
+                t=block_state.timestep,
                 return_dict=False,
                 **cond_kwargs,
             )[0]
@@ -286,7 +288,7 @@ class ZImageDenoiseStep(ZImageDenoiseLoopWrapper):
         ZImageLoopBeforeDenoiser,
         ZImageLoopDenoiser(
             guider_input_fields={
-                "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"),
+                "cap_feats": ("prompt_embeds", "negative_prompt_embeds"),
             }
         ),
         ZImageLoopAfterDenoiser,
diff --git a/src/diffusers/modular_pipelines/z_image/encoders.py b/src/diffusers/modular_pipelines/z_image/encoders.py
index 581473923cf2..c7f4a5932f97 100644
--- a/src/diffusers/modular_pipelines/z_image/encoders.py
+++ b/src/diffusers/modular_pipelines/z_image/encoders.py
@@ -28,7 +28,7 @@
 from ...image_processor import VaeImageProcessor
 from ..modular_pipeline import ModularPipelineBlocks, PipelineState
 from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
-from .modular_pipeline import WanModularPipeline
+from .modular_pipeline import ZImageModularPipeline
 
 
 if is_ftfy_available():
@@ -76,12 +76,12 @@ def get_qwen_prompt_embeds(
         output_hidden_states=True,
     ).hidden_states[-2]
 
-    prompt_embeds = []
+    prompt_embeds_list = []
 
     for i in range(len(prompt_embeds)):
-        prompt_embeds.append(prompt_embeds[i][prompt_masks[i]])
+        prompt_embeds_list.append(prompt_embeds[i][prompt_masks[i]])
 
-    return prompt_embeds
+    return prompt_embeds_list
 
 
 # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
@@ -145,7 +145,7 @@ def expected_components(self) -> List[ComponentSpec]:
             ComponentSpec(
                 "guider",
                 ClassifierFreeGuidance,
-                config=FrozenDict({"guidance_scale": 5.0}),
+                config=FrozenDict({"guidance_scale": 5.0, "enabled": False}),
                 default_creation_method="from_config",
             ),
         ]
@@ -221,6 +221,7 @@ def encode_prompt(
             device=device,
         )
 
+        negative_prompt_embeds = None
         if prepare_unconditional_embeds:
             negative_prompt = negative_prompt or ""
             negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
@@ -248,7 +249,7 @@ def encode_prompt(
         return prompt_embeds, negative_prompt_embeds
 
     @torch.no_grad()
-    def __call__(self, components: WanModularPipeline, state: PipelineState) -> PipelineState:
+    def __call__(self, components: ZImageModularPipeline, state: PipelineState) -> PipelineState:
         # Get inputs and intermediates
         block_state = self.get_block_state(state)
         self.check_inputs(block_state)
@@ -320,7 +321,7 @@ def check_inputs(components, block_state):
                 f"`height` and `width` have to be divisible by {components.vae_scale_factor_spatial} but are {block_state.height} and {block_state.width}."
             )
 
-    def __call__(self, components: WanModularPipeline, state: PipelineState) -> PipelineState:
+    def __call__(self, components: ZImageModularPipeline, state: PipelineState) -> PipelineState:
         block_state = self.get_block_state(state)
         self.check_inputs(components, block_state)
 
@@ -328,6 +329,7 @@ def __call__(self, components: WanModularPipeline, state: PipelineState) -> Pipe
 
         device = components._execution_device
         dtype = torch.float32
+        vae_dtype = components.vae.dtype
 
         image_tensor = components.image_processor.preprocess(image, height=block_state.height, width=block_state.width).to(
             device=device, dtype=dtype
@@ -338,7 +340,7 @@ def __call__(self, components: WanModularPipeline, state: PipelineState) -> Pipe
             vae=components.vae,
             generator=block_state.generator,
             device=device,
-            dtype=dtype,
+            dtype=vae_dtype,
             latent_channels=components.num_channels_latents,
         )
 

From 56195bebcbd6590439a31d95f68798a93fe139ea Mon Sep 17 00:00:00 2001
From: yiyixuxu <yixu310@gmail.com>
Date: Tue, 9 Dec 2025 13:09:20 +0100
Subject: [PATCH 3/7] fix: z_image -> z-image

---
 src/diffusers/modular_pipelines/modular_pipeline.py  |  2 +-
 .../modular_pipelines/z_image/before_denoise.py      | 12 ++++++------
 src/diffusers/modular_pipelines/z_image/decoders.py  |  2 +-
 src/diffusers/modular_pipelines/z_image/denoise.py   |  8 ++++----
 src/diffusers/modular_pipelines/z_image/encoders.py  |  4 ++--
 5 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py
index d4dcf9feb174..bba89e612183 100644
--- a/src/diffusers/modular_pipelines/modular_pipeline.py
+++ b/src/diffusers/modular_pipelines/modular_pipeline.py
@@ -61,7 +61,7 @@
         ("qwenimage", "QwenImageModularPipeline"),
         ("qwenimage-edit", "QwenImageEditModularPipeline"),
         ("qwenimage-edit-plus", "QwenImageEditPlusModularPipeline"),
-        ("z_image", "ZImageModularPipeline"),
+        ("z-image", "ZImageModularPipeline"),
     ]
 )
 
diff --git a/src/diffusers/modular_pipelines/z_image/before_denoise.py b/src/diffusers/modular_pipelines/z_image/before_denoise.py
index c8549db4b8bc..8b7ce2d43a0a 100644
--- a/src/diffusers/modular_pipelines/z_image/before_denoise.py
+++ b/src/diffusers/modular_pipelines/z_image/before_denoise.py
@@ -188,7 +188,7 @@ def retrieve_timesteps(
 
 
 class ZImageTextInputStep(ModularPipelineBlocks):
-    model_name = "z_image"
+    model_name = "z-image"
 
     @property
     def description(self) -> str:
@@ -278,7 +278,7 @@ def __call__(self, components: ZImageModularPipeline, state: PipelineState) -> P
 
 
 class ZImageAdditionalInputsStep(ModularPipelineBlocks):
-    model_name = "z_image"
+    model_name = "z-image"
 
     def __init__(
         self,
@@ -401,7 +401,7 @@ def __call__(self, components: ZImageModularPipeline, state: PipelineState) -> P
 
 
 class ZImagePrepareLatentsStep(ModularPipelineBlocks):
-    model_name = "z_image"
+    model_name = "z-image"
 
     @property
     def description(self) -> str:
@@ -495,7 +495,7 @@ def __call__(self, components: ZImageModularPipeline, state: PipelineState) -> P
 
 
 class ZImageSetTimestepsStep(ModularPipelineBlocks):
-    model_name = "z_image"
+    model_name = "z-image"
 
     @property
     def expected_components(self) -> List[ComponentSpec]:
@@ -551,7 +551,7 @@ def __call__(self, components: ZImageModularPipeline, state: PipelineState) -> P
 
 
 class ZImageSetTimestepsWithStrengthStep(ModularPipelineBlocks):
-    model_name = "z_image"
+    model_name = "z-image"
 
     @property
     def expected_components(self) -> List[ComponentSpec]:
@@ -598,7 +598,7 @@ def __call__(self, components: ZImageModularPipeline, state: PipelineState) -> P
 
 
 class ZImagePrepareLatentswithImageStep(ModularPipelineBlocks):
-    model_name = "z_image"
+    model_name = "z-image"
 
     @property
     def description(self) -> str:
diff --git a/src/diffusers/modular_pipelines/z_image/decoders.py b/src/diffusers/modular_pipelines/z_image/decoders.py
index 70ad1c645e66..b541330308ca 100644
--- a/src/diffusers/modular_pipelines/z_image/decoders.py
+++ b/src/diffusers/modular_pipelines/z_image/decoders.py
@@ -30,7 +30,7 @@
 
 
 class ZImageVaeDecoderStep(ModularPipelineBlocks):
-    model_name = "z_image"
+    model_name = "z-image"
 
     @property
     def expected_components(self) -> List[ComponentSpec]:
diff --git a/src/diffusers/modular_pipelines/z_image/denoise.py b/src/diffusers/modular_pipelines/z_image/denoise.py
index c033e24fce29..d3db470e36a1 100644
--- a/src/diffusers/modular_pipelines/z_image/denoise.py
+++ b/src/diffusers/modular_pipelines/z_image/denoise.py
@@ -35,7 +35,7 @@
 
 
 class ZImageLoopBeforeDenoiser(ModularPipelineBlocks):
-    model_name = "z_image"
+    model_name = "z-image"
 
     @property
     def description(self) -> str:
@@ -75,7 +75,7 @@ def __call__(self, components: ZImageModularPipeline, block_state: BlockState, i
 
 
 class ZImageLoopDenoiser(ModularPipelineBlocks):
-    model_name = "z_image"
+    model_name = "z-image"
 
     def __init__(
         self,
@@ -196,7 +196,7 @@ def _convert_dtype(v, dtype):
 
 
 class ZImageLoopAfterDenoiser(ModularPipelineBlocks):
-    model_name = "z_image"
+    model_name = "z-image"
 
     @property
     def expected_components(self) -> List[ComponentSpec]:
@@ -230,7 +230,7 @@ def __call__(self, components: ZImageModularPipeline, block_state: BlockState, i
 
 
 class ZImageDenoiseLoopWrapper(LoopSequentialPipelineBlocks):
-    model_name = "z_image"
+    model_name = "z-image"
 
     @property
     def description(self) -> str:
diff --git a/src/diffusers/modular_pipelines/z_image/encoders.py b/src/diffusers/modular_pipelines/z_image/encoders.py
index c7f4a5932f97..57a842f85784 100644
--- a/src/diffusers/modular_pipelines/z_image/encoders.py
+++ b/src/diffusers/modular_pipelines/z_image/encoders.py
@@ -131,7 +131,7 @@ def encode_vae_image(
 
 
 class ZImageTextEncoderStep(ModularPipelineBlocks):
-    model_name = "z_image"
+    model_name = "z-image"
 
     @property
     def description(self) -> str:
@@ -275,7 +275,7 @@ def __call__(self, components: ZImageModularPipeline, state: PipelineState) -> P
 
 
 class ZImageVaeImageEncoderStep(ModularPipelineBlocks):
-    model_name = "z_image"
+    model_name = "z-image"
 
     @property
     def description(self) -> str:

From 8e637f06f021f920ef0cbaf0aa114c17cfd53ef6 Mon Sep 17 00:00:00 2001
From: yiyixuxu <yixu310@gmail.com>
Date: Tue, 9 Dec 2025 13:11:48 +0100
Subject: [PATCH 4/7] style

---
 .../z_image/before_denoise.py                 | 37 +++++++++----------
 .../modular_pipelines/z_image/decoders.py     |  6 ++-
 .../modular_pipelines/z_image/denoise.py      | 14 ++++---
 .../modular_pipelines/z_image/encoders.py     | 19 ++++------
 .../z_image/modular_blocks.py                 | 21 +++++------
 .../z_image/modular_pipeline.py               |  2 -
 6 files changed, 47 insertions(+), 52 deletions(-)

diff --git a/src/diffusers/modular_pipelines/z_image/before_denoise.py b/src/diffusers/modular_pipelines/z_image/before_denoise.py
index 8b7ce2d43a0a..f628ed6828cd 100644
--- a/src/diffusers/modular_pipelines/z_image/before_denoise.py
+++ b/src/diffusers/modular_pipelines/z_image/before_denoise.py
@@ -91,13 +91,11 @@ def repeat_tensor_to_batch_size(
     return input_tensor
 
 
-def calculate_dimension_from_latents(
-    latents: torch.Tensor, vae_scale_factor_spatial: int
-) -> Tuple[int, int]:
+def calculate_dimension_from_latents(latents: torch.Tensor, vae_scale_factor_spatial: int) -> Tuple[int, int]:
     """Calculate image dimensions from latent tensor dimensions.
 
-    This function converts latent spatial dimensions to image spatial dimensions by
-    multiplying the latent height/width by the VAE scale factor.
+    This function converts latent spatial dimensions to image spatial dimensions by multiplying the latent height/width
+    by the VAE scale factor.
 
     Args:
         latents (torch.Tensor): The latent tensor. Must have 4 dimensions.
@@ -113,6 +111,7 @@ def calculate_dimension_from_latents(
 
     return height, width
 
+
 # Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
 def calculate_shift(
     image_seq_len,
@@ -261,7 +260,7 @@ def __call__(self, components: ZImageModularPipeline, state: PipelineState) -> P
         block_state = self.get_block_state(state)
         self.check_inputs(components, block_state)
 
-        block_state.batch_size =len(block_state.prompt_embeds)
+        block_state.batch_size = len(block_state.prompt_embeds)
         block_state.dtype = block_state.prompt_embeds[0].dtype
 
         if block_state.num_images_per_prompt > 1:
@@ -269,7 +268,9 @@ def __call__(self, components: ZImageModularPipeline, state: PipelineState) -> P
             block_state.prompt_embeds = prompt_embeds
 
             if block_state.negative_prompt_embeds is not None:
-                negative_prompt_embeds = [npe for npe in block_state.negative_prompt_embeds for _ in range(block_state.num_images_per_prompt)]
+                negative_prompt_embeds = [
+                    npe for npe in block_state.negative_prompt_embeds for _ in range(block_state.num_images_per_prompt)
+                ]
                 block_state.negative_prompt_embeds = negative_prompt_embeds
 
         self.set_block_state(state, block_state)
@@ -373,12 +374,9 @@ def __call__(self, components: ZImageModularPipeline, state: PipelineState) -> P
                 continue
 
             # 1. Calculate num_frames, height/width from latents
-            height, width = calculate_dimension_from_latents(
-                image_latent_tensor, components.vae_scale_factor_spatial
-            )
+            height, width = calculate_dimension_from_latents(image_latent_tensor, components.vae_scale_factor_spatial)
             block_state.height = block_state.height or height
             block_state.width = block_state.width or width
-            
 
         # Process additional batch inputs (only batch expansion)
         for input_name in self._additional_batch_inputs:
@@ -518,7 +516,9 @@ def inputs(self) -> List[InputParam]:
     @property
     def intermediate_outputs(self) -> List[OutputParam]:
         return [
-            OutputParam("timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process"),
+            OutputParam(
+                "timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process"
+            ),
         ]
 
     @torch.no_grad()
@@ -527,7 +527,7 @@ def __call__(self, components: ZImageModularPipeline, state: PipelineState) -> P
         device = components._execution_device
 
         latent_height, latent_width = block_state.latents.shape[2], block_state.latents.shape[3]
-        image_seq_len = (latent_height //2)  * (latent_width //2) # sequence length  after patchify 
+        image_seq_len = (latent_height // 2) * (latent_width // 2)  # sequence length  after patchify
 
         mu = calculate_shift(
             image_seq_len,
@@ -567,22 +567,19 @@ def description(self) -> str:
     def inputs(self) -> List[InputParam]:
         return [
             InputParam("timesteps", required=True),
-            InputParam("num_inference_steps",required=True),
+            InputParam("num_inference_steps", required=True),
             InputParam("strength", default=0.6),
         ]
 
-    
     def check_inputs(self, components, block_state):
         if block_state.strength < 0.0 or block_state.strength > 1.0:
             raise ValueError(f"Strength must be between 0.0 and 1.0, but got {block_state.strength}")
 
-
     @torch.no_grad()
     def __call__(self, components: ZImageModularPipeline, state: PipelineState) -> PipelineState:
         block_state = self.get_block_state(state)
         self.check_inputs(components, block_state)
 
-        device = components._execution_device
         init_timestep = min(block_state.num_inference_steps * block_state.strength, block_state.num_inference_steps)
 
         t_start = int(max(block_state.num_inference_steps - init_timestep, 0))
@@ -616,7 +613,9 @@ def __call__(self, components: ZImageModularPipeline, state: PipelineState) -> P
         block_state = self.get_block_state(state)
 
         latent_timestep = block_state.timesteps[:1].repeat(block_state.latents.shape[0])
-        block_state.latents = components.scheduler.scale_noise(block_state.image_latents, latent_timestep, block_state.latents)
+        block_state.latents = components.scheduler.scale_noise(
+            block_state.image_latents, latent_timestep, block_state.latents
+        )
 
         self.set_block_state(state, block_state)
-        return components, state
\ No newline at end of file
+        return components, state
diff --git a/src/diffusers/modular_pipelines/z_image/decoders.py b/src/diffusers/modular_pipelines/z_image/decoders.py
index b541330308ca..cdb6a2e5eac1 100644
--- a/src/diffusers/modular_pipelines/z_image/decoders.py
+++ b/src/diffusers/modular_pipelines/z_image/decoders.py
@@ -19,9 +19,9 @@
 import torch
 
 from ...configuration_utils import FrozenDict
+from ...image_processor import VaeImageProcessor
 from ...models import AutoencoderKL
 from ...utils import logging
-from ...image_processor import VaeImageProcessor
 from ..modular_pipeline import ModularPipelineBlocks, PipelineState
 from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
 
@@ -82,7 +82,9 @@ def __call__(self, components, state: PipelineState) -> PipelineState:
         latents = latents / components.vae.config.scaling_factor + components.vae.config.shift_factor
 
         block_state.images = components.vae.decode(latents, return_dict=False)[0]
-        block_state.images = components.image_processor.postprocess(block_state.images, output_type=block_state.output_type)
+        block_state.images = components.image_processor.postprocess(
+            block_state.images, output_type=block_state.output_type
+        )
 
         self.set_block_state(state, block_state)
 
diff --git a/src/diffusers/modular_pipelines/z_image/denoise.py b/src/diffusers/modular_pipelines/z_image/denoise.py
index d3db470e36a1..5e6d5d4c54b6 100644
--- a/src/diffusers/modular_pipelines/z_image/denoise.py
+++ b/src/diffusers/modular_pipelines/z_image/denoise.py
@@ -27,7 +27,7 @@
     ModularPipelineBlocks,
     PipelineState,
 )
-from ..modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam
+from ..modular_pipeline_utils import ComponentSpec, InputParam
 from .modular_pipeline import ZImageModularPipeline
 
 
@@ -64,9 +64,10 @@ def inputs(self) -> List[InputParam]:
 
     @torch.no_grad()
     def __call__(self, components: ZImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
-
-        latents = block_state.latents.unsqueeze(2).to(block_state.dtype) # [batch_size, num_channels, 1, height, width]
-        block_state.latent_model_input = list(latents.unbind(dim=0)) # list of [num_channels, 1, height, width]
+        latents = block_state.latents.unsqueeze(2).to(
+            block_state.dtype
+        )  # [batch_size, num_channels, 1, height, width]
+        block_state.latent_model_input = list(latents.unbind(dim=0))  # list of [num_channels, 1, height, width]
 
         timestep = t.expand(latents.shape[0]).to(block_state.dtype)
         timestep = (1000 - timestep) / 1000
@@ -130,7 +131,7 @@ def inputs(self) -> List[Tuple[str, Any]]:
             ),
         ]
         guider_input_names = []
-        uncond_guider_input_names =  []
+        uncond_guider_input_names = []
         for value in self._guider_input_fields.values():
             if isinstance(value, tuple):
                 guider_input_names.append(value[0])
@@ -164,6 +165,7 @@ def __call__(
         for guider_state_batch in guider_state:
             components.guider.prepare_models(components.transformer)
             cond_kwargs = guider_state_batch.as_dict()
+
             def _convert_dtype(v, dtype):
                 if isinstance(v, torch.Tensor):
                     return v.to(dtype)
@@ -305,4 +307,4 @@ def description(self) -> str:
             " - `ZImageLoopDenoiser`\n"
             " - `WanLoopAfterDenoiser`\n"
             "This block supports text-to-image and image-to-image tasks for Z-Image."
-        )
\ No newline at end of file
+        )
diff --git a/src/diffusers/modular_pipelines/z_image/encoders.py b/src/diffusers/modular_pipelines/z_image/encoders.py
index 57a842f85784..f5769fe2deec 100644
--- a/src/diffusers/modular_pipelines/z_image/encoders.py
+++ b/src/diffusers/modular_pipelines/z_image/encoders.py
@@ -12,27 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import html
 from typing import List, Optional, Union
 
-import numpy as np
 import PIL
-import regex as re
 import torch
-from transformers import  Qwen3Model, Qwen2Tokenizer
+from transformers import Qwen2Tokenizer, Qwen3Model
 
 from ...configuration_utils import FrozenDict
 from ...guiders import ClassifierFreeGuidance
+from ...image_processor import VaeImageProcessor
 from ...models import AutoencoderKL
 from ...utils import is_ftfy_available, logging
-from ...image_processor import VaeImageProcessor
 from ..modular_pipeline import ModularPipelineBlocks, PipelineState
 from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
 from .modular_pipeline import ZImageModularPipeline
 
 
 if is_ftfy_available():
-    import ftfy
+    pass
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
@@ -44,9 +41,8 @@ def get_qwen_prompt_embeds(
     device: torch.device,
     max_sequence_length: int = 512,
 ) -> List[torch.Tensor]:
-
     prompt = [prompt] if isinstance(prompt, str) else prompt
-    
+
     for i, prompt_item in enumerate(prompt):
         messages = [
             {"role": "user", "content": prompt_item},
@@ -331,9 +327,9 @@ def __call__(self, components: ZImageModularPipeline, state: PipelineState) -> P
         dtype = torch.float32
         vae_dtype = components.vae.dtype
 
-        image_tensor = components.image_processor.preprocess(image, height=block_state.height, width=block_state.width).to(
-            device=device, dtype=dtype
-        )
+        image_tensor = components.image_processor.preprocess(
+            image, height=block_state.height, width=block_state.width
+        ).to(device=device, dtype=dtype)
 
         block_state.image_latents = encode_vae_image(
             image_tensor=image_tensor,
@@ -346,4 +342,3 @@ def __call__(self, components: ZImageModularPipeline, state: PipelineState) -> P
 
         self.set_block_state(state, block_state)
         return components, state
-
diff --git a/src/diffusers/modular_pipelines/z_image/modular_blocks.py b/src/diffusers/modular_pipelines/z_image/modular_blocks.py
index c269a5b8f0b9..a7c520301a39 100644
--- a/src/diffusers/modular_pipelines/z_image/modular_blocks.py
+++ b/src/diffusers/modular_pipelines/z_image/modular_blocks.py
@@ -16,12 +16,12 @@
 from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
 from ..modular_pipeline_utils import InsertableDict
 from .before_denoise import (
-    ZImageTextInputStep,
-    ZImageSetTimestepsWithStrengthStep,
-    ZImagePrepareLatentsStep,
     ZImageAdditionalInputsStep,
+    ZImagePrepareLatentsStep,
     ZImagePrepareLatentswithImageStep,
     ZImageSetTimestepsStep,
+    ZImageSetTimestepsWithStrengthStep,
+    ZImageTextInputStep,
 )
 from .decoders import ZImageVaeDecoderStep
 from .denoise import (
@@ -125,9 +125,12 @@ class ZImageAutoVaeImageEncoderStep(AutoPipelineBlocks):
     @property
     def description(self) -> str:
-        return "Vae Image Encoder step that encode the image to generate the image latents"
-        + "This is an auto pipeline block that works for image2image tasks."
-        + " - `ZImageVaeImageEncoderStep` is used when `image` is provided."
-        + " - if `image` is not provided, step will be skipped."
+        # Keep the parentheses: a bare `return "..."` followed by `+ "..."` lines returns only the first literal.
+        return (
+            "Vae Image Encoder step that encode the image to generate the image latents\n"
+            "This is an auto pipeline block that works for image2image tasks.\n"
+            " - `ZImageVaeImageEncoderStep` is used when `image` is provided.\n"
+            " - if `image` is not provided, step will be skipped."
+        )
 
 
 class ZImageAutoBlocks(SequentialPipelineBlocks):
@@ -139,16 +139,18 @@ class ZImageAutoBlocks(SequentialPipelineBlocks):
     ]
     block_names = ["text_encoder", "vae_image_encoder", "denoise", "decode"]
 
-
     @property
     def description(self) -> str:
-        return "Auto Modular pipeline for text-to-image and image-to-image using ZImage.\n"
-        + " - for text-to-image generation, all you need to provide is `prompt`\n"
-        + " - for image-to-image generation, you need to provide `image`\n"
-        + " - if `image` is not provided, step will be skipped."
+        # Keep the parentheses: a bare `return "..."` followed by `+ "..."` lines returns only the first literal.
+        return (
+            "Auto Modular pipeline for text-to-image and image-to-image using ZImage.\n"
+            " - for text-to-image generation, all you need to provide is `prompt`\n"
+            " - for image-to-image generation, you need to provide `image`\n"
+            " - if `image` is not provided, step will be skipped."
+        )
 
 
-# presets 
+# presets
 TEXT2IMAGE_BLOCKS = InsertableDict(
     [
         ("text_encoder", ZImageTextEncoderStep),
diff --git a/src/diffusers/modular_pipelines/z_image/modular_pipeline.py b/src/diffusers/modular_pipelines/z_image/modular_pipeline.py
index ddef79fbfeb8..9217b14f0bbf 100644
--- a/src/diffusers/modular_pipelines/z_image/modular_pipeline.py
+++ b/src/diffusers/modular_pipelines/z_image/modular_pipeline.py
@@ -13,8 +13,6 @@
 # limitations under the License.
 
 
-from typing import Any, Dict, Optional
-
 from ...loaders import ZImageLoraLoaderMixin
 from ...utils import logging
 from ..modular_pipeline import ModularPipeline

From 9161313f97839f0b00bb06f5b9e31facbd800eb7 Mon Sep 17 00:00:00 2001
From: yiyixuxu <yixu310@gmail.com>
Date: Tue, 9 Dec 2025 13:12:34 +0100
Subject: [PATCH 5/7] fix-copies: rename prepare_latents arg to comp, add ZImage dummy objects

---
 .../z_image/before_denoise.py                 |  6 ++--
 .../dummy_torch_and_transformers_objects.py   | 30 +++++++++++++++++++
 2 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/modular_pipelines/z_image/before_denoise.py b/src/diffusers/modular_pipelines/z_image/before_denoise.py
index f628ed6828cd..2a49fc151ca7 100644
--- a/src/diffusers/modular_pipelines/z_image/before_denoise.py
+++ b/src/diffusers/modular_pipelines/z_image/before_denoise.py
@@ -441,7 +441,7 @@ def check_inputs(self, components, block_state):
     @staticmethod
     # Copied from diffusers.pipelines.z_image.pipeline_z_image.ZImagePipeline.prepare_latents with self->comp
     def prepare_latents(
-        components,
+        comp,
         batch_size,
         num_channels_latents,
         height,
@@ -451,8 +451,8 @@ def prepare_latents(
         generator,
         latents=None,
     ):
-        height = 2 * (int(height) // (components.vae_scale_factor * 2))
-        width = 2 * (int(width) // (components.vae_scale_factor * 2))
+        height = 2 * (int(height) // (comp.vae_scale_factor * 2))
+        width = 2 * (int(width) // (comp.vae_scale_factor * 2))
 
         shape = (batch_size, num_channels_latents, height, width)
 
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
index 79a21d2ac6e5..da64742518bb 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -227,6 +227,36 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
+class ZImageAutoBlocks(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class ZImageModularPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class AllegroPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 

From b61ee2dff8af13e70d5382c4b36a80b962e12737 Mon Sep 17 00:00:00 2001
From: yiyixuxu <yixu310@gmail.com>
Date: Tue, 9 Dec 2025 13:14:14 +0100
Subject: [PATCH 6/7] fix vae_scale_factor_spatial to read vae_scale_factor from image_processor.config

---
 src/diffusers/modular_pipelines/z_image/modular_pipeline.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/modular_pipelines/z_image/modular_pipeline.py b/src/diffusers/modular_pipelines/z_image/modular_pipeline.py
index 9217b14f0bbf..f1d8e53a3639 100644
--- a/src/diffusers/modular_pipelines/z_image/modular_pipeline.py
+++ b/src/diffusers/modular_pipelines/z_image/modular_pipeline.py
@@ -45,7 +45,7 @@ def default_width(self):
     def vae_scale_factor_spatial(self):
         vae_scale_factor_spatial = 16
         if hasattr(self, "image_processor") and self.image_processor is not None:
-            vae_scale_factor_spatial = self.image_processor.vae_scale_factor
+            vae_scale_factor_spatial = self.image_processor.config.vae_scale_factor
         return vae_scale_factor_spatial
 
     @property

From b4f049b0d5507d3ee249521114fed08ca691e33f Mon Sep 17 00:00:00 2001
From: yiyixuxu <yixu310@gmail.com>
Date: Tue, 9 Dec 2025 13:23:32 +0100
Subject: [PATCH 7/7] fix docstrings: replace leftover Wan/video names with ZImage/image

---
 .../z_image/before_denoise.py                 | 20 +++++++++----------
 .../modular_pipelines/z_image/denoise.py      |  6 +++---
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/diffusers/modular_pipelines/z_image/before_denoise.py b/src/diffusers/modular_pipelines/z_image/before_denoise.py
index 2a49fc151ca7..35ea768f12c3 100644
--- a/src/diffusers/modular_pipelines/z_image/before_denoise.py
+++ b/src/diffusers/modular_pipelines/z_image/before_denoise.py
@@ -43,32 +43,32 @@ def repeat_tensor_to_batch_size(
 ) -> torch.Tensor:
     """Repeat tensor elements to match the final batch size.
 
-    This function expands a tensor's batch dimension to match the final batch size (batch_size * num_videos_per_prompt)
+    This function expands a tensor's batch dimension to match the final batch size (batch_size * num_images_per_prompt)
     by repeating each element along dimension 0.
 
     The input tensor must have batch size 1 or batch_size. The function will:
-    - If batch size is 1: repeat each element (batch_size * num_videos_per_prompt) times
-    - If batch size equals batch_size: repeat each element num_videos_per_prompt times
+    - If batch size is 1: repeat each element (batch_size * num_images_per_prompt) times
+    - If batch size equals batch_size: repeat each element num_images_per_prompt times
 
     Args:
         input_name (str): Name of the input tensor (used for error messages)
         input_tensor (torch.Tensor): The tensor to repeat. Must have batch size 1 or batch_size.
         batch_size (int): The base batch size (number of prompts)
-        num_videos_per_prompt (int, optional): Number of videos to generate per prompt. Defaults to 1.
+        num_images_per_prompt (int, optional): Number of images to generate per prompt. Defaults to 1.
 
     Returns:
-        torch.Tensor: The repeated tensor with final batch size (batch_size * num_videos_per_prompt)
+        torch.Tensor: The repeated tensor with final batch size (batch_size * num_images_per_prompt)
 
     Raises:
         ValueError: If input_tensor is not a torch.Tensor or has invalid batch size
 
     Examples:
         tensor = torch.tensor([[1, 2, 3]]) # shape: [1, 3] repeated = repeat_tensor_to_batch_size("image", tensor,
-        batch_size=2, num_videos_per_prompt=2) repeated # tensor([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]]) - shape:
+        batch_size=2, num_images_per_prompt=2) repeated # tensor([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]]) - shape:
         [4, 3]
 
         tensor = torch.tensor([[1, 2, 3], [4, 5, 6]]) # shape: [2, 3] repeated = repeat_tensor_to_batch_size("image",
-        tensor, batch_size=2, num_videos_per_prompt=2) repeated # tensor([[1, 2, 3], [1, 2, 3], [4, 5, 6], [4, 5, 6]])
+        tensor, batch_size=2, num_images_per_prompt=2) repeated # tensor([[1, 2, 3], [1, 2, 3], [4, 5, 6], [4, 5, 6]])
         - shape: [4, 3]
     """
     # make sure input is a tensor
@@ -229,7 +229,7 @@ def intermediate_outputs(self) -> List[str]:
             OutputParam(
                 "batch_size",
                 type_hint=int,
-                description="Number of prompts, the final batch size of model inputs should be batch_size * num_videos_per_prompt",
+                description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt",
             ),
             OutputParam(
                 "dtype",
@@ -417,7 +417,7 @@ def inputs(self) -> List[InputParam]:
                 "batch_size",
                 required=True,
                 type_hint=int,
-                description="Number of prompts, the final batch size of model inputs should be `batch_size * num_videos_per_prompt`. Can be generated in input step.",
+                description="Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`. Can be generated in input step.",
             ),
             InputParam("dtype", type_hint=torch.dtype, description="The dtype of the model inputs"),
         ]
@@ -470,7 +470,7 @@ def __call__(self, components: ZImageModularPipeline, state: PipelineState) -> P
         self.check_inputs(components, block_state)
 
         device = components._execution_device
-        dtype = torch.float32  # Wan latents should be torch.float32 for best quality
+        dtype = torch.float32
 
         block_state.height = block_state.height or components.default_height
         block_state.width = block_state.width or components.default_width
diff --git a/src/diffusers/modular_pipelines/z_image/denoise.py b/src/diffusers/modular_pipelines/z_image/denoise.py
index 5e6d5d4c54b6..ec815f77ad1e 100644
--- a/src/diffusers/modular_pipelines/z_image/denoise.py
+++ b/src/diffusers/modular_pipelines/z_image/denoise.py
@@ -42,7 +42,7 @@ def description(self) -> str:
         return (
             "step within the denoising loop that prepares the latent input for the denoiser. "
             "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
-            "object (e.g. `WanDenoiseLoopWrapper`)"
+            "object (e.g. `ZImageDenoiseLoopWrapper`)"
         )
 
     @property
@@ -301,10 +301,10 @@ class ZImageDenoiseStep(ZImageDenoiseLoopWrapper):
     def description(self) -> str:
         return (
             "Denoise step that iteratively denoise the latents. \n"
-            "Its loop logic is defined in `WanDenoiseLoopWrapper.__call__` method \n"
+            "Its loop logic is defined in `ZImageDenoiseLoopWrapper.__call__` method \n"
             "At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n"
             " - `ZImageLoopBeforeDenoiser`\n"
             " - `ZImageLoopDenoiser`\n"
-            " - `WanLoopAfterDenoiser`\n"
+            " - `ZImageLoopAfterDenoiser`\n"
             "This block supports text-to-image and image-to-image tasks for Z-Image."
         )